Code
source("../dsan-globals/_globals.r")
set.seed(5300)
DSAN 5300: Statistical Learning
Spring 2025, Georgetown University
Today’s Planned Schedule:
Start | End | Topic | |
---|---|---|---|
Lecture | 6:30pm | 6:50pm | Roadmap: Week 4 |
6:50pm | 7:20pm | Non-Linear Data-Generating Processes → | |
7:20pm | 8:00pm | Validation: Evaluating Non-Linear Models → | |
Break! | 8:00pm | 8:10pm | |
8:10pm | 9:00pm |
source("../dsan-globals/_globals.r")
set.seed(5300)
library(tidyverse) |> suppressPackageStartupMessages()
library(plotly) |> suppressPackageStartupMessages()
<- read_csv("assets/dalys_cleaned.csv") daly_df
Rows: 162 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): name, gdp_cap
dbl (4): dalys_pc, population, gdp_pc_clean, log_dalys_pc
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- daly_df |> mutate(
daly_df gdp_pc_1k=gdp_pc_clean/1000
)<- labs(
model_labels x="GDP per capita ($1K PPP, 2021)",
y="Log(DALYs/n)",
title="Decrease in DALYs as GDP/n Increases"
)<- daly_df |> ggplot(aes(x=gdp_pc_1k, y=log_dalys_pc, label=name)) +
daly_plot geom_point() +
# geom_smooth(method="loess", formula=y ~ x) +
geom_smooth(method="lm", formula=y ~ poly(x,5), se=FALSE) +
theme_dsan(base_size=14) +
model_labelsggplotly(daly_plot)
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
<- function(x) {
eval_fitted_poly <- c(
coefs 10.58, -0.2346, 0.01396,
-0.0004156, 0.0000053527, -0.0000000244
)<- c(x^0, x^1, x^2, x^3, x^4, x^5)
x_terms <- sum(coefs * x_terms)
dot_prod return(dot_prod)
}<- 500
N <- runif(N, min=0, max=90)
x_vals <- sapply(X=x_vals, FUN=eval_fitted_poly)
y_vals <- tibble(gdpc=x_vals, ldalys=y_vals)
sim_df ggplot() +
geom_line(data=sim_df, aes(x=gdpc, y=ldalys)) +
geom_point(data=daly_df, aes(x=gdp_pc_1k, y=log_dalys_pc)) +
theme_dsan() +
model_labels
<- function(world_label="Sim", N=60, x_max=90) {
run_dgp <- runif(N, min=0, max=x_max)
x_vals <- sapply(X=x_vals, FUN=eval_fitted_poly)
y_raw <- rnorm(N, mean=0, sd=0.8)
y_noise <- y_raw + y_noise
y_vals <- tibble(
sim_df gdpc=x_vals,
ldalys=y_vals,
world=world_label
)return(sim_df)
}<- run_dgp("World 1")
df1 <- run_dgp("World 2")
df2 <- run_dgp("World 3")
df3 <- bind_rows(df1, df2, df3)
dgp_df |> ggplot(aes(x=gdpc, y=ldalys)) +
dgp_df geom_point(aes(color=world)) +
facet_wrap(vars(world)) +
theme_dsan(base_size=22) +
remove_legend() +
+
model_labels labs(title="Three Possible Realizations of our DGP")
Specifically: evaluating non-linear models on how well they generalize!
graph grid
{
graph [
overlap=true,
scale=0.2
]
nodesep=0.0
ranksep=0.0
rankdir="LR"
scale=0.2
node [
style="filled",
color=black,
fillcolor=lightblue,
shape=box
]
// uncomment to hide the grid
edge [style=invis]
subgraph cluster_01 {
label="Training Set (80%)"
subgraph cluster_02 {
label="Training Fold (80%)"
A1[label="16%"] A2[label="16%"] A3[label="16%"] A4[label="16%"]
}
subgraph cluster_03 {
label="Validation Fold (20%)"
B1[label="16%",fillcolor=lightgreen]
}
}
subgraph cluster_04 {
label="Test Set (20%)"
C1[label="20%",fillcolor=orange]
}
A1 -- A2 -- A3 -- A4 -- B1 -- C1;
}
library(boot)
set.seed(5300)
<- run_dgp(
sim200_df world_label="N=200", N=200, x_max=100
)<- run_dgp(
sim1k_df world_label="N=1000", N=1000, x_max=100
)<- function(df, min_deg=1, max_deg=12) {
compute_deltas <- c()
cv_deltas for (i in min_deg:max_deg) {
<- glm(ldalys ~ poly(gdpc, i), data=df)
cur_poly <- cv.glm(data=df, glmfit=cur_poly, K=5)
cur_poly_cv_result <- cur_poly_cv_result$delta[1]
cur_cv_adj <- c(cv_deltas, cur_cv_adj)
cv_deltas
}return(cv_deltas)
}<- compute_deltas(sim200_df)
sim200_deltas <- tibble(degree=1:12, delta=sim200_deltas)
sim200_delta_df |> ggplot(aes(x=degree, y=delta)) +
sim200_delta_df geom_line() +
geom_point() +
geom_vline(xintercept=5, linetype="dashed") +
scale_x_continuous(
breaks=seq(from=1,to=12,by=1)
+
) theme_dsan(base_size=22) +
labs(title="N = 200")
<- compute_deltas(sim1k_df)
sim1k_deltas <- tibble(degree=1:12, delta=sim1k_deltas)
sim1k_delta_df |> ggplot(aes(x=degree, y=delta)) +
sim1k_delta_df geom_line() +
geom_point() +
geom_vline(xintercept=5, linetype="dashed") +
scale_x_continuous(
breaks=seq(from=1,to=12,by=1)
+
) theme_dsan(base_size=22) +
labs(title="N = 1000")
(加 even more 油!)