Code
source("../dsan-globals/_globals.r")
set.seed(5300)
DSAN 5300: Statistical Learning
Spring 2025, Georgetown University
Today’s Planned Schedule:
Start | End | Topic | |
---|---|---|---|
Lecture | 6:30pm | 6:50pm | Roadmap: Week 4 \(\leadsto\) Week 6 → |
6:50pm | 7:20pm | Non-Linear Data-Generating Processes → | |
7:20pm | 8:00pm | Validation: Evaluating Non-Linear Models → | |
Break! | 8:00pm | 8:10pm | |
8:10pm | 9:00pm | \(K\)-Fold Cross-Validation: Cooking with Gas → |
source("../dsan-globals/_globals.r")
set.seed(5300)
\[ \DeclareMathOperator*{\argmax}{argmax} \DeclareMathOperator*{\argmin}{argmin} \newcommand{\bigexp}[1]{\exp\mkern-4mu\left[ #1 \right]} \newcommand{\bigexpect}[1]{\mathbb{E}\mkern-4mu \left[ #1 \right]} \newcommand{\definedas}{\overset{\small\text{def}}{=}} \newcommand{\definedalign}{\overset{\phantom{\text{defn}}}{=}} \newcommand{\eqeventual}{\overset{\text{eventually}}{=}} \newcommand{\Err}{\text{Err}} \newcommand{\expect}[1]{\mathbb{E}[#1]} \newcommand{\expectsq}[1]{\mathbb{E}^2[#1]} \newcommand{\fw}[1]{\texttt{#1}} \newcommand{\given}{\mid} \newcommand{\green}[1]{\color{green}{#1}} \newcommand{\heads}{\outcome{heads}} \newcommand{\iid}{\overset{\text{\small{iid}}}{\sim}} \newcommand{\lik}{\mathcal{L}} \newcommand{\loglik}{\ell} \DeclareMathOperator*{\maximize}{maximize} \DeclareMathOperator*{\minimize}{minimize} \newcommand{\mle}{\textsf{ML}} \newcommand{\nimplies}{\;\not\!\!\!\!\implies} \newcommand{\orange}[1]{\color{orange}{#1}} \newcommand{\outcome}[1]{\textsf{#1}} \newcommand{\param}[1]{{\color{purple} #1}} \newcommand{\pgsamplespace}{\{\green{1},\green{2},\green{3},\purp{4},\purp{5},\purp{6}\}} \newcommand{\prob}[1]{P\left( #1 \right)} \newcommand{\purp}[1]{\color{purple}{#1}} \newcommand{\sign}{\text{Sign}} \newcommand{\spacecap}{\; \cap \;} \newcommand{\spacewedge}{\; \wedge \;} \newcommand{\tails}{\outcome{tails}} \newcommand{\Var}[1]{\text{Var}[#1]} \newcommand{\bigVar}[1]{\text{Var}\mkern-4mu \left[ #1 \right]} \]
library(tidyverse) |> suppressPackageStartupMessages()
library(plotly) |> suppressPackageStartupMessages()
<- read_csv("assets/dalys_cleaned.csv") daly_df
Rows: 162 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): name, gdp_cap
dbl (4): dalys_pc, population, gdp_pc_clean, log_dalys_pc
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- daly_df |> mutate(
daly_df gdp_pc_1k=gdp_pc_clean/1000
)<- labs(
model_labels x="GDP per capita ($1K PPP, 2021)",
y="Log(DALYs/n)",
title="Decrease in DALYs as GDP/n Increases"
)<- daly_df |> ggplot(aes(x=gdp_pc_1k, y=log_dalys_pc, label=name)) +
daly_plot geom_point() +
# geom_smooth(method="loess", formula=y ~ x) +
geom_smooth(method="lm", formula=y ~ poly(x,5), se=FALSE) +
theme_dsan(base_size=14) +
model_labelsggplotly(daly_plot)
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
\[ \begin{align*} \leadsto Y = &10.58 - 0.2346 X + 0.01396 X^2 \\ &- 0.0004 X^3 + 0.000005 X^4 \\ &- 0.00000002 X^5 + \varepsilon \end{align*} \]
<- function(x) {
eval_fitted_poly <- c(
coefs 10.58, -0.2346, 0.01396,
-0.0004156, 0.0000053527, -0.0000000244
)<- c(x^0, x^1, x^2, x^3, x^4, x^5)
x_terms <- sum(coefs * x_terms)
dot_prod return(dot_prod)
}<- 500
N <- runif(N, min=0, max=90)
x_vals <- sapply(X=x_vals, FUN=eval_fitted_poly)
y_vals <- tibble(gdpc=x_vals, ldalys=y_vals)
sim_df ggplot() +
geom_line(data=sim_df, aes(x=gdpc, y=ldalys)) +
geom_point(data=daly_df, aes(x=gdp_pc_1k, y=log_dalys_pc)) +
theme_dsan() +
model_labels
<- function(world_label="Sim", N=60, x_max=90) {
run_dgp <- runif(N, min=0, max=x_max)
x_vals <- sapply(X=x_vals, FUN=eval_fitted_poly)
y_raw <- rnorm(N, mean=0, sd=0.8)
y_noise <- y_raw + y_noise
y_vals <- tibble(
sim_df gdpc=x_vals,
ldalys=y_vals,
world=world_label
)return(sim_df)
}<- run_dgp("World 1")
df1 <- run_dgp("World 2")
df2 <- run_dgp("World 3")
df3 <- bind_rows(df1, df2, df3)
dgp_df |> ggplot(aes(x=gdpc, y=ldalys)) +
dgp_df geom_point(aes(color=world)) +
facet_wrap(vars(world)) +
theme_dsan(base_size=22) +
remove_legend() +
+
model_labels labs(title="Three Possible Realizations of our DGP")
Specifically: evaluating non-linear models on how well they generalize!
graph grid
{
graph [
overlap=true,
scale=0.2
]
nodesep=0.0
ranksep=0.0
rankdir="LR"
scale=0.2
node [
style="filled",
color=black,
fillcolor=lightblue,
shape=box
]
// uncomment to hide the grid
edge [style=invis]
subgraph cluster_01 {
label="Training Set (80%)"
subgraph cluster_02 {
label="Training Fold (80%)"
A1[label="16%"] A2[label="16%"] A3[label="16%"] A4[label="16%"]
}
subgraph cluster_03 {
label="Validation Fold (20%)"
B1[label="16%",fillcolor=lightgreen]
}
}
subgraph cluster_04 {
label="Test Set (20%)"
C1[label="20%",fillcolor=orange]
}
A1 -- A2 -- A3 -- A4 -- B1 -- C1;
}
library(boot)
set.seed(5300)
<- run_dgp(
sim200_df world_label="N=200", N=200, x_max=100
)<- run_dgp(
sim1k_df world_label="N=1000", N=1000, x_max=100
)<- function(df, min_deg=1, max_deg=12) {
compute_deltas <- c()
cv_deltas for (i in min_deg:max_deg) {
<- glm(ldalys ~ poly(gdpc, i), data=df)
cur_poly <- cv.glm(data=df, glmfit=cur_poly, K=5)
cur_poly_cv_result <- cur_poly_cv_result$delta[1]
cur_cv_adj <- c(cv_deltas, cur_cv_adj)
cv_deltas
}return(cv_deltas)
}<- compute_deltas(sim200_df)
sim200_deltas <- tibble(degree=1:12, delta=sim200_deltas)
sim200_delta_df |> ggplot(aes(x=degree, y=delta)) +
sim200_delta_df geom_line() +
geom_point() +
geom_vline(xintercept=5, linetype="dashed") +
scale_x_continuous(
breaks=seq(from=1,to=12,by=1)
+
) theme_dsan(base_size=22) +
labs(title="N = 200")
<- compute_deltas(sim1k_df)
sim1k_deltas <- tibble(degree=1:12, delta=sim1k_deltas)
sim1k_delta_df |> ggplot(aes(x=degree, y=delta)) +
sim1k_delta_df geom_line() +
geom_point() +
geom_vline(xintercept=5, linetype="dashed") +
scale_x_continuous(
breaks=seq(from=1,to=12,by=1)
+
) theme_dsan(base_size=22) +
labs(title="N = 1000")
(加 even more 油!)
\[ \varepsilon_{(K)} = \frac{1}{K}\sum_{i=1}^{K}\varepsilon^{\text{Val}}_{i} \]
\[ \begin{align*} \varepsilon_{(N)} =~ & \text{Err}(\mathbf{D}_1 \mid \mathbf{D}_{2:100}) + \text{Err}(\mathbf{D}_2 \mid \mathbf{D}_1, \mathbf{D}_{3:100}) \\ &+ \text{Err}(\mathbf{D}_3 \mid \mathbf{D}_{1:2}, \mathbf{D}_{4:100}) + \cdots + \text{Err}(\mathbf{D}_{100} \mid \mathbf{D}_{1:99}) \end{align*} \]