DSAN 5300: Statistical Learning
Spring 2025, Georgetown University
Monday, March 10, 2025
Today’s Planned Schedule:
| | Start | End | Topic |
|---|---|---|---|
| Lecture | 6:30pm | 7:00pm | Separating Hyperplanes → |
| | 7:00pm | 7:20pm | Max-Margin Classifiers → |
| | 7:20pm | 8:00pm | Support Vector Classifiers → |
| Break! | 8:00pm | 8:10pm | |
| | 8:10pm | 9:00pm | Support Vector Machines → |
\[ \DeclareMathOperator*{\argmax}{argmax} \DeclareMathOperator*{\argmin}{argmin} \newcommand{\bigexp}[1]{\exp\mkern-4mu\left[ #1 \right]} \newcommand{\bigexpect}[1]{\mathbb{E}\mkern-4mu \left[ #1 \right]} \newcommand{\definedas}{\overset{\small\text{def}}{=}} \newcommand{\definedalign}{\overset{\phantom{\text{defn}}}{=}} \newcommand{\eqeventual}{\overset{\text{eventually}}{=}} \newcommand{\Err}{\text{Err}} \newcommand{\expect}[1]{\mathbb{E}[#1]} \newcommand{\expectsq}[1]{\mathbb{E}^2[#1]} \newcommand{\fw}[1]{\texttt{#1}} \newcommand{\given}{\mid} \newcommand{\green}[1]{\color{green}{#1}} \newcommand{\heads}{\outcome{heads}} \newcommand{\iid}{\overset{\text{\small{iid}}}{\sim}} \newcommand{\lik}{\mathcal{L}} \newcommand{\loglik}{\ell} \DeclareMathOperator*{\maximize}{maximize} \DeclareMathOperator*{\minimize}{minimize} \newcommand{\mle}{\textsf{ML}} \newcommand{\nimplies}{\;\not\!\!\!\!\implies} \newcommand{\orange}[1]{\color{orange}{#1}} \newcommand{\outcome}[1]{\textsf{#1}} \newcommand{\param}[1]{{\color{purple} #1}} \newcommand{\pgsamplespace}{\{\green{1},\green{2},\green{3},\purp{4},\purp{5},\purp{6}\}} \newcommand{\prob}[1]{P\left( #1 \right)} \newcommand{\purp}[1]{\color{purple}{#1}} \newcommand{\sign}{\text{Sign}} \newcommand{\spacecap}{\; \cap \;} \newcommand{\spacewedge}{\; \wedge \;} \newcommand{\tails}{\outcome{tails}} \newcommand{\Var}[1]{\text{Var}[#1]} \newcommand{\bigVar}[1]{\text{Var}\mkern-4mu \left[ #1 \right]} \]
library(tidyverse) |> suppressPackageStartupMessages()
house_df <- tibble::tribble(
~sqm, ~yrs, ~Rating,
10, 5, "Disliked",
10, 20, "Disliked",
20, 22, "Liked",
25, 12, "Liked",
11, 15, "Disliked",
22.1, 5, "Liked"
) |> mutate(
label = ifelse(Rating == "Liked", 1, -1)
)
base_plot <- house_df |> ggplot(aes(x=sqm, y=yrs)) +
labs(
title = "Jeff's House Search",
x = "Square Meters",
y = "Years Old"
) +
expand_limits(x=c(0,25), y=c(0,25)) +
coord_equal() +
# Shape 95 is an underscore/minus glyph, 43 is a plus sign
scale_shape_manual(values=c(95, 43)) +
theme_dsan(base_size=14)
base_plot +
geom_point(
aes(color=Rating, shape=Rating), size=g_pointsize * 0.9,
stroke=6
) +
geom_point(aes(fill=Rating), color='black', shape=21, size=6, stroke=0.75, alpha=0.333)
set.seed(5300)
# Returns TRUE if the plane with coefficients beta_vec = (b0, b1, b2)
# separates Liked from Disliked: all signed margins share the same sign
is_separating <- function(beta_vec) {
beta_str <- paste0(beta_vec, collapse=",")
# print(paste0("is_separating: ",beta_str))
margins <- c()
for (i in 1:nrow(house_df)) {
cur_data <- house_df[i,]
# print(cur_data)
linear_comb <- beta_vec[1] + beta_vec[2] * cur_data$sqm + beta_vec[3] * cur_data$yrs
cur_margin <- cur_data$label * linear_comb
# print(cur_margin)
margins <- c(margins, cur_margin)
}
#print(margins)
return(all(margins > 0) | all(margins < 0))
}
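As a quick sanity check (illustrative calls, not part of the original lecture code), one of the custom separating lines defined next passes, while a horizontal boundary fails:
is_separating(c(165, -8, -1)) # TRUE: every margin has the same sign
is_separating(c(0, 0, 1))     # FALSE: the sign of label * yrs is mixed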
cust_rand_lines_df <- tribble(
~b0, ~b1, ~b2,
# 41, -0.025, -1,
165, -8, -1,
-980, 62, -1
) |> mutate(
slope=-(b1/b2),
intercept=-(b0/b2)
)
num_lines <- 20
rand_b0 <- runif(num_lines, min=-40, max=40)
rand_b1 <- runif(num_lines, min=-2, max=2)
# rand_b2 <- -1 + 2*rbernoulli(num_lines)
rand_b2 <- -1
rand_lines_df <- tibble::tibble(
id=1:num_lines,
b0=rand_b0,
b1=rand_b1,
b2=rand_b2
) |> mutate(
slope=-(b1/b2),
intercept=-(b0/b2)
)
rand_lines_df <- bind_rows(rand_lines_df, cust_rand_lines_df)
# Old school for loop
for (i in 1:nrow(rand_lines_df)) {
cur_line <- rand_lines_df[i,]
cur_beta_vec <- c(cur_line$b0, cur_line$b1, cur_line$b2)
cur_is_sep <- is_separating(cur_beta_vec)
rand_lines_df[i, "is_sep"] <- cur_is_sep
}
base_plot +
geom_abline(
data=rand_lines_df, aes(slope=slope, intercept=intercept), linetype="dashed"
) +
geom_point(
aes(color=Rating, shape=Rating), size=g_pointsize * 0.9,
stroke=6
) +
geom_point(aes(fill=Rating), color='black', shape=21, size=6, stroke=0.75, alpha=0.333) +
labs(
title = paste0("10 Boundary Guesses"),
x = "Square Meters",
y = "Years Old"
)
(…Which one is “best”?)
base_plot +
geom_abline(
data=rand_lines_df, aes(slope=slope, intercept=intercept, linetype=is_sep)
) +
geom_abline(
data=rand_lines_df |> filter(is_sep),
aes(slope=slope, intercept=intercept),
linewidth=3, color=cb_palette[4], alpha=0.333
) +
scale_linetype_manual("Separating?", values=c("dotted", "dashed")) +
geom_point(
aes(color=Rating, shape=Rating), size=g_pointsize * 0.9,
stroke=6
) +
geom_point(aes(fill=Rating), color='black', shape=21, size=6, stroke=0.75, alpha=0.333) +
labs(
title = paste0("The Like vs. Dislike Boundary: 10 Guesses"),
x = "Square Meters",
y = "Years Old"
)
sep_lines_df <- rand_lines_df |> filter(is_sep) |> mutate(
norm_slope = (-1)/slope
)
cur_line_df <- sep_lines_df |> filter(slope > 0)
# left_line_df
# And make one copy per point
cur_sup_df <- uncount(cur_line_df, nrow(house_df))
cur_sup_df <- bind_cols(cur_sup_df, house_df)
cur_sup_df <- cur_sup_df |> mutate(
# Line through each point perpendicular to the boundary
norm_intercept = yrs - norm_slope * sqm,
# Line through each point parallel to the boundary
margin_intercept = yrs - slope * sqm,
margin_intercept_gap = intercept - margin_intercept,
margin_intercept_inv = intercept + margin_intercept_gap,
# Foot of the perpendicular: where each point projects onto the boundary
norm_cross_x = -(norm_intercept - intercept) / (norm_slope - slope),
x_gap = norm_cross_x - sqm,
norm_cross_y = yrs + x_gap * norm_slope,
# Signed functional margin; small values flag the support vectors
vec_margin = label * (b0 + b1 * sqm + b2 * yrs),
is_sv = vec_margin <= 240
)
base_plot +
geom_abline(
data=cur_line_df, aes(slope=slope, intercept=intercept), linetype="solid"
) +
geom_abline(
data=cur_sup_df |> filter(is_sv),
aes(
slope=slope,
intercept=margin_intercept
),
linetype="dashed"
) +
geom_abline(
data=cur_sup_df |> filter(is_sv),
aes(
slope=slope,
intercept=margin_intercept_inv
),
linetype="dashed"
) +
geom_segment(
data=cur_sup_df |> filter(is_sv),
aes(x=sqm, y=yrs, xend=norm_cross_x, yend=norm_cross_y),
color=cb_palette[4], linewidth=3
) +
geom_segment(
data=cur_sup_df,
aes(x=sqm, y=yrs, xend=norm_cross_x, yend=norm_cross_y, linetype=is_sv)
) +
geom_point(
aes(color=Rating, shape=Rating), size=g_pointsize * 0.9,
stroke=6
) +
geom_point(aes(fill=Rating), color='black', shape=21, size=6, stroke=0.75, alpha=0.333) +
scale_linetype_manual("Support\nVector?", values=c("dotted", "solid")) +
labs(
title = paste0("Left Hyperplane Distances"),
x = "Square Meters",
y = "Years Old"
)
# New calculation: line with same slope but that hits the SV
# Point-slope form: y - y1 = m(x - x1), so...
# y - yrs = m(x - sqm) <=> y = m(x - sqm) + yrs <=> y = mx - m*sqm + yrs
# <=> intercept b = yrs - m*sqm
cur_line_df <- sep_lines_df |> filter(slope < 0)
# And make one copy per point
cur_sup_df <- uncount(cur_line_df, nrow(house_df))
cur_sup_df <- bind_cols(cur_sup_df, house_df)
# Same construction as for the left hyperplane above
cur_sup_df <- cur_sup_df |> mutate(
norm_intercept = yrs - norm_slope * sqm,
margin_intercept = yrs - slope * sqm,
margin_intercept_gap = intercept - margin_intercept,
margin_intercept_inv = intercept + margin_intercept_gap,
norm_cross_x = -(norm_intercept - intercept) / (norm_slope - slope),
x_gap = norm_cross_x - sqm,
norm_cross_y = yrs + x_gap * norm_slope,
vec_margin = abs(label * (b0 + b1 * sqm + b2 * yrs)),
is_sv = vec_margin <= 25
)
base_plot +
geom_abline(
data=cur_line_df, aes(slope=slope, intercept=intercept), linetype="solid"
) +
geom_abline(
data=cur_sup_df |> filter(is_sv),
aes(
slope=slope,
intercept=margin_intercept
),
linetype="dashed"
) +
geom_abline(
data=cur_sup_df |> filter(is_sv),
aes(
slope=slope,
intercept=margin_intercept_inv
),
linetype="dashed"
) +
# geom_abline(
# data=cur_line_df,
# aes(slope=slope, intercept=intercept),
# linewidth=3, color=cb_palette[4], alpha=0.333
# ) +
geom_segment(
data=cur_sup_df |> filter(vec_margin <= 18),
aes(x=sqm, y=yrs, xend=norm_cross_x, yend=norm_cross_y),
color=cb_palette[4], linewidth=3
) +
geom_segment(
data=cur_sup_df,
aes(x=sqm, y=yrs, xend=norm_cross_x, yend=norm_cross_y, linetype=is_sv)
) +
geom_point(
aes(color=Rating, shape=Rating), size=g_pointsize * 0.9,
stroke=6
) +
geom_point(aes(fill=Rating), color='black', shape=21, size=6, stroke=0.75, alpha=0.333) +
scale_linetype_manual("Support\nVector?", values=c("dotted", "solid")) +
labs(
title = paste0("Right Hyperplane Margin"),
x = "Square Meters",
y = "Years Old"
)
Let \(y_i = \begin{cases} +1 &\text{if house }i\text{ Liked} \\ -1 &\text{if house }i\text{ Disliked}\end{cases}\)
\[ \begin{align*} \underset{\beta_0, \beta_1, \beta_2, M}{\text{maximize}}\text{ } & M \\ \text{s.t. } & y_i(\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2}) \geq M, \\ ~ & \beta_1^2 + \beta_2^2 = 1 \end{align*} \]
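Before handing this to a real solver, a minimal brute-force sketch (not the lecture's method) can make the optimization concrete: sample candidate planes whose slope coefficients satisfy the unit-norm constraint, and keep the one whose smallest signed margin \(M\) is largest (assumes house_df from above; the intercept range is an arbitrary choice for this data):
set.seed(5300)
best_M <- -Inf
best_beta <- c(NA, NA, NA)
for (t in 1:100000) {
  b12 <- rnorm(2)
  b12 <- b12 / sqrt(sum(b12^2))       # enforce beta_1^2 + beta_2^2 = 1
  b0 <- runif(1, min = -50, max = 50) # intercept range chosen for this data
  # M for this candidate = smallest signed margin across all houses
  M <- with(house_df, min(label * (b0 + b12[1] * sqm + b12[2] * yrs)))
  if (M > best_M) {
    best_M <- M
    best_beta <- c(b0, b12)
  }
}
best_M    # approximate maximal margin (in data units)
best_beta # approximate (beta_0, beta_1, beta_2)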
library(e1071)
liked <- as.factor(house_df$Rating == "Liked")
cent_df <- house_df
cent_df$sqm <- scale(cent_df$sqm)
cent_df$yrs <- scale(cent_df$yrs)
svm_model <- svm(liked ~ sqm + yrs, data=cent_df, kernel="linear")
cf <- coef(svm_model)
sep_intercept <- -cf[1] / cf[3]
sep_slope <- -cf[2] / cf[3]
# Invert Z-scores
sd_ratio <- sd(house_df$yrs) / sd(house_df$sqm)
inv_slope <- sd_ratio * sep_slope
inv_intercept <- mean(house_df$yrs) - inv_slope * mean(house_df$sqm) + sd(house_df$yrs)*sep_intercept
# And the margin boundary: parallel lines through the first support vector
sv_index <- svm_model$index[1]
sv_sqm <- house_df$sqm[sv_index]
sv_yrs <- house_df$yrs[sv_index]
margin_intercept <- sv_yrs - inv_slope * sv_sqm
margin_diff <- inv_intercept - margin_intercept
margin_intercept_inv <- inv_intercept + margin_diff
base_plot +
# (coord_equal, shapes, and theme are already set in base_plot)
geom_abline(
intercept=inv_intercept, slope=inv_slope, linetype="solid"
) +
geom_abline(
intercept=margin_intercept, slope=inv_slope, linetype="dashed"
) +
geom_abline(
intercept=margin_intercept_inv, slope=inv_slope, linetype="dashed"
) +
geom_point(
aes(color=Rating, shape=Rating), size=g_pointsize * 0.9,
stroke=6
) +
geom_point(aes(fill=Rating), color='black', shape=21, size=6, stroke=0.75, alpha=0.333) +
labs(
title = "Optimal Max-Margin Hyperplane",
x = "Square Meters",
y = "Years Old"
)
# Generate a Gaussian blob of Disliked houses + a
# Gaussian blob of Liked houses :3
library(mvtnorm) |> suppressPackageStartupMessages()
set.seed(5304)
num_houses <- 100
# Shared covariance matrix
Sigma_all <- matrix(c(12,0,0,20), nrow=2, ncol=2, byrow=TRUE)
# Negative datapoints
mu_neg <- c(10, 12.5)
neg_matrix <- rmvnorm(num_houses/2, mean=mu_neg, sigma=Sigma_all)
colnames(neg_matrix) <- c("sqm", "yrs")
neg_df <- as_tibble(neg_matrix) |> mutate(Rating="Disliked")
# Positive datapoints
mu_pos <- c(21, 12.5)
pos_matrix <- rmvnorm(num_houses/2, mean=mu_pos, sigma=Sigma_all)
colnames(pos_matrix) <- c("sqm", "yrs")
pos_df <- as_tibble(pos_matrix) |> mutate(Rating="Liked")
# And combine
nonsep_df <- bind_rows(neg_df, pos_df)
nonsep_df <- nonsep_df |> filter(yrs >= 5 & sqm <= 24 & sqm >= 7)
# Plot
nonsep_plot <- nonsep_df |> ggplot(aes(x=sqm, y=yrs)) +
labs(
title = "Jeff's House Search",
x = "Square Meters",
y = "Years Old"
) +
# xlim(6,25) + ylim(2,22) +
coord_equal() +
# Shape 95 is an underscore/minus glyph, 43 is a plus sign
scale_shape_manual(values=c(95, 43)) +
theme_dsan(base_size=22)
nonsep_plot +
geom_point(
aes(color=Rating, shape=Rating), size=g_pointsize * 0.25,
stroke=6
) +
geom_point(aes(fill=Rating), color='black', shape=21, size=4, stroke=0.75, alpha=0.333)
ISLR Figure 9.5: Notice how the dashed line on the right side may be better in terms of generalization, even though the solid line is the optimal Max-Margin Hyperplane!
\[ \begin{align*} \underset{\beta_0, \beta_1, \beta_2, \varepsilon_1, \ldots, \varepsilon_n, M}{\text{maximize}}\text{ } \; & M \\ \text{s.t. } \; & y_i(\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2}) \geq M(1 - \varepsilon_i), \\ ~ & \varepsilon_i \geq 0, \sum_{i=1}^{n}\varepsilon_i \leq C, \\ ~ & \beta_1^2 + \beta_2^2 = 1 \end{align*} \]
library(e1071)
liked <- as.factor(nonsep_df$Rating == "Liked")
nonsep_cent_df <- nonsep_df
nonsep_cent_df$sqm <- scale(nonsep_cent_df$sqm)
nonsep_cent_df$yrs <- scale(nonsep_cent_df$yrs)
# Compute the boundary for several values of e1071's `cost` penalty
# (note: a large cost corresponds to a *small* violation budget C above)
budget_vals <- c(0.01, 1, 5)
svm_df <- tibble(
sep_slope=numeric(), sep_intercept=numeric(),
inv_slope=numeric(), inv_intercept=numeric(),
margin_intercept=numeric(), margin_intercept_inv=numeric(),
budget=numeric(), budget_label=character()
)
for (cur_c in budget_vals) {
# print(cur_c)
svm_model <- svm(liked ~ sqm + yrs, data=nonsep_cent_df, kernel="linear", cost=cur_c)
cf <- coef(svm_model)
sep_intercept <- -cf[1] / cf[3]
sep_slope <- -cf[2] / cf[3]
# Invert Z-scores
sd_ratio <- sd(nonsep_df$yrs) / sd(nonsep_df$sqm)
inv_slope <- sd_ratio * sep_slope
inv_intercept <- mean(nonsep_df$yrs) - inv_slope * mean(nonsep_df$sqm) + sd(nonsep_df$yrs)*sep_intercept
# And the margin boundary: parallel lines through the first support vector
# (approximate here, since support vectors can violate the margin)
sv_index <- svm_model$index[1]
sv_sqm <- nonsep_df$sqm[sv_index]
sv_yrs <- nonsep_df$yrs[sv_index]
margin_intercept <- sv_yrs - inv_slope * sv_sqm
margin_diff <- inv_intercept - margin_intercept
margin_intercept_inv <- inv_intercept + margin_diff
cur_svm_row <- tibble_row(
budget = cur_c,
budget_label = paste0("Penalty = ",cur_c),
sep_slope = sep_slope,
sep_intercept = sep_intercept,
inv_slope = inv_slope,
inv_intercept = inv_intercept,
margin_intercept = margin_intercept,
margin_intercept_inv = margin_intercept_inv
)
svm_df <- bind_rows(svm_df, cur_svm_row)
}
ggplot() +
# xlim(6,25) + ylim(2,22) +
coord_equal() +
# Shape 95 is an underscore/minus glyph, 43 is a plus sign
scale_shape_manual(values=c(95, 43)) +
theme_dsan(base_size=14) +
geom_abline(
data=svm_df,
aes(intercept=inv_intercept, slope=inv_slope),
linetype="solid"
) +
geom_abline(
data=svm_df,
aes(intercept=margin_intercept, slope=inv_slope),
linetype="dashed"
) +
geom_abline(
data=svm_df,
aes(intercept=margin_intercept_inv, slope=inv_slope),
linetype="dashed"
) +
geom_point(
data=nonsep_df,
aes(x=sqm, y=yrs, color=Rating, shape=Rating), size=g_pointsize * 0.1,
stroke=5
) +
geom_point(
data=nonsep_df,
aes(x=sqm, y=yrs, fill=Rating), color='black', shape=21, size=3, stroke=0.75, alpha=0.333
) +
labs(
title = "Support Vector Classifier",
x = "Square Meters",
y = "Years Old"
) +
facet_wrap(vars(budget_label), nrow=1) +
theme(
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.4)
)
\[ \text{Nonlinear model} = \text{Linear model} \underbrace{- \; \text{linearity restriction}}_{\text{Enables overfitting...}} \underbrace{+ \text{ Complexity penalty}}_{\text{Prevent overfitting}} \]
From MathWorks
This is where stuff gets super…
Linear
1D Hyperplane = point! But no separating point here 😰
library(latex2exp) |> suppressPackageStartupMessages()
x_vals <- runif(50, min=-9, max=9)
data_df <- tibble(x=x_vals) |> mutate(
label = factor(ifelse(abs(x) >= 6, 1, -1)),
x2 = x^2
)
data_df |> ggplot(aes(x=x, y=0, color=label)) +
geom_point(
aes(color=label, shape=label), size=g_pointsize,
stroke=6
) +
geom_point(
aes(fill=label), color='black', shape=21, size=6, stroke=0.75, alpha=0.333
) +
# xlim(6,25) + ylim(2,22) +
# coord_equal() +
# Shape 95 is an underscore/minus glyph, 43 is a plus sign
scale_shape_manual(values=c(95, 43)) +
theme_dsan(base_size=28) +
theme(
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank()
) +
xlim(-10, 10) +
ylim(-0.1, 0.1) +
labs(title="Non-Linearly-Separable Data") +
theme(
plot.margin = unit(c(0,0,0,20), "mm")
)
data_df |> ggplot(aes(x=x, y=x2, color=label)) +
geom_point(
aes(color=label, shape=label), size=g_pointsize,
stroke=6
) +
geom_point(
aes(fill=label), color='black', shape=21, size=6, stroke=0.75, alpha=0.333
) +
geom_hline(yintercept=36, linetype="dashed") +
# xlim(6,25) + ylim(2,22) +
# coord_equal() +
# Shape 95 is an underscore/minus glyph, 43 is a plus sign
scale_shape_manual(values=c(95, 43)) +
theme_dsan(base_size=28) +
xlim(-10, 10) +
labs(
title = TeX("...Linearly Separable in $R^2$!"),
y = TeX("$x^2$")
)
Radial Basis Function (RBF) Kernel:
\[ K(x_i, x_{i'}) = \exp\left[ -\gamma \sum_{j=1}^{p} (x_{ij} - x_{i'j})^2 \right] \]
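As a plain R function (a minimal sketch; the gamma value and test points below are arbitrary), the kernel is just a similarity measure that decays with squared distance:
# RBF kernel between two feature vectors
rbf_kernel <- function(x, x_prime, gamma = 1) {
  exp(-gamma * sum((x - x_prime)^2))
}
rbf_kernel(c(1, 2), c(1.5, 2.5)) # nearby points: exp(-0.5) ~= 0.61
rbf_kernel(c(1, 2), c(8, -5))    # distant points: essentially 0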
library(kernlab) |> suppressPackageStartupMessages()
library(mlbench) |> suppressPackageStartupMessages()
if (!file.exists("assets/linspike_df.rds")) {
set.seed(5300)
N <- 120
x1_vals <- runif(N, min=-5, max=5)
x2_raw <- x1_vals
x2_noise <- rnorm(N, mean=0, sd=1.25)
x2_vals <- x2_raw + x2_noise
linspike_df <- tibble(x1=x1_vals, x2=x2_vals) |>
mutate(
label = factor(ifelse(x1^2 + x2^2 <= 2.75, 1, -1))
)
linspike_svm <- ksvm(
label ~ x1 + x2,
data = linspike_df,
kernel = "rbfdot",
C = 500,
prob.model = TRUE
)
# Grid over which to evaluate decision boundaries
npts <- 500
lsgrid <- expand.grid(
x1 = seq(from = -5, to = 5, length.out = npts),
x2 = seq(from = -5, to = 5, length.out = npts)
)
# Predicted probabilities (as a two-column matrix)
prob_svm <- predict(
linspike_svm,
newdata = lsgrid,
type = "probabilities"
)
# Add predicted class probabilities
lsgrid2 <- lsgrid |>
cbind("SVM" = prob_svm[, 1L]) |>
tidyr::gather(Model, Prob, -x1, -x2)
# Serialize for quicker rendering
saveRDS(linspike_df, "assets/linspike_df.rds")
saveRDS(lsgrid2, "assets/lsgrid2.rds")
} else {
linspike_df <- readRDS("assets/linspike_df.rds")
lsgrid2 <- readRDS("assets/lsgrid2.rds")
}
linspike_df <- linspike_df |> mutate(
Label = label
)
linspike_df |> ggplot(aes(x = x1, y = x2)) +
# geom_point(aes(shape = label, color = label), size = 3, alpha = 0.75) +
geom_point(aes(shape = Label, color = Label), size = 3, stroke=4) +
geom_point(aes(fill=Label), color='black', shape=21, size=4, stroke=0.75, alpha=0.4) +
xlab(expression(X[1])) +
ylab(expression(X[2])) +
coord_fixed() +
theme_dsan(base_size=28) +
# Complete theme goes first so it doesn't override legend.position
theme(legend.position = "none") +
xlim(-5, 5) + ylim(-5, 5) +
stat_contour(
data = lsgrid2,
aes(x = x1, y = x2, z = Prob),
breaks = 0.5,
color = "black"
) +
scale_shape_manual(values=c(95, 43))
if (!file.exists("assets/spiral_df.rds")) {
spiral_df <- as.data.frame(
mlbench.spirals(300, cycles = 2, sd = 0.09)
)
names(spiral_df) <- c("x1", "x2", "label")
# Fit SVM using a RBF kernel
spirals_svm <- ksvm(
label ~ x1 + x2,
data = spiral_df,
kernel = "rbfdot",
C = 500,
prob.model = TRUE
)
# Grid over which to evaluate decision boundaries
npts <- 500
xgrid <- expand.grid(
x1 = seq(from = -2, to = 2, length.out = npts),
x2 = seq(from = -2, to = 2, length.out = npts)
)
# Predicted probabilities (as a two-column matrix)
prob_svm <- predict(
spirals_svm,
newdata = xgrid,
type = "probabilities"
)
# Add predicted class probabilities
xgrid2 <- xgrid |>
cbind("SVM" = prob_svm[, 1L]) |>
tidyr::gather(Model, Prob, -x1, -x2)
# Serialize for quicker rendering
saveRDS(spiral_df, "assets/spiral_df.rds")
saveRDS(xgrid2, "assets/xgrid2.rds")
} else {
spiral_df <- readRDS("assets/spiral_df.rds")
xgrid2 <- readRDS("assets/xgrid2.rds")
}
# And plot
spiral_df <- spiral_df |> mutate(
Label = factor(ifelse(label == 2, 1, -1))
)
spiral_df |> ggplot(aes(x = x1, y = x2)) +
geom_point(aes(shape = Label, color = Label), size = 3, stroke=4) +
geom_point(aes(fill=Label), color='black', shape=21, size=4, stroke=0.75, alpha=0.4) +
xlab(expression(X[1])) +
ylab(expression(X[2])) +
xlim(-2, 2) +
ylim(-2, 2) +
coord_fixed() +
theme_dsan(base_size=28) +
# Complete theme goes first so it doesn't override legend.position
theme(legend.position = "none") +
stat_contour(
data = xgrid2,
aes(x = x1, y = x2, z = Prob),
breaks = 0.5,
color = "black"
) +
scale_shape_manual("Label", values=c(95, 43))
\[ f(\mathbf{x}) = \beta_0 + \sum_{i=1}^{n} \alpha_i \langle \mathbf{x}, \mathbf{x}_i \rangle \]
\[ f(\mathbf{x}) = \beta_0 + \sum_{i \in \mathcal{SV}} \alpha_i K(\mathbf{x}, \mathbf{x}_i) \]
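For the linear kernel we can recover this decision function by hand from an e1071 fit (a sketch assuming house_df from earlier; liked6 is a new helper variable, re-created here because `liked` was redefined above for the non-separable data; scale=FALSE keeps everything in original units):
liked6 <- as.factor(house_df$Rating)
m <- svm(liked6 ~ sqm + yrs, data = house_df, kernel = "linear", scale = FALSE)
X <- as.matrix(house_df[, c("sqm", "yrs")])
# m$coefs holds alpha_i * y_i for each support vector; m$rho is -beta_0
f_manual <- as.vector(X %*% t(m$SV) %*% m$coefs - m$rho)
f_e1071 <- attributes(predict(m, house_df, decision.values = TRUE))$decision.values
cbind(f_manual, f_e1071) # identical, up to e1071's sign convention for levels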
If we have a linearly-separating transformation (like \(f(x) = x^2\)), we can “encode” it as a kernel, saving computation
Example: a degree-2 feature transformation like \(f(x) = x^2\) is implicitly computed by the quadratic (degree-2 polynomial) kernel:
\[ K(x_i, x_{i'}) = (1 + \sum_{j = 1}^{p}x_{ij}x_{i'j})^2 \]
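A quick numeric check (a sketch with arbitrary points, for \(p = 2\)) that this kernel equals an ordinary inner product of explicitly transformed features \(\phi(x) = (1, \sqrt{2}x_1, \sqrt{2}x_2, x_1^2, x_2^2, \sqrt{2}x_1x_2)\):
# Explicit feature map corresponding to the degree-2 polynomial kernel
phi <- function(x) c(1, sqrt(2)*x[1], sqrt(2)*x[2], x[1]^2, x[2]^2, sqrt(2)*x[1]*x[2])
x <- c(2, -1); z <- c(0.5, 3)
(1 + sum(x * z))^2   # kernel evaluation: 1
sum(phi(x) * phi(z)) # same value via the explicit feature map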
If we don’t have a transformation (and are having trouble figuring one out), we can change the problem into one of finding a “good” similarity function
Example: Look again at RBF Kernel:
\[ K(x_i, x_{i'}) = \exp\left[ -\gamma \sum_{j=1}^{p} (x_{ij} - x_{i'j})^2 \right] \]
It turns out: no finite collection of transformed features is equivalent to this kernel! (Roughly: we can keep adding transformed features to approach it asymptotically, so the space of SVMs with kernels is strictly larger than the space of SVMs with finite feature transformations)
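One standard way to see this (not shown in the slides): since \(\lVert x - x' \rVert^2 = \lVert x \rVert^2 + \lVert x' \rVert^2 - 2\langle x, x' \rangle\), expanding the exponential gives
\[ K(x, x') = e^{-\gamma \lVert x \rVert^2} \, e^{-\gamma \lVert x' \rVert^2} \sum_{k=0}^{\infty} \frac{(2\gamma)^k}{k!} \langle x, x' \rangle^k, \]
where each \(\langle x, x' \rangle^k\) term corresponds to a block of degree-\(k\) polynomial features, so the implicit feature space is infinite-dimensional.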
DSAN 5300-01 Week 8: Support Vector Machines (SVMs)