Code
source("../../dsan-globals/_globals.r")
source("../../dsan-globals/_globals.r")
library(ggplot2)
library(tibble)
<- seq(from=0, to=1, by=0.02)
x_data <- length(x_data)
num_x <- x_data + runif(num_x, 0, 0.2)
y_data <- tibble(x=x_data, y=y_data)
reg_df ggplot(reg_df, aes(x=x, y=y)) +
geom_point(size=g_pointsize) +
dsan_theme("quarter")
psychology
trending_flat
ggplot(reg_df, aes(x=x, y=y)) +
geom_point(size=g_pointsize) +
geom_smooth(method = "lm", se = FALSE, color = cbPalette[1], formula = y ~ x, linewidth = g_linewidth*3) +
dsan_theme("quarter")
Find a line
<- 11
N <- seq(from = 0, to = 1, by = 1 / (N - 1))
x <- x + rnorm(N, 0, 0.25)
y <- mean(y)
mean_y <- y - mean_y
spread <- tibble(x = x, y = y, spread = spread)
df ggplot(df, aes(x=x, y=y)) +
geom_abline(slope=1, intercept=0, linetype="dashed", color=cbPalette[1], linewidth=g_linewidth*2) +
geom_segment(xend=(x+y)/2, yend=(x+y)/2, linewidth=g_linewidth*2, color=cbPalette[2]) +
geom_point(size=g_pointsize) +
coord_equal() +
dsan_theme("half") +
labs(
title = "Principal Component Line"
)ggplot(df, aes(x=x, y=y)) +
geom_abline(slope=1, intercept=0, linetype="dashed", color=cbPalette[1], linewidth=g_linewidth*2) +
geom_segment(xend=x, yend=x, linewidth=g_linewidth*2, color=cbPalette[2]) +
geom_point(size=g_pointsize) +
coord_equal() +
dsan_theme("half") +
labs(
title = "Regression Line"
)
On the difference between these two lines, and why it matters, I cannot recommend Gelman and Hill (2007) enough!
Principal Component Line can be used to project the data onto its dimension of highest variance
More simply: PCA can discover meaningful axes in data (unsupervised learning / exploratory data analysis settings)
library(readr)
library(ggplot2)
<- read_csv("assets/gdp_pca.csv")
gdp_df
<- function(x0, y0, a, c) {
dist_to_line <- abs(a * x0 - y0 + c)
numer <- sqrt(a * a + 1)
denom return(numer / denom)
}# Finding PCA line for industrial vs. exports
<- gdp_df$industrial
x <- gdp_df$exports
y <- function(lineParams, x0, y0) {
lossFn <- lineParams[1]
a <- lineParams[2]
c return(sum(dist_to_line(x0, y0, a, c)))
}<- optim(c(0, 0), lossFn, x0 = x, y0 = y)
o ggplot(gdp_df, aes(x = industrial, y = exports)) +
geom_point(size=g_pointsize/2) +
geom_abline(aes(slope = o$par[1], intercept = o$par[2], color="pca"), linewidth=g_linewidth, show.legend = TRUE) +
geom_smooth(aes(color="lm"), method = "lm", se = FALSE, linewidth=g_linewidth, key_glyph = "blank") +
scale_color_manual(element_blank(), values=c("pca"=cbPalette[2],"lm"=cbPalette[1]), labels=c("Regression","PCA")) +
dsan_theme("half") +
remove_legend_title() +
labs(
title = "PCA Line vs. Regression Line",
x = "Industrial Production (% of GDP)",
y = "Exports (% of GDP)"
)
See https://juliasilge.com/blog/un-voting/ for an amazing blog post using PCA, with 2 dimensions, to explore UN voting patterns!
ggplot(gdp_df, aes(pc1, .fittedPC2)) +
geom_point(size = g_pointsize/2) +
geom_hline(aes(yintercept=0, color='PCA Line'), linetype='solid', size=g_linesize) +
geom_rug(sides = "b", linewidth=g_linewidth/1.2, length = unit(0.1, "npc"), color=cbPalette[3]) +
expand_limits(y=-1.6) +
scale_color_manual(element_blank(), values=c("PCA Line"=cbPalette[2])) +
dsan_theme("full") +
remove_legend_title() +
labs(
title = "Exports vs. Industrial Production in Principal Component Space",
x = "First Principal Component (Dimension of Greatest Variance)",
y = "Second Principal Component"
)
library(dplyr)
library(tidyr)
<- gdp_df %>% select(c(country_code, pc1, agriculture, military))
plot_df <- plot_df %>% pivot_longer(!c(country_code, pc1), names_to = "var", values_to = "val")
long_df <- long_df |> mutate(
long_df var = case_match(
var,"agriculture" ~ "Agricultural Production",
"military" ~ "Military Spending"
)
)ggplot(long_df, aes(x = pc1, y = val, facet = var)) +
geom_point() +
facet_wrap(vars(var), scales = "free") +
dsan_theme("full") +
labs(
x = "Industrial-Export Dimension",
y = "% of GDP"
)
Given data
is chosen so that
R
<- lm(military ~ industrial, data=gdp_df)
lin_model summary(lin_model)
Call:
lm(formula = military ~ industrial, data = gdp_df)
Residuals:
Min 1Q Median 3Q Max
-2.3354 -1.0997 -0.3870 0.6081 6.7508
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.61969 0.59526 1.041 0.3010
industrial 0.05253 0.02019 2.602 0.0111 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.671 on 79 degrees of freedom
(8 observations deleted due to missingness)
Multiple R-squared: 0.07895, Adjusted R-squared: 0.06729
F-statistic: 6.771 on 1 and 79 DF, p-value: 0.01106
lm
Syntaxlm(
formula = dependent ~ independent + controls,
data = my_df
)
Call: lm(formula = military ~ industrial, data = gdp_df) Residuals: Min 1Q Median 3Q Max -2.3354 -1.0997 -0.3870 0.6081 6.7508
Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.61969 0.59526 1.041 0.3010 industrial 0.05253 0.02019 2.602 0.0111 *
--- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 1.671 on 79 degrees of freedom (8 observations deleted due to missingness) Multiple R-squared: 0.07895, Adjusted R-squared: 0.06729 F-statistic: 6.771 on 1 and 79 DF, p-value: 0.01106
Estimate | Std. Error | t value | Pr(>|t|) | ||
---|---|---|---|---|---|
(Intercept) | 0.61969 | 0.59526 | 1.041 | 0.3010 | |
industrial | 0.05253 | 0.02019 | 2.602 | 0.0111 | * |
Uncertainty | Test statistic | How extreme is test stat? | Statistical significance |
Estimate | Std. Error | t value | Pr(>|t|) | ||
---|---|---|---|---|---|
(Intercept) | 0.61969 | 0.59526 | 1.041 | 0.3010 | |
industrial | 0.05253 | 0.02019 | 2.602 | 0.0111 | * |
Uncertainty | Test statistic | How extreme is test stat? | Statistical significance |
library(ggplot2)
<- 1.041
int_tstat <- sprintf("%.02f", int_tstat)
int_tstat_str <- tribble(
label_df_int ~x, ~y, ~label,
0.25, 0.05, paste0("P(t > ",int_tstat_str,")\n= 0.3")
)<- tribble(
label_df_signif_int ~x, ~y, ~label,
2.7, 0.075, "95% Signif.\nCutoff"
)<- function(x, lower_bound, upper_bound){
funcShaded <- dnorm(x)
y < lower_bound | x > upper_bound] <- NA
y[x return(y)
}<- function(x) funcShaded(x, int_tstat, Inf)
funcShadedIntercept <- function(x) funcShaded(x, 1.96, Inf)
funcShadedSignif ggplot(data=data.frame(x=c(-3,3)), aes(x=x)) +
stat_function(fun=dnorm, linewidth=g_linewidth) +
geom_vline(aes(xintercept=int_tstat), linewidth=g_linewidth) +
geom_vline(aes(xintercept = 1.96), linewidth=g_linewidth, linetype="dashed") +
stat_function(fun = funcShadedIntercept, geom = "area", fill = cbPalette[1], alpha = 0.5) +
stat_function(fun = funcShadedSignif, geom = "area", fill = "grey", alpha = 0.333) +
geom_text(label_df_int, mapping = aes(x = x, y = y, label = label), size = 10) +
geom_text(label_df_signif_int, mapping = aes(x = x, y = y, label = label), size = 8) +
# Add single additional tick
scale_x_continuous(breaks=c(-2, 0, int_tstat, 2), labels=c("-2","0",int_tstat_str,"2")) +
dsan_theme("quarter") +
labs(
title = "t Value for Intercept",
x = "t",
y = "Density"
+
) theme(axis.text.x = element_text(colour = c("black", "black", cbPalette[1], "black")))
library(ggplot2)
<- 2.602
coef_tstat <- sprintf("%.02f", coef_tstat)
coef_tstat_str <- tribble(
label_df_coef ~x, ~y, ~label,
3.65, 0.06, paste0("P(t > ",coef_tstat_str,")\n= 0.01")
)<- tribble(
label_df_signif_coef ~x, ~y, ~label,
1.05, 0.03, "95% Signif.\nCutoff"
)<- function(x) funcShaded(x, coef_tstat, Inf)
funcShadedCoef ggplot(data=data.frame(x=c(-4,4)), aes(x=x)) +
stat_function(fun=dnorm, linewidth=g_linewidth) +
geom_vline(aes(xintercept=coef_tstat), linetype="solid", linewidth=g_linewidth) +
geom_vline(aes(xintercept=1.96), linetype="dashed", linewidth=g_linewidth) +
stat_function(fun = funcShadedCoef, geom = "area", fill = cbPalette[2], alpha = 0.5) +
stat_function(fun = funcShadedSignif, geom = "area", fill = "grey", alpha = 0.333) +
# Label shaded area
geom_text(label_df_coef, mapping = aes(x = x, y = y, label = label), size = 10) +
# Label significance cutoff
geom_text(label_df_signif_coef, mapping = aes(x = x, y = y, label = label), size = 8) +
coord_cartesian(clip = "off") +
# Add single additional tick
scale_x_continuous(breaks=c(-4, -2, 0, 2, coef_tstat, 4), labels=c("-4", "-2","0", "2", coef_tstat_str,"4")) +
dsan_theme("quarter") +
labs(
title = "t Value for Coefficient",
x = "t",
y = "Density"
+
) theme(axis.text.x = element_text(colour = c("black", "black", "black", "black", cbPalette[2], "black")))
library(broom)
<- augment(lin_model)
gdp_resid_df ggplot(gdp_resid_df, aes(x = .fitted, y = .resid)) +
geom_point(size = g_pointsize/2) +
geom_hline(yintercept=0, linetype="dashed") +
dsan_theme("quarter") +
labs(
title = "Residual Plot for Industrial ~ Military",
x = "Fitted Value",
y = "Residual"
)
<- 1:80
x <- rnorm(length(x), 0, x^2/1000)
errors <- x + errors
y <- lm(y ~ x)
het_model <- augment(het_model)
df_het ggplot(df_het, aes(x = .fitted, y = .resid)) +
geom_point(size = g_pointsize / 2) +
geom_hline(yintercept = 0, linetype = "dashed") +
dsan_theme("quarter") +
labs(
title = "Residual Plot for Heteroskedastic Data",
x = "Fitted Value",
y = "Residual"
)
ggplot(df_het, aes(sample=.resid)) +
stat_qq(size = g_pointsize/2) + stat_qq_line(linewidth = g_linewidth) +
dsan_theme("half") +
labs(
title = "Q-Q Plot for Heteroskedastic Data",
x = "Normal Distribution Quantiles",
y = "Observed Data Quantiles"
)
ggplot(gdp_resid_df, aes(sample=.resid)) +
stat_qq(size = g_pointsize/2) + stat_qq_line(linewidth = g_linewidth) +
dsan_theme("half") +
labs(
title = "Q-Q Plot for Industrial ~ Military Residuals",
x = "Normal Distribution Quantiles",
y = "Observed Data Quantiles"
)