Code
source("../dsan-globals/_globals.r")
set.seed(5300)
DSAN 5300: Statistical Learning
Spring 2025, Georgetown University
Today’s Planned Schedule:
Start | End | Topic | |
---|---|---|---|
Lecture | 6:30pm | 7:00pm | Estimating Survival Curves → |
7:00pm | 7:20pm | Comparing Between Groups → | |
7:20pm | 8:00pm | Regression (Cox Proportional Hazard Model) → | |
Break! | 8:00pm | 8:10pm | |
8:10pm | 9:00pm | Quiz 3 → |
source("../dsan-globals/_globals.r")
set.seed(5300)
Patient ( |
Observed Outcome ( |
Observed? ( |
---|---|---|
1 | 300 | 1 |
2 | 365 | 0 |
3 | 150 | 1 |
4 | 250 | 0 |
Patient ( |
Survival Time ( |
Censor Point ( |
||
---|---|---|---|---|
1 | 300 | 1 | 300 | NA |
2 | 365 | 0 | NA |
365 |
3 | 150 | 1 | 150 | NA |
4 | 250 | 0 | NA |
250 |
…If we’re testing effect of treatment, which column do we most care about?
Patient ( |
Survival Time ( |
Censor Point ( |
||
---|---|---|---|---|
1 | 300 | 1 | 300 | NA |
2 | 365 | 0 | NA |
365 |
3 | 150 | 1 | 150 | NA |
4 | 250 | 0 | NA |
250 |
…If we’re testing effect of treatment,
Each death event
Break
Gives us a recurrence relation:
Plug each eq into eq above it to derive:
Two death points:
library(tidyverse) |> suppressPackageStartupMessages()
library(survival) |> suppressPackageStartupMessages()
library(latex2exp) |> suppressPackageStartupMessages()
<- tribble(
surv_df ~id, ~y, ~delta,
1, 300, 1,
2, 365, 0,
3, 150, 1,
4, 250, 0
)<- Surv(surv_df$y, event = surv_df$delta)
surv_obj <- survfit(surv_obj ~ 1)
surv_model # Plot options
par(mar=c(2,4,1.25,1.0)) # bltr
<- TeX("$\\Pr(T > t)$")
y_label plot(
surv_model,ylab=y_label,
lwd=1,
main="Survival Curve for 4-Patient Example"
# conf.int=FALSE
) # Add colors
# lines(c(0, 150), c(1.0, 1.0), type='l', col='#E69F00', lwd=2)
rect(xleft = 0, xright = 150, ybottom = 0, ytop = 1.0, col="#E69F0040", lwd=0)
# lines(c(150, 300), c(3/4, 3/4), type='l', col='#56B4E9', lwd=2)
rect(xleft = 150, xright = 300, ybottom = 0, ytop = 1.0, col="#56B4E940", lwd=0)
# lines(c(300, 365), c(3/8, 3/8), type='l', col='#009E73', lwd=2)
rect(xleft = 300, xright = 365, ybottom = 0, ytop = 1.0, col="#009E7340", lwd=0)
Group 1 | Group 2 | Total | |
---|---|---|---|
Died | |||
Survived | |||
Total |
Basically: Features
survival
library in R, very similar syntax to lm()
, glm()
, etc.!library(survival) |> suppressPackageStartupMessages()
library(ISLR2) |> suppressPackageStartupMessages()
<- BrainCancer |> filter(diagnosis != "Other")
bc_df $diagnosis = factor(bc_df$diagnosis)
bc_df$sex <- factor(substr(bc_df$sex, 1, 1))
bc_df$loc <- factor(substr(bc_df$loc, 1, 5))
bc_dfoptions(width=130)
summary(bc_df)
sex diagnosis loc ki gtv stereo status time
F:39 Meningioma:42 Infra:10 Min. : 40.00 Min. : 0.040 SRS:19 Min. :0.000 Min. : 0.07
M:34 LG glioma : 9 Supra:63 1st Qu.: 80.00 1st Qu.: 2.500 SRT:54 1st Qu.:0.000 1st Qu.: 9.77
HG glioma :22 Median : 80.00 Median : 6.480 Median :0.000 Median :26.46
Mean : 81.37 Mean : 8.297 Mean :0.411 Mean :27.83
3rd Qu.: 90.00 3rd Qu.:11.380 3rd Qu.:1.000 3rd Qu.:41.44
Max. :100.00 Max. :33.690 Max. :1.000 Max. :82.56
|> head(5) bc_df
sex | diagnosis | loc | ki | gtv | stereo | status | time |
---|---|---|---|---|---|---|---|
F | Meningioma | Infra | 90 | 6.11 | SRS | 0 | 57.64 |
M | HG glioma | Supra | 90 | 19.35 | SRT | 1 | 8.98 |
F | Meningioma | Infra | 70 | 7.95 | SRS | 0 | 26.46 |
F | LG glioma | Supra | 80 | 7.61 | SRT | 1 | 47.80 |
M | HG glioma | Supra | 90 | 5.06 | SRT | 1 | 6.30 |
coxph()
Estimationlibrary(broom) |> suppressPackageStartupMessages()
<- coxph(
full_cox_model Surv(time, status) ~ sex + diagnosis + loc + ki + gtv + stereo,
data=bc_df
)::tidy(full_cox_model) |> mutate_if(is.numeric, round, 3) broom
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
sexM | 0.456 | 0.396 | 1.154 | 0.249 |
diagnosisLG glioma | 0.864 | 0.641 | 1.347 | 0.178 |
diagnosisHG glioma | 2.116 | 0.470 | 4.504 | 0.000 |
locSupra | 1.482 | 1.105 | 1.342 | 0.180 |
ki | -0.048 | 0.020 | -2.406 | 0.016 |
gtv | 0.036 | 0.026 | 1.361 | 0.173 |
stereoSRT | -0.475 | 0.591 | -0.803 | 0.422 |
ki
): 1-unit increase associated with reduction of hazard to library(extrafont) |> suppressPackageStartupMessages()
par(cex=1.2, family="CMU Sans Serif")
<- c("Meningioma", "LG glioma", "HG glioma")
diag_levels <- tibble(
diag_df diagnosis = diag_levels,
sex = rep("F", 3),
loc = rep("Supra", 3),
ki = rep(mean(bc_df$ki), 3),
gtv = rep(mean(bc_df$gtv), 3),
stereo = rep("SRT", 3)
)<- survfit(full_cox_model, newdata = diag_df)
survplots plot(
survplots,main = "Survival Curves by Diagnosis",
xlab = "Months", ylab = "Survival Probability",
col = cb_palette, lwd=1.5
)legend(
"bottomleft",
diag_levels,col = cb_palette, lty = 1, lwd=1.5
)
ki
, gtv
Valki
gtv
|> ggplot(aes(x=ki)) +
bc_df geom_density(
linewidth=g_linewidth,
fill=cb_palette[1], alpha=0.333
+
) theme_dsan(base_size=28) +
labs(title = "Karnofsky Index Distribution")
|> ggplot(aes(x=gtv)) +
bc_df geom_density(
linewidth=g_linewidth,
fill=cb_palette[1], alpha=0.333
+
) theme_dsan(base_size=28) +
labs(title = "Gross Tumor Volume (GTV) Distribution")
<- quantile(bc_df$ki, c(1/3, 2/3))
ki_terciles <- bc_df |> mutate(
bc_df tercile = ifelse(ki < ki_terciles[1], 1, ifelse(ki < ki_terciles[2], 2, 3))
)<- bc_df |>
(terc_df group_by(tercile) |>
summarize(med_ki=median(ki)))
tercile | med_ki |
---|---|
1 | 70 |
2 | 80 |
3 | 90 |
library(latex2exp) |> suppressPackageStartupMessages()
<- tibble(
ki_df diagnosis = rep("Meningioma", 3),
sex = rep("F", 3),
loc = rep("Supra", 3),
ki = terc_df$med_ki,
gtv = rep(mean(bc_df$gtv), 3),
stereo = rep("SRT", 3)
)<- survfit(full_cox_model, newdata = ki_df)
ki_plots par(
mar=c(4.0,4.0,1.2,0.5),
cex=1.2,
family="CMU Sans Serif"
# bltr
) plot(
ki_plots,main = "Survival Curves by KI Tercile",
xlab = "Months",
ylab = TeX("$\\Pr(T > t)$"),
lwd = 1,
col = cb_palette
)<- c(
ki_labs TeX("$h( t \\, | \\, KI = 70 )$"),
TeX("$h( t \\, | \\, KI = 80 )$"),
TeX("$h( t \\, | \\, KI = 90 )$")
)legend(
"bottomleft",
lwd=1,
ki_labs, col = cb_palette, lty = 1, cex=0.8
)