Week 9: Evaluating Spatial Hypotheses I: Point Data

PPOL 6805 / DSAN 6750: GIS for Spatial Data Science
Fall 2024

Jeff Jacobs

jj1088@georgetown.edu

Wednesday, October 23, 2024

Roadmap to the Midterm!

  • This Week (Oct 23): Evaluating Hypotheses for Point Data
  • Next Week (Oct 30): Evaluating Hypotheses for Areal Data
  • In-Class Midterm (Nov 6): Basically a “mini-homework” on Spatial Data Science unit (topics from HW4 onwards)!

Our GIS Mystery-Solving Toolkit

Why Do Events Appear Where They Do?

Code
set.seed(6809)
N <- 60
r_core <- 0.05
obs_window <- square(1)
### Clustered data
clust_ppp <- rMatClust(
  kappa=6,
  scale=r_core,
  mu=10
)
clust_sf <- clust_ppp |> sf::st_as_sf()
clust_plot <- clust_sf |>
  ggplot() +
  geom_sf(size=2) +
  theme_classic(base_size=18)
ggsave("images/clust_ppp.png", clust_plot, width=3, height=3)
# Intensity fn
clust_intensity <- density(clust_ppp, sigma = 0.1)
png("images/clust_intensity.png")
par(mar=c(0,0,0,2), las=2, oma=c(0,0,0,0), cex=2)
plot(clust_intensity, main=NULL)
contour(clust_intensity, add = TRUE)
dev.off()
### PCF
clust_pcf <- spatstat.explore::pcf(
  clust_ppp, divisor="d",
  r=seq(from=0.00, to=0.50, by=0.01)
)
clust_pcf_plot <- clust_pcf |> ggplot(aes(x=r, y=iso)) +
  geom_hline(yintercept=1, linetype='dashed', linewidth=1) +
  geom_area(color='black', fill=cb_palette[1], alpha=0.75) +
  scale_x_continuous(breaks=seq(from=0.0, to=1.0, by=0.1)) +
  labs(x="Distance", y="Density") +
  theme_classic(base_size=14)
ggsave("images/clust_pcf.png", clust_pcf_plot, width=3, height=3)
Original Data First-Order Second-Order
\(N = 60\) Events Events modeled individually
\(\implies\) Intensity function \(\lambda(\mathbf{s})\)
Events modeled pairwise \(\implies\) \(K\)-function \(K(\vec{h})\)

What Do These Functions “Detect”?

Code
sq_base <- 16
sq_psize <- 2.5
obs_window <- square(1)
r0 <- 0.2
sq_df <- tibble::tribble(
  ~x, ~y,
  0.5-r0,0.5-r0,
  0.5+r0,0.5+r0,
  0.5-r0,0.5+r0,
  0.5+r0,0.5-r0
)
sq_sf <- sf::st_as_sf(
  sq_df,
  coords = c("x","y")
)
sq_ppp <- as.ppp(sq_sf, W=obs_window)
sq_ppp |> sf::st_as_sf() |> ggplot() +
  geom_sf(size=sq_psize) +
  theme_classic(base_size=sq_base)

Code
par(mar=c(0,0,0,2), las=2, oma=c(0,0,1,0))
sq_density <- density(sq_ppp)
plot(sq_density, main=NULL, xaxs='i', yaxs='i')
contour(sq_density, xaxs='i', yaxs='i', add = TRUE)

Code
### PCF
pcf_result <- spatstat.explore::pcf(
  sq_ppp,
  divisor="d",
  r=seq(from=0.00, to=0.8, by=0.01)
)
pcf_result |> ggplot(aes(x=r, y=iso)) +
  geom_hline(yintercept=1, linetype='dashed', linewidth=1.5) +
  geom_area(color='black', fill=cb_palette[1], alpha=0.75) +
  scale_x_continuous(breaks=seq(from=0.0, to=1.0, by=0.1)) +
  theme_classic(base_size=sq_base)

Code
csr_ppp <- spatstat.random::rpoispp(
  lambda = 60,
  win=obs_window
)
csr_ppp |> sf::st_as_sf() |> ggplot() +
  geom_sf(size=sq_psize) +
  theme_classic(base_size=sq_base)

Code
par(
  mar=c(0,0,0,2),
  las=2,
  oma=c(0,0,1,0)
)
csr_density <- density(csr_ppp)
plot(csr_density, main=NULL, xaxs="i", yaxs="i")
contour(csr_density, xaxs="i", yaxs="i", add = TRUE, lwd=1.5)

Code
csr_pcf_result <- spatstat.explore::pcf(
  csr_ppp,
  divisor="d",
  r=seq(from=0.00, to=0.8, by=0.01)
)
csr_pcf_result |> ggplot(aes(x=r, y=iso)) +
  geom_hline(yintercept=1, linetype='dashed', linewidth=1.5) +
  geom_area(color='black', fill=cb_palette[1], alpha=0.75) +
  scale_x_continuous(breaks=seq(from=0.0, to=1.0, by=0.1)) +
  theme_classic(base_size=sq_base)

The Point of All This: Null Models via Base Rates!

Null Models

  • Given an intensity function, we can compute a bunch of simulated point patterns (inhomogeneous Poisson Point Process)…
  • How does an observed point pattern differ from the simulated point patterns?

Constant Risk Hypothesis: Motivation

  • Here is a (fictional) map of flu cases in the US “Midwest”
  • Are people in Chicago and Detroit equally “at risk”?
Code
city_df <- tibble::tribble(
  ~city, ~lon, ~lat, ~pop,
  "Chicago", 41.950567516553896, -87.93011127491978, 2746388,
  "Detroit", 42.45123004999075, -83.18402319217698, 631524
)
city_sf <- sf::st_as_sf(
  city_df,
  coords = c("lat", "lon"),
  crs=4326
)
city_buf_sf <- city_sf |> sf::st_buffer(20000)
city_cases_sf <- city_buf_sf |> sf::st_sample(size=rep(10,2)) |> sf::st_as_sf()
city_cases_sf$city <- "Detroit (10 Cases)"
city_cases_sf[1:10, 'city'] <- "Chicago (10 Cases)"
city_cases_sf$sample <- "Flu"
mapview(city_cases_sf, zcol="city")

Constant Risk Hypothesis: Base Rates

Chicago Detroit
Population 2,746,388 631,524
Code
city_pop_sf <- city_buf_sf |> sf::st_sample(size=c(16, 4)) |> sf::st_as_sf()
city_pop_sf$city <- "Detroit"
city_pop_sf[1:16, 'city'] <- "Chicago"
city_pop_sf$sample <- "People"
city_combined_sf <- bind_rows(city_cases_sf, city_pop_sf)
# mapview(city_combined_sf, zcol="city", marker="sample")
Flu = makeIcon(
    "https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Maki2-danger-24.svg/240px-Maki2-danger-24.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Maki2-danger-24.svg/24px-Maki2-danger-24.svg.png",
    20,
    20
)
People = makeIcon(
  "https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Maki2-pitch-24.svg/24px-Maki2-pitch-24.svg.png",
  "https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Maki2-pitch-24.svg/24px-Maki2-pitch-24.svg.png",
  20,
  20
)

city_combined_sf$r <- ifelse(city_combined_sf$sample == "Flu", 0, 4)
city_flu_sf <- city_combined_sf |> filter(sample == "Flu")
city_ppl_sf <- city_combined_sf |> filter(sample == "People")
leaflet(city_flu_sf) |>
  addProviderTiles("CartoDB.Positron") |>
  addMarkers(data=city_flu_sf, icon = Flu) |>
  addMarkers(data=city_ppl_sf, icon = People)

Meaningful Results \(\leftrightarrow\) Comparisons with Null Model(s)

  • Cases of disease form an intensity function \(\lambda(\mathbf{s})\)

  • Controls form an “ambient” intensity function \(\lambda_0(\mathbf{s})\)

  • \(\implies\) The following model allows us to separate the quantities we really care about from the base rate information:

    \[ \lambda(\mathbf{s}) = \alpha \overbrace{\lambda_0(\mathbf{s})}^{\mathclap{\text{Base Rate}}} \underbrace{\rho(\mathbf{s})}_{\mathclap{\text{Relative Risk}}}, \]

    where

    \[ \alpha = \frac{\# \text{Cases}}{\# \text{Controls}} \]

  • (Remember: unit of \(\lambda(\mathbf{s})\) is number of cases; not a probability density!)

Example from Moraga (2024)

Code
library(sparr) |> suppressPackageStartupMessages()
data(pbc)
controls <- unmark(pbc[which(pbc$marks == "control"), ])
controls |> sf::st_as_sf() |> ggplot() +
  geom_sf() +
  theme_classic() +
  labs(title="Controls (Population Sample)")

Code
cases <- unmark(pbc[which(pbc$marks == "case"), ])
cases |> sf::st_as_sf() |> ggplot() +
  geom_sf() +
  theme_classic() +
  labs(title="PBC Cases")

Estimating Comparable Intensity Surfaces

  • To ensure comparable intensity functions, we need a common bandwidth parameter.
  • density() from spatstat makes “smart” choices for this bandwidth, so for now we just average the bandwidths estimated for cases and controls:
Code
library(sparr)
data(pbc)
cases <- unmark(pbc[which(pbc$marks == "case"), ])
controls <- unmark(pbc[which(pbc$marks == "control"), ])
bwcases <- attr(density(cases), "sigma")
bwcontr <- attr(density(controls), "sigma")
(bw <- (bwcases + bwcontr)/2)
[1] 11.45837
Code
int_controls <- density(controls, sigma = bw, eps=0.25)
plot(int_controls, main = NULL)

Code
int_cases <- density(cases, sigma = bw, eps=0.25)
plot(int_cases, main = NULL)

Visualizing Relative Risk Surface

  • All that’s left is \(\alpha = \# \text{Controls} / \# \text{Cases}\)!
Code
library(fields)
(alpha_hat <- cases$n/controls$n)
[1] 0.2519868
Code
x <- int_cases$xcol
y <- int_cases$yrow
rr <- t(int_cases$v)/t(alpha_hat * int_controls$v)
image.plot(x, y, rr, asp = 1)
title(xlab = "Easting", ylab = "Northing")

From Exploratory to Confirmatory Data Analysis

  • How do we know whether this raised risk around North Newcastle could be attributed to “chance”? Maybe relative risk is actually constant, and people in Newcastle are just unlucky
  • Approach: compare with null model

Null Model: CSR

Figure from Gimond (2024)

The Random Labeling Hypothesis

  • Randomly apply 761 “sick” labels to the 3781 points!
Code
set.seed(6810)
pbc_1 <- rlabel(pbc)
plot(pbc_1)

Code
set.seed(6811)
pbc_2 <- rlabel(pbc)
plot(pbc_2)

Code
set.seed(6812)
pbc_3 <- rlabel(pbc)
plot(pbc_3)

The Constant Risk Hypothesis

  • Everyone equally at risk of contracting disease, regardless of location: \(\rho(\mathbf{s}) = 1\)
  • In this case: \(\lambda_{CR}(\mathbf{s}) = \alpha \lambda_0(\mathbf{s})\)
Code
pbc_win_sf <- pbc |> sf::st_as_sf() |> filter(label == "window")
plot_pbc_sim <- function() {
    pbc_sim <- spatstat.random::rpoispp(
        lambda = alpha_hat * int_controls
    )
    # Separate window and points
    pbc_sf <- pbc_sim |> sf::st_as_sf()
    pbc_point_sf <- pbc_sf |> filter(label == "point")
    pbc_win_sf <- pbc_sf |> filter(label == "window")
    pbc_plot <- ggplot() +
      geom_sf(data=pbc_win_sf) +
      geom_sf(data=pbc_point_sf, size=0.5, alpha=0.5) +
      theme_classic()
    return(pbc_plot)
}
plot_pbc_sim()

Code
plot_pbc_sim()

Code
plot_pbc_sim()

References

Gimond, Manuel. 2024. Introduction to GIS and Spatial Analysis. Colby College (ES214). https://mgimond.github.io/Spatial/.