knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(sf)
library(sp)
library(spdep)
library(ggplot2)
library(gstat)
This Part 2 tutorial shows how to compute spatial correlograms (Moran’s I; Geary’s C) and a semivariogram from a simple CSV of points:
dummy_strong_spatial.csv with
COVIDincid (strictly positive; strong spatial
trend)dummy_no_spatial.csv with KuruPrev
(weak/no spatial trend)We’ll: 1. Load and project the data 2. Choose distance
bins (max lag = 1/3 of the maximum pairwise
distance) 3. Compute Moran’s I and
Geary’s C by distance ring 4. Compute a
semivariogram with gstat 5. Plot and
compare
Set
file_nameto either"dummy_strong_spatial.csv"(COVIDincid) or"dummy_no_spatial.csv"(KuruPrev).
If you use your own CSV, ensure it has columnsX,Y, and a numeric value column.
# Choose which CSV to use
file_name <- "dummy_strong_spatial.csv" # or "dummy_no_spatial.csv" (or your own path)
# Read CSV
df <- read.csv(file_name)
# Inspect columns
str(df)
## 'data.frame': 60 obs. of 3 variables:
## $ X : num 5.71 5.7 5.71 5.73 5.7 ...
## $ Y : num 50.8 50.8 50.8 50.8 50.9 ...
## $ COVIDincid: num 8.86 4.19 10.39 6.53 6.99 ...
head(df)
## X Y COVIDincid
## 1 5.709934 50.84042 8.864176
## 2 5.697235 50.84629 4.193381
## 3 5.712954 50.82787 10.389650
## 4 5.730461 50.82608 6.531042
## 5 5.695317 50.86625 6.994092
## 6 5.695317 50.87712 10.201321
# Convert lon/lat (EPSG:4326) to sf POINTS
pts <- st_as_sf(df, coords = c("X","Y"), crs = 4326)
# Project to a suitable projected CRS in meters (adjust EPSG as needed)
# Example: UTM zone 33N
pts <- st_transform(pts, 32633)
# Choose which value column to analyze:
# If using the included datasets, value column is named COVIDincid or KuruPrev.
candidate_cols <- intersect(c("COVIDincid","KuruPrev","Value"), names(pts))
stopifnot(length(candidate_cols) >= 1)
value_col <- candidate_cols[1]
value_col
## [1] "COVIDincid"
coords <- st_coordinates(pts) # matrix of projected x,y (meters)
x <- pts[[value_col]] # numeric variable
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.348 6.115 6.163 8.220 13.680
We’ll pick bins by: - computing the maximum pairwise distance among points, - setting max lag = 1/3 of that maximum, - choosing a fixed number of bins (e.g., 10).
Then we’ll visualize the distribution of pairwise distances with a histogram and use the bin edges as breaks.
# Pairwise Euclidean distances (meters)
dists <- as.numeric(dist(coords))
# Max lag and bins
max_distance <- max(dists) / 3
n_bins <- 10
edges <- seq(0, max_distance, length.out = n_bins + 1)
bins <- data.frame(
bin_id = 1:n_bins,
d_min = head(edges, -1),
d_max = tail(edges, -1)
)
bins
## bin_id d_min d_max
## 1 1 0.0000 394.5233
## 2 2 394.5233 789.0466
## 3 3 789.0466 1183.5699
## 4 4 1183.5699 1578.0932
## 5 5 1578.0932 1972.6165
## 6 6 1972.6165 2367.1398
## 7 7 2367.1398 2761.6631
## 8 8 2761.6631 3156.1863
## 9 9 3156.1863 3550.7096
## 10 10 3550.7096 3945.2329
### Histogram of pairwise distances with chosen bin edges overlaid
hist(dists,
main = "Pairwise distances",
xlab = "Distance (meters)",
ylab = "Pair count")
abline(v = edges, lty = 3) # draw the chosen bin edges as vertical lines
Here we define how to treat pairs of points within a given distance bin: if two points fall inside the same ring, we code them as neighbors (1); if they fall outside the ring (across bins), we code them as not neighbors (0). This creates a binary weights list for each ring.
# Build binary weights for a distance ring [d_min, d_max)
ring_listw <- function(coords, d_min, d_max, zero_ok = TRUE) {
# neighbors if separated by a distance within [d_min, d_max)
nb <- spdep::dnearneigh(coords, d1 = d_min, d2 = d_max, longlat = FALSE)
lw <- spdep::nb2listw(nb, style = "B", zero.policy = zero_ok)
list(nb = nb, lw = lw)
}
moran_ring <- function(x, coords, d_min, d_max, nsim = 499, zero_ok = TRUE) {
w <- ring_listw(coords, d_min, d_max, zero_ok)
# If no neighbor links exist in this ring, return NAs
if (sum(unlist(w$lw$weights), na.rm = TRUE) == 0) {
return(list(I = NA_real_, p_mc = NA_real_, n_links = 0))
}
mc <- spdep::moran.mc(x, w$lw, nsim = nsim,
zero.policy = zero_ok, alternative = "two.sided")
list(I = as.numeric(mc$statistic),
p_mc = mc$p.value,
n_links = sum(unlist(w$lw$weights)))
}
set.seed(123)
res_I <- do.call(rbind, lapply(seq_len(nrow(bins)), function(i) {
b <- bins[i,]
r <- moran_ring(x, coords, b$d_min, b$d_max, nsim = 499, zero_ok = TRUE)
data.frame(bin_id = b$bin_id, d_min = b$d_min, d_max = b$d_max,
I = r$I, p_mc = r$p_mc, n_links = r$n_links)
}))
res_I$mid <- (res_I$d_min + res_I$d_max)/2
res_I
## bin_id d_min d_max I p_mc n_links mid
## 1 1 0.0000 394.5233 0.06031051 0.448 46 197.2616
## 2 2 394.5233 789.0466 0.17674523 0.012 146 591.7849
## 3 3 789.0466 1183.5699 0.13912471 0.028 244 986.3082
## 4 4 1183.5699 1578.0932 0.30410906 0.000 294 1380.8315
## 5 5 1578.0932 1972.6165 0.08241205 0.112 390 1775.3548
## 6 6 1972.6165 2367.1398 0.08422195 0.136 346 2169.8781
## 7 7 2367.1398 2761.6631 -0.03621193 0.836 336 2564.4014
## 8 8 2761.6631 3156.1863 -0.13491529 0.144 292 2958.9247
## 9 9 3156.1863 3550.7096 -0.09299947 0.256 298 3353.4480
## 10 10 3550.7096 3945.2329 -0.15760420 0.124 220 3747.9713
ggplot(res_I, aes(mid, I)) +
geom_hline(yintercept = 0, linetype = 2) +
geom_point() + geom_line() +
labs(x = "Distance (bin midpoint, m)", y = "Moran's I",
title = paste0("Moran's I by distance (", value_col, ")")) +
theme_minimal()
geary_ring <- function(x, coords, d_min, d_max, nsim = 499, zero_ok = TRUE) {
w <- ring_listw(coords, d_min, d_max, zero_ok)
if (sum(unlist(w$lw$weights), na.rm = TRUE) == 0) {
return(list(C = NA_real_, p_mc = NA_real_, n_links = 0))
}
mc <- spdep::geary.mc(x, w$lw, nsim = nsim,
zero.policy = zero_ok, alternative = "two.sided")
list(C = as.numeric(mc$statistic),
p_mc = mc$p.value,
n_links = sum(unlist(w$lw$weights)))
}
res_C <- do.call(rbind, lapply(seq_len(nrow(bins)), function(i) {
b <- bins[i,]
r <- geary_ring(x, coords, b$d_min, b$d_max, nsim = 499, zero_ok = TRUE)
data.frame(bin_id = b$bin_id, d_min = b$d_min, d_max = b$d_max,
C = r$C, p_mc = r$p_mc, n_links = r$n_links)
}))
res_C$mid <- (res_C$d_min + res_C$d_max)/2
res_C
## bin_id d_min d_max C p_mc n_links mid
## 1 1 0.0000 394.5233 0.2302935 0.076 46 197.2616
## 2 2 394.5233 789.0466 0.3144322 0.004 146 591.7849
## 3 3 789.0466 1183.5699 0.4662629 0.004 244 986.3082
## 4 4 1183.5699 1578.0932 0.5632043 0.004 294 1380.8315
## 5 5 1578.0932 1972.6165 0.6014741 0.004 390 1775.3548
## 6 6 1972.6165 2367.1398 0.9062569 0.572 346 2169.8781
## 7 7 2367.1398 2761.6631 0.9560845 0.736 336 2564.4014
## 8 8 2761.6631 3156.1863 1.1326153 0.132 292 2958.9247
## 9 9 3156.1863 3550.7096 1.1598323 0.188 298 3353.4480
## 10 10 3550.7096 3945.2329 1.2284349 0.060 220 3747.9713
ggplot(res_C, aes(mid, C)) +
geom_hline(yintercept = 1, linetype = 2) +
geom_point() + geom_line() +
labs(x = "Distance (bin midpoint, m)", y = "Geary's C",
title = paste0("Geary's C by distance (", value_col, ")")) +
theme_minimal()
The semivariogram uses half the average squared difference between values at a given lag distance. It is variance-like (non-negative, in squared units) and usually increases with distance toward a sill.
# gstat works with sp classes; convert sf to Spatial
pts_sp <- as(pts, "Spatial")
vg <- gstat::variogram(as.formula(paste0(value_col, " ~ 1")), data = pts_sp)
plot(vg, main = paste0("Semivariogram (", value_col, ")"))
Change file_name at the top to: -
"dummy_no_spatial.csv" to use KuruPrev
(little/no spatial dependence), or - back to
"dummy_strong_spatial.csv" for COVIDincid
(clear trend).
Then re-knit and compare the correlograms and semivariograms.