Are the expression differences between Inv4m and B73 haplotypes explained by per-gene CDS sequence divergence, or do some genes show excess DE beyond what divergence predicts?
We use LAST alignments (B73 vs Mi21 CDS) already computed by the jcvi ortholog pipeline. Only genes within the shared introgression on chromosome 4 are analyzed.
library(here)
library(tidyverse)
library(ggrepel)
library(GenomicRanges)
library(MASS) # rlm (robust regression)
library(quantreg) # rq (quantile regression)
library(evd) # GPD fitting for extreme value analysis
library(ggtext) # richtext labels
library(ggbeeswarm) # beeswarm plots
library(ggfx) # shadow effects
source(here("scripts", "utils", "setup_paths.R"))
paths <- setup_project_paths("inversion_paper")
last_file <- "/Users/fvrodriguez/mnt/rrellan/DOE_CAREER/inv4m/nilhifimi21/microsynteny/B73.Mi21.last"
last_cols <- c("query", "subject", "pct_identity", "aln_len", "mismatches",
"gaps", "qstart", "qend", "sstart", "send", "evalue", "score")
last_raw <- read.table(last_file, header = FALSE, sep = "\t",
col.names = last_cols, comment.char = "#")
# Strip transcript suffix to get gene IDs
last_raw <- last_raw %>%
mutate(
b73_gene = sub("_T\\d+$", "", query),
mi21_gene = sub("^Mi21_", "", sub("_T\\d+$", "", subject))
)
cat("Raw LAST alignments:", nrow(last_raw), "\n")
## Raw LAST alignments: 1816628
degs <- read.csv(file.path(paths$intermediate, "deg_both_models_genotype.csv"))
cat("DEG results:", nrow(degs), "genes\n")
## DEG results: 465 genes
v5_gff_file <- file.path(paths$data, "Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3")
genes_gr <- rtracklayer::import(v5_gff_file)
genes_gr <- genes_gr[genes_gr$type == "gene" &
as.character(seqnames(genes_gr)) %in% as.character(1:10)]
genes <- as.data.frame(genes_gr)
genes$ID <- gsub("gene:", "", genes$ID)
# Boundaries
introgression_start <- 157012149
introgression_end <- 195900523
inv4m_start <- genes$start[genes$ID == "Zm00001eb190470"]
inv4m_end <- genes$end[genes$ID == "Zm00001eb194800"]
# Chr4 introgression genes
introgression_genes <- genes %>%
filter(seqnames == 4, start >= introgression_start, end <= introgression_end) %>%
mutate(location = ifelse(start >= inv4m_start & end <= inv4m_end,
"Inv4m", "flanking"))
cat("Introgression genes:", nrow(introgression_genes), "\n")
## Introgression genes: 1099
cat(" Inv4m:", sum(introgression_genes$location == "Inv4m"), "\n")
## Inv4m: 434
cat(" CTRL:", sum(introgression_genes$location == "flanking"), "\n")
## CTRL: 665
Keep the highest-scoring ortholog alignment for each B73 gene within the introgression.
best_hits <- last_raw %>%
filter(b73_gene %in% introgression_genes$ID) %>%
# Keep only self-ortholog hits (same gene ID in B73 and Mi21)
filter(b73_gene == mi21_gene) %>%
group_by(b73_gene) %>%
slice_max(score, n = 1, with_ties = FALSE) %>%
ungroup() %>%
dplyr::select(gene = b73_gene, pct_identity, aln_len, score)
cat("Genes with ortholog hits:", nrow(best_hits), "\n")
## Genes with ortholog hits: 902
cat("Identity range:", round(min(best_hits$pct_identity), 2), "-",
round(max(best_hits$pct_identity), 2), "%\n")
## Identity range: 84.07 - 100 %
# Load master locus labels
locus_labels <- read.csv(file.path(paths$data, "locus_labels_master.csv")) %>%
dplyr::select(gene, locus_label)
merged <- best_hits %>%
inner_join(degs, by = "gene") %>%
inner_join(
introgression_genes %>% dplyr::select(ID, location, start),
by = c("gene" = "ID")
) %>%
left_join(locus_labels, by = "gene") %>%
mutate(
cds_divergence = 100 - pct_identity,
abs_logFC = abs(logFC),
direction = ifelse(logFC > 0, "Up in Inv4m", "Down in Inv4m"),
display_label = coalesce(locus_label, gene)
)
cat("Merged dataset:", nrow(merged), "genes\n")
## Merged dataset: 254 genes
cat(" DEGs (FDR < 0.05):", sum(merged$adj.P.Val < 0.05), "\n")
## DEGs (FDR < 0.05): 254
cat(" Up:", sum(merged$direction == "Up in Inv4m"),
" Down:", sum(merged$direction == "Down in Inv4m"), "\n")
## Up: 114 Down: 140
fit_null <- lm(abs_logFC ~ 1, data = merged)
fit_div <- lm(abs_logFC ~ cds_divergence, data = merged)
fit_loc <- lm(abs_logFC ~ location, data = merged)
fit_add <- lm(abs_logFC ~ cds_divergence + location, data = merged)
fit_int <- lm(abs_logFC ~ cds_divergence * location, data = merged)
models <- list(null = fit_null, divergence = fit_div, location = fit_loc,
additive = fit_add, interaction = fit_int)
data.frame(
model = names(models),
df = sapply(models, function(m) length(coef(m))),
AIC = round(sapply(models, AIC), 1),
BIC = round(sapply(models, BIC), 1),
adj_R2 = round(sapply(models, function(m) summary(m)$adj.r.squared), 4)
) %>% arrange(BIC)
cat("=== Additive model ===\n")
## === Additive model ===
summary(fit_add)
##
## Call:
## lm(formula = abs_logFC ~ cds_divergence + location, data = merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0515 -1.1299 -0.6354 0.5390 8.0448
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.74532 0.17704 9.858 < 2e-16 ***
## cds_divergence 0.33273 0.06887 4.831 2.36e-06 ***
## locationInv4m -0.71719 0.23264 -3.083 0.00228 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.828 on 251 degrees of freedom
## Multiple R-squared: 0.1108, Adjusted R-squared: 0.1037
## F-statistic: 15.63 on 2 and 251 DF, p-value: 4.002e-07
cat("\n=== Interaction model ===\n")
##
## === Interaction model ===
summary(fit_int)
##
## Call:
## lm(formula = abs_logFC ~ cds_divergence * location, data = merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8392 -1.1091 -0.6770 0.5359 8.0737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.68515 0.18754 8.986 <2e-16 ***
## cds_divergence 0.37737 0.08275 4.560 8e-06 ***
## locationInv4m -0.50208 0.32089 -1.565 0.119
## cds_divergence:locationInv4m -0.14532 0.14930 -0.973 0.331
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.828 on 250 degrees of freedom
## Multiple R-squared: 0.1141, Adjusted R-squared: 0.1035
## F-statistic: 10.73 on 3 and 250 DF, p-value: 1.164e-06
anova(fit_add, fit_int)
cat("\n=== ANOVA: location-only vs additive ===\n")
##
## === ANOVA: location-only vs additive ===
anova(fit_loc, fit_add)
# Use additive model for ANCOVA lines
fit_ancova <- fit_add
# Predicted ANCOVA lines
ancova_pred <- expand.grid(
cds_divergence = seq(0, max(merged$cds_divergence), length.out = 200),
location = c("Inv4m", "flanking")
)
ancova_pred$abs_logFC <- predict(fit_ancova, newdata = ancova_pred)
# Residuals for outlier labeling
merged$residual <- residuals(fit_ancova)
# Top outliers by residual magnitude
top_outliers <- merged %>%
filter(abs(residual) > quantile(abs(residual), 0.95) | abs_logFC > 2) %>%
mutate(
fontface = ifelse(grepl("^jmj3$", display_label, ignore.case = TRUE),
"bold.italic", "italic")
)
# Extract stats for annotation
ancova_sum <- summary(fit_ancova)
ancova_coefs <- coef(ancova_sum)
r2 <- round(ancova_sum$adj.r.squared, 3)
p_div <- ancova_coefs["cds_divergence", "Pr(>|t|)"]
p_loc <- ancova_coefs["locationInv4m", "Pr(>|t|)"]
x_max <- ceiling(max(merged$cds_divergence))
y_max <- ceiling(max(merged$abs_logFC))
# Stats annotation (ggtext richtext for italic p)
stat_label <- paste0(
"Region *p* = ", format(p_loc, digits = 2), "<br>",
"Divergence *p* = ", format(p_div, digits = 2), "<br>",
"adj. R\u00B2 = ", r2
)
p_scatter <- ggplot(merged, aes(x = cds_divergence, y = abs_logFC,
color = location)) +
geom_point(aes(shape = adj.P.Val < 0.05), size = 2, alpha = 0.5) +
geom_line(data = ancova_pred, linewidth = 1) +
geom_text_repel(
data = top_outliers,
aes(label = display_label, fontface = fontface),
size = 2.5, max.overlaps = 15, segment.alpha = 0.4,
show.legend = FALSE
) +
coord_cartesian(xlim = c(0, x_max), ylim = c(0, y_max)) +
scale_color_manual(
values = c("Inv4m" = "purple4", "flanking" = "plum")
) +
scale_shape_manual(values = c("TRUE" = 16, "FALSE" = 1), guide = "none") +
geom_richtext(
data = data.frame(x = 5.1, y = 11.2),
aes(x = x, y = y), label = stat_label,
inherit.aes = FALSE, hjust = 0, vjust = 1,
size = 4.5, label.colour = NA, fill = NA
) +
labs(
x = "CDS divergence (100 - % identity)",
y = "|log2 fold change|",
title = "Expression change vs sequence divergence in the Inv4m introgression"
) +
guides(color = guide_legend(override.aes = list(size = 4, alpha = 1))) +
theme_bw(base_size = 25) +
theme(
legend.position = c(0.18, 0.98),
legend.justification = c(0, 1),
legend.title = element_blank(),
legend.text = element_text(size = 13),
legend.key.size = unit(1.2, "lines"),
legend.background = element_rect(fill = alpha("white", 0.8), color = NA)
)
print(p_scatter)
for (dir in c("Up in Inv4m", "Down in Inv4m")) {
cat("\n===", dir, "===\n")
d <- merged %>% filter(direction == dir)
cat("n =", nrow(d), "\n")
fit_dir <- lm(logFC ~ cds_divergence + location, data = d)
print(summary(fit_dir))
}
##
## === Up in Inv4m ===
## n = 114
##
## Call:
## lm(formula = logFC ~ cds_divergence + location, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8565 -1.0196 -0.5937 0.6937 7.9301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0623 0.2482 8.311 2.62e-13 ***
## cds_divergence 0.0437 0.1051 0.416 0.67839
## locationInv4m -0.8635 0.3259 -2.650 0.00923 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.713 on 111 degrees of freedom
## Multiple R-squared: 0.06076, Adjusted R-squared: 0.04384
## F-statistic: 3.59 on 2 and 111 DF, p-value: 0.03084
##
##
## === Down in Inv4m ===
## n = 140
##
## Call:
## lm(formula = logFC ~ cds_divergence + location, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3079 -0.5337 0.5653 1.0676 4.1405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.55183 0.24363 -6.369 2.67e-09 ***
## cds_divergence -0.49912 0.08951 -5.576 1.26e-07 ***
## locationInv4m 0.65439 0.31932 2.049 0.0423 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.862 on 137 degrees of freedom
## Multiple R-squared: 0.1965, Adjusted R-squared: 0.1848
## F-statistic: 16.76 on 2 and 137 DF, p-value: 3.094e-07
OLS is sensitive to the few high-divergence, high-|logFC| genes that act as leverage points. Robust regression (Huber M-estimator) downweights these automatically.
fit_rlm_add <- rlm(abs_logFC ~ cds_divergence + location, data = merged)
cat("=== Robust regression (rlm, additive) ===\n")
## === Robust regression (rlm, additive) ===
summary(fit_rlm_add)
##
## Call: rlm(formula = abs_logFC ~ cds_divergence + location, data = merged)
## Residuals:
## Min 1Q Median 3Q Max
## -2.3536 -0.8101 -0.3475 0.9699 8.4444
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 1.4018 0.1239 11.3100
## cds_divergence 0.2526 0.0482 5.2383
## locationInv4m -0.4840 0.1629 -2.9716
##
## Residual standard error: 1.224 on 251 degrees of freedom
# Huber weights: values near 1 = well-behaved, <<1 = downweighted
cat("\nDownweighted genes (Huber weight < 0.5):\n")
##
## Downweighted genes (Huber weight < 0.5):
merged$huber_w <- fit_rlm_add$w
merged %>%
filter(huber_w < 0.5) %>%
dplyr::select(gene, location, cds_divergence, abs_logFC, huber_w) %>%
arrange(huber_w)
coef_table <- data.frame(
term = names(coef(fit_add)),
OLS = round(coef(fit_add), 4),
Robust = round(coef(fit_rlm_add), 4)
)
cat("=== Coefficient comparison ===\n")
## === Coefficient comparison ===
print(coef_table, row.names = FALSE)
## term OLS Robust
## (Intercept) 1.7453 1.4018
## cds_divergence 0.3327 0.2526
## locationInv4m -0.7172 -0.4840
Bin genes by CDS divergence and regress on bin means. This upweights sparse high-divergence bins that would otherwise be overwhelmed by the mass of low-divergence genes. Shown for visual clarity; inferential results come from the robust regression above.
merged$div_bin <- cut(merged$cds_divergence,
breaks = seq(0, ceiling(max(merged$cds_divergence)), by = 1),
include.lowest = TRUE)
bin_means <- merged %>%
group_by(div_bin) %>%
summarise(
n = n(),
mean_div = mean(cds_divergence),
mean_absFC = mean(abs_logFC),
se_absFC = sd(abs_logFC) / sqrt(n()),
.groups = "drop"
)
bin_means
fit_bin_uw <- lm(mean_absFC ~ mean_div, data = bin_means)
fit_bin_w <- lm(mean_absFC ~ mean_div, data = bin_means, weights = n)
cat("=== Unweighted bin-mean regression ===\n")
## === Unweighted bin-mean regression ===
summary(fit_bin_uw)
##
## Call:
## lm(formula = mean_absFC ~ mean_div, data = bin_means)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6646 -1.4613 0.3446 0.5852 3.4868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0327 1.2394 0.833 0.4289
## mean_div 0.4157 0.1800 2.310 0.0497 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.256 on 8 degrees of freedom
## Multiple R-squared: 0.4001, Adjusted R-squared: 0.3251
## F-statistic: 5.335 on 1 and 8 DF, p-value: 0.0497
cat("\n=== Weighted bin-mean regression (weights = n) ===\n")
##
## === Weighted bin-mean regression (weights = n) ===
summary(fit_bin_w)
##
## Call:
## lm(formula = mean_absFC ~ mean_div, data = bin_means, weights = n)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3.5261 -2.6069 0.2269 1.3561 4.9254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4288 0.2509 5.694 0.000457 ***
## mean_div 0.3428 0.1152 2.976 0.017702 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.019 on 8 degrees of freedom
## Multiple R-squared: 0.5254, Adjusted R-squared: 0.4661
## F-statistic: 8.858 on 1 and 8 DF, p-value: 0.0177
bin_means_loc <- merged %>%
group_by(div_bin, location) %>%
summarise(
n = n(),
mean_div = mean(cds_divergence),
mean_absFC = mean(abs_logFC),
se_absFC = sd(abs_logFC) / sqrt(n()),
.groups = "drop"
)
# ANCOVA on bin means (abs_logFC)
fit_bin_loc <- lm(mean_absFC ~ mean_div + location, data = bin_means_loc, weights = n)
cat("=== Binned ANCOVA model (divergence + location) ===\n")
## === Binned ANCOVA model (divergence + location) ===
summary(fit_bin_loc)
##
## Call:
## lm(formula = mean_absFC ~ mean_div + location, data = bin_means_loc,
## weights = n)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3.1430 -1.7783 -0.4776 1.6589 3.8680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.72403 0.22265 7.743 2e-06 ***
## mean_div 0.34853 0.08741 3.987 0.00135 **
## locationInv4m -0.72020 0.29155 -2.470 0.02697 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.29 on 14 degrees of freedom
## Multiple R-squared: 0.5993, Adjusted R-squared: 0.5421
## F-statistic: 10.47 on 2 and 14 DF, p-value: 0.001658
# Predicted lines from additive model
pred_grid <- expand.grid(
mean_div = seq(0, max(bin_means_loc$mean_div, na.rm = TRUE), length.out = 100),
location = c("flanking", "Inv4m")
)
pred_grid$mean_absFC <- predict(fit_bin_loc, newdata = pred_grid)
# Stats for annotation (from binned ANCOVA)
bin_sum <- summary(fit_bin_loc)
bin_coefs <- coef(bin_sum)
bin_r2 <- round(bin_sum$adj.r.squared, 3)
bin_p_div <- bin_coefs["mean_div", "Pr(>|t|)"]
bin_p_loc <- bin_coefs["locationInv4m", "Pr(>|t|)"]
bin_stat_label <- paste0(
"Region *p* = ", format(bin_p_loc, digits = 2), "<br>",
"Divergence *p* = ", format(bin_p_div, digits = 2), "<br>",
"adj. R\u00B2 = ", bin_r2
)
p_ancova <- ggplot(bin_means_loc, aes(x = mean_div, y = mean_absFC,
color = location)) +
geom_point(aes(size = n), alpha = 0.8) +
geom_errorbar(aes(ymin = mean_absFC - se_absFC,
ymax = mean_absFC + se_absFC),
width = 0.15, alpha = 0.4) +
geom_line(data = pred_grid, linewidth = 0.9) +
geom_richtext(
data = data.frame(x = 0, y = 8.7),
aes(x = x, y = y), label = bin_stat_label,
inherit.aes = FALSE, hjust = 0, vjust = 1,
size = 4.5, label.colour = NA, fill = NA
) +
scale_color_manual(values = c("Inv4m" = "purple4", "flanking" = "plum"),
name = "Region") +
scale_size_continuous(name = "Genes per bin") +
labs(
x = "Mean CDS divergence (%)",
y = "Mean |log2 fold change|",
title = "Binned ANCOVA: shared slope (divergence), offset = location"
) +
theme_bw(base_size = 25)
print(p_ancova)
Genes with large positive residuals have more expression change than expected from their sequence divergence alone — candidates for trans-regulation or CNV effects.
outliers <- merged %>%
arrange(desc(abs(residual))) %>%
dplyr::select(gene, location, pct_identity, cds_divergence,
logFC, abs_logFC, adj.P.Val, residual) %>%
head(20)
outliers
# Pseudocount for zeros: one log10 unit below the minimum nonzero value
min_nz <- min(merged$cds_divergence[merged$cds_divergence > 0])
pseudo_zero <- min_nz / 10
merged_div_plot <- merged %>%
mutate(div_plot = ifelse(cds_divergence == 0, pseudo_zero, cds_divergence))
# Wilcoxon test for annotation
wt <- wilcox.test(cds_divergence ~ location, data = merged)
p_beeswarm <- ggplot(merged_div_plot, aes(x = location, y = div_plot, color = location)) +
geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0, outlier.shape = NA) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(size = 2) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
scale_color_manual(values = c("Inv4m" = "purple4", "flanking" = "plum")) +
scale_x_discrete(labels = c("Inv4m" = "*Inv4m*", "flanking" = "flanking")) +
annotate("segment", x = 1, xend = 2, y = max(merged_div_plot$div_plot) * 1.3,
yend = max(merged_div_plot$div_plot) * 1.3) +
geom_richtext(
data = data.frame(x = 1.5, y = max(merged_div_plot$div_plot) * 1.6),
aes(x = x, y = y),
label = paste0("Wilcoxon *p* = ", format(wt$p.value, digits = 2)),
inherit.aes = FALSE, size = 4, label.colour = NA, fill = NA
) +
scale_y_continuous(trans = "log10", expand = expansion(mult = c(0.1, 0.15))) +
labs(x = NULL, y = "CDS divergence (%)") +
theme_bw(base_size = 25) +
theme(
legend.position = "none",
axis.text.x = element_markdown(size = 13)
)
print(p_beeswarm)
cat("=== Wilcoxon rank-sum test ===\n")
## === Wilcoxon rank-sum test ===
print(wt)
##
## Wilcoxon rank sum test with continuity correction
##
## data: cds_divergence by location
## W = 6173, p-value = 0.003436
## alternative hypothesis: true location shift is not equal to 0
cat("\n=== Welch t-test on log-transformed divergence ===\n")
##
## === Welch t-test on log-transformed divergence ===
merged_div_plot$log_div <- log(merged_div_plot$div_plot)
print(t.test(log_div ~ location, data = merged_div_plot))
##
## Welch Two Sample t-test
##
## data: log_div by location
## t = -4.4951, df = 213.79, p-value = 1.139e-05
## alternative hypothesis: true difference in means between group flanking and group Inv4m is not equal to 0
## 95 percent confidence interval:
## -1.3249481 -0.5171673
## sample estimates:
## mean in group flanking mean in group Inv4m
## -0.85514349 0.06591421
The ANCOVA and quantile regressions model the average or median relationship between divergence and expression change. But our interest is in the DEGs — the genes in the tail of the distribution that show extreme expression responses. Extreme value theory (EVT) models this tail behavior directly.
The approach:
logFC ~ cds_divergence to remove the divergence
effect|residual| to pool both tails (comparable to the
abs_logFC additive model)# Robust residuals from divergence-only model, then take absolute value
# to pool both tails — directly comparable to the abs_logFC additive model
merged$robust_resid <- merged$logFC - predict(
rlm(logFC ~ cds_divergence, data = merged),
newdata = merged
)
merged$abs_resid <- abs(merged$robust_resid)
cat("Absolute residual summary:\n")
## Absolute residual summary:
summary(merged$abs_resid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03874 0.56026 1.24215 1.86948 2.49406 10.08529
# Use 75th percentile of |residual| as threshold
thresh <- quantile(merged$abs_resid, 0.75)
cat("Threshold (75th pct of |residual|):", round(thresh, 3), "\n")
## Threshold (75th pct of |residual|): 2.494
cat("Total exceedances:", sum(merged$abs_resid > thresh), "\n")
## Total exceedances: 64
cat(" Inv4m:", sum(merged$abs_resid[merged$location == "Inv4m"] > thresh), "\n")
## Inv4m: 20
cat(" flanking:", sum(merged$abs_resid[merged$location == "flanking"] > thresh), "\n")
## flanking: 44
gpd_results <- list()
for (loc in c("Inv4m", "flanking")) {
d <- merged %>% filter(location == loc)
exc <- d$abs_resid[d$abs_resid > thresh]
if (length(exc) >= 5) {
fit <- fpot(d$abs_resid, threshold = thresh, model = "gpd")
gpd_results[[loc]] <- list(
location = loc,
n_exceed = length(exc),
scale = fit$estimate["scale"],
shape = fit$estimate["shape"],
se_scale = fit$std.err["scale"],
se_shape = fit$std.err["shape"],
nllh = fit$deviance # negative log-likelihood
)
cat("===", loc, "===\n")
cat(" Exceedances:", length(exc), "\n")
cat(" Scale:", round(fit$estimate["scale"], 4),
"±", round(fit$std.err["scale"], 4), "\n")
cat(" Shape:", round(fit$estimate["shape"], 4),
"±", round(fit$std.err["shape"], 4), "\n\n")
}
}
## === Inv4m ===
## Exceedances: 20
## Scale: 1.5864 ± 0.5794
## Shape: -0.0331 ± 0.2889
##
## === flanking ===
## Exceedances: 44
## Scale: 2.8992 ± 0.5709
## Shape: -0.3001 ± 0.1345
gpd_df <- bind_rows(lapply(gpd_results, as.data.frame))
gpd_df
gpd_long <- gpd_df %>%
pivot_longer(cols = c(scale, shape), names_to = "parameter", values_to = "estimate") %>%
mutate(se = ifelse(parameter == "scale", se_scale, se_shape))
ggplot(gpd_long, aes(x = location, y = estimate, color = location)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = estimate - 1.96 * se, ymax = estimate + 1.96 * se),
width = 0.15) +
facet_wrap(~ parameter, scales = "free_y") +
scale_color_manual(values = c("Inv4m" = "purple4", "flanking" = "plum")) +
labs(
x = NULL, y = "GPD parameter estimate",
title = "Generalized Pareto fits (pooled tails): |residual| by region",
caption = "Error bars: 95% CI from MLE standard errors"
) +
theme_bw(base_size = 25) +
theme(legend.position = "none")
Compare a model where both regions share GPD parameters vs separate fits.
# Pooled fit (all locations combined)
fit_pooled <- fpot(merged$abs_resid, threshold = thresh, model = "gpd")
# Sum of separate negative log-likelihoods
nllh_separate <- sum(sapply(gpd_results, function(x) x$nllh))
nllh_pooled <- fit_pooled$deviance
# LRT: 2 * (loglik_separate - loglik_pooled), df = 2 (extra scale + shape)
lrt_stat <- 2 * (nllh_pooled - nllh_separate)
lrt_pval <- pchisq(lrt_stat, df = 2, lower.tail = FALSE)
cat("=== Likelihood ratio test: shared vs separate GPD ===\n")
## === Likelihood ratio test: shared vs separate GPD ===
cat(" Pooled nllh:", round(nllh_pooled, 2), "\n")
## Pooled nllh: 215.01
cat(" Separate nllh:", round(nllh_separate, 2), "\n")
## Separate nllh: 212.37
cat(" LRT statistic:", round(lrt_stat, 3), "\n")
## LRT statistic: 5.283
cat(" p-value:", format(lrt_pval, digits = 4), "\n")
## p-value: 0.07127
exc_table <- merged %>%
group_by(location) %>%
summarise(
n = n(),
n_extreme = sum(abs_resid > thresh),
pct_extreme = round(100 * n_extreme / n, 1),
.groups = "drop"
)
exc_table
cat("\n=== Fisher's test: exceedance rate by location ===\n")
##
## === Fisher's test: exceedance rate by location ===
mat <- matrix(c(
exc_table$n_extreme[exc_table$location == "Inv4m"],
exc_table$n[exc_table$location == "Inv4m"] - exc_table$n_extreme[exc_table$location == "Inv4m"],
exc_table$n_extreme[exc_table$location == "flanking"],
exc_table$n[exc_table$location == "flanking"] - exc_table$n_extreme[exc_table$location == "flanking"]
), nrow = 2, dimnames = list(c("extreme", "not"), c("Inv4m", "flanking")))
print(fisher.test(mat))
##
## Fisher's Exact Test for Count Data
##
## data: mat
## p-value = 0.05657
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.2789642 1.0162047
## sample estimates:
## odds ratio
## 0.5394315
par(mfrow = c(1, 2))
for (loc in c("Inv4m", "flanking")) {
d <- merged %>% filter(location == loc)
if (sum(d$abs_resid > thresh) >= 5) {
fit <- fpot(d$abs_resid, threshold = thresh, model = "gpd")
plot(fit, which = 4, main = paste(loc, "- GPD QQ (pooled tails)"))
}
}
write.csv(merged, file.path(paths$intermediate, "sequence_divergence_vs_DE.csv"),
row.names = FALSE)
ggsave(file.path(paths$figures, "sequence_divergence_vs_DE.pdf"),
p_scatter, width = 8, height = 6)
ggsave(file.path(paths$figures, "sequence_divergence_vs_DE.svg"),
p_scatter, width = 8, height = 6, fix_text_size = TRUE)
cat("Outputs saved to:", paths$intermediate, "and", paths$figures, "\n")
## Outputs saved to: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/intermediate and /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/figures
library(cowplot)
# Load logFC by region panel from compare_r2_sliding_window_regions.Rmd
p_logfc_region <- readRDS(file.path(paths$intermediate, "panel_logfc_by_region.rds"))
# Remove titles from plots for the assembled figure
p_scatter_notitle <- p_scatter + labs(title = NULL)
p_beeswarm_notitle <- p_beeswarm + labs(title = NULL)
# Top row: (A) logFC by region | (B) CDS divergence beeswarm
top_row <- plot_grid(
p_logfc_region + labs(title = NULL),
p_beeswarm_notitle,
nrow = 1,
rel_widths = c(0.5, 0.5),
labels = c("A", "B"),
label_size = 18
)
# Bottom row: (C) |logFC| vs CDS divergence scatter (full width)
bottom_row <- plot_grid(
p_scatter_notitle,
nrow = 1,
labels = c("C"),
label_size = 18
)
# Stack rows
p_combined <- plot_grid(
top_row,
bottom_row,
nrow = 2,
rel_heights = c(1, 1)
)
print(p_combined)
ggsave(file.path(paths$figures, "divergence.pdf"),
p_combined, width = 14, height = 12)
ggsave(file.path(paths$figures, "divergence.png"),
p_combined, width = 14, height = 12, dpi = 300)
ggsave(file.path(paths$figures, "divergence.svg"),
p_combined, width = 14, height = 12, fix_text_size = TRUE)
# Genes with 0% CDS divergence
n_zero_inv4m <- sum(merged$cds_divergence == 0 & merged$location == "Inv4m")
n_zero_flank <- sum(merged$cds_divergence == 0 & merged$location == "flanking")
n_nonzero_inv4m <- sum(merged$cds_divergence > 0 & merged$location == "Inv4m")
n_nonzero_flank <- sum(merged$cds_divergence > 0 & merged$location == "flanking")
mat_zero <- matrix(
c(n_zero_inv4m, n_nonzero_inv4m,
n_zero_flank, n_nonzero_flank),
nrow = 2,
dimnames = list(c("zero_div", "nonzero_div"), c("Inv4m", "flanking"))
)
cat("Contingency table:\n")
## Contingency table:
print(mat_zero)
## Inv4m flanking
## zero_div 1 20
## nonzero_div 106 127
ft_zero <- fisher.test(mat_zero)
cat("\nFisher's exact test: zero-divergence genes by location\n")
##
## Fisher's exact test: zero-divergence genes by location
print(ft_zero)
##
## Fisher's Exact Test for Count Data
##
## data: mat_zero
## p-value = 0.0001254
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.001436621 0.389721861
## sample estimates:
## odds ratio
## 0.06031275
cat("\nAll", n_zero_inv4m + n_zero_flank, "zero-divergence genes are DEGs (FDR < 0.05)\n")
##
## All 21 zero-divergence genes are DEGs (FDR < 0.05)
cat(" Inv4m:", n_zero_inv4m, "\n")
## Inv4m: 1
cat(" flanking:", n_zero_flank, "\n")
## flanking: 20
RESULTS PARAGRAPH
To test whether differential expression in the introgression is explained by
coding sequence divergence between the B73 and Mi21 haplotypes, we computed
CDS percent identity from LAST pairwise alignments for 254 expressed
genes with ortholog hits in the shared introgression.
Genes inside the inversion had significantly higher CDS divergence than flanking
genes (median 1.1\% vs 0.75\%;
Wilcoxon $p$ = 0.0034; Fig.~\ref{fig::divergence} A), consistent with
suppressed recombination preserving ancestral \textit{mexicana} divergence within the
inversion.
An additive linear model ($|\text{log}_2\text{FC}| \sim \text{CDS divergence} + \text{region}$)
explained 10.4\% of the variance
in expression change (adj.\ $R^2$ = 0.104;
divergence $p$ = 2.4e-06;
region $p$ = 0.0023; Fig.~\ref{fig::divergence} B).
After accounting for divergence, genes inside the inversion showed smaller
absolute expression changes than flanking genes, indicating that the inversion
dampens rather than amplifies expression divergence.
Notably, 21 genes had identical CDS between B73 and
Mi21, and all were significantly differentially expressed (FDR $<$ 0.05).
Of these, 20 of 21 were located in the
flanking region, a significant overrepresentation
(Fisher's exact test, $p$ = 0.00013,
OR = 0.1).
Because these genes lack coding sequence divergence, their differential expression
is most parsimoniously attributed to \textit{trans}-regulatory effects originating from
divergent loci elsewhere in the introgression.
FIGURE FLOAT
\begin{figure}[!ht]
\includegraphics[width=\linewidth]{figs/sequence_divergence_multipanel.pdf}
\caption{\textbf{CDS sequence divergence and expression change in the \invfour introgression.}
\textbf{(A)} Distribution of CDS divergence (100 $-$ \% identity) for expressed genes inside the inversion (\invfour, $n$ = 107) and in the flanking introgression (flanking, $n$ = 147).
Points show individual genes (beeswarm); boxes show median and interquartile range.
$y$-axis is log$_{10}$-scaled.
Wilcoxon rank-sum $p$ = 0.0034.
\textbf{(B)} Absolute log$_2$ fold change vs CDS divergence.
Parallel lines show the additive ANCOVA model (shared slope for divergence effect, vertical offset for region effect).
Region $p$ = 0.0023;
divergence $p$ = 2.4e-06;
adj.\ $R^2$ = 0.104.}
\label{fig::divergence}
\end{figure}
MODEL COMPARISON TABLE
\begin{table}[!ht]
\caption{\textbf{Model comparison for the relationship between CDS divergence and expression change in the \invfour introgression.}
Five linear models predicting $|\text{log}_2\text{FC}|$ were compared by BIC and AIC.
The additive model (CDS divergence + region) provided the best fit, with the interaction term adding no improvement.
Region distinguishes genes inside \invfour from the flanking introgression.}
\label{tab::divergence_models}
\centering
\begin{tabular}{@{}lcccc@{}}
\toprule
\textbf{Model} & \textbf{df} & \textbf{AIC} & \textbf{BIC} & \textbf{adj.\ $R^2$} \\ \midrule
CDS divergence + region & 3 & 1032.2 & 1046.3 & 0.1037 \\
CDS divergence & 2 & 1039.6 & 1050.2 & 0.0734 \\
CDS divergence $\times$ region & 4 & 1033.2 & 1050.9 & 0.1035 \\
Region & 2 & 1052.8 & 1063.4 & 0.0242 \\
Null (intercept only) & 1 & 1058 & 1065.1 & 0 \\ \bottomrule
\end{tabular}
\end{table}
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] cowplot_1.2.0 ggfx_1.0.3 ggbeeswarm_0.7.3
## [4] ggtext_0.1.2 evd_2.3-7.1 quantreg_6.1
## [7] SparseM_1.84-2 MASS_7.3-65 GenomicRanges_1.62.1
## [10] Seqinfo_1.0.0 IRanges_2.44.0 S4Vectors_0.48.0
## [13] BiocGenerics_0.56.0 generics_0.1.4 ggrepel_0.9.6
## [16] lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0
## [19] dplyr_1.1.4 purrr_1.2.0 readr_2.1.6
## [22] tidyr_1.3.2 tibble_3.3.0 ggplot2_4.0.1
## [25] tidyverse_2.0.0 here_1.0.2
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 rlang_1.1.6
## [3] magrittr_2.0.4 otel_0.2.0
## [5] matrixStats_1.5.0 compiler_4.5.2
## [7] systemfonts_1.3.1 vctrs_0.6.5
## [9] pkgconfig_2.0.3 crayon_1.5.3
## [11] fastmap_1.2.0 backports_1.5.0
## [13] magick_2.9.0 XVector_0.50.0
## [15] labeling_0.4.3 Rsamtools_2.26.0
## [17] rmarkdown_2.30 markdown_2.0
## [19] tzdb_0.5.0 ragg_1.5.0
## [21] MatrixModels_0.5-4 xfun_0.55
## [23] cachem_1.1.0 cigarillo_1.0.0
## [25] litedown_0.9 jsonlite_2.0.0
## [27] DelayedArray_0.36.0 BiocParallel_1.44.0
## [29] broom_1.0.11 parallel_4.5.2
## [31] R6_2.6.1 bslib_0.9.0
## [33] stringi_1.8.7 RColorBrewer_1.1-3
## [35] rtracklayer_1.70.1 car_3.1-3
## [37] jquerylib_0.1.4 Rcpp_1.1.0
## [39] SummarizedExperiment_1.40.0 knitr_1.51
## [41] Matrix_1.7-4 splines_4.5.2
## [43] timechange_0.3.0 tidyselect_1.2.1
## [45] dichromat_2.0-0.1 abind_1.4-8
## [47] yaml_2.3.12 codetools_0.2-20
## [49] curl_7.0.0 lattice_0.22-7
## [51] Biobase_2.70.0 withr_3.0.2
## [53] S7_0.2.1 evaluate_1.0.5
## [55] survival_3.8-3 xml2_1.5.1
## [57] Biostrings_2.78.0 ggpubr_0.6.2
## [59] pillar_1.11.1 carData_3.0-5
## [61] MatrixGenerics_1.22.0 rprojroot_2.1.1
## [63] RCurl_1.98-1.17 hms_1.1.4
## [65] commonmark_2.0.0 scales_1.4.0
## [67] glue_1.8.0 tools_4.5.2
## [69] BiocIO_1.20.0 ggsignif_0.6.4
## [71] GenomicAlignments_1.46.0 XML_3.99-0.20
## [73] grid_4.5.2 beeswarm_0.4.0
## [75] restfulr_0.0.16 vipor_0.4.7
## [77] Formula_1.2-5 cli_3.6.5
## [79] textshaping_1.0.4 S4Arrays_1.10.1
## [81] svglite_2.2.2 gtable_0.3.6
## [83] rstatix_0.7.3 sass_0.4.10
## [85] digest_0.6.39 SparseArray_1.10.8
## [87] rjson_0.2.23 farver_2.1.2
## [89] htmltools_0.5.9 lifecycle_1.0.4
## [91] httr_1.4.7 gridtext_0.1.5