Overview

Are the expression differences between Inv4m and B73 haplotypes explained by per-gene CDS sequence divergence, or do some genes show excess DE beyond what divergence predicts?

We use LAST alignments (B73 vs Mi21 CDS) already computed by the jcvi ortholog pipeline. Only genes within the shared introgression on chromosome 4 are analyzed.

library(here)
library(tidyverse)
library(ggrepel)
library(GenomicRanges)
library(MASS)      # rlm (robust regression)
library(quantreg)  # rq  (quantile regression)
library(evd)       # GPD fitting for extreme value analysis
library(ggtext)    # richtext labels
library(ggbeeswarm) # beeswarm plots
library(ggfx)      # shadow effects

source(here("scripts", "utils", "setup_paths.R"))
paths <- setup_project_paths("inversion_paper")

Load data

LAST alignments (CDS percent identity)

last_file <- "/Users/fvrodriguez/mnt/rrellan/DOE_CAREER/inv4m/nilhifimi21/microsynteny/B73.Mi21.last"

last_cols <- c("query", "subject", "pct_identity", "aln_len", "mismatches",
               "gaps", "qstart", "qend", "sstart", "send", "evalue", "score")

last_raw <- read.table(last_file, header = FALSE, sep = "\t",
                       col.names = last_cols, comment.char = "#")

# Strip transcript suffix to get gene IDs
last_raw <- last_raw %>%
  mutate(
    b73_gene = sub("_T\\d+$", "", query),
    mi21_gene = sub("^Mi21_", "", sub("_T\\d+$", "", subject))
  )

cat("Raw LAST alignments:", nrow(last_raw), "\n")

## Raw LAST alignments: 1816628

DEG results

degs <- read.csv(file.path(paths$intermediate, "deg_both_models_genotype.csv"))
cat("DEG results:", nrow(degs), "genes\n")

## DEG results: 465 genes

Gene coordinates (B73 v5 GFF)

v5_gff_file <- file.path(paths$data, "Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3")
genes_gr <- rtracklayer::import(v5_gff_file)
genes_gr <- genes_gr[genes_gr$type == "gene" &
                     as.character(seqnames(genes_gr)) %in% as.character(1:10)]
genes <- as.data.frame(genes_gr)
genes$ID <- gsub("gene:", "", genes$ID)

Define regions and filter

# Boundaries
introgression_start <- 157012149
introgression_end   <- 195900523
inv4m_start <- genes$start[genes$ID == "Zm00001eb190470"]
inv4m_end   <- genes$end[genes$ID == "Zm00001eb194800"]

# Chr4 introgression genes
introgression_genes <- genes %>%
  filter(seqnames == 4, start >= introgression_start, end <= introgression_end) %>%
  mutate(location = ifelse(start >= inv4m_start & end <= inv4m_end,
                           "Inv4m", "flanking"))

cat("Introgression genes:", nrow(introgression_genes), "\n")

## Introgression genes: 1099

cat("  Inv4m:", sum(introgression_genes$location == "Inv4m"), "\n")

##   Inv4m: 434

cat("  CTRL:", sum(introgression_genes$location == "flanking"), "\n")

##   CTRL: 665

Best LAST hit per gene in introgression

Keep the highest-scoring ortholog alignment for each B73 gene within the introgression.

best_hits <- last_raw %>%
  filter(b73_gene %in% introgression_genes$ID) %>%
  # Keep only self-ortholog hits (same gene ID in B73 and Mi21)
  filter(b73_gene == mi21_gene) %>%
  group_by(b73_gene) %>%
  slice_max(score, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  dplyr::select(gene = b73_gene, pct_identity, aln_len, score)

cat("Genes with ortholog hits:", nrow(best_hits), "\n")

## Genes with ortholog hits: 902

cat("Identity range:", round(min(best_hits$pct_identity), 2), "-",
    round(max(best_hits$pct_identity), 2), "%\n")

## Identity range: 84.07 - 100 %

Merge datasets

# Load master locus labels
locus_labels <- read.csv(file.path(paths$data, "locus_labels_master.csv")) %>%
  dplyr::select(gene, locus_label)

merged <- best_hits %>%
  inner_join(degs, by = "gene") %>%
  inner_join(
    introgression_genes %>% dplyr::select(ID, location, start),
    by = c("gene" = "ID")
  ) %>%
  left_join(locus_labels, by = "gene") %>%
  mutate(
    cds_divergence = 100 - pct_identity,
    abs_logFC = abs(logFC),
    direction = ifelse(logFC > 0, "Up in Inv4m", "Down in Inv4m"),
    display_label = coalesce(locus_label, gene)
  )

cat("Merged dataset:", nrow(merged), "genes\n")

## Merged dataset: 254 genes

cat("  DEGs (FDR < 0.05):", sum(merged$adj.P.Val < 0.05), "\n")

##   DEGs (FDR < 0.05): 254

cat("  Up:", sum(merged$direction == "Up in Inv4m"),
    " Down:", sum(merged$direction == "Down in Inv4m"), "\n")

##   Up: 114  Down: 140

Analysis

Model selection

fit_null <- lm(abs_logFC ~ 1, data = merged)
fit_div  <- lm(abs_logFC ~ cds_divergence, data = merged)
fit_loc  <- lm(abs_logFC ~ location, data = merged)
fit_add  <- lm(abs_logFC ~ cds_divergence + location, data = merged)
fit_int  <- lm(abs_logFC ~ cds_divergence * location, data = merged)

models <- list(null = fit_null, divergence = fit_div, location = fit_loc,
               additive = fit_add, interaction = fit_int)

data.frame(
  model = names(models),
  df = sapply(models, function(m) length(coef(m))),
  AIC = round(sapply(models, AIC), 1),
  BIC = round(sapply(models, BIC), 1),
  adj_R2 = round(sapply(models, function(m) summary(m)$adj.r.squared), 4)
) %>% arrange(BIC)

cat("=== Additive model ===\n")

## === Additive model ===

summary(fit_add)

## 
## Call:
## lm(formula = abs_logFC ~ cds_divergence + location, data = merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0515 -1.1299 -0.6354  0.5390  8.0448 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.74532    0.17704   9.858  < 2e-16 ***
## cds_divergence  0.33273    0.06887   4.831 2.36e-06 ***
## locationInv4m  -0.71719    0.23264  -3.083  0.00228 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.828 on 251 degrees of freedom
## Multiple R-squared:  0.1108, Adjusted R-squared:  0.1037 
## F-statistic: 15.63 on 2 and 251 DF,  p-value: 4.002e-07

cat("\n=== Interaction model ===\n")

## 
## === Interaction model ===

summary(fit_int)

## 
## Call:
## lm(formula = abs_logFC ~ cds_divergence * location, data = merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8392 -1.1091 -0.6770  0.5359  8.0737 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.68515    0.18754   8.986   <2e-16 ***
## cds_divergence                0.37737    0.08275   4.560    8e-06 ***
## locationInv4m                -0.50208    0.32089  -1.565    0.119    
## cds_divergence:locationInv4m -0.14532    0.14930  -0.973    0.331    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.828 on 250 degrees of freedom
## Multiple R-squared:  0.1141, Adjusted R-squared:  0.1035 
## F-statistic: 10.73 on 3 and 250 DF,  p-value: 1.164e-06

anova(fit_add, fit_int)

cat("\n=== ANOVA: location-only vs additive ===\n")

## 
## === ANOVA: location-only vs additive ===

anova(fit_loc, fit_add)

Scatter: |log2FC| vs CDS divergence

# Use additive model for ANCOVA lines
fit_ancova <- fit_add

# Predicted ANCOVA lines
ancova_pred <- expand.grid(
  cds_divergence = seq(0, max(merged$cds_divergence), length.out = 200),
  location = c("Inv4m", "flanking")
)
ancova_pred$abs_logFC <- predict(fit_ancova, newdata = ancova_pred)

# Residuals for outlier labeling
merged$residual <- residuals(fit_ancova)

# Top outliers by residual magnitude
top_outliers <- merged %>%
  filter(abs(residual) > quantile(abs(residual), 0.95) | abs_logFC > 2) %>%
  mutate(
    fontface = ifelse(grepl("^jmj3$", display_label, ignore.case = TRUE),
                      "bold.italic", "italic")
  )

# Extract stats for annotation
ancova_sum <- summary(fit_ancova)
ancova_coefs <- coef(ancova_sum)
r2 <- round(ancova_sum$adj.r.squared, 3)
p_div <- ancova_coefs["cds_divergence", "Pr(>|t|)"]
p_loc <- ancova_coefs["locationInv4m", "Pr(>|t|)"]

x_max <- ceiling(max(merged$cds_divergence))
y_max <- ceiling(max(merged$abs_logFC))

# Stats annotation (ggtext richtext for italic p)
stat_label <- paste0(
  "Region *p* = ", format(p_loc, digits = 2), "<br>",
  "Divergence *p* = ", format(p_div, digits = 2), "<br>",
  "adj. R\u00B2 = ", r2
)

p_scatter <- ggplot(merged, aes(x = cds_divergence, y = abs_logFC,
                                 color = location)) +
  geom_point(aes(shape = adj.P.Val < 0.05), size = 2, alpha = 0.5) +
  geom_line(data = ancova_pred, linewidth = 1) +
  geom_text_repel(
    data = top_outliers,
    aes(label = display_label, fontface = fontface),
    size = 2.5, max.overlaps = 15, segment.alpha = 0.4,
    show.legend = FALSE
  ) +
  coord_cartesian(xlim = c(0, x_max), ylim = c(0, y_max)) +
  scale_color_manual(
    values = c("Inv4m" = "purple4", "flanking" = "plum")
  ) +
  scale_shape_manual(values = c("TRUE" = 16, "FALSE" = 1), guide = "none") +
  geom_richtext(
    data = data.frame(x = 5.1, y = 11.2),
    aes(x = x, y = y), label = stat_label,
    inherit.aes = FALSE, hjust = 0, vjust = 1,
    size = 4.5, label.colour = NA, fill = NA
  ) +
  labs(
    x = "CDS divergence (100 - % identity)",
    y = "|log2 fold change|",
    title = "Expression change vs sequence divergence in the Inv4m introgression"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4, alpha = 1))) +
  theme_bw(base_size = 25) +
  theme(
    legend.position = c(0.18, 0.98),
    legend.justification = c(0, 1),
    legend.title = element_blank(),
    legend.text = element_text(size = 13),
    legend.key.size = unit(1.2, "lines"),
    legend.background = element_rect(fill = alpha("white", 0.8), color = NA)
  )

print(p_scatter)

for (dir in c("Up in Inv4m", "Down in Inv4m")) {
  cat("\n===", dir, "===\n")
  d <- merged %>% filter(direction == dir)
  cat("n =", nrow(d), "\n")
  fit_dir <- lm(logFC ~ cds_divergence + location, data = d)
  print(summary(fit_dir))
}

## 
## === Up in Inv4m ===
## n = 114 
## 
## Call:
## lm(formula = logFC ~ cds_divergence + location, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8565 -1.0196 -0.5937  0.6937  7.9301 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.0623     0.2482   8.311 2.62e-13 ***
## cds_divergence   0.0437     0.1051   0.416  0.67839    
## locationInv4m   -0.8635     0.3259  -2.650  0.00923 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.713 on 111 degrees of freedom
## Multiple R-squared:  0.06076,    Adjusted R-squared:  0.04384 
## F-statistic:  3.59 on 2 and 111 DF,  p-value: 0.03084
## 
## 
## === Down in Inv4m ===
## n = 140 
## 
## Call:
## lm(formula = logFC ~ cds_divergence + location, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3079 -0.5337  0.5653  1.0676  4.1405 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -1.55183    0.24363  -6.369 2.67e-09 ***
## cds_divergence -0.49912    0.08951  -5.576 1.26e-07 ***
## locationInv4m   0.65439    0.31932   2.049   0.0423 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.862 on 137 degrees of freedom
## Multiple R-squared:  0.1965, Adjusted R-squared:  0.1848 
## F-statistic: 16.76 on 2 and 137 DF,  p-value: 3.094e-07

Robust regression

OLS is sensitive to the few high-divergence, high-|logFC| genes that act as leverage points. Robust regression (Huber M-estimator) downweights these automatically.

fit_rlm_add <- rlm(abs_logFC ~ cds_divergence + location, data = merged)
cat("=== Robust regression (rlm, additive) ===\n")

## === Robust regression (rlm, additive) ===

summary(fit_rlm_add)

## 
## Call: rlm(formula = abs_logFC ~ cds_divergence + location, data = merged)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3536 -0.8101 -0.3475  0.9699  8.4444 
## 
## Coefficients:
##                Value   Std. Error t value
## (Intercept)     1.4018  0.1239    11.3100
## cds_divergence  0.2526  0.0482     5.2383
## locationInv4m  -0.4840  0.1629    -2.9716
## 
## Residual standard error: 1.224 on 251 degrees of freedom

# Huber weights: values near 1 = well-behaved, <<1 = downweighted
cat("\nDownweighted genes (Huber weight < 0.5):\n")

## 
## Downweighted genes (Huber weight < 0.5):

merged$huber_w <- fit_rlm_add$w
merged %>%
  filter(huber_w < 0.5) %>%
  dplyr::select(gene, location, cds_divergence, abs_logFC, huber_w) %>%
  arrange(huber_w)

coef_table <- data.frame(
  term = names(coef(fit_add)),
  OLS = round(coef(fit_add), 4),
  Robust = round(coef(fit_rlm_add), 4)
)
cat("=== Coefficient comparison ===\n")

## === Coefficient comparison ===

print(coef_table, row.names = FALSE)

##            term     OLS  Robust
##     (Intercept)  1.7453  1.4018
##  cds_divergence  0.3327  0.2526
##   locationInv4m -0.7172 -0.4840

Binned regression (visual summary)

Bin genes by CDS divergence and regress on bin means. This upweights sparse high-divergence bins that would otherwise be overwhelmed by the mass of low-divergence genes. Shown for visual clarity; inferential results come from the robust regression above.

merged$div_bin <- cut(merged$cds_divergence,
                      breaks = seq(0, ceiling(max(merged$cds_divergence)), by = 1),
                      include.lowest = TRUE)

bin_means <- merged %>%
  group_by(div_bin) %>%
  summarise(
    n = n(),
    mean_div = mean(cds_divergence),
    mean_absFC = mean(abs_logFC),
    se_absFC = sd(abs_logFC) / sqrt(n()),
    .groups = "drop"
  )

bin_means

fit_bin_uw <- lm(mean_absFC ~ mean_div, data = bin_means)
fit_bin_w  <- lm(mean_absFC ~ mean_div, data = bin_means, weights = n)

cat("=== Unweighted bin-mean regression ===\n")

## === Unweighted bin-mean regression ===

summary(fit_bin_uw)

## 
## Call:
## lm(formula = mean_absFC ~ mean_div, data = bin_means)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6646 -1.4613  0.3446  0.5852  3.4868 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   1.0327     1.2394   0.833   0.4289  
## mean_div      0.4157     0.1800   2.310   0.0497 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.256 on 8 degrees of freedom
## Multiple R-squared:  0.4001, Adjusted R-squared:  0.3251 
## F-statistic: 5.335 on 1 and 8 DF,  p-value: 0.0497

cat("\n=== Weighted bin-mean regression (weights = n) ===\n")

## 
## === Weighted bin-mean regression (weights = n) ===

summary(fit_bin_w)

## 
## Call:
## lm(formula = mean_absFC ~ mean_div, data = bin_means, weights = n)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5261 -2.6069  0.2269  1.3561  4.9254 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.4288     0.2509   5.694 0.000457 ***
## mean_div      0.3428     0.1152   2.976 0.017702 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.019 on 8 degrees of freedom
## Multiple R-squared:  0.5254, Adjusted R-squared:  0.4661 
## F-statistic: 8.858 on 1 and 8 DF,  p-value: 0.0177

bin_means_loc <- merged %>%
  group_by(div_bin, location) %>%
  summarise(
    n = n(),
    mean_div = mean(cds_divergence),
    mean_absFC = mean(abs_logFC),
    se_absFC = sd(abs_logFC) / sqrt(n()),
    .groups = "drop"
  )

# ANCOVA on bin means (abs_logFC)
fit_bin_loc <- lm(mean_absFC ~ mean_div + location, data = bin_means_loc, weights = n)
cat("=== Binned ANCOVA model (divergence + location) ===\n")

## === Binned ANCOVA model (divergence + location) ===

summary(fit_bin_loc)

## 
## Call:
## lm(formula = mean_absFC ~ mean_div + location, data = bin_means_loc, 
##     weights = n)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1430 -1.7783 -0.4776  1.6589  3.8680 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.72403    0.22265   7.743    2e-06 ***
## mean_div       0.34853    0.08741   3.987  0.00135 ** 
## locationInv4m -0.72020    0.29155  -2.470  0.02697 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.29 on 14 degrees of freedom
## Multiple R-squared:  0.5993, Adjusted R-squared:  0.5421 
## F-statistic: 10.47 on 2 and 14 DF,  p-value: 0.001658

# Predicted lines from additive model
pred_grid <- expand.grid(
  mean_div = seq(0, max(bin_means_loc$mean_div, na.rm = TRUE), length.out = 100),
  location = c("flanking", "Inv4m")
)
pred_grid$mean_absFC <- predict(fit_bin_loc, newdata = pred_grid)

# Stats for annotation (from binned ANCOVA)
bin_sum <- summary(fit_bin_loc)
bin_coefs <- coef(bin_sum)
bin_r2 <- round(bin_sum$adj.r.squared, 3)
bin_p_div <- bin_coefs["mean_div", "Pr(>|t|)"]
bin_p_loc <- bin_coefs["locationInv4m", "Pr(>|t|)"]

bin_stat_label <- paste0(
  "Region *p* = ", format(bin_p_loc, digits = 2), "<br>",
  "Divergence *p* = ", format(bin_p_div, digits = 2), "<br>",
  "adj. R\u00B2 = ", bin_r2
)

p_ancova <- ggplot(bin_means_loc, aes(x = mean_div, y = mean_absFC,
                                       color = location)) +
  geom_point(aes(size = n), alpha = 0.8) +
  geom_errorbar(aes(ymin = mean_absFC - se_absFC,
                    ymax = mean_absFC + se_absFC),
                width = 0.15, alpha = 0.4) +
  geom_line(data = pred_grid, linewidth = 0.9) +
  geom_richtext(
    data = data.frame(x = 0, y = 8.7),
    aes(x = x, y = y), label = bin_stat_label,
    inherit.aes = FALSE, hjust = 0, vjust = 1,
    size = 4.5, label.colour = NA, fill = NA
  ) +
  scale_color_manual(values = c("Inv4m" = "purple4", "flanking" = "plum"),
                     name = "Region") +
  scale_size_continuous(name = "Genes per bin") +
  labs(
    x = "Mean CDS divergence (%)",
    y = "Mean |log2 fold change|",
    title = "Binned ANCOVA: shared slope (divergence), offset = location"
  ) +
  theme_bw(base_size = 25)

print(p_ancova)

Outlier genes (excess DE beyond divergence)

Genes with large positive residuals have more expression change than expected from their sequence divergence alone — candidates for trans-regulation or CNV effects.

outliers <- merged %>%
  arrange(desc(abs(residual))) %>%
  dplyr::select(gene, location, pct_identity, cds_divergence,
                logFC, abs_logFC, adj.P.Val, residual) %>%
  head(20)

outliers

Distribution of divergence by region

# Pseudocount for zeros: one log10 unit below the minimum nonzero value
min_nz <- min(merged$cds_divergence[merged$cds_divergence > 0])
pseudo_zero <- min_nz / 10

merged_div_plot <- merged %>%
  mutate(div_plot = ifelse(cds_divergence == 0, pseudo_zero, cds_divergence))

# Wilcoxon test for annotation
wt <- wilcox.test(cds_divergence ~ location, data = merged)

p_beeswarm <- ggplot(merged_div_plot, aes(x = location, y = div_plot, color = location)) +
  geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0, outlier.shape = NA) %>%
    with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  geom_quasirandom(size = 2) %>%
    with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  scale_color_manual(values = c("Inv4m" = "purple4", "flanking" = "plum")) +
  scale_x_discrete(labels = c("Inv4m" = "*Inv4m*", "flanking" = "flanking")) +
  annotate("segment", x = 1, xend = 2, y = max(merged_div_plot$div_plot) * 1.3,
           yend = max(merged_div_plot$div_plot) * 1.3) +
  geom_richtext(
    data = data.frame(x = 1.5, y = max(merged_div_plot$div_plot) * 1.6),
    aes(x = x, y = y),
    label = paste0("Wilcoxon *p* = ", format(wt$p.value, digits = 2)),
    inherit.aes = FALSE, size = 4, label.colour = NA, fill = NA
  ) +
  scale_y_continuous(trans = "log10", expand = expansion(mult = c(0.1, 0.15))) +
  labs(x = NULL, y = "CDS divergence (%)") +
  theme_bw(base_size = 25) +
  theme(
    legend.position = "none",
    axis.text.x = element_markdown(size = 13)
  )

print(p_beeswarm)

cat("=== Wilcoxon rank-sum test ===\n")

## === Wilcoxon rank-sum test ===

print(wt)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cds_divergence by location
## W = 6173, p-value = 0.003436
## alternative hypothesis: true location shift is not equal to 0

cat("\n=== Welch t-test on log-transformed divergence ===\n")

## 
## === Welch t-test on log-transformed divergence ===

merged_div_plot$log_div <- log(merged_div_plot$div_plot)
print(t.test(log_div ~ location, data = merged_div_plot))

## 
##  Welch Two Sample t-test
## 
## data:  log_div by location
## t = -4.4951, df = 213.79, p-value = 1.139e-05
## alternative hypothesis: true difference in means between group flanking and group Inv4m is not equal to 0
## 95 percent confidence interval:
##  -1.3249481 -0.5171673
## sample estimates:
## mean in group flanking    mean in group Inv4m 
##            -0.85514349             0.06591421

Extreme value analysis

Rationale

The ANCOVA and quantile regressions model the average or median relationship between divergence and expression change. But our interest is in the DEGs — the genes in the tail of the distribution that show extreme expression responses. Extreme value theory (EVT) models this tail behavior directly.

The approach:

Regress logFC ~ cds_divergence to remove the divergence effect
Take |residual| to pool both tails (comparable to the abs_logFC additive model)
Fit a Generalized Pareto Distribution (GPD) to the extreme |residuals|
Compare GPD parameters between Inv4m and flanking regions
If the scale or shape parameter differs, the inversion produces heavier tails (more extreme DE) than divergence alone predicts

Residuals from divergence model

# Robust residuals from divergence-only model, then take absolute value
# to pool both tails — directly comparable to the abs_logFC additive model
merged$robust_resid <- merged$logFC - predict(
  rlm(logFC ~ cds_divergence, data = merged),
  newdata = merged
)
merged$abs_resid <- abs(merged$robust_resid)

cat("Absolute residual summary:\n")

## Absolute residual summary:

summary(merged$abs_resid)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.03874  0.56026  1.24215  1.86948  2.49406 10.08529

GPD threshold selection

# Use 75th percentile of |residual| as threshold
thresh <- quantile(merged$abs_resid, 0.75)

cat("Threshold (75th pct of |residual|):", round(thresh, 3), "\n")

## Threshold (75th pct of |residual|): 2.494

cat("Total exceedances:", sum(merged$abs_resid > thresh), "\n")

## Total exceedances: 64

cat("  Inv4m:", sum(merged$abs_resid[merged$location == "Inv4m"] > thresh), "\n")

##   Inv4m: 20

cat("  flanking:", sum(merged$abs_resid[merged$location == "flanking"] > thresh), "\n")

##   flanking: 44

GPD fits by location (pooled tails)

gpd_results <- list()

for (loc in c("Inv4m", "flanking")) {
  d <- merged %>% filter(location == loc)
  exc <- d$abs_resid[d$abs_resid > thresh]

  if (length(exc) >= 5) {
    fit <- fpot(d$abs_resid, threshold = thresh, model = "gpd")
    gpd_results[[loc]] <- list(
      location = loc,
      n_exceed = length(exc),
      scale = fit$estimate["scale"],
      shape = fit$estimate["shape"],
      se_scale = fit$std.err["scale"],
      se_shape = fit$std.err["shape"],
      nllh = fit$deviance  # negative log-likelihood
    )
    cat("===", loc, "===\n")
    cat("  Exceedances:", length(exc), "\n")
    cat("  Scale:", round(fit$estimate["scale"], 4),
        "±", round(fit$std.err["scale"], 4), "\n")
    cat("  Shape:", round(fit$estimate["shape"], 4),
        "±", round(fit$std.err["shape"], 4), "\n\n")
  }
}

## === Inv4m ===
##   Exceedances: 20 
##   Scale: 1.5864 ± 0.5794 
##   Shape: -0.0331 ± 0.2889 
## 
## === flanking ===
##   Exceedances: 44 
##   Scale: 2.8992 ± 0.5709 
##   Shape: -0.3001 ± 0.1345

gpd_df <- bind_rows(lapply(gpd_results, as.data.frame))
gpd_df

Compare tail behavior

gpd_long <- gpd_df %>%
  pivot_longer(cols = c(scale, shape), names_to = "parameter", values_to = "estimate") %>%
  mutate(se = ifelse(parameter == "scale", se_scale, se_shape))

ggplot(gpd_long, aes(x = location, y = estimate, color = location)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = estimate - 1.96 * se, ymax = estimate + 1.96 * se),
                width = 0.15) +
  facet_wrap(~ parameter, scales = "free_y") +
  scale_color_manual(values = c("Inv4m" = "purple4", "flanking" = "plum")) +
  labs(
    x = NULL, y = "GPD parameter estimate",
    title = "Generalized Pareto fits (pooled tails): |residual| by region",
    caption = "Error bars: 95% CI from MLE standard errors"
  ) +
  theme_bw(base_size = 25) +
  theme(legend.position = "none")

Likelihood ratio test

Compare a model where both regions share GPD parameters vs separate fits.

# Pooled fit (all locations combined)
fit_pooled <- fpot(merged$abs_resid, threshold = thresh, model = "gpd")

# Sum of separate negative log-likelihoods
nllh_separate <- sum(sapply(gpd_results, function(x) x$nllh))
nllh_pooled <- fit_pooled$deviance

# LRT: 2 * (loglik_separate - loglik_pooled), df = 2 (extra scale + shape)
lrt_stat <- 2 * (nllh_pooled - nllh_separate)
lrt_pval <- pchisq(lrt_stat, df = 2, lower.tail = FALSE)

cat("=== Likelihood ratio test: shared vs separate GPD ===\n")

## === Likelihood ratio test: shared vs separate GPD ===

cat("  Pooled nllh:", round(nllh_pooled, 2), "\n")

##   Pooled nllh: 215.01

cat("  Separate nllh:", round(nllh_separate, 2), "\n")

##   Separate nllh: 212.37

cat("  LRT statistic:", round(lrt_stat, 3), "\n")

##   LRT statistic: 5.283

cat("  p-value:", format(lrt_pval, digits = 4), "\n")

##   p-value: 0.07127

Exceedance probability by location

exc_table <- merged %>%
  group_by(location) %>%
  summarise(
    n = n(),
    n_extreme = sum(abs_resid > thresh),
    pct_extreme = round(100 * n_extreme / n, 1),
    .groups = "drop"
  )

exc_table

cat("\n=== Fisher's test: exceedance rate by location ===\n")

## 
## === Fisher's test: exceedance rate by location ===

mat <- matrix(c(
  exc_table$n_extreme[exc_table$location == "Inv4m"],
  exc_table$n[exc_table$location == "Inv4m"] - exc_table$n_extreme[exc_table$location == "Inv4m"],
  exc_table$n_extreme[exc_table$location == "flanking"],
  exc_table$n[exc_table$location == "flanking"] - exc_table$n_extreme[exc_table$location == "flanking"]
), nrow = 2, dimnames = list(c("extreme", "not"), c("Inv4m", "flanking")))
print(fisher.test(mat))

## 
##  Fisher's Exact Test for Count Data
## 
## data:  mat
## p-value = 0.05657
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2789642 1.0162047
## sample estimates:
## odds ratio 
##  0.5394315

QQ plots

par(mfrow = c(1, 2))

for (loc in c("Inv4m", "flanking")) {
  d <- merged %>% filter(location == loc)
  if (sum(d$abs_resid > thresh) >= 5) {
    fit <- fpot(d$abs_resid, threshold = thresh, model = "gpd")
    plot(fit, which = 4, main = paste(loc, "- GPD QQ (pooled tails)"))
  }
}

Save outputs

write.csv(merged, file.path(paths$intermediate, "sequence_divergence_vs_DE.csv"),
          row.names = FALSE)

ggsave(file.path(paths$figures, "sequence_divergence_vs_DE.pdf"),
       p_scatter, width = 8, height = 6)
ggsave(file.path(paths$figures, "sequence_divergence_vs_DE.svg"),
       p_scatter, width = 8, height = 6, fix_text_size = TRUE)

cat("Outputs saved to:", paths$intermediate, "and", paths$figures, "\n")

## Outputs saved to: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/intermediate and /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/figures

Multipanel figure

library(cowplot)

# Load logFC by region panel from compare_r2_sliding_window_regions.Rmd
p_logfc_region <- readRDS(file.path(paths$intermediate, "panel_logfc_by_region.rds"))

# Remove titles from plots for the assembled figure
p_scatter_notitle <- p_scatter + labs(title = NULL)
p_beeswarm_notitle <- p_beeswarm + labs(title = NULL)

# Top row: (A) logFC by region | (B) CDS divergence beeswarm
top_row <- plot_grid(
  p_logfc_region + labs(title = NULL),
  p_beeswarm_notitle,
  nrow = 1,
  rel_widths = c(0.5, 0.5),
  labels = c("A", "B"),
  label_size = 18
)

# Bottom row: (C) |logFC| vs CDS divergence scatter (full width)
bottom_row <- plot_grid(
  p_scatter_notitle,
  nrow = 1,
  labels = c("C"),
  label_size = 18
)

# Stack rows
p_combined <- plot_grid(
  top_row,
  bottom_row,
  nrow = 2,
  rel_heights = c(1, 1)
)

print(p_combined)

ggsave(file.path(paths$figures, "divergence.pdf"),
       p_combined, width = 14, height = 12)
ggsave(file.path(paths$figures, "divergence.png"),
       p_combined, width = 14, height = 12, dpi = 300)
ggsave(file.path(paths$figures, "divergence.svg"),
       p_combined, width = 14, height = 12, fix_text_size = TRUE)

Zero-divergence DEGs and flanking enrichment

# Genes with 0% CDS divergence
n_zero_inv4m <- sum(merged$cds_divergence == 0 & merged$location == "Inv4m")
n_zero_flank <- sum(merged$cds_divergence == 0 & merged$location == "flanking")
n_nonzero_inv4m <- sum(merged$cds_divergence > 0 & merged$location == "Inv4m")
n_nonzero_flank <- sum(merged$cds_divergence > 0 & merged$location == "flanking")

mat_zero <- matrix(
  c(n_zero_inv4m, n_nonzero_inv4m,
    n_zero_flank, n_nonzero_flank),
  nrow = 2,
  dimnames = list(c("zero_div", "nonzero_div"), c("Inv4m", "flanking"))
)

cat("Contingency table:\n")

## Contingency table:

print(mat_zero)

##             Inv4m flanking
## zero_div        1       20
## nonzero_div   106      127

ft_zero <- fisher.test(mat_zero)
cat("\nFisher's exact test: zero-divergence genes by location\n")

## 
## Fisher's exact test: zero-divergence genes by location

print(ft_zero)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  mat_zero
## p-value = 0.0001254
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.001436621 0.389721861
## sample estimates:
## odds ratio 
## 0.06031275

cat("\nAll", n_zero_inv4m + n_zero_flank, "zero-divergence genes are DEGs (FDR < 0.05)\n")

## 
## All 21 zero-divergence genes are DEGs (FDR < 0.05)

cat("  Inv4m:", n_zero_inv4m, "\n")

##   Inv4m: 1

cat("  flanking:", n_zero_flank, "\n")

##   flanking: 20

Results paragraph and figure caption

RESULTS PARAGRAPH

To test whether differential expression in the introgression is explained by
coding sequence divergence between the B73 and Mi21 haplotypes, we computed
CDS percent identity from LAST pairwise alignments for 254 expressed
genes with ortholog hits in the shared introgression.
Genes inside the inversion had significantly higher CDS divergence than flanking
genes (median 1.1\% vs 0.75\%;
Wilcoxon $p$ = 0.0034; Fig.~\ref{fig::divergence} A), consistent with
suppressed recombination preserving ancestral \textit{mexicana} divergence within the
inversion.
An additive linear model ($|\text{log}_2\text{FC}| \sim \text{CDS divergence} + \text{region}$)
explained 10.4\% of the variance
in expression change (adj.\ $R^2$ = 0.104;
divergence $p$ = 2.4e-06;
region $p$ = 0.0023; Fig.~\ref{fig::divergence} B).
After accounting for divergence, genes inside the inversion showed smaller
absolute expression changes than flanking genes, indicating that the inversion
dampens rather than amplifies expression divergence.
Notably, 21 genes had identical CDS between B73 and
Mi21, and all were significantly differentially expressed (FDR $<$ 0.05).
Of these, 20 of 21 were located in the
flanking region, a significant overrepresentation
(Fisher's exact test, $p$ = 0.00013,
OR = 0.1).
Because these genes lack coding sequence divergence, their differential expression
is most parsimoniously attributed to \textit{trans}-regulatory effects originating from
divergent loci elsewhere in the introgression.

FIGURE FLOAT

\begin{figure}[!ht]
\includegraphics[width=\linewidth]{figs/sequence_divergence_multipanel.pdf}
\caption{\textbf{CDS sequence divergence and expression change in the \invfour introgression.}
\textbf{(A)} Distribution of CDS divergence (100 $-$ \% identity) for expressed genes inside the inversion (\invfour, $n$ = 107) and in the flanking introgression (flanking, $n$ = 147).
Points show individual genes (beeswarm); boxes show median and interquartile range.
$y$-axis is log$_{10}$-scaled.
Wilcoxon rank-sum $p$ = 0.0034.
\textbf{(B)} Absolute log$_2$ fold change vs CDS divergence.
Parallel lines show the additive ANCOVA model (shared slope for divergence effect, vertical offset for region effect).
Region $p$ = 0.0023;
divergence $p$ = 2.4e-06;
adj.\ $R^2$ = 0.104.}
\label{fig::divergence}
\end{figure}

MODEL COMPARISON TABLE

\begin{table}[!ht]
\caption{\textbf{Model comparison for the relationship between CDS divergence and expression change in the \invfour introgression.}
Five linear models predicting $|\text{log}_2\text{FC}|$ were compared by BIC and AIC.
The additive model (CDS divergence + region) provided the best fit, with the interaction term adding no improvement.
Region distinguishes genes inside \invfour from the flanking introgression.}
\label{tab::divergence_models}
\centering
\begin{tabular}{@{}lcccc@{}}
\toprule
\textbf{Model} & \textbf{df} & \textbf{AIC} & \textbf{BIC} & \textbf{adj.\ $R^2$} \\ \midrule
CDS divergence + region & 3 & 1032.2 & 1046.3 & 0.1037 \\
CDS divergence & 2 & 1039.6 & 1050.2 & 0.0734 \\
CDS divergence $\times$ region & 4 & 1033.2 & 1050.9 & 0.1035 \\
Region & 2 & 1052.8 & 1063.4 & 0.0242 \\
Null (intercept only) & 1 & 1058 & 1065.1 & 0 \\ \bottomrule
\end{tabular}
\end{table}

Session info

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] cowplot_1.2.0        ggfx_1.0.3           ggbeeswarm_0.7.3    
##  [4] ggtext_0.1.2         evd_2.3-7.1          quantreg_6.1        
##  [7] SparseM_1.84-2       MASS_7.3-65          GenomicRanges_1.62.1
## [10] Seqinfo_1.0.0        IRanges_2.44.0       S4Vectors_0.48.0    
## [13] BiocGenerics_0.56.0  generics_0.1.4       ggrepel_0.9.6       
## [16] lubridate_1.9.4      forcats_1.0.1        stringr_1.6.0       
## [19] dplyr_1.1.4          purrr_1.2.0          readr_2.1.6         
## [22] tidyr_1.3.2          tibble_3.3.0         ggplot2_4.0.1       
## [25] tidyverse_2.0.0      here_1.0.2          
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-9                rlang_1.1.6                
##  [3] magrittr_2.0.4              otel_0.2.0                 
##  [5] matrixStats_1.5.0           compiler_4.5.2             
##  [7] systemfonts_1.3.1           vctrs_0.6.5                
##  [9] pkgconfig_2.0.3             crayon_1.5.3               
## [11] fastmap_1.2.0               backports_1.5.0            
## [13] magick_2.9.0                XVector_0.50.0             
## [15] labeling_0.4.3              Rsamtools_2.26.0           
## [17] rmarkdown_2.30              markdown_2.0               
## [19] tzdb_0.5.0                  ragg_1.5.0                 
## [21] MatrixModels_0.5-4          xfun_0.55                  
## [23] cachem_1.1.0                cigarillo_1.0.0            
## [25] litedown_0.9                jsonlite_2.0.0             
## [27] DelayedArray_0.36.0         BiocParallel_1.44.0        
## [29] broom_1.0.11                parallel_4.5.2             
## [31] R6_2.6.1                    bslib_0.9.0                
## [33] stringi_1.8.7               RColorBrewer_1.1-3         
## [35] rtracklayer_1.70.1          car_3.1-3                  
## [37] jquerylib_0.1.4             Rcpp_1.1.0                 
## [39] SummarizedExperiment_1.40.0 knitr_1.51                 
## [41] Matrix_1.7-4                splines_4.5.2              
## [43] timechange_0.3.0            tidyselect_1.2.1           
## [45] dichromat_2.0-0.1           abind_1.4-8                
## [47] yaml_2.3.12                 codetools_0.2-20           
## [49] curl_7.0.0                  lattice_0.22-7             
## [51] Biobase_2.70.0              withr_3.0.2                
## [53] S7_0.2.1                    evaluate_1.0.5             
## [55] survival_3.8-3              xml2_1.5.1                 
## [57] Biostrings_2.78.0           ggpubr_0.6.2               
## [59] pillar_1.11.1               carData_3.0-5              
## [61] MatrixGenerics_1.22.0       rprojroot_2.1.1            
## [63] RCurl_1.98-1.17             hms_1.1.4                  
## [65] commonmark_2.0.0            scales_1.4.0               
## [67] glue_1.8.0                  tools_4.5.2                
## [69] BiocIO_1.20.0               ggsignif_0.6.4             
## [71] GenomicAlignments_1.46.0    XML_3.99-0.20              
## [73] grid_4.5.2                  beeswarm_0.4.0             
## [75] restfulr_0.0.16             vipor_0.4.7                
## [77] Formula_1.2-5               cli_3.6.5                  
## [79] textshaping_1.0.4           S4Arrays_1.10.1            
## [81] svglite_2.2.2               gtable_0.3.6               
## [83] rstatix_0.7.3               sass_0.4.10                
## [85] digest_0.6.39               SparseArray_1.10.8         
## [87] rjson_0.2.23                farver_2.1.2               
## [89] htmltools_0.5.9             lifecycle_1.0.4            
## [91] httr_1.4.7                  gridtext_0.1.5

Sequence Divergence vs Differential Expression in the Inv4m Introgression

Fausto Rodriguez

March 23, 2026