Overview

Purpose: Plot genotype per NIL. Characterize linkage disequilibrium between the Inv4m chromosomal inversion and genome-wide markers in maize near-isogenic lines.

Approach: Calculate Spearman correlations between a tagging SNP within Inv4m (S4_181558683) and all genotyped markers across the genome. Identify regions with significant LD and visualize genotype structure.

Expected outcome: Identify the physical extent of linkage with Inv4m, detect recombination breakpoints, and characterize flanking introgression segments.

Load Libraries

library(vcfR)
library(dplyr)
library(ggplot2)
library(ggtext)
library(rtracklayer)
library(GenomicRanges)
library(cowplot)

Define Genomic Regions

Purpose: Establish coordinates for Inv4m inversion and shared introgression.

Approach: Extract gene boundaries from B73 v5.0 annotation to define inversion breakpoints identified in MCScan analysis.

# Load B73 annotation
myGFF <- file.path(paths$data, "Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3")
B73 <- rtracklayer::import(myGFF) %>%
  subset(type == "gene" & seqnames %in% 1:10)

genes <- as.data.frame(B73)
genes$ID <- gsub("gene:", "", genes$ID)

# Inv4m boundaries (from MCScan)
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]

# Shared introgression boundaries (empirical)
introgression_start <- 157012149
introgression_end <- 195900523

# Create GRanges objects
inv4m <- GRanges(
  seqnames = "4",
  ranges = IRanges(start = inv4m_start, end = inv4m_end),
  strand = "+"
)

shared_introgression <- GRanges(
  seqnames = "4",
  ranges = IRanges(start = introgression_start, end = introgression_end),
  strand = "+"
)

# Region sizes
inv4m_size <- round((inv4m_end - inv4m_start) / 1e6, 1)
introgression_size <- round((introgression_end - introgression_start) / 1e6, 1)

# Count genes in regions
olap_inv4m <- findOverlaps(inv4m, B73, ignore.strand = TRUE)
inv4m_gene_ids <- genes$ID[subjectHits(olap_inv4m)]

olap_introgression <- findOverlaps(shared_introgression, B73, ignore.strand = TRUE)
shared_introgression_gene_ids <- genes$ID[subjectHits(olap_introgression)]

flanking_introgression_gene_ids <- shared_introgression_gene_ids[
  !(shared_introgression_gene_ids %in% inv4m_gene_ids)
]

cat("Inv4m inversion:", inv4m_size, "Mb |", length(inv4m_gene_ids), "genes\n")

## Inv4m inversion: 15.2 Mb | 434 genes

cat("Shared introgression:", introgression_size, "Mb |", 
    length(shared_introgression_gene_ids), "genes\n")

## Shared introgression: 38.9 Mb | 1101 genes

cat("Flanking regions:", 
    round(introgression_size - inv4m_size, 1), "Mb |", 
    length(flanking_introgression_gene_ids), "genes\n")

## Flanking regions: 23.7 Mb | 667 genes

Import and Process VCF

Purpose: Load genotype data and convert to numeric matrix.

Approach: Read VCF, extract GT field, recode genotypes as 0/1/2 for B73/Het/Mi21.

# Read VCF
vcf <- read.vcfR(file.path(paths$data, "inv4m_PSU_imputed.vcf"))

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 10
##   header_line: 11
##   variant count: 19861
##   column count: 22
## Meta line 10 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 19861
##   Character matrix gt cols: 22
##   skip: 0
##   nrows: 19861
##   row_num: 0
## Processed variant 1000Processed variant 2000Processed variant 3000Processed variant 4000Processed variant 5000Processed variant 6000Processed variant 7000Processed variant 8000Processed variant 9000Processed variant 10000Processed variant 11000Processed variant 12000Processed variant 13000Processed variant 14000Processed variant 15000Processed variant 16000Processed variant 17000Processed variant 18000Processed variant 19000Processed variant: 19861
## All variants processed

gt <- extract.gt(vcf, element = "GT")

# Recode genotypes
gt[gt == "0/0"] <- 0
gt[gt == "0/1" | gt == "1/0"] <- 1
gt[gt == "1/1"] <- 2

# Convert to numeric matrix
ngt <- matrix(as.integer(gt), ncol = ncol(gt))
colnames(ngt) <- colnames(gt)
rownames(ngt) <- rownames(gt)

# Sample ordering (CTRL then INV4 carriers, alphabetically within group)
sample_order <- c(
  sort(c("3095", "3113", "4113", "3083", "4107", "4122")),
  sort(c("3092", "3080", "3059", "3131", "4125", "4080", "4131"))
)

ngt <- ngt[, sample_order] %>% t()

# Genotype distribution (by site)
total_sites <- ncol(ngt) * nrow(ngt)
geno_freq <- table(ngt) / total_sites
cat("Genotype frequencies (by site):\n")

## Genotype frequencies (by site):

cat("  B73/B73 (0):", round(geno_freq["0"], 3), "\n")

##   B73/B73 (0): 0.636

cat("  B73/Mi21 (1):", round(geno_freq["1"], 3), "\n")

##   B73/Mi21 (1): 0.09

cat("  Mi21/Mi21 (2):", round(geno_freq["2"], 3), "\n")

##   Mi21/Mi21 (2): 0.273

cat("  Missing:", round(sum(is.na(ngt)) / total_sites, 4), "\n")

##   Missing: 0.0016

cat("Dimensions:", nrow(ngt), "samples ×", ncol(ngt), "markers\n")

## Dimensions: 13 samples × 19861 markers

Genotype Run Analysis

Purpose: Quantify introgression content across samples.

Approach: Calculate run-length encoding of genotypes, sum by allelic state.

get_genotype_runs <- function(x, sample_order) {
  result <- data.frame()
  chr_levels <- paste0("S", 1:10)
  
  for (ind_idx in sample_order) {
    result_ind <- data.frame()
    
    for (chr_idx in 1:10) {
      chr_prefix <- paste0("S", chr_idx, "_")
      chr_name <- paste0("S", chr_idx)
      chr <- x[grepl(chr_prefix, rownames(x), perl = TRUE), ]
      runs <- rle(chr[, ind_idx])
      
      chr_long <- data.frame(
        ind = ind_idx,
        chr = chr_name,
        pos = as.integer(gsub(chr_prefix, "", names(runs$values))),
        gt = runs$values
      ) %>%
        group_by(ind, chr) %>%
        mutate(
          lag = lag(pos, default = 0),
          span = (pos - lag) / 1e6,
          chr = factor(chr, levels = chr_levels),
          ind = factor(ind, levels = sample_order),
          missing = is.na(gt),
          missing_length = missing * span,
          B73 = as.numeric(gt == 0),
          B73_length = B73 * span,
          Het = as.numeric(gt == 1),
          Het_length = Het * span,
          Mi21 = as.numeric(gt == 2),
          Mi21_length = Mi21 * span
        )
      
      result_ind <- rbind(result_ind, chr_long)
    }
    
    result <- rbind(result, result_ind)
  }
  result
}

# Calculate runs
m <- t(ngt)
gt_runs <- get_genotype_runs(m, sample_order)

# Add grouping
gt_runs$group <- case_when(
  gt_runs$ind %in% c("3092", "3080", "3059", "3131", "4125", "4080", "4131") ~ 
    "Inv4m",
  .default = "CTRL"
)

# Genome size
chr_length <- gt_runs %>%
  group_by(chr) %>%
  summarise(chr_length = max(pos) / 1e6)
genome_size <- sum(chr_length$chr_length)

# Summarize by genotype
by_length <- gt_runs %>%
  inner_join(chr_length, by = "chr") %>%
  ungroup() %>%
  group_by(ind) %>%
  summarise(
    B73 = sum(B73_length, na.rm = TRUE),
    Het = sum(Het_length, na.rm = TRUE),
    Mi21 = sum(Mi21_length, na.rm = TRUE),
    missing = sum(missing_length, na.rm = TRUE),
    group = group[1]
  ) %>%
  mutate(group = factor(group, levels = c("CTRL", "Inv4m"))) %>%
  tidyr::pivot_longer(
    cols = c("B73", "Het", "Mi21", "missing"),
    names_to = "Genotype",
    values_to = "length"
  ) %>%
  mutate(pct = 100 * length / genome_size)

cat("Genome size:", round(genome_size, 1), "Mb\n")

## Genome size: 2128.1 Mb

cat("\nGenotype run length summary (Mb):\n")

## 
## Genotype run length summary (Mb):

by_length %>%
  group_by(Genotype) %>%
  summarise(
    mean_Mb = round(mean(length), 1),
    mean_pct = round(mean(pct), 2)
  ) %>%
  print()

## # A tibble: 4 × 3
##   Genotype mean_Mb mean_pct
##   <chr>      <dbl>    <dbl>
## 1 B73       1821.     85.6 
## 2 Het        183.      8.61
## 3 Mi21       117.      5.5 
## 4 missing      7.3     0.34

cat("\nBy introgression group:\n")

## 
## By introgression group:

by_length %>%
  group_by(group, Genotype) %>%
  summarise(
    mean_Mb = round(mean(length), 1),
    mean_pct = round(mean(pct), 2),
    .groups = "drop"
  ) %>%
  print()

## # A tibble: 8 × 4
##   group Genotype mean_Mb mean_pct
##   <fct> <chr>      <dbl>    <dbl>
## 1 CTRL  B73       1849.     86.9 
## 2 CTRL  Het        170       7.99
## 3 CTRL  Mi21       102       4.79
## 4 CTRL  missing      6.8     0.32
## 5 Inv4m B73       1796.     84.4 
## 6 Inv4m Het        195.      9.15
## 7 Inv4m Mi21       130.      6.1 
## 8 Inv4m missing      7.8     0.37

Genotype Visualization

Purpose: Create chromosome-wide haplotype plots.

Approach: Plot genotype runs as colored bars across chromosomes.

# Genotype color palette (3 colors at values 0, 1, 2)
gt_colors <- c("gold", "forestgreen", "purple4")
gt_labels <- c("B73/B73", "B73/Mi21", "Mi21/Mi21")

# Selection marker triangle
triangle <- data.frame(
  pos = 0,
  span = 181.6,
  ind = sample_order[length(sample_order)],
  chr = "4",
  label = factor("Selection marker<br>PZE4175660223")
)

# Coordinate limits
inversion_limit <- round(c(inv4m_start, inv4m_end) / 1e6, 2)
introgression_limit <- round(c(introgression_start, introgression_end) / 1e6, 2)

gt_plot <- gt_runs %>%
  ungroup() %>%
  mutate(chr = gsub("S", "", chr)) %>%
  ggplot(aes(y = ind, x = span, fill = gt)) +
  geom_col(width = 1) +
  scale_fill_gradientn(
    colours = gt_colors,
    values = scales::rescale(c(0, 1, 2)),
    limits = c(0, 2),
    breaks = c(0, 1, 2),
    labels = gt_labels,
    na.value = "gray85",
    guide = guide_legend(
      title = "Genotype",
      order = 1,
      reverse = TRUE,
      label.position = "left",
      label.hjust = 1,
      theme = theme(
        legend.title = element_text(size = 28, hjust = 0,
                                    margin = margin(t = 0, b = 6, unit = "pt")),
        legend.text = element_text(size = 25, vjust = 0.5),
        legend.key.width = unit(2, "lines"),
        legend.key.height = unit(2, "lines")
      )
    )
  ) +
  geom_point(
    data = triangle,
    mapping = aes(shape = label),
    position = position_nudge(y = 2),
    size = 3,
    color = "purple4",
    fill = "purple4"
  ) +
  scale_shape_manual(values = 25) +
  facet_wrap(factor(chr, 1:10) ~ ., strip.position = "left", ncol = 1) +
  coord_cartesian(xlim = c(0, 450), expand = FALSE) +
  xlab("Chromosome 4 Position [Mb]") +
  ylab("Chromosome") +
  guides(
    shape = guide_legend(
      order = 2,
      override.aes = list(size = 8),
      label.position = "left",
      label.hjust = 1,
      theme = theme(
        legend.title = element_blank(),
        legend.text = element_markdown(size = 25),
        legend.key.width = unit(2, "lines"),
        legend.background = element_rect(fill = "transparent", color = NA),
        legend.key = element_rect(fill = "transparent", color = NA)
      )
    )
  ) +
  ggpubr::theme_classic2(base_size = 30) +
  theme(
    legend.position = c(0.9 - 3.8 / (18 * 25.4), 0.83),
    legend.spacing.y = unit(-1.5, "lines"),
    legend.box.just = "right",
    strip.background = element_blank(),
    strip.text.y.left = element_text(angle = 0),
    axis.title.y = element_text(margin = margin(l = 2.5, unit = "mm")),
    panel.spacing.y = unit(0.5, "lines"),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    plot.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")
  )

gt_plot

# Random spike IDs for SVG coordinate calibration (unique, won't collide)
set.seed(42)
spike_id_a <- paste0("XK", paste0(sample(c(0:9, letters[1:6]), 12, replace = TRUE), collapse = ""))
spike_id_b <- paste0("XK", paste0(sample(c(0:9, letters[1:6]), 12, replace = TRUE), collapse = ""))

# Chr4 detailed inset
inlay <- gt_runs %>%
  filter(chr == "S4") %>%
  ungroup() %>%
  mutate(chr = gsub("S", "", chr)) %>%
  ggplot(aes(x = span, y = ind, fill = gt)) +
  geom_col(width = 1) +
  scale_fill_gradientn(
    colours = gt_colors,
    values = scales::rescale(c(0, 1, 2)),
    limits = c(0, 2),
    na.value = "gray85"
  ) +
  annotate("point",
    x = 181.6, y = 14.4,
    shape = 25, size = 6, color = "purple4", fill = "purple4"
  ) +
  # Invisible diagonal spikes for SVG coordinate calibration
  annotate("text", x = 160, y = 2,  label = spike_id_a, alpha = 0, size = 1) +
  annotate("text", x = 190, y = 12, label = spike_id_b, alpha = 0, size = 1) +
  coord_cartesian(xlim = c(150, 200)) +
  xlab(NULL) +
  ylab("NIL") +
  ggpubr::theme_classic2(base_size = 20) +
  theme(
    legend.position = "none",
    strip.background = element_blank(),
    strip.text = element_blank(),
    plot.title = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    plot.background = element_rect(fill = "transparent", color = NA),
    plot.margin = margin(t = 0, r = 0, b = 26, l = 0, unit = "mm")
  )

# Combine plots with annotation brackets
# First build base composite, then save SVG skeleton for coordinate extraction,
# then add annotations.

plot_base <- ggdraw() +
  draw_plot(gt_plot) +
  draw_plot(inlay,
    x = 0.45, y = 0.03 - 19 / (8 * 25.4),
    width = 0.6, height = 0.6,
    scale = 0.8
  )

# Save skeleton SVG for coordinate calibration
ggsave(file.path(paths$figures, "Fig1_panel_EF_skeleton.svg"),
       plot = plot_base, width = 18, height = 8, device = "svg",
       fix_text_size = TRUE)

plot_base

Extract annotation bracket positions from the skeleton SVG spike markers.

# --- Parse spike markers from skeleton SVG ---
# spike_id_a is at data (160 Mb, y=2), spike_id_b at (190 Mb, y=12)
svg_path <- file.path(paths$figures, "Fig1_panel_EF_skeleton.svg")
svg_lines <- readLines(svg_path)

extract_spike <- function(lines, label) {
  idx <- grep(label, lines, fixed = TRUE)
  stopifnot(length(idx) == 1)
  line <- lines[idx]
  sx <- as.numeric(sub(".*x='([^']+)'.*", "\\1", line))
  sy <- as.numeric(sub(".*y='([^']+)'.*", "\\1", line))
  c(x = sx, y = sy)
}

spike_a <- extract_spike(svg_lines, spike_id_a)  # data: (160, 2)
spike_b <- extract_spike(svg_lines, spike_id_b)  # data: (190, 12)

svg_header <- svg_lines[grep("viewBox", svg_lines)[1]]
svg_W <- as.numeric(sub(".*viewBox='0 0 ([0-9.]+) .*", "\\1", svg_header))
svg_H <- as.numeric(sub(".*viewBox='0 0 [0-9.]+ ([0-9.]+)'.*", "\\1", svg_header))

# Linear mapping: data Mb → NPC x  (via SVG pixels)
# SVG x = a_x * Mb + b_x, then NPC = SVG_x / svg_W
a_x <- (spike_b["x"] - spike_a["x"]) / (190 - 160)
b_x <- spike_a["x"] - a_x * 160

mb_to_npc <- function(mb) {
  svg_x <- unname(a_x * mb + b_x)
  svg_x / svg_W
}

# Linear mapping: data y-index → NPC y  (SVG y is top-down, NPC is bottom-up)
# SVG y = a_y * data_y + b_y, then NPC = 1 - SVG_y / svg_H
a_y <- (spike_b["y"] - spike_a["y"]) / (12 - 2)
b_y <- spike_a["y"] - a_y * 2

data_y_to_npc <- function(dy) {
  svg_y <- unname(a_y * dy + b_y)
  1 - svg_y / svg_H
}

cat(sprintf("Spike A SVG: (%.2f, %.2f)\nSpike B SVG: (%.2f, %.2f)\n",
            spike_a["x"], spike_a["y"], spike_b["x"], spike_b["y"]))

## Spike A SVG: (851.23, 460.93)
## Spike B SVG: (1156.04, 333.21)

cat(sprintf("SVG canvas: %.0f x %.0f\n", svg_W, svg_H))

## SVG canvas: 1296 x 576

# --- Compute NPC positions for annotations ---
col_inv4m <- "purple4"

introg_left  <- mb_to_npc(introgression_start / 1e6)
introg_right <- mb_to_npc(introgression_end / 1e6)
inv4m_left   <- mb_to_npc(inv4m_start / 1e6)
inv4m_right  <- mb_to_npc(inv4m_end / 1e6)

introg_size_mb <- round((introgression_end - introgression_start) / 1e6)
inv4m_size_mb  <- round((inv4m_end - inv4m_start) / 1e6)

# y: bracket above inlay top, rectangle spans inlay data area
# Inlay y-data ranges ~0.5 (bottom bar) to ~14 (top bar)
y_inv4m_top <- data_y_to_npc(14)    # top of heatmap
y_inv4m_bot <- data_y_to_npc(0.5)   # bottom of heatmap
y_introg    <- y_inv4m_top + 0.025   # bracket just above

cat(sprintf("introg_left=%.4f  introg_right=%.4f\n", introg_left, introg_right))

## introg_left=0.6334  introg_right=0.9383

cat(sprintf("inv4m_left=%.4f   inv4m_right=%.4f\n", inv4m_left, inv4m_right))

## inv4m_left=0.7578   inv4m_right=0.8774

cat(sprintf("y_inv4m_top=%.4f  y_inv4m_bot=%.4f  y_introg=%.4f\n",
            y_inv4m_top, y_inv4m_bot, y_introg))

## y_inv4m_top=0.4659  y_inv4m_bot=0.1665  y_introg=0.4909

# Layout (bottom to top):
#   inlay heatmap top edge = y_inv4m_top
#   Inv4m bracket just above heatmap
#   Inv4m label above that
#   Shared introgression bracket above Inv4m label
#   Shared introgression label at top
# Position bracket stack above inlay heatmap top
line_h           <- 1.2 * 25 / svg_H   # 1.2 lines in NPC
triangle_h       <- 3 * 72 / 25.4 / svg_H  # NPC height of size=3 triangle glyph
inv4m_tick_h     <- 0.04 - triangle_h   # shorten arms by triangle height
shift_down       <- 1.2 * (0.04)        # close gap to heatmap (original arm length)
y_inv4m_arms_bot <- y_inv4m_top + line_h - shift_down

mm_npc           <- function(mm) mm * 72 / 25.4 / svg_H

y_inv4m_bracket  <- y_inv4m_arms_bot + inv4m_tick_h
y_inv4m_label    <- y_inv4m_bracket + mm_npc(7.7)  # +7.7mm above bracket bar

# Introgression bracket: raise bar so label-to-bar gap = 6.7mm (matching Inv4m)
# Label stays at bar + 20.2mm, so bar must be 20.2 - 6.7 = 13.5mm above Inv4m label
y_introg_bracket <- y_inv4m_label + mm_npc(14.0)
introg_tick_h    <- y_introg_bracket - y_inv4m_arms_bot  # arms reach inv4m level

# Helper: make a ⊓-shaped bracket grob (left tick ↓, horizontal, right tick ↓)
make_bracket <- function(xl, xr, y_top, tick, col, lwd) {
  draw_grob(grid::polylineGrob(
    x = unit(c(xl, xl, xr, xr), "npc"),
    y = unit(c(y_top - tick, y_top, y_top, y_top - tick), "npc"),
    gp = grid::gpar(col = col, lwd = lwd, lineend = "square", linejoin = "mitre")
  ))
}

plot_with_inset <- plot_base +
  # --- Inv4m bracket (arms down, above triangle marker) ---
  make_bracket(inv4m_left, inv4m_right, y_inv4m_bracket, inv4m_tick_h,
               col_inv4m, 6) +
  draw_label(
    expression(italic("Inv4m") ~ "15 Mb"),
    x = (inv4m_left + inv4m_right) / 2, y = y_inv4m_label,
    size = 30, fontface = "plain", color = col_inv4m, hjust = 0.5) +
  # --- Shared introgression bracket (arms reach inv4m level) ---
  make_bracket(introg_left, introg_right, y_introg_bracket, introg_tick_h,
               col_inv4m, 6) +
  draw_label(
    paste0("Shared introgression ", introg_size_mb, " Mb"),
    x = (introg_left + introg_right) / 2, y = y_introg_bracket + mm_npc(7.7),
    size = 30, fontface = "plain", color = col_inv4m, hjust = 0.5) +
  # Panel labels
  draw_label("E", x = 0.005, y = 0.995, size = 45, fontface = "bold",
             hjust = 0, vjust = 1) +
  draw_label("F", x = 0.505, y = 0.52, size = 45, fontface = "bold",
             hjust = 0, vjust = 1)

plot_with_inset

saveRDS(plot_with_inset, file.path(paths$intermediate, "fig1_panel_E.rds"))
saveRDS(gt_plot, file.path(paths$intermediate, "fig1_panel_E_main.rds"))
saveRDS(inlay, file.path(paths$intermediate, "fig1_panel_E_inlay.rds"))

ggsave(file.path(paths$figures, "Fig1_panel_EF.png"),
       plot = plot_with_inset, width = 18, height = 8, dpi = 300, bg = "white")
ggsave(file.path(paths$figures, "Fig1_panel_EF.svg"),
       plot = plot_with_inset, width = 18, height = 8, device = "svg",
       fix_text_size = TRUE)

Panel F: Selected NIL Haplotypes

Purpose: Create Panel F for Figure 3 showing haplotype map for 3 representative NIL lines.

# Selected samples for Panel F (representative NILs)
selected3 <- c("3131", "4125", "4131")

# Create Panel F plot
gt3_plot <- gt_runs %>%
  filter(chr == "S4", ind %in% selected3) %>%
  droplevels() %>%
  ungroup() %>%
  mutate(
    ind = factor(ind, levels = selected3),
    chr = gsub("S", "", chr)
  ) %>%
  ggplot(aes(x = span, y = ind, fill = gt)) +
  ylab("Line") +
  geom_col(width = 1) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_gradientn(
    colours = gt_colors,
    values = scales::rescale(c(0, 1, 2)),
    limits = c(0, 2),
    breaks = c(0, 1, 2),
    labels = gt_labels,
    na.value = "gray85"
  ) +
  guides(fill = guide_legend(title = "Genotype", ncol = 3)) +
  coord_cartesian(xlim = c(150, 200)) +
  annotate("point",
    x = 181.6, y = 4,
    shape = 25, size = 7, color = "purple4", fill = "purple4"
  ) +
  annotate("text",
    x = c(inversion_limit, introgression_limit), y = 4,
    label = rep("|", 4), size = 7
  ) +
  annotate("text",
    x = c(inversion_limit, introgression_limit), y = 0,
    label = rep("|", 4), size = 7
  ) +
  ggpubr::theme_classic2(base_size = 25) +
  theme(
    legend.position = "top",
    legend.byrow = TRUE,
    legend.text = element_text(size = 20),
    legend.box.spacing = unit(0, "pt"),
    plot.margin = unit(c(0, 0, 0, 0), "cm"),
    strip.background = element_blank(),
    strip.text = element_blank(),
    plot.title = element_text(color = "white"),
    axis.title.y = element_text(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    plot.background = element_rect(fill = "transparent", color = NA)
  )

# Save as Panel F for Figure 3
ggsave(
  file.path(paths$figures, "panels", "panel_F_haplotype.png"),
  plot = gt3_plot,
  width = 8, height = 3, dpi = 300, bg = "white"
)
saveRDS(gt3_plot, file.path(paths$intermediate, "panel_F_haplotype.rds"))

gt3_plot

# Background means
bg_mean <- by_length %>%
  ungroup() %>%
  group_by(Genotype) %>%
  summarise_at(.vars = c("length", "pct"), .funs = mean)

p_genotype <- by_length %>%
  ggplot(aes(x = group, y = pct)) +
  geom_hline(data = bg_mean, aes(yintercept = pct), linetype = 2) +
  geom_boxplot(outlier.colour = "white", width = 0.25) +
  ggbeeswarm::geom_quasirandom(size = 2) +
  facet_wrap(. ~ Genotype, scales = "free_y", ncol = 4) +
  xlab("Introgression group") +
  ylab("% of genome length") +
  ggpubr::theme_classic2(base_size = 20) +
  theme(strip.background = element_blank())

p_genotype

Fixed Alternate Allele Analysis

Purpose: Quantify markers where the Mi21 (alternate) allele is fixed across all individuals, regardless of Inv4m genotype group. These markers cannot represent Inv4m-linked introgression because they are shared between CTRL and Inv4m carriers.

Context: Liang & Schnable (2016) showed that 2.3% of the B73 gene space carries divergent haplotypes across seed stocks, concentrated on chromosomes 2, 4, 5, and 6. Fixed alternate alleles in our NILs may reflect B73 seed stock divergence from the v5 reference rather than true Mi21 introgression.

# Markers where all 13 individuals are homozygous for the alternate (Mi21) allele
# ngt is samples (rows) x markers (cols), coded 0/1/2
fixed_mi21 <- colSums(ngt, na.rm = FALSE) == (2 * nrow(ngt))
fixed_mi21[is.na(fixed_mi21)] <- FALSE

cat("Fixed alternate allele markers:", sum(fixed_mi21),
    "out of", ncol(ngt), "total\n\n")

## Fixed alternate allele markers: 1394 out of 19861 total

# Compute run-length span per chromosome
# A "consensus fixed" vector: 2 where all individuals are Mi21, 0 elsewhere
consensus <- ifelse(fixed_mi21, 2, 0)
names(consensus) <- colnames(ngt)

fixed_runlength <- data.frame()
for (chr_idx in 1:10) {
  chr_prefix <- paste0("S", chr_idx, "_")
  chr_mask <- grepl(chr_prefix, names(consensus))
  chr_vals <- consensus[chr_mask]
  positions <- as.numeric(gsub(chr_prefix, "", names(chr_vals)))

  runs <- rle(chr_vals)
  run_starts <- cumsum(c(1, runs$lengths[-length(runs$lengths)]))

  chr_mb <- 0
  chr_markers <- 0
  for (i in seq_along(runs$values)) {
    if (runs$values[i] == 2) {
      start_pos <- positions[run_starts[i]]
      end_idx <- run_starts[i] + runs$lengths[i] - 1
      if (end_idx < length(positions)) {
        end_pos <- positions[end_idx + 1]
      } else {
        end_pos <- positions[end_idx]
      }
      chr_mb <- chr_mb + (end_pos - start_pos) / 1e6
      chr_markers <- chr_markers + runs$lengths[i]
    }
  }
  fixed_runlength <- rbind(fixed_runlength, data.frame(
    CHR = chr_idx, n_snps = chr_markers, runlength_Mb = round(chr_mb, 1)
  ))
}

fixed_runlength <- fixed_runlength %>%
  mutate(CHR = factor(CHR, levels = 1:10)) %>%
  arrange(desc(n_snps))

knitr::kable(fixed_runlength,
             caption = "Fixed alternate allele markers and genotype run-length by chromosome")

Fixed alternate allele markers and genotype run-length by chromosome
	CHR	n_snps	runlength_Mb
S1_192352	1	650	18.5
S5_3621605	5	417	7.2
S6_16767043	6	193	3.0
S2_5281945	2	47	8.3
S3_4704898	3	29	5.9
S7_17158381	7	14	2.8
S9_5227052	9	12	1.0
S4_38429716	4	11	0.9
S10_34702956	10	11	0.6
S8_9251830	8	10	2.9

cat("\nTotal:", sum(fixed_runlength$n_snps), "markers,",
    sum(fixed_runlength$runlength_Mb), "Mb\n")

## 
## Total: 1394 markers, 51.1 Mb

# How many are on chromosome 4?
n_fixed_chr4 <- fixed_runlength$n_snps[fixed_runlength$CHR == "4"]
pct_fixed_chr4 <- round(100 * n_fixed_chr4 / sum(fixed_mi21), 1)
cat("\nChromosome 4:", n_fixed_chr4, "markers (", pct_fixed_chr4, "% )\n")

## 
## Chromosome 4: 11 markers ( 0.8 % )

cat("Outside chromosome 4:", sum(fixed_mi21) - n_fixed_chr4, "markers (",
    round(100 - pct_fixed_chr4, 1), "% )\n")

## Outside chromosome 4: 1383 markers ( 99.2 % )

Calculate Linkage Disequilibrium

Purpose: Quantify correlation between tagging SNP and all genome-wide markers.

Approach: Pearson correlation with S4_181558683 (within Inv4m). Pearson r² is the standard LD statistic for biallelic markers (Ardlie et al. 2002). FDR correction for multiple testing.

# Calculate correlations
tagging_snp <- "S4_181558683"
cor_matrix <- cor(
  ngt[, tagging_snp, drop = FALSE],
  ngt,
  use = "pairwise.complete.obs"
) %>% t()

# Statistical testing
cor_pvalues <- apply(ngt, 2, function(x) {
  cor.test(x, y = ngt[, tagging_snp], method = "pearson")$p.value
})

# Combine results
significance_threshold <- 0.005
inv4m_cor <- data.frame(
  marker = rownames(cor_matrix),
  r = cor_matrix[, 1]
) %>%
  tidyr::separate(marker, into = c("CHR", "BP"), sep = "_") %>%
  mutate(
    BP = as.numeric(BP),
    CHR = gsub("S", "", CHR) %>% factor(levels = as.character(1:10)),
    p = cor_pvalues,
    FDR = p.adjust(p, method = "fdr"),
    R2 = r^2,
    is_significant = FDR < significance_threshold
  ) %>%
  tibble::rownames_to_column("ID")

cat("Total markers:", nrow(inv4m_cor), "\n")

## Total markers: 19861

cat("Significant (FDR <", significance_threshold, "):", 
    sum(inv4m_cor$is_significant, na.rm = TRUE), 
    sprintf("(%.1f%%)", 100 * mean(inv4m_cor$is_significant, na.rm = TRUE)), "\n")

## Significant (FDR < 0.005 ): 7683 (44.8%)

cat("Perfect LD (R² = 1):", sum(inv4m_cor$R2 == 1, na.rm = TRUE), "\n")

## Perfect LD (R² = 1): 3758

hist(inv4m_cor$R2, 
     main = "R² distribution with Inv4m tagging marker\n19,861 markers total",
     xlab = "R²", 
     ylim = c(0, 8000),
     col = "skyblue",
     breaks = 30)

Identify Correlated Regions

Purpose: Map SNPs with significant LD to genomic features.

Approach: Create GRanges from significant SNPs, classify by position relative to introgression boundaries.

# All significant SNPs
correlated_SNP <- with(inv4m_cor %>% filter(is_significant), {
  GRanges(
    seqnames = CHR,
    ranges = IRanges(start = BP, end = BP),
    ID = ID
  )
})

# Outside shared introgression
outside_correlated <- inv4m_cor %>%
  filter(is_significant) %>%
  filter(!(CHR == 4 & BP >= introgression_start & BP <= introgression_end))

# Anti-correlated region (upstream of introgression)
anticorrelated_start <- 154283637
anticorrelated_end <- 156985577

anticorrelated <- correlated_SNP %>%
  subset(seqnames == 4 & start >= anticorrelated_start & end <= anticorrelated_end)

# Inside shared introgression (39 Mb region, not just Inv4m proper)
inside_correlated <- correlated_SNP %>%
  subset(seqnames == 4 & start >= introgression_start & end <= introgression_end)

cat("Correlated SNPs by location:\n")

## Correlated SNPs by location:

cat("  Total significant:", length(correlated_SNP), "\n")

##   Total significant: 7683

cat("  Inside shared introgression (39 Mb):", length(inside_correlated),
    sprintf("(%.1f%%)", 100 * length(inside_correlated) / length(correlated_SNP)),
    "\n")

##   Inside shared introgression (39 Mb): 7322 (95.3%)

cat("  Outside shared introgression:", nrow(outside_correlated),
    sprintf("(%.1f%%)", 100 * nrow(outside_correlated) / length(correlated_SNP)),
    "\n")

##   Outside shared introgression: 361 (4.7%)

cat("  Anti-correlated upstream (2.7 Mb):", length(anticorrelated), "\n")

##   Anti-correlated upstream (2.7 Mb): 207

cat("  Arithmetic check: inside + outside =",
    length(inside_correlated) + nrow(outside_correlated), "\n")

##   Arithmetic check: inside + outside = 7683

# Run-length of correlated SNPs using proper genotype run-length encoding
# Same methodology as get_genotype_runs: rle on classification, span = distance to next run start
compute_sig_runlength <- function(all_markers_df, is_sig_vector) {
  total_mb <- 0
  for (chr_idx in 1:10) {
    chr_mask <- as.numeric(as.character(all_markers_df$CHR)) == chr_idx
    chr_data <- all_markers_df[chr_mask, ]
    chr_sig <- is_sig_vector[chr_mask]
    if (sum(chr_sig, na.rm = TRUE) == 0) next

    positions <- chr_data$BP
    runs <- rle(chr_sig)
    run_starts <- cumsum(c(1, runs$lengths[-length(runs$lengths)]))

    for (i in seq_along(runs$values)) {
      if (isTRUE(runs$values[i])) {
        start_pos <- positions[run_starts[i]]
        end_idx <- run_starts[i] + runs$lengths[i] - 1
        if (end_idx < length(positions)) {
          end_pos <- positions[end_idx + 1]
        } else {
          end_pos <- positions[end_idx]
        }
        total_mb <- total_mb + (end_pos - start_pos) / 1e6
      }
    }
  }
  total_mb
}

# Classify each marker
all_markers <- inv4m_cor %>% arrange(as.numeric(as.character(CHR)), BP)
is_inside_sig <- all_markers$is_significant &
  as.character(all_markers$CHR) == "4" &
  all_markers$BP >= introgression_start & all_markers$BP <= introgression_end
is_inside_sig[is.na(is_inside_sig)] <- FALSE

is_outside_sig <- all_markers$is_significant & !is_inside_sig
is_outside_sig[is.na(is_outside_sig)] <- FALSE

inside_runlength <- compute_sig_runlength(all_markers, is_inside_sig)
outside_runlength <- compute_sig_runlength(all_markers, is_outside_sig)

cat("\nGenotype run-length of correlated SNPs:\n")

## 
## Genotype run-length of correlated SNPs:

cat("  Inside shared introgression:", round(inside_runlength, 1), "Mb\n")

##   Inside shared introgression: 33.6 Mb

cat("  Outside dispersed:", round(outside_runlength, 1), "Mb\n")

##   Outside dispersed: 16.6 Mb

# Chromosome distribution of significant markers
cat("\nSignificant markers by chromosome:\n")

## 
## Significant markers by chromosome:

sig_by_chr <- with(inv4m_cor %>% filter(is_significant), table(CHR))
print(sig_by_chr)

## CHR
##    1    2    3    4    5    6    7    8    9   10 
##   28    9    9 7547   30    4    4   27    7   18

Gene Overlap Analysis

Purpose: Identify genes overlapping Inv4m-correlated regions to create filter list for downstream analyses.

Approach: Use GenomicRanges::findOverlaps to map significant SNPs to gene annotations. Classify genes by location: inside shared introgression, dispersed background, or anti-correlated upstream.

Expected outcome: Generate comprehensive list of genes to exclude from analyses aimed at identifying trans-regulatory effects of Inv4m.

# Load transcript annotations
transcripts_gff <- rtracklayer::import(myGFF) %>%
  subset(type == "mRNA" & seqnames %in% 1:10)

transcripts <- as.data.frame(transcripts_gff)
transcripts$gene_id <- gsub("transcript:", "", transcripts$ID)
transcripts$gene_id <- gsub("_.*", "", transcripts$gene_id, perl = TRUE)

# Convert SNP sets to GRanges for overlap
outside_correlated_gr <- with(outside_correlated, {
  GRanges(
    seqnames = CHR,
    ranges = IRanges(start = BP, end = BP),
    ID = ID
  )
})

# 1. Outside shared introgression (dispersed background)
olap_outside <- findOverlaps(outside_correlated_gr, transcripts_gff, 
                              ignore.strand = TRUE)
outside_correlated_genes <- transcripts$gene_id[subjectHits(olap_outside)] %>%
  sort() %>%
  unique()

cat("Outside shared introgression:\n")

## Outside shared introgression:

cat("  SNPs:", length(outside_correlated_gr), "\n")

##   SNPs: 361

cat("  Unique SNP positions:", length(unique(queryHits(olap_outside))), "\n")

##   Unique SNP positions: 269

cat("  Genes overlapped:", length(outside_correlated_genes), "\n")

##   Genes overlapped: 52

# 2. Anti-correlated upstream region (154.3-157.0 Mb)
olap_anti <- findOverlaps(anticorrelated, transcripts_gff, 
                           ignore.strand = TRUE)
anticorrelated_genes <- transcripts$gene_id[subjectHits(olap_anti)] %>%
  sort() %>%
  unique()

cat("\nAnti-correlated upstream (154.3-157.0 Mb):\n")

## 
## Anti-correlated upstream (154.3-157.0 Mb):

cat("  SNPs:", length(anticorrelated), "\n")

##   SNPs: 207

cat("  Unique SNP positions:", length(unique(queryHits(olap_anti))), "\n")

##   Unique SNP positions: 177

cat("  Genes overlapped:", length(anticorrelated_genes), "\n")

##   Genes overlapped: 14

# 3. Inside shared introgression (157.0-195.9 Mb, 35 Mb)
olap_inside <- findOverlaps(inside_correlated, transcripts_gff, 
                             ignore.strand = TRUE)
inside_correlated_genes <- transcripts$gene_id[subjectHits(olap_inside)] %>%
  sort() %>%
  unique()

cat("\nInside shared introgression (157.0-195.9 Mb):\n")

## 
## Inside shared introgression (157.0-195.9 Mb):

cat("  SNPs:", length(inside_correlated), "\n")

##   SNPs: 7322

cat("  Unique SNP positions:", length(unique(queryHits(olap_inside))), "\n")

##   Unique SNP positions: 6910

cat("  Gene models overlapped:", length(inside_correlated_genes), "\n")

##   Gene models overlapped: 512

# Filter to expressed genes only (genes present in expression matrix)
expr_matrix <- read.csv(file.path(paths$intermediate, "PSU2022_gene_expression_matrix.csv"),
                        row.names = 1, check.names = FALSE)
expressed_genes <- rownames(expr_matrix)

inside_expressed <- inside_correlated_genes[inside_correlated_genes %in% expressed_genes]
outside_expressed <- outside_correlated_genes[outside_correlated_genes %in% expressed_genes]
anti_expressed <- anticorrelated_genes[anticorrelated_genes %in% expressed_genes]

cat("\nExpressed gene counts (genes in RNA-seq expression matrix):\n")

## 
## Expressed gene counts (genes in RNA-seq expression matrix):

cat("  Inside shared introgression:", length(inside_expressed), "expressed genes\n")

##   Inside shared introgression: 512 expressed genes

cat("  Outside dispersed:", length(outside_expressed), "expressed genes\n")

##   Outside dispersed: 52 expressed genes

cat("  Anti-correlated upstream:", length(anti_expressed), "expressed genes\n")

##   Anti-correlated upstream: 14 expressed genes

# 4. Create filter list and classification table
# Only filter genes OUTSIDE the introgression that are correlated
# Note: anticorrelated genes are a SUBSET of outside_correlated genes
# (they're outside the introgression, specifically in the upstream region)

# Filter list is simply all outside genes (includes anticorrelated as subset)
outside_inv4m_genes <- outside_correlated_genes

cat("\nGenes to filter out (outside Inv4m but correlated):", 
    length(outside_inv4m_genes), "\n")

## 
## Genes to filter out (outside Inv4m but correlated): 52

cat("  Total outside (includes anti-correlated):", length(outside_correlated_genes), "\n")

##   Total outside (includes anti-correlated): 52

cat("  Anti-correlated subset:", length(anticorrelated_genes), 
    sprintf("(%.1f%% of outside)", 100 * length(anticorrelated_genes) / length(outside_correlated_genes)), 
    "\n")

##   Anti-correlated subset: 14 (26.9% of outside)

# Create classification table for all correlated genes
# Anti-correlated genes get their own label even though they're subset of outside
gene_classification <- data.frame(
  gene_id = c(inside_correlated_genes, 
              outside_correlated_genes),
  classification = c(
    rep("inside_introgression", length(inside_correlated_genes)),
    rep("outside_dispersed", length(outside_correlated_genes))
  )
) %>%
  distinct() %>%
  mutate(
    classification = ifelse(
      gene_id %in% anticorrelated_genes,
      "outside_anticorrelated",
      classification
    )
  )

cat("\nTotal genes correlated with Inv4m:", nrow(gene_classification), "\n")

## 
## Total genes correlated with Inv4m: 564

cat("  Inside introgression:", sum(gene_classification$classification == "inside_introgression"), "\n")

##   Inside introgression: 512

cat("  Outside dispersed:", sum(gene_classification$classification == "outside_dispersed"), "\n")

##   Outside dispersed: 38

cat("  Outside anti-correlated:", sum(gene_classification$classification == "outside_anticorrelated"), "\n")

##   Outside anti-correlated: 14

# Save filter list (only outside genes)
saveRDS(outside_inv4m_genes, file.path(paths$intermediate, "inv4m_linked_genes_to_filter.rds"))
write.table(
  data.frame(gene_id = outside_inv4m_genes),
  file = file.path(paths$intermediate, "inv4m_linked_genes_to_filter.txt"),
  row.names = FALSE,
  quote = FALSE
)

# Save complete classification table
write.table(
  gene_classification,
  file = file.path(paths$intermediate, "inv4m_correlated_genes_classification.txt"),
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)
saveRDS(gene_classification, file.path(paths$intermediate, "inv4m_correlated_genes_classification.rds"))

cat("\nFiles saved:\n")

## 
## Files saved:

cat("  Filter list (outside genes only):\n")

##   Filter list (outside genes only):

cat("    - inv4m_linked_genes_to_filter.rds\n")

##     - inv4m_linked_genes_to_filter.rds

cat("    - inv4m_linked_genes_to_filter.txt\n")

##     - inv4m_linked_genes_to_filter.txt

cat("  Classification table (all correlated genes):\n")

##   Classification table (all correlated genes):

cat("    - inv4m_correlated_genes_classification.txt\n")

##     - inv4m_correlated_genes_classification.txt

cat("    - inv4m_correlated_genes_classification.rds\n")

##     - inv4m_correlated_genes_classification.rds

Gene Overlap of Fixed Alternate Allele Markers

Purpose: Quantify how many fixed alternate allele markers (computed in the Fixed Alternate Allele Analysis section above) overlap expressed gene models. This is the gene-level count corresponding to the “fixed highland introgression” category in the manuscript.

# Build GRanges for fixed markers
fixed_markers <- data.frame(
  marker = colnames(ngt)[fixed_mi21],
  stringsAsFactors = FALSE
) %>%
  mutate(
    CHR = gsub("^S(\\d+)_.*", "\\1", marker),
    BP = as.numeric(gsub("^S\\d+_", "", marker))
  )

fixed_gr <- GRanges(
  seqnames = fixed_markers$CHR,
  ranges = IRanges(start = fixed_markers$BP, end = fixed_markers$BP)
)

# Overlap with gene models (transcripts_gff loaded in gene_overlap chunk)
olap_fixed <- findOverlaps(fixed_gr, transcripts_gff, ignore.strand = TRUE)
fixed_snps_in_genes <- length(unique(queryHits(olap_fixed)))
fixed_genes <- transcripts$gene_id[subjectHits(olap_fixed)] %>%
  sort() %>% unique()

cat("Fixed alternate allele markers overlapping gene models:\n")

## Fixed alternate allele markers overlapping gene models:

cat("  Total fixed markers:", sum(fixed_mi21), "\n")

##   Total fixed markers: 1394

cat("  Markers in genes:", fixed_snps_in_genes, "\n")

##   Markers in genes: 1088

cat("  Unique genes:", length(fixed_genes), "\n")

##   Unique genes: 162

# Breakdown by chromosome
fixed_gene_markers <- fixed_markers[queryHits(olap_fixed), ] %>% distinct()
fixed_gene_by_chr <- fixed_gene_markers %>%
  group_by(CHR) %>%
  summarise(n_snps_in_genes = n(), .groups = "drop")

cat("\nFixed markers in genes by chromosome:\n")

## 
## Fixed markers in genes by chromosome:

print(as.data.frame(fixed_gene_by_chr))

##    CHR n_snps_in_genes
## 1    1             603
## 2   10               5
## 3    2              27
## 4    3              16
## 5    4               9
## 6    5             389
## 7    6              15
## 8    7              10
## 9    8               7
## 10   9               7

Example: Applying Filter to DE Results

# Example showing how to filter DE results to exclude Inv4m-linked genes
# This removes genes outside Inv4m that show linkage (trans-effects/background)

# Load your DE results (example structure)
# de_results <- read.csv("differential_expression_results.csv")

# Filter out genes outside Inv4m that are correlated with it
de_filtered <- de_results %>%
  filter(!gene %in% outside_inv4m_genes)

# Compare before/after
cat("DE results before filtering:", nrow(de_results), "genes\n")
cat("DE results after filtering:", nrow(de_filtered), "genes\n")
cat("Genes removed:", nrow(de_results) - nrow(de_filtered), "\n")

# This filtered set excludes background linkage effects
# Genes inside Inv4m are retained (cis-regulatory effects of interest)

SNP Distribution Analysis

Purpose: Compare genome-wide vs. Inv4m-linked marker distributions.

Approach: Calculate chromosome-specific proportions for all SNPs and significant Inv4m-correlated SNPs.

inv4_distro <- data.frame(
  CHR = factor(1:10, levels = 1:10),
  all = with(inv4m_cor, table(CHR)) %>% as.numeric(),
  inv4m = with(inv4m_cor %>% filter(is_significant), table(CHR)) %>% 
    as.numeric()
) %>%
  mutate(
    all_pct = 100 * all / sum(all),
    inv4m_pct = 100 * inv4m / sum(inv4m)
  )

cat("SNP distribution:\n")

## SNP distribution:

cat("  Chr4 contains", inv4_distro$all[4], "SNPs (", 
    round(inv4_distro$all_pct[4], 1), "% of total)\n")

##   Chr4 contains 8892 SNPs ( 44.8 % of total)

cat("  Chr4 Inv4m-linked:", inv4_distro$inv4m[4], "SNPs (",
    round(inv4_distro$inv4m_pct[4], 1), "% of significant)\n")

##   Chr4 Inv4m-linked: 7547 SNPs ( 98.2 % of significant)

cat("  Other chromosomes average:", 
    round(mean(inv4_distro$all[-4]), 0), "SNPs per chromosome\n")

##   Other chromosomes average: 1219 SNPs per chromosome

p_distribution <- inv4_distro %>%
  tidyr::pivot_longer(
    cols = c("all_pct", "inv4m_pct"),
    names_to = "marker_type",
    values_to = "pct"
  ) %>%
  ggplot(aes(y = pct, x = CHR, group = marker_type, fill = marker_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    labels = c(
      sprintf("All SNPs (n=%s)", format(sum(inv4_distro$all), big.mark = ",")),
      sprintf("Inv4m-correlated (n=%s)", format(sum(inv4_distro$inv4m), big.mark = ","))
    ),
    values = c("gold", "purple4")
  ) +
  ylab("%") +
  ggpubr::theme_classic2(base_size = 20) +
  theme(
    legend.position = c(0.7, 0.8),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )

p_distribution

# Save Supplementary Figure S1 (SNP distribution by chromosome).
ggsave(
  file.path(paths$figures, "SNP_distribution.png"),
  p_distribution,
  width = 8, height = 5, dpi = 300
)
ggsave(
  file.path(paths$figures, "SNP_distribution.svg"),
  p_distribution,
  width = 8, height = 5
)

Manhattan Plots

Purpose: Visualize genome-wide LD patterns.

Approach: Plot -log10(FDR) and correlation coefficients across chromosomes.

# Calculate cumulative positions
manhattan <- inv4m_cor %>%
  group_by(CHR) %>%
  summarise(chr_len = max(BP)) %>%
  mutate(tot = cumsum(chr_len) - chr_len) %>%
  select(-chr_len) %>%
  left_join(inv4m_cor, ., by = "CHR") %>%
  arrange(CHR, BP) %>%
  mutate(BPcum = BP + tot)

# Chromosome centers for x-axis
axisdf <- manhattan %>%
  group_by(CHR) %>%
  summarize(center = (max(BPcum) + min(BPcum)) / 2)

# Handle zero FDR values
manhattan$FDR[manhattan$FDR == 0] <- 1e-8

p_fdr <- manhattan %>%
  ggplot(aes(x = BPcum, y = -log10(FDR))) +
  geom_point(aes(color = as.factor(CHR)), alpha = 0.8, size = 1.3) +
  scale_color_manual(values = rep(c("gold", "purple4"), 22)) +
  geom_hline(yintercept = -log10(significance_threshold), col = "red") +
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  coord_cartesian(ylim = c(0, 10)) +
  theme_bw(base_size = 20) +
  theme(
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

p_fdr

p_r <- manhattan %>%
  ggplot(aes(x = BPcum, y = r)) +
  geom_point(aes(color = as.factor(CHR)), size = 1.3) +
  geom_point(
    data = manhattan %>% filter(FDR >= significance_threshold),
    aes(color = as.factor(CHR)),
    fill = "white",
    shape = 21,
    size = 1.3
  ) +
  scale_color_manual(values = rep(c("gold", "purple4"), 22)) +
  xlab("Chromosome") +
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  coord_cartesian(ylim = c(-1.1, 1.1)) +
  theme_bw(base_size = 20) +
  theme(
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

p_r

Combined Figure

combined <- ggpubr::ggarrange(
  p_distribution,
  p_fdr,
  p_r,
  p_genotype,
  nrow = 4,
  align = "hv",
  heights = c(2, 2, 3, 3)
)

combined

Summary

# LD statistics
ld_summary <- inv4m_cor %>%
  group_by(CHR) %>%
  summarise(
    total = n(),
    significant = sum(is_significant, na.rm = TRUE),
    pct_sig = round(100 * significant / total, 1),
    mean_r = round(mean(r, na.rm = TRUE), 3),
    mean_R2 = round(mean(R2, na.rm = TRUE), 3)
  )

knitr::kable(ld_summary, caption = "LD summary by chromosome")

LD summary by chromosome
CHR	total	significant	pct_sig	mean_r	mean_R2
1	2173	28	1.3	0.038	0.104
2	1198	9	0.8	-0.001	0.092
3	1069	9	0.8	0.027	0.095
4	8892	7547	84.9	0.801	0.829
5	1595	30	1.9	0.051	0.112
6	1885	4	0.2	0.028	0.085
7	731	4	0.5	0.015	0.091
8	854	27	3.2	0.043	0.114
9	762	7	0.9	0.041	0.093
10	702	18	2.6	0.002	0.114

# Genotype composition
geno_summary <- by_length %>%
  group_by(group, Genotype) %>%
  summarise(
    mean_length = round(mean(length), 1),
    mean_pct = round(mean(pct), 1),
    .groups = "drop"
  )

knitr::kable(geno_summary, caption = "Mean genotype composition")

Mean genotype composition
group	Genotype	mean_length	mean_pct
CTRL	B73	1849.3	86.9
CTRL	Het	170.0	8.0
CTRL	Mi21	102.0	4.8
CTRL	missing	6.8	0.3
Inv4m	B73	1795.9	84.4
Inv4m	Het	194.6	9.1
Inv4m	Mi21	129.8	6.1
Inv4m	missing	7.8	0.4

Key Findings

Chromosome 4 enrichment: 98.2% of Inv4m-correlated SNPs map to Chr4
Perfect LD markers: 3758 SNPs show R² = 1 with tagging marker
Genes outside Inv4m to filter: 66 genes (dispersed + anti-correlated regions)
Introgression extent: Shared segment spans 38.9 Mb vs. 15.2 Mb for Inv4m core
Fixed alternate alleles: 1394 markers are homozygous for the alternate allele across all individuals. Only 0.8% are on chromosome 4. The majority map to chromosomes 1, 5, and 6, consistent with B73 seed stock divergence from the reference genome (Liang & Schnable 2016)

Session Info

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] cowplot_1.2.0        rtracklayer_1.70.1   GenomicRanges_1.62.1
##  [4] Seqinfo_1.0.0        IRanges_2.44.0       S4Vectors_0.48.0    
##  [7] BiocGenerics_0.56.0  generics_0.1.4       ggtext_0.1.2        
## [10] ggplot2_4.0.1        dplyr_1.1.4          vcfR_1.15.0         
## [13] here_1.0.2          
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-9                permute_0.9-8              
##   [3] rlang_1.1.6                 magrittr_2.0.4             
##   [5] otel_0.2.0                  matrixStats_1.5.0          
##   [7] compiler_4.5.2              mgcv_1.9-4                 
##   [9] systemfonts_1.3.1           vctrs_0.6.5                
##  [11] stringr_1.6.0               pkgconfig_2.0.3            
##  [13] crayon_1.5.3                memuse_4.2-3               
##  [15] fastmap_1.2.0               backports_1.5.0            
##  [17] XVector_0.50.0              labeling_0.4.3             
##  [19] utf8_1.2.6                  Rsamtools_2.26.0           
##  [21] rmarkdown_2.30              markdown_2.0               
##  [23] ggbeeswarm_0.7.3            ragg_1.5.0                 
##  [25] purrr_1.2.0                 xfun_0.55                  
##  [27] cachem_1.1.0                cigarillo_1.0.0            
##  [29] litedown_0.9                jsonlite_2.0.0             
##  [31] DelayedArray_0.36.0         BiocParallel_1.44.0        
##  [33] broom_1.0.11                parallel_4.5.2             
##  [35] cluster_2.1.8.1             R6_2.6.1                   
##  [37] bslib_0.9.0                 stringi_1.8.7              
##  [39] RColorBrewer_1.1-3          car_3.1-3                  
##  [41] jquerylib_0.1.4             Rcpp_1.1.0                 
##  [43] SummarizedExperiment_1.40.0 knitr_1.51                 
##  [45] Matrix_1.7-4                splines_4.5.2              
##  [47] tidyselect_1.2.1            dichromat_2.0-0.1          
##  [49] abind_1.4-8                 yaml_2.3.12                
##  [51] vegan_2.7-2                 codetools_0.2-20           
##  [53] curl_7.0.0                  lattice_0.22-7             
##  [55] tibble_3.3.0                Biobase_2.70.0             
##  [57] withr_3.0.2                 S7_0.2.1                   
##  [59] evaluate_1.0.5              pinfsc50_1.3.0             
##  [61] xml2_1.5.1                  Biostrings_2.78.0          
##  [63] pillar_1.11.1               ggpubr_0.6.2               
##  [65] MatrixGenerics_1.22.0       carData_3.0-5              
##  [67] rprojroot_2.1.1             RCurl_1.98-1.17            
##  [69] commonmark_2.0.0            scales_1.4.0               
##  [71] glue_1.8.0                  tools_4.5.2                
##  [73] BiocIO_1.20.0               ggsignif_0.6.4             
##  [75] GenomicAlignments_1.46.0    XML_3.99-0.20              
##  [77] grid_4.5.2                  tidyr_1.3.2                
##  [79] ape_5.8-1                   nlme_3.1-168               
##  [81] beeswarm_0.4.0              vipor_0.4.7                
##  [83] restfulr_0.0.16             Formula_1.2-5              
##  [85] cli_3.6.5                   textshaping_1.0.4          
##  [87] S4Arrays_1.10.1             viridisLite_0.4.2          
##  [89] svglite_2.2.2               gtable_0.3.6               
##  [91] rstatix_0.7.3               sass_0.4.10                
##  [93] digest_0.6.39               SparseArray_1.10.8         
##  [95] rjson_0.2.23                farver_2.1.2               
##  [97] htmltools_0.5.9             lifecycle_1.0.4            
##  [99] httr_1.4.7                  gridtext_0.1.5             
## [101] MASS_7.3-65

NIL Genotype Plotting and Genotype Correlation Analysis of Inv4m Introgression

Fausto Rodriguez Zapata

2026-05-07