Overview

Purpose: Plot genotype per NIL. Characterize linkage disequilibrium between the Inv4m chromosomal inversion and genome-wide markers in maize near-isogenic lines.

Approach: Calculate Spearman correlations between a tagging SNP within Inv4m (S4_181558683) and all genotyped markers across the genome. Identify regions with significant LD and visualize genotype structure.

Expected outcome: Identify the physical extent of linkage with Inv4m, detect recombination breakpoints, and characterize flanking introgression segments.

Load Libraries

library(vcfR)
library(dplyr)
library(ggplot2)
library(ggtext)
library(rtracklayer)
library(GenomicRanges)
library(cowplot)

Define Genomic Regions

Purpose: Establish coordinates for Inv4m inversion and shared introgression.

Approach: Extract gene boundaries from B73 v5.0 annotation to define inversion breakpoints identified in MCScan analysis.

# Load B73 annotation
myGFF <- file.path(paths$data, "Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3")
B73 <- rtracklayer::import(myGFF) %>%
  subset(type == "gene" & seqnames %in% 1:10)

genes <- as.data.frame(B73)
genes$ID <- gsub("gene:", "", genes$ID)

# Inv4m boundaries (from MCScan)
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]

# Shared introgression boundaries (empirical)
introgression_start <- 157012149
introgression_end <- 195900523

# Create GRanges objects
inv4m <- GRanges(
  seqnames = "4",
  ranges = IRanges(start = inv4m_start, end = inv4m_end),
  strand = "+"
)

shared_introgression <- GRanges(
  seqnames = "4",
  ranges = IRanges(start = introgression_start, end = introgression_end),
  strand = "+"
)

# Region sizes
inv4m_size <- round((inv4m_end - inv4m_start) / 1e6, 1)
introgression_size <- round((introgression_end - introgression_start) / 1e6, 1)

# Count genes in regions
olap_inv4m <- findOverlaps(inv4m, B73, ignore.strand = TRUE)
inv4m_gene_ids <- genes$ID[subjectHits(olap_inv4m)]

olap_introgression <- findOverlaps(shared_introgression, B73, ignore.strand = TRUE)
shared_introgression_gene_ids <- genes$ID[subjectHits(olap_introgression)]

flanking_introgression_gene_ids <- shared_introgression_gene_ids[
  !(shared_introgression_gene_ids %in% inv4m_gene_ids)
]

cat("Inv4m inversion:", inv4m_size, "Mb |", length(inv4m_gene_ids), "genes\n")
## Inv4m inversion: 15.2 Mb | 434 genes
cat("Shared introgression:", introgression_size, "Mb |", 
    length(shared_introgression_gene_ids), "genes\n")
## Shared introgression: 38.9 Mb | 1101 genes
cat("Flanking regions:", 
    round(introgression_size - inv4m_size, 1), "Mb |", 
    length(flanking_introgression_gene_ids), "genes\n")
## Flanking regions: 23.7 Mb | 667 genes

Import and Process VCF

Purpose: Load genotype data and convert to numeric matrix.

Approach: Read VCF, extract GT field, recode genotypes as 0/1/2 for B73/Het/Mi21.

# Read VCF
vcf <- read.vcfR(file.path(paths$data, "inv4m_PSU_imputed.vcf"))
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 10
##   header_line: 11
##   variant count: 19861
##   column count: 22
## Meta line 10 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 19861
##   Character matrix gt cols: 22
##   skip: 0
##   nrows: 19861
##   row_num: 0
## Processed variant 1000Processed variant 2000Processed variant 3000Processed variant 4000Processed variant 5000Processed variant 6000Processed variant 7000Processed variant 8000Processed variant 9000Processed variant 10000Processed variant 11000Processed variant 12000Processed variant 13000Processed variant 14000Processed variant 15000Processed variant 16000Processed variant 17000Processed variant 18000Processed variant 19000Processed variant: 19861
## All variants processed
gt <- extract.gt(vcf, element = "GT")

# Recode genotypes
gt[gt == "0/0"] <- 0
gt[gt == "0/1" | gt == "1/0"] <- 1
gt[gt == "1/1"] <- 2

# Convert to numeric matrix
ngt <- matrix(as.integer(gt), ncol = ncol(gt))
colnames(ngt) <- colnames(gt)
rownames(ngt) <- rownames(gt)

# Sample ordering (CTRL then INV4 carriers, alphabetically within group)
sample_order <- c(
  sort(c("3095", "3113", "4113", "3083", "4107", "4122")),
  sort(c("3092", "3080", "3059", "3131", "4125", "4080", "4131"))
)

ngt <- ngt[, sample_order] %>% t()

# Genotype distribution (by site)
total_sites <- ncol(ngt) * nrow(ngt)
geno_freq <- table(ngt) / total_sites
cat("Genotype frequencies (by site):\n")
## Genotype frequencies (by site):
cat("  B73/B73 (0):", round(geno_freq["0"], 3), "\n")
##   B73/B73 (0): 0.636
cat("  B73/Mi21 (1):", round(geno_freq["1"], 3), "\n")
##   B73/Mi21 (1): 0.09
cat("  Mi21/Mi21 (2):", round(geno_freq["2"], 3), "\n")
##   Mi21/Mi21 (2): 0.273
cat("  Missing:", round(sum(is.na(ngt)) / total_sites, 4), "\n")
##   Missing: 0.0016
cat("Dimensions:", nrow(ngt), "samples ×", ncol(ngt), "markers\n")
## Dimensions: 13 samples × 19861 markers

Genotype Run Analysis

Purpose: Quantify introgression content across samples.

Approach: Calculate run-length encoding of genotypes, sum by allelic state.

get_genotype_runs <- function(x, sample_order) {
  result <- data.frame()
  chr_levels <- paste0("S", 1:10)
  
  for (ind_idx in sample_order) {
    result_ind <- data.frame()
    
    for (chr_idx in 1:10) {
      chr_prefix <- paste0("S", chr_idx, "_")
      chr_name <- paste0("S", chr_idx)
      chr <- x[grepl(chr_prefix, rownames(x), perl = TRUE), ]
      runs <- rle(chr[, ind_idx])
      
      chr_long <- data.frame(
        ind = ind_idx,
        chr = chr_name,
        pos = as.integer(gsub(chr_prefix, "", names(runs$values))),
        gt = runs$values
      ) %>%
        group_by(ind, chr) %>%
        mutate(
          lag = lag(pos, default = 0),
          span = (pos - lag) / 1e6,
          chr = factor(chr, levels = chr_levels),
          ind = factor(ind, levels = sample_order),
          missing = is.na(gt),
          missing_length = missing * span,
          B73 = as.numeric(gt == 0),
          B73_length = B73 * span,
          Het = as.numeric(gt == 1),
          Het_length = Het * span,
          Mi21 = as.numeric(gt == 2),
          Mi21_length = Mi21 * span
        )
      
      result_ind <- rbind(result_ind, chr_long)
    }
    
    result <- rbind(result, result_ind)
  }
  result
}

# Calculate runs
m <- t(ngt)
gt_runs <- get_genotype_runs(m, sample_order)

# Add grouping
gt_runs$group <- case_when(
  gt_runs$ind %in% c("3092", "3080", "3059", "3131", "4125", "4080", "4131") ~ 
    "Inv4m",
  .default = "CTRL"
)

# Genome size
chr_length <- gt_runs %>%
  group_by(chr) %>%
  summarise(chr_length = max(pos) / 1e6)
genome_size <- sum(chr_length$chr_length)

# Summarize by genotype
by_length <- gt_runs %>%
  inner_join(chr_length, by = "chr") %>%
  ungroup() %>%
  group_by(ind) %>%
  summarise(
    B73 = sum(B73_length, na.rm = TRUE),
    Het = sum(Het_length, na.rm = TRUE),
    Mi21 = sum(Mi21_length, na.rm = TRUE),
    missing = sum(missing_length, na.rm = TRUE),
    group = group[1]
  ) %>%
  mutate(group = factor(group, levels = c("CTRL", "Inv4m"))) %>%
  tidyr::pivot_longer(
    cols = c("B73", "Het", "Mi21", "missing"),
    names_to = "Genotype",
    values_to = "length"
  ) %>%
  mutate(pct = 100 * length / genome_size)

cat("Genome size:", round(genome_size, 1), "Mb\n")
## Genome size: 2128.1 Mb
cat("\nGenotype run length summary (Mb):\n")
## 
## Genotype run length summary (Mb):
by_length %>%
  group_by(Genotype) %>%
  summarise(
    mean_Mb = round(mean(length), 1),
    mean_pct = round(mean(pct), 2)
  ) %>%
  print()
## # A tibble: 4 × 3
##   Genotype mean_Mb mean_pct
##   <chr>      <dbl>    <dbl>
## 1 B73       1821.     85.6 
## 2 Het        183.      8.61
## 3 Mi21       117.      5.5 
## 4 missing      7.3     0.34
cat("\nBy introgression group:\n")
## 
## By introgression group:
by_length %>%
  group_by(group, Genotype) %>%
  summarise(
    mean_Mb = round(mean(length), 1),
    mean_pct = round(mean(pct), 2),
    .groups = "drop"
  ) %>%
  print()
## # A tibble: 8 × 4
##   group Genotype mean_Mb mean_pct
##   <fct> <chr>      <dbl>    <dbl>
## 1 CTRL  B73       1849.     86.9 
## 2 CTRL  Het        170       7.99
## 3 CTRL  Mi21       102       4.79
## 4 CTRL  missing      6.8     0.32
## 5 Inv4m B73       1796.     84.4 
## 6 Inv4m Het        195.      9.15
## 7 Inv4m Mi21       130.      6.1 
## 8 Inv4m missing      7.8     0.37

Genotype Visualization

Purpose: Create chromosome-wide haplotype plots.

Approach: Plot genotype runs as colored bars across chromosomes.

# Genotype color palette (3 colors at values 0, 1, 2)
gt_colors <- c("gold", "forestgreen", "purple4")
gt_labels <- c("B73/B73", "B73/Mi21", "Mi21/Mi21")

# Selection marker triangle
triangle <- data.frame(
  pos = 0,
  span = 181.6,
  ind = sample_order[length(sample_order)],
  chr = "4",
  label = factor("Selection marker<br>PZE4175660223")
)

# Coordinate limits
inversion_limit <- round(c(inv4m_start, inv4m_end) / 1e6, 2)
introgression_limit <- round(c(introgression_start, introgression_end) / 1e6, 2)

gt_plot <- gt_runs %>%
  ungroup() %>%
  mutate(chr = gsub("S", "", chr)) %>%
  ggplot(aes(y = ind, x = span, fill = gt)) +
  geom_col(width = 1) +
  scale_fill_gradientn(
    colours = gt_colors,
    values = scales::rescale(c(0, 1, 2)),
    limits = c(0, 2),
    breaks = c(0, 1, 2),
    labels = gt_labels,
    na.value = "gray85",
    guide = guide_legend(
      title = "Genotype",
      order = 1,
      reverse = TRUE,
      label.position = "left",
      label.hjust = 1,
      theme = theme(
        legend.title = element_text(size = 28, hjust = 0,
                                    margin = margin(t = 0, b = 6, unit = "pt")),
        legend.text = element_text(size = 25, vjust = 0.5),
        legend.key.width = unit(2, "lines"),
        legend.key.height = unit(2, "lines")
      )
    )
  ) +
  geom_point(
    data = triangle,
    mapping = aes(shape = label),
    position = position_nudge(y = 2),
    size = 3,
    color = "purple4",
    fill = "purple4"
  ) +
  scale_shape_manual(values = 25) +
  facet_wrap(factor(chr, 1:10) ~ ., strip.position = "left", ncol = 1) +
  coord_cartesian(xlim = c(0, 450), expand = FALSE) +
  xlab("Chromosome 4 Position [Mb]") +
  ylab("Chromosome") +
  guides(
    shape = guide_legend(
      order = 2,
      override.aes = list(size = 8),
      label.position = "left",
      label.hjust = 1,
      theme = theme(
        legend.title = element_blank(),
        legend.text = element_markdown(size = 25),
        legend.key.width = unit(2, "lines"),
        legend.background = element_rect(fill = "transparent", color = NA),
        legend.key = element_rect(fill = "transparent", color = NA)
      )
    )
  ) +
  ggpubr::theme_classic2(base_size = 30) +
  theme(
    legend.position = c(0.9 - 3.8 / (18 * 25.4), 0.83),
    legend.spacing.y = unit(-1.5, "lines"),
    legend.box.just = "right",
    strip.background = element_blank(),
    strip.text.y.left = element_text(angle = 0),
    axis.title.y = element_text(margin = margin(l = 2.5, unit = "mm")),
    panel.spacing.y = unit(0.5, "lines"),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    plot.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")
  )

gt_plot

# Random spike IDs for SVG coordinate calibration (unique, won't collide)
set.seed(42)
spike_id_a <- paste0("XK", paste0(sample(c(0:9, letters[1:6]), 12, replace = TRUE), collapse = ""))
spike_id_b <- paste0("XK", paste0(sample(c(0:9, letters[1:6]), 12, replace = TRUE), collapse = ""))

# Chr4 detailed inset
inlay <- gt_runs %>%
  filter(chr == "S4") %>%
  ungroup() %>%
  mutate(chr = gsub("S", "", chr)) %>%
  ggplot(aes(x = span, y = ind, fill = gt)) +
  geom_col(width = 1) +
  scale_fill_gradientn(
    colours = gt_colors,
    values = scales::rescale(c(0, 1, 2)),
    limits = c(0, 2),
    na.value = "gray85"
  ) +
  annotate("point",
    x = 181.6, y = 14.4,
    shape = 25, size = 6, color = "purple4", fill = "purple4"
  ) +
  # Invisible diagonal spikes for SVG coordinate calibration
  annotate("text", x = 160, y = 2,  label = spike_id_a, alpha = 0, size = 1) +
  annotate("text", x = 190, y = 12, label = spike_id_b, alpha = 0, size = 1) +
  coord_cartesian(xlim = c(150, 200)) +
  xlab(NULL) +
  ylab("NIL") +
  ggpubr::theme_classic2(base_size = 20) +
  theme(
    legend.position = "none",
    strip.background = element_blank(),
    strip.text = element_blank(),
    plot.title = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    plot.background = element_rect(fill = "transparent", color = NA),
    plot.margin = margin(t = 0, r = 0, b = 26, l = 0, unit = "mm")
  )

# Combine plots with annotation brackets
# First build base composite, then save SVG skeleton for coordinate extraction,
# then add annotations.

plot_base <- ggdraw() +
  draw_plot(gt_plot) +
  draw_plot(inlay,
    x = 0.45, y = 0.03 - 19 / (8 * 25.4),
    width = 0.6, height = 0.6,
    scale = 0.8
  )

# Save skeleton SVG for coordinate calibration
ggsave(file.path(paths$figures, "Fig1_panel_EF_skeleton.svg"),
       plot = plot_base, width = 18, height = 8, device = "svg",
       fix_text_size = TRUE)

plot_base

Extract annotation bracket positions from the skeleton SVG spike markers.

# --- Parse spike markers from skeleton SVG ---
# spike_id_a is at data (160 Mb, y=2), spike_id_b at (190 Mb, y=12)
svg_path <- file.path(paths$figures, "Fig1_panel_EF_skeleton.svg")
svg_lines <- readLines(svg_path)

extract_spike <- function(lines, label) {
  idx <- grep(label, lines, fixed = TRUE)
  stopifnot(length(idx) == 1)
  line <- lines[idx]
  sx <- as.numeric(sub(".*x='([^']+)'.*", "\\1", line))
  sy <- as.numeric(sub(".*y='([^']+)'.*", "\\1", line))
  c(x = sx, y = sy)
}

spike_a <- extract_spike(svg_lines, spike_id_a)  # data: (160, 2)
spike_b <- extract_spike(svg_lines, spike_id_b)  # data: (190, 12)

svg_header <- svg_lines[grep("viewBox", svg_lines)[1]]
svg_W <- as.numeric(sub(".*viewBox='0 0 ([0-9.]+) .*", "\\1", svg_header))
svg_H <- as.numeric(sub(".*viewBox='0 0 [0-9.]+ ([0-9.]+)'.*", "\\1", svg_header))

# Linear mapping: data Mb → NPC x  (via SVG pixels)
# SVG x = a_x * Mb + b_x, then NPC = SVG_x / svg_W
a_x <- (spike_b["x"] - spike_a["x"]) / (190 - 160)
b_x <- spike_a["x"] - a_x * 160

mb_to_npc <- function(mb) {
  svg_x <- unname(a_x * mb + b_x)
  svg_x / svg_W
}

# Linear mapping: data y-index → NPC y  (SVG y is top-down, NPC is bottom-up)
# SVG y = a_y * data_y + b_y, then NPC = 1 - SVG_y / svg_H
a_y <- (spike_b["y"] - spike_a["y"]) / (12 - 2)
b_y <- spike_a["y"] - a_y * 2

data_y_to_npc <- function(dy) {
  svg_y <- unname(a_y * dy + b_y)
  1 - svg_y / svg_H
}

cat(sprintf("Spike A SVG: (%.2f, %.2f)\nSpike B SVG: (%.2f, %.2f)\n",
            spike_a["x"], spike_a["y"], spike_b["x"], spike_b["y"]))
## Spike A SVG: (851.23, 460.93)
## Spike B SVG: (1156.04, 333.21)
cat(sprintf("SVG canvas: %.0f x %.0f\n", svg_W, svg_H))
## SVG canvas: 1296 x 576
# --- Compute NPC positions for annotations ---
col_inv4m <- "purple4"

introg_left  <- mb_to_npc(introgression_start / 1e6)
introg_right <- mb_to_npc(introgression_end / 1e6)
inv4m_left   <- mb_to_npc(inv4m_start / 1e6)
inv4m_right  <- mb_to_npc(inv4m_end / 1e6)

introg_size_mb <- round((introgression_end - introgression_start) / 1e6)
inv4m_size_mb  <- round((inv4m_end - inv4m_start) / 1e6)

# y: bracket above inlay top, rectangle spans inlay data area
# Inlay y-data ranges ~0.5 (bottom bar) to ~14 (top bar)
y_inv4m_top <- data_y_to_npc(14)    # top of heatmap
y_inv4m_bot <- data_y_to_npc(0.5)   # bottom of heatmap
y_introg    <- y_inv4m_top + 0.025   # bracket just above

cat(sprintf("introg_left=%.4f  introg_right=%.4f\n", introg_left, introg_right))
## introg_left=0.6334  introg_right=0.9383
cat(sprintf("inv4m_left=%.4f   inv4m_right=%.4f\n", inv4m_left, inv4m_right))
## inv4m_left=0.7578   inv4m_right=0.8774
cat(sprintf("y_inv4m_top=%.4f  y_inv4m_bot=%.4f  y_introg=%.4f\n",
            y_inv4m_top, y_inv4m_bot, y_introg))
## y_inv4m_top=0.4659  y_inv4m_bot=0.1665  y_introg=0.4909
# Layout (bottom to top):
#   inlay heatmap top edge = y_inv4m_top
#   Inv4m bracket just above heatmap
#   Inv4m label above that
#   Shared introgression bracket above Inv4m label
#   Shared introgression label at top
# Position bracket stack above inlay heatmap top
line_h           <- 1.2 * 25 / svg_H   # 1.2 lines in NPC
triangle_h       <- 3 * 72 / 25.4 / svg_H  # NPC height of size=3 triangle glyph
inv4m_tick_h     <- 0.04 - triangle_h   # shorten arms by triangle height
shift_down       <- 1.2 * (0.04)        # close gap to heatmap (original arm length)
y_inv4m_arms_bot <- y_inv4m_top + line_h - shift_down

mm_npc           <- function(mm) mm * 72 / 25.4 / svg_H

y_inv4m_bracket  <- y_inv4m_arms_bot + inv4m_tick_h
y_inv4m_label    <- y_inv4m_bracket + mm_npc(7.7)  # +7.7mm above bracket bar

# Introgression bracket: raise bar so label-to-bar gap = 6.7mm (matching Inv4m)
# Label stays at bar + 20.2mm, so bar must be 20.2 - 6.7 = 13.5mm above Inv4m label
y_introg_bracket <- y_inv4m_label + mm_npc(14.0)
introg_tick_h    <- y_introg_bracket - y_inv4m_arms_bot  # arms reach inv4m level

# Helper: make a ⊓-shaped bracket grob (left tick ↓, horizontal, right tick ↓)
make_bracket <- function(xl, xr, y_top, tick, col, lwd) {
  draw_grob(grid::polylineGrob(
    x = unit(c(xl, xl, xr, xr), "npc"),
    y = unit(c(y_top - tick, y_top, y_top, y_top - tick), "npc"),
    gp = grid::gpar(col = col, lwd = lwd, lineend = "square", linejoin = "mitre")
  ))
}

plot_with_inset <- plot_base +
  # --- Inv4m bracket (arms down, above triangle marker) ---
  make_bracket(inv4m_left, inv4m_right, y_inv4m_bracket, inv4m_tick_h,
               col_inv4m, 6) +
  draw_label(
    expression(italic("Inv4m") ~ "15 Mb"),
    x = (inv4m_left + inv4m_right) / 2, y = y_inv4m_label,
    size = 30, fontface = "plain", color = col_inv4m, hjust = 0.5) +
  # --- Shared introgression bracket (arms reach inv4m level) ---
  make_bracket(introg_left, introg_right, y_introg_bracket, introg_tick_h,
               col_inv4m, 6) +
  draw_label(
    paste0("Shared introgression ", introg_size_mb, " Mb"),
    x = (introg_left + introg_right) / 2, y = y_introg_bracket + mm_npc(7.7),
    size = 30, fontface = "plain", color = col_inv4m, hjust = 0.5) +
  # Panel labels
  draw_label("E", x = 0.005, y = 0.995, size = 45, fontface = "bold",
             hjust = 0, vjust = 1) +
  draw_label("F", x = 0.505, y = 0.52, size = 45, fontface = "bold",
             hjust = 0, vjust = 1)

plot_with_inset

saveRDS(plot_with_inset, file.path(paths$intermediate, "fig1_panel_E.rds"))
saveRDS(gt_plot, file.path(paths$intermediate, "fig1_panel_E_main.rds"))
saveRDS(inlay, file.path(paths$intermediate, "fig1_panel_E_inlay.rds"))

ggsave(file.path(paths$figures, "Fig1_panel_EF.png"),
       plot = plot_with_inset, width = 18, height = 8, dpi = 300, bg = "white")
ggsave(file.path(paths$figures, "Fig1_panel_EF.svg"),
       plot = plot_with_inset, width = 18, height = 8, device = "svg",
       fix_text_size = TRUE)

Panel F: Selected NIL Haplotypes

Purpose: Create Panel F for Figure 3 showing haplotype map for 3 representative NIL lines.

# Selected samples for Panel F (representative NILs)
selected3 <- c("3131", "4125", "4131")

# Create Panel F plot
gt3_plot <- gt_runs %>%
  filter(chr == "S4", ind %in% selected3) %>%
  droplevels() %>%
  ungroup() %>%
  mutate(
    ind = factor(ind, levels = selected3),
    chr = gsub("S", "", chr)
  ) %>%
  ggplot(aes(x = span, y = ind, fill = gt)) +
  ylab("Line") +
  geom_col(width = 1) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_gradientn(
    colours = gt_colors,
    values = scales::rescale(c(0, 1, 2)),
    limits = c(0, 2),
    breaks = c(0, 1, 2),
    labels = gt_labels,
    na.value = "gray85"
  ) +
  guides(fill = guide_legend(title = "Genotype", ncol = 3)) +
  coord_cartesian(xlim = c(150, 200)) +
  annotate("point",
    x = 181.6, y = 4,
    shape = 25, size = 7, color = "purple4", fill = "purple4"
  ) +
  annotate("text",
    x = c(inversion_limit, introgression_limit), y = 4,
    label = rep("|", 4), size = 7
  ) +
  annotate("text",
    x = c(inversion_limit, introgression_limit), y = 0,
    label = rep("|", 4), size = 7
  ) +
  ggpubr::theme_classic2(base_size = 25) +
  theme(
    legend.position = "top",
    legend.byrow = TRUE,
    legend.text = element_text(size = 20),
    legend.box.spacing = unit(0, "pt"),
    plot.margin = unit(c(0, 0, 0, 0), "cm"),
    strip.background = element_blank(),
    strip.text = element_blank(),
    plot.title = element_text(color = "white"),
    axis.title.y = element_text(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    plot.background = element_rect(fill = "transparent", color = NA)
  )

# Save as Panel F for Figure 3
ggsave(
  file.path(paths$figures, "panels", "panel_F_haplotype.png"),
  plot = gt3_plot,
  width = 8, height = 3, dpi = 300, bg = "white"
)
saveRDS(gt3_plot, file.path(paths$intermediate, "panel_F_haplotype.rds"))

gt3_plot

# Background means
bg_mean <- by_length %>%
  ungroup() %>%
  group_by(Genotype) %>%
  summarise_at(.vars = c("length", "pct"), .funs = mean)

p_genotype <- by_length %>%
  ggplot(aes(x = group, y = pct)) +
  geom_hline(data = bg_mean, aes(yintercept = pct), linetype = 2) +
  geom_boxplot(outlier.colour = "white", width = 0.25) +
  ggbeeswarm::geom_quasirandom(size = 2) +
  facet_wrap(. ~ Genotype, scales = "free_y", ncol = 4) +
  xlab("Introgression group") +
  ylab("% of genome length") +
  ggpubr::theme_classic2(base_size = 20) +
  theme(strip.background = element_blank())

p_genotype

Fixed Alternate Allele Analysis

Purpose: Quantify markers where the Mi21 (alternate) allele is fixed across all individuals, regardless of Inv4m genotype group. These markers cannot represent Inv4m-linked introgression because they are shared between CTRL and Inv4m carriers.

Context: Liang & Schnable (2016) showed that 2.3% of the B73 gene space carries divergent haplotypes across seed stocks, concentrated on chromosomes 2, 4, 5, and 6. Fixed alternate alleles in our NILs may reflect B73 seed stock divergence from the v5 reference rather than true Mi21 introgression.

# Markers where all 13 individuals are homozygous for the alternate (Mi21) allele
# ngt is samples (rows) x markers (cols), coded 0/1/2
fixed_mi21 <- colSums(ngt, na.rm = FALSE) == (2 * nrow(ngt))
fixed_mi21[is.na(fixed_mi21)] <- FALSE

cat("Fixed alternate allele markers:", sum(fixed_mi21),
    "out of", ncol(ngt), "total\n\n")
## Fixed alternate allele markers: 1394 out of 19861 total
# Compute run-length span per chromosome
# A "consensus fixed" vector: 2 where all individuals are Mi21, 0 elsewhere
consensus <- ifelse(fixed_mi21, 2, 0)
names(consensus) <- colnames(ngt)

fixed_runlength <- data.frame()
for (chr_idx in 1:10) {
  chr_prefix <- paste0("S", chr_idx, "_")
  chr_mask <- grepl(chr_prefix, names(consensus))
  chr_vals <- consensus[chr_mask]
  positions <- as.numeric(gsub(chr_prefix, "", names(chr_vals)))

  runs <- rle(chr_vals)
  run_starts <- cumsum(c(1, runs$lengths[-length(runs$lengths)]))

  chr_mb <- 0
  chr_markers <- 0
  for (i in seq_along(runs$values)) {
    if (runs$values[i] == 2) {
      start_pos <- positions[run_starts[i]]
      end_idx <- run_starts[i] + runs$lengths[i] - 1
      if (end_idx < length(positions)) {
        end_pos <- positions[end_idx + 1]
      } else {
        end_pos <- positions[end_idx]
      }
      chr_mb <- chr_mb + (end_pos - start_pos) / 1e6
      chr_markers <- chr_markers + runs$lengths[i]
    }
  }
  fixed_runlength <- rbind(fixed_runlength, data.frame(
    CHR = chr_idx, n_snps = chr_markers, runlength_Mb = round(chr_mb, 1)
  ))
}

fixed_runlength <- fixed_runlength %>%
  mutate(CHR = factor(CHR, levels = 1:10)) %>%
  arrange(desc(n_snps))

knitr::kable(fixed_runlength,
             caption = "Fixed alternate allele markers and genotype run-length by chromosome")
Fixed alternate allele markers and genotype run-length by chromosome
CHR n_snps runlength_Mb
S1_192352 1 650 18.5
S5_3621605 5 417 7.2
S6_16767043 6 193 3.0
S2_5281945 2 47 8.3
S3_4704898 3 29 5.9
S7_17158381 7 14 2.8
S9_5227052 9 12 1.0
S4_38429716 4 11 0.9
S10_34702956 10 11 0.6
S8_9251830 8 10 2.9
cat("\nTotal:", sum(fixed_runlength$n_snps), "markers,",
    sum(fixed_runlength$runlength_Mb), "Mb\n")
## 
## Total: 1394 markers, 51.1 Mb
# How many are on chromosome 4?
n_fixed_chr4 <- fixed_runlength$n_snps[fixed_runlength$CHR == "4"]
pct_fixed_chr4 <- round(100 * n_fixed_chr4 / sum(fixed_mi21), 1)
cat("\nChromosome 4:", n_fixed_chr4, "markers (", pct_fixed_chr4, "% )\n")
## 
## Chromosome 4: 11 markers ( 0.8 % )
cat("Outside chromosome 4:", sum(fixed_mi21) - n_fixed_chr4, "markers (",
    round(100 - pct_fixed_chr4, 1), "% )\n")
## Outside chromosome 4: 1383 markers ( 99.2 % )

Calculate Linkage Disequilibrium

Purpose: Quantify correlation between tagging SNP and all genome-wide markers.

Approach: Pearson correlation with S4_181558683 (within Inv4m). Pearson r² is the standard LD statistic for biallelic markers (Ardlie et al. 2002). FDR correction for multiple testing.

# Calculate correlations
tagging_snp <- "S4_181558683"
cor_matrix <- cor(
  ngt[, tagging_snp, drop = FALSE],
  ngt,
  use = "pairwise.complete.obs"
) %>% t()

# Statistical testing
cor_pvalues <- apply(ngt, 2, function(x) {
  cor.test(x, y = ngt[, tagging_snp], method = "pearson")$p.value
})

# Combine results
significance_threshold <- 0.005
inv4m_cor <- data.frame(
  marker = rownames(cor_matrix),
  r = cor_matrix[, 1]
) %>%
  tidyr::separate(marker, into = c("CHR", "BP"), sep = "_") %>%
  mutate(
    BP = as.numeric(BP),
    CHR = gsub("S", "", CHR) %>% factor(levels = as.character(1:10)),
    p = cor_pvalues,
    FDR = p.adjust(p, method = "fdr"),
    R2 = r^2,
    is_significant = FDR < significance_threshold
  ) %>%
  tibble::rownames_to_column("ID")

cat("Total markers:", nrow(inv4m_cor), "\n")
## Total markers: 19861
cat("Significant (FDR <", significance_threshold, "):", 
    sum(inv4m_cor$is_significant, na.rm = TRUE), 
    sprintf("(%.1f%%)", 100 * mean(inv4m_cor$is_significant, na.rm = TRUE)), "\n")
## Significant (FDR < 0.005 ): 7683 (44.8%)
cat("Perfect LD (R² = 1):", sum(inv4m_cor$R2 == 1, na.rm = TRUE), "\n")
## Perfect LD (R² = 1): 3758
hist(inv4m_cor$R2, 
     main = "R² distribution with Inv4m tagging marker\n19,861 markers total",
     xlab = "R²", 
     ylim = c(0, 8000),
     col = "skyblue",
     breaks = 30)

Identify Correlated Regions

Purpose: Map SNPs with significant LD to genomic features.

Approach: Create GRanges from significant SNPs, classify by position relative to introgression boundaries.

# All significant SNPs
correlated_SNP <- with(inv4m_cor %>% filter(is_significant), {
  GRanges(
    seqnames = CHR,
    ranges = IRanges(start = BP, end = BP),
    ID = ID
  )
})

# Outside shared introgression
outside_correlated <- inv4m_cor %>%
  filter(is_significant) %>%
  filter(!(CHR == 4 & BP >= introgression_start & BP <= introgression_end))

# Anti-correlated region (upstream of introgression)
anticorrelated_start <- 154283637
anticorrelated_end <- 156985577

anticorrelated <- correlated_SNP %>%
  subset(seqnames == 4 & start >= anticorrelated_start & end <= anticorrelated_end)

# Inside shared introgression (39 Mb region, not just Inv4m proper)
inside_correlated <- correlated_SNP %>%
  subset(seqnames == 4 & start >= introgression_start & end <= introgression_end)

cat("Correlated SNPs by location:\n")
## Correlated SNPs by location:
cat("  Total significant:", length(correlated_SNP), "\n")
##   Total significant: 7683
cat("  Inside shared introgression (39 Mb):", length(inside_correlated),
    sprintf("(%.1f%%)", 100 * length(inside_correlated) / length(correlated_SNP)),
    "\n")
##   Inside shared introgression (39 Mb): 7322 (95.3%)
cat("  Outside shared introgression:", nrow(outside_correlated),
    sprintf("(%.1f%%)", 100 * nrow(outside_correlated) / length(correlated_SNP)),
    "\n")
##   Outside shared introgression: 361 (4.7%)
cat("  Anti-correlated upstream (2.7 Mb):", length(anticorrelated), "\n")
##   Anti-correlated upstream (2.7 Mb): 207
cat("  Arithmetic check: inside + outside =",
    length(inside_correlated) + nrow(outside_correlated), "\n")
##   Arithmetic check: inside + outside = 7683
# Run-length of correlated SNPs using proper genotype run-length encoding
# Same methodology as get_genotype_runs: rle on classification, span = distance to next run start
compute_sig_runlength <- function(all_markers_df, is_sig_vector) {
  total_mb <- 0
  for (chr_idx in 1:10) {
    chr_mask <- as.numeric(as.character(all_markers_df$CHR)) == chr_idx
    chr_data <- all_markers_df[chr_mask, ]
    chr_sig <- is_sig_vector[chr_mask]
    if (sum(chr_sig, na.rm = TRUE) == 0) next

    positions <- chr_data$BP
    runs <- rle(chr_sig)
    run_starts <- cumsum(c(1, runs$lengths[-length(runs$lengths)]))

    for (i in seq_along(runs$values)) {
      if (isTRUE(runs$values[i])) {
        start_pos <- positions[run_starts[i]]
        end_idx <- run_starts[i] + runs$lengths[i] - 1
        if (end_idx < length(positions)) {
          end_pos <- positions[end_idx + 1]
        } else {
          end_pos <- positions[end_idx]
        }
        total_mb <- total_mb + (end_pos - start_pos) / 1e6
      }
    }
  }
  total_mb
}

# Classify each marker
all_markers <- inv4m_cor %>% arrange(as.numeric(as.character(CHR)), BP)
is_inside_sig <- all_markers$is_significant &
  as.character(all_markers$CHR) == "4" &
  all_markers$BP >= introgression_start & all_markers$BP <= introgression_end
is_inside_sig[is.na(is_inside_sig)] <- FALSE

is_outside_sig <- all_markers$is_significant & !is_inside_sig
is_outside_sig[is.na(is_outside_sig)] <- FALSE

inside_runlength <- compute_sig_runlength(all_markers, is_inside_sig)
outside_runlength <- compute_sig_runlength(all_markers, is_outside_sig)

cat("\nGenotype run-length of correlated SNPs:\n")
## 
## Genotype run-length of correlated SNPs:
cat("  Inside shared introgression:", round(inside_runlength, 1), "Mb\n")
##   Inside shared introgression: 33.6 Mb
cat("  Outside dispersed:", round(outside_runlength, 1), "Mb\n")
##   Outside dispersed: 16.6 Mb
# Chromosome distribution of significant markers
cat("\nSignificant markers by chromosome:\n")
## 
## Significant markers by chromosome:
sig_by_chr <- with(inv4m_cor %>% filter(is_significant), table(CHR))
print(sig_by_chr)
## CHR
##    1    2    3    4    5    6    7    8    9   10 
##   28    9    9 7547   30    4    4   27    7   18

Gene Overlap Analysis

Purpose: Identify genes overlapping Inv4m-correlated regions to create filter list for downstream analyses.

Approach: Use GenomicRanges::findOverlaps to map significant SNPs to gene annotations. Classify genes by location: inside shared introgression, dispersed background, or anti-correlated upstream.

Expected outcome: Generate comprehensive list of genes to exclude from analyses aimed at identifying trans-regulatory effects of Inv4m.

# Load transcript annotations
transcripts_gff <- rtracklayer::import(myGFF) %>%
  subset(type == "mRNA" & seqnames %in% 1:10)

transcripts <- as.data.frame(transcripts_gff)
transcripts$gene_id <- gsub("transcript:", "", transcripts$ID)
transcripts$gene_id <- gsub("_.*", "", transcripts$gene_id, perl = TRUE)

# Convert SNP sets to GRanges for overlap
outside_correlated_gr <- with(outside_correlated, {
  GRanges(
    seqnames = CHR,
    ranges = IRanges(start = BP, end = BP),
    ID = ID
  )
})

# 1. Outside shared introgression (dispersed background)
olap_outside <- findOverlaps(outside_correlated_gr, transcripts_gff, 
                              ignore.strand = TRUE)
outside_correlated_genes <- transcripts$gene_id[subjectHits(olap_outside)] %>%
  sort() %>%
  unique()

cat("Outside shared introgression:\n")
## Outside shared introgression:
cat("  SNPs:", length(outside_correlated_gr), "\n")
##   SNPs: 361
cat("  Unique SNP positions:", length(unique(queryHits(olap_outside))), "\n")
##   Unique SNP positions: 269
cat("  Genes overlapped:", length(outside_correlated_genes), "\n")
##   Genes overlapped: 52
# 2. Anti-correlated upstream region (154.3-157.0 Mb)
olap_anti <- findOverlaps(anticorrelated, transcripts_gff, 
                           ignore.strand = TRUE)
anticorrelated_genes <- transcripts$gene_id[subjectHits(olap_anti)] %>%
  sort() %>%
  unique()

cat("\nAnti-correlated upstream (154.3-157.0 Mb):\n")
## 
## Anti-correlated upstream (154.3-157.0 Mb):
cat("  SNPs:", length(anticorrelated), "\n")
##   SNPs: 207
cat("  Unique SNP positions:", length(unique(queryHits(olap_anti))), "\n")
##   Unique SNP positions: 177
cat("  Genes overlapped:", length(anticorrelated_genes), "\n")
##   Genes overlapped: 14
# 3. Inside shared introgression (157.0-195.9 Mb, 35 Mb)
olap_inside <- findOverlaps(inside_correlated, transcripts_gff, 
                             ignore.strand = TRUE)
inside_correlated_genes <- transcripts$gene_id[subjectHits(olap_inside)] %>%
  sort() %>%
  unique()

cat("\nInside shared introgression (157.0-195.9 Mb):\n")
## 
## Inside shared introgression (157.0-195.9 Mb):
cat("  SNPs:", length(inside_correlated), "\n")
##   SNPs: 7322
cat("  Unique SNP positions:", length(unique(queryHits(olap_inside))), "\n")
##   Unique SNP positions: 6910
cat("  Gene models overlapped:", length(inside_correlated_genes), "\n")
##   Gene models overlapped: 512
# Filter to expressed genes only (genes present in expression matrix)
expr_matrix <- read.csv(file.path(paths$intermediate, "PSU2022_gene_expression_matrix.csv"),
                        row.names = 1, check.names = FALSE)
expressed_genes <- rownames(expr_matrix)

inside_expressed <- inside_correlated_genes[inside_correlated_genes %in% expressed_genes]
outside_expressed <- outside_correlated_genes[outside_correlated_genes %in% expressed_genes]
anti_expressed <- anticorrelated_genes[anticorrelated_genes %in% expressed_genes]

cat("\nExpressed gene counts (genes in RNA-seq expression matrix):\n")
## 
## Expressed gene counts (genes in RNA-seq expression matrix):
cat("  Inside shared introgression:", length(inside_expressed), "expressed genes\n")
##   Inside shared introgression: 512 expressed genes
cat("  Outside dispersed:", length(outside_expressed), "expressed genes\n")
##   Outside dispersed: 52 expressed genes
cat("  Anti-correlated upstream:", length(anti_expressed), "expressed genes\n")
##   Anti-correlated upstream: 14 expressed genes
# 4. Create filter list and classification table
# Only filter genes OUTSIDE the introgression that are correlated
# Note: anticorrelated genes are a SUBSET of outside_correlated genes
# (they're outside the introgression, specifically in the upstream region)

# Filter list is simply all outside genes (includes anticorrelated as subset)
outside_inv4m_genes <- outside_correlated_genes

cat("\nGenes to filter out (outside Inv4m but correlated):", 
    length(outside_inv4m_genes), "\n")
## 
## Genes to filter out (outside Inv4m but correlated): 52
cat("  Total outside (includes anti-correlated):", length(outside_correlated_genes), "\n")
##   Total outside (includes anti-correlated): 52
cat("  Anti-correlated subset:", length(anticorrelated_genes), 
    sprintf("(%.1f%% of outside)", 100 * length(anticorrelated_genes) / length(outside_correlated_genes)), 
    "\n")
##   Anti-correlated subset: 14 (26.9% of outside)
# Create classification table for all correlated genes
# Anti-correlated genes get their own label even though they're subset of outside
gene_classification <- data.frame(
  gene_id = c(inside_correlated_genes, 
              outside_correlated_genes),
  classification = c(
    rep("inside_introgression", length(inside_correlated_genes)),
    rep("outside_dispersed", length(outside_correlated_genes))
  )
) %>%
  distinct() %>%
  mutate(
    classification = ifelse(
      gene_id %in% anticorrelated_genes,
      "outside_anticorrelated",
      classification
    )
  )

cat("\nTotal genes correlated with Inv4m:", nrow(gene_classification), "\n")
## 
## Total genes correlated with Inv4m: 564
cat("  Inside introgression:", sum(gene_classification$classification == "inside_introgression"), "\n")
##   Inside introgression: 512
cat("  Outside dispersed:", sum(gene_classification$classification == "outside_dispersed"), "\n")
##   Outside dispersed: 38
cat("  Outside anti-correlated:", sum(gene_classification$classification == "outside_anticorrelated"), "\n")
##   Outside anti-correlated: 14
# Save filter list (only outside genes)
saveRDS(outside_inv4m_genes, file.path(paths$intermediate, "inv4m_linked_genes_to_filter.rds"))
write.table(
  data.frame(gene_id = outside_inv4m_genes),
  file = file.path(paths$intermediate, "inv4m_linked_genes_to_filter.txt"),
  row.names = FALSE,
  quote = FALSE
)

# Save complete classification table
write.table(
  gene_classification,
  file = file.path(paths$intermediate, "inv4m_correlated_genes_classification.txt"),
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)
saveRDS(gene_classification, file.path(paths$intermediate, "inv4m_correlated_genes_classification.rds"))

cat("\nFiles saved:\n")
## 
## Files saved:
cat("  Filter list (outside genes only):\n")
##   Filter list (outside genes only):
cat("    - inv4m_linked_genes_to_filter.rds\n")
##     - inv4m_linked_genes_to_filter.rds
cat("    - inv4m_linked_genes_to_filter.txt\n")
##     - inv4m_linked_genes_to_filter.txt
cat("  Classification table (all correlated genes):\n")
##   Classification table (all correlated genes):
cat("    - inv4m_correlated_genes_classification.txt\n")
##     - inv4m_correlated_genes_classification.txt
cat("    - inv4m_correlated_genes_classification.rds\n")
##     - inv4m_correlated_genes_classification.rds

Gene Overlap of Fixed Alternate Allele Markers

Purpose: Quantify how many fixed alternate allele markers (computed in the Fixed Alternate Allele Analysis section above) overlap expressed gene models. This is the gene-level count corresponding to the “fixed highland introgression” category in the manuscript.

# Build GRanges for fixed markers
fixed_markers <- data.frame(
  marker = colnames(ngt)[fixed_mi21],
  stringsAsFactors = FALSE
) %>%
  mutate(
    CHR = gsub("^S(\\d+)_.*", "\\1", marker),
    BP = as.numeric(gsub("^S\\d+_", "", marker))
  )

fixed_gr <- GRanges(
  seqnames = fixed_markers$CHR,
  ranges = IRanges(start = fixed_markers$BP, end = fixed_markers$BP)
)

# Overlap with gene models (transcripts_gff loaded in gene_overlap chunk)
olap_fixed <- findOverlaps(fixed_gr, transcripts_gff, ignore.strand = TRUE)
fixed_snps_in_genes <- length(unique(queryHits(olap_fixed)))
fixed_genes <- transcripts$gene_id[subjectHits(olap_fixed)] %>%
  sort() %>% unique()

cat("Fixed alternate allele markers overlapping gene models:\n")
## Fixed alternate allele markers overlapping gene models:
cat("  Total fixed markers:", sum(fixed_mi21), "\n")
##   Total fixed markers: 1394
cat("  Markers in genes:", fixed_snps_in_genes, "\n")
##   Markers in genes: 1088
cat("  Unique genes:", length(fixed_genes), "\n")
##   Unique genes: 162
# Breakdown by chromosome
fixed_gene_markers <- fixed_markers[queryHits(olap_fixed), ] %>% distinct()
fixed_gene_by_chr <- fixed_gene_markers %>%
  group_by(CHR) %>%
  summarise(n_snps_in_genes = n(), .groups = "drop")

cat("\nFixed markers in genes by chromosome:\n")
## 
## Fixed markers in genes by chromosome:
print(as.data.frame(fixed_gene_by_chr))
##    CHR n_snps_in_genes
## 1    1             603
## 2   10               5
## 3    2              27
## 4    3              16
## 5    4               9
## 6    5             389
## 7    6              15
## 8    7              10
## 9    8               7
## 10   9               7

Example: Applying Filter to DE Results

# Example showing how to filter DE results to exclude Inv4m-linked genes
# This removes genes outside Inv4m that show linkage (trans-effects/background)

# Load your DE results (example structure)
# de_results <- read.csv("differential_expression_results.csv")

# Filter out genes outside Inv4m that are correlated with it
de_filtered <- de_results %>%
  filter(!gene %in% outside_inv4m_genes)

# Compare before/after
cat("DE results before filtering:", nrow(de_results), "genes\n")
cat("DE results after filtering:", nrow(de_filtered), "genes\n")
cat("Genes removed:", nrow(de_results) - nrow(de_filtered), "\n")

# This filtered set excludes background linkage effects
# Genes inside Inv4m are retained (cis-regulatory effects of interest)

SNP Distribution Analysis

Purpose: Compare genome-wide vs. Inv4m-linked marker distributions.

Approach: Calculate chromosome-specific proportions for all SNPs and significant Inv4m-correlated SNPs.

inv4_distro <- data.frame(
  CHR = factor(1:10, levels = 1:10),
  all = with(inv4m_cor, table(CHR)) %>% as.numeric(),
  inv4m = with(inv4m_cor %>% filter(is_significant), table(CHR)) %>% 
    as.numeric()
) %>%
  mutate(
    all_pct = 100 * all / sum(all),
    inv4m_pct = 100 * inv4m / sum(inv4m)
  )

cat("SNP distribution:\n")
## SNP distribution:
cat("  Chr4 contains", inv4_distro$all[4], "SNPs (", 
    round(inv4_distro$all_pct[4], 1), "% of total)\n")
##   Chr4 contains 8892 SNPs ( 44.8 % of total)
cat("  Chr4 Inv4m-linked:", inv4_distro$inv4m[4], "SNPs (",
    round(inv4_distro$inv4m_pct[4], 1), "% of significant)\n")
##   Chr4 Inv4m-linked: 7547 SNPs ( 98.2 % of significant)
cat("  Other chromosomes average:", 
    round(mean(inv4_distro$all[-4]), 0), "SNPs per chromosome\n")
##   Other chromosomes average: 1219 SNPs per chromosome
p_distribution <- inv4_distro %>%
  tidyr::pivot_longer(
    cols = c("all_pct", "inv4m_pct"),
    names_to = "marker_type",
    values_to = "pct"
  ) %>%
  ggplot(aes(y = pct, x = CHR, group = marker_type, fill = marker_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    labels = c(
      sprintf("All SNPs (n=%s)", format(sum(inv4_distro$all), big.mark = ",")),
      sprintf("Inv4m-correlated (n=%s)", format(sum(inv4_distro$inv4m), big.mark = ","))
    ),
    values = c("gold", "purple4")
  ) +
  ylab("%") +
  ggpubr::theme_classic2(base_size = 20) +
  theme(
    legend.position = c(0.7, 0.8),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )

p_distribution

Manhattan Plots

Purpose: Visualize genome-wide LD patterns.

Approach: Plot -log10(FDR) and correlation coefficients across chromosomes.

# Calculate cumulative positions
manhattan <- inv4m_cor %>%
  group_by(CHR) %>%
  summarise(chr_len = max(BP)) %>%
  mutate(tot = cumsum(chr_len) - chr_len) %>%
  select(-chr_len) %>%
  left_join(inv4m_cor, ., by = "CHR") %>%
  arrange(CHR, BP) %>%
  mutate(BPcum = BP + tot)

# Chromosome centers for x-axis
axisdf <- manhattan %>%
  group_by(CHR) %>%
  summarize(center = (max(BPcum) + min(BPcum)) / 2)

# Handle zero FDR values
manhattan$FDR[manhattan$FDR == 0] <- 1e-8
p_fdr <- manhattan %>%
  ggplot(aes(x = BPcum, y = -log10(FDR))) +
  geom_point(aes(color = as.factor(CHR)), alpha = 0.8, size = 1.3) +
  scale_color_manual(values = rep(c("gold", "purple4"), 22)) +
  geom_hline(yintercept = -log10(significance_threshold), col = "red") +
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  coord_cartesian(ylim = c(0, 10)) +
  theme_bw(base_size = 20) +
  theme(
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

p_fdr

p_r <- manhattan %>%
  ggplot(aes(x = BPcum, y = r)) +
  geom_point(aes(color = as.factor(CHR)), size = 1.3) +
  geom_point(
    data = manhattan %>% filter(FDR >= significance_threshold),
    aes(color = as.factor(CHR)),
    fill = "white",
    shape = 21,
    size = 1.3
  ) +
  scale_color_manual(values = rep(c("gold", "purple4"), 22)) +
  xlab("Chromosome") +
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  coord_cartesian(ylim = c(-1.1, 1.1)) +
  theme_bw(base_size = 20) +
  theme(
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

p_r

Combined Figure

combined <- ggpubr::ggarrange(
  p_distribution,
  p_fdr,
  p_r,
  p_genotype,
  nrow = 4,
  align = "hv",
  heights = c(2, 2, 3, 3)
)

combined

Summary

# LD statistics
ld_summary <- inv4m_cor %>%
  group_by(CHR) %>%
  summarise(
    total = n(),
    significant = sum(is_significant, na.rm = TRUE),
    pct_sig = round(100 * significant / total, 1),
    mean_r = round(mean(r, na.rm = TRUE), 3),
    mean_R2 = round(mean(R2, na.rm = TRUE), 3)
  )

knitr::kable(ld_summary, caption = "LD summary by chromosome")
LD summary by chromosome
CHR total significant pct_sig mean_r mean_R2
1 2173 28 1.3 0.038 0.104
2 1198 9 0.8 -0.001 0.092
3 1069 9 0.8 0.027 0.095
4 8892 7547 84.9 0.801 0.829
5 1595 30 1.9 0.051 0.112
6 1885 4 0.2 0.028 0.085
7 731 4 0.5 0.015 0.091
8 854 27 3.2 0.043 0.114
9 762 7 0.9 0.041 0.093
10 702 18 2.6 0.002 0.114
# Genotype composition
geno_summary <- by_length %>%
  group_by(group, Genotype) %>%
  summarise(
    mean_length = round(mean(length), 1),
    mean_pct = round(mean(pct), 1),
    .groups = "drop"
  )

knitr::kable(geno_summary, caption = "Mean genotype composition")
Mean genotype composition
group Genotype mean_length mean_pct
CTRL B73 1849.3 86.9
CTRL Het 170.0 8.0
CTRL Mi21 102.0 4.8
CTRL missing 6.8 0.3
Inv4m B73 1795.9 84.4
Inv4m Het 194.6 9.1
Inv4m Mi21 129.8 6.1
Inv4m missing 7.8 0.4

Key Findings

  1. Chromosome 4 enrichment: 98.2% of Inv4m-correlated SNPs map to Chr4

  2. Perfect LD markers: 3758 SNPs show R² = 1 with tagging marker

  3. Genes outside Inv4m to filter: 66 genes (dispersed + anti-correlated regions)

  4. Introgression extent: Shared segment spans 38.9 Mb vs. 15.2 Mb for Inv4m core

  5. Fixed alternate alleles: 1394 markers are homozygous for the alternate allele across all individuals. Only 0.8% are on chromosome 4. The majority map to chromosomes 1, 5, and 6, consistent with B73 seed stock divergence from the reference genome (Liang & Schnable 2016)

Session Info

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] cowplot_1.2.0        rtracklayer_1.70.1   GenomicRanges_1.62.1
##  [4] Seqinfo_1.0.0        IRanges_2.44.0       S4Vectors_0.48.0    
##  [7] BiocGenerics_0.56.0  generics_0.1.4       ggtext_0.1.2        
## [10] ggplot2_4.0.1        dplyr_1.1.4          vcfR_1.15.0         
## [13] here_1.0.2          
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-9                permute_0.9-8              
##   [3] rlang_1.1.6                 magrittr_2.0.4             
##   [5] otel_0.2.0                  matrixStats_1.5.0          
##   [7] compiler_4.5.2              mgcv_1.9-4                 
##   [9] systemfonts_1.3.1           vctrs_0.6.5                
##  [11] stringr_1.6.0               pkgconfig_2.0.3            
##  [13] crayon_1.5.3                memuse_4.2-3               
##  [15] fastmap_1.2.0               backports_1.5.0            
##  [17] XVector_0.50.0              labeling_0.4.3             
##  [19] utf8_1.2.6                  Rsamtools_2.26.0           
##  [21] rmarkdown_2.30              markdown_2.0               
##  [23] ggbeeswarm_0.7.3            ragg_1.5.0                 
##  [25] purrr_1.2.0                 xfun_0.55                  
##  [27] cachem_1.1.0                cigarillo_1.0.0            
##  [29] litedown_0.9                jsonlite_2.0.0             
##  [31] DelayedArray_0.36.0         BiocParallel_1.44.0        
##  [33] broom_1.0.11                parallel_4.5.2             
##  [35] cluster_2.1.8.1             R6_2.6.1                   
##  [37] bslib_0.9.0                 stringi_1.8.7              
##  [39] RColorBrewer_1.1-3          car_3.1-3                  
##  [41] jquerylib_0.1.4             Rcpp_1.1.0                 
##  [43] SummarizedExperiment_1.40.0 knitr_1.51                 
##  [45] Matrix_1.7-4                splines_4.5.2              
##  [47] tidyselect_1.2.1            dichromat_2.0-0.1          
##  [49] abind_1.4-8                 yaml_2.3.12                
##  [51] vegan_2.7-2                 codetools_0.2-20           
##  [53] curl_7.0.0                  lattice_0.22-7             
##  [55] tibble_3.3.0                Biobase_2.70.0             
##  [57] withr_3.0.2                 S7_0.2.1                   
##  [59] evaluate_1.0.5              pinfsc50_1.3.0             
##  [61] xml2_1.5.1                  Biostrings_2.78.0          
##  [63] pillar_1.11.1               ggpubr_0.6.2               
##  [65] MatrixGenerics_1.22.0       carData_3.0-5              
##  [67] rprojroot_2.1.1             RCurl_1.98-1.17            
##  [69] commonmark_2.0.0            scales_1.4.0               
##  [71] glue_1.8.0                  tools_4.5.2                
##  [73] BiocIO_1.20.0               ggsignif_0.6.4             
##  [75] GenomicAlignments_1.46.0    XML_3.99-0.20              
##  [77] grid_4.5.2                  tidyr_1.3.2                
##  [79] ape_5.8-1                   nlme_3.1-168               
##  [81] beeswarm_0.4.0              vipor_0.4.7                
##  [83] restfulr_0.0.16             Formula_1.2-5              
##  [85] cli_3.6.5                   textshaping_1.0.4          
##  [87] S4Arrays_1.10.1             viridisLite_0.4.2          
##  [89] svglite_2.2.2               gtable_0.3.6               
##  [91] rstatix_0.7.3               sass_0.4.10                
##  [93] digest_0.6.39               SparseArray_1.10.8         
##  [95] rjson_0.2.23                farver_2.1.2               
##  [97] htmltools_0.5.9             lifecycle_1.0.4            
##  [99] httr_1.4.7                  gridtext_0.1.5             
## [101] MASS_7.3-65