Overview

Purpose: Identify Inv4m DEGs that overlap published flowering time (FT) and plant height (PH) candidate gene lists. Produces Table 2 (tab:FT_PH_candidates) in the manuscript.

Methods reference: Lines 1406–1432 of main.tex

Sources for FT candidates:

  1. Wang et al. 2021 — 991 highland-adapted FT genes (v3 IDs)
  2. Li et al. 2022 — 62 FT genes (v4 IDs)
  3. GWAS Atlas — “days to flowering” PPTO:0000155 (v4 coords)
  4. Tibbs-Cortes et al. 2024 — plasticity SNPs for DTS/DTA (v5 coords)

Sources for PH candidates:

  1. Li et al. 2022 — 23 PH genes (v4 IDs, hardcoded)
  2. GWAS Atlas — “plant height” PPTO:0000126 (v4 coords)
  3. Tibbs-Cortes et al. 2024 — plasticity SNPs for PH (v5 coords)

Setup

library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
library(xtable)
# Thresholds
FDR_THRESHOLD <- 0.05
LOGFC_THRESHOLD <- 1.5
FLANK_BP <- 5000 # ±5 kb extension for GWAS SNP–gene overlap

# Inv4m region boundaries (v5 coordinates)
INV4M_START <- 172883675 # Zm00001eb190470 gene start
INV4M_END   <- 188132113 # Zm00001eb194800 gene end

# Shared introgression region (broader)
SHARED_START <- 157012149
SHARED_END   <- 195900523

Load Gene Annotations

# Gene symbols (for multi-alias expansion, e.g. jmj2/jmj6/jmj9)
gene_symbols <- read.delim(
  file.path(paths$data, "gene_symbol.tab"),
  stringsAsFactors = FALSE
)

# Master labels (consolidated: curated > symbol > llm_proposed)
master_labels <- read.csv(
  file.path(paths$data, "locus_labels_master.csv"),
  stringsAsFactors = FALSE
) %>%
  dplyr::select(gene_model = 1, master_label = 4) %>%
  dplyr::filter(!is.na(master_label) & master_label != "")

# PANNZER functional descriptions — keep first per gene
gene_pannzer <- read.delim(
  file.path(paths$data, "PANNZER_DESC.tab"),
  stringsAsFactors = FALSE
) %>%
  group_by(gene_model) %>%
  dplyr::slice(1) %>%
  ungroup()

# v3 → v5 cross-reference (no header; one-to-many comma-separated)
v3_v5_raw <- read.delim(
  file.path(paths$data, "B73v3_to_B73v5.tsv"),
  header = FALSE,
  col.names = c("v3", "v5_list"),
  stringsAsFactors = FALSE
)
v3_v5 <- v3_v5_raw %>%
  separate_rows(v5_list, sep = ",") %>%
  dplyr::rename(v5 = v5_list) %>%
  dplyr::filter(v5 != "")

# v4 → v5 cross-reference
v4_v5_raw <- read.delim(
  file.path(paths$data, "B73v4_to_B73v5.tsv"),
  header = FALSE,
  col.names = c("v4", "v5_list"),
  stringsAsFactors = FALSE
)
v4_v5 <- v4_v5_raw %>%
  separate_rows(v5_list, sep = ",") %>%
  dplyr::rename(v5 = v5_list) %>%
  dplyr::filter(v5 != "")

cat("v3→v5 mappings:", nrow(v3_v5), "\n")
## v3→v5 mappings: 36147
cat("v4→v5 mappings:", nrow(v4_v5), "\n")
## v4→v5 mappings: 38564

Load Gene Coordinates

# v5 GFF — genes on chr 1-10 only
v5_gff <- import(file.path(paths$data,
  "Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"))
v5_genes <- v5_gff[v5_gff$type == "gene" &
                    seqnames(v5_gff) %in% as.character(1:10)]
v5_genes$gene_id <- sub("^gene:", "", v5_genes$ID)
names(v5_genes) <- v5_genes$gene_id

# Extended ±5 kb version for SNP overlap
v5_genes_ext <- v5_genes
start(v5_genes_ext) <- pmax(1, start(v5_genes_ext) - FLANK_BP)
end(v5_genes_ext) <- end(v5_genes_ext) + FLANK_BP

cat("v5 genes loaded:", length(v5_genes), "\n")
## v5 genes loaded: 43459
# v4 GFF — genes on chr 1-10 only
v4_gff <- import(file.path(paths$data,
  "Zm-B73-REFERENCE-GRAMENE-4.0_Zm00001d.2.gff3"))
v4_genes <- v4_gff[v4_gff$type == "gene" &
                    seqnames(v4_gff) %in% as.character(1:10)]
v4_genes$gene_id <- sub("^gene:", "", v4_genes$ID)
names(v4_genes) <- v4_genes$gene_id

# Extended ±5 kb version for GWAS Atlas SNP overlap
v4_genes_ext <- v4_genes
start(v4_genes_ext) <- pmax(1, start(v4_genes_ext) - FLANK_BP)
end(v4_genes_ext) <- end(v4_genes_ext) + FLANK_BP

cat("v4 genes loaded:", length(v4_genes), "\n")
## v4 genes loaded: 39005

Load Differential Expression Results

effects_all <- read.csv(
  file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv"),
  stringsAsFactors = FALSE
)

# Filter to Inv4m predictor, keep best p-value per gene
effects <- effects_all %>%
  dplyr::filter(predictor == "Inv4m") %>%
  mutate(
    is_significant = adj.P.Val < FDR_THRESHOLD,
    is_topDEG = is_significant & abs(logFC) > LOGFC_THRESHOLD
  ) %>%
  group_by(gene) %>%
  arrange(adj.P.Val) %>%
  dplyr::slice(1) %>%
  ungroup()

cat("Total Inv4m-tested genes:", nrow(effects), "\n")
## Total Inv4m-tested genes: 24011
cat("Significant DEGs (FDR < 0.05):", sum(effects$is_significant), "\n")
## Significant DEGs (FDR < 0.05): 465
cat("Top DEGs (|logFC| >", LOGFC_THRESHOLD, "):",
    sum(effects$is_topDEG), "\n")
## Top DEGs (|logFC| > 1.5 ): 155

Build Flowering Time Candidate List

Wang et al. 2021 (991 highland-adapted FT genes)

liwang <- read.csv(
  file.path(paths$data, "floweringTime_genes_liwang.csv"),
  stringsAsFactors = FALSE
)

# Convert v3 IDs to v5
wang_v5 <- liwang %>%
  dplyr::select(v3 = gene) %>%
  inner_join(v3_v5, by = "v3", relationship = "many-to-many") %>%
  pull(v5) %>%
  unique()

cat("Wang 2021: ", nrow(liwang), "v3 genes →", length(wang_v5), "v5 genes\n")
## Wang 2021:  904 v3 genes → 991 v5 genes

Li et al. 2022 (62 FT genes)

li2022_ft_v4 <- readLines(file.path(paths$data, "li2022_FT.list"))
li2022_ft_v4 <- li2022_ft_v4[li2022_ft_v4 != ""]

li2022_ft_v5 <- v4_v5 %>%
  dplyr::filter(v4 %in% li2022_ft_v4) %>%
  pull(v5) %>%
  unique()

cat("Li 2022 FT:", length(li2022_ft_v4), "v4 genes →",
    length(li2022_ft_v5), "v5 genes\n")
## Li 2022 FT: 416 v4 genes → 62 v5 genes

GWAS Atlas — days to flowering (PPTO:0000155)

atlas <- read.delim(
  file.path(paths$data, "gwas_association_result_for_maize.txt"),
  stringsAsFactors = FALSE,
  comment.char = "",
  check.names = FALSE
)

# Standardise column names
colnames(atlas)[colnames(atlas) == "#GaP_id"] <- "idx"
colnames(atlas)[colnames(atlas) == "Trait accession"] <- "TOID"
colnames(atlas)[colnames(atlas) == "P-value"] <- "p.value"
colnames(atlas)[colnames(atlas) == "PMID"] <- "PUBMID"
colnames(atlas)[colnames(atlas) == "First author"] <- "first_author"

# FT associations
atlas_ft <- atlas %>% dplyr::filter(TOID == "PPTO:0000155")

# Build GRanges for FT SNPs (v4 coordinates)
atlas_ft_gr <- with(atlas_ft,
  GRanges(
    seqnames = Chr,
    ranges = IRanges(start = Pos, end = Pos),
    idx = idx,
    p.value = as.numeric(p.value),
    PUBMID = PUBMID,
    first_author = first_author
  )
)

# Overlap with v4 genes (±5 kb)
olap <- findOverlaps(atlas_ft_gr, v4_genes_ext)

atlas_ft_hits <- data.frame(
  v4 = v4_genes_ext$gene_id[subjectHits(olap)],
  p.value = atlas_ft_gr$p.value[queryHits(olap)],
  stringsAsFactors = FALSE
) %>%
  group_by(v4) %>%
  arrange(p.value) %>%
  dplyr::slice(1) %>%  # best p-value per gene
  ungroup()

# Convert to v5
atlas_ft_v5 <- atlas_ft_hits %>%
  inner_join(v4_v5, by = "v4", relationship = "many-to-many") %>%
  group_by(v5) %>%
  arrange(p.value) %>%
  dplyr::slice(1) %>%
  ungroup()

cat("GWAS Atlas FT: ", nrow(atlas_ft), "SNPs →",
    nrow(atlas_ft_hits), "v4 genes →",
    length(unique(atlas_ft_v5$v5)), "v5 genes\n")
## GWAS Atlas FT:  4392 SNPs → 2051 v4 genes → 2067 v5 genes

Tibbs-Cortes et al. 2024 — FT plasticity SNPs

plasticity <- read.csv(
  file.path(paths$data, "Supplemental_Table_S3.csv"),
  skip = 1,
  stringsAsFactors = FALSE
)

# FT traits: DTS_slope, DTS_intcp, DTA_slope, DTA_intcp
tibbs_ft_long <- plasticity %>%
  dplyr::select(seqnames = Chr, start = bp, Marker,
                DTS_slope, DTS_intcp, DTA_slope, DTA_intcp) %>%
  tidyr::pivot_longer(
    cols = c(DTS_slope, DTS_intcp, DTA_slope, DTA_intcp),
    values_to = "P", names_to = "trait"
  ) %>%
  dplyr::filter(!is.na(P))

# Build GRanges (v5 coordinates)
tibbs_ft_gr <- with(tibbs_ft_long,
  GRanges(
    seqnames = seqnames,
    ranges = IRanges(start = start, end = start),
    P = P,
    Marker = Marker
  )
)

# Overlap with v5 genes (±5 kb)
olap <- findOverlaps(tibbs_ft_gr, v5_genes_ext)

tibbs_ft_hits <- data.frame(
  gene = v5_genes_ext$gene_id[subjectHits(olap)],
  p.value = tibbs_ft_gr$P[queryHits(olap)],
  stringsAsFactors = FALSE
) %>%
  dplyr::filter(!is.na(gene)) %>%
  dplyr::distinct() %>%
  group_by(gene) %>%
  arrange(p.value) %>%
  dplyr::slice(1) %>%
  ungroup()

cat("Tibbs-Cortes FT:", nrow(tibbs_ft_long), "SNP-trait pairs →",
    length(unique(tibbs_ft_hits$gene)), "v5 genes\n")
## Tibbs-Cortes FT: 45031 SNP-trait pairs → 1137 v5 genes

Combine FT Candidate List

FT_candidates <- sort(unique(c(
  wang_v5,
  li2022_ft_v5,
  atlas_ft_v5$v5,
  tibbs_ft_hits$gene
)))

cat("Nonredundant FT candidates:", length(FT_candidates),
    "(expected ~2224)\n")
## Nonredundant FT candidates: 4007 (expected ~2224)

Build Plant Height Candidate List

Li et al. 2022 (23 PH genes)

# Hardcoded v4 IDs from original analysis
li2022_ph_v4 <- c(
  "Zm00001d010396", "Zm00001d011069", "Zm00001d029657", "Zm00001d029656",
  "Zm00001d029658", "Zm00001d029659", "Zm00001d029758", "Zm00001d029884",
  "Zm00001d032747", "Zm00001d032776", "Zm00001d032788", "Zm00001d032790",
  "Zm00001d032791", "Zm00001d033204", "Zm00001d033206", "Zm00001d033205",
  "Zm00001d033223", "Zm00001d033224", "Zm00001d033225", "Zm00001d033226",
  "Zm00001d033228", "Zm00001d033227"
)

li2022_ph_v5 <- v4_v5 %>%
  dplyr::filter(v4 %in% li2022_ph_v4) %>%
  pull(v5) %>%
  unique()

cat("Li 2022 PH:", length(li2022_ph_v4), "v4 genes →",
    length(li2022_ph_v5), "v5 genes\n")
## Li 2022 PH: 22 v4 genes → 23 v5 genes

GWAS Atlas — plant height (PPTO:0000126)

atlas_ph <- atlas %>% dplyr::filter(TOID == "PPTO:0000126")

atlas_ph_gr <- with(atlas_ph,
  GRanges(
    seqnames = Chr,
    ranges = IRanges(start = Pos, end = Pos),
    idx = idx,
    p.value = as.numeric(p.value),
    PUBMID = PUBMID,
    first_author = first_author
  )
)

olap <- findOverlaps(atlas_ph_gr, v4_genes_ext)

atlas_ph_hits <- data.frame(
  v4 = v4_genes_ext$gene_id[subjectHits(olap)],
  p.value = atlas_ph_gr$p.value[queryHits(olap)],
  stringsAsFactors = FALSE
) %>%
  group_by(v4) %>%
  arrange(p.value) %>%
  dplyr::slice(1) %>%
  ungroup()

atlas_ph_v5 <- atlas_ph_hits %>%
  inner_join(v4_v5, by = "v4", relationship = "many-to-many") %>%
  group_by(v5) %>%
  arrange(p.value) %>%
  dplyr::slice(1) %>%
  ungroup()

cat("GWAS Atlas PH:", nrow(atlas_ph), "SNPs →",
    nrow(atlas_ph_hits), "v4 genes →",
    length(unique(atlas_ph_v5$v5)), "v5 genes\n")
## GWAS Atlas PH: 3908 SNPs → 2884 v4 genes → 2919 v5 genes

Tibbs-Cortes et al. 2024 — PH plasticity SNPs

tibbs_ph_long <- plasticity %>%
  dplyr::select(seqnames = Chr, start = bp, Marker,
                PH_slope, PH_intcp) %>%
  tidyr::pivot_longer(
    cols = c(PH_slope, PH_intcp),
    values_to = "P", names_to = "trait"
  ) %>%
  dplyr::filter(!is.na(P))

tibbs_ph_gr <- with(tibbs_ph_long,
  GRanges(
    seqnames = seqnames,
    ranges = IRanges(start = start, end = start),
    P = P,
    Marker = Marker
  )
)

olap <- findOverlaps(tibbs_ph_gr, v5_genes_ext)

tibbs_ph_hits <- data.frame(
  gene = v5_genes_ext$gene_id[subjectHits(olap)],
  p.value = tibbs_ph_gr$P[queryHits(olap)],
  stringsAsFactors = FALSE
) %>%
  dplyr::filter(!is.na(gene)) %>%
  dplyr::distinct() %>%
  group_by(gene) %>%
  arrange(p.value) %>%
  dplyr::slice(1) %>%
  ungroup()

cat("Tibbs-Cortes PH:", nrow(tibbs_ph_long), "SNP-trait pairs →",
    length(unique(tibbs_ph_hits$gene)), "v5 genes\n")
## Tibbs-Cortes PH: 646 SNP-trait pairs → 131 v5 genes

Combine PH Candidate List

PH_candidates <- sort(unique(c(
  li2022_ph_v5,
  atlas_ph_v5$v5,
  tibbs_ph_hits$gene
)))

cat("Nonredundant PH candidates:", length(PH_candidates),
    "(expected ~1272)\n")
## Nonredundant PH candidates: 3038 (expected ~1272)

Join DEGs × Candidates

# Flag candidate status in effects table
effects <- effects %>%
  mutate(
    is_FT_candidate = gene %in% FT_candidates,
    is_PH_candidate = gene %in% PH_candidates
  )

cat("Inv4m DEGs that are FT candidates:",
    sum(effects$is_significant & effects$is_FT_candidate), "\n")
## Inv4m DEGs that are FT candidates: 41
cat("Inv4m top DEGs that are FT candidates:",
    sum(effects$is_topDEG & effects$is_FT_candidate), "\n")
## Inv4m top DEGs that are FT candidates: 12
cat("Inv4m DEGs that are PH candidates:",
    sum(effects$is_significant & effects$is_PH_candidate), "\n")
## Inv4m DEGs that are PH candidates: 48
cat("Inv4m top DEGs that are PH candidates:",
    sum(effects$is_topDEG & effects$is_PH_candidate), "\n")
## Inv4m top DEGs that are PH candidates: 17
# Collect best GWAS p-value per gene across all sources (FT)
gwas_pval_ft <- bind_rows(
  atlas_ft_v5 %>% dplyr::select(gene = v5, p.value),
  tibbs_ft_hits %>% dplyr::select(gene, p.value)
) %>%
  group_by(gene) %>%
  arrange(p.value) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  dplyr::rename(gwas_pvalue_ft = p.value)

# Collect best GWAS p-value per gene across all sources (PH)
gwas_pval_ph <- bind_rows(
  atlas_ph_v5 %>% dplyr::select(gene = v5, p.value),
  tibbs_ph_hits %>% dplyr::select(gene, p.value)
) %>%
  group_by(gene) %>%
  arrange(p.value) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  dplyr::rename(gwas_pvalue_ph = p.value)
# Map GWAS Atlas PMIDs to BibTeX citation keys
pmid_to_cite <- c(
  "24514905" = "peiffer2014",
  "27012534" = "li2016",
  "22660547" = "jiao2012",
  "23840585" = "peiffer2013"
)

# Helper: resolve PMIDs to cite keys, adding liu2023 (the GWAS Atlas database)
resolve_atlas_cites <- function(pmids) {
  keys <- unique(na.omit(pmid_to_cite[as.character(pmids)]))
  # Always include liu2023 as the database reference
  sort(unique(c(keys, "liu2023")))
}
# --- GWAS Atlas: track actual PMIDs per gene ---

# FT atlas refs with PMIDs
atlas_ft_with_pmid <- data.frame(
  v4 = v4_genes_ext$gene_id[subjectHits(findOverlaps(atlas_ft_gr, v4_genes_ext))],
  PUBMID = atlas_ft_gr$PUBMID[queryHits(findOverlaps(atlas_ft_gr, v4_genes_ext))],
  stringsAsFactors = FALSE
) %>%
  inner_join(v4_v5, by = "v4", relationship = "many-to-many") %>%
  dplyr::select(gene = v5, PUBMID) %>%
  distinct()

# PH atlas refs with PMIDs
atlas_ph_with_pmid <- data.frame(
  v4 = v4_genes_ext$gene_id[subjectHits(findOverlaps(atlas_ph_gr, v4_genes_ext))],
  PUBMID = atlas_ph_gr$PUBMID[queryHits(findOverlaps(atlas_ph_gr, v4_genes_ext))],
  stringsAsFactors = FALSE
) %>%
  inner_join(v4_v5, by = "v4", relationship = "many-to-many") %>%
  dplyr::select(gene = v5, PUBMID) %>%
  distinct()

# Build per-gene citation lists for FT
ft_cite_list <- list()

# wang2021
for (g in wang_v5) ft_cite_list[[g]] <- c(ft_cite_list[[g]], "wang2021")
# li2022
for (g in li2022_ft_v5) ft_cite_list[[g]] <- c(ft_cite_list[[g]], "li2022")
# GWAS Atlas (with PMID resolution)
for (i in seq_len(nrow(atlas_ft_with_pmid))) {
  g <- atlas_ft_with_pmid$gene[i]
  cites <- resolve_atlas_cites(atlas_ft_with_pmid$PUBMID[i])
  ft_cite_list[[g]] <- c(ft_cite_list[[g]], cites)
}
# tibbs-cortes
for (g in tibbs_ft_hits$gene) ft_cite_list[[g]] <- c(ft_cite_list[[g]], "tibbs-cortes2024")

ft_refs <- data.frame(
  gene = names(ft_cite_list),
  ft_refs = sapply(ft_cite_list, function(x) paste(sort(unique(x)), collapse = ", ")),
  stringsAsFactors = FALSE,
  row.names = NULL
)

# Build per-gene citation lists for PH
ph_cite_list <- list()

# li2022
for (g in li2022_ph_v5) ph_cite_list[[g]] <- c(ph_cite_list[[g]], "li2022")
# GWAS Atlas (with PMID resolution)
for (i in seq_len(nrow(atlas_ph_with_pmid))) {
  g <- atlas_ph_with_pmid$gene[i]
  cites <- resolve_atlas_cites(atlas_ph_with_pmid$PUBMID[i])
  ph_cite_list[[g]] <- c(ph_cite_list[[g]], cites)
}
# tibbs-cortes
for (g in tibbs_ph_hits$gene) ph_cite_list[[g]] <- c(ph_cite_list[[g]], "tibbs-cortes2024")

ph_refs <- data.frame(
  gene = names(ph_cite_list),
  ph_refs = sapply(ph_cite_list, function(x) paste(sort(unique(x)), collapse = ", ")),
  stringsAsFactors = FALSE,
  row.names = NULL
)
# Location label helper
location_label <- function(in_inv4m, in_flank) {
  case_when(
    in_inv4m ~ "Inv4m",
    in_flank ~ "flanking",
    TRUE ~ "outside"
  )
}

# Genes with multiple aliases in gene_symbol.tab (for row expansion, e.g. jmj2/jmj6/jmj9)
gene_symbol_aliases <- gene_symbols %>%
  dplyr::filter(!is.na(locus_symbol) & locus_symbol != "")

# Identify multi-alias genes (genes with >1 alias in gene_symbol.tab)
multi_alias_genes <- gene_symbol_aliases %>%
  group_by(gene_model) %>%
  dplyr::filter(n() > 1) %>%
  ungroup()

# Annotate and expand: use desc (PANNZER) as primary description,
# use locus_labels_master.csv for shorthand labels,
# expand only multi-alias genes (e.g. jmj2/jmj6/jmj9) from gene_symbol.tab
annotate_and_expand <- function(df) {
  # Supplement description: prefer desc (PANNZER functional) over desc_merged
  df <- df %>%
    mutate(
      description = ifelse(!is.na(desc) & desc != "", desc, desc_merged),
      description = ifelse(is.na(description) | description == "",
                           "Uncharacterized protein", description)
    )

  # Expand multi-alias genes (creates multiple rows)
  genes_with_multi_aliases <- df %>%
    dplyr::filter(gene %in% multi_alias_genes$gene_model) %>%
    inner_join(
      multi_alias_genes %>% dplyr::select(gene_model, alias = locus_symbol),
      by = c("gene" = "gene_model"),
      relationship = "many-to-many"
    ) %>%
    mutate(locus_symbol = alias) %>%
    dplyr::select(-alias)

  # All other genes: use master label
  other_genes <- df %>%
    dplyr::filter(!(gene %in% multi_alias_genes$gene_model)) %>%
    left_join(master_labels, by = c("gene" = "gene_model")) %>%
    mutate(locus_symbol = ifelse(
      !is.na(master_label) & master_label != "",
      master_label,
      ifelse(is.na(locus_symbol) | locus_symbol == "", "", locus_symbol)
    )) %>%
    dplyr::select(-master_label)

  bind_rows(genes_with_multi_aliases, other_genes)
}

# FT top DEGs table
FT_table <- effects %>%
  dplyr::filter(is_topDEG & is_FT_candidate) %>%
  left_join(gwas_pval_ft, by = "gene") %>%
  left_join(ft_refs, by = "gene") %>%
  mutate(location = location_label(in_Inv4m, in_flank)) %>%
  dplyr::select(gene, locus_symbol, desc_merged, desc, logFC, neglogP,
                location, gwas_pvalue_ft, ft_refs) %>%
  annotate_and_expand() %>%
  arrange(-neglogP, gene, locus_symbol)

# PH top DEGs table
PH_table <- effects %>%
  dplyr::filter(is_topDEG & is_PH_candidate) %>%
  left_join(gwas_pval_ph, by = "gene") %>%
  left_join(ph_refs, by = "gene") %>%
  mutate(location = location_label(in_Inv4m, in_flank)) %>%
  dplyr::select(gene, locus_symbol, desc_merged, desc, logFC, neglogP,
                location, gwas_pvalue_ph, ph_refs) %>%
  annotate_and_expand() %>%
  arrange(-neglogP, gene, locus_symbol)

cat("FT top DEGs for table:", nrow(FT_table), "\n")
## FT top DEGs for table: 14
cat("PH top DEGs for table:", nrow(PH_table), "\n")
## PH top DEGs for table: 17
knitr::kable(
  FT_table %>% dplyr::select(gene, locus_symbol, description, logFC, neglogP,
                              location, gwas_pvalue_ft, ft_refs),
  digits = 2, caption = "Flowering Time candidate DEGs (|logFC| > 1.5)")
Flowering Time candidate DEGs (|logFC| > 1.5)
gene locus_symbol description logFC neglogP location gwas_pvalue_ft ft_refs
Zm00001eb191790 jmj2 Lysine-specific demethylase JMJ703 -3.49 21.93 Inv4m 0 tibbs-cortes2024, wang2021
Zm00001eb191790 jmj6 Lysine-specific demethylase JMJ703 -3.49 21.93 Inv4m 0 tibbs-cortes2024, wang2021
Zm00001eb191790 jmj9 Lysine-specific demethylase JMJ703 -3.49 21.93 Inv4m 0 tibbs-cortes2024, wang2021
Zm00001eb191820 jmj4 Putative lysine-specific demethylase JMJ16 -2.84 19.72 Inv4m NA wang2021
Zm00001eb060540 act1 Actin-1 2.23 9.93 outside NA wang2021
Zm00001eb070810 ychf Uncharacterized protein 4.19 9.70 outside NA li2016, liu2023
Zm00001eb192350 pkwd1 Protein kinase family protein / WD-40 repeat family protein -2.20 8.73 Inv4m 0 liu2023
Zm00001eb190090 nep1 Aspartic proteinase nepenthesin-1 7.18 7.00 flanking 0 liu2023, peiffer2013, peiffer2014
Zm00001eb197370 abi40 ABI3VP1-type transcription factor (Fragment) -2.25 5.77 outside 0 liu2023, peiffer2013, peiffer2014, wang2021
Zm00001eb428740 ocl2 Homeobox-leucine zipper protein ROC5 -2.66 5.46 outside NA wang2021
Zm00001eb186670 Uncharacterized protein -2.83 5.41 flanking 0 jiao2012, li2016, liu2023
Zm00001eb157030 Uncharacterized protein 5.68 4.84 outside NA li2016, liu2023
Zm00001eb191000 4cll5 4-coumarate–CoA ligase-like 5 -3.35 2.76 Inv4m NA wang2021
Zm00001eb187450 mybr58 HTH myb-type domain-containing protein 1.74 2.51 flanking NA wang2021
knitr::kable(
  PH_table %>% dplyr::select(gene, locus_symbol, description, logFC, neglogP,
                              location, gwas_pvalue_ph, ph_refs),
  digits = 2, caption = "Plant Height candidate DEGs (|logFC| > 1.5)")
Plant Height candidate DEGs (|logFC| > 1.5)
gene locus_symbol description logFC neglogP location gwas_pvalue_ph ph_refs
Zm00001eb194380 sec6 Exocyst complex component Sec6 2.87 16.53 Inv4m 0 liu2023, peiffer2014
Zm00001eb190240 Uncharacterized protein -9.16 14.16 flanking 0 liu2023
Zm00001eb189920 aldh2 Aldehyde dehydrogenase2 2.70 13.58 flanking 0 liu2023, peiffer2014
Zm00001eb190350 ank1 Ankyrin repeat-containing protein -1.83 12.96 flanking 0 liu2023
Zm00001eb196600 r3h1 R3H-assoc domain-containing protein -2.25 12.49 flanking 0 liu2023, peiffer2014
Zm00001eb195790 traf1 TRAF-type domain-containing protein 3.15 8.47 flanking 0 liu2023, peiffer2014
Zm00001eb123230 tip4c Tonoplast intrinsic protein 41 2.56 8.25 outside NA liu2023
Zm00001eb195970 mpv17 Mpv17/PMP22 family protein, expressed -2.41 7.75 flanking 0 liu2023, peiffer2014
Zm00001eb188570 his2b5 histone 2B5 -5.61 7.65 flanking NA liu2023
Zm00001eb196920 Uncharacterized protein -2.81 7.37 flanking 0 liu2023, peiffer2014
Zm00001eb192120 enth1 ENTH domain-containing protein -2.41 5.70 Inv4m 0 liu2023, peiffer2014
Zm00001eb186670 Uncharacterized protein -2.83 5.41 flanking 0 liu2023, peiffer2014
Zm00001eb195080 imk3b probable leucine-rich repeat receptor-like protein kinase IMK3 -4.68 5.34 flanking 0 liu2023, peiffer2014
Zm00001eb332450 roc4-2 Peptidyl-prolyl cis-trans isomerase -2.92 4.77 outside 0 liu2023, peiffer2014
Zm00001eb190380 rie1 E3 ubiquitin protein ligase RIE1 2.15 2.06 flanking 0 liu2023
Zm00001eb197300 stk1 Non-specific serine/threonine protein kinase -2.62 2.00 flanking 0 liu2023, peiffer2014
Zm00001eb194020 phi1 Phi-1-like phosphate-induced protein -2.39 1.86 Inv4m 0 liu2023, peiffer2014

Fisher Enrichment Tests

cat("=== Flowering Time Enrichment ===\n\n")
## === Flowering Time Enrichment ===
# Top DEGs vs FT candidates
ft_top <- table(effects$is_topDEG, effects$is_FT_candidate)
dimnames(ft_top) <- list(TopDEG = c("No", "Yes"),
                         FT_candidate = c("No", "Yes"))
cat("Top DEGs (|logFC| >", LOGFC_THRESHOLD, ") × FT candidates:\n")
## Top DEGs (|logFC| > 1.5 ) × FT candidates:
print(ft_top)
##       FT_candidate
## TopDEG    No   Yes
##    No  21130  2726
##    Yes   143    12
cat("\nFisher exact test (two-sided):\n")
## 
## Fisher exact test (two-sided):
print(fisher.test(ft_top))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  ft_top
## p-value = 0.1641
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.3279357 1.1739395
## sample estimates:
## odds ratio 
##  0.6504676
cat("\nFisher exact test (greater):\n")
## 
## Fisher exact test (greater):
print(fisher.test(ft_top, alternative = "greater"))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  ft_top
## p-value = 0.9478
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.3671634       Inf
## sample estimates:
## odds ratio 
##  0.6504676
# All significant DEGs vs FT candidates
ft_sig <- table(effects$is_significant, effects$is_FT_candidate)
dimnames(ft_sig) <- list(DEG = c("No", "Yes"),
                         FT_candidate = c("No", "Yes"))
cat("\nAll significant DEGs × FT candidates:\n")
## 
## All significant DEGs × FT candidates:
print(ft_sig)
##      FT_candidate
## DEG      No   Yes
##   No  20849  2697
##   Yes   424    41
cat("\nFisher exact test (two-sided):\n")
## 
## Fisher exact test (two-sided):
print(fisher.test(ft_sig))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  ft_sig
## p-value = 0.07717
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.5273949 1.0343878
## sample estimates:
## odds ratio 
##  0.7475275
cat("\nFisher exact test (greater):\n")
## 
## Fisher exact test (greater):
print(fisher.test(ft_sig, alternative = "greater"))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  ft_sig
## p-value = 0.971
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.5580931       Inf
## sample estimates:
## odds ratio 
##  0.7475275
cat("=== Plant Height Enrichment ===\n\n")
## === Plant Height Enrichment ===
ph_top <- table(effects$is_topDEG, effects$is_PH_candidate)
dimnames(ph_top) <- list(TopDEG = c("No", "Yes"),
                         PH_candidate = c("No", "Yes"))
cat("Top DEGs (|logFC| >", LOGFC_THRESHOLD, ") × PH candidates:\n")
## Top DEGs (|logFC| > 1.5 ) × PH candidates:
print(ph_top)
##       PH_candidate
## TopDEG    No   Yes
##    No  21666  2190
##    Yes   138    17
cat("\nFisher exact test (two-sided):\n")
## 
## Fisher exact test (two-sided):
print(fisher.test(ph_top))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  ph_top
## p-value = 0.4041
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.688717 2.027985
## sample estimates:
## odds ratio 
##    1.21871
ph_sig <- table(effects$is_significant, effects$is_PH_candidate)
dimnames(ph_sig) <- list(DEG = c("No", "Yes"),
                         PH_candidate = c("No", "Yes"))
cat("\nAll significant DEGs × PH candidates:\n")
## 
## All significant DEGs × PH candidates:
print(ph_sig)
##      PH_candidate
## DEG      No   Yes
##   No  21387  2159
##   Yes   417    48
cat("\nFisher exact test (two-sided):\n")
## 
## Fisher exact test (two-sided):
print(fisher.test(ph_sig))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  ph_sig
## p-value = 0.3732
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.8247723 1.5455306
## sample estimates:
## odds ratio 
##   1.140239

Output

CSV

out_csv <- bind_rows(
  FT_table %>% mutate(
    pheno = "Flowering Time",
    gwas_pvalue = gwas_pvalue_ft,
    refs = ft_refs
  ) %>% dplyr::select(-gwas_pvalue_ft, -ft_refs),
  PH_table %>% mutate(
    pheno = "Plant Height",
    gwas_pvalue = gwas_pvalue_ph,
    refs = ph_refs
  ) %>% dplyr::select(-gwas_pvalue_ph, -ph_refs)
) %>%
  dplyr::select(pheno, gene, locus_symbol, description, logFC, neglogP,
                location, gwas_pvalue, refs) %>%
  arrange(pheno, -neglogP)

write.csv(out_csv,
  file.path(paths$intermediate, "inv4m_DEGs_by_phenotype_association.csv"),
  row.names = FALSE
)
cat("Wrote:", file.path(paths$intermediate,
    "inv4m_DEGs_by_phenotype_association.csv"), "\n")
## Wrote: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/intermediate/inv4m_DEGs_by_phenotype_association.csv

LaTeX Table

# Format for LaTeX
format_pval <- function(x) {
  ifelse(is.na(x), "", formatC(x, format = "e", digits = 2))
}

tex_lines <- c(
  "\\begin{table*}[!ht]",
  "\\caption{\\textbf{Effect of \\invfour on Flowering Time and Plant Height Gene",
  "Candidates}}",
  "\\label{tab:FT_PH_candidates}",
  "\\resizebox{\\textwidth}{!}{%",
  "\\begin{tabular}{llllllll}",
  "\\toprule",
  "  \\textbf{Gene} &",
  "  \\textbf{Label} &",
  "  \\textbf{Description} &",
  "  \\textbf{\\begin{tabular}[c]{@{}l@{}}$\\bf{log_2}$\\\\(FC)\\end{tabular}} &",
  "  \\textbf{\\begin{tabular}[c]{@{}l@{}}$\\bf{-log_{10}}$\\\\",
  "(\\textit{FDR})\\end{tabular}} &",
  "  \\textbf{Location} &",
  "  \\textbf{\\begin{tabular}[c]{@{}l@{}} GWAS \\\\",
  "\\textit{p-value}\\end{tabular}} &",
  "  \\textbf{ref} \\\\ \\midrule"
)

# Helper: format a single gene row
format_row <- function(gene, symbol, desc, logfc, nlogp, loc, pval, refs_str) {
  # Escape LaTeX special characters in description
  desc_clean <- gsub("&", "\\\\&", desc)
  desc_clean <- gsub("%", "\\\\%", desc_clean)
  desc_clean <- gsub("_", "\\\\_", desc_clean)

  # Format location: bold Inv4m
  loc_fmt <- if (loc == "Inv4m") "\\textbf{\\invfour}" else loc

  # Format GWAS p-value
  pval_fmt <- format_pval(pval)

  # Build citation string — refs_str already contains proper cite keys
  cite_fmt <- ""
  if (!is.na(refs_str) && refs_str != "") {
    cite_keys <- trimws(unlist(strsplit(refs_str, ",")))
    cite_fmt <- paste0("\\cite{", paste(unique(cite_keys), collapse = ", "), "}")
  }

  # Format symbol
  sym_fmt <- if (is.na(symbol) || symbol == "") "" else paste0("\\textit{", symbol, "}")

  paste0(
    "  \\textit{", gene, "} &\n",
    "  ", sym_fmt, " &\n",
    "  ", desc_clean, " &\n",
    "  ", sprintf("%.2f", logfc), " &\n",
    "  ", sprintf("%.2f", nlogp), " &\n",
    "  ", loc_fmt, " &\n",
    "  ", pval_fmt, " &\n",
    "  ", cite_fmt, " \\\\"
  )
}

# FT section
tex_lines <- c(tex_lines,
  "\\multicolumn{8}{l}{\\textbf{Flowering Time}} \\\\ \\midrule")

for (i in seq_len(nrow(FT_table))) {
  r <- FT_table[i, ]
  tex_lines <- c(tex_lines, format_row(
    r$gene, r$locus_symbol, r$description,
    r$logFC, r$neglogP, r$location,
    r$gwas_pvalue_ft, r$ft_refs
  ))
}

# PH section
tex_lines <- c(tex_lines,
  " \\midrule",
  "\\multicolumn{8}{l}{\\textbf{Plant Height}} \\\\ \\midrule")

for (i in seq_len(nrow(PH_table))) {
  r <- PH_table[i, ]
  tex_lines <- c(tex_lines, format_row(
    r$gene, r$locus_symbol, r$description,
    r$logFC, r$neglogP, r$location,
    r$gwas_pvalue_ph, r$ph_refs
  ))
}

tex_lines <- c(tex_lines,
  " \\bottomrule",
  "\\end{tabular}%",
  "}",
  "\\end{table*}"
)

writeLines(tex_lines,
  file.path(paths$tables, "FT_PH_candidates.tex"))
cat("Wrote:", file.path(paths$tables, "FT_PH_candidates.tex"), "\n")
## Wrote: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/tables/FT_PH_candidates.tex

Session Info

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] xtable_1.8-4         rtracklayer_1.70.1   GenomicRanges_1.62.1
##  [4] Seqinfo_1.0.0        IRanges_2.44.0       S4Vectors_0.48.0    
##  [7] BiocGenerics_0.56.0  generics_0.1.4       lubridate_1.9.4     
## [10] forcats_1.0.1        stringr_1.6.0        dplyr_1.1.4         
## [13] purrr_1.2.0          readr_2.1.6          tidyr_1.3.2         
## [16] tibble_3.3.0         ggplot2_4.0.1        tidyverse_2.0.0     
## [19] here_1.0.2          
## 
## loaded via a namespace (and not attached):
##  [1] SummarizedExperiment_1.40.0 gtable_0.3.6               
##  [3] rjson_0.2.23                xfun_0.55                  
##  [5] bslib_0.9.0                 lattice_0.22-7             
##  [7] Biobase_2.70.0              tzdb_0.5.0                 
##  [9] vctrs_0.6.5                 tools_4.5.2                
## [11] bitops_1.0-9                curl_7.0.0                 
## [13] parallel_4.5.2              pkgconfig_2.0.3            
## [15] Matrix_1.7-4                RColorBrewer_1.1-3         
## [17] cigarillo_1.0.0             S7_0.2.1                   
## [19] lifecycle_1.0.4             compiler_4.5.2             
## [21] farver_2.1.2                Rsamtools_2.26.0           
## [23] Biostrings_2.78.0           codetools_0.2-20           
## [25] htmltools_0.5.9             sass_0.4.10                
## [27] RCurl_1.98-1.17             yaml_2.3.12                
## [29] pillar_1.11.1               crayon_1.5.3               
## [31] jquerylib_0.1.4             BiocParallel_1.44.0        
## [33] DelayedArray_0.36.0         cachem_1.1.0               
## [35] abind_1.4-8                 tidyselect_1.2.1           
## [37] digest_0.6.39               stringi_1.8.7              
## [39] restfulr_0.0.16             rprojroot_2.1.1            
## [41] fastmap_1.2.0               grid_4.5.2                 
## [43] SparseArray_1.10.8          cli_3.6.5                  
## [45] magrittr_2.0.4              S4Arrays_1.10.1            
## [47] dichromat_2.0-0.1           XML_3.99-0.20              
## [49] withr_3.0.2                 scales_1.4.0               
## [51] timechange_0.3.0            rmarkdown_2.30             
## [53] XVector_0.50.0              httr_1.4.7                 
## [55] matrixStats_1.5.0           otel_0.2.0                 
## [57] hms_1.1.4                   evaluate_1.0.5             
## [59] knitr_1.51                  BiocIO_1.20.0              
## [61] rlang_1.1.6                 glue_1.8.0                 
## [63] jsonlite_2.0.0              R6_2.6.1                   
## [65] MatrixGenerics_1.22.0       GenomicAlignments_1.46.0