Purpose: Identify Inv4m DEGs that overlap published
flowering time (FT) and plant height (PH) candidate gene lists. Produces
Table 2 (tab:FT_PH_candidates) in the manuscript.
Methods reference: Lines 1406–1432 of
main.tex
Sources for FT candidates:
Sources for PH candidates:
library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
library(xtable)
# Thresholds
FDR_THRESHOLD <- 0.05
LOGFC_THRESHOLD <- 1.5
FLANK_BP <- 5000 # ±5 kb extension for GWAS SNP–gene overlap
# Inv4m region boundaries (v5 coordinates)
INV4M_START <- 172883675 # Zm00001eb190470 gene start
INV4M_END <- 188132113 # Zm00001eb194800 gene end
# Shared introgression region (broader)
SHARED_START <- 157012149
SHARED_END <- 195900523
# Gene symbols (for multi-alias expansion, e.g. jmj2/jmj6/jmj9)
gene_symbols <- read.delim(
file.path(paths$data, "gene_symbol.tab"),
stringsAsFactors = FALSE
)
# Master labels (consolidated: curated > symbol > llm_proposed)
master_labels <- read.csv(
file.path(paths$data, "locus_labels_master.csv"),
stringsAsFactors = FALSE
) %>%
dplyr::select(gene_model = 1, master_label = 4) %>%
dplyr::filter(!is.na(master_label) & master_label != "")
# PANNZER functional descriptions — keep first per gene
gene_pannzer <- read.delim(
file.path(paths$data, "PANNZER_DESC.tab"),
stringsAsFactors = FALSE
) %>%
group_by(gene_model) %>%
dplyr::slice(1) %>%
ungroup()
# v3 → v5 cross-reference (no header; one-to-many comma-separated)
v3_v5_raw <- read.delim(
file.path(paths$data, "B73v3_to_B73v5.tsv"),
header = FALSE,
col.names = c("v3", "v5_list"),
stringsAsFactors = FALSE
)
v3_v5 <- v3_v5_raw %>%
separate_rows(v5_list, sep = ",") %>%
dplyr::rename(v5 = v5_list) %>%
dplyr::filter(v5 != "")
# v4 → v5 cross-reference
v4_v5_raw <- read.delim(
file.path(paths$data, "B73v4_to_B73v5.tsv"),
header = FALSE,
col.names = c("v4", "v5_list"),
stringsAsFactors = FALSE
)
v4_v5 <- v4_v5_raw %>%
separate_rows(v5_list, sep = ",") %>%
dplyr::rename(v5 = v5_list) %>%
dplyr::filter(v5 != "")
cat("v3→v5 mappings:", nrow(v3_v5), "\n")
## v3→v5 mappings: 36147
cat("v4→v5 mappings:", nrow(v4_v5), "\n")
## v4→v5 mappings: 38564
# v5 GFF — genes on chr 1-10 only
v5_gff <- import(file.path(paths$data,
"Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"))
v5_genes <- v5_gff[v5_gff$type == "gene" &
seqnames(v5_gff) %in% as.character(1:10)]
v5_genes$gene_id <- sub("^gene:", "", v5_genes$ID)
names(v5_genes) <- v5_genes$gene_id
# Extended ±5 kb version for SNP overlap
v5_genes_ext <- v5_genes
start(v5_genes_ext) <- pmax(1, start(v5_genes_ext) - FLANK_BP)
end(v5_genes_ext) <- end(v5_genes_ext) + FLANK_BP
cat("v5 genes loaded:", length(v5_genes), "\n")
## v5 genes loaded: 43459
# v4 GFF — genes on chr 1-10 only
v4_gff <- import(file.path(paths$data,
"Zm-B73-REFERENCE-GRAMENE-4.0_Zm00001d.2.gff3"))
v4_genes <- v4_gff[v4_gff$type == "gene" &
seqnames(v4_gff) %in% as.character(1:10)]
v4_genes$gene_id <- sub("^gene:", "", v4_genes$ID)
names(v4_genes) <- v4_genes$gene_id
# Extended ±5 kb version for GWAS Atlas SNP overlap
v4_genes_ext <- v4_genes
start(v4_genes_ext) <- pmax(1, start(v4_genes_ext) - FLANK_BP)
end(v4_genes_ext) <- end(v4_genes_ext) + FLANK_BP
cat("v4 genes loaded:", length(v4_genes), "\n")
## v4 genes loaded: 39005
effects_all <- read.csv(
file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv"),
stringsAsFactors = FALSE
)
# Filter to Inv4m predictor, keep best p-value per gene
effects <- effects_all %>%
dplyr::filter(predictor == "Inv4m") %>%
mutate(
is_significant = adj.P.Val < FDR_THRESHOLD,
is_topDEG = is_significant & abs(logFC) > LOGFC_THRESHOLD
) %>%
group_by(gene) %>%
arrange(adj.P.Val) %>%
dplyr::slice(1) %>%
ungroup()
cat("Total Inv4m-tested genes:", nrow(effects), "\n")
## Total Inv4m-tested genes: 24011
cat("Significant DEGs (FDR < 0.05):", sum(effects$is_significant), "\n")
## Significant DEGs (FDR < 0.05): 465
cat("Top DEGs (|logFC| >", LOGFC_THRESHOLD, "):",
sum(effects$is_topDEG), "\n")
## Top DEGs (|logFC| > 1.5 ): 155
liwang <- read.csv(
file.path(paths$data, "floweringTime_genes_liwang.csv"),
stringsAsFactors = FALSE
)
# Convert v3 IDs to v5
wang_v5 <- liwang %>%
dplyr::select(v3 = gene) %>%
inner_join(v3_v5, by = "v3", relationship = "many-to-many") %>%
pull(v5) %>%
unique()
cat("Wang 2021: ", nrow(liwang), "v3 genes →", length(wang_v5), "v5 genes\n")
## Wang 2021: 904 v3 genes → 991 v5 genes
li2022_ft_v4 <- readLines(file.path(paths$data, "li2022_FT.list"))
li2022_ft_v4 <- li2022_ft_v4[li2022_ft_v4 != ""]
li2022_ft_v5 <- v4_v5 %>%
dplyr::filter(v4 %in% li2022_ft_v4) %>%
pull(v5) %>%
unique()
cat("Li 2022 FT:", length(li2022_ft_v4), "v4 genes →",
length(li2022_ft_v5), "v5 genes\n")
## Li 2022 FT: 416 v4 genes → 62 v5 genes
atlas <- read.delim(
file.path(paths$data, "gwas_association_result_for_maize.txt"),
stringsAsFactors = FALSE,
comment.char = "",
check.names = FALSE
)
# Standardise column names
colnames(atlas)[colnames(atlas) == "#GaP_id"] <- "idx"
colnames(atlas)[colnames(atlas) == "Trait accession"] <- "TOID"
colnames(atlas)[colnames(atlas) == "P-value"] <- "p.value"
colnames(atlas)[colnames(atlas) == "PMID"] <- "PUBMID"
colnames(atlas)[colnames(atlas) == "First author"] <- "first_author"
# FT associations
atlas_ft <- atlas %>% dplyr::filter(TOID == "PPTO:0000155")
# Build GRanges for FT SNPs (v4 coordinates)
atlas_ft_gr <- with(atlas_ft,
GRanges(
seqnames = Chr,
ranges = IRanges(start = Pos, end = Pos),
idx = idx,
p.value = as.numeric(p.value),
PUBMID = PUBMID,
first_author = first_author
)
)
# Overlap with v4 genes (±5 kb)
olap <- findOverlaps(atlas_ft_gr, v4_genes_ext)
atlas_ft_hits <- data.frame(
v4 = v4_genes_ext$gene_id[subjectHits(olap)],
p.value = atlas_ft_gr$p.value[queryHits(olap)],
stringsAsFactors = FALSE
) %>%
group_by(v4) %>%
arrange(p.value) %>%
dplyr::slice(1) %>% # best p-value per gene
ungroup()
# Convert to v5
atlas_ft_v5 <- atlas_ft_hits %>%
inner_join(v4_v5, by = "v4", relationship = "many-to-many") %>%
group_by(v5) %>%
arrange(p.value) %>%
dplyr::slice(1) %>%
ungroup()
cat("GWAS Atlas FT: ", nrow(atlas_ft), "SNPs →",
nrow(atlas_ft_hits), "v4 genes →",
length(unique(atlas_ft_v5$v5)), "v5 genes\n")
## GWAS Atlas FT: 4392 SNPs → 2051 v4 genes → 2067 v5 genes
plasticity <- read.csv(
file.path(paths$data, "Supplemental_Table_S3.csv"),
skip = 1,
stringsAsFactors = FALSE
)
# FT traits: DTS_slope, DTS_intcp, DTA_slope, DTA_intcp
tibbs_ft_long <- plasticity %>%
dplyr::select(seqnames = Chr, start = bp, Marker,
DTS_slope, DTS_intcp, DTA_slope, DTA_intcp) %>%
tidyr::pivot_longer(
cols = c(DTS_slope, DTS_intcp, DTA_slope, DTA_intcp),
values_to = "P", names_to = "trait"
) %>%
dplyr::filter(!is.na(P))
# Build GRanges (v5 coordinates)
tibbs_ft_gr <- with(tibbs_ft_long,
GRanges(
seqnames = seqnames,
ranges = IRanges(start = start, end = start),
P = P,
Marker = Marker
)
)
# Overlap with v5 genes (±5 kb)
olap <- findOverlaps(tibbs_ft_gr, v5_genes_ext)
tibbs_ft_hits <- data.frame(
gene = v5_genes_ext$gene_id[subjectHits(olap)],
p.value = tibbs_ft_gr$P[queryHits(olap)],
stringsAsFactors = FALSE
) %>%
dplyr::filter(!is.na(gene)) %>%
dplyr::distinct() %>%
group_by(gene) %>%
arrange(p.value) %>%
dplyr::slice(1) %>%
ungroup()
cat("Tibbs-Cortes FT:", nrow(tibbs_ft_long), "SNP-trait pairs →",
length(unique(tibbs_ft_hits$gene)), "v5 genes\n")
## Tibbs-Cortes FT: 45031 SNP-trait pairs → 1137 v5 genes
FT_candidates <- sort(unique(c(
wang_v5,
li2022_ft_v5,
atlas_ft_v5$v5,
tibbs_ft_hits$gene
)))
cat("Nonredundant FT candidates:", length(FT_candidates),
"(expected ~2224)\n")
## Nonredundant FT candidates: 4007 (expected ~2224)
# Hardcoded v4 IDs from original analysis
li2022_ph_v4 <- c(
"Zm00001d010396", "Zm00001d011069", "Zm00001d029657", "Zm00001d029656",
"Zm00001d029658", "Zm00001d029659", "Zm00001d029758", "Zm00001d029884",
"Zm00001d032747", "Zm00001d032776", "Zm00001d032788", "Zm00001d032790",
"Zm00001d032791", "Zm00001d033204", "Zm00001d033206", "Zm00001d033205",
"Zm00001d033223", "Zm00001d033224", "Zm00001d033225", "Zm00001d033226",
"Zm00001d033228", "Zm00001d033227"
)
li2022_ph_v5 <- v4_v5 %>%
dplyr::filter(v4 %in% li2022_ph_v4) %>%
pull(v5) %>%
unique()
cat("Li 2022 PH:", length(li2022_ph_v4), "v4 genes →",
length(li2022_ph_v5), "v5 genes\n")
## Li 2022 PH: 22 v4 genes → 23 v5 genes
atlas_ph <- atlas %>% dplyr::filter(TOID == "PPTO:0000126")
atlas_ph_gr <- with(atlas_ph,
GRanges(
seqnames = Chr,
ranges = IRanges(start = Pos, end = Pos),
idx = idx,
p.value = as.numeric(p.value),
PUBMID = PUBMID,
first_author = first_author
)
)
olap <- findOverlaps(atlas_ph_gr, v4_genes_ext)
atlas_ph_hits <- data.frame(
v4 = v4_genes_ext$gene_id[subjectHits(olap)],
p.value = atlas_ph_gr$p.value[queryHits(olap)],
stringsAsFactors = FALSE
) %>%
group_by(v4) %>%
arrange(p.value) %>%
dplyr::slice(1) %>%
ungroup()
atlas_ph_v5 <- atlas_ph_hits %>%
inner_join(v4_v5, by = "v4", relationship = "many-to-many") %>%
group_by(v5) %>%
arrange(p.value) %>%
dplyr::slice(1) %>%
ungroup()
cat("GWAS Atlas PH:", nrow(atlas_ph), "SNPs →",
nrow(atlas_ph_hits), "v4 genes →",
length(unique(atlas_ph_v5$v5)), "v5 genes\n")
## GWAS Atlas PH: 3908 SNPs → 2884 v4 genes → 2919 v5 genes
tibbs_ph_long <- plasticity %>%
dplyr::select(seqnames = Chr, start = bp, Marker,
PH_slope, PH_intcp) %>%
tidyr::pivot_longer(
cols = c(PH_slope, PH_intcp),
values_to = "P", names_to = "trait"
) %>%
dplyr::filter(!is.na(P))
tibbs_ph_gr <- with(tibbs_ph_long,
GRanges(
seqnames = seqnames,
ranges = IRanges(start = start, end = start),
P = P,
Marker = Marker
)
)
olap <- findOverlaps(tibbs_ph_gr, v5_genes_ext)
tibbs_ph_hits <- data.frame(
gene = v5_genes_ext$gene_id[subjectHits(olap)],
p.value = tibbs_ph_gr$P[queryHits(olap)],
stringsAsFactors = FALSE
) %>%
dplyr::filter(!is.na(gene)) %>%
dplyr::distinct() %>%
group_by(gene) %>%
arrange(p.value) %>%
dplyr::slice(1) %>%
ungroup()
cat("Tibbs-Cortes PH:", nrow(tibbs_ph_long), "SNP-trait pairs →",
length(unique(tibbs_ph_hits$gene)), "v5 genes\n")
## Tibbs-Cortes PH: 646 SNP-trait pairs → 131 v5 genes
PH_candidates <- sort(unique(c(
li2022_ph_v5,
atlas_ph_v5$v5,
tibbs_ph_hits$gene
)))
cat("Nonredundant PH candidates:", length(PH_candidates),
"(expected ~1272)\n")
## Nonredundant PH candidates: 3038 (expected ~1272)
# Flag candidate status in effects table
effects <- effects %>%
mutate(
is_FT_candidate = gene %in% FT_candidates,
is_PH_candidate = gene %in% PH_candidates
)
cat("Inv4m DEGs that are FT candidates:",
sum(effects$is_significant & effects$is_FT_candidate), "\n")
## Inv4m DEGs that are FT candidates: 41
cat("Inv4m top DEGs that are FT candidates:",
sum(effects$is_topDEG & effects$is_FT_candidate), "\n")
## Inv4m top DEGs that are FT candidates: 12
cat("Inv4m DEGs that are PH candidates:",
sum(effects$is_significant & effects$is_PH_candidate), "\n")
## Inv4m DEGs that are PH candidates: 48
cat("Inv4m top DEGs that are PH candidates:",
sum(effects$is_topDEG & effects$is_PH_candidate), "\n")
## Inv4m top DEGs that are PH candidates: 17
# Collect best GWAS p-value per gene across all sources (FT)
gwas_pval_ft <- bind_rows(
atlas_ft_v5 %>% dplyr::select(gene = v5, p.value),
tibbs_ft_hits %>% dplyr::select(gene, p.value)
) %>%
group_by(gene) %>%
arrange(p.value) %>%
dplyr::slice(1) %>%
ungroup() %>%
dplyr::rename(gwas_pvalue_ft = p.value)
# Collect best GWAS p-value per gene across all sources (PH)
gwas_pval_ph <- bind_rows(
atlas_ph_v5 %>% dplyr::select(gene = v5, p.value),
tibbs_ph_hits %>% dplyr::select(gene, p.value)
) %>%
group_by(gene) %>%
arrange(p.value) %>%
dplyr::slice(1) %>%
ungroup() %>%
dplyr::rename(gwas_pvalue_ph = p.value)
# Map GWAS Atlas PMIDs to BibTeX citation keys
pmid_to_cite <- c(
"24514905" = "peiffer2014",
"27012534" = "li2016",
"22660547" = "jiao2012",
"23840585" = "peiffer2013"
)
# Helper: resolve PMIDs to cite keys, adding liu2023 (the GWAS Atlas database)
resolve_atlas_cites <- function(pmids) {
keys <- unique(na.omit(pmid_to_cite[as.character(pmids)]))
# Always include liu2023 as the database reference
sort(unique(c(keys, "liu2023")))
}
# --- GWAS Atlas: track actual PMIDs per gene ---
# FT atlas refs with PMIDs
atlas_ft_with_pmid <- data.frame(
v4 = v4_genes_ext$gene_id[subjectHits(findOverlaps(atlas_ft_gr, v4_genes_ext))],
PUBMID = atlas_ft_gr$PUBMID[queryHits(findOverlaps(atlas_ft_gr, v4_genes_ext))],
stringsAsFactors = FALSE
) %>%
inner_join(v4_v5, by = "v4", relationship = "many-to-many") %>%
dplyr::select(gene = v5, PUBMID) %>%
distinct()
# PH atlas refs with PMIDs
atlas_ph_with_pmid <- data.frame(
v4 = v4_genes_ext$gene_id[subjectHits(findOverlaps(atlas_ph_gr, v4_genes_ext))],
PUBMID = atlas_ph_gr$PUBMID[queryHits(findOverlaps(atlas_ph_gr, v4_genes_ext))],
stringsAsFactors = FALSE
) %>%
inner_join(v4_v5, by = "v4", relationship = "many-to-many") %>%
dplyr::select(gene = v5, PUBMID) %>%
distinct()
# Build per-gene citation lists for FT
ft_cite_list <- list()
# wang2021
for (g in wang_v5) ft_cite_list[[g]] <- c(ft_cite_list[[g]], "wang2021")
# li2022
for (g in li2022_ft_v5) ft_cite_list[[g]] <- c(ft_cite_list[[g]], "li2022")
# GWAS Atlas (with PMID resolution)
for (i in seq_len(nrow(atlas_ft_with_pmid))) {
g <- atlas_ft_with_pmid$gene[i]
cites <- resolve_atlas_cites(atlas_ft_with_pmid$PUBMID[i])
ft_cite_list[[g]] <- c(ft_cite_list[[g]], cites)
}
# tibbs-cortes
for (g in tibbs_ft_hits$gene) ft_cite_list[[g]] <- c(ft_cite_list[[g]], "tibbs-cortes2024")
ft_refs <- data.frame(
gene = names(ft_cite_list),
ft_refs = sapply(ft_cite_list, function(x) paste(sort(unique(x)), collapse = ", ")),
stringsAsFactors = FALSE,
row.names = NULL
)
# Build per-gene citation lists for PH
ph_cite_list <- list()
# li2022
for (g in li2022_ph_v5) ph_cite_list[[g]] <- c(ph_cite_list[[g]], "li2022")
# GWAS Atlas (with PMID resolution)
for (i in seq_len(nrow(atlas_ph_with_pmid))) {
g <- atlas_ph_with_pmid$gene[i]
cites <- resolve_atlas_cites(atlas_ph_with_pmid$PUBMID[i])
ph_cite_list[[g]] <- c(ph_cite_list[[g]], cites)
}
# tibbs-cortes
for (g in tibbs_ph_hits$gene) ph_cite_list[[g]] <- c(ph_cite_list[[g]], "tibbs-cortes2024")
ph_refs <- data.frame(
gene = names(ph_cite_list),
ph_refs = sapply(ph_cite_list, function(x) paste(sort(unique(x)), collapse = ", ")),
stringsAsFactors = FALSE,
row.names = NULL
)
# Location label helper
location_label <- function(in_inv4m, in_flank) {
case_when(
in_inv4m ~ "Inv4m",
in_flank ~ "flanking",
TRUE ~ "outside"
)
}
# Genes with multiple aliases in gene_symbol.tab (for row expansion, e.g. jmj2/jmj6/jmj9)
gene_symbol_aliases <- gene_symbols %>%
dplyr::filter(!is.na(locus_symbol) & locus_symbol != "")
# Identify multi-alias genes (genes with >1 alias in gene_symbol.tab)
multi_alias_genes <- gene_symbol_aliases %>%
group_by(gene_model) %>%
dplyr::filter(n() > 1) %>%
ungroup()
# Annotate and expand: use desc (PANNZER) as primary description,
# use locus_labels_master.csv for shorthand labels,
# expand only multi-alias genes (e.g. jmj2/jmj6/jmj9) from gene_symbol.tab
annotate_and_expand <- function(df) {
# Supplement description: prefer desc (PANNZER functional) over desc_merged
df <- df %>%
mutate(
description = ifelse(!is.na(desc) & desc != "", desc, desc_merged),
description = ifelse(is.na(description) | description == "",
"Uncharacterized protein", description)
)
# Expand multi-alias genes (creates multiple rows)
genes_with_multi_aliases <- df %>%
dplyr::filter(gene %in% multi_alias_genes$gene_model) %>%
inner_join(
multi_alias_genes %>% dplyr::select(gene_model, alias = locus_symbol),
by = c("gene" = "gene_model"),
relationship = "many-to-many"
) %>%
mutate(locus_symbol = alias) %>%
dplyr::select(-alias)
# All other genes: use master label
other_genes <- df %>%
dplyr::filter(!(gene %in% multi_alias_genes$gene_model)) %>%
left_join(master_labels, by = c("gene" = "gene_model")) %>%
mutate(locus_symbol = ifelse(
!is.na(master_label) & master_label != "",
master_label,
ifelse(is.na(locus_symbol) | locus_symbol == "", "", locus_symbol)
)) %>%
dplyr::select(-master_label)
bind_rows(genes_with_multi_aliases, other_genes)
}
# FT top DEGs table
FT_table <- effects %>%
dplyr::filter(is_topDEG & is_FT_candidate) %>%
left_join(gwas_pval_ft, by = "gene") %>%
left_join(ft_refs, by = "gene") %>%
mutate(location = location_label(in_Inv4m, in_flank)) %>%
dplyr::select(gene, locus_symbol, desc_merged, desc, logFC, neglogP,
location, gwas_pvalue_ft, ft_refs) %>%
annotate_and_expand() %>%
arrange(-neglogP, gene, locus_symbol)
# PH top DEGs table
PH_table <- effects %>%
dplyr::filter(is_topDEG & is_PH_candidate) %>%
left_join(gwas_pval_ph, by = "gene") %>%
left_join(ph_refs, by = "gene") %>%
mutate(location = location_label(in_Inv4m, in_flank)) %>%
dplyr::select(gene, locus_symbol, desc_merged, desc, logFC, neglogP,
location, gwas_pvalue_ph, ph_refs) %>%
annotate_and_expand() %>%
arrange(-neglogP, gene, locus_symbol)
cat("FT top DEGs for table:", nrow(FT_table), "\n")
## FT top DEGs for table: 14
cat("PH top DEGs for table:", nrow(PH_table), "\n")
## PH top DEGs for table: 17
knitr::kable(
FT_table %>% dplyr::select(gene, locus_symbol, description, logFC, neglogP,
location, gwas_pvalue_ft, ft_refs),
digits = 2, caption = "Flowering Time candidate DEGs (|logFC| > 1.5)")
| gene | locus_symbol | description | logFC | neglogP | location | gwas_pvalue_ft | ft_refs |
|---|---|---|---|---|---|---|---|
| Zm00001eb191790 | jmj2 | Lysine-specific demethylase JMJ703 | -3.49 | 21.93 | Inv4m | 0 | tibbs-cortes2024, wang2021 |
| Zm00001eb191790 | jmj6 | Lysine-specific demethylase JMJ703 | -3.49 | 21.93 | Inv4m | 0 | tibbs-cortes2024, wang2021 |
| Zm00001eb191790 | jmj9 | Lysine-specific demethylase JMJ703 | -3.49 | 21.93 | Inv4m | 0 | tibbs-cortes2024, wang2021 |
| Zm00001eb191820 | jmj4 | Putative lysine-specific demethylase JMJ16 | -2.84 | 19.72 | Inv4m | NA | wang2021 |
| Zm00001eb060540 | act1 | Actin-1 | 2.23 | 9.93 | outside | NA | wang2021 |
| Zm00001eb070810 | ychf | Uncharacterized protein | 4.19 | 9.70 | outside | NA | li2016, liu2023 |
| Zm00001eb192350 | pkwd1 | Protein kinase family protein / WD-40 repeat family protein | -2.20 | 8.73 | Inv4m | 0 | liu2023 |
| Zm00001eb190090 | nep1 | Aspartic proteinase nepenthesin-1 | 7.18 | 7.00 | flanking | 0 | liu2023, peiffer2013, peiffer2014 |
| Zm00001eb197370 | abi40 | ABI3VP1-type transcription factor (Fragment) | -2.25 | 5.77 | outside | 0 | liu2023, peiffer2013, peiffer2014, wang2021 |
| Zm00001eb428740 | ocl2 | Homeobox-leucine zipper protein ROC5 | -2.66 | 5.46 | outside | NA | wang2021 |
| Zm00001eb186670 | Uncharacterized protein | -2.83 | 5.41 | flanking | 0 | jiao2012, li2016, liu2023 | |
| Zm00001eb157030 | Uncharacterized protein | 5.68 | 4.84 | outside | NA | li2016, liu2023 | |
| Zm00001eb191000 | 4cll5 | 4-coumarate–CoA ligase-like 5 | -3.35 | 2.76 | Inv4m | NA | wang2021 |
| Zm00001eb187450 | mybr58 | HTH myb-type domain-containing protein | 1.74 | 2.51 | flanking | NA | wang2021 |
knitr::kable(
PH_table %>% dplyr::select(gene, locus_symbol, description, logFC, neglogP,
location, gwas_pvalue_ph, ph_refs),
digits = 2, caption = "Plant Height candidate DEGs (|logFC| > 1.5)")
| gene | locus_symbol | description | logFC | neglogP | location | gwas_pvalue_ph | ph_refs |
|---|---|---|---|---|---|---|---|
| Zm00001eb194380 | sec6 | Exocyst complex component Sec6 | 2.87 | 16.53 | Inv4m | 0 | liu2023, peiffer2014 |
| Zm00001eb190240 | Uncharacterized protein | -9.16 | 14.16 | flanking | 0 | liu2023 | |
| Zm00001eb189920 | aldh2 | Aldehyde dehydrogenase2 | 2.70 | 13.58 | flanking | 0 | liu2023, peiffer2014 |
| Zm00001eb190350 | ank1 | Ankyrin repeat-containing protein | -1.83 | 12.96 | flanking | 0 | liu2023 |
| Zm00001eb196600 | r3h1 | R3H-assoc domain-containing protein | -2.25 | 12.49 | flanking | 0 | liu2023, peiffer2014 |
| Zm00001eb195790 | traf1 | TRAF-type domain-containing protein | 3.15 | 8.47 | flanking | 0 | liu2023, peiffer2014 |
| Zm00001eb123230 | tip4c | Tonoplast intrinsic protein 41 | 2.56 | 8.25 | outside | NA | liu2023 |
| Zm00001eb195970 | mpv17 | Mpv17/PMP22 family protein, expressed | -2.41 | 7.75 | flanking | 0 | liu2023, peiffer2014 |
| Zm00001eb188570 | his2b5 | histone 2B5 | -5.61 | 7.65 | flanking | NA | liu2023 |
| Zm00001eb196920 | Uncharacterized protein | -2.81 | 7.37 | flanking | 0 | liu2023, peiffer2014 | |
| Zm00001eb192120 | enth1 | ENTH domain-containing protein | -2.41 | 5.70 | Inv4m | 0 | liu2023, peiffer2014 |
| Zm00001eb186670 | Uncharacterized protein | -2.83 | 5.41 | flanking | 0 | liu2023, peiffer2014 | |
| Zm00001eb195080 | imk3b | probable leucine-rich repeat receptor-like protein kinase IMK3 | -4.68 | 5.34 | flanking | 0 | liu2023, peiffer2014 |
| Zm00001eb332450 | roc4-2 | Peptidyl-prolyl cis-trans isomerase | -2.92 | 4.77 | outside | 0 | liu2023, peiffer2014 |
| Zm00001eb190380 | rie1 | E3 ubiquitin protein ligase RIE1 | 2.15 | 2.06 | flanking | 0 | liu2023 |
| Zm00001eb197300 | stk1 | Non-specific serine/threonine protein kinase | -2.62 | 2.00 | flanking | 0 | liu2023, peiffer2014 |
| Zm00001eb194020 | phi1 | Phi-1-like phosphate-induced protein | -2.39 | 1.86 | Inv4m | 0 | liu2023, peiffer2014 |
cat("=== Flowering Time Enrichment ===\n\n")
## === Flowering Time Enrichment ===
# Top DEGs vs FT candidates
ft_top <- table(effects$is_topDEG, effects$is_FT_candidate)
dimnames(ft_top) <- list(TopDEG = c("No", "Yes"),
FT_candidate = c("No", "Yes"))
cat("Top DEGs (|logFC| >", LOGFC_THRESHOLD, ") × FT candidates:\n")
## Top DEGs (|logFC| > 1.5 ) × FT candidates:
print(ft_top)
## FT_candidate
## TopDEG No Yes
## No 21130 2726
## Yes 143 12
cat("\nFisher exact test (two-sided):\n")
##
## Fisher exact test (two-sided):
print(fisher.test(ft_top))
##
## Fisher's Exact Test for Count Data
##
## data: ft_top
## p-value = 0.1641
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.3279357 1.1739395
## sample estimates:
## odds ratio
## 0.6504676
cat("\nFisher exact test (greater):\n")
##
## Fisher exact test (greater):
print(fisher.test(ft_top, alternative = "greater"))
##
## Fisher's Exact Test for Count Data
##
## data: ft_top
## p-value = 0.9478
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0.3671634 Inf
## sample estimates:
## odds ratio
## 0.6504676
# All significant DEGs vs FT candidates
ft_sig <- table(effects$is_significant, effects$is_FT_candidate)
dimnames(ft_sig) <- list(DEG = c("No", "Yes"),
FT_candidate = c("No", "Yes"))
cat("\nAll significant DEGs × FT candidates:\n")
##
## All significant DEGs × FT candidates:
print(ft_sig)
## FT_candidate
## DEG No Yes
## No 20849 2697
## Yes 424 41
cat("\nFisher exact test (two-sided):\n")
##
## Fisher exact test (two-sided):
print(fisher.test(ft_sig))
##
## Fisher's Exact Test for Count Data
##
## data: ft_sig
## p-value = 0.07717
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.5273949 1.0343878
## sample estimates:
## odds ratio
## 0.7475275
cat("\nFisher exact test (greater):\n")
##
## Fisher exact test (greater):
print(fisher.test(ft_sig, alternative = "greater"))
##
## Fisher's Exact Test for Count Data
##
## data: ft_sig
## p-value = 0.971
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0.5580931 Inf
## sample estimates:
## odds ratio
## 0.7475275
cat("=== Plant Height Enrichment ===\n\n")
## === Plant Height Enrichment ===
ph_top <- table(effects$is_topDEG, effects$is_PH_candidate)
dimnames(ph_top) <- list(TopDEG = c("No", "Yes"),
PH_candidate = c("No", "Yes"))
cat("Top DEGs (|logFC| >", LOGFC_THRESHOLD, ") × PH candidates:\n")
## Top DEGs (|logFC| > 1.5 ) × PH candidates:
print(ph_top)
## PH_candidate
## TopDEG No Yes
## No 21666 2190
## Yes 138 17
cat("\nFisher exact test (two-sided):\n")
##
## Fisher exact test (two-sided):
print(fisher.test(ph_top))
##
## Fisher's Exact Test for Count Data
##
## data: ph_top
## p-value = 0.4041
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.688717 2.027985
## sample estimates:
## odds ratio
## 1.21871
ph_sig <- table(effects$is_significant, effects$is_PH_candidate)
dimnames(ph_sig) <- list(DEG = c("No", "Yes"),
PH_candidate = c("No", "Yes"))
cat("\nAll significant DEGs × PH candidates:\n")
##
## All significant DEGs × PH candidates:
print(ph_sig)
## PH_candidate
## DEG No Yes
## No 21387 2159
## Yes 417 48
cat("\nFisher exact test (two-sided):\n")
##
## Fisher exact test (two-sided):
print(fisher.test(ph_sig))
##
## Fisher's Exact Test for Count Data
##
## data: ph_sig
## p-value = 0.3732
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.8247723 1.5455306
## sample estimates:
## odds ratio
## 1.140239
out_csv <- bind_rows(
FT_table %>% mutate(
pheno = "Flowering Time",
gwas_pvalue = gwas_pvalue_ft,
refs = ft_refs
) %>% dplyr::select(-gwas_pvalue_ft, -ft_refs),
PH_table %>% mutate(
pheno = "Plant Height",
gwas_pvalue = gwas_pvalue_ph,
refs = ph_refs
) %>% dplyr::select(-gwas_pvalue_ph, -ph_refs)
) %>%
dplyr::select(pheno, gene, locus_symbol, description, logFC, neglogP,
location, gwas_pvalue, refs) %>%
arrange(pheno, -neglogP)
write.csv(out_csv,
file.path(paths$intermediate, "inv4m_DEGs_by_phenotype_association.csv"),
row.names = FALSE
)
cat("Wrote:", file.path(paths$intermediate,
"inv4m_DEGs_by_phenotype_association.csv"), "\n")
## Wrote: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/intermediate/inv4m_DEGs_by_phenotype_association.csv
# Format for LaTeX
format_pval <- function(x) {
ifelse(is.na(x), "", formatC(x, format = "e", digits = 2))
}
tex_lines <- c(
"\\begin{table*}[!ht]",
"\\caption{\\textbf{Effect of \\invfour on Flowering Time and Plant Height Gene",
"Candidates}}",
"\\label{tab:FT_PH_candidates}",
"\\resizebox{\\textwidth}{!}{%",
"\\begin{tabular}{llllllll}",
"\\toprule",
" \\textbf{Gene} &",
" \\textbf{Label} &",
" \\textbf{Description} &",
" \\textbf{\\begin{tabular}[c]{@{}l@{}}$\\bf{log_2}$\\\\(FC)\\end{tabular}} &",
" \\textbf{\\begin{tabular}[c]{@{}l@{}}$\\bf{-log_{10}}$\\\\",
"(\\textit{FDR})\\end{tabular}} &",
" \\textbf{Location} &",
" \\textbf{\\begin{tabular}[c]{@{}l@{}} GWAS \\\\",
"\\textit{p-value}\\end{tabular}} &",
" \\textbf{ref} \\\\ \\midrule"
)
# Helper: format a single gene row
format_row <- function(gene, symbol, desc, logfc, nlogp, loc, pval, refs_str) {
# Escape LaTeX special characters in description
desc_clean <- gsub("&", "\\\\&", desc)
desc_clean <- gsub("%", "\\\\%", desc_clean)
desc_clean <- gsub("_", "\\\\_", desc_clean)
# Format location: bold Inv4m
loc_fmt <- if (loc == "Inv4m") "\\textbf{\\invfour}" else loc
# Format GWAS p-value
pval_fmt <- format_pval(pval)
# Build citation string — refs_str already contains proper cite keys
cite_fmt <- ""
if (!is.na(refs_str) && refs_str != "") {
cite_keys <- trimws(unlist(strsplit(refs_str, ",")))
cite_fmt <- paste0("\\cite{", paste(unique(cite_keys), collapse = ", "), "}")
}
# Format symbol
sym_fmt <- if (is.na(symbol) || symbol == "") "" else paste0("\\textit{", symbol, "}")
paste0(
" \\textit{", gene, "} &\n",
" ", sym_fmt, " &\n",
" ", desc_clean, " &\n",
" ", sprintf("%.2f", logfc), " &\n",
" ", sprintf("%.2f", nlogp), " &\n",
" ", loc_fmt, " &\n",
" ", pval_fmt, " &\n",
" ", cite_fmt, " \\\\"
)
}
# FT section
tex_lines <- c(tex_lines,
"\\multicolumn{8}{l}{\\textbf{Flowering Time}} \\\\ \\midrule")
for (i in seq_len(nrow(FT_table))) {
r <- FT_table[i, ]
tex_lines <- c(tex_lines, format_row(
r$gene, r$locus_symbol, r$description,
r$logFC, r$neglogP, r$location,
r$gwas_pvalue_ft, r$ft_refs
))
}
# PH section
tex_lines <- c(tex_lines,
" \\midrule",
"\\multicolumn{8}{l}{\\textbf{Plant Height}} \\\\ \\midrule")
for (i in seq_len(nrow(PH_table))) {
r <- PH_table[i, ]
tex_lines <- c(tex_lines, format_row(
r$gene, r$locus_symbol, r$description,
r$logFC, r$neglogP, r$location,
r$gwas_pvalue_ph, r$ph_refs
))
}
tex_lines <- c(tex_lines,
" \\bottomrule",
"\\end{tabular}%",
"}",
"\\end{table*}"
)
writeLines(tex_lines,
file.path(paths$tables, "FT_PH_candidates.tex"))
cat("Wrote:", file.path(paths$tables, "FT_PH_candidates.tex"), "\n")
## Wrote: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/tables/FT_PH_candidates.tex
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] xtable_1.8-4 rtracklayer_1.70.1 GenomicRanges_1.62.1
## [4] Seqinfo_1.0.0 IRanges_2.44.0 S4Vectors_0.48.0
## [7] BiocGenerics_0.56.0 generics_0.1.4 lubridate_1.9.4
## [10] forcats_1.0.1 stringr_1.6.0 dplyr_1.1.4
## [13] purrr_1.2.0 readr_2.1.6 tidyr_1.3.2
## [16] tibble_3.3.0 ggplot2_4.0.1 tidyverse_2.0.0
## [19] here_1.0.2
##
## loaded via a namespace (and not attached):
## [1] SummarizedExperiment_1.40.0 gtable_0.3.6
## [3] rjson_0.2.23 xfun_0.55
## [5] bslib_0.9.0 lattice_0.22-7
## [7] Biobase_2.70.0 tzdb_0.5.0
## [9] vctrs_0.6.5 tools_4.5.2
## [11] bitops_1.0-9 curl_7.0.0
## [13] parallel_4.5.2 pkgconfig_2.0.3
## [15] Matrix_1.7-4 RColorBrewer_1.1-3
## [17] cigarillo_1.0.0 S7_0.2.1
## [19] lifecycle_1.0.4 compiler_4.5.2
## [21] farver_2.1.2 Rsamtools_2.26.0
## [23] Biostrings_2.78.0 codetools_0.2-20
## [25] htmltools_0.5.9 sass_0.4.10
## [27] RCurl_1.98-1.17 yaml_2.3.12
## [29] pillar_1.11.1 crayon_1.5.3
## [31] jquerylib_0.1.4 BiocParallel_1.44.0
## [33] DelayedArray_0.36.0 cachem_1.1.0
## [35] abind_1.4-8 tidyselect_1.2.1
## [37] digest_0.6.39 stringi_1.8.7
## [39] restfulr_0.0.16 rprojroot_2.1.1
## [41] fastmap_1.2.0 grid_4.5.2
## [43] SparseArray_1.10.8 cli_3.6.5
## [45] magrittr_2.0.4 S4Arrays_1.10.1
## [47] dichromat_2.0-0.1 XML_3.99-0.20
## [49] withr_3.0.2 scales_1.4.0
## [51] timechange_0.3.0 rmarkdown_2.30
## [53] XVector_0.50.0 httr_1.4.7
## [55] matrixStats_1.5.0 otel_0.2.0
## [57] hms_1.1.4 evaluate_1.0.5
## [59] knitr_1.51 BiocIO_1.20.0
## [61] rlang_1.1.6 glue_1.8.0
## [63] jsonlite_2.0.0 R6_2.6.1
## [65] MatrixGenerics_1.22.0 GenomicAlignments_1.46.0