KEGG pathway enrichment on differentially expressed genes across Leaf development and Phosphorus deficiency.
Predictors: - Leaf: Leaf development/senescence (FC threshold: ±0.7) - -P: Phosphorus deficiency (FC threshold: ±2) - Leaf:-P: Interaction term
library(dplyr)
library(ggplot2)
library(clusterProfiler)
library(tidyr)
library(forcats)
# Load gene ID cross-reference
id_map <- read.csv(
file.path(paths$data, "Gene_xRef.csv"),
header = FALSE
)
colnames(id_map) <- c("gene", "v5")
# Load differential expression results
effects_df <- read.csv(file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv"))
cat("Loaded", nrow(id_map), "gene ID mappings\n")
## Loaded 29193 gene ID mappings
# Extract gene lists
redundant_DEGs <- with(effects_df %>% filter(is_strong_DEG), {
list(
Leaf.Down = gene[predictor == "Leaf" & regulation == "Downregulated"],
Leaf.Up = gene[predictor == "Leaf" & regulation == "Upregulated"],
`-P.Down` = gene[predictor == "-P" & regulation == "Downregulated"],
`-P.Up` = gene[predictor == "-P" & regulation == "Upregulated"],
`Leaf:-P.Down` = gene[predictor == "Leaf:-P" & regulation == "Downregulated"],
`Leaf:-P.Up` = gene[predictor == "Leaf:-P" & regulation == "Upregulated"]
)
})
DEGs <- lapply(redundant_DEGs, unique)
n_genes <- sapply(DEGs, length)
print(n_genes)
## Leaf.Down Leaf.Up -P.Down -P.Up Leaf:-P.Down Leaf:-P.Up
## 746 676 212 580 194 361
#' Filter and convert gene IDs to NCBI format
#'
#' @param gene_list Character vector of Zm gene IDs
#' @param id_mapping Data frame with v5 (Zm IDs) and gene (NCBI IDs)
#' @return Character vector of NCBI IDs
filter_valid_kegg_genes <- function(gene_list, id_mapping) {
ncbi_ids <- id_mapping %>%
filter(v5 %in% gene_list) %>%
pull(gene) %>%
unique() %>%
na.omit()
as.character(ncbi_ids)
}
# Convert all DEG lists
DEGs_kegg <- lapply(DEGs, filter_valid_kegg_genes, id_mapping = id_map)
# Conversion summary
conversion_summary <- data.frame(
DEG_set = names(DEGs),
original = sapply(DEGs, length),
with_ncbi = sapply(DEGs_kegg, length),
pct_converted = round(100 * sapply(DEGs_kegg, length) / sapply(DEGs, length), 1)
)
print(conversion_summary)
## DEG_set original with_ncbi pct_converted
## Leaf.Down Leaf.Down 746 666 89.3
## Leaf.Up Leaf.Up 676 552 81.7
## -P.Down -P.Down 212 141 66.5
## -P.Up -P.Up 580 512 88.3
## Leaf:-P.Down Leaf:-P.Down 194 163 84.0
## Leaf:-P.Up Leaf:-P.Up 361 340 94.2
# Run enrichment for each DEG set
compKEGG <- lapply(DEGs_kegg, function(x) {
if (length(x) < 5) return(NULL)
enrichKEGG(
gene = x,
organism = "zma",
pvalueCutoff = 0.05,
keyType = "ncbi-geneid"
)
})
# Extract results
cluster_result <- lapply(names(compKEGG), function(x) {
if (is.null(compKEGG[[x]])) return(NULL)
result <- compKEGG[[x]]@result
if (nrow(result) == 0) return(NULL)
result %>%
filter(p.adjust < 0.05) %>%
mutate(Cluster = x) %>%
select(Cluster, everything())
}) %>%
bind_rows()
cat("Total enriched pathways:", nrow(cluster_result), "\n")
## Total enriched pathways: 54
# Summary by cluster
enrichment_summary <- cluster_result %>%
group_by(Cluster) %>%
summarise(
n_pathways = n(),
mean_count = mean(Count),
min_pvalue = min(p.adjust)
)
print(enrichment_summary)
## # A tibble: 6 × 4
## Cluster n_pathways mean_count min_pvalue
## <chr> <int> <dbl> <dbl>
## 1 -P.Down 9 2.67 0.0227
## 2 -P.Up 8 6.88 0.00000319
## 3 Leaf.Down 11 5.36 0.00189
## 4 Leaf.Up 1 6 0.0218
## 5 Leaf:-P.Down 15 4 0.0000000154
## 6 Leaf:-P.Up 10 5.2 0.000205
cluster_order <- names(DEGs)
# Select top 7 pathways per cluster
sorted_by_p <- cluster_result %>%
mutate(neglogP = -log10(p.adjust)) %>%
mutate(Cluster = factor(Cluster, levels = cluster_order)) %>%
droplevels() %>%
group_by(Cluster) %>%
arrange(Cluster, p.adjust) %>%
slice(1:4) %>%
mutate(label_order = 1:n()) %>%
mutate(Description = fct_reorder(Description, label_order))
top_terms <- levels(sorted_by_p$Description)
cat("Selected", length(top_terms), "pathways\n")
## Selected 17 pathways
# Prepare plotting data
to_plot <- cluster_result %>%
mutate(Cluster = factor(Cluster, levels = cluster_order)) %>%
filter(Description %in% top_terms) %>%
mutate(neglogP = -log10(p.adjust)) %>%
group_by(Cluster) %>%
arrange(Cluster, p.adjust) %>%
mutate(Description = factor(Description, levels = top_terms)) %>%
mutate(label_order = 1:n()) %>%
mutate(Description = fct_reorder(Description, label_order))
# Prepare gene-level data
effects_gene_label <- effects_df %>%
rename(v5 = gene) %>%
left_join(id_map, by = "v5") %>%
mutate(Cluster = case_when(
regulation == "Upregulated" ~ paste(predictor, "Up", sep = "."),
regulation == "Downregulated" ~ paste(predictor, "Down", sep = ".")
)) %>%
select(
Cluster, v5, gene, locus_label, locus_symbol,
locus_name, logFC, P = adj.P.Val, mahalanobis
) %>%
filter(!is.na(locus_label) & !is.na(locus_symbol) & !is.na(Cluster)) %>%
mutate(neglogPG = -log10(P))
cat("Genes with NCBI IDs:", sum(!is.na(effects_gene_label$gene)), "\n")
## Genes with NCBI IDs: 4554
# Filter to Leaf/-P only
cluster_order <- cluster_order[!grepl("Inv4m", cluster_order)]
to_plot <- to_plot %>%
filter(!grepl("Inv4m", Cluster)) %>%
droplevels() %>%
mutate(Cluster = factor(Cluster, levels = cluster_order))
with_label <-to_plot %>%
separate_longer_delim(geneID, delim = "/") %>%
rename(gene = geneID) %>%
mutate(gene = as.integer(gene)) %>%
inner_join(effects_gene_label, by = c("Cluster", "gene"))
# Map top gene per pathway
term2gene_label <- with_label %>%
select(
Cluster, gene, locus_label, locus_symbol, ID, Description,
p.adjust, Count, mahalanobis, logFC, P, neglogPG
) %>%
distinct() %>%
group_by(Cluster, ID) %>%
arrange(Cluster, ID, -mahalanobis) %>%
slice(1)
# Manual fix for specific gene
is_diox <- term2gene_label$gene == 100272845
term2gene_label$locus_label[is_diox] <- "fba2"
cat("Annotated", nrow(term2gene_label), "pathway-cluster combinations\n")
## Annotated 25 pathway-cluster combinations
# Format labels
labels <- paste0(
gsub(".", " ", cluster_order, fixed = TRUE),
"\n(", n_genes, ")"
) %>%
gsub("Leaf:-P.Down", "Negative\nLeaf × -P", .) %>%
gsub("Leaf:-P.Up", "Positive\nLeaf × -P", .)
names(labels) <- names(n_genes)
# Create plot
annotation_plot <- to_plot %>%
ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
ggtitle("KEGG Pathways") +
scale_fill_continuous(
low = "dodgerblue",
high = "tomato",
name = "-log10(FDR)",
guide = guide_colorbar()
) +
scale_y_discrete(limits = rev(levels(to_plot$Description))) +
scale_x_discrete(
labels = labels,
drop = FALSE,
expand = expansion(add = c(0.5, 1))
) +
geom_point(shape = 21) +
geom_text(
data = term2gene_label,
position = position_nudge(0.1),
hjust = 0,
vjust = 0.5,
fontface = "italic",
size = 7,
mapping = aes(x = Cluster, y = Description, label = locus_label),
inherit.aes = FALSE
) +
scale_size(range = c(2, 8)) +
ylab(NULL) +
xlab(NULL) +
DOSE::theme_dose(font.size = 20) +
theme(
plot.title = element_text( size = 25),
axis.text.x = element_text(face = "bold", size =20)
)
print(annotation_plot)
# Export data used to generate the KEGG plot
kegg_plot_data <- to_plot %>%
left_join(
term2gene_label %>%
select(Cluster, ID, locus_label),
by = c("Cluster", "ID")
) %>%
select(
Cluster, ID, Description, GeneRatio, BgRatio,
pvalue, p.adjust, qvalue, geneID, Count,
neglogP, locus_label
)
write.csv(
kegg_plot_data,
file = file.path(paths$intermediate, "kegg_plot_input_data.csv"),
row.names = FALSE
)
cat("Exported", nrow(kegg_plot_data), "KEGG pathways\n")
## Exported 28 KEGG pathways
# Save for figure assembly
saveRDS(
annotation_plot,
file = file.path(paths$intermediate, "kegg_panel.rds")
)
ggsave(
annotation_plot,
file = file.path(paths$figures, "kegg_panel.pdf"),
height = 7,
width = 16
)
ggsave(
annotation_plot,
file = file.path(paths$figures, "HC_DEG_KEGG_annotation.svg"),
height = 7,
width = 12,
dpi = 150
)
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_1.0.1 tidyr_1.3.2 clusterProfiler_4.18.4
## [4] ggplot2_4.0.1 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 gson_0.1.0 httr2_1.2.2
## [4] rlang_1.1.6 magrittr_2.0.4 DOSE_4.4.0
## [7] otel_0.2.0 compiler_4.5.2 RSQLite_2.4.5
## [10] systemfonts_1.3.1 png_0.1-8 vctrs_0.6.5
## [13] reshape2_1.4.5 stringr_1.6.0 pkgconfig_2.0.3
## [16] crayon_1.5.3 fastmap_1.2.0 XVector_0.50.0
## [19] labeling_0.4.3 utf8_1.2.6 rmarkdown_2.30
## [22] enrichplot_1.30.4 ragg_1.5.0 purrr_1.2.0
## [25] bit_4.6.0 xfun_0.55 cachem_1.1.0
## [28] aplot_0.2.9 jsonlite_2.0.0 blob_1.2.4
## [31] tidydr_0.0.6 tweenr_2.0.3 BiocParallel_1.44.0
## [34] cluster_2.1.8.1 parallel_4.5.2 R6_2.6.1
## [37] bslib_0.9.0 stringi_1.8.7 RColorBrewer_1.1-3
## [40] jquerylib_0.1.4 GOSemSim_2.36.0 Rcpp_1.1.0
## [43] Seqinfo_1.0.0 knitr_1.51 ggtangle_0.0.9
## [46] R.utils_2.13.0 IRanges_2.44.0 Matrix_1.7-4
## [49] splines_4.5.2 igraph_2.2.1 tidyselect_1.2.1
## [52] qvalue_2.42.0 dichromat_2.0-0.1 yaml_2.3.12
## [55] codetools_0.2-20 curl_7.0.0 lattice_0.22-7
## [58] tibble_3.3.0 plyr_1.8.9 treeio_1.34.0
## [61] Biobase_2.70.0 withr_3.0.2 KEGGREST_1.50.0
## [64] S7_0.2.1 evaluate_1.0.5 gridGraphics_0.5-1
## [67] polyclip_1.10-7 scatterpie_0.2.6 Biostrings_2.78.0
## [70] pillar_1.11.1 ggtree_4.0.3 stats4_4.5.2
## [73] ggfun_0.2.0 generics_0.1.4 rprojroot_2.1.1
## [76] S4Vectors_0.48.0 scales_1.4.0 tidytree_0.4.6
## [79] glue_1.8.0 gdtools_0.4.4 lazyeval_0.2.2
## [82] tools_4.5.2 ggnewscale_0.5.2 data.table_1.18.0
## [85] fgsea_1.36.0 ggiraph_0.9.2 fs_1.6.6
## [88] fastmatch_1.1-6 cowplot_1.2.0 grid_4.5.2
## [91] ape_5.8-1 AnnotationDbi_1.72.0 nlme_3.1-168
## [94] patchwork_1.3.2 ggforce_0.5.0 cli_3.6.5
## [97] rappdirs_0.3.3 textshaping_1.0.4 fontBitstreamVera_0.1.1
## [100] svglite_2.2.2 gtable_0.3.6 R.methodsS3_1.8.2
## [103] yulab.utils_0.2.3 sass_0.4.10 digest_0.6.39
## [106] fontquiver_0.2.1 BiocGenerics_0.56.0 ggrepel_0.9.6
## [109] ggplotify_0.1.3 htmlwidgets_1.6.4 farver_2.1.2
## [112] memoise_2.0.1 htmltools_0.5.9 R.oo_1.27.1
## [115] lifecycle_1.0.4 httr_1.4.7 here_1.0.2
## [118] GO.db_3.22.0 MASS_7.3-65 fontLiberation_0.1.0
## [121] bit64_4.6.0-1