KEGG pathway enrichment on differentially expressed genes across Leaf development and Phosphorus deficiency.
Predictors: - Leaf: Leaf development/senescence (FC threshold: ±0.7) - -P: Phosphorus deficiency (FC threshold: ±2) - Leaf:-P: Interaction term
library(dplyr)
library(ggplot2)
library(clusterProfiler)
library(tidyr)
library(forcats)
# Load gene ID cross-reference
id_map <- read.csv(
file.path(paths$data, "Gene_xRef.csv"),
header = FALSE
)
colnames(id_map) <- c("gene", "v5")
# Load differential expression results
effects_df <- read.csv(file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv"))
cat("Loaded", nrow(id_map), "gene ID mappings\n")
## Loaded 29193 gene ID mappings
# Extract gene lists
redundant_DEGs <- with(effects_df %>% filter(is_hiconf_DEG), {
list(
Leaf.Down = gene[predictor == "Leaf" & regulation == "Downregulated"],
Leaf.Up = gene[predictor == "Leaf" & regulation == "Upregulated"],
`-P.Down` = gene[predictor == "-P" & regulation == "Downregulated"],
`-P.Up` = gene[predictor == "-P" & regulation == "Upregulated"],
`Leaf:-P.Down` = gene[predictor == "Leaf:-P" & regulation == "Downregulated"],
`Leaf:-P.Up` = gene[predictor == "Leaf:-P" & regulation == "Upregulated"]
)
})
DEGs <- lapply(redundant_DEGs, unique)
n_genes <- sapply(DEGs, length)
print(n_genes)
## Leaf.Down Leaf.Up -P.Down -P.Up Leaf:-P.Down Leaf:-P.Up
## 746 676 212 580 194 361
#' Filter and convert gene IDs to NCBI format
#'
#' @param gene_list Character vector of Zm gene IDs
#' @param id_mapping Data frame with v5 (Zm IDs) and gene (NCBI IDs)
#' @return Character vector of NCBI IDs
filter_valid_kegg_genes <- function(gene_list, id_mapping) {
ncbi_ids <- id_mapping %>%
filter(v5 %in% gene_list) %>%
pull(gene) %>%
unique() %>%
na.omit()
as.character(ncbi_ids)
}
# Convert all DEG lists
DEGs_kegg <- lapply(DEGs, filter_valid_kegg_genes, id_mapping = id_map)
# Conversion summary
conversion_summary <- data.frame(
DEG_set = names(DEGs),
original = sapply(DEGs, length),
with_ncbi = sapply(DEGs_kegg, length),
pct_converted = round(100 * sapply(DEGs_kegg, length) / sapply(DEGs, length), 1)
)
print(conversion_summary)
## DEG_set original with_ncbi pct_converted
## Leaf.Down Leaf.Down 746 666 89.3
## Leaf.Up Leaf.Up 676 552 81.7
## -P.Down -P.Down 212 141 66.5
## -P.Up -P.Up 580 512 88.3
## Leaf:-P.Down Leaf:-P.Down 194 163 84.0
## Leaf:-P.Up Leaf:-P.Up 361 340 94.2
# Run enrichment for each DEG set
compKEGG <- lapply(DEGs_kegg, function(x) {
if (length(x) < 5) return(NULL)
enrichKEGG(
gene = x,
organism = "zma",
pvalueCutoff = 0.05,
keyType = "ncbi-geneid"
)
})
# Extract results
cluster_result <- lapply(names(compKEGG), function(x) {
if (is.null(compKEGG[[x]])) return(NULL)
result <- compKEGG[[x]]@result
if (nrow(result) == 0) return(NULL)
result %>%
filter(p.adjust < 0.05) %>%
mutate(Cluster = x) %>%
select(Cluster, everything())
}) %>%
bind_rows()
cat("Total enriched pathways:", nrow(cluster_result), "\n")
## Total enriched pathways: 53
# Summary by cluster
enrichment_summary <- cluster_result %>%
group_by(Cluster) %>%
summarise(
n_pathways = n(),
mean_count = mean(Count),
min_pvalue = min(p.adjust)
)
print(enrichment_summary)
## # A tibble: 6 × 4
## Cluster n_pathways mean_count min_pvalue
## <chr> <int> <dbl> <dbl>
## 1 -P.Down 9 2.67 0.0233
## 2 -P.Up 8 6.88 0.00000360
## 3 Leaf.Down 11 5.36 0.00191
## 4 Leaf.Up 1 6 0.0101
## 5 Leaf:-P.Down 14 4.14 0.0000000138
## 6 Leaf:-P.Up 10 5.2 0.000204
cluster_order <- names(DEGs)
# Select top 7 pathways per cluster
sorted_by_p <- cluster_result %>%
mutate(neglogP = -log10(p.adjust)) %>%
mutate(Cluster = factor(Cluster, levels = cluster_order)) %>%
droplevels() %>%
group_by(Cluster) %>%
arrange(Cluster, p.adjust) %>%
slice(1:4) %>%
mutate(label_order = 1:n()) %>%
mutate(Description = fct_reorder(Description, label_order))
top_terms <- levels(sorted_by_p$Description)
cat("Selected", length(top_terms), "pathways\n")
## Selected 17 pathways
# Prepare plotting data
to_plot <- cluster_result %>%
mutate(Cluster = factor(Cluster, levels = cluster_order)) %>%
filter(Description %in% top_terms) %>%
mutate(neglogP = -log10(p.adjust)) %>%
group_by(Cluster) %>%
arrange(Cluster, p.adjust) %>%
mutate(Description = factor(Description, levels = top_terms)) %>%
mutate(label_order = 1:n()) %>%
mutate(Description = fct_reorder(Description, label_order))
# Prepare gene-level data
effects_gene_label <- effects_df %>%
rename(v5 = gene) %>%
left_join(id_map, by = "v5") %>%
mutate(Cluster = case_when(
regulation == "Upregulated" ~ paste(predictor, "Up", sep = "."),
regulation == "Downregulated" ~ paste(predictor, "Down", sep = ".")
)) %>%
select(
Cluster, v5, gene, locus_label, locus_symbol,
locus_name, logFC, P = adj.P.Val, mahalanobis
) %>%
filter(!is.na(locus_label) & !is.na(locus_symbol) & !is.na(Cluster)) %>%
mutate(neglogPG = -log10(P))
cat("Genes with NCBI IDs:", sum(!is.na(effects_gene_label$gene)), "\n")
## Genes with NCBI IDs: 4839
# Filter to Leaf/-P only
cluster_order <- cluster_order[!grepl("Inv4m", cluster_order)]
to_plot <- to_plot %>%
filter(!grepl("Inv4m", Cluster)) %>%
droplevels() %>%
mutate(Cluster = factor(Cluster, levels = cluster_order))
with_label <-to_plot %>%
separate_longer_delim(geneID, delim = "/") %>%
rename(gene = geneID) %>%
mutate(gene = as.integer(gene)) %>%
inner_join(effects_gene_label, by = c("Cluster", "gene"))
# Map top gene per pathway
term2gene_label <- with_label %>%
select(
Cluster, gene, locus_label, locus_symbol, ID, Description,
p.adjust, Count, mahalanobis, logFC, P, neglogPG
) %>%
distinct() %>%
group_by(Cluster, ID) %>%
arrange(Cluster, ID, -mahalanobis) %>%
slice(1)
# Manual fix for specific gene
is_diox <- term2gene_label$gene == 100272845
term2gene_label$locus_label[is_diox] <- "fba2"
cat("Annotated", nrow(term2gene_label), "pathway-cluster combinations\n")
## Annotated 25 pathway-cluster combinations
# Format labels
labels <- paste0(
gsub(".", " ", cluster_order, fixed = TRUE),
"\n(", n_genes, ")"
) %>%
gsub("Leaf:-P.Down", "Negative\nLeaf × -P", .) %>%
gsub("Leaf:-P.Up", "Positive\nLeaf × -P", .)
names(labels) <- names(n_genes)
# Create plot
annotation_plot <- to_plot %>%
ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
ggtitle("KEGG Pathways") +
scale_fill_continuous(
low = "dodgerblue",
high = "tomato",
name = "-log10(FDR)",
guide = guide_colorbar()
) +
scale_y_discrete(limits = rev(levels(to_plot$Description))) +
scale_x_discrete(
labels = labels,
drop = FALSE,
expand = expansion(add = c(0.5, 1))
) +
geom_point(shape = 21) +
geom_text(
data = term2gene_label,
position = position_nudge(0.1),
hjust = 0,
vjust = 0.5,
fontface = "italic",
size = 7,
mapping = aes(x = Cluster, y = Description, label = locus_label),
inherit.aes = FALSE
) +
scale_size(range = c(2, 8)) +
ylab(NULL) +
xlab(NULL) +
DOSE::theme_dose(font.size = 20) +
theme(
plot.title = element_text( size = 25),
axis.text.x = element_text(face = "bold", size =20)
)
print(annotation_plot)
# Export data used to generate the KEGG plot
kegg_plot_data <- to_plot %>%
left_join(
term2gene_label %>%
select(Cluster, ID, locus_label),
by = c("Cluster", "ID")
) %>%
select(
Cluster, ID, Description, GeneRatio, BgRatio,
pvalue, p.adjust, qvalue, geneID, Count,
neglogP, locus_label
)
write.csv(
kegg_plot_data,
file = file.path(paths$intermediate, "kegg_plot_input_data.csv"),
row.names = FALSE
)
cat("Exported", nrow(kegg_plot_data), "KEGG pathways\n")
## Exported 28 KEGG pathways
# Save for figure assembly
saveRDS(
annotation_plot,
file = file.path(paths$intermediate, "kegg_panel.rds")
)
ggsave(
annotation_plot,
file = file.path(paths$figures, "kegg_panel.pdf"),
height = 7,
width = 16
)
ggsave(
annotation_plot,
file = file.path(paths$figures, "HC_DEG_KEGG_annotation.svg"),
height = 7,
width = 12,
dpi = 150
)
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_1.0.1 tidyr_1.3.1 clusterProfiler_4.18.2
## [4] ggplot2_4.0.1 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 gson_0.1.0 rlang_1.1.6
## [4] magrittr_2.0.4 DOSE_4.4.0 compiler_4.5.1
## [7] RSQLite_2.4.5 png_0.1-8 systemfonts_1.3.1
## [10] vctrs_0.6.5 reshape2_1.4.5 stringr_1.6.0
## [13] pkgconfig_2.0.3 crayon_1.5.3 fastmap_1.2.0
## [16] XVector_0.50.0 labeling_0.4.3 utf8_1.2.6
## [19] rmarkdown_2.30 enrichplot_1.30.4 ragg_1.5.0
## [22] purrr_1.2.0 bit_4.6.0 xfun_0.54
## [25] cachem_1.1.0 aplot_0.2.9 jsonlite_2.0.0
## [28] blob_1.2.4 tidydr_0.0.6 tweenr_2.0.3
## [31] BiocParallel_1.44.0 cluster_2.1.8.1 parallel_4.5.1
## [34] R6_2.6.1 bslib_0.9.0 stringi_1.8.7
## [37] RColorBrewer_1.1-3 jquerylib_0.1.4 GOSemSim_2.36.0
## [40] Rcpp_1.1.0 Seqinfo_1.0.0 knitr_1.50
## [43] ggtangle_0.0.9 R.utils_2.13.0 IRanges_2.44.0
## [46] Matrix_1.7-4 splines_4.5.1 igraph_2.2.1
## [49] tidyselect_1.2.1 qvalue_2.42.0 yaml_2.3.12
## [52] codetools_0.2-20 lattice_0.22-7 tibble_3.3.0
## [55] plyr_1.8.9 treeio_1.34.0 Biobase_2.70.0
## [58] withr_3.0.2 KEGGREST_1.50.0 S7_0.2.1
## [61] evaluate_1.0.5 gridGraphics_0.5-1 polyclip_1.10-7
## [64] scatterpie_0.2.6 Biostrings_2.78.0 pillar_1.11.1
## [67] ggtree_4.0.1 stats4_4.5.1 ggfun_0.2.0
## [70] generics_0.1.4 rprojroot_2.1.1 S4Vectors_0.48.0
## [73] scales_1.4.0 tidytree_0.4.6 glue_1.8.0
## [76] gdtools_0.4.4 lazyeval_0.2.2 tools_4.5.1
## [79] ggnewscale_0.5.2 data.table_1.17.8 fgsea_1.36.0
## [82] ggiraph_0.9.2 fs_1.6.6 fastmatch_1.1-6
## [85] cowplot_1.2.0 grid_4.5.1 ape_5.8-1
## [88] AnnotationDbi_1.72.0 nlme_3.1-168 patchwork_1.3.2
## [91] ggforce_0.5.0 cli_3.6.5 rappdirs_0.3.3
## [94] textshaping_1.0.4 fontBitstreamVera_0.1.1 svglite_2.2.2
## [97] gtable_0.3.6 R.methodsS3_1.8.2 yulab.utils_0.2.2
## [100] sass_0.4.10 digest_0.6.39 fontquiver_0.2.1
## [103] BiocGenerics_0.56.0 ggrepel_0.9.6 ggplotify_0.1.3
## [106] htmlwidgets_1.6.4 farver_2.1.2 memoise_2.0.1
## [109] htmltools_0.5.9 R.oo_1.27.1 lifecycle_1.0.4
## [112] httr_1.4.7 here_1.0.2 GO.db_3.22.0
## [115] MASS_7.3-65 fontLiberation_0.1.0 bit64_4.6.0-1