Overview

KEGG pathway enrichment on differentially expressed genes across Leaf development and Phosphorus deficiency.

Predictors: - Leaf: Leaf development/senescence (FC threshold: ±0.7) - -P: Phosphorus deficiency (FC threshold: ±2) - Leaf:-P: Interaction term

Libraries

library(dplyr)
library(ggplot2)
library(clusterProfiler)
library(tidyr)
library(forcats)

Data Import

# Load gene ID cross-reference
id_map <- read.csv(
  file.path(paths$data, "Gene_xRef.csv"),
  header = FALSE
)
colnames(id_map) <- c("gene", "v5")

# Load differential expression results
effects_df <- read.csv(file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv"))

cat("Loaded", nrow(id_map), "gene ID mappings\n")
## Loaded 29193 gene ID mappings

Prepare DEG Lists

# Extract gene lists
redundant_DEGs <- with(effects_df %>% filter(is_hiconf_DEG), {
  list(
    Leaf.Down = gene[predictor == "Leaf" & regulation == "Downregulated"],
    Leaf.Up = gene[predictor == "Leaf" & regulation == "Upregulated"],
    `-P.Down` = gene[predictor == "-P" & regulation == "Downregulated"],
    `-P.Up` = gene[predictor == "-P" & regulation == "Upregulated"],
    `Leaf:-P.Down` = gene[predictor == "Leaf:-P" & regulation == "Downregulated"],
    `Leaf:-P.Up` = gene[predictor == "Leaf:-P" & regulation == "Upregulated"]
  )
})

DEGs <- lapply(redundant_DEGs, unique)
n_genes <- sapply(DEGs, length)

print(n_genes)
##    Leaf.Down      Leaf.Up      -P.Down        -P.Up Leaf:-P.Down   Leaf:-P.Up 
##          746          676          212          580          194          361

Convert to NCBI IDs

#' Filter and convert gene IDs to NCBI format
#'
#' @param gene_list Character vector of Zm gene IDs
#' @param id_mapping Data frame with v5 (Zm IDs) and gene (NCBI IDs)
#' @return Character vector of NCBI IDs
filter_valid_kegg_genes <- function(gene_list, id_mapping) {
  ncbi_ids <- id_mapping %>%
    filter(v5 %in% gene_list) %>%
    pull(gene) %>%
    unique() %>%
    na.omit()
  
  as.character(ncbi_ids)
}

# Convert all DEG lists
DEGs_kegg <- lapply(DEGs, filter_valid_kegg_genes, id_mapping = id_map)

# Conversion summary
conversion_summary <- data.frame(
  DEG_set = names(DEGs),
  original = sapply(DEGs, length),
  with_ncbi = sapply(DEGs_kegg, length),
  pct_converted = round(100 * sapply(DEGs_kegg, length) / sapply(DEGs, length), 1)
)

print(conversion_summary)
##                   DEG_set original with_ncbi pct_converted
## Leaf.Down       Leaf.Down      746       666          89.3
## Leaf.Up           Leaf.Up      676       552          81.7
## -P.Down           -P.Down      212       141          66.5
## -P.Up               -P.Up      580       512          88.3
## Leaf:-P.Down Leaf:-P.Down      194       163          84.0
## Leaf:-P.Up     Leaf:-P.Up      361       340          94.2

KEGG Enrichment

# Run enrichment for each DEG set
compKEGG <- lapply(DEGs_kegg, function(x) {
  if (length(x) < 5) return(NULL)
  
  enrichKEGG(
    gene = x,
    organism = "zma",
    pvalueCutoff = 0.05,
    keyType = "ncbi-geneid"
  )
})

# Extract results
cluster_result <- lapply(names(compKEGG), function(x) {
  if (is.null(compKEGG[[x]])) return(NULL)
  
  result <- compKEGG[[x]]@result
  if (nrow(result) == 0) return(NULL)
  
  result %>%
    filter(p.adjust < 0.05) %>%
    mutate(Cluster = x) %>%
    select(Cluster, everything())
}) %>%
  bind_rows()

cat("Total enriched pathways:", nrow(cluster_result), "\n")
## Total enriched pathways: 53
# Summary by cluster
enrichment_summary <- cluster_result %>%
  group_by(Cluster) %>%
  summarise(
    n_pathways = n(),
    mean_count = mean(Count),
    min_pvalue = min(p.adjust)
  )

print(enrichment_summary)
## # A tibble: 6 × 4
##   Cluster      n_pathways mean_count   min_pvalue
##   <chr>             <int>      <dbl>        <dbl>
## 1 -P.Down               9       2.67 0.0233      
## 2 -P.Up                 8       6.88 0.00000360  
## 3 Leaf.Down            11       5.36 0.00191     
## 4 Leaf.Up               1       6    0.0101      
## 5 Leaf:-P.Down         14       4.14 0.0000000138
## 6 Leaf:-P.Up           10       5.2  0.000204

Visualization

cluster_order <- names(DEGs)

# Select top 7 pathways per cluster
sorted_by_p <- cluster_result %>%
  mutate(neglogP = -log10(p.adjust)) %>%
  mutate(Cluster = factor(Cluster, levels = cluster_order)) %>%
  droplevels() %>%
  group_by(Cluster) %>%
  arrange(Cluster, p.adjust) %>%
  slice(1:4) %>%
  mutate(label_order = 1:n()) %>%
  mutate(Description = fct_reorder(Description, label_order))

top_terms <- levels(sorted_by_p$Description)

cat("Selected", length(top_terms), "pathways\n")
## Selected 17 pathways
# Prepare plotting data
to_plot <- cluster_result %>%
  mutate(Cluster = factor(Cluster, levels = cluster_order)) %>%
  filter(Description %in% top_terms) %>%
  mutate(neglogP = -log10(p.adjust)) %>%
  group_by(Cluster) %>%
  arrange(Cluster, p.adjust) %>%
  mutate(Description = factor(Description, levels = top_terms)) %>%
  mutate(label_order = 1:n()) %>%
  mutate(Description = fct_reorder(Description, label_order))
# Prepare gene-level data
effects_gene_label <- effects_df %>%
  rename(v5 = gene) %>%
  left_join(id_map, by = "v5") %>%
  mutate(Cluster = case_when(
    regulation == "Upregulated" ~ paste(predictor, "Up", sep = "."),
    regulation == "Downregulated" ~ paste(predictor, "Down", sep = ".")
  )) %>%
  select(
    Cluster, v5, gene, locus_label, locus_symbol,
    locus_name, logFC, P = adj.P.Val, mahalanobis
  ) %>%
  filter(!is.na(locus_label) & !is.na(locus_symbol) & !is.na(Cluster)) %>%
  mutate(neglogPG = -log10(P))

cat("Genes with NCBI IDs:", sum(!is.na(effects_gene_label$gene)), "\n")
## Genes with NCBI IDs: 4839
# Filter to Leaf/-P only
cluster_order <- cluster_order[!grepl("Inv4m", cluster_order)]

to_plot <- to_plot %>%
  filter(!grepl("Inv4m", Cluster)) %>%
  droplevels() %>%
  mutate(Cluster = factor(Cluster, levels = cluster_order))

with_label <-to_plot %>%
  separate_longer_delim(geneID, delim = "/") %>%
  rename(gene = geneID) %>%
  mutate(gene = as.integer(gene)) %>%
  inner_join(effects_gene_label, by = c("Cluster", "gene"))

# Map top gene per pathway
term2gene_label <- with_label %>%
  select(
    Cluster, gene, locus_label, locus_symbol, ID, Description,
    p.adjust, Count, mahalanobis, logFC, P, neglogPG
  ) %>%
  distinct() %>%
  group_by(Cluster, ID) %>%
  arrange(Cluster, ID, -mahalanobis) %>%
  slice(1)

# Manual fix for specific gene
is_diox <- term2gene_label$gene == 100272845
term2gene_label$locus_label[is_diox] <- "fba2"

cat("Annotated", nrow(term2gene_label), "pathway-cluster combinations\n")
## Annotated 25 pathway-cluster combinations
# Format labels
labels <- paste0(
  gsub(".", " ", cluster_order, fixed = TRUE),
  "\n(", n_genes, ")"
) %>%
  gsub("Leaf:-P.Down", "Negative\nLeaf × -P", .) %>%
  gsub("Leaf:-P.Up", "Positive\nLeaf × -P", .)

names(labels) <- names(n_genes)

# Create plot
annotation_plot <- to_plot %>%
  ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
  ggtitle("KEGG Pathways") +
  scale_fill_continuous(
    low = "dodgerblue",
    high = "tomato",
    name = "-log10(FDR)",
    guide = guide_colorbar()
  ) +
  scale_y_discrete(limits = rev(levels(to_plot$Description))) +
  scale_x_discrete(
    labels = labels,
    drop = FALSE,
    expand = expansion(add = c(0.5, 1))
    ) +
  geom_point(shape = 21) +
  geom_text(
    data = term2gene_label,
    position = position_nudge(0.1),
    hjust = 0,
    vjust = 0.5,
    fontface = "italic",
    size = 7,
    mapping = aes(x = Cluster, y = Description, label = locus_label),
    inherit.aes = FALSE
  ) +
  scale_size(range = c(2, 8)) +
  ylab(NULL) +
  xlab(NULL) +
  DOSE::theme_dose(font.size = 20) +
  theme(
    plot.title = element_text( size = 25),
    axis.text.x = element_text(face = "bold", size =20)
  )


print(annotation_plot)

Export Plot Data

# Export data used to generate the KEGG plot
kegg_plot_data <- to_plot %>%
  left_join(
    term2gene_label %>%
      select(Cluster, ID, locus_label),
    by = c("Cluster", "ID")
  ) %>%
  select(
    Cluster, ID, Description, GeneRatio, BgRatio,
    pvalue, p.adjust, qvalue, geneID, Count,
    neglogP, locus_label
  )

write.csv(
  kegg_plot_data,
  file = file.path(paths$intermediate, "kegg_plot_input_data.csv"),
  row.names = FALSE
)

cat("Exported", nrow(kegg_plot_data), "KEGG pathways\n")
## Exported 28 KEGG pathways

Export

# Save for figure assembly
saveRDS(
  annotation_plot,
  file = file.path(paths$intermediate, "kegg_panel.rds")
)

ggsave(
  annotation_plot,
  file = file.path(paths$figures, "kegg_panel.pdf"),
  height = 7,
  width = 16
)

ggsave(
  annotation_plot,
  file = file.path(paths$figures, "HC_DEG_KEGG_annotation.svg"),
  height = 7,
  width = 12,
  dpi = 150
)

Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] forcats_1.0.1          tidyr_1.3.1            clusterProfiler_4.18.2
## [4] ggplot2_4.0.1          dplyr_1.1.4           
## 
## loaded via a namespace (and not attached):
##   [1] DBI_1.2.3               gson_0.1.0              rlang_1.1.6            
##   [4] magrittr_2.0.4          DOSE_4.4.0              compiler_4.5.1         
##   [7] RSQLite_2.4.5           png_0.1-8               systemfonts_1.3.1      
##  [10] vctrs_0.6.5             reshape2_1.4.5          stringr_1.6.0          
##  [13] pkgconfig_2.0.3         crayon_1.5.3            fastmap_1.2.0          
##  [16] XVector_0.50.0          labeling_0.4.3          utf8_1.2.6             
##  [19] rmarkdown_2.30          enrichplot_1.30.4       ragg_1.5.0             
##  [22] purrr_1.2.0             bit_4.6.0               xfun_0.54              
##  [25] cachem_1.1.0            aplot_0.2.9             jsonlite_2.0.0         
##  [28] blob_1.2.4              tidydr_0.0.6            tweenr_2.0.3           
##  [31] BiocParallel_1.44.0     cluster_2.1.8.1         parallel_4.5.1         
##  [34] R6_2.6.1                bslib_0.9.0             stringi_1.8.7          
##  [37] RColorBrewer_1.1-3      jquerylib_0.1.4         GOSemSim_2.36.0        
##  [40] Rcpp_1.1.0              Seqinfo_1.0.0           knitr_1.50             
##  [43] ggtangle_0.0.9          R.utils_2.13.0          IRanges_2.44.0         
##  [46] Matrix_1.7-4            splines_4.5.1           igraph_2.2.1           
##  [49] tidyselect_1.2.1        qvalue_2.42.0           yaml_2.3.12            
##  [52] codetools_0.2-20        lattice_0.22-7          tibble_3.3.0           
##  [55] plyr_1.8.9              treeio_1.34.0           Biobase_2.70.0         
##  [58] withr_3.0.2             KEGGREST_1.50.0         S7_0.2.1               
##  [61] evaluate_1.0.5          gridGraphics_0.5-1      polyclip_1.10-7        
##  [64] scatterpie_0.2.6        Biostrings_2.78.0       pillar_1.11.1          
##  [67] ggtree_4.0.1            stats4_4.5.1            ggfun_0.2.0            
##  [70] generics_0.1.4          rprojroot_2.1.1         S4Vectors_0.48.0       
##  [73] scales_1.4.0            tidytree_0.4.6          glue_1.8.0             
##  [76] gdtools_0.4.4           lazyeval_0.2.2          tools_4.5.1            
##  [79] ggnewscale_0.5.2        data.table_1.17.8       fgsea_1.36.0           
##  [82] ggiraph_0.9.2           fs_1.6.6                fastmatch_1.1-6        
##  [85] cowplot_1.2.0           grid_4.5.1              ape_5.8-1              
##  [88] AnnotationDbi_1.72.0    nlme_3.1-168            patchwork_1.3.2        
##  [91] ggforce_0.5.0           cli_3.6.5               rappdirs_0.3.3         
##  [94] textshaping_1.0.4       fontBitstreamVera_0.1.1 svglite_2.2.2          
##  [97] gtable_0.3.6            R.methodsS3_1.8.2       yulab.utils_0.2.2      
## [100] sass_0.4.10             digest_0.6.39           fontquiver_0.2.1       
## [103] BiocGenerics_0.56.0     ggrepel_0.9.6           ggplotify_0.1.3        
## [106] htmlwidgets_1.6.4       farver_2.1.2            memoise_2.0.1          
## [109] htmltools_0.5.9         R.oo_1.27.1             lifecycle_1.0.4        
## [112] httr_1.4.7              here_1.0.2              GO.db_3.22.0           
## [115] MASS_7.3-65             fontLiberation_0.1.0    bit64_4.6.0-1