Overview

Purpose: Perform Gene Ontology enrichment analysis on the validated trans-regulatory network to identify biological processes associated with genes regulated by Inv4m versus flanking regions.

Approach: We partition network genes based on the strongest regulator criterion:

  1. Flanking network: All genes connected by edges where the regulator is in the flanking region (gets priority - any connection to flanking counts)
  2. Inv4m network: Genes from Inv4m-regulated edges where the target’s strongest correlated regulator is NOT in the flanking region

This approach recovers trans targets that would otherwise be excluded due to any connection to flanking regulators, while ensuring genes are assigned to the network of their dominant regulatory influence.

Expected outcomes:

  • Identification of enriched biological processes in trans-regulatory networks
  • Comparison of functional categories between flanking and Inv4m networks
  • Visualization of GO terms with representative gene examples (e.g., JMJ cluster)

Setup and Dependencies

library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(clusterProfiler)
library(forcats)
library(DOSE)
library(AnnotationHub)
library(rrvgo)

Load Data

Trans Co-expression Network

Purpose: Load gene lists and edges defining trans-regulatory network structure.

trans_regulators <- read.csv(
  file.path(paths$intermediate, "trans_network_regulators.csv")
)
trans_targets <- read.csv(
  file.path(paths$intermediate, "trans_network_targets.csv")
)
trans_network_all <- read.csv(
  file.path(paths$intermediate, "trans_network_all_genes.csv")
)
trans_edges <- read.csv(
  file.path(paths$intermediate, "trans_network_edges_annotated.csv")
)

cat("Network composition:\n")
## Network composition:
cat("  Regulators:", nrow(trans_regulators), "\n")
##   Regulators: 103
cat("  Targets:", nrow(trans_targets), "\n")
##   Targets: 32
cat("  Total genes:", nrow(trans_network_all), "\n")
##   Total genes: 135
cat("  Edges:", nrow(trans_edges), "\n")
##   Edges: 3262

Load Effects Table

Purpose: Define universe of analyzed genes and get expression direction.

effects_df <- read.csv(
  file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv")
) %>%
  filter(predictor == "Inv4m") %>%
  group_by(gene) %>%
  arrange(adj.P.Val) %>%
  dplyr::slice(1) %>%
  ungroup()

background <- effects_df %>%
  pull(gene) %>%
  sort() %>%
  unique()

cat("Background genes:", length(background), "\n")
## Background genes: 24011
cat("DEGs:", sum(effects_df$is_DEG), "\n")
## DEGs: 465

Load GO Annotation

Purpose: Load maize B73 NAM5 GO annotations from PANNZER.

TERM2NAME <- readRDS(file.path(paths$data, "TERM2NAME.rds"))
TERM2GENE <- readRDS(file.path(paths$data, "TERM2GENE_PANNZER_full.rds"))

cat("GO annotation loaded:\n")
## GO annotation loaded:
cat("  Biological Process terms:",
    length(unique(TERM2NAME$BP$go_id)), "\n")
##   Biological Process terms: 29211
cat("  Gene-term associations:", nrow(TERM2GENE), "\n")
##   Gene-term associations: 1103596

Define Network Subsets

Partition by Strongest Regulator

Purpose: Assign genes to flanking or Inv4m networks based on the strongest regulatory connection.

Logic:

  1. Flanking network gets priority: all genes in flanking-regulated edges
  2. Inv4m network: genes from Inv4m edges where target’s strongest regulator is non-flanking
# Flanking network: all genes connected by flanking-regulated edges
flanking_network_genes <- trans_edges %>%
  filter(regulator_region == "flanking") %>%
  {union(.$from, .$to)} %>%
  sort() %>%
  unique()

# Identify regulators by region
inv4m_regulators <- trans_edges %>%
  filter(regulator_region == "inv4m") %>%
  pull(from) %>%
  unique()

flanking_regulators <- trans_edges %>%
  filter(regulator_region == "flanking") %>%
  pull(from) %>%
  unique()

# For each target, find strongest correlated regulator
strongest_edge_per_target <- trans_edges %>%
  mutate(abs_r = abs(r)) %>%
  group_by(to) %>%
  slice_max(abs_r, n = 1, with_ties = FALSE) %>%
  ungroup()

# Targets whose strongest regulator is NOT in flanking region
targets_strongest_nonflanking <- strongest_edge_per_target %>%
  filter(regulator_region != "flanking") %>%
  pull(to)

# Inv4m edges to these targets
edges_inv4m_dominant <- trans_edges %>%
  filter(
    to %in% targets_strongest_nonflanking,
    regulator_region == "inv4m"
  )

# Inv4m network: genes from edges where Inv4m is dominant regulator
inv4m_network_genes <- union(
  edges_inv4m_dominant$from,
  edges_inv4m_dominant$to
) %>%
  sort() %>%
  unique()

cat("Network partitioning (strongest regulator criterion):\n")
## Network partitioning (strongest regulator criterion):
cat("  Flanking regulators:", length(flanking_regulators), "\n")
##   Flanking regulators: 70
cat("  Inv4m regulators:", length(inv4m_regulators), "\n")
##   Inv4m regulators: 33
cat("  Flanking network:", length(flanking_network_genes), "genes\n")
##   Flanking network: 102 genes
cat("  Inv4m network:", length(inv4m_network_genes), "genes\n")
##   Inv4m network: 50 genes
cat("  Targets with Inv4m as strongest:", length(targets_strongest_nonflanking), "\n")
##   Targets with Inv4m as strongest: 17

Create Directional Gene Sets

Purpose: Split network genes by direction of Inv4m effect for enrichment.

# Get logFC for Inv4m network genes
inv4m_genes_with_logfc <- effects_df %>%
  filter(gene %in% inv4m_network_genes) %>%
  group_by(gene) %>%
  slice(1) %>%
  ungroup()

# Define gene sets by network and direction
network_gene_sets <- list(
  `flank.Up` = trans_network_all %>%
    filter(gene %in% flanking_network_genes, logFC > 0) %>%
    pull(gene),

  `flank.Down` = trans_network_all %>%
    filter(gene %in% flanking_network_genes, logFC < 0) %>%
    pull(gene),

  `Inv4m.Up` = inv4m_genes_with_logfc %>%
    filter(logFC > 0) %>%
    pull(gene),

  `Inv4m.Down` = inv4m_genes_with_logfc %>%
    filter(logFC < 0) %>%
    pull(gene)
)

n_genes <- sapply(network_gene_sets, length)

cat("\nGene set sizes:\n")
## 
## Gene set sizes:
print(n_genes)
##   flank.Up flank.Down   Inv4m.Up Inv4m.Down 
##         54         48         22         28
cat("\nFlanking total:", n_genes["flank.Up"] + n_genes["flank.Down"], "\n")
## 
## Flanking total: 102
cat("Inv4m total:", n_genes["Inv4m.Up"] + n_genes["Inv4m.Down"], "\n")
## Inv4m total: 50

GO Enrichment Analysis

Define Enrichment Function

Purpose: Wrapper for clusterProfiler enrichment with consistent parameters.

#' Perform GO enrichment analysis
#'
#' @param x Character vector of gene IDs
#' @param ontology GO ontology: "BP", "MF", or "CC"
#' @param universe Character vector of background genes
#' @return enrichResult object from clusterProfiler
ego <- function(x, ontology = "BP", universe = NULL) {
  enricher(
    gene = x,
    pvalueCutoff = 0.05,
    pAdjustMethod = "none",
    universe = universe,
    minGSSize = 10,
    maxGSSize = 500,
    qvalueCutoff = 0.2,
    TERM2GENE = TERM2GENE[TERM2GENE$GO %in% TERM2NAME[[ontology]]$go_id, ],
    TERM2NAME = TERM2NAME[[ontology]]
  )
}

Run GO Enrichment

Purpose: Identify enriched GO Biological Process terms across network subsets.

compGO_BP <- compareCluster(
  geneCluster = network_gene_sets,
  fun = function(x) {
    ego(x, ontology = "BP", universe = background)
  }
)

cat("RAW enrichment results by cluster:\n")
## RAW enrichment results by cluster:
print(table(compGO_BP@compareClusterResult$Cluster))
## 
##   flank.Up flank.Down   Inv4m.Up Inv4m.Down 
##         29         34         48         47

Filter and Process Results

Remove Redundant Terms

Purpose: Use rrvgo semantic similarity to reduce redundant GO terms.

# Load maize annotation for semantic similarity
ah <- AnnotationHub()
org.Zm.eg.db <- ah[["AH119718"]]

# Filter low-count terms
filtered_result <- compGO_BP@compareClusterResult %>%
  filter(Count > 1) %>%
  group_by(Cluster, geneID) %>%
  arrange(p.adjust, .by_group = TRUE) %>%
  ungroup()

# Calculate semantic similarity matrix
go_terms <- unique(compGO_BP@compareClusterResult$ID)

if (length(go_terms) > 1) {
  simMatrix <- calculateSimMatrix(go_terms,
    orgdb = org.Zm.eg.db,
    ont = "BP",
    method = "Rel"
  )

  # Reduce to representative terms
  scores <- setNames(-log10(compGO_BP@compareClusterResult$p.adjust),
                     compGO_BP@compareClusterResult$ID)
  reducedTerms <- reduceSimMatrix(simMatrix,
    scores,
    threshold = 0.7,
    orgdb = org.Zm.eg.db
  )

  # Keep parent terms + growth (GO:0040007) which includes JMJ genes
  compGO_BP@compareClusterResult <- filtered_result %>%
    filter(ID %in% c(reducedTerms$parent, "GO:0040007"))
}

cat("\nFILTERED enrichment results by cluster:\n")
## 
## FILTERED enrichment results by cluster:
print(table(compGO_BP@compareClusterResult$Cluster))
## 
##   flank.Up flank.Down   Inv4m.Up Inv4m.Down 
##          1          2          1          7

Identify Terms with Known Genes

Purpose: Focus on GO terms containing genes with characterized symbols.

effects_symbol <- effects_df %>%
  filter(is_DEG) %>%
  dplyr::select(gene, SYMBOL = locus_symbol, locus_label, locus_name,
         logFC, adj.P.Val) %>%
  filter(!is.na(SYMBOL) & !is.na(locus_name)) %>%
  mutate(neglogP = -log10(adj.P.Val))

with_known_gene <- compGO_BP@compareClusterResult %>%
  separate_longer_delim(geneID, delim = "/") %>%
  dplyr::rename(gene = geneID) %>%
  inner_join(
    effects_symbol %>%
      dplyr::select(gene, SYMBOL, locus_name),
    relationship = "many-to-many"
  ) %>%
  pull(ID) %>%
  sort() %>%
  unique()

cat("GO terms with characterized genes:", length(with_known_gene), "\n")
## GO terms with characterized genes: 8

Visualize Enrichment Results

Prepare Plot Data

Purpose: Select top 5 terms per cluster and identify representative genes.

cluster_order <- c("flank.Up", "flank.Down", "Inv4m.Up", "Inv4m.Down")

to_plot <- compGO_BP@compareClusterResult %>%
  filter(p.adjust < 0.05) %>%
  mutate(
    neglogP = -log10(p.adjust),
    Cluster = factor(Cluster, levels = cluster_order)
  ) %>%
  group_by(Cluster) %>%
  arrange(p.adjust) %>%
  slice_head(n = 5) %>%
  mutate(
    label_order = 1:n(),
    Description = fct_reorder(Description, label_order)
  ) %>%
  ungroup()

# Get gene labels for each term (most significant DEG per term)
term2symbol <- to_plot %>%
  separate_longer_delim(geneID, delim = "/") %>%
  dplyr::rename(gene = geneID) %>%
  inner_join(effects_symbol, relationship = "many-to-many", by = "gene") %>%
  dplyr::select(Cluster, gene, SYMBOL, locus_label, ID, Description,
         p.adjust, Count, logFC, adj.P.Val) %>%
  distinct() %>%
  group_by(Cluster, ID) %>%
  arrange(Cluster, ID, adj.P.Val) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  mutate(gene_label = ifelse(
    !is.na(locus_label) & locus_label != "",
    locus_label,
    SYMBOL
  ))

cat("\nPlot summary:\n")
## 
## Plot summary:
cat("  GO terms to plot:", nrow(to_plot), "\n")
##   GO terms to plot: 9
cat("  Terms with gene labels:", nrow(term2symbol), "\n")
##   Terms with gene labels: 7

Bubble Plot with Gene Annotations

Purpose: Visualize enriched GO terms with representative gene symbols.

# Create facet labels with gene counts
labels <- paste0(
  gsub(".", " ", cluster_order, fixed = TRUE),
  "\n(", n_genes[cluster_order], ")"
)
names(labels) <- cluster_order

# Build bubble plot
bg <- to_plot %>%
  ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
  ggtitle("GO Biological Process Enrichment\nInv4m Trans Co-expression Network") +
  scale_fill_gradient(
    low = "dodgerblue",
    high = "tomato",
    name = "-log10(p)",
    limits = c(1, max(to_plot$neglogP, na.rm = TRUE)),
    guide = guide_colorbar()
  ) +
  scale_y_discrete(
    labels = scales::wrap_format(60),
    limits = rev(levels(to_plot$Description))
  ) +
  scale_x_discrete(
    labels = labels,
    drop = FALSE,
    expand = expansion(add = c(0.5, 1))
  ) +
  geom_point(shape = 21) +
  scale_size(range = c(6, 8), breaks = 2:4) +
  ylab(NULL) +
  xlab(NULL) +
  DOSE::theme_dose(font.size = 16) +
  theme(
    plot.title = element_text(face = "bold", size = 20),
    legend.position = "right",
    axis.text.x = element_text(vjust = 0.5, angle = 0, hjust = 0.5)
  )

# Add gene annotations
annotation_plot <- bg +
  geom_text(
    data = term2symbol,
    position = position_nudge(0.15),
    hjust = 0,
    vjust = 0.5,
    fontface = "italic",
    size = 5,
    mapping = aes(x = Cluster, y = Description, label = gene_label),
    inherit.aes = FALSE
  )

print(annotation_plot)

Top Terms Summary

Purpose: Tabular summary of top enriched terms per network.

cat("\n=== Top GO Terms by Network ===\n")
## 
## === Top GO Terms by Network ===
compGO_BP@compareClusterResult %>%
  filter(p.adjust < 0.05) %>%
  mutate(Cluster = factor(Cluster, levels = cluster_order)) %>%
  group_by(Cluster) %>%
  arrange(p.adjust) %>%
  slice_head(n = 5) %>%
  dplyr::select(Cluster, Description, Count, p.adjust) %>%
  print(n = 25)
## # A tibble: 9 × 4
## # Groups:   Cluster [4]
##   Cluster    Description                                        Count p.adjust
##   <fct>      <chr>                                              <int>    <dbl>
## 1 flank.Up   protein folding                                        2 0.0382  
## 2 flank.Down plant-type ovary development                           2 0.000834
## 3 flank.Down tRNA aminoacylation for protein translation            2 0.0144  
## 4 Inv4m.Up   protein complex oligomerization                        2 0.000420
## 5 Inv4m.Down negative regulation of gene expression, epigenetic     2 0.000217
## 6 Inv4m.Down acetyl-CoA metabolic process                           2 0.000836
## 7 Inv4m.Down positive regulation of growth                          2 0.00127 
## 8 Inv4m.Down carboxylic acid catabolic process                      2 0.0126  
## 9 Inv4m.Down cellular amide metabolic process                       2 0.0357

Export Results

Purpose: Save enrichment results and visualizations.

# Export full enrichment results
write.csv(
  compGO_BP@compareClusterResult,
  file = file.path(paths$intermediate, "GO_enrichment_trans_network.csv"),
  row.names = FALSE
)

# Export gene-term associations with symbols
gene_term_associations <- to_plot %>%
  separate_longer_delim(geneID, delim = "/") %>%
  dplyr::rename(gene = geneID) %>%
  left_join(effects_symbol, relationship = "many-to-many") %>%
  dplyr::select(Cluster, gene, SYMBOL, locus_name, ID, Description,
         p.adjust, Count, logFC, adj.P.Val)

write.csv(
  gene_term_associations,
  file = file.path(paths$intermediate, "GO_gene_term_associations.csv"),
  row.names = FALSE
)

# Save figures
ggsave(
  annotation_plot,
  file = file.path(paths$figures, "GO_trans_network_annotated.svg"),
  height = 5,
  width = 14
)

ggsave(
  annotation_plot,
  file = file.path(paths$figures, "GO_trans_network_annotated.pdf"),
  height = 5,
  width = 14
)

cat("\nFiles exported:\n")
## 
## Files exported:
cat("  - GO_enrichment_trans_network.csv\n")
##   - GO_enrichment_trans_network.csv
cat("  - GO_gene_term_associations.csv\n")
##   - GO_gene_term_associations.csv
cat("  - GO_trans_network_annotated.svg\n")
##   - GO_trans_network_annotated.svg
cat("  - GO_trans_network_annotated.pdf\n")
##   - GO_trans_network_annotated.pdf

Session Information

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] AnnotationDbi_1.72.0   IRanges_2.44.0         S4Vectors_0.48.0      
##  [4] Biobase_2.70.0         rrvgo_1.22.0           AnnotationHub_4.0.0   
##  [7] BiocFileCache_3.0.0    dbplyr_2.5.1           BiocGenerics_0.56.0   
## [10] generics_0.1.4         DOSE_4.4.0             forcats_1.0.1         
## [13] clusterProfiler_4.18.4 stringr_1.6.0          ggplot2_4.0.1         
## [16] tidyr_1.3.2            dplyr_1.1.4            here_1.0.2            
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      jsonlite_2.0.0          umap_0.2.10.0          
##   [4] tidydr_0.0.6            magrittr_2.0.4          ggtangle_0.0.9         
##   [7] farver_2.1.2            rmarkdown_2.30          ragg_1.5.0             
##  [10] fs_1.6.6                vctrs_0.6.5             memoise_2.0.1          
##  [13] askpass_1.2.1           ggtree_4.0.3            htmltools_0.5.9        
##  [16] curl_7.0.0              gridGraphics_0.5-1      sass_0.4.10            
##  [19] bslib_0.9.0             htmlwidgets_1.6.4       plyr_1.8.9             
##  [22] httr2_1.2.2             cachem_1.1.0            igraph_2.2.1           
##  [25] mime_0.13               lifecycle_1.0.4         pkgconfig_2.0.3        
##  [28] Matrix_1.7-4            R6_2.6.1                fastmap_1.2.0          
##  [31] gson_0.1.0              shiny_1.12.1            digest_0.6.39          
##  [34] aplot_0.2.9             enrichplot_1.30.4       colorspace_2.1-2       
##  [37] ggnewscale_0.5.2        patchwork_1.3.2         rprojroot_2.1.1        
##  [40] RSpectra_0.16-2         textshaping_1.0.4       RSQLite_2.4.5          
##  [43] labeling_0.4.3          filelock_1.0.3          httr_1.4.7             
##  [46] polyclip_1.10-7         compiler_4.5.2          bit64_4.6.0-1          
##  [49] fontquiver_0.2.1        withr_3.0.2             S7_0.2.1               
##  [52] BiocParallel_1.44.0     DBI_1.2.3               ggforce_0.5.0          
##  [55] R.utils_2.13.0          MASS_7.3-65             openssl_2.3.4          
##  [58] rappdirs_0.3.3          tools_4.5.2             otel_0.2.0             
##  [61] ape_5.8-1               scatterpie_0.2.6        httpuv_1.6.16          
##  [64] R.oo_1.27.1             glue_1.8.0              promises_1.5.0         
##  [67] nlme_3.1-168            GOSemSim_2.36.0         grid_4.5.2             
##  [70] gridBase_0.4-7          cluster_2.1.8.1         reshape2_1.4.5         
##  [73] fgsea_1.36.0            gtable_0.3.6            R.methodsS3_1.8.2      
##  [76] data.table_1.18.0       utf8_1.2.6              xml2_1.5.1             
##  [79] XVector_0.50.0          ggrepel_0.9.6           BiocVersion_3.22.0     
##  [82] pillar_1.11.1           yulab.utils_0.2.3       later_1.4.4            
##  [85] splines_4.5.2           tweenr_2.0.3            treeio_1.34.0          
##  [88] lattice_0.22-7          bit_4.6.0               tidyselect_1.2.1       
##  [91] fontLiberation_0.1.0    GO.db_3.22.0            tm_0.7-16              
##  [94] Biostrings_2.78.0       knitr_1.51              fontBitstreamVera_0.1.1
##  [97] NLP_0.3-2               Seqinfo_1.0.0           svglite_2.2.2          
## [100] xfun_0.55               pheatmap_1.0.13         stringi_1.8.7          
## [103] lazyeval_0.2.2          ggfun_0.2.0             yaml_2.3.12            
## [106] evaluate_1.0.5          codetools_0.2-20        wordcloud_2.6          
## [109] gdtools_0.4.4           tibble_3.3.0            qvalue_2.42.0          
## [112] BiocManager_1.30.27     ggplotify_0.1.3         cli_3.6.5              
## [115] reticulate_1.44.1       xtable_1.8-4            systemfonts_1.3.1      
## [118] jquerylib_0.1.4         treemap_2.4-4           Rcpp_1.1.0             
## [121] png_0.1-8               parallel_4.5.2          blob_1.2.4             
## [124] slam_0.1-55             tidytree_0.4.6          ggiraph_0.9.2          
## [127] scales_1.4.0            purrr_1.2.0             crayon_1.5.3           
## [130] rlang_1.1.6             cowplot_1.2.0           fastmatch_1.1-6        
## [133] KEGGREST_1.50.0