Purpose: Perform Gene Ontology enrichment analysis on the validated trans-regulatory network to identify biological processes associated with genes regulated by Inv4m versus flanking regions.
Approach: We partition network genes based on the strongest regulator criterion:
This approach recovers trans targets that would otherwise be excluded due to any connection to flanking regulators, while ensuring genes are assigned to the network of their dominant regulatory influence.
Expected outcomes:
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(clusterProfiler)
library(forcats)
library(DOSE)
library(AnnotationHub)
library(rrvgo)
Purpose: Load gene lists and edges defining trans-regulatory network structure.
trans_regulators <- read.csv(
file.path(paths$intermediate, "trans_network_regulators.csv")
)
trans_targets <- read.csv(
file.path(paths$intermediate, "trans_network_targets.csv")
)
trans_network_all <- read.csv(
file.path(paths$intermediate, "trans_network_all_genes.csv")
)
trans_edges <- read.csv(
file.path(paths$intermediate, "trans_network_edges_annotated.csv")
)
cat("Network composition:\n")
## Network composition:
cat(" Regulators:", nrow(trans_regulators), "\n")
## Regulators: 103
cat(" Targets:", nrow(trans_targets), "\n")
## Targets: 32
cat(" Total genes:", nrow(trans_network_all), "\n")
## Total genes: 135
cat(" Edges:", nrow(trans_edges), "\n")
## Edges: 3262
Purpose: Define universe of analyzed genes and get expression direction.
effects_df <- read.csv(
file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv")
) %>%
filter(predictor == "Inv4m") %>%
group_by(gene) %>%
arrange(adj.P.Val) %>%
dplyr::slice(1) %>%
ungroup()
background <- effects_df %>%
pull(gene) %>%
sort() %>%
unique()
cat("Background genes:", length(background), "\n")
## Background genes: 24011
cat("DEGs:", sum(effects_df$is_DEG), "\n")
## DEGs: 465
Purpose: Load maize B73 NAM5 GO annotations from PANNZER.
TERM2NAME <- readRDS(file.path(paths$data, "TERM2NAME.rds"))
TERM2GENE <- readRDS(file.path(paths$data, "TERM2GENE_PANNZER_full.rds"))
cat("GO annotation loaded:\n")
## GO annotation loaded:
cat(" Biological Process terms:",
length(unique(TERM2NAME$BP$go_id)), "\n")
## Biological Process terms: 29211
cat(" Gene-term associations:", nrow(TERM2GENE), "\n")
## Gene-term associations: 1103596
Purpose: Assign genes to flanking or Inv4m networks based on the strongest regulatory connection.
Logic:
# Flanking network: all genes connected by flanking-regulated edges
flanking_network_genes <- trans_edges %>%
filter(regulator_region == "flanking") %>%
{union(.$from, .$to)} %>%
sort() %>%
unique()
# Identify regulators by region
inv4m_regulators <- trans_edges %>%
filter(regulator_region == "inv4m") %>%
pull(from) %>%
unique()
flanking_regulators <- trans_edges %>%
filter(regulator_region == "flanking") %>%
pull(from) %>%
unique()
# For each target, find strongest correlated regulator
strongest_edge_per_target <- trans_edges %>%
mutate(abs_r = abs(r)) %>%
group_by(to) %>%
slice_max(abs_r, n = 1, with_ties = FALSE) %>%
ungroup()
# Targets whose strongest regulator is NOT in flanking region
targets_strongest_nonflanking <- strongest_edge_per_target %>%
filter(regulator_region != "flanking") %>%
pull(to)
# Inv4m edges to these targets
edges_inv4m_dominant <- trans_edges %>%
filter(
to %in% targets_strongest_nonflanking,
regulator_region == "inv4m"
)
# Inv4m network: genes from edges where Inv4m is dominant regulator
inv4m_network_genes <- union(
edges_inv4m_dominant$from,
edges_inv4m_dominant$to
) %>%
sort() %>%
unique()
cat("Network partitioning (strongest regulator criterion):\n")
## Network partitioning (strongest regulator criterion):
cat(" Flanking regulators:", length(flanking_regulators), "\n")
## Flanking regulators: 70
cat(" Inv4m regulators:", length(inv4m_regulators), "\n")
## Inv4m regulators: 33
cat(" Flanking network:", length(flanking_network_genes), "genes\n")
## Flanking network: 102 genes
cat(" Inv4m network:", length(inv4m_network_genes), "genes\n")
## Inv4m network: 50 genes
cat(" Targets with Inv4m as strongest:", length(targets_strongest_nonflanking), "\n")
## Targets with Inv4m as strongest: 17
Purpose: Split network genes by direction of Inv4m effect for enrichment.
# Get logFC for Inv4m network genes
inv4m_genes_with_logfc <- effects_df %>%
filter(gene %in% inv4m_network_genes) %>%
group_by(gene) %>%
slice(1) %>%
ungroup()
# Define gene sets by network and direction
network_gene_sets <- list(
`flank.Up` = trans_network_all %>%
filter(gene %in% flanking_network_genes, logFC > 0) %>%
pull(gene),
`flank.Down` = trans_network_all %>%
filter(gene %in% flanking_network_genes, logFC < 0) %>%
pull(gene),
`Inv4m.Up` = inv4m_genes_with_logfc %>%
filter(logFC > 0) %>%
pull(gene),
`Inv4m.Down` = inv4m_genes_with_logfc %>%
filter(logFC < 0) %>%
pull(gene)
)
n_genes <- sapply(network_gene_sets, length)
cat("\nGene set sizes:\n")
##
## Gene set sizes:
print(n_genes)
## flank.Up flank.Down Inv4m.Up Inv4m.Down
## 54 48 22 28
cat("\nFlanking total:", n_genes["flank.Up"] + n_genes["flank.Down"], "\n")
##
## Flanking total: 102
cat("Inv4m total:", n_genes["Inv4m.Up"] + n_genes["Inv4m.Down"], "\n")
## Inv4m total: 50
Purpose: Wrapper for clusterProfiler enrichment with consistent parameters.
#' Perform GO enrichment analysis
#'
#' @param x Character vector of gene IDs
#' @param ontology GO ontology: "BP", "MF", or "CC"
#' @param universe Character vector of background genes
#' @return enrichResult object from clusterProfiler
ego <- function(x, ontology = "BP", universe = NULL) {
enricher(
gene = x,
pvalueCutoff = 0.05,
pAdjustMethod = "none",
universe = universe,
minGSSize = 10,
maxGSSize = 500,
qvalueCutoff = 0.2,
TERM2GENE = TERM2GENE[TERM2GENE$GO %in% TERM2NAME[[ontology]]$go_id, ],
TERM2NAME = TERM2NAME[[ontology]]
)
}
Purpose: Identify enriched GO Biological Process terms across network subsets.
compGO_BP <- compareCluster(
geneCluster = network_gene_sets,
fun = function(x) {
ego(x, ontology = "BP", universe = background)
}
)
cat("RAW enrichment results by cluster:\n")
## RAW enrichment results by cluster:
print(table(compGO_BP@compareClusterResult$Cluster))
##
## flank.Up flank.Down Inv4m.Up Inv4m.Down
## 29 34 48 47
Purpose: Use rrvgo semantic similarity to reduce redundant GO terms.
# Load maize annotation for semantic similarity
ah <- AnnotationHub()
org.Zm.eg.db <- ah[["AH119718"]]
# Filter low-count terms
filtered_result <- compGO_BP@compareClusterResult %>%
filter(Count > 1) %>%
group_by(Cluster, geneID) %>%
arrange(p.adjust, .by_group = TRUE) %>%
ungroup()
# Calculate semantic similarity matrix
go_terms <- unique(compGO_BP@compareClusterResult$ID)
if (length(go_terms) > 1) {
simMatrix <- calculateSimMatrix(go_terms,
orgdb = org.Zm.eg.db,
ont = "BP",
method = "Rel"
)
# Reduce to representative terms
scores <- setNames(-log10(compGO_BP@compareClusterResult$p.adjust),
compGO_BP@compareClusterResult$ID)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold = 0.7,
orgdb = org.Zm.eg.db
)
# Keep parent terms + growth (GO:0040007) which includes JMJ genes
compGO_BP@compareClusterResult <- filtered_result %>%
filter(ID %in% c(reducedTerms$parent, "GO:0040007"))
}
cat("\nFILTERED enrichment results by cluster:\n")
##
## FILTERED enrichment results by cluster:
print(table(compGO_BP@compareClusterResult$Cluster))
##
## flank.Up flank.Down Inv4m.Up Inv4m.Down
## 1 2 1 7
Purpose: Focus on GO terms containing genes with characterized symbols.
effects_symbol <- effects_df %>%
filter(is_DEG) %>%
dplyr::select(gene, SYMBOL = locus_symbol, locus_label, locus_name,
logFC, adj.P.Val) %>%
filter(!is.na(SYMBOL) & !is.na(locus_name)) %>%
mutate(neglogP = -log10(adj.P.Val))
with_known_gene <- compGO_BP@compareClusterResult %>%
separate_longer_delim(geneID, delim = "/") %>%
dplyr::rename(gene = geneID) %>%
inner_join(
effects_symbol %>%
dplyr::select(gene, SYMBOL, locus_name),
relationship = "many-to-many"
) %>%
pull(ID) %>%
sort() %>%
unique()
cat("GO terms with characterized genes:", length(with_known_gene), "\n")
## GO terms with characterized genes: 8
Purpose: Select top 5 terms per cluster and identify representative genes.
cluster_order <- c("flank.Up", "flank.Down", "Inv4m.Up", "Inv4m.Down")
to_plot <- compGO_BP@compareClusterResult %>%
filter(p.adjust < 0.05) %>%
mutate(
neglogP = -log10(p.adjust),
Cluster = factor(Cluster, levels = cluster_order)
) %>%
group_by(Cluster) %>%
arrange(p.adjust) %>%
slice_head(n = 5) %>%
mutate(
label_order = 1:n(),
Description = fct_reorder(Description, label_order)
) %>%
ungroup()
# Get gene labels for each term (most significant DEG per term)
term2symbol <- to_plot %>%
separate_longer_delim(geneID, delim = "/") %>%
dplyr::rename(gene = geneID) %>%
inner_join(effects_symbol, relationship = "many-to-many", by = "gene") %>%
dplyr::select(Cluster, gene, SYMBOL, locus_label, ID, Description,
p.adjust, Count, logFC, adj.P.Val) %>%
distinct() %>%
group_by(Cluster, ID) %>%
arrange(Cluster, ID, adj.P.Val) %>%
dplyr::slice(1) %>%
ungroup() %>%
mutate(gene_label = ifelse(
!is.na(locus_label) & locus_label != "",
locus_label,
SYMBOL
))
cat("\nPlot summary:\n")
##
## Plot summary:
cat(" GO terms to plot:", nrow(to_plot), "\n")
## GO terms to plot: 9
cat(" Terms with gene labels:", nrow(term2symbol), "\n")
## Terms with gene labels: 7
Purpose: Visualize enriched GO terms with representative gene symbols.
# Create facet labels with gene counts
labels <- paste0(
gsub(".", " ", cluster_order, fixed = TRUE),
"\n(", n_genes[cluster_order], ")"
)
names(labels) <- cluster_order
# Build bubble plot
bg <- to_plot %>%
ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
ggtitle("GO Biological Process Enrichment\nInv4m Trans Co-expression Network") +
scale_fill_gradient(
low = "dodgerblue",
high = "tomato",
name = "-log10(p)",
limits = c(1, max(to_plot$neglogP, na.rm = TRUE)),
guide = guide_colorbar()
) +
scale_y_discrete(
labels = scales::wrap_format(60),
limits = rev(levels(to_plot$Description))
) +
scale_x_discrete(
labels = labels,
drop = FALSE,
expand = expansion(add = c(0.5, 1))
) +
geom_point(shape = 21) +
scale_size(range = c(6, 8), breaks = 2:4) +
ylab(NULL) +
xlab(NULL) +
DOSE::theme_dose(font.size = 16) +
theme(
plot.title = element_text(face = "bold", size = 20),
legend.position = "right",
axis.text.x = element_text(vjust = 0.5, angle = 0, hjust = 0.5)
)
# Add gene annotations
annotation_plot <- bg +
geom_text(
data = term2symbol,
position = position_nudge(0.15),
hjust = 0,
vjust = 0.5,
fontface = "italic",
size = 5,
mapping = aes(x = Cluster, y = Description, label = gene_label),
inherit.aes = FALSE
)
print(annotation_plot)
Purpose: Tabular summary of top enriched terms per network.
cat("\n=== Top GO Terms by Network ===\n")
##
## === Top GO Terms by Network ===
compGO_BP@compareClusterResult %>%
filter(p.adjust < 0.05) %>%
mutate(Cluster = factor(Cluster, levels = cluster_order)) %>%
group_by(Cluster) %>%
arrange(p.adjust) %>%
slice_head(n = 5) %>%
dplyr::select(Cluster, Description, Count, p.adjust) %>%
print(n = 25)
## # A tibble: 9 × 4
## # Groups: Cluster [4]
## Cluster Description Count p.adjust
## <fct> <chr> <int> <dbl>
## 1 flank.Up protein folding 2 0.0382
## 2 flank.Down plant-type ovary development 2 0.000834
## 3 flank.Down tRNA aminoacylation for protein translation 2 0.0144
## 4 Inv4m.Up protein complex oligomerization 2 0.000420
## 5 Inv4m.Down negative regulation of gene expression, epigenetic 2 0.000217
## 6 Inv4m.Down acetyl-CoA metabolic process 2 0.000836
## 7 Inv4m.Down positive regulation of growth 2 0.00127
## 8 Inv4m.Down carboxylic acid catabolic process 2 0.0126
## 9 Inv4m.Down cellular amide metabolic process 2 0.0357
Purpose: Save enrichment results and visualizations.
# Export full enrichment results
write.csv(
compGO_BP@compareClusterResult,
file = file.path(paths$intermediate, "GO_enrichment_trans_network.csv"),
row.names = FALSE
)
# Export gene-term associations with symbols
gene_term_associations <- to_plot %>%
separate_longer_delim(geneID, delim = "/") %>%
dplyr::rename(gene = geneID) %>%
left_join(effects_symbol, relationship = "many-to-many") %>%
dplyr::select(Cluster, gene, SYMBOL, locus_name, ID, Description,
p.adjust, Count, logFC, adj.P.Val)
write.csv(
gene_term_associations,
file = file.path(paths$intermediate, "GO_gene_term_associations.csv"),
row.names = FALSE
)
# Save figures
ggsave(
annotation_plot,
file = file.path(paths$figures, "GO_trans_network_annotated.svg"),
height = 5,
width = 14
)
ggsave(
annotation_plot,
file = file.path(paths$figures, "GO_trans_network_annotated.pdf"),
height = 5,
width = 14
)
cat("\nFiles exported:\n")
##
## Files exported:
cat(" - GO_enrichment_trans_network.csv\n")
## - GO_enrichment_trans_network.csv
cat(" - GO_gene_term_associations.csv\n")
## - GO_gene_term_associations.csv
cat(" - GO_trans_network_annotated.svg\n")
## - GO_trans_network_annotated.svg
cat(" - GO_trans_network_annotated.pdf\n")
## - GO_trans_network_annotated.pdf
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] AnnotationDbi_1.72.0 IRanges_2.44.0 S4Vectors_0.48.0
## [4] Biobase_2.70.0 rrvgo_1.22.0 AnnotationHub_4.0.0
## [7] BiocFileCache_3.0.0 dbplyr_2.5.1 BiocGenerics_0.56.0
## [10] generics_0.1.4 DOSE_4.4.0 forcats_1.0.1
## [13] clusterProfiler_4.18.4 stringr_1.6.0 ggplot2_4.0.1
## [16] tidyr_1.3.2 dplyr_1.1.4 here_1.0.2
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 jsonlite_2.0.0 umap_0.2.10.0
## [4] tidydr_0.0.6 magrittr_2.0.4 ggtangle_0.0.9
## [7] farver_2.1.2 rmarkdown_2.30 ragg_1.5.0
## [10] fs_1.6.6 vctrs_0.6.5 memoise_2.0.1
## [13] askpass_1.2.1 ggtree_4.0.3 htmltools_0.5.9
## [16] curl_7.0.0 gridGraphics_0.5-1 sass_0.4.10
## [19] bslib_0.9.0 htmlwidgets_1.6.4 plyr_1.8.9
## [22] httr2_1.2.2 cachem_1.1.0 igraph_2.2.1
## [25] mime_0.13 lifecycle_1.0.4 pkgconfig_2.0.3
## [28] Matrix_1.7-4 R6_2.6.1 fastmap_1.2.0
## [31] gson_0.1.0 shiny_1.12.1 digest_0.6.39
## [34] aplot_0.2.9 enrichplot_1.30.4 colorspace_2.1-2
## [37] ggnewscale_0.5.2 patchwork_1.3.2 rprojroot_2.1.1
## [40] RSpectra_0.16-2 textshaping_1.0.4 RSQLite_2.4.5
## [43] labeling_0.4.3 filelock_1.0.3 httr_1.4.7
## [46] polyclip_1.10-7 compiler_4.5.2 bit64_4.6.0-1
## [49] fontquiver_0.2.1 withr_3.0.2 S7_0.2.1
## [52] BiocParallel_1.44.0 DBI_1.2.3 ggforce_0.5.0
## [55] R.utils_2.13.0 MASS_7.3-65 openssl_2.3.4
## [58] rappdirs_0.3.3 tools_4.5.2 otel_0.2.0
## [61] ape_5.8-1 scatterpie_0.2.6 httpuv_1.6.16
## [64] R.oo_1.27.1 glue_1.8.0 promises_1.5.0
## [67] nlme_3.1-168 GOSemSim_2.36.0 grid_4.5.2
## [70] gridBase_0.4-7 cluster_2.1.8.1 reshape2_1.4.5
## [73] fgsea_1.36.0 gtable_0.3.6 R.methodsS3_1.8.2
## [76] data.table_1.18.0 utf8_1.2.6 xml2_1.5.1
## [79] XVector_0.50.0 ggrepel_0.9.6 BiocVersion_3.22.0
## [82] pillar_1.11.1 yulab.utils_0.2.3 later_1.4.4
## [85] splines_4.5.2 tweenr_2.0.3 treeio_1.34.0
## [88] lattice_0.22-7 bit_4.6.0 tidyselect_1.2.1
## [91] fontLiberation_0.1.0 GO.db_3.22.0 tm_0.7-16
## [94] Biostrings_2.78.0 knitr_1.51 fontBitstreamVera_0.1.1
## [97] NLP_0.3-2 Seqinfo_1.0.0 svglite_2.2.2
## [100] xfun_0.55 pheatmap_1.0.13 stringi_1.8.7
## [103] lazyeval_0.2.2 ggfun_0.2.0 yaml_2.3.12
## [106] evaluate_1.0.5 codetools_0.2-20 wordcloud_2.6
## [109] gdtools_0.4.4 tibble_3.3.0 qvalue_2.42.0
## [112] BiocManager_1.30.27 ggplotify_0.1.3 cli_3.6.5
## [115] reticulate_1.44.1 xtable_1.8-4 systemfonts_1.3.1
## [118] jquerylib_0.1.4 treemap_2.4-4 Rcpp_1.1.0
## [121] png_0.1-8 parallel_4.5.2 blob_1.2.4
## [124] slam_0.1-55 tidytree_0.4.6 ggiraph_0.9.2
## [127] scales_1.4.0 purrr_1.2.0 crayon_1.5.3
## [130] rlang_1.1.6 cowplot_1.2.0 fastmatch_1.1-6
## [133] KEGGREST_1.50.0