Overview

Purpose: Generate volcano plot for Inv4m genotype effect on differential gene expression, highlighting genes selected by Mahalanobis distance outlier detection.

Input: CSV file containing differential expression results with Mahalanobis outlier identification (is_selected_DEG column) from the inversion paper DE analysis.

Output: - Inv4m volcano plot (Panel C for Figure 3) - Standalone volcano figure for manuscript


Setup

Load Libraries

library(dplyr)
library(ggplot2)
library(ggrepel)
library(ggpubr)

Data Import and Filtering

Purpose: Load differential expression results and filter to Inv4m genotype effect.

Approach: Import CSV and subset to Inv4m genotype effects. The input contains pre-calculated Mahalanobis distances and outlier flags from upstream analysis.

Expected outcome: Filtered dataset with Inv4m predictor, retaining all columns including is_selected_DEG for gene labeling.

predictor_order <- "Inv4m"

# Load DE results with Mahalanobis outlier detection
effects_df <- read.csv(file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv")) %>%
  filter(predictor %in% predictor_order) %>%
  droplevels()

# Set predictor factor

effects_df$predictor <- factor(effects_df$predictor, levels = predictor_order)

# Diagnostics
cat("Predictors in dataset (ordered):", levels(effects_df$predictor), "\n")
## Predictors in dataset (ordered): Inv4m
cat("Total genes:", nrow(effects_df), "\n")
## Total genes: 24011
cat("Genes per predictor:\n")
## Genes per predictor:
print(table(effects_df$predictor))
## 
## Inv4m 
## 24011

Gene Selection for Labeling

Purpose: Identify top genes to label in volcano plots based on Mahalanobis distance ranking.

Approach: For each predictor and regulation direction, select the top N genes (default N=10) that are marked as is_selected_DEG and have locus_label annotation in selected_DEGs_curated_locus_label.csv. Load curated labels from the external file, coalesce with existing labels, then filter for genes with valid annotations to label in the plot.

Expected outcome: Data frame of genes to label, with custom label text from curated annotations.

# Load curated labels from intermediate results
curated <- read.csv(file.path(paths$intermediate, "selected_DEGs_leaf_interaction_model.csv")) %>%
  select(
    regulation,
    predictor,
    gene,
    locus_symbol,
    locus_label,
    desc_merged,
    mahalanobis,
    logFC,
    neglogP
  )

# Set predictor order for curated data
curated$predictor <- factor(curated$predictor, levels = predictor_order)

# Define number of labels per predictor and regulation
label_counts <- data.frame(
  predictor = c("Inv4m", "Inv4m"),
  regulation = c("Downregulated", "Upregulated"),
  n_labels = c(15, 15),
  stringsAsFactors = FALSE
)

# Select genes for labeling based on specified counts
to_plot <- curated %>%
  ungroup() %>%
  filter(!is.na(locus_label) & locus_label != "") %>%
  arrange(regulation, predictor, -mahalanobis) %>%
  left_join(label_counts, by = c("predictor", "regulation")) %>%
  group_by(regulation, predictor) %>%
  mutate(row_num = row_number()) %>%
  filter(row_num <= n_labels) %>%
  select(-row_num, -n_labels) %>%
  arrange(-mahalanobis) %>%
  ungroup() %>%
  mutate(predictor = factor(predictor, levels = predictor_order)) %>%  
  data.frame()

# Diagnostics
cat("Total curated genes:", nrow(curated), "\n")
## Total curated genes: 118
cat("Genes with locus_label:", sum(!is.na(curated$locus_label) & curated$locus_label != ""), "\n")
## Genes with locus_label: 69
cat("\nSelected DEGs by regulation and predictor:\n")
## 
## Selected DEGs by regulation and predictor:
with(curated, print(table(regulation, predictor)))
##                predictor
## regulation      Inv4m
##   Downregulated    11
##   Upregulated      14
cat("\nGenes selected for labeling:", nrow(to_plot), "\n")
## 
## Genes selected for labeling: 19
cat("Labels per predictor/regulation:\n")
## Labels per predictor/regulation:
with(to_plot, print(table(regulation, predictor)))
##                predictor
## regulation      Inv4m
##   Downregulated     7
##   Upregulated      12
# Show configured label counts
cat("\nConfigured label counts:\n")
## 
## Configured label counts:
print(label_counts)
##   predictor    regulation n_labels
## 1     Inv4m Downregulated       15
## 2     Inv4m   Upregulated       15

Visualization

Inv4m Volcano Plot

Purpose: Create volcano plot showing logFC vs. FDR for Inv4m genotype effect.

Approach: Use ggplot with: - Points colored by regulation status (down/below threshold/up) - Text labels for top Mahalanobis-selected genes - Expression formatting for axis labels

Expected outcome: Single-panel figure saved as Panel C for Figure 3.

# Custom formatting function for y-axis
scaleFUN <- function(x) sprintf("%.0f", x)

# Set regulation based on high-confidence DEG flag
effects_df$regulation[!effects_df$is_hiconf_DEG] <- "Unregulated"

# Generate Inv4m volcano plot
volcano_inv4m <- effects_df %>%
  ggplot(aes(x = logFC, y = neglogP, col = regulation)) +
  ylab(expression(-log[10](italic(FDR)))) +
  xlab(expression(log[2]("Fold Change"))) +
  scale_y_continuous(labels = scaleFUN, expand = expansion(mult = c(0, 0.1))) +
  geom_point(alpha = 0.3, size = 1) +
  geom_text_repel(
    data = to_plot,
    aes(label = locus_label),
    color = "black",
    fontface = "bold.italic",
    segment.color = "grey",
    force = 2,
    size = 5,
    min.segment.length = 0,
    max.overlaps = 50
  ) +
  scale_color_manual(
    name = "",
    values = c("#00AFBB", "grey", "#bb0c00"),
    labels = c("Downregulated", "Below threshold", "Upregulated")
  ) +
  guides(color = guide_legend(
    reverse = TRUE,
    override.aes = list(size = 5)
  )) +
  theme_classic2(base_size = 20) +
  theme(
    plot.title = element_blank(),
    legend.position = c(0.85, 0.90),
    legend.background = element_rect(fill = "transparent"),
    legend.spacing = unit(0, "line"),
    legend.box.spacing = unit(0, "line"),
    legend.text = element_text(size = 14),
    legend.direction = "vertical"
  )

# Save as RDS for assembly
saveRDS(volcano_inv4m, file.path(paths$intermediate, "volcano_inv4m.rds"))

# Save as Panel C for Figure 3
ggsave(
  file.path(paths$figures, "panels", "panel_C_volcano.png"),
  plot = volcano_inv4m,
  width = 6, height = 6, dpi = 300, bg = "white"
)

# Also save standalone figure
ggsave(
  file.path(paths$figures, "volcano_inv4m.png"),
  plot = volcano_inv4m,
  width = 8, height = 8, dpi = 150, bg = "white"
)

volcano_inv4m

Summary

cat("Inv4m Volcano Plot Analysis Complete\n")
## Inv4m Volcano Plot Analysis Complete
cat("=====================================\n")
## =====================================
cat("Total genes analyzed:", nrow(effects_df), "\n")
## Total genes analyzed: 24011
cat("Genes labeled in plot:", nrow(to_plot), "\n")
## Genes labeled in plot: 19
cat("\nOutput files:\n")
## 
## Output files:
cat("  - Panel C: panels/panel_C_volcano.png\n")
##   - Panel C: panels/panel_C_volcano.png
cat("  - Standalone: volcano_inv4m.png\n")
##   - Standalone: volcano_inv4m.png

Session Information

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggpubr_0.6.2  ggrepel_0.9.6 ggplot2_4.0.1 dplyr_1.1.4   here_1.0.2   
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     tidyr_1.3.2        rstatix_0.7.3     
##  [5] digest_0.6.39      magrittr_2.0.4     evaluate_1.0.5     grid_4.5.2        
##  [9] RColorBrewer_1.1-3 fastmap_1.2.0      rprojroot_2.1.1    jsonlite_2.0.0    
## [13] backports_1.5.0    Formula_1.2-5      purrr_1.2.0        scales_1.4.0      
## [17] textshaping_1.0.4  jquerylib_0.1.4    abind_1.4-8        cli_3.6.5         
## [21] rlang_1.1.6        withr_3.0.2        cachem_1.1.0       yaml_2.3.12       
## [25] otel_0.2.0         tools_4.5.2        ggsignif_0.6.4     broom_1.0.11      
## [29] vctrs_0.6.5        R6_2.6.1           lifecycle_1.0.4    car_3.1-3         
## [33] ragg_1.5.0         pkgconfig_2.0.3    pillar_1.11.1      bslib_0.9.0       
## [37] gtable_0.3.6       glue_1.8.0         Rcpp_1.1.0         systemfonts_1.3.1 
## [41] xfun_0.55          tibble_3.3.0       tidyselect_1.2.1   knitr_1.51        
## [45] farver_2.1.2       htmltools_0.5.9    labeling_0.4.3     rmarkdown_2.30    
## [49] carData_3.0-5      compiler_4.5.2     S7_0.2.1