Expression Manhattan Plots using fastman

library(here)

## here() starts at /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(scales)
library(ggplot2)
library(rtracklayer)

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: generics

## 
## Attaching package: 'generics'

## The following object is masked from 'package:dplyr':
## 
##     explain

## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union

## 
## Attaching package: 'BiocGenerics'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## Loading required package: Seqinfo

library(GenomicRanges)
library(fastman)
library(RAINBOWR)
library(cowplot)
library(ggpubr)

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:cowplot':
## 
##     get_legend

library(ggtext)
library(ggrepel)

# Setup project paths
source(here("scripts", "utils", "setup_paths.R"))
paths <- setup_project_paths("inversion_paper")

# Load Maize Reference Genome
myGFF <- file.path(paths$data, "Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3")
B73 <- rtracklayer::import(myGFF) %>%
  subset(type == "gene" & seqnames %in% 1:10)

genes <- as.data.frame(B73)
genes$ID <- gsub("gene:", "", genes$ID)

# Inv4m boundaries based on gene coordinates
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194780", "end"]

# Create GRanges objects
inv4m <- GRanges(
  seqnames = "4",
  ranges = IRanges(start = inv4m_start, end = inv4m_end),
  strand = "+"
)

shared_introgression <- GRanges(
  seqnames = "4",
  ranges = IRanges(start = 157012149, end = 195900523),
  strand = "+"
)

# Calculate limits in Mb
introgression_limit <- round(c(157012149, 195900523) / 1000000, 2)
inversion_limit <- round(c(inv4m_start, inv4m_end) / 1000000, 2)

# Find genes overlapping inv4m
olap <- findOverlaps(inv4m, B73)
inv4m_gene_ids <- genes$ID[subjectHits(olap)]

# Find genes in shared introgression
olap2 <- findOverlaps(shared_introgression, B73)
shared_introgression_gene_ids <- genes$ID[subjectHits(olap2)]

# Find flanking region genes (in introgression but not in inv4m)
flanking_introgression_gene_ids <- 
  shared_introgression_gene_ids[!(shared_introgression_gene_ids %in% inv4m_gene_ids)]

cat("Genes in inv4m:", length(inv4m_gene_ids), "\n")

## Genes in inv4m: 213

cat("Genes in shared introgression:", length(shared_introgression_gene_ids), "\n")

## Genes in shared introgression: 539

cat("Genes in flanking regions:", length(flanking_introgression_gene_ids), "\n")

## Genes in flanking regions: 326

cat("Inversion size (Mb):", round(inversion_limit[2] - inversion_limit[1], 1), "\n")

## Inversion size (Mb): 15.2

cat("Introgression size (Mb):", round(introgression_limit[2] - introgression_limit[1], 1), "\n")

## Introgression size (Mb): 38.9

effects <- read.csv(file.path(paths$intermediate, "predictor_effects_leaf_interaction_model.csv"))

# Set Bonferroni threshold for multiple testing
bonferroni_threshold <- -log10(0.05 / nrow(effects))

#' Prepare Data for Manhattan Plot
#'
#' Extracts and formats the required columns for fastman_gg plotting
#'
#' @param stats Data frame containing gene-level statistics
#' @param point_label Character, name of column containing locus labels
#' @param uninfomative_labels Character vector, labels to exclude from annotation
#'
#' @return Data frame with columns: gene, SNP, CHR, BP, P, and point_label
get_plot_data <- function(stats,
                          point_label = "locus_symbol",
                          uninfomative_labels = NA) {
  stats <- stats %>%
    mutate(
      SNP = paste0("S", CHR, "_", BP),
      P = P.Value  # Rename for fastman compatibility
    ) %>%
    dplyr::select(gene, SNP, CHR, BP, P, all_of(point_label)) %>%
    distinct() %>%
    arrange(CHR, BP)
  
  # Check if uninfomative_labels has any non-NA values
  if (length(uninfomative_labels) > 0 && any(!is.na(uninfomative_labels))) {
    # Set to empty string so points are plotted but not labeled
    stats[[point_label]][stats[[point_label]] %in% uninfomative_labels] <- ""
  }
  
  stats
}

#' Create Coordinate Transformer for Manhattan Plots
#'
#' Generates a function that transforms chromosome and base pair coordinates
#' to the scaled x-axis coordinates used by fastman_gg. This exactly replicates
#' the internal coordinate transformation of fastman_gg.
#'
#' @param m Data frame with CHR and BP columns
#'
#' @return A function that takes CHR and BP (vectors or scalars) and returns BPn
#'
#' @details The transformation follows these steps:
#'   1. Calculate per-chromosome statistics (min, max, width, median gap)
#'   2. Find maximum gap across all chromosomes
#'   3. Calculate cumulative base positions with gaps between chromosomes
#'   4. Scale so the final midpoint equals the number of chromosomes
#'   5. For each point: BPn = scaled_BP + scaled_base_for_chromosome
#'
#' @examples
#' transform_coords <- get_transformer(to_plot)
#' # Transform specific coordinates on chromosome 4 at 172.88 Mb
#' x_coord <- transform_coords(chr = 4, bp = 172.88e6)
get_transformer <- function(m) {
  
  # Calculate per-chromosome statistics (matching fastman_gg exactly)
  cmat <- m %>%
    arrange(CHR, BP) %>%
    group_by(CHR) %>%
    summarise(
      min_bp = min(BP),
      max_bp = max(BP),
      width = max_bp - min_bp,
      medgap = median(diff(sort(BP))),
      .groups = "drop"
    ) %>%
    arrange(CHR)
  
  # Find max gap across all chromosomes
  maxgap <- max(cmat$medgap, na.rm = TRUE)
  
  # Calculate cumulative bases and midpoints
  numc <- nrow(cmat)
  cmat$base <- 0
  cmat$midp <- 0
  
  cmat$base[1] <- 0
  cmat$midp[1] <- cmat$width[1] / 2
  
  if (numc > 1) {
    for (i in 2:numc) {
      cmat$base[i] <- cmat$base[i - 1] + cmat$width[i - 1] + maxgap
      cmat$midp[i] <- cmat$base[i] + cmat$width[i] / 2
    }
  }
  
  # Calculate scaling factor
  fac <- numc / cmat$midp[numc]
  
  # Scale bases
  cmat$basef <- fac * cmat$base
  
  # Create lookup table
  lookup <- cmat %>%
    select(CHR, min_bp, basef)
  
  # Return transformer function (closure)
  transform_coordinates <- function(chr, bp) {
    # Convert to data frame for joining
    coords <- data.frame(
      CHR = chr,
      BP = bp
    )
    
    # Join with lookup table and calculate BPn
    result <- coords %>%
      left_join(lookup, by = "CHR") %>%
      mutate(
        BP_offset = BP - min_bp,
        BP_scaled = fac * BP_offset,
        BPn = BP_scaled + basef
      ) %>%
      pull(BPn)
    
    result
  }
  
  transform_coordinates
}

#' Add Locus Labels to Manhattan Plot
#'
#' Adds ggrepel text annotations for top SNPs to an existing Manhattan plot
#'
#' @param p ggplot object (Manhattan plot)
#' @param m Data frame with CHR, BP, P, and label column
#' @param transform_coordinates Function that transforms CHR, BP to BPn
#' @param top_n Integer, number of top loci to annotate
#' @param maxnegLogP Maximum -log10(P) value for y-axis positioning
#' @param label_column Character, name of column to use for labels
#' @param text_size Numeric, size of annotation text
#' @param segment_color Character, color of connecting segments
#'
#' @return ggplot object with added annotations
add_locus_labels <- function(p,
                             m,
                             transform_coordinates,
                             top_n,
                             maxnegLogP,
                             label_column = "locus_symbol",
                             text_size = 5,
                             segment_color = "black") {
  
  if (is.null(top_n) || top_n <= 0) {
    return(p)
  }
  
  # Calculate BPn for all points using transformer
  m_with_coords <- m %>%
    mutate(
      BPn = transform_coordinates(CHR, BP),
      logP = -log10(P)
    )
  
  # Get top N SNPs with non-empty labels
  top_snps <- m_with_coords %>%
    filter(!is.na(.data[[label_column]]) & trimws(.data[[label_column]]) != "") %>%
    arrange(P) %>%
    head(top_n)
  
  if (nrow(top_snps) > 0) {
    p <- p +
      ggrepel::geom_text_repel(
        data = top_snps,
        aes(x = BPn, y = logP, label = .data[[label_column]]),
        inherit.aes = FALSE,
        ylim = c(maxnegLogP, NA),
        segment.color = segment_color,
        segment.size = 0.3,
        min.segment.length = 0,
        size = text_size,
        box.padding = 0.5,
        point.padding = 0.3,
        force = 1.5,
        max.overlaps = Inf,
        fontface = "italic"
      )
  }
  
  p
}

#' Create Manhattan Plot using fastman_gg
#'
#' Creates a Manhattan plot with optional ggrepel annotations for top loci
#'
#' @param m Data frame formatted by get_plot_data(), must contain:
#'   - CHR: chromosome number
#'   - BP: base pair position  
#'   - P: p-value
#'   - SNP: SNP identifier
#'   - locus_symbol: gene/locus labels for annotation (optional)
#' @param title Character string for plot title (supports markdown)
#' @param top_n Integer, number of top SNPs to annotate with ggrepel.
#'   Only SNPs with non-empty label values are considered.
#'   If NULL or 0, no annotation is added
#' @param label_column Character, name of column to use for labels
#' @param ... Additional arguments passed to fastman_gg()
#'
#' @return ggplot object with optional annotations
#'
#' @examples
#' # Basic plot
#' plot_manhattan_gg(m = to_plot, title = "My Analysis")
#' 
#' # With annotations
#' plot_manhattan_gg(m = to_plot, title = "My Analysis", top_n = 5)
#' 
#' # Add vertical lines
#' p <- plot_manhattan_gg(m = to_plot, title = "My Analysis", top_n = 3)
#' transform <- get_transformer(to_plot)
#' p + geom_vline(xintercept = transform(4, 172e6), linetype = "dashed")
plot_manhattan_gg <- function(m, 
                              title, 
                              top_n = NULL,
                              label_column = "locus_symbol",
                              ...) {
  
  neglogFDR <- expression(-log[10](italic(FDR)))
  FDR_thresh <- -log10(0.05) 
  maxnegLogP <- -log10(min(m$P[m$P > 0]))
  
  # Get coordinate transformer
  transform_coordinates <- get_transformer(m)
  
  # Create base Manhattan plot
  p <- fastman_gg(
    m = m,
    snp = "SNP",
    maxP = maxnegLogP,
    genomewideline = FDR_thresh,
    suggestiveline = NULL,
    ylab = neglogFDR,
    xlab = "", 
    ...
  ) +
    ggtitle(title) +
    theme_classic(base_size = 20) +
    scale_y_continuous(expand = expansion(mult = c(0.01, 0.25))) +
    theme(
      plot.title = element_markdown(hjust = 1, size = 25, face = "bold"),
      axis.text.x = element_text(angle = 0, hjust = 0.5),
      panel.grid.major.y = element_line(color = "grey90", linewidth = 0.2),
      legend.position = "none"
    )
  
  # Add locus labels if requested
  p <- add_locus_labels(
    p = p,
    m = m,
    transform_coordinates = transform_coordinates,
    top_n = top_n,
    maxnegLogP = maxnegLogP,
    label_column = label_column
  )
  
  p
}

r <- effects %>% filter(predictor == "Leaf")

# Calculate DEG count from data
n.significant <- sum(r$is_DEG)

# Prepare data
to_plot <- get_plot_data(
  r,
  uninfomative_labels = c("umc1690", "umc1286")
)

# Create plot with top 5 SNP annotations
leaf.plot <- plot_manhattan_gg(
  m = to_plot,
  title = paste("Leaf:", n.significant, "DEGs"),
  top_n = 5
)

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

# Save as Panel D for Figure 3
ggsave(
  file.path(paths$figures, "panels", "panel_D_manhattan_leaf.png"),
  plot = leaf.plot,
  width = 8, height = 4, dpi = 300, bg = "white"
)
saveRDS(leaf.plot, file.path(paths$intermediate, "panel_D_manhattan_leaf.rds"))

leaf.plot

r <- effects %>% filter(predictor == "-P")

# Calculate DEG count from data
n.significant <- sum(r$is_DEG)

# Prepare data
to_plot <- get_plot_data(r)

# Create plot with top 5 SNP annotations
lowP.plot <- plot_manhattan_gg(
  m = to_plot, 
  title = paste("-P:", n.significant, "DEGs"),
  top_n = 5
)

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

lowP.plot

r <- effects %>% filter(predictor == "Inv4m")

# Calculate DEG count from data
n.significant <- sum(r$is_DEG)

uninformative <- c("cl19799_1", "cl19648_2a", "cl35478_1d", "IDP4515",
                   "pco086874", "GRMZM2G162007")

to_plot <- get_plot_data(
  r,
  uninfomative_labels = uninformative 
)

# Get transformer for adding vertical lines
convert_coordinates <- get_transformer(to_plot)

inv4m_x_coord <- convert_coordinates(
  c(4, 4),
  c(inv4m_start, inv4m_end)
)

# Create plot with top 5 SNP annotations
inv4m.plot <- plot_manhattan_gg(
  m = to_plot,
  title = paste("<i>Inv4m</i>:", n.significant, "DEGs"),
  top_n = 5
) +
  geom_vline(
    xintercept = inv4m_x_coord,
    linetype = "dashed",
    color = "grey"
  )

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

# Save as Panel E for Figure 3
ggsave(
  file.path(paths$figures, "panels", "panel_E_manhattan_inv4m.png"),
  plot = inv4m.plot,
  width = 8, height = 4, dpi = 300, bg = "white"
)
saveRDS(inv4m.plot, file.path(paths$intermediate, "panel_E_manhattan_inv4m.rds"))

inv4m.plot

r <- effects %>% filter(predictor == "Inv4m:-P")

# Calculate DEG count from data
n.significant <- sum(r$is_DEG)

uninformative <- c("cl19799_1", "cl19648_2a", "cl35478_1d", "IDP4515",
                   "pco086874", "GRMZM2G162007")

to_plot <- get_plot_data(
  r,
  uninfomative_labels = uninformative 
)

# Create plot with top 3 SNP annotations
inv4mxP.plot <- plot_manhattan_gg(
  m = to_plot, 
  title = paste("<i>Inv4m</i> x -P:", n.significant, "DEGs"),
  top_n = 3
) +
  geom_vline(
    xintercept = inv4m_x_coord,
    linetype = "dashed",
    color = "grey"
  )

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

inv4mxP.plot

# Define color palette
pal <- c("purple4", "plum", "gold")
names(pal) <- c("inv4m", "flanking", "outside")
FDR_thresh <- -log10(0.05)

# Filter to Inv4m predictor and create region column
effect_inv4m <- effects %>%
  filter(predictor == "Inv4m") %>%
  mutate(
    region = case_when(
      in_Inv4m ~ "inv4m",
      in_flank ~ "flanking",
      TRUE ~ "outside"
    ),
    region = factor(region, levels = c("inv4m", "flanking", "outside"))
  )

# Create chromosome 4 detail plot
p1 <- effect_inv4m %>%
  filter(CHR == 4) %>%
  mutate(x = BP / 1e6) %>%
  ggplot(aes(x = x, y = -log10(adj.P.Val), group = region, color = region)) +
  ylab(expression(-log[10](italic(FDR)))) +
  annotate(
    "text",
    x = c(inversion_limit, introgression_limit),
    y = -0.5,
    label = rep("|", 4),
    size = 10
  ) +
  geom_point(size = 2, alpha = 0.7) +
  scale_color_manual(values = pal) +
  scale_y_continuous(expand = c(0, 0.5)) +
  geom_hline(yintercept = FDR_thresh, col = "red", linetype = "dashed") +
  coord_cartesian(xlim = c(150, 200)) +
  xlab("Chromosome 4 Position [Mb]") +
  ggpubr::theme_classic2(base_size = 25) +
  theme(
    legend.position = "none",
    plot.margin = unit(c(0.5, 0, 0.5, 0), "cm")
  )

# Save as Panel G for Figure 3
ggsave(
  file.path(paths$figures, "panels", "panel_G_local_manhattan.png"),
  plot = p1,
  width = 6, height = 4, dpi = 300, bg = "white"
)
saveRDS(p1, file.path(paths$intermediate, "panel_G_local_manhattan.rds"))

p1

# Panel H: Local logFC distribution for chr4 150-200Mb region
p_logfc <- effect_inv4m %>%
  filter(CHR == 4, BP >= 150e6, BP <= 200e6) %>%
  mutate(x = BP / 1e6) %>%
  ggplot(aes(x = x, y = logFC, color = region)) +
  ylab(expression(log[2]("Fold Change"))) +
  xlab("Chromosome 4 Position [Mb]") +
  annotate(
    "text",
    x = c(inversion_limit, introgression_limit),
    y = min(effect_inv4m$logFC[effect_inv4m$CHR == 4], na.rm = TRUE) - 0.3,
    label = rep("|", 4),
    size = 8
  ) +
  geom_point(size = 2, alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  scale_color_manual(values = pal) +
  coord_cartesian(xlim = c(150, 200)) +
  ggpubr::theme_classic2(base_size = 20) +
  theme(
    legend.position = "none",
    plot.margin = unit(c(0.5, 0, 0.5, 0), "cm")
  )

# Save as Panel H for Figure 3
ggsave(
  file.path(paths$figures, "panels", "panel_H_local_logfc.png"),
  plot = p_logfc,
  width = 6, height = 4, dpi = 300, bg = "white"
)
saveRDS(p_logfc, file.path(paths$intermediate, "panel_H_local_logfc.rds"))

p_logfc

# Test enrichment of significant genes in regions
contingency <- with(
  effect_inv4m %>% mutate(is_significant = adj.P.Val < 0.05),
  table(region, is_significant)
)

print(contingency)

##           is_significant
## region     FALSE  TRUE
##   inv4m      139   114
##   flanking   197   158
##   outside  23210   193

chisq.test(contingency)

## Warning in chisq.test(contingency): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  contingency
## X-squared = 6017.3, df = 2, p-value < 2.2e-16

# Test inv4m vs flanking
contingency_subset <- with(
  effect_inv4m %>% 
    filter(region != "outside") %>%
    mutate(is_significant = adj.P.Val < 0.05),
  table(region, is_significant)
)

print(contingency_subset)

##           is_significant
## region     FALSE TRUE
##   inv4m      139  114
##   flanking   197  158
##   outside      0    0

chisq.test(contingency_subset)

## Warning in chisq.test(contingency_subset): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  contingency_subset
## X-squared = NaN, df = 2, p-value = NA

# Test if p-value distributions differ between regions
ks_inv4m_flanking <- ks.test(
  effect_inv4m$P.Value[effect_inv4m$in_Inv4m],
  effect_inv4m$P.Value[effect_inv4m$in_flank],
  alternative = "greater"
)

## Warning in ks.test.default(effect_inv4m$P.Value[effect_inv4m$in_Inv4m], :
## p-value will be approximate in the presence of ties

ks_flanking_outside <- ks.test(
  effect_inv4m$P.Value[effect_inv4m$in_flank],
  effect_inv4m$P.Value[effect_inv4m$region == "outside"],
  alternative = "greater"
)

## Warning in ks.test.default(effect_inv4m$P.Value[effect_inv4m$in_flank], :
## p-value will be approximate in the presence of ties

# in_shared = in_Inv4m OR in_flank
in_shared <- effect_inv4m$in_Inv4m | effect_inv4m$in_flank
ks_shared_outside <- ks.test(
  effect_inv4m$P.Value[in_shared],
  effect_inv4m$P.Value[effect_inv4m$region == "outside"],
  alternative = "greater"
)

## Warning in ks.test.default(effect_inv4m$P.Value[in_shared],
## effect_inv4m$P.Value[effect_inv4m$region == : p-value will be approximate in
## the presence of ties

print(ks_inv4m_flanking)

## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  effect_inv4m$P.Value[effect_inv4m$in_Inv4m] and effect_inv4m$P.Value[effect_inv4m$in_flank]
## D^+ = 0.03623, p-value = 0.6785
## alternative hypothesis: the CDF of x lies above that of y

print(ks_flanking_outside)

## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  effect_inv4m$P.Value[effect_inv4m$in_flank] and effect_inv4m$P.Value[effect_inv4m$region == "outside"]
## D^+ = 0.53162, p-value < 2.2e-16
## alternative hypothesis: the CDF of x lies above that of y

print(ks_shared_outside)

## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  effect_inv4m$P.Value[in_shared] and effect_inv4m$P.Value[effect_inv4m$region == "outside"]
## D^+ = 0.52117, p-value < 2.2e-16
## alternative hypothesis: the CDF of x lies above that of y

# Define in_shared for convenience
in_shared <- effect_inv4m$in_Inv4m | effect_inv4m$in_flank

# Prepare QQ plot data for each region
inv4m_dens <- qqplot(
  runif(n = 10000),
  effect_inv4m$P.Value[effect_inv4m$in_Inv4m],
  plot.it = FALSE
)

flanking_dens <- qqplot(
  runif(n = 10000),
  effect_inv4m$P.Value[effect_inv4m$in_flank],
  plot.it = FALSE
)

outside_dens <- qqplot(
  runif(n = 10000),
  effect_inv4m$P.Value[!in_shared],
  plot.it = FALSE
)

# Combine data
qq_data <- rbind(
  inv4m_dens %>%
    as.data.frame() %>%
    mutate(
      region = "inv4m",
      expected = -log10(x),
      observed = -log10(y)
    ),
  flanking_dens %>%
    as.data.frame() %>%
    mutate(
      region = "flanking",
      expected = -log10(x),
      observed = -log10(y)
    ),
  outside_dens %>%
    as.data.frame() %>%
    mutate(
      region = "outside",
      expected = -log10(x),
      observed = -log10(y)
    )
)

qq_data$region <- factor(qq_data$region, levels = c("inv4m", "flanking", "outside"))
levels(qq_data$region) <- c("*Inv4m*", "flanking", "outside")

# Create QQ plot
pal2 <- viridis_pal(option = "plasma")(3)
key_order <- c("*Inv4m*", "flanking", "outside")
names(pal2) <- key_order

qqplot <- qq_data %>%
  ggplot(aes(y = observed, x = expected, color = region)) +
  ylab(expression("Observed " - log[10](italic(P)))) +
  xlab(expression("Expected " - log[10](italic(P)))) +
  geom_abline(slope = 1, color = "black") +
  geom_point(size = 3, alpha = 0.7) +
  annotate(
    "text",
    x = 2.3,
    y = c(25, 15),
    hjust = 0,
    label = c(expression(italic(p) == "0.29"), expression(italic(p) == "2.2e-16")),
    size = 5
  ) +
  scale_color_manual(values = pal2) +
  ggpubr::theme_classic2(base_size = 25) +
  theme(
    legend.position = c(0.17, 0.83),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.spacing = unit(0, "pt"),
    legend.text = ggtext::element_markdown()
  )

qqplot

# Create combined detail plot
detail <- cowplot::plot_grid(
  qqplot,
  p1,
  ncol = 1,
  nrow = 2,
  rel_heights = c(1.5, 1),
  align = "v",
  axis = "lr"
)

detail

# Save to file
ggsave(
  filename = file.path(paths$figures, "chr04_expression_detail.png"),
  plot = detail,
  width = 8,
  height = 12,
  units = "in",
  dpi = 300
)

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggrepel_0.9.6        ggtext_0.1.2         ggpubr_0.6.2        
##  [4] cowplot_1.2.0        RAINBOWR_0.1.38      fastman_0.1.0       
##  [7] rtracklayer_1.70.1   GenomicRanges_1.62.1 Seqinfo_1.0.0       
## [10] IRanges_2.44.0       S4Vectors_0.48.0     BiocGenerics_0.56.0 
## [13] generics_0.1.4       ggplot2_4.0.1        scales_1.4.0        
## [16] dplyr_1.1.4          here_1.0.2          
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-9                rlang_1.1.6                
##   [3] magrittr_2.0.4              otel_0.2.0                 
##   [5] matrixStats_1.5.0           compiler_4.5.2             
##   [7] systemfonts_1.3.1           pbmcapply_1.5.1            
##   [9] vctrs_0.6.5                 stringr_1.6.0              
##  [11] pkgconfig_2.0.3             crayon_1.5.3               
##  [13] fastmap_1.2.0               backports_1.5.0            
##  [15] XVector_0.50.0              labeling_0.4.3             
##  [17] Rsamtools_2.26.0            rmarkdown_2.30             
##  [19] markdown_2.0                pracma_2.4.6               
##  [21] nloptr_2.2.1                ragg_1.5.0                 
##  [23] purrr_1.2.0                 xfun_0.55                  
##  [25] Rfast_2.1.5.2               cachem_1.1.0               
##  [27] litedown_0.9                cigarillo_1.0.0            
##  [29] jsonlite_2.0.0              DelayedArray_0.36.0        
##  [31] BiocParallel_1.44.0         gaston_1.6                 
##  [33] broom_1.0.11                parallel_4.5.2             
##  [35] cluster_2.1.8.1             R6_2.6.1                   
##  [37] bslib_0.9.0                 stringi_1.8.7              
##  [39] RColorBrewer_1.1-3          car_3.1-3                  
##  [41] jquerylib_0.1.4             numDeriv_2016.8-1.1        
##  [43] Rcpp_1.1.0                  SummarizedExperiment_1.40.0
##  [45] knitr_1.51                  R.utils_2.13.0             
##  [47] MM4LMM_3.0.3                Matrix_1.7-4               
##  [49] tidyselect_1.2.1            dichromat_2.0-0.1          
##  [51] abind_1.4-8                 yaml_2.3.12                
##  [53] codetools_0.2-20            curl_7.0.0                 
##  [55] lattice_0.22-7              tibble_3.3.0               
##  [57] Biobase_2.70.0              withr_3.0.2                
##  [59] S7_0.2.1                    evaluate_1.0.5             
##  [61] RcppParallel_5.1.11-1       xml2_1.5.1                 
##  [63] Biostrings_2.78.0           pillar_1.11.1              
##  [65] carData_3.0-5               MatrixGenerics_1.22.0      
##  [67] rprojroot_2.1.1             RCurl_1.98-1.17            
##  [69] commonmark_2.0.0            glue_1.8.0                 
##  [71] tools_4.5.2                 BiocIO_1.20.0              
##  [73] ggsignif_0.6.4              GenomicAlignments_1.46.0   
##  [75] XML_3.99-0.20               grid_4.5.2                 
##  [77] tidyr_1.3.2                 ape_5.8-1                  
##  [79] nlme_3.1-168                restfulr_0.0.16            
##  [81] Formula_1.2-5               cli_3.6.5                  
##  [83] zigg_0.0.2                  rrBLUP_4.6.3               
##  [85] textshaping_1.0.4           optimx_2025-4.9            
##  [87] expm_1.0-0                  viridisLite_0.4.2          
##  [89] S4Arrays_1.10.1             corpcor_1.6.10             
##  [91] gtable_0.3.6                R.methodsS3_1.8.2          
##  [93] rstatix_0.7.3               sass_0.4.10                
##  [95] digest_0.6.39               SparseArray_1.10.8         
##  [97] rjson_0.2.23                htmlwidgets_1.6.4          
##  [99] farver_2.1.2                htmltools_0.5.9            
## [101] R.oo_1.27.1                 lifecycle_1.0.4            
## [103] httr_1.4.7                  pegas_1.4                  
## [105] gridtext_0.1.5              MASS_7.3-65

Expression Manhattan Plots using fastman_gg (ggplot2)

Fausto Rodríguez Zapata

2026-03-21