Overview

This script analyzes JMJ cluster expression from transcript-level kallisto quantification using the JMJ-corrected cDNA reference.

Two analyses are performed:

Total Cluster Expression (main analysis): Sum of CPM across all 5 paralogs per sample. This is the biologically meaningful comparison because:
- B73/CTRL: Has 5 tandem JMJ gene copies (>97% identical)
- Inv4m: Has only 1 gene copy in the cluster
Comparing total cluster output shows whether the single Inv4m copy produces as much transcript as the 5 B73 copies combined.
Per-Paralog Expression: Individual expression for each paralog. Note that because the paralogs are >97% identical, kallisto distributes Inv4m reads across all 5 loci artificially. This view is useful for understanding read distribution but not for biological interpretation of Inv4m expression.

JMJ Cluster Paralogs (genomic order):

Transcript	Symbol	Description
Zm00001eb191790_T001	jmj9	First tandem copy
Zm00001eb191790_T017	psi	Pseudogene (from V4 annotation)
Zm00001eb191790_T006	jmj6	Third tandem copy
Zm00001eb191790_T013	jmj2	Fourth tandem copy
Zm00001eb191820_T001	jmj4	Separate gene model

Background: The V5 annotation incorrectly merges jmj9, psi, jmj6, and jmj2 into a single gene model (Zm00001eb191790) with 16 chimeric transcripts. We created a corrected cDNA reference that treats each paralog as a separate entry.

Libraries

library(tidyverse)
library(ggplot2)
library(ggfx)
library(ggbeeswarm)

Define JMJ Paralogs

# JMJ paralog transcripts (in genomic order)
jmj_paralogs <- tibble(
  transcript = c(
    "Zm00001eb191790_T001",
    "Zm00001eb191790_T017",
    "Zm00001eb191790_T006",
    "Zm00001eb191790_T013",
    "Zm00001eb191820_T001"
  ),
  symbol = c("jmj9", "psi", "jmj6", "jmj2", "jmj4"),
  position = c(1, 2, 3, 4, 5)
)

cat("JMJ cluster paralogs:\n")

## JMJ cluster paralogs:

print(jmj_paralogs)

## # A tibble: 5 × 3
##   transcript           symbol position
##   <chr>                <chr>     <dbl>
## 1 Zm00001eb191790_T001 jmj9          1
## 2 Zm00001eb191790_T017 psi           2
## 3 Zm00001eb191790_T006 jmj6          3
## 4 Zm00001eb191790_T013 jmj2          4
## 5 Zm00001eb191820_T001 jmj4          5

Load Counts Matrix

The counts matrix is pre-extracted from kallisto outputs using the standalone script extract_kallisto_counts_matrix.R. This analysis reads from the cached intermediate file only.

# Counts matrix path (extracted from kallisto by separate script)
counts_matrix_file <- file.path(
  paths$intermediate,
  "PSU2022_kallisto_jmj_corrected_counts.csv"
)

# Check if counts matrix exists
if (!file.exists(counts_matrix_file)) {
  stop(
    "Counts matrix not found: ", counts_matrix_file, "\n\n",
    "Run the extraction script first:\n",
    "  Rscript scripts/inversion_paper/extract_kallisto_counts_matrix.R\n",
    "(Requires server mounted)"
  )
}

cat("Loading counts matrix...\n")

## Loading counts matrix...

full_counts_matrix <- read_csv(counts_matrix_file, show_col_types = FALSE)
cat("Loaded:", nrow(full_counts_matrix), "transcripts x",
    ncol(full_counts_matrix) - 1, "samples\n")

## Loaded: 72527 transcripts x 60 samples

Extract JMJ Paralogs and Calculate CPM

# Calculate library sizes from full counts matrix
lib_sizes_df <- full_counts_matrix %>%
  pivot_longer(cols = -target_id, names_to = "sample", values_to = "counts") %>%
  group_by(sample) %>%
  summarise(lib_size = sum(counts, na.rm = TRUE), .groups = "drop")

cat("Library sizes calculated for", nrow(lib_sizes_df), "samples\n")

## Library sizes calculated for 60 samples

# Extract JMJ counts (long format)
kallisto_data <- full_counts_matrix %>%
  filter(target_id %in% jmj_paralogs$transcript) %>%
  pivot_longer(
    cols = -target_id,
    names_to = "sample",
    values_to = "est_counts"
  ) %>%
  left_join(lib_sizes_df, by = "sample") %>%
  mutate(
    # Calculate CPM (counts per million)
    cpm = (est_counts / lib_size) * 1e6
  )

cat("Extracted JMJ paralogs:", nrow(kallisto_data), "records\n")

## Extracted JMJ paralogs: 300 records

cat("(", n_distinct(kallisto_data$target_id), "transcripts x",
    n_distinct(kallisto_data$sample), "samples )\n")

## ( 5 transcripts x 60 samples )

Filter Low Quality Libraries

Samples with library size < 20 million reads are considered low quality (same threshold as DEG analysis).

# Get library sizes per sample
lib_sizes <- kallisto_data %>%
  distinct(sample, lib_size)

# Mark low quality samples
lib_sizes <- lib_sizes %>%
  mutate(low_quality = lib_size < 2e7)

{
  cat("=== LIBRARY SIZE QC ===\n")
  cat("Low quality libraries (< 20M reads):\n")
  print(table(lib_sizes$low_quality))

  if (any(lib_sizes$low_quality)) {
    cat("\nExcluded samples:\n")
    print(lib_sizes %>% filter(low_quality) %>% select(sample, lib_size))
  }
}

## === LIBRARY SIZE QC ===
## Low quality libraries (< 20M reads):
## 
## FALSE  TRUE 
##    43    17 
## 
## Excluded samples:
## # A tibble: 17 × 2
##    sample        lib_size
##    <chr>            <dbl>
##  1 L01_S1_L002    328934.
##  2 L02_S2_L002    799530.
##  3 L03_S3_L002   2400639.
##  4 L04_S4_L002   2478621.
##  5 L10_S10_L002 10483027.
##  6 L12_S12_L002 11035011.
##  7 L18_S18_L002  6076193.
##  8 L19_S19_L002  7919901.
##  9 L20_S20_L002 10783328.
## 10 L21_S21_L002  6987465.
## 11 L23_S23_L002  8952170.
## 12 L24_S24_L002 10233511.
## 13 L25_S25_L002  7673052.
## 14 R37_S37_L002 10527638.
## 15 R38_S38_L002  8988918.
## 16 R39_S39_L002  8495462.
## 17 R40_S40_L002  8408381.

# Filter out low quality samples
good_samples <- lib_sizes %>%
  filter(!low_quality) %>%
  pull(sample)

jmj_filtered <- kallisto_data %>%
  filter(sample %in% good_samples)

cat("\nRetained", n_distinct(jmj_filtered$sample), "samples\n")

## 
## Retained 43 samples

Add Metadata

# Load PSU2022 metadata
meta <- read.csv(file.path(paths$data, "PSU2022_metadata.csv"))

# Join with paralog info and metadata
jmj_expr <- jmj_filtered %>%
  left_join(jmj_paralogs, by = c("target_id" = "transcript")) %>%
  left_join(
    meta %>% select(library, Genotype, leaf_tissue),
    by = c("sample" = "library")
  ) %>%
  filter(!is.na(Genotype)) %>%
  mutate(
    genotype = ifelse(Genotype == "CTRL", "B73", "Inv4m"),
    tissue = paste0("Leaf ", leaf_tissue),
    # Order symbols by genomic position
    symbol = factor(symbol, levels = c("jmj9", "psi", "jmj6", "jmj2", "jmj4")),
    genotype = factor(genotype, levels = c("B73", "Inv4m"))
  )

{
  cat("=== DATA SUMMARY ===\n")
  cat("Samples:", n_distinct(jmj_expr$sample), "\n")
  cat("Paralogs:", n_distinct(jmj_expr$symbol), "\n")
  cat("\nBy genotype:\n")
  print(table(jmj_expr$genotype) / 5)
  cat("\nBy tissue:\n")
  print(table(jmj_expr$tissue) / 5)
}

## === DATA SUMMARY ===
## Samples: 43 
## Paralogs: 5 
## 
## By genotype:
## 
##   B73 Inv4m 
##    20    23 
## 
## By tissue:
## 
## Leaf 1 Leaf 2 Leaf 3 Leaf 4 
##     11     11     11     10

Per-Paralog Expression Boxplot

This shows expression distribution per paralog. Note: Because JMJ paralogs are >97% identical, kallisto artificially distributes Inv4m reads across all 5 loci. See “Total Cluster Expression” section below for the main analysis.

genotype_colors <- c("B73" = "gold", "Inv4m" = "#4a0f82")

p_jmj_paralogs <- ggplot(
  jmj_expr,
  aes(x = tissue, y = log2(cpm + 1), color = genotype)
) +
  geom_boxplot(
    width = 0.5,
    linewidth = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.7)
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  geom_quasirandom(
    size = 1.5,
    dodge.width = 0.7,
    alpha = 0.7
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  facet_wrap(~ symbol, nrow = 1) +
  scale_color_manual(values = genotype_colors, name = "Genotype") +
  theme_classic(base_size = 14) +
  theme(
    axis.text.x = element_text(
      angle = 45, hjust = 1,
      color = "black", face = "bold", size = 11
    ),
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.title = element_text(face = "bold", size = 13),
    strip.text = element_text(face = "bold.italic", size = 14),
    strip.background = element_rect(fill = "grey95", color = NA),
    legend.position = "bottom",
    legend.text = element_text(face = "bold"),
    legend.title = element_text(face = "bold"),
    panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
  ) +
  labs(
    x = NULL,
    y = expression(bold(log[2]*"(CPM + 1)"))
  )

print(p_jmj_paralogs)

Total Cluster Expression Analysis

Since B73/CTRL has 5 tandem JMJ paralogs (>97% identical) while Inv4m has only 1 gene copy, the biologically meaningful comparison is total cluster expression: the sum of CPM across all 5 paralogs per sample.

# Sum CPM across all 5 paralogs for each sample
jmj_total <- jmj_expr %>%
  group_by(sample, genotype, tissue) %>%
  summarise(
    total_cpm = sum(cpm, na.rm = TRUE),
    total_counts = sum(est_counts, na.rm = TRUE),
    n_paralogs = n(),
    .groups = "drop"
  )

cat("=== TOTAL CLUSTER EXPRESSION ===\n")

## === TOTAL CLUSTER EXPRESSION ===

cat("Samples:", n_distinct(jmj_total$sample), "\n")

## Samples: 43

cat("Mean total CPM by genotype:\n")

## Mean total CPM by genotype:

jmj_total %>%
  group_by(genotype) %>%
  summarise(
    mean_total_cpm = mean(total_cpm),
    sd_total_cpm = sd(total_cpm),
    .groups = "drop"
  ) %>%
  print()

## # A tibble: 2 × 3
##   genotype mean_total_cpm sd_total_cpm
##   <fct>             <dbl>        <dbl>
## 1 B73                94.8        19.2 
## 2 Inv4m              10.5         3.73

Total Cluster Expression Boxplot

This plot shows the sum of expression across all 5 JMJ paralogs for each sample, comparing B73 (5 gene copies) vs Inv4m (1 gene copy).

p_jmj_total <- ggplot(
  jmj_total,
  aes(x = tissue, y = log2(total_cpm + 1), color = genotype)
) +
  geom_boxplot(
    width = 0.5,
    linewidth = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.7)
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  geom_quasirandom(
    size = 2,
    dodge.width = 0.7,
    alpha = 0.7
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  scale_color_manual(values = genotype_colors, name = "Genotype") +
  theme_classic(base_size = 16) +
  theme(
    axis.text.x = element_text(
      angle = 45, hjust = 1,
      color = "black", face = "bold", size = 13
    ),
    axis.text.y = element_text(color = "black", face = "bold", size = 12),
    axis.title = element_text(face = "bold", size = 14),
    legend.position = "bottom",
    legend.text = element_text(face = "bold", size = 12),
    legend.title = element_text(face = "bold", size = 12),
    panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3),
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5, color = "grey40")
  ) +
  labs(
    title = "Total JMJ Cluster Expression",
    subtitle = "Sum of 5 paralogs: B73 (5 copies) vs Inv4m (1 copy)",
    x = NULL,
    y = expression(bold(log[2]*"(Total CPM + 1)"))
  )

print(p_jmj_total)

Total Cluster Expression Boxplot (Linear Scale)

Same data as above but on linear scale to emphasize the magnitude of difference. The dashed line shows the expected Inv4m expression if it scaled linearly with copy number (B73 mean / 5).

# Add expected values to data for plotting
expected_df <- expected_df %>%
  mutate(tissue = factor(tissue, levels = levels(factor(jmj_total$tissue))))

p_jmj_total_linear <- ggplot(
  jmj_total,
  aes(x = tissue, y = total_cpm, color = genotype)
) +
  geom_boxplot(
    width = 0.5,
    linewidth = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.7)
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  geom_quasirandom(
    size = 2,
    dodge.width = 0.7,
    alpha = 0.7
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  # Add expected line (B73/5) - dashed red crossbars
  geom_errorbar(
    data = expected_df,
    aes(x = tissue, ymin = expected_inv4m, ymax = expected_inv4m),
    color = "red", linetype = "dashed", linewidth = 1,
    width = 0.6, inherit.aes = FALSE
  ) +
  scale_color_manual(values = genotype_colors, name = "Genotype") +
  theme_classic(base_size = 16) +
  theme(
    axis.text.x = element_text(
      angle = 45, hjust = 1,
      color = "black", face = "bold", size = 13
    ),
    axis.text.y = element_text(color = "black", face = "bold", size = 12),
    axis.title = element_text(face = "bold", size = 14),
    legend.position = "bottom",
    legend.text = element_text(face = "bold", size = 12),
    legend.title = element_text(face = "bold", size = 12),
    panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3),
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5, color = "grey40")
  ) +
  labs(
    title = "Total JMJ Cluster Expression (Linear Scale)",
    subtitle = "Dashed red line = expected if Inv4m = B73/5 (copy number scaling)",
    x = NULL,
    y = "Total CPM"
  )

print(p_jmj_total_linear)

One-Sample t-Test: Inv4m vs Expected (B73/5)

Null hypothesis: Inv4m mean expression = B73 mean / 5 (linear copy number scaling)

# Calculate summary stats by tissue and genotype
tissue_stats <- jmj_total %>%
  group_by(tissue, genotype) %>%
  summarise(
    mean_cpm = mean(total_cpm),
    sd_cpm = sd(total_cpm),
    n = n(),
    values = list(total_cpm),
    .groups = "drop"
  )

# Get B73 stats and calculate expected
b73_stats <- tissue_stats %>%
  filter(genotype == "B73") %>%
  select(tissue, B73_mean = mean_cpm, B73_sd = sd_cpm, B73_n = n) %>%
  mutate(expected = B73_mean / 5)

# Get Inv4m stats
inv4m_stats <- tissue_stats %>%
  filter(genotype == "Inv4m") %>%
  select(tissue, Inv4m_mean = mean_cpm, Inv4m_sd = sd_cpm, Inv4m_n = n,
         inv4m_values = values)

# Merge and run t-tests
ttest_results <- b73_stats %>%
  left_join(inv4m_stats, by = "tissue") %>%
  rowwise() %>%
  mutate(
    ttest = list(t.test(inv4m_values, mu = expected)),
    t_stat = ttest$statistic,
    p_value = ttest$p.value,
    ci_lower = ttest$conf.int[1],
    ci_upper = ttest$conf.int[2],
    pct_of_expected = (Inv4m_mean / expected) * 100
  ) %>%
  ungroup() %>%
  select(-inv4m_values, -ttest)

# Display results table
cat("=== ONE-SAMPLE T-TEST: Inv4m vs Expected (B73/5) ===\n\n")

## === ONE-SAMPLE T-TEST: Inv4m vs Expected (B73/5) ===

ttest_table <- ttest_results %>%
  select(tissue, B73_mean, expected, Inv4m_mean, pct_of_expected, t_stat, p_value) %>%
  mutate(
    B73_mean = round(B73_mean, 1),
    expected = round(expected, 1),
    Inv4m_mean = round(Inv4m_mean, 1),
    pct_of_expected = paste0(round(pct_of_expected, 1), "%"),
    t_stat = round(t_stat, 2),
    p_value = format(p_value, digits = 2, scientific = TRUE)
  ) %>%
  rename(
    Tissue = tissue,
    `B73 Mean` = B73_mean,
    `Expected (B73/5)` = expected,
    `Inv4m Mean` = Inv4m_mean,
    `% of Expected` = pct_of_expected,
    `t` = t_stat,
    `p-value` = p_value
  )

knitr::kable(ttest_table, align = "lrrrcrr")

Tissue	B73 Mean	Expected (B73/5)	Inv4m Mean	% of Expected	t	p-value
Leaf 1	105.7	21.1	13.1	62%	-4.02	7.0e-03
Leaf 2	100.2	20.0	10.0	50.1%	-12.94	2.1e-04
Leaf 3	91.5	18.3	9.8	53.6%	-8.65	3.4e-04
Leaf 4	83.0	16.6	8.3	49.8%	-8.07	1.3e-03

# Overall test (pooling across tissues)
overall_B73_mean <- mean(jmj_total$total_cpm[jmj_total$genotype == "B73"])
overall_expected <- overall_B73_mean / 5
overall_Inv4m_values <- jmj_total$total_cpm[jmj_total$genotype == "Inv4m"]

overall_ttest <- t.test(overall_Inv4m_values, mu = overall_expected)

cat("\n=== OVERALL TEST (pooled across tissues) ===\n\n")

## 
## === OVERALL TEST (pooled across tissues) ===

cat("B73 mean:           ", round(overall_B73_mean, 2), "CPM\n")

## B73 mean:            94.82 CPM

cat("Expected (B73/5):   ", round(overall_expected, 2), "CPM\n")

## Expected (B73/5):    18.96 CPM

cat("Inv4m mean:         ", round(mean(overall_Inv4m_values), 2), "CPM\n")

## Inv4m mean:          10.53 CPM

cat("% of expected:      ", round(mean(overall_Inv4m_values) / overall_expected * 100, 1), "%\n\n")

## % of expected:       55.6 %

cat("One-sample t-test:\n")

## One-sample t-test:

cat("  t =", round(overall_ttest$statistic, 2), "\n")

##   t = -10.84

cat("  df =", round(overall_ttest$parameter, 1), "\n")

##   df = 22

cat("  p =", format(overall_ttest$p.value, digits = 3, scientific = TRUE), "\n")

##   p = 2.73e-10

cat("  95% CI: [", round(overall_ttest$conf.int[1], 2), ",",
    round(overall_ttest$conf.int[2], 2), "]\n")

##   95% CI: [ 8.92 , 12.15 ]

Total Cluster Summary Statistics

jmj_total_summary <- jmj_total %>%
  group_by(tissue, genotype) %>%
  summarise(
    mean_total_cpm = mean(total_cpm, na.rm = TRUE),
    sd_total_cpm = sd(total_cpm, na.rm = TRUE),
    mean_log2 = mean(log2(total_cpm + 1), na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = genotype,
    values_from = c(mean_total_cpm, sd_total_cpm, mean_log2, n),
    names_sep = "_"
  ) %>%
  mutate(
    log2FC = mean_log2_Inv4m - mean_log2_B73,
    fold_change = 2^log2FC,
    direction = ifelse(log2FC < 0, "Down", "Up")
  )

cat("=== TOTAL CLUSTER EXPRESSION BY TISSUE ===\n")

## === TOTAL CLUSTER EXPRESSION BY TISSUE ===

jmj_total_summary %>%
  select(tissue, mean_total_cpm_B73, mean_total_cpm_Inv4m, log2FC, fold_change, direction) %>%
  print()

## # A tibble: 4 × 6
##   tissue mean_total_cpm_B73 mean_total_cpm_Inv4m log2FC fold_change direction
##   <chr>               <dbl>                <dbl>  <dbl>       <dbl> <chr>    
## 1 Leaf 1              106.                 13.1   -2.97       0.128 Down     
## 2 Leaf 2              100.                 10.0   -3.19       0.110 Down     
## 3 Leaf 3               91.5                 9.81  -3.10       0.117 Down     
## 4 Leaf 4               83.0                 8.28  -3.19       0.109 Down

cat("\n=== OVERALL MEAN ===\n")

## 
## === OVERALL MEAN ===

cat("Mean log2FC across tissues:", round(mean(jmj_total_summary$log2FC), 3), "\n")

## Mean log2FC across tissues: -3.112

cat("Mean fold change:", round(mean(jmj_total_summary$fold_change), 3), "\n")

## Mean fold change: 0.116

Statistical Test (Total Cluster Expression)

# Test for genotype effect on total cluster expression
cat("=== STATISTICAL ANALYSIS ===\n\n")

## === STATISTICAL ANALYSIS ===

# Overall t-test (ignoring tissue)
t_overall <- t.test(
  log2(total_cpm + 1) ~ genotype,
  data = jmj_total
)
cat("Overall t-test (B73 vs Inv4m):\n")

## Overall t-test (B73 vs Inv4m):

cat("  t =", round(t_overall$statistic, 3), "\n")

##   t = 28.285

cat("  p =", format(t_overall$p.value, digits = 3, scientific = TRUE), "\n")

##   p = 3.97e-28

cat("  Mean B73:", round(t_overall$estimate[1], 3), "\n")

##   Mean B73: 6.553

cat("  Mean Inv4m:", round(t_overall$estimate[2], 3), "\n")

##   Mean Inv4m: 3.469

# ANOVA with tissue interaction
model <- aov(log2(total_cpm + 1) ~ genotype * tissue, data = jmj_total)
cat("\n\nTwo-way ANOVA (genotype × tissue):\n")

## 
## 
## Two-way ANOVA (genotype × tissue):

print(summary(model))

##                 Df Sum Sq Mean Sq F value Pr(>F)    
## genotype         1 101.70  101.70 877.692 <2e-16 ***
## tissue           3   1.29    0.43   3.702 0.0205 *  
## genotype:tissue  3   0.09    0.03   0.247 0.8630    
## Residuals       35   4.06    0.12                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Per-Paralog Summary Statistics

jmj_summary <- jmj_expr %>%
  group_by(tissue, symbol, genotype) %>%
  summarise(
    mean_cpm = mean(cpm, na.rm = TRUE),
    sd_cpm = sd(cpm, na.rm = TRUE),
    mean_log2 = mean(log2(cpm + 1), na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = genotype,
    values_from = c(mean_cpm, sd_cpm, mean_log2, n),
    names_sep = "_"
  ) %>%
  mutate(
    log2FC = mean_log2_Inv4m - mean_log2_B73,
    direction = ifelse(log2FC < 0, "Down", "Up")
  )

cat("=== EXPRESSION SUMMARY ===\n")

## === EXPRESSION SUMMARY ===

jmj_summary %>%
  select(tissue, symbol, mean_cpm_B73, mean_cpm_Inv4m, log2FC, direction) %>%
  arrange(symbol, tissue) %>%
  print(n = 25)

## # A tibble: 20 × 6
##    tissue symbol mean_cpm_B73 mean_cpm_Inv4m log2FC direction
##    <chr>  <fct>         <dbl>          <dbl>  <dbl> <chr>    
##  1 Leaf 1 jmj9          38.1           5.83   -2.54 Down     
##  2 Leaf 2 jmj9          35.8           3.97   -2.88 Down     
##  3 Leaf 3 jmj9          31.8           4.02   -2.75 Down     
##  4 Leaf 4 jmj9          29.9           3.31   -2.86 Down     
##  5 Leaf 1 psi            6.36          0.751  -2.11 Down     
##  6 Leaf 2 psi            6.22          0.647  -2.05 Down     
##  7 Leaf 3 psi            6.37          0.821  -2.02 Down     
##  8 Leaf 4 psi            5.74          0.628  -1.92 Down     
##  9 Leaf 1 jmj6          16.7           2.22   -2.48 Down     
## 10 Leaf 2 jmj6          16.1           1.34   -2.96 Down     
## 11 Leaf 3 jmj6          14.9           1.47   -2.69 Down     
## 12 Leaf 4 jmj6          14.0           1.09   -2.87 Down     
## 13 Leaf 1 jmj2          20.3           0.923  -3.77 Down     
## 14 Leaf 2 jmj2          18.7           0.794  -3.54 Down     
## 15 Leaf 3 jmj2          17.2           0.370  -3.83 Down     
## 16 Leaf 4 jmj2          15.4           0.338  -3.65 Down     
## 17 Leaf 1 jmj4          24.2           3.38   -2.54 Down     
## 18 Leaf 2 jmj4          23.3           3.29   -2.49 Down     
## 19 Leaf 3 jmj4          21.1           3.13   -2.39 Down     
## 20 Leaf 4 jmj4          18.0           2.91   -2.31 Down

Mean log2FC by Paralog

jmj_summary %>%
  group_by(symbol) %>%
  summarise(
    mean_log2FC = mean(log2FC),
    .groups = "drop"
  ) %>%
  arrange(symbol)

## # A tibble: 5 × 2
##   symbol mean_log2FC
##   <fct>        <dbl>
## 1 jmj9         -2.76
## 2 psi          -2.03
## 3 jmj6         -2.75
## 4 jmj2         -3.70
## 5 jmj4         -2.43

Save Outputs

# Save per-paralog plot
saveRDS(p_jmj_paralogs, file.path(paths$figures, "jmj_paralogs_expression_boxplot.rds"))
ggsave(
  file.path(paths$figures, "jmj_paralogs_expression_boxplot.pdf"),
  p_jmj_paralogs,
  width = 14, height = 5
)
ggsave(
  file.path(paths$figures, "jmj_paralogs_expression_boxplot.png"),
  p_jmj_paralogs,
  width = 14, height = 5, dpi = 300
)

# Save total cluster expression plot (log scale)
saveRDS(p_jmj_total, file.path(paths$figures, "jmj_total_cluster_expression.rds"))
ggsave(
  file.path(paths$figures, "jmj_total_cluster_expression.pdf"),
  p_jmj_total,
  width = 8, height = 6
)
ggsave(
  file.path(paths$figures, "jmj_total_cluster_expression.png"),
  p_jmj_total,
  width = 8, height = 6, dpi = 300
)

# Save total cluster expression plot (linear scale)
saveRDS(p_jmj_total_linear, file.path(paths$figures, "jmj_total_cluster_expression_linear.rds"))
ggsave(
  file.path(paths$figures, "jmj_total_cluster_expression_linear.pdf"),
  p_jmj_total_linear,
  width = 8, height = 6
)
ggsave(
  file.path(paths$figures, "jmj_total_cluster_expression_linear.png"),
  p_jmj_total_linear,
  width = 8, height = 6, dpi = 300
)

# Save per-paralog data
write_csv(jmj_expr, file.path(paths$intermediate, "jmj_paralogs_expression_data.csv"))
write_csv(jmj_summary, file.path(paths$intermediate, "jmj_paralogs_expression_summary.csv"))

# Save total cluster data
write_csv(jmj_total, file.path(paths$intermediate, "jmj_total_cluster_expression_data.csv"))
write_csv(jmj_total_summary, file.path(paths$intermediate, "jmj_total_cluster_expression_summary.csv"))

{
  cat("=== SAVED OUTPUTS ===\n")
  cat("\nPer-Paralog Analysis:\n")
  cat("  Figures: jmj_paralogs_expression_boxplot.pdf/png/rds\n")
  cat("  Data: jmj_paralogs_expression_data.csv\n")
  cat("  Data: jmj_paralogs_expression_summary.csv\n")
  cat("\nTotal Cluster Analysis:\n")
  cat("  Figures: jmj_total_cluster_expression.pdf/png/rds (log scale)\n")
  cat("  Figures: jmj_total_cluster_expression_linear.pdf/png/rds (linear scale)\n")
  cat("  Data: jmj_total_cluster_expression_data.csv\n")
  cat("  Data: jmj_total_cluster_expression_summary.csv\n")
}

## === SAVED OUTPUTS ===
## 
## Per-Paralog Analysis:
##   Figures: jmj_paralogs_expression_boxplot.pdf/png/rds
##   Data: jmj_paralogs_expression_data.csv
##   Data: jmj_paralogs_expression_summary.csv
## 
## Total Cluster Analysis:
##   Figures: jmj_total_cluster_expression.pdf/png/rds (log scale)
##   Figures: jmj_total_cluster_expression_linear.pdf/png/rds (linear scale)
##   Data: jmj_total_cluster_expression_data.csv
##   Data: jmj_total_cluster_expression_summary.csv

Session Info

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggbeeswarm_0.7.3 ggfx_1.0.3       lubridate_1.9.4  forcats_1.0.1   
##  [5] stringr_1.6.0    dplyr_1.1.4      purrr_1.2.0      readr_2.1.6     
##  [9] tidyr_1.3.2      tibble_3.3.0     ggplot2_4.0.1    tidyverse_2.0.0 
## [13] here_1.0.2      
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.6         sass_0.4.10        generics_0.1.4     stringi_1.8.7     
##  [5] hms_1.1.4          digest_0.6.39      magrittr_2.0.4     evaluate_1.0.5    
##  [9] grid_4.5.2         timechange_0.3.0   RColorBrewer_1.1-3 fastmap_1.2.0     
## [13] rprojroot_2.1.1    jsonlite_2.0.0     scales_1.4.0       textshaping_1.0.4 
## [17] jquerylib_0.1.4    cli_3.6.5          crayon_1.5.3       rlang_1.1.6       
## [21] bit64_4.6.0-1      withr_3.0.2        cachem_1.1.0       yaml_2.3.12       
## [25] otel_0.2.0         parallel_4.5.2     tools_4.5.2        tzdb_0.5.0        
## [29] vctrs_0.6.5        R6_2.6.1           lifecycle_1.0.4    magick_2.9.0      
## [33] bit_4.6.0          vroom_1.6.7        vipor_0.4.7        ragg_1.5.0        
## [37] beeswarm_0.4.0     pkgconfig_2.0.3    pillar_1.11.1      bslib_0.9.0       
## [41] gtable_0.3.6       glue_1.8.0         Rcpp_1.1.0         systemfonts_1.3.1 
## [45] xfun_0.55          tidyselect_1.2.1   knitr_1.51         dichromat_2.0-0.1 
## [49] farver_2.1.2       htmltools_0.5.9    labeling_0.4.3     rmarkdown_2.30    
## [53] compiler_4.5.2     S7_0.2.1

JMJ Paralog Expression Boxplot (PA2022)

Analysis Pipeline

2026-03-19