Overview

This script creates an expression boxplot for the JMJ cluster genes (jmj2, jmj4, jmj21) comparing Inv4m vs CTRL across all tissues in:

PA2022: Field experiment with 4 leaf positions
Crow2020: Growth chamber experiment with 5 tissues (V1_Leaf, V1_Root, V3_Leaf, V3_Root, V3_SAM)

The output is saved as an RDS file for composition with the JMJ microsynteny figure.

Libraries

library(tidyverse)
library(ggplot2)
library(ggpubr)
library(ggfx)
library(ggbeeswarm)
library(here)

Define JMJ Genes

jmj_genes <- tibble(
 gene = c("Zm00001eb191790", "Zm00001eb191820", "Zm00001eb190750"),
 symbol = c("jmj2", "jmj4", "jmj21"),
 description = c(
   "JUMONJI-transcription factor 2",
   "JUMONJI-transcription factor 4",
   "JUMONJI-transcription factor 21"
 )
)

cat("JMJ cluster genes:\n")

## JMJ cluster genes:

print(jmj_genes)

## # A tibble: 3 × 3
##   gene            symbol description                    
##   <chr>           <chr>  <chr>                          
## 1 Zm00001eb191790 jmj2   JUMONJI-transcription factor 2 
## 2 Zm00001eb191820 jmj4   JUMONJI-transcription factor 4 
## 3 Zm00001eb190750 jmj21  JUMONJI-transcription factor 21

Load PA2022 Data

# Load expression matrix (logCPM)
expr_psu <- readRDS(file.path(paths$intermediate, "normalized_expression_logCPM_leaf_trt.rds"))

# Load metadata
meta_psu <- read.csv(file.path(paths$data, "PSU2022_metadata.csv"))

{
 cat("=== PA2022 DATA ===\n")
 cat("Expression matrix:", nrow(expr_psu), "genes x", ncol(expr_psu), "samples\n")
 cat("Metadata samples:", nrow(meta_psu), "\n")
 cat("\nGenotypes:\n")
 print(table(meta_psu$Genotype))
 cat("\nLeaf positions:\n")
 print(table(meta_psu$leaf_tissue))
}

## === PA2022 DATA ===
## Expression matrix: 24249 genes x 43 samples
## Metadata samples: 64 
## 
## Genotypes:
## 
##  CTRL INV4m 
##    32    32 
## 
## Leaf positions:
## 
##  1  2  3  4 
## 16 16 16 16

# Extract JMJ expression
jmj_in_psu <- jmj_genes$gene[jmj_genes$gene %in% rownames(expr_psu)]
cat("JMJ genes found in PA2022:", length(jmj_in_psu), "\n")

## JMJ genes found in PA2022: 3

if (length(jmj_in_psu) > 0) {
 # Extract expression for JMJ genes
 jmj_expr_psu <- expr_psu[jmj_in_psu, , drop = FALSE] %>%
   as.data.frame() %>%
   rownames_to_column("gene") %>%
   pivot_longer(
     cols = -gene,
     names_to = "sample",
     values_to = "logCPM"
   )

 # Match samples to metadata
 # Sample IDs in expression matrix should match library column in metadata
 jmj_expr_psu <- jmj_expr_psu %>%
   left_join(
     meta_psu %>%
       dplyr::select(library, Genotype, leaf_tissue, Treatment) %>%
       mutate(sample = library),
     by = "sample"
   ) %>%
   filter(!is.na(Genotype)) %>%
   left_join(jmj_genes, by = "gene") %>%
   mutate(
     experiment = "PA2022 (Field)",
     tissue = paste0("Leaf ", leaf_tissue),
     genotype = ifelse(Genotype == "CTRL", "B73", "Inv4m")
   )

 cat("PA2022 expression records:", nrow(jmj_expr_psu), "\n")
}

## PA2022 expression records: 129

Load Crow2020 Data

# Load expression matrix (logCPM)
expr_crow <- readRDS(file.path(paths$intermediate, "normalized_expression_logCPM_crow2020.rds"))

# Load metadata
meta_crow <- read.csv(file.path(paths$data, "SraRunTable.csv"), check.names = FALSE)

# Standardize column names - use Run (SRR ID) as sample ID
meta_crow <- meta_crow %>%
 dplyr::rename(
   sample = Run,
   genotype_raw = genotype,
   tissue_raw = tissue
 ) %>%
 mutate(
   genotype = ifelse(genotype_raw == "B73", "B73", "Inv4m"),
   tissue = tissue_raw
 )

{
 cat("=== CROW2020 DATA ===\n")
 cat("Expression matrix:", nrow(expr_crow), "genes x", ncol(expr_crow), "samples\n")
 cat("Metadata samples:", nrow(meta_crow), "\n")
 cat("\nGenotypes:\n")
 print(table(meta_crow$genotype))
 cat("\nTissues:\n")
 print(table(meta_crow$tissue))
 cat("\nTemperatures:\n")
 print(table(meta_crow$temp))
}

## === CROW2020 DATA ===
## Expression matrix: 26490 genes x 346 samples
## Metadata samples: 346 
## 
## Genotypes:
## 
##   B73 Inv4m 
##   190   156 
## 
## Tissues:
## 
##  V1_Leaf_tissue V1_Primary_root    V3_leaf_base  V3_leaf_sheath  V3_Leaf_tissue 
##              39              41              36              37              37 
## V3_Primary_root V3_S2_leaf_base  V3_S2_leaf_tip     V3_Stem_SAM 
##              42              38              36              40 
## 
## Temperatures:
## 
## cold warm 
##  147  199

# Extract JMJ expression
jmj_in_crow <- jmj_genes$gene[jmj_genes$gene %in% rownames(expr_crow)]
cat("JMJ genes found in Crow2020:", length(jmj_in_crow), "\n")

## JMJ genes found in Crow2020: 3

if (length(jmj_in_crow) > 0) {
 # Extract expression for JMJ genes
 jmj_expr_crow <- expr_crow[jmj_in_crow, , drop = FALSE] %>%
   as.data.frame() %>%
   rownames_to_column("gene") %>%
   pivot_longer(
     cols = -gene,
     names_to = "sample",
     values_to = "logCPM"
   )

 # Match samples to metadata
 jmj_expr_crow <- jmj_expr_crow %>%
   left_join(
     meta_crow %>%
       dplyr::select(sample, genotype, tissue, temp),
     by = "sample"
   ) %>%
   filter(!is.na(genotype)) %>%
   left_join(jmj_genes, by = "gene") %>%
   mutate(
     experiment = "Crow et al., 2020 (Growth Chamber)",
     # Clean up tissue names
     tissue = str_replace(tissue, "_tissue$", "") %>%
       str_replace("_", " ") %>%
       str_replace("Primary root", "Root")
   )

 cat("Crow2020 expression records:", nrow(jmj_expr_crow), "\n")
}

## Crow2020 expression records: 1038

Combine Data

# Combine both experiments
jmj_combined <- bind_rows(
 jmj_expr_psu %>%
   dplyr::select(gene, symbol, logCPM, genotype, tissue, experiment),
 jmj_expr_crow %>%
   dplyr::select(gene, symbol, logCPM, genotype, tissue, experiment)
)

# Order factors
jmj_combined <- jmj_combined %>%
 mutate(
   symbol = factor(symbol, levels = c("jmj2", "jmj4", "jmj21")),
   genotype = factor(genotype, levels = c("B73", "Inv4m")),
   experiment = factor(experiment, levels = c("Crow et al., 2020 (Growth Chamber)", "PA2022 (Field)"))
 )

{
 cat("=== COMBINED DATA ===\n")
 cat("Total records:", nrow(jmj_combined), "\n")
 cat("\nBy experiment:\n")
 print(table(jmj_combined$experiment))
 cat("\nBy gene:\n")
 print(table(jmj_combined$symbol))
}

## === COMBINED DATA ===
## Total records: 1167 
## 
## By experiment:
## 
## Crow et al., 2020 (Growth Chamber)                     PA2022 (Field) 
##                               1038                                129 
## 
## By gene:
## 
##  jmj2  jmj4 jmj21 
##   389   389   389

Total JMJ Cluster Expression Analysis

The JMJ cluster in B73 contains 5 paralogs (jmj9, psi, jmj6, jmj2, jmj4) that are >97% identical. In the V5 annotation, jmj9, psi, jmj6, and jmj2 are merged into a single gene model (Zm00001eb191790), while jmj4 is separate (Zm00001eb191820). The Inv4m inversion removes 4 copies, leaving only jmj4.

To compare total cluster expression between genotypes, we sum the CPM values for jmj2 + jmj4 (which captures all 5 paralogs in the V5 annotation). The expected Inv4m expression under copy-number scaling is B73/5 (one copy out of five).

Calculate Total Cluster Expression

# Convert logCPM to CPM and sum jmj2 + jmj4 per sample
# First, get unique sample info from PA2022
psu_samples <- jmj_expr_psu %>%
  filter(symbol %in% c("jmj2", "jmj4")) %>%
  mutate(CPM = 2^logCPM) %>%
  group_by(sample, genotype, tissue, experiment) %>%
  summarise(total_CPM = sum(CPM), .groups = "drop")

# Then from Crow2020
crow_samples <- jmj_expr_crow %>%
  filter(symbol %in% c("jmj2", "jmj4")) %>%
  mutate(CPM = 2^logCPM) %>%
  group_by(sample, genotype, tissue, experiment) %>%
  summarise(total_CPM = sum(CPM), .groups = "drop")

# Combine and rename B73 to CTRL
total_cluster_samples <- bind_rows(psu_samples, crow_samples) %>%
  mutate(
    genotype = recode(genotype, "B73" = "CTRL"),
    genotype = factor(genotype, levels = c("CTRL", "Inv4m")),
    experiment = factor(experiment, levels = c("Crow et al., 2020 (Growth Chamber)", "PA2022 (Field)"))
  )

{
  cat("=== TOTAL JMJ CLUSTER EXPRESSION ===\n")
  cat("Samples per experiment:\n")
  print(table(total_cluster_samples$experiment, total_cluster_samples$genotype))
  cat("\nMean total CPM by genotype and experiment:\n")
  print(
    total_cluster_samples %>%
      group_by(experiment, genotype) %>%
      summarise(mean_CPM = mean(total_CPM), sd_CPM = sd(total_CPM), n = n(), .groups = "drop")
  )
}

## === TOTAL JMJ CLUSTER EXPRESSION ===
## Samples per experiment:
##                                     
##                                      CTRL Inv4m
##   Crow et al., 2020 (Growth Chamber)  190   156
##   PA2022 (Field)                       20    23
## 
## Mean total CPM by genotype and experiment:
## # A tibble: 4 × 5
##   experiment                         genotype mean_CPM sd_CPM     n
##   <fct>                              <fct>       <dbl>  <dbl> <int>
## 1 Crow et al., 2020 (Growth Chamber) CTRL         95.4  39.1    190
## 2 Crow et al., 2020 (Growth Chamber) Inv4m        13.9  11.7    156
## 3 PA2022 (Field)                     CTRL         94.4  19.2     20
## 4 PA2022 (Field)                     Inv4m        10.4   3.72    23

Calculate Expected Values

# Calculate CTRL mean per experiment and tissue for expected lines
expected_by_tissue <- total_cluster_samples %>%
  filter(genotype == "CTRL") %>%
  group_by(experiment, tissue) %>%
  summarise(
    CTRL_mean = mean(total_CPM),
    expected_Inv4m = mean(total_CPM) / 5,
    .groups = "drop"
  )

# Also calculate overall expected per experiment (for stats)
expected_overall <- total_cluster_samples %>%
  filter(genotype == "CTRL") %>%
  group_by(experiment) %>%
  summarise(
    CTRL_mean = mean(total_CPM),
    expected_Inv4m = mean(total_CPM) / 5,
    .groups = "drop"
  )

{
  cat("=== EXPECTED VALUES (CTRL/5) ===\n")
  cat("\nBy tissue:\n")
  print(expected_by_tissue)
  cat("\nOverall by experiment:\n")
  print(expected_overall)
}

## === EXPECTED VALUES (CTRL/5) ===
## 
## By tissue:
## # A tibble: 13 × 4
##    experiment                         tissue          CTRL_mean expected_Inv4m
##    <fct>                              <chr>               <dbl>          <dbl>
##  1 Crow et al., 2020 (Growth Chamber) V1 Leaf             112.            22.3
##  2 Crow et al., 2020 (Growth Chamber) V1 Primary_root      68.0           13.6
##  3 Crow et al., 2020 (Growth Chamber) V3 Leaf             117.            23.4
##  4 Crow et al., 2020 (Growth Chamber) V3 Primary_root      64.0           12.8
##  5 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_base      81.4           16.3
##  6 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_tip      141.            28.2
##  7 Crow et al., 2020 (Growth Chamber) V3 Stem_SAM          89.3           17.9
##  8 Crow et al., 2020 (Growth Chamber) V3 leaf_base        114.            22.8
##  9 Crow et al., 2020 (Growth Chamber) V3 leaf_sheath       80.2           16.0
## 10 PA2022 (Field)                     Leaf 1              106.            21.1
## 11 PA2022 (Field)                     Leaf 2               99.7           19.9
## 12 PA2022 (Field)                     Leaf 3               91.0           18.2
## 13 PA2022 (Field)                     Leaf 4               82.6           16.5
## 
## Overall by experiment:
## # A tibble: 2 × 3
##   experiment                         CTRL_mean expected_Inv4m
##   <fct>                                  <dbl>          <dbl>
## 1 Crow et al., 2020 (Growth Chamber)      95.4           19.1
## 2 PA2022 (Field)                          94.4           18.9

Statistical Test (Per Tissue)

Linear Contrast t-Test

Test whether expression scales with copy number using a linear contrast:

H₀: μ_CTRL - 5×μ_Inv4m = 0 (expression scales with 5:1 copy number)

This Welch-type t-test properly accounts for uncertainty in both group means and is appropriate for small sample sizes (uses t-distribution with Welch-Satterthwaite degrees of freedom).

# Linear contrast t-test for each experiment AND tissue
# H0: mean_CTRL - 5*mean_Inv4m = 0 (expression scales with copy number)
contrast_by_tissue <- total_cluster_samples %>%
  group_by(experiment, tissue) %>%
  summarise(
    mean_CTRL = mean(total_CPM[genotype == "CTRL"]),
    mean_Inv4m = mean(total_CPM[genotype == "Inv4m"]),
    var_CTRL = var(total_CPM[genotype == "CTRL"]),
    var_Inv4m = var(total_CPM[genotype == "Inv4m"]),
    n_CTRL = sum(genotype == "CTRL"),
    n_Inv4m = sum(genotype == "Inv4m"),
    .groups = "drop"
  ) %>%
  mutate(
    # Observed ratio
    observed_ratio = mean_CTRL / mean_Inv4m,
    # Expected Inv4m under copy number scaling
    expected_Inv4m = mean_CTRL / 5,
    # Percent of expected
    percent_of_expected = (mean_Inv4m / expected_Inv4m) * 100,
    # Linear contrast: mean_CTRL - 5*mean_Inv4m
    contrast = mean_CTRL - 5 * mean_Inv4m,
    # Standard error of contrast (note: coefficient 5 gets squared for variance)
    se_contrast = sqrt(var_CTRL / n_CTRL + 25 * var_Inv4m / n_Inv4m),
    # t statistic
    t_stat = contrast / se_contrast,
    # Welch-Satterthwaite degrees of freedom
    df = (var_CTRL / n_CTRL + 25 * var_Inv4m / n_Inv4m)^2 /
         ((var_CTRL / n_CTRL)^2 / (n_CTRL - 1) + (25 * var_Inv4m / n_Inv4m)^2 / (n_Inv4m - 1)),
    # Two-tailed p-value
    p_value = 2 * pt(abs(t_stat), df = df, lower.tail = FALSE)
  ) %>%
  # Add FDR adjustment across all tissues and significance stars
  mutate(
    fdr = p.adjust(p_value, method = "fdr"),
    sig_stars = case_when(
      fdr < 0.001 ~ "***",
      fdr < 0.01 ~ "**",
      fdr < 0.05 ~ "*",
      TRUE ~ "ns"
    )
  )

# Display as table
contrast_table <- contrast_by_tissue %>%
  dplyr::select(experiment, tissue, n_CTRL, n_Inv4m, observed_ratio, t_stat, df, p_value, fdr, percent_of_expected, sig_stars) %>%
  mutate(
    observed_ratio = sprintf("%.2f:1", observed_ratio),
    t_stat = sprintf("%.2f", t_stat),
    df = sprintf("%.1f", df),
    p_value = sprintf("%.3f", p_value),
    fdr = sprintf("%.3f", fdr),
    percent_of_expected = sprintf("%.1f%%", percent_of_expected)
  )

knitr::kable(
  contrast_table,
  col.names = c("Experiment", "Tissue", "n CTRL", "n Inv4m", "Observed Ratio", "t", "df", "P-value", "FDR", "% of Expected", "Sig"),
  caption = "Linear contrast t-test: H₀ = μ_CTRL - 5×μ_Inv4m = 0 (expression scales with copy number)"
)

Linear contrast t-test: H₀ = μ_CTRL - 5×μ_Inv4m = 0 (expression scales with copy number)
Experiment	Tissue	n CTRL	n Inv4m	Observed Ratio	t	df	P-value	FDR	% of Expected	Sig
Crow et al., 2020 (Growth Chamber)	V1 Leaf	22	17	5.99:1	1.69	36.1	0.100	0.108	83.4%	ns
Crow et al., 2020 (Growth Chamber)	V1 Primary_root	22	19	10.32:1	4.21	34.2	0.000	0.002	48.4%	**
Crow et al., 2020 (Growth Chamber)	V3 Leaf	22	15	6.92:1	1.94	32.3	0.061	0.080	72.3%	ns
Crow et al., 2020 (Growth Chamber)	V3 Primary_root	24	18	7.55:1	2.50	25.2	0.019	0.031	66.2%	*
Crow et al., 2020 (Growth Chamber)	V3 S2_leaf_base	21	17	6.21:1	1.89	28.0	0.069	0.081	80.5%	ns
Crow et al., 2020 (Growth Chamber)	V3 S2_leaf_tip	20	16	6.97:1	2.60	24.0	0.016	0.031	71.7%	*
Crow et al., 2020 (Growth Chamber)	V3 Stem_SAM	21	19	6.11:1	0.58	18.7	0.567	0.567	81.8%	ns
Crow et al., 2020 (Growth Chamber)	V3 leaf_base	18	18	6.58:1	1.97	26.8	0.059	0.080	75.9%	ns
Crow et al., 2020 (Growth Chamber)	V3 leaf_sheath	20	17	7.29:1	2.53	34.7	0.016	0.031	68.6%	*
PA2022 (Field)	Leaf 1	4	7	8.14:1	2.94	8.3	0.018	0.031	61.5%	*
PA2022 (Field)	Leaf 2	6	5	10.04:1	6.00	7.2	0.000	0.003	49.8%	**
PA2022 (Field)	Leaf 3	5	6	9.43:1	4.34	6.3	0.004	0.014	53.0%	*
PA2022 (Field)	Leaf 4	5	5	10.15:1	4.36	6.7	0.004	0.014	49.3%	*

Total Cluster Expression Boxplot

# Define colors (matching ionome style: gold for CTRL, purple for Inv4m)
genotype_colors <- c("CTRL" = "gold", "Inv4m" = "#4a0f82")

# Prepare expected line data - add genotype = "Inv4m" so it dodges to Inv4m position
expected_lines <- expected_by_tissue %>%
  mutate(genotype = factor("Inv4m", levels = c("CTRL", "Inv4m")))

# Prepare significance annotation data
# Position stars just above the dashed red line (expected value)
# Only include significant results (remove "ns")
sig_annotations <- contrast_by_tissue %>%
  dplyr::select(experiment, tissue, sig_stars, expected_Inv4m) %>%
  filter(sig_stars != "ns") %>%  # Only show stars for significant results
  mutate(
    y_pos = expected_Inv4m * 1.08,  # Position just above the expected line
    genotype = factor("Inv4m", levels = c("CTRL", "Inv4m"))
  )

# Create boxplot with tissue on x-axis (matching jmj2/jmj4 plot layout)
p_total_cluster <- ggplot(
  total_cluster_samples,
  aes(x = tissue, y = total_CPM, color = genotype)
) +
  geom_boxplot(
    width = 0.5,
    linewidth = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.7)
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  geom_quasirandom(
    size = 1.5,
    dodge.width = 0.7,
    alpha = 0.7
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  # Add expected line (CTRL/5) for each tissue at the Inv4m position
  geom_errorbar(
    data = expected_lines,
    aes(x = tissue, ymin = expected_Inv4m, ymax = expected_Inv4m, group = genotype),
    inherit.aes = FALSE,
    position = position_dodge(width = 0.7),
    width = 0.4,
    linewidth = 1,
    linetype = "dashed",
    color = "red"
  ) +
  # Add significance stars just above the expected line (only for significant)
  geom_text(
    data = sig_annotations,
    aes(x = tissue, y = y_pos, label = sig_stars, group = genotype),
    inherit.aes = FALSE,
    position = position_dodge(width = 0.7),
    size = 10,
    fontface = "bold",
    color = "red"
  ) +
  facet_grid(. ~ experiment, scales = "free_x", space = "free_x") +
  scale_color_manual(values = genotype_colors, name = "Genotype") +
  theme_classic(base_size = 14) +
  theme(
    axis.text.x = element_text(
      angle = 30, hjust = 1,
      color = "black", face = "bold", size = 14
    ),
    axis.text.y = element_text(color = "black", face = "bold", size = 14),
    axis.title = element_text(face = "bold", size = 16),
    strip.text = element_text(face = "bold", size = 18),
    strip.background = element_rect(fill = "grey95", color = NA),
    legend.position = c(0.9, 0.9),
    legend.justification = c(1, 1),
    legend.background = element_rect(fill = "white", color = NA),
    legend.text = element_text(face = "bold", size = 12),
    legend.title = element_text(face = "bold", size = 12),
    panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
  ) +
  labs(
    x = NULL,
    y = "Total JMJ Cluster Expression\n(CPM)",
    caption = "Dashed red line = expected Inv4m (CTRL/5). Significance: * FDR<0.05, ** FDR<0.01, *** FDR<0.001"
  )

print(p_total_cluster)

Save Total Cluster Analysis Outputs

# Save total cluster expression plot
saveRDS(p_total_cluster, file.path(paths$figures, "jmj_total_cluster_expression.rds"))
ggsave(
  file.path(paths$figures, "jmj_total_cluster_expression.pdf"),
  p_total_cluster,
  width = 14, height = 7
)
ggsave(
  file.path(paths$figures, "jmj_total_cluster_expression.png"),
  p_total_cluster,
  width = 14, height = 7, dpi = 300
)

# Save per-tissue linear contrast t-test results
write_csv(
  contrast_by_tissue %>%
    dplyr::select(experiment, tissue, n_CTRL, n_Inv4m, mean_CTRL, mean_Inv4m,
                  observed_ratio, expected_Inv4m, percent_of_expected,
                  contrast, se_contrast, t_stat, df, p_value, fdr, sig_stars),
  file.path(paths$intermediate, "jmj_total_cluster_contrast_test.csv")
)

# Save sample-level data
write_csv(
  total_cluster_samples,
  file.path(paths$intermediate, "jmj_total_cluster_samples.csv")
)

{
  cat("=== TOTAL CLUSTER ANALYSIS SAVED ===\n")
  cat("  jmj_total_cluster_expression.rds/pdf/png\n")
  cat("  jmj_total_cluster_contrast_test.csv\n")
  cat("  jmj_total_cluster_samples.csv\n")
}

## === TOTAL CLUSTER ANALYSIS SAVED ===
##   jmj_total_cluster_expression.rds/pdf/png
##   jmj_total_cluster_contrast_test.csv
##   jmj_total_cluster_samples.csv

Create Expression Boxplot (All 3 genes)

# Define colors (matching ionome style: gold for B73, purple for Inv4m)
genotype_colors <- c("B73" = "gold", "Inv4m" = "#4a0f82")

p_jmj_expr <- ggplot(
  jmj_combined,
  aes(x = tissue, y = logCPM, color = genotype)
) +
  geom_boxplot(
    width = 0.5,
    linewidth = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.7)
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  geom_quasirandom(
    size = 1.5,
    dodge.width = 0.7,
    alpha = 0.7
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  facet_grid(symbol ~ experiment, scales = "free_x", space = "free_x") +
  scale_color_manual(values = genotype_colors, name = "Genotype") +
  theme_classic(base_size = 14) +
  theme(
    axis.text.x = element_text(
      angle = 30, hjust = 1,
      color = "black", face = "bold", size = 14
    ),
    axis.text.y = element_text(color = "black", face = "bold", size = 14),
    axis.title = element_text(face = "bold", size = 16),
    strip.text.x = element_text(face = "bold", size = 16),
    strip.text.y = element_text(face = "bold.italic", size = 16),
    strip.background = element_rect(fill = "grey95", color = NA),
    legend.position = c(0.9, 0.9),
    legend.justification = c(1, 1),
    legend.background = element_rect(fill = "white", color = NA),
    legend.text = element_text(face = "bold", size = 12),
    legend.title = element_text(face = "bold", size = 12),
    panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
  ) +
  labs(
    x = NULL,
    y = expression(atop(bold(log[2]), bold("(CPM)")))
  )

print(p_jmj_expr)

Create Expression Boxplot (jmj2 and jmj4 only)

# Filter for jmj2 and jmj4 only
jmj_cluster <- jmj_combined %>%
  filter(symbol %in% c("jmj2", "jmj4")) %>%
  mutate(symbol = factor(symbol, levels = c("jmj2", "jmj4")))

p_jmj_cluster <- ggplot(
  jmj_cluster,
  aes(x = tissue, y = logCPM, color = genotype)
) +
  geom_boxplot(
    width = 0.5,
    linewidth = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.7)
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  geom_quasirandom(
    size = 1.5,
    dodge.width = 0.7,
    alpha = 0.7
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  facet_grid(symbol ~ experiment, scales = "free_x", space = "free_x") +
  scale_color_manual(values = genotype_colors, name = "Genotype") +
  theme_classic(base_size = 14) +
  theme(
    axis.text.x = element_text(
      angle = 30, hjust = 1,
      color = "black", face = "bold", size = 14
    ),
    axis.text.y = element_text(color = "black", face = "bold", size = 14),
    axis.title = element_text(face = "bold", size = 16),
    strip.text.x = element_text(face = "bold", size = 18),
    strip.text.y = element_text(face = "bold.italic", size = 18),
    strip.background = element_rect(fill = "grey95", color = NA),
    legend.position = c(0.9, 0.9),
    legend.justification = c(1, 1),
    legend.background = element_rect(fill = "white", color = NA),
    legend.text = element_text(face = "bold", size = 12),
    legend.title = element_text(face = "bold", size = 12),
    panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
  ) +
  labs(
    x = NULL,
    y = expression(atop(bold(log[2]), bold("(CPM)")))
  )

print(p_jmj_cluster)

Statistical Summary

# Calculate means and fold changes
jmj_summary <- jmj_combined %>%
 group_by(experiment, tissue, symbol, genotype) %>%
 summarise(
   mean_logCPM = mean(logCPM, na.rm = TRUE),
   sd_logCPM = sd(logCPM, na.rm = TRUE),
   n = n(),
   .groups = "drop"
 ) %>%
 pivot_wider(
   names_from = genotype,
   values_from = c(mean_logCPM, sd_logCPM, n),
   names_sep = "_"
 ) %>%
 mutate(
   log2FC = mean_logCPM_Inv4m - mean_logCPM_B73,
   direction = ifelse(log2FC < 0, "Down", "Up")
 )

{
 cat("=== EXPRESSION SUMMARY ===\n")
 print(
   jmj_summary %>%
     dplyr::select(experiment, tissue, symbol, log2FC, direction) %>%
     arrange(symbol, experiment, tissue)
 )
}

## === EXPRESSION SUMMARY ===
## # A tibble: 39 × 5
##    experiment                         tissue          symbol log2FC direction
##    <fct>                              <chr>           <fct>   <dbl> <chr>    
##  1 Crow et al., 2020 (Growth Chamber) V1 Leaf         jmj2    -2.60 Down     
##  2 Crow et al., 2020 (Growth Chamber) V1 Primary_root jmj2    -4.11 Down     
##  3 Crow et al., 2020 (Growth Chamber) V3 Leaf         jmj2    -2.84 Down     
##  4 Crow et al., 2020 (Growth Chamber) V3 Primary_root jmj2    -3.92 Down     
##  5 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_base jmj2    -2.63 Down     
##  6 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_tip  jmj2    -3.13 Down     
##  7 Crow et al., 2020 (Growth Chamber) V3 Stem_SAM     jmj2    -3.40 Down     
##  8 Crow et al., 2020 (Growth Chamber) V3 leaf_base    jmj2    -2.95 Down     
##  9 Crow et al., 2020 (Growth Chamber) V3 leaf_sheath  jmj2    -3.13 Down     
## 10 PA2022 (Field)                     Leaf 1          jmj2    -3.16 Down     
## # ℹ 29 more rows

Save Outputs

# Save all 3 genes plot
saveRDS(p_jmj_expr, file.path(paths$figures, "jmj_expression_boxplot.rds"))
ggsave(
  file.path(paths$figures, "jmj_expression_boxplot.pdf"),
  p_jmj_expr,
  width = 14, height = 10
)
ggsave(
  file.path(paths$figures, "jmj_expression_boxplot.png"),
  p_jmj_expr,
  width = 14, height = 10, dpi = 300
)

# Save jmj2/jmj4 cluster plot
saveRDS(p_jmj_cluster, file.path(paths$figures, "jmj2_jmj4_expression_boxplot.rds"))
ggsave(
  file.path(paths$figures, "jmj2_jmj4_expression_boxplot.pdf"),
  p_jmj_cluster,
  width = 14, height = 7
)
ggsave(
  file.path(paths$figures, "jmj2_jmj4_expression_boxplot.png"),
  p_jmj_cluster,
  width = 14, height = 7, dpi = 300
)

# Save summary table
write_csv(
  jmj_summary,
  file.path(paths$intermediate, "jmj_expression_summary.csv")
)

{
  cat("=== SAVED OUTPUTS ===\n")
  cat("All 3 genes:\n")
  cat("  jmj_expression_boxplot.rds/pdf/png\n")
  cat("\njmj2/jmj4 cluster only:\n")
  cat("  jmj2_jmj4_expression_boxplot.rds/pdf/png\n")
  cat("\nSummary table:\n")
  cat("  jmj_expression_summary.csv\n")
}

## === SAVED OUTPUTS ===
## All 3 genes:
##   jmj_expression_boxplot.rds/pdf/png
## 
## jmj2/jmj4 cluster only:
##   jmj2_jmj4_expression_boxplot.rds/pdf/png
## 
## Summary table:
##   jmj_expression_summary.csv

Cell Proliferation Genes Expression (Supplementary Figure)

This section plots expression of two cell proliferation-related genes across tissues:

pcna2 (Zm00001eb192050): Proliferating cell nuclear antigen 2, a DNA replication processivity factor. Upregulated in Inv4m (log2FC = 2.96, p < 0.001).
smo4 (Zm00001eb192850): Ribosome biogenesis protein NOP53 (SMO4 ortholog). The Arabidopsis ortholog regulates cell proliferation and organ size (Zhang et al. 2015 JIPB). Downregulated in Inv4m (log2FC = -1.98).

Define Cell Proliferation Genes

cellprolif_genes <- tibble(
  gene = c("Zm00001eb192050", "Zm00001eb192850"),
  symbol = c("pcna2", "smo4"),
  description = c(
    "Proliferating cell nuclear antigen 2",
    "Ribosome biogenesis protein NOP53 (SMO4 ortholog)"
  )
)

cat("Cell proliferation genes:\n")

## Cell proliferation genes:

print(cellprolif_genes)

## # A tibble: 2 × 3
##   gene            symbol description                                      
##   <chr>           <chr>  <chr>                                            
## 1 Zm00001eb192050 pcna2  Proliferating cell nuclear antigen 2             
## 2 Zm00001eb192850 smo4   Ribosome biogenesis protein NOP53 (SMO4 ortholog)

Extract Cell Proliferation Gene Expression

# Extract from PA2022
cellprolif_in_psu <- cellprolif_genes$gene[cellprolif_genes$gene %in% rownames(expr_psu)]
cat("Cell proliferation genes found in PA2022:", length(cellprolif_in_psu), "\n")

## Cell proliferation genes found in PA2022: 2

if (length(cellprolif_in_psu) > 0) {
  cellprolif_expr_psu <- expr_psu[cellprolif_in_psu, , drop = FALSE] %>%
    as.data.frame() %>%
    rownames_to_column("gene") %>%
    pivot_longer(
      cols = -gene,
      names_to = "sample",
      values_to = "logCPM"
    ) %>%
    left_join(
      meta_psu %>%
        dplyr::select(library, Genotype, leaf_tissue, Treatment) %>%
        mutate(sample = library),
      by = "sample"
    ) %>%
    filter(!is.na(Genotype)) %>%
    left_join(cellprolif_genes, by = "gene") %>%
    mutate(
      experiment = "PA2022 (Field)",
      tissue = paste0("Leaf ", leaf_tissue),
      genotype = ifelse(Genotype == "CTRL", "B73", "Inv4m")
    )

  cat("PA2022 expression records:", nrow(cellprolif_expr_psu), "\n")
}

## PA2022 expression records: 86

# Extract from Crow2020
cellprolif_in_crow <- cellprolif_genes$gene[cellprolif_genes$gene %in% rownames(expr_crow)]
cat("Cell proliferation genes found in Crow2020:", length(cellprolif_in_crow), "\n")

## Cell proliferation genes found in Crow2020: 2

if (length(cellprolif_in_crow) > 0) {
  cellprolif_expr_crow <- expr_crow[cellprolif_in_crow, , drop = FALSE] %>%
    as.data.frame() %>%
    rownames_to_column("gene") %>%
    pivot_longer(
      cols = -gene,
      names_to = "sample",
      values_to = "logCPM"
    ) %>%
    left_join(
      meta_crow %>%
        dplyr::select(sample, genotype, tissue, temp),
      by = "sample"
    ) %>%
    filter(!is.na(genotype)) %>%
    left_join(cellprolif_genes, by = "gene") %>%
    mutate(
      experiment = "Crow et al., 2020 (Growth Chamber)",
      tissue = str_replace(tissue, "_tissue$", "") %>%
        str_replace("_", " ") %>%
        str_replace("Primary root", "Root")
    )

  cat("Crow2020 expression records:", nrow(cellprolif_expr_crow), "\n")
}

## Crow2020 expression records: 692

# Combine both experiments
cellprolif_combined <- bind_rows(
  cellprolif_expr_psu %>%
    dplyr::select(gene, symbol, logCPM, genotype, tissue, experiment),
  cellprolif_expr_crow %>%
    dplyr::select(gene, symbol, logCPM, genotype, tissue, experiment)
)

# Order factors (pcna2 first, then smo4)
cellprolif_combined <- cellprolif_combined %>%
  mutate(
    symbol = factor(symbol, levels = c("pcna2", "smo4")),
    genotype = factor(genotype, levels = c("B73", "Inv4m")),
    experiment = factor(experiment, levels = c("Crow et al., 2020 (Growth Chamber)", "PA2022 (Field)"))
  )

{
  cat("=== CELL PROLIFERATION GENES COMBINED DATA ===\n")
  cat("Total records:", nrow(cellprolif_combined), "\n")
  cat("\nBy gene:\n")
  print(table(cellprolif_combined$symbol))
  cat("\nBy experiment:\n")
  print(table(cellprolif_combined$experiment))
}

## === CELL PROLIFERATION GENES COMBINED DATA ===
## Total records: 778 
## 
## By gene:
## 
## pcna2  smo4 
##   389   389 
## 
## By experiment:
## 
## Crow et al., 2020 (Growth Chamber)                     PA2022 (Field) 
##                                692                                 86

Cell Proliferation Genes Boxplot

p_cellprolif_expr <- ggplot(
  cellprolif_combined,
  aes(x = tissue, y = logCPM, color = genotype)
) +
  geom_boxplot(
    width = 0.5,
    linewidth = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.7)
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  geom_quasirandom(
    size = 1.5,
    dodge.width = 0.7,
    alpha = 0.7
  ) %>%
  with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
  facet_grid(symbol ~ experiment, scales = "free_x", space = "free_x") +
  scale_color_manual(values = genotype_colors, name = "Genotype") +
  theme_classic(base_size = 14) +
  theme(
    axis.text.x = element_text(
      angle = 30, hjust = 1,
      color = "black", face = "bold", size = 14
    ),
    axis.text.y = element_text(color = "black", face = "bold", size = 14),
    axis.title = element_text(face = "bold", size = 16),
    strip.text.x = element_text(face = "bold", size = 18),
    strip.text.y = element_text(face = "bold.italic", size = 18),
    strip.background = element_rect(fill = "grey95", color = NA),
    legend.position = c(0.9, 0.9),
    legend.justification = c(1, 1),
    legend.background = element_rect(fill = "white", color = NA),
    legend.text = element_text(face = "bold", size = 12),
    legend.title = element_text(face = "bold", size = 12),
    panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
  ) +
  labs(
    x = NULL,
    y = expression(atop(bold(log[2]), bold("(CPM)")))
  )

print(p_cellprolif_expr)

Cell Proliferation Genes Statistics

cellprolif_summary <- cellprolif_combined %>%
  group_by(experiment, tissue, symbol, genotype) %>%
  summarise(
    mean_logCPM = mean(logCPM, na.rm = TRUE),
    sd_logCPM = sd(logCPM, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = genotype,
    values_from = c(mean_logCPM, sd_logCPM, n),
    names_sep = "_"
  ) %>%
  mutate(
    log2FC = mean_logCPM_Inv4m - mean_logCPM_B73,
    direction = ifelse(log2FC < 0, "Down", "Up")
  )

{
  cat("=== CELL PROLIFERATION GENES EXPRESSION SUMMARY ===\n")
  print(
    cellprolif_summary %>%
      dplyr::select(experiment, tissue, symbol, log2FC, direction) %>%
      arrange(symbol, experiment, tissue)
  )
}

## === CELL PROLIFERATION GENES EXPRESSION SUMMARY ===
## # A tibble: 26 × 5
##    experiment                         tissue          symbol   log2FC direction
##    <fct>                              <chr>           <fct>     <dbl> <chr>    
##  1 Crow et al., 2020 (Growth Chamber) V1 Leaf         pcna2  -0.370   Down     
##  2 Crow et al., 2020 (Growth Chamber) V1 Primary_root pcna2  -0.00913 Down     
##  3 Crow et al., 2020 (Growth Chamber) V3 Leaf         pcna2   0.274   Up       
##  4 Crow et al., 2020 (Growth Chamber) V3 Primary_root pcna2   0.304   Up       
##  5 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_base pcna2   1.04    Up       
##  6 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_tip  pcna2   0.346   Up       
##  7 Crow et al., 2020 (Growth Chamber) V3 Stem_SAM     pcna2   0.457   Up       
##  8 Crow et al., 2020 (Growth Chamber) V3 leaf_base    pcna2   0.719   Up       
##  9 Crow et al., 2020 (Growth Chamber) V3 leaf_sheath  pcna2   0.741   Up       
## 10 PA2022 (Field)                     Leaf 1          pcna2   3.15    Up       
## # ℹ 16 more rows

Save Cell Proliferation Outputs

# Save cell proliferation plot (pcna2 + smo4)
saveRDS(p_cellprolif_expr, file.path(paths$figures, "cellprolif_pcna2_smo4_expression_boxplot.rds"))
ggsave(
  file.path(paths$figures, "cellprolif_pcna2_smo4_expression_boxplot.pdf"),
  p_cellprolif_expr,
  width = 14, height = 7
)
ggsave(
  file.path(paths$figures, "cellprolif_pcna2_smo4_expression_boxplot.png"),
  p_cellprolif_expr,
  width = 14, height = 7, dpi = 300
)

# Save summary table
write_csv(
  cellprolif_summary,
  file.path(paths$intermediate, "cellprolif_pcna2_smo4_expression_summary.csv")
)

{
  cat("=== CELL PROLIFERATION SAVED OUTPUTS ===\n")
  cat("  cellprolif_pcna2_smo4_expression_boxplot.rds/pdf/png\n")
  cat("  cellprolif_pcna2_smo4_expression_summary.csv\n")
}

## === CELL PROLIFERATION SAVED OUTPUTS ===
##   cellprolif_pcna2_smo4_expression_boxplot.rds/pdf/png
##   cellprolif_pcna2_smo4_expression_summary.csv

Session Info

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggbeeswarm_0.7.3 ggfx_1.0.3       ggpubr_0.6.2     lubridate_1.9.4 
##  [5] forcats_1.0.1    stringr_1.6.0    dplyr_1.1.4      purrr_1.2.0     
##  [9] readr_2.1.6      tidyr_1.3.2      tibble_3.3.0     ggplot2_4.0.1   
## [13] tidyverse_2.0.0  here_1.0.2      
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       beeswarm_0.4.0     xfun_0.55          bslib_0.9.0       
##  [5] rstatix_0.7.3      tzdb_0.5.0         vctrs_0.6.5        tools_4.5.2       
##  [9] generics_0.1.4     parallel_4.5.2     pkgconfig_2.0.3    RColorBrewer_1.1-3
## [13] S7_0.2.1           lifecycle_1.0.4    compiler_4.5.2     farver_2.1.2      
## [17] textshaping_1.0.4  carData_3.0-5      vipor_0.4.7        htmltools_0.5.9   
## [21] sass_0.4.10        yaml_2.3.12        Formula_1.2-5      crayon_1.5.3      
## [25] pillar_1.11.1      car_3.1-3          jquerylib_0.1.4    cachem_1.1.0      
## [29] magick_2.9.0       abind_1.4-8        tidyselect_1.2.1   digest_0.6.39     
## [33] stringi_1.8.7      labeling_0.4.3     rprojroot_2.1.1    fastmap_1.2.0     
## [37] grid_4.5.2         cli_3.6.5          magrittr_2.0.4     dichromat_2.0-0.1 
## [41] utf8_1.2.6         broom_1.0.11       withr_3.0.2        scales_1.4.0      
## [45] backports_1.5.0    bit64_4.6.0-1      timechange_0.3.0   rmarkdown_2.30    
## [49] bit_4.6.0          otel_0.2.0         ggsignif_0.6.4     ragg_1.5.0        
## [53] hms_1.1.4          evaluate_1.0.5     knitr_1.51         rlang_1.1.6       
## [57] Rcpp_1.1.0         glue_1.8.0         vroom_1.6.7        jsonlite_2.0.0    
## [61] R6_2.6.1           systemfonts_1.3.1

JMJ Cluster Expression Boxplot

Analysis Pipeline

2026-03-19