This script creates an expression boxplot for the JMJ cluster genes (jmj2, jmj4, jmj21) comparing Inv4m vs CTRL across all tissues in:
The output is saved as an RDS file for composition with the JMJ microsynteny figure.
library(tidyverse)
library(ggplot2)
library(ggpubr)
library(ggfx)
library(ggbeeswarm)
library(here)
jmj_genes <- tibble(
gene = c("Zm00001eb191790", "Zm00001eb191820", "Zm00001eb190750"),
symbol = c("jmj2", "jmj4", "jmj21"),
description = c(
"JUMONJI-transcription factor 2",
"JUMONJI-transcription factor 4",
"JUMONJI-transcription factor 21"
)
)
cat("JMJ cluster genes:\n")
## JMJ cluster genes:
print(jmj_genes)
## # A tibble: 3 × 3
## gene symbol description
## <chr> <chr> <chr>
## 1 Zm00001eb191790 jmj2 JUMONJI-transcription factor 2
## 2 Zm00001eb191820 jmj4 JUMONJI-transcription factor 4
## 3 Zm00001eb190750 jmj21 JUMONJI-transcription factor 21
# Load expression matrix (logCPM)
expr_psu <- readRDS(file.path(paths$intermediate, "normalized_expression_logCPM_leaf_trt.rds"))
# Load metadata
meta_psu <- read.csv(file.path(paths$data, "PSU2022_metadata.csv"))
{
cat("=== PA2022 DATA ===\n")
cat("Expression matrix:", nrow(expr_psu), "genes x", ncol(expr_psu), "samples\n")
cat("Metadata samples:", nrow(meta_psu), "\n")
cat("\nGenotypes:\n")
print(table(meta_psu$Genotype))
cat("\nLeaf positions:\n")
print(table(meta_psu$leaf_tissue))
}
## === PA2022 DATA ===
## Expression matrix: 24249 genes x 43 samples
## Metadata samples: 64
##
## Genotypes:
##
## CTRL INV4m
## 32 32
##
## Leaf positions:
##
## 1 2 3 4
## 16 16 16 16
# Extract JMJ expression
jmj_in_psu <- jmj_genes$gene[jmj_genes$gene %in% rownames(expr_psu)]
cat("JMJ genes found in PA2022:", length(jmj_in_psu), "\n")
## JMJ genes found in PA2022: 3
if (length(jmj_in_psu) > 0) {
# Extract expression for JMJ genes
jmj_expr_psu <- expr_psu[jmj_in_psu, , drop = FALSE] %>%
as.data.frame() %>%
rownames_to_column("gene") %>%
pivot_longer(
cols = -gene,
names_to = "sample",
values_to = "logCPM"
)
# Match samples to metadata
# Sample IDs in expression matrix should match library column in metadata
jmj_expr_psu <- jmj_expr_psu %>%
left_join(
meta_psu %>%
dplyr::select(library, Genotype, leaf_tissue, Treatment) %>%
mutate(sample = library),
by = "sample"
) %>%
filter(!is.na(Genotype)) %>%
left_join(jmj_genes, by = "gene") %>%
mutate(
experiment = "PA2022 (Field)",
tissue = paste0("Leaf ", leaf_tissue),
genotype = ifelse(Genotype == "CTRL", "B73", "Inv4m")
)
cat("PA2022 expression records:", nrow(jmj_expr_psu), "\n")
}
## PA2022 expression records: 129
# Load expression matrix (logCPM)
expr_crow <- readRDS(file.path(paths$intermediate, "normalized_expression_logCPM_crow2020.rds"))
# Load metadata
meta_crow <- read.csv(file.path(paths$data, "SraRunTable.csv"), check.names = FALSE)
# Standardize column names - use Run (SRR ID) as sample ID
meta_crow <- meta_crow %>%
dplyr::rename(
sample = Run,
genotype_raw = genotype,
tissue_raw = tissue
) %>%
mutate(
genotype = ifelse(genotype_raw == "B73", "B73", "Inv4m"),
tissue = tissue_raw
)
{
cat("=== CROW2020 DATA ===\n")
cat("Expression matrix:", nrow(expr_crow), "genes x", ncol(expr_crow), "samples\n")
cat("Metadata samples:", nrow(meta_crow), "\n")
cat("\nGenotypes:\n")
print(table(meta_crow$genotype))
cat("\nTissues:\n")
print(table(meta_crow$tissue))
cat("\nTemperatures:\n")
print(table(meta_crow$temp))
}
## === CROW2020 DATA ===
## Expression matrix: 26490 genes x 346 samples
## Metadata samples: 346
##
## Genotypes:
##
## B73 Inv4m
## 190 156
##
## Tissues:
##
## V1_Leaf_tissue V1_Primary_root V3_leaf_base V3_leaf_sheath V3_Leaf_tissue
## 39 41 36 37 37
## V3_Primary_root V3_S2_leaf_base V3_S2_leaf_tip V3_Stem_SAM
## 42 38 36 40
##
## Temperatures:
##
## cold warm
## 147 199
# Extract JMJ expression
jmj_in_crow <- jmj_genes$gene[jmj_genes$gene %in% rownames(expr_crow)]
cat("JMJ genes found in Crow2020:", length(jmj_in_crow), "\n")
## JMJ genes found in Crow2020: 3
if (length(jmj_in_crow) > 0) {
# Extract expression for JMJ genes
jmj_expr_crow <- expr_crow[jmj_in_crow, , drop = FALSE] %>%
as.data.frame() %>%
rownames_to_column("gene") %>%
pivot_longer(
cols = -gene,
names_to = "sample",
values_to = "logCPM"
)
# Match samples to metadata
jmj_expr_crow <- jmj_expr_crow %>%
left_join(
meta_crow %>%
dplyr::select(sample, genotype, tissue, temp),
by = "sample"
) %>%
filter(!is.na(genotype)) %>%
left_join(jmj_genes, by = "gene") %>%
mutate(
experiment = "Crow et al., 2020 (Growth Chamber)",
# Clean up tissue names
tissue = str_replace(tissue, "_tissue$", "") %>%
str_replace("_", " ") %>%
str_replace("Primary root", "Root")
)
cat("Crow2020 expression records:", nrow(jmj_expr_crow), "\n")
}
## Crow2020 expression records: 1038
# Combine both experiments
jmj_combined <- bind_rows(
jmj_expr_psu %>%
dplyr::select(gene, symbol, logCPM, genotype, tissue, experiment),
jmj_expr_crow %>%
dplyr::select(gene, symbol, logCPM, genotype, tissue, experiment)
)
# Order factors
jmj_combined <- jmj_combined %>%
mutate(
symbol = factor(symbol, levels = c("jmj2", "jmj4", "jmj21")),
genotype = factor(genotype, levels = c("B73", "Inv4m")),
experiment = factor(experiment, levels = c("Crow et al., 2020 (Growth Chamber)", "PA2022 (Field)"))
)
{
cat("=== COMBINED DATA ===\n")
cat("Total records:", nrow(jmj_combined), "\n")
cat("\nBy experiment:\n")
print(table(jmj_combined$experiment))
cat("\nBy gene:\n")
print(table(jmj_combined$symbol))
}
## === COMBINED DATA ===
## Total records: 1167
##
## By experiment:
##
## Crow et al., 2020 (Growth Chamber) PA2022 (Field)
## 1038 129
##
## By gene:
##
## jmj2 jmj4 jmj21
## 389 389 389
The JMJ cluster in B73 contains 5 paralogs (jmj9, psi, jmj6, jmj2, jmj4) that are >97% identical. In the V5 annotation, jmj9, psi, jmj6, and jmj2 are merged into a single gene model (Zm00001eb191790), while jmj4 is separate (Zm00001eb191820). The Inv4m inversion removes 4 copies, leaving only jmj4.
To compare total cluster expression between genotypes, we sum the CPM values for jmj2 + jmj4 (which captures all 5 paralogs in the V5 annotation). The expected Inv4m expression under copy-number scaling is B73/5 (one copy out of five).
# Convert logCPM to CPM and sum jmj2 + jmj4 per sample
# First, get unique sample info from PA2022
psu_samples <- jmj_expr_psu %>%
filter(symbol %in% c("jmj2", "jmj4")) %>%
mutate(CPM = 2^logCPM) %>%
group_by(sample, genotype, tissue, experiment) %>%
summarise(total_CPM = sum(CPM), .groups = "drop")
# Then from Crow2020
crow_samples <- jmj_expr_crow %>%
filter(symbol %in% c("jmj2", "jmj4")) %>%
mutate(CPM = 2^logCPM) %>%
group_by(sample, genotype, tissue, experiment) %>%
summarise(total_CPM = sum(CPM), .groups = "drop")
# Combine and rename B73 to CTRL
total_cluster_samples <- bind_rows(psu_samples, crow_samples) %>%
mutate(
genotype = recode(genotype, "B73" = "CTRL"),
genotype = factor(genotype, levels = c("CTRL", "Inv4m")),
experiment = factor(experiment, levels = c("Crow et al., 2020 (Growth Chamber)", "PA2022 (Field)"))
)
{
cat("=== TOTAL JMJ CLUSTER EXPRESSION ===\n")
cat("Samples per experiment:\n")
print(table(total_cluster_samples$experiment, total_cluster_samples$genotype))
cat("\nMean total CPM by genotype and experiment:\n")
print(
total_cluster_samples %>%
group_by(experiment, genotype) %>%
summarise(mean_CPM = mean(total_CPM), sd_CPM = sd(total_CPM), n = n(), .groups = "drop")
)
}
## === TOTAL JMJ CLUSTER EXPRESSION ===
## Samples per experiment:
##
## CTRL Inv4m
## Crow et al., 2020 (Growth Chamber) 190 156
## PA2022 (Field) 20 23
##
## Mean total CPM by genotype and experiment:
## # A tibble: 4 × 5
## experiment genotype mean_CPM sd_CPM n
## <fct> <fct> <dbl> <dbl> <int>
## 1 Crow et al., 2020 (Growth Chamber) CTRL 95.4 39.1 190
## 2 Crow et al., 2020 (Growth Chamber) Inv4m 13.9 11.7 156
## 3 PA2022 (Field) CTRL 94.4 19.2 20
## 4 PA2022 (Field) Inv4m 10.4 3.72 23
# Calculate CTRL mean per experiment and tissue for expected lines
expected_by_tissue <- total_cluster_samples %>%
filter(genotype == "CTRL") %>%
group_by(experiment, tissue) %>%
summarise(
CTRL_mean = mean(total_CPM),
expected_Inv4m = mean(total_CPM) / 5,
.groups = "drop"
)
# Also calculate overall expected per experiment (for stats)
expected_overall <- total_cluster_samples %>%
filter(genotype == "CTRL") %>%
group_by(experiment) %>%
summarise(
CTRL_mean = mean(total_CPM),
expected_Inv4m = mean(total_CPM) / 5,
.groups = "drop"
)
{
cat("=== EXPECTED VALUES (CTRL/5) ===\n")
cat("\nBy tissue:\n")
print(expected_by_tissue)
cat("\nOverall by experiment:\n")
print(expected_overall)
}
## === EXPECTED VALUES (CTRL/5) ===
##
## By tissue:
## # A tibble: 13 × 4
## experiment tissue CTRL_mean expected_Inv4m
## <fct> <chr> <dbl> <dbl>
## 1 Crow et al., 2020 (Growth Chamber) V1 Leaf 112. 22.3
## 2 Crow et al., 2020 (Growth Chamber) V1 Primary_root 68.0 13.6
## 3 Crow et al., 2020 (Growth Chamber) V3 Leaf 117. 23.4
## 4 Crow et al., 2020 (Growth Chamber) V3 Primary_root 64.0 12.8
## 5 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_base 81.4 16.3
## 6 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_tip 141. 28.2
## 7 Crow et al., 2020 (Growth Chamber) V3 Stem_SAM 89.3 17.9
## 8 Crow et al., 2020 (Growth Chamber) V3 leaf_base 114. 22.8
## 9 Crow et al., 2020 (Growth Chamber) V3 leaf_sheath 80.2 16.0
## 10 PA2022 (Field) Leaf 1 106. 21.1
## 11 PA2022 (Field) Leaf 2 99.7 19.9
## 12 PA2022 (Field) Leaf 3 91.0 18.2
## 13 PA2022 (Field) Leaf 4 82.6 16.5
##
## Overall by experiment:
## # A tibble: 2 × 3
## experiment CTRL_mean expected_Inv4m
## <fct> <dbl> <dbl>
## 1 Crow et al., 2020 (Growth Chamber) 95.4 19.1
## 2 PA2022 (Field) 94.4 18.9
Test whether expression scales with copy number using a linear contrast:
H₀: μ_CTRL - 5×μ_Inv4m = 0 (expression scales with 5:1 copy number)
This Welch-type t-test properly accounts for uncertainty in both group means and is appropriate for small sample sizes (uses t-distribution with Welch-Satterthwaite degrees of freedom).
# Linear contrast t-test for each experiment AND tissue
# H0: mean_CTRL - 5*mean_Inv4m = 0 (expression scales with copy number)
contrast_by_tissue <- total_cluster_samples %>%
group_by(experiment, tissue) %>%
summarise(
mean_CTRL = mean(total_CPM[genotype == "CTRL"]),
mean_Inv4m = mean(total_CPM[genotype == "Inv4m"]),
var_CTRL = var(total_CPM[genotype == "CTRL"]),
var_Inv4m = var(total_CPM[genotype == "Inv4m"]),
n_CTRL = sum(genotype == "CTRL"),
n_Inv4m = sum(genotype == "Inv4m"),
.groups = "drop"
) %>%
mutate(
# Observed ratio
observed_ratio = mean_CTRL / mean_Inv4m,
# Expected Inv4m under copy number scaling
expected_Inv4m = mean_CTRL / 5,
# Percent of expected
percent_of_expected = (mean_Inv4m / expected_Inv4m) * 100,
# Linear contrast: mean_CTRL - 5*mean_Inv4m
contrast = mean_CTRL - 5 * mean_Inv4m,
# Standard error of contrast (note: coefficient 5 gets squared for variance)
se_contrast = sqrt(var_CTRL / n_CTRL + 25 * var_Inv4m / n_Inv4m),
# t statistic
t_stat = contrast / se_contrast,
# Welch-Satterthwaite degrees of freedom
df = (var_CTRL / n_CTRL + 25 * var_Inv4m / n_Inv4m)^2 /
((var_CTRL / n_CTRL)^2 / (n_CTRL - 1) + (25 * var_Inv4m / n_Inv4m)^2 / (n_Inv4m - 1)),
# Two-tailed p-value
p_value = 2 * pt(abs(t_stat), df = df, lower.tail = FALSE)
) %>%
# Add FDR adjustment across all tissues and significance stars
mutate(
fdr = p.adjust(p_value, method = "fdr"),
sig_stars = case_when(
fdr < 0.001 ~ "***",
fdr < 0.01 ~ "**",
fdr < 0.05 ~ "*",
TRUE ~ "ns"
)
)
# Display as table
contrast_table <- contrast_by_tissue %>%
dplyr::select(experiment, tissue, n_CTRL, n_Inv4m, observed_ratio, t_stat, df, p_value, fdr, percent_of_expected, sig_stars) %>%
mutate(
observed_ratio = sprintf("%.2f:1", observed_ratio),
t_stat = sprintf("%.2f", t_stat),
df = sprintf("%.1f", df),
p_value = sprintf("%.3f", p_value),
fdr = sprintf("%.3f", fdr),
percent_of_expected = sprintf("%.1f%%", percent_of_expected)
)
knitr::kable(
contrast_table,
col.names = c("Experiment", "Tissue", "n CTRL", "n Inv4m", "Observed Ratio", "t", "df", "P-value", "FDR", "% of Expected", "Sig"),
caption = "Linear contrast t-test: H₀ = μ_CTRL - 5×μ_Inv4m = 0 (expression scales with copy number)"
)
| Experiment | Tissue | n CTRL | n Inv4m | Observed Ratio | t | df | P-value | FDR | % of Expected | Sig |
|---|---|---|---|---|---|---|---|---|---|---|
| Crow et al., 2020 (Growth Chamber) | V1 Leaf | 22 | 17 | 5.99:1 | 1.69 | 36.1 | 0.100 | 0.108 | 83.4% | ns |
| Crow et al., 2020 (Growth Chamber) | V1 Primary_root | 22 | 19 | 10.32:1 | 4.21 | 34.2 | 0.000 | 0.002 | 48.4% | ** |
| Crow et al., 2020 (Growth Chamber) | V3 Leaf | 22 | 15 | 6.92:1 | 1.94 | 32.3 | 0.061 | 0.080 | 72.3% | ns |
| Crow et al., 2020 (Growth Chamber) | V3 Primary_root | 24 | 18 | 7.55:1 | 2.50 | 25.2 | 0.019 | 0.031 | 66.2% | * |
| Crow et al., 2020 (Growth Chamber) | V3 S2_leaf_base | 21 | 17 | 6.21:1 | 1.89 | 28.0 | 0.069 | 0.081 | 80.5% | ns |
| Crow et al., 2020 (Growth Chamber) | V3 S2_leaf_tip | 20 | 16 | 6.97:1 | 2.60 | 24.0 | 0.016 | 0.031 | 71.7% | * |
| Crow et al., 2020 (Growth Chamber) | V3 Stem_SAM | 21 | 19 | 6.11:1 | 0.58 | 18.7 | 0.567 | 0.567 | 81.8% | ns |
| Crow et al., 2020 (Growth Chamber) | V3 leaf_base | 18 | 18 | 6.58:1 | 1.97 | 26.8 | 0.059 | 0.080 | 75.9% | ns |
| Crow et al., 2020 (Growth Chamber) | V3 leaf_sheath | 20 | 17 | 7.29:1 | 2.53 | 34.7 | 0.016 | 0.031 | 68.6% | * |
| PA2022 (Field) | Leaf 1 | 4 | 7 | 8.14:1 | 2.94 | 8.3 | 0.018 | 0.031 | 61.5% | * |
| PA2022 (Field) | Leaf 2 | 6 | 5 | 10.04:1 | 6.00 | 7.2 | 0.000 | 0.003 | 49.8% | ** |
| PA2022 (Field) | Leaf 3 | 5 | 6 | 9.43:1 | 4.34 | 6.3 | 0.004 | 0.014 | 53.0% | * |
| PA2022 (Field) | Leaf 4 | 5 | 5 | 10.15:1 | 4.36 | 6.7 | 0.004 | 0.014 | 49.3% | * |
# Define colors (matching ionome style: gold for CTRL, purple for Inv4m)
genotype_colors <- c("CTRL" = "gold", "Inv4m" = "#4a0f82")
# Prepare expected line data - add genotype = "Inv4m" so it dodges to Inv4m position
expected_lines <- expected_by_tissue %>%
mutate(genotype = factor("Inv4m", levels = c("CTRL", "Inv4m")))
# Prepare significance annotation data
# Position stars just above the dashed red line (expected value)
# Only include significant results (remove "ns")
sig_annotations <- contrast_by_tissue %>%
dplyr::select(experiment, tissue, sig_stars, expected_Inv4m) %>%
filter(sig_stars != "ns") %>% # Only show stars for significant results
mutate(
y_pos = expected_Inv4m * 1.08, # Position just above the expected line
genotype = factor("Inv4m", levels = c("CTRL", "Inv4m"))
)
# Create boxplot with tissue on x-axis (matching jmj2/jmj4 plot layout)
p_total_cluster <- ggplot(
total_cluster_samples,
aes(x = tissue, y = total_CPM, color = genotype)
) +
geom_boxplot(
width = 0.5,
linewidth = 1,
outlier.shape = NA,
position = position_dodge(width = 0.7)
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(
size = 1.5,
dodge.width = 0.7,
alpha = 0.7
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
# Add expected line (CTRL/5) for each tissue at the Inv4m position
geom_errorbar(
data = expected_lines,
aes(x = tissue, ymin = expected_Inv4m, ymax = expected_Inv4m, group = genotype),
inherit.aes = FALSE,
position = position_dodge(width = 0.7),
width = 0.4,
linewidth = 1,
linetype = "dashed",
color = "red"
) +
# Add significance stars just above the expected line (only for significant)
geom_text(
data = sig_annotations,
aes(x = tissue, y = y_pos, label = sig_stars, group = genotype),
inherit.aes = FALSE,
position = position_dodge(width = 0.7),
size = 10,
fontface = "bold",
color = "red"
) +
facet_grid(. ~ experiment, scales = "free_x", space = "free_x") +
scale_color_manual(values = genotype_colors, name = "Genotype") +
theme_classic(base_size = 14) +
theme(
axis.text.x = element_text(
angle = 30, hjust = 1,
color = "black", face = "bold", size = 14
),
axis.text.y = element_text(color = "black", face = "bold", size = 14),
axis.title = element_text(face = "bold", size = 16),
strip.text = element_text(face = "bold", size = 18),
strip.background = element_rect(fill = "grey95", color = NA),
legend.position = c(0.9, 0.9),
legend.justification = c(1, 1),
legend.background = element_rect(fill = "white", color = NA),
legend.text = element_text(face = "bold", size = 12),
legend.title = element_text(face = "bold", size = 12),
panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
) +
labs(
x = NULL,
y = "Total JMJ Cluster Expression\n(CPM)",
caption = "Dashed red line = expected Inv4m (CTRL/5). Significance: * FDR<0.05, ** FDR<0.01, *** FDR<0.001"
)
print(p_total_cluster)
# Save total cluster expression plot
saveRDS(p_total_cluster, file.path(paths$figures, "jmj_total_cluster_expression.rds"))
ggsave(
file.path(paths$figures, "jmj_total_cluster_expression.pdf"),
p_total_cluster,
width = 14, height = 7
)
ggsave(
file.path(paths$figures, "jmj_total_cluster_expression.png"),
p_total_cluster,
width = 14, height = 7, dpi = 300
)
# Save per-tissue linear contrast t-test results
write_csv(
contrast_by_tissue %>%
dplyr::select(experiment, tissue, n_CTRL, n_Inv4m, mean_CTRL, mean_Inv4m,
observed_ratio, expected_Inv4m, percent_of_expected,
contrast, se_contrast, t_stat, df, p_value, fdr, sig_stars),
file.path(paths$intermediate, "jmj_total_cluster_contrast_test.csv")
)
# Save sample-level data
write_csv(
total_cluster_samples,
file.path(paths$intermediate, "jmj_total_cluster_samples.csv")
)
{
cat("=== TOTAL CLUSTER ANALYSIS SAVED ===\n")
cat(" jmj_total_cluster_expression.rds/pdf/png\n")
cat(" jmj_total_cluster_contrast_test.csv\n")
cat(" jmj_total_cluster_samples.csv\n")
}
## === TOTAL CLUSTER ANALYSIS SAVED ===
## jmj_total_cluster_expression.rds/pdf/png
## jmj_total_cluster_contrast_test.csv
## jmj_total_cluster_samples.csv
# Define colors (matching ionome style: gold for B73, purple for Inv4m)
genotype_colors <- c("B73" = "gold", "Inv4m" = "#4a0f82")
p_jmj_expr <- ggplot(
jmj_combined,
aes(x = tissue, y = logCPM, color = genotype)
) +
geom_boxplot(
width = 0.5,
linewidth = 1,
outlier.shape = NA,
position = position_dodge(width = 0.7)
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(
size = 1.5,
dodge.width = 0.7,
alpha = 0.7
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
facet_grid(symbol ~ experiment, scales = "free_x", space = "free_x") +
scale_color_manual(values = genotype_colors, name = "Genotype") +
theme_classic(base_size = 14) +
theme(
axis.text.x = element_text(
angle = 30, hjust = 1,
color = "black", face = "bold", size = 14
),
axis.text.y = element_text(color = "black", face = "bold", size = 14),
axis.title = element_text(face = "bold", size = 16),
strip.text.x = element_text(face = "bold", size = 16),
strip.text.y = element_text(face = "bold.italic", size = 16),
strip.background = element_rect(fill = "grey95", color = NA),
legend.position = c(0.9, 0.9),
legend.justification = c(1, 1),
legend.background = element_rect(fill = "white", color = NA),
legend.text = element_text(face = "bold", size = 12),
legend.title = element_text(face = "bold", size = 12),
panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
) +
labs(
x = NULL,
y = expression(atop(bold(log[2]), bold("(CPM)")))
)
print(p_jmj_expr)
# Filter for jmj2 and jmj4 only
jmj_cluster <- jmj_combined %>%
filter(symbol %in% c("jmj2", "jmj4")) %>%
mutate(symbol = factor(symbol, levels = c("jmj2", "jmj4")))
p_jmj_cluster <- ggplot(
jmj_cluster,
aes(x = tissue, y = logCPM, color = genotype)
) +
geom_boxplot(
width = 0.5,
linewidth = 1,
outlier.shape = NA,
position = position_dodge(width = 0.7)
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(
size = 1.5,
dodge.width = 0.7,
alpha = 0.7
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
facet_grid(symbol ~ experiment, scales = "free_x", space = "free_x") +
scale_color_manual(values = genotype_colors, name = "Genotype") +
theme_classic(base_size = 14) +
theme(
axis.text.x = element_text(
angle = 30, hjust = 1,
color = "black", face = "bold", size = 14
),
axis.text.y = element_text(color = "black", face = "bold", size = 14),
axis.title = element_text(face = "bold", size = 16),
strip.text.x = element_text(face = "bold", size = 18),
strip.text.y = element_text(face = "bold.italic", size = 18),
strip.background = element_rect(fill = "grey95", color = NA),
legend.position = c(0.9, 0.9),
legend.justification = c(1, 1),
legend.background = element_rect(fill = "white", color = NA),
legend.text = element_text(face = "bold", size = 12),
legend.title = element_text(face = "bold", size = 12),
panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
) +
labs(
x = NULL,
y = expression(atop(bold(log[2]), bold("(CPM)")))
)
print(p_jmj_cluster)
# Calculate means and fold changes
jmj_summary <- jmj_combined %>%
group_by(experiment, tissue, symbol, genotype) %>%
summarise(
mean_logCPM = mean(logCPM, na.rm = TRUE),
sd_logCPM = sd(logCPM, na.rm = TRUE),
n = n(),
.groups = "drop"
) %>%
pivot_wider(
names_from = genotype,
values_from = c(mean_logCPM, sd_logCPM, n),
names_sep = "_"
) %>%
mutate(
log2FC = mean_logCPM_Inv4m - mean_logCPM_B73,
direction = ifelse(log2FC < 0, "Down", "Up")
)
{
cat("=== EXPRESSION SUMMARY ===\n")
print(
jmj_summary %>%
dplyr::select(experiment, tissue, symbol, log2FC, direction) %>%
arrange(symbol, experiment, tissue)
)
}
## === EXPRESSION SUMMARY ===
## # A tibble: 39 × 5
## experiment tissue symbol log2FC direction
## <fct> <chr> <fct> <dbl> <chr>
## 1 Crow et al., 2020 (Growth Chamber) V1 Leaf jmj2 -2.60 Down
## 2 Crow et al., 2020 (Growth Chamber) V1 Primary_root jmj2 -4.11 Down
## 3 Crow et al., 2020 (Growth Chamber) V3 Leaf jmj2 -2.84 Down
## 4 Crow et al., 2020 (Growth Chamber) V3 Primary_root jmj2 -3.92 Down
## 5 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_base jmj2 -2.63 Down
## 6 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_tip jmj2 -3.13 Down
## 7 Crow et al., 2020 (Growth Chamber) V3 Stem_SAM jmj2 -3.40 Down
## 8 Crow et al., 2020 (Growth Chamber) V3 leaf_base jmj2 -2.95 Down
## 9 Crow et al., 2020 (Growth Chamber) V3 leaf_sheath jmj2 -3.13 Down
## 10 PA2022 (Field) Leaf 1 jmj2 -3.16 Down
## # ℹ 29 more rows
# Save all 3 genes plot
saveRDS(p_jmj_expr, file.path(paths$figures, "jmj_expression_boxplot.rds"))
ggsave(
file.path(paths$figures, "jmj_expression_boxplot.pdf"),
p_jmj_expr,
width = 14, height = 10
)
ggsave(
file.path(paths$figures, "jmj_expression_boxplot.png"),
p_jmj_expr,
width = 14, height = 10, dpi = 300
)
# Save jmj2/jmj4 cluster plot
saveRDS(p_jmj_cluster, file.path(paths$figures, "jmj2_jmj4_expression_boxplot.rds"))
ggsave(
file.path(paths$figures, "jmj2_jmj4_expression_boxplot.pdf"),
p_jmj_cluster,
width = 14, height = 7
)
ggsave(
file.path(paths$figures, "jmj2_jmj4_expression_boxplot.png"),
p_jmj_cluster,
width = 14, height = 7, dpi = 300
)
# Save summary table
write_csv(
jmj_summary,
file.path(paths$intermediate, "jmj_expression_summary.csv")
)
{
cat("=== SAVED OUTPUTS ===\n")
cat("All 3 genes:\n")
cat(" jmj_expression_boxplot.rds/pdf/png\n")
cat("\njmj2/jmj4 cluster only:\n")
cat(" jmj2_jmj4_expression_boxplot.rds/pdf/png\n")
cat("\nSummary table:\n")
cat(" jmj_expression_summary.csv\n")
}
## === SAVED OUTPUTS ===
## All 3 genes:
## jmj_expression_boxplot.rds/pdf/png
##
## jmj2/jmj4 cluster only:
## jmj2_jmj4_expression_boxplot.rds/pdf/png
##
## Summary table:
## jmj_expression_summary.csv
This section plots expression of two cell proliferation-related genes across tissues:
cellprolif_genes <- tibble(
gene = c("Zm00001eb192050", "Zm00001eb192850"),
symbol = c("pcna2", "smo4"),
description = c(
"Proliferating cell nuclear antigen 2",
"Ribosome biogenesis protein NOP53 (SMO4 ortholog)"
)
)
cat("Cell proliferation genes:\n")
## Cell proliferation genes:
print(cellprolif_genes)
## # A tibble: 2 × 3
## gene symbol description
## <chr> <chr> <chr>
## 1 Zm00001eb192050 pcna2 Proliferating cell nuclear antigen 2
## 2 Zm00001eb192850 smo4 Ribosome biogenesis protein NOP53 (SMO4 ortholog)
# Extract from PA2022
cellprolif_in_psu <- cellprolif_genes$gene[cellprolif_genes$gene %in% rownames(expr_psu)]
cat("Cell proliferation genes found in PA2022:", length(cellprolif_in_psu), "\n")
## Cell proliferation genes found in PA2022: 2
if (length(cellprolif_in_psu) > 0) {
cellprolif_expr_psu <- expr_psu[cellprolif_in_psu, , drop = FALSE] %>%
as.data.frame() %>%
rownames_to_column("gene") %>%
pivot_longer(
cols = -gene,
names_to = "sample",
values_to = "logCPM"
) %>%
left_join(
meta_psu %>%
dplyr::select(library, Genotype, leaf_tissue, Treatment) %>%
mutate(sample = library),
by = "sample"
) %>%
filter(!is.na(Genotype)) %>%
left_join(cellprolif_genes, by = "gene") %>%
mutate(
experiment = "PA2022 (Field)",
tissue = paste0("Leaf ", leaf_tissue),
genotype = ifelse(Genotype == "CTRL", "B73", "Inv4m")
)
cat("PA2022 expression records:", nrow(cellprolif_expr_psu), "\n")
}
## PA2022 expression records: 86
# Extract from Crow2020
cellprolif_in_crow <- cellprolif_genes$gene[cellprolif_genes$gene %in% rownames(expr_crow)]
cat("Cell proliferation genes found in Crow2020:", length(cellprolif_in_crow), "\n")
## Cell proliferation genes found in Crow2020: 2
if (length(cellprolif_in_crow) > 0) {
cellprolif_expr_crow <- expr_crow[cellprolif_in_crow, , drop = FALSE] %>%
as.data.frame() %>%
rownames_to_column("gene") %>%
pivot_longer(
cols = -gene,
names_to = "sample",
values_to = "logCPM"
) %>%
left_join(
meta_crow %>%
dplyr::select(sample, genotype, tissue, temp),
by = "sample"
) %>%
filter(!is.na(genotype)) %>%
left_join(cellprolif_genes, by = "gene") %>%
mutate(
experiment = "Crow et al., 2020 (Growth Chamber)",
tissue = str_replace(tissue, "_tissue$", "") %>%
str_replace("_", " ") %>%
str_replace("Primary root", "Root")
)
cat("Crow2020 expression records:", nrow(cellprolif_expr_crow), "\n")
}
## Crow2020 expression records: 692
# Combine both experiments
cellprolif_combined <- bind_rows(
cellprolif_expr_psu %>%
dplyr::select(gene, symbol, logCPM, genotype, tissue, experiment),
cellprolif_expr_crow %>%
dplyr::select(gene, symbol, logCPM, genotype, tissue, experiment)
)
# Order factors (pcna2 first, then smo4)
cellprolif_combined <- cellprolif_combined %>%
mutate(
symbol = factor(symbol, levels = c("pcna2", "smo4")),
genotype = factor(genotype, levels = c("B73", "Inv4m")),
experiment = factor(experiment, levels = c("Crow et al., 2020 (Growth Chamber)", "PA2022 (Field)"))
)
{
cat("=== CELL PROLIFERATION GENES COMBINED DATA ===\n")
cat("Total records:", nrow(cellprolif_combined), "\n")
cat("\nBy gene:\n")
print(table(cellprolif_combined$symbol))
cat("\nBy experiment:\n")
print(table(cellprolif_combined$experiment))
}
## === CELL PROLIFERATION GENES COMBINED DATA ===
## Total records: 778
##
## By gene:
##
## pcna2 smo4
## 389 389
##
## By experiment:
##
## Crow et al., 2020 (Growth Chamber) PA2022 (Field)
## 692 86
p_cellprolif_expr <- ggplot(
cellprolif_combined,
aes(x = tissue, y = logCPM, color = genotype)
) +
geom_boxplot(
width = 0.5,
linewidth = 1,
outlier.shape = NA,
position = position_dodge(width = 0.7)
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(
size = 1.5,
dodge.width = 0.7,
alpha = 0.7
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
facet_grid(symbol ~ experiment, scales = "free_x", space = "free_x") +
scale_color_manual(values = genotype_colors, name = "Genotype") +
theme_classic(base_size = 14) +
theme(
axis.text.x = element_text(
angle = 30, hjust = 1,
color = "black", face = "bold", size = 14
),
axis.text.y = element_text(color = "black", face = "bold", size = 14),
axis.title = element_text(face = "bold", size = 16),
strip.text.x = element_text(face = "bold", size = 18),
strip.text.y = element_text(face = "bold.italic", size = 18),
strip.background = element_rect(fill = "grey95", color = NA),
legend.position = c(0.9, 0.9),
legend.justification = c(1, 1),
legend.background = element_rect(fill = "white", color = NA),
legend.text = element_text(face = "bold", size = 12),
legend.title = element_text(face = "bold", size = 12),
panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
) +
labs(
x = NULL,
y = expression(atop(bold(log[2]), bold("(CPM)")))
)
print(p_cellprolif_expr)
cellprolif_summary <- cellprolif_combined %>%
group_by(experiment, tissue, symbol, genotype) %>%
summarise(
mean_logCPM = mean(logCPM, na.rm = TRUE),
sd_logCPM = sd(logCPM, na.rm = TRUE),
n = n(),
.groups = "drop"
) %>%
pivot_wider(
names_from = genotype,
values_from = c(mean_logCPM, sd_logCPM, n),
names_sep = "_"
) %>%
mutate(
log2FC = mean_logCPM_Inv4m - mean_logCPM_B73,
direction = ifelse(log2FC < 0, "Down", "Up")
)
{
cat("=== CELL PROLIFERATION GENES EXPRESSION SUMMARY ===\n")
print(
cellprolif_summary %>%
dplyr::select(experiment, tissue, symbol, log2FC, direction) %>%
arrange(symbol, experiment, tissue)
)
}
## === CELL PROLIFERATION GENES EXPRESSION SUMMARY ===
## # A tibble: 26 × 5
## experiment tissue symbol log2FC direction
## <fct> <chr> <fct> <dbl> <chr>
## 1 Crow et al., 2020 (Growth Chamber) V1 Leaf pcna2 -0.370 Down
## 2 Crow et al., 2020 (Growth Chamber) V1 Primary_root pcna2 -0.00913 Down
## 3 Crow et al., 2020 (Growth Chamber) V3 Leaf pcna2 0.274 Up
## 4 Crow et al., 2020 (Growth Chamber) V3 Primary_root pcna2 0.304 Up
## 5 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_base pcna2 1.04 Up
## 6 Crow et al., 2020 (Growth Chamber) V3 S2_leaf_tip pcna2 0.346 Up
## 7 Crow et al., 2020 (Growth Chamber) V3 Stem_SAM pcna2 0.457 Up
## 8 Crow et al., 2020 (Growth Chamber) V3 leaf_base pcna2 0.719 Up
## 9 Crow et al., 2020 (Growth Chamber) V3 leaf_sheath pcna2 0.741 Up
## 10 PA2022 (Field) Leaf 1 pcna2 3.15 Up
## # ℹ 16 more rows
# Save cell proliferation plot (pcna2 + smo4)
saveRDS(p_cellprolif_expr, file.path(paths$figures, "cellprolif_pcna2_smo4_expression_boxplot.rds"))
ggsave(
file.path(paths$figures, "cellprolif_pcna2_smo4_expression_boxplot.pdf"),
p_cellprolif_expr,
width = 14, height = 7
)
ggsave(
file.path(paths$figures, "cellprolif_pcna2_smo4_expression_boxplot.png"),
p_cellprolif_expr,
width = 14, height = 7, dpi = 300
)
# Save summary table
write_csv(
cellprolif_summary,
file.path(paths$intermediate, "cellprolif_pcna2_smo4_expression_summary.csv")
)
{
cat("=== CELL PROLIFERATION SAVED OUTPUTS ===\n")
cat(" cellprolif_pcna2_smo4_expression_boxplot.rds/pdf/png\n")
cat(" cellprolif_pcna2_smo4_expression_summary.csv\n")
}
## === CELL PROLIFERATION SAVED OUTPUTS ===
## cellprolif_pcna2_smo4_expression_boxplot.rds/pdf/png
## cellprolif_pcna2_smo4_expression_summary.csv
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggbeeswarm_0.7.3 ggfx_1.0.3 ggpubr_0.6.2 lubridate_1.9.4
## [5] forcats_1.0.1 stringr_1.6.0 dplyr_1.1.4 purrr_1.2.0
## [9] readr_2.1.6 tidyr_1.3.2 tibble_3.3.0 ggplot2_4.0.1
## [13] tidyverse_2.0.0 here_1.0.2
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 beeswarm_0.4.0 xfun_0.55 bslib_0.9.0
## [5] rstatix_0.7.3 tzdb_0.5.0 vctrs_0.6.5 tools_4.5.2
## [9] generics_0.1.4 parallel_4.5.2 pkgconfig_2.0.3 RColorBrewer_1.1-3
## [13] S7_0.2.1 lifecycle_1.0.4 compiler_4.5.2 farver_2.1.2
## [17] textshaping_1.0.4 carData_3.0-5 vipor_0.4.7 htmltools_0.5.9
## [21] sass_0.4.10 yaml_2.3.12 Formula_1.2-5 crayon_1.5.3
## [25] pillar_1.11.1 car_3.1-3 jquerylib_0.1.4 cachem_1.1.0
## [29] magick_2.9.0 abind_1.4-8 tidyselect_1.2.1 digest_0.6.39
## [33] stringi_1.8.7 labeling_0.4.3 rprojroot_2.1.1 fastmap_1.2.0
## [37] grid_4.5.2 cli_3.6.5 magrittr_2.0.4 dichromat_2.0-0.1
## [41] utf8_1.2.6 broom_1.0.11 withr_3.0.2 scales_1.4.0
## [45] backports_1.5.0 bit64_4.6.0-1 timechange_0.3.0 rmarkdown_2.30
## [49] bit_4.6.0 otel_0.2.0 ggsignif_0.6.4 ragg_1.5.0
## [53] hms_1.1.4 evaluate_1.0.5 knitr_1.51 rlang_1.1.6
## [57] Rcpp_1.1.0 glue_1.8.0 vroom_1.6.7 jsonlite_2.0.0
## [61] R6_2.6.1 systemfonts_1.3.1