This script analyzes JMJ cluster expression from transcript-level kallisto quantification using the JMJ-corrected cDNA reference.
Two analyses are performed:
Total Cluster Expression (main analysis): Sum of CPM across all 5 paralogs per sample. This is the biologically meaningful comparison because:
Comparing total cluster output shows whether the single Inv4m copy produces as much transcript as the 5 B73 copies combined.
Per-Paralog Expression: Individual expression for each paralog. Note that because the paralogs are >97% identical, kallisto distributes Inv4m reads across all 5 loci artificially. This view is useful for understanding read distribution but not for biological interpretation of Inv4m expression.
JMJ Cluster Paralogs (genomic order):
| Transcript | Symbol | Description |
|---|---|---|
| Zm00001eb191790_T001 | jmj9 | First tandem copy |
| Zm00001eb191790_T017 | psi | Pseudogene (from V4 annotation) |
| Zm00001eb191790_T006 | jmj6 | Third tandem copy |
| Zm00001eb191790_T013 | jmj2 | Fourth tandem copy |
| Zm00001eb191820_T001 | jmj4 | Separate gene model |
Background: The V5 annotation incorrectly merges jmj9, psi, jmj6, and jmj2 into a single gene model (Zm00001eb191790) with 16 chimeric transcripts. We created a corrected cDNA reference that treats each paralog as a separate entry.
library(tidyverse)
library(ggplot2)
library(ggfx)
library(ggbeeswarm)
# JMJ paralog transcripts (in genomic order)
jmj_paralogs <- tibble(
transcript = c(
"Zm00001eb191790_T001",
"Zm00001eb191790_T017",
"Zm00001eb191790_T006",
"Zm00001eb191790_T013",
"Zm00001eb191820_T001"
),
symbol = c("jmj9", "psi", "jmj6", "jmj2", "jmj4"),
position = c(1, 2, 3, 4, 5)
)
cat("JMJ cluster paralogs:\n")
## JMJ cluster paralogs:
print(jmj_paralogs)
## # A tibble: 5 × 3
## transcript symbol position
## <chr> <chr> <dbl>
## 1 Zm00001eb191790_T001 jmj9 1
## 2 Zm00001eb191790_T017 psi 2
## 3 Zm00001eb191790_T006 jmj6 3
## 4 Zm00001eb191790_T013 jmj2 4
## 5 Zm00001eb191820_T001 jmj4 5
The counts matrix is pre-extracted from kallisto outputs using the
standalone script extract_kallisto_counts_matrix.R. This
analysis reads from the cached intermediate file only.
# Counts matrix path (extracted from kallisto by separate script)
counts_matrix_file <- file.path(
paths$intermediate,
"PSU2022_kallisto_jmj_corrected_counts.csv"
)
# Check if counts matrix exists
if (!file.exists(counts_matrix_file)) {
stop(
"Counts matrix not found: ", counts_matrix_file, "\n\n",
"Run the extraction script first:\n",
" Rscript scripts/inversion_paper/extract_kallisto_counts_matrix.R\n",
"(Requires server mounted)"
)
}
cat("Loading counts matrix...\n")
## Loading counts matrix...
full_counts_matrix <- read_csv(counts_matrix_file, show_col_types = FALSE)
cat("Loaded:", nrow(full_counts_matrix), "transcripts x",
ncol(full_counts_matrix) - 1, "samples\n")
## Loaded: 72527 transcripts x 60 samples
# Calculate library sizes from full counts matrix
lib_sizes_df <- full_counts_matrix %>%
pivot_longer(cols = -target_id, names_to = "sample", values_to = "counts") %>%
group_by(sample) %>%
summarise(lib_size = sum(counts, na.rm = TRUE), .groups = "drop")
cat("Library sizes calculated for", nrow(lib_sizes_df), "samples\n")
## Library sizes calculated for 60 samples
# Extract JMJ counts (long format)
kallisto_data <- full_counts_matrix %>%
filter(target_id %in% jmj_paralogs$transcript) %>%
pivot_longer(
cols = -target_id,
names_to = "sample",
values_to = "est_counts"
) %>%
left_join(lib_sizes_df, by = "sample") %>%
mutate(
# Calculate CPM (counts per million)
cpm = (est_counts / lib_size) * 1e6
)
cat("Extracted JMJ paralogs:", nrow(kallisto_data), "records\n")
## Extracted JMJ paralogs: 300 records
cat("(", n_distinct(kallisto_data$target_id), "transcripts x",
n_distinct(kallisto_data$sample), "samples )\n")
## ( 5 transcripts x 60 samples )
Samples with library size < 20 million reads are considered low quality (same threshold as DEG analysis).
# Get library sizes per sample
lib_sizes <- kallisto_data %>%
distinct(sample, lib_size)
# Mark low quality samples
lib_sizes <- lib_sizes %>%
mutate(low_quality = lib_size < 2e7)
{
cat("=== LIBRARY SIZE QC ===\n")
cat("Low quality libraries (< 20M reads):\n")
print(table(lib_sizes$low_quality))
if (any(lib_sizes$low_quality)) {
cat("\nExcluded samples:\n")
print(lib_sizes %>% filter(low_quality) %>% select(sample, lib_size))
}
}
## === LIBRARY SIZE QC ===
## Low quality libraries (< 20M reads):
##
## FALSE TRUE
## 43 17
##
## Excluded samples:
## # A tibble: 17 × 2
## sample lib_size
## <chr> <dbl>
## 1 L01_S1_L002 328934.
## 2 L02_S2_L002 799530.
## 3 L03_S3_L002 2400639.
## 4 L04_S4_L002 2478621.
## 5 L10_S10_L002 10483027.
## 6 L12_S12_L002 11035011.
## 7 L18_S18_L002 6076193.
## 8 L19_S19_L002 7919901.
## 9 L20_S20_L002 10783328.
## 10 L21_S21_L002 6987465.
## 11 L23_S23_L002 8952170.
## 12 L24_S24_L002 10233511.
## 13 L25_S25_L002 7673052.
## 14 R37_S37_L002 10527638.
## 15 R38_S38_L002 8988918.
## 16 R39_S39_L002 8495462.
## 17 R40_S40_L002 8408381.
# Filter out low quality samples
good_samples <- lib_sizes %>%
filter(!low_quality) %>%
pull(sample)
jmj_filtered <- kallisto_data %>%
filter(sample %in% good_samples)
cat("\nRetained", n_distinct(jmj_filtered$sample), "samples\n")
##
## Retained 43 samples
# Load PSU2022 metadata
meta <- read.csv(file.path(paths$data, "PSU2022_metadata.csv"))
# Join with paralog info and metadata
jmj_expr <- jmj_filtered %>%
left_join(jmj_paralogs, by = c("target_id" = "transcript")) %>%
left_join(
meta %>% select(library, Genotype, leaf_tissue),
by = c("sample" = "library")
) %>%
filter(!is.na(Genotype)) %>%
mutate(
genotype = ifelse(Genotype == "CTRL", "B73", "Inv4m"),
tissue = paste0("Leaf ", leaf_tissue),
# Order symbols by genomic position
symbol = factor(symbol, levels = c("jmj9", "psi", "jmj6", "jmj2", "jmj4")),
genotype = factor(genotype, levels = c("B73", "Inv4m"))
)
{
cat("=== DATA SUMMARY ===\n")
cat("Samples:", n_distinct(jmj_expr$sample), "\n")
cat("Paralogs:", n_distinct(jmj_expr$symbol), "\n")
cat("\nBy genotype:\n")
print(table(jmj_expr$genotype) / 5)
cat("\nBy tissue:\n")
print(table(jmj_expr$tissue) / 5)
}
## === DATA SUMMARY ===
## Samples: 43
## Paralogs: 5
##
## By genotype:
##
## B73 Inv4m
## 20 23
##
## By tissue:
##
## Leaf 1 Leaf 2 Leaf 3 Leaf 4
## 11 11 11 10
This shows expression distribution per paralog. Note: Because JMJ paralogs are >97% identical, kallisto artificially distributes Inv4m reads across all 5 loci. See “Total Cluster Expression” section below for the main analysis.
genotype_colors <- c("B73" = "gold", "Inv4m" = "#4a0f82")
p_jmj_paralogs <- ggplot(
jmj_expr,
aes(x = tissue, y = log2(cpm + 1), color = genotype)
) +
geom_boxplot(
width = 0.5,
linewidth = 1,
outlier.shape = NA,
position = position_dodge(width = 0.7)
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(
size = 1.5,
dodge.width = 0.7,
alpha = 0.7
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
facet_wrap(~ symbol, nrow = 1) +
scale_color_manual(values = genotype_colors, name = "Genotype") +
theme_classic(base_size = 14) +
theme(
axis.text.x = element_text(
angle = 45, hjust = 1,
color = "black", face = "bold", size = 11
),
axis.text.y = element_text(color = "black", face = "bold"),
axis.title = element_text(face = "bold", size = 13),
strip.text = element_text(face = "bold.italic", size = 14),
strip.background = element_rect(fill = "grey95", color = NA),
legend.position = "bottom",
legend.text = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3)
) +
labs(
x = NULL,
y = expression(bold(log[2]*"(CPM + 1)"))
)
print(p_jmj_paralogs)
Since B73/CTRL has 5 tandem JMJ paralogs (>97% identical) while Inv4m has only 1 gene copy, the biologically meaningful comparison is total cluster expression: the sum of CPM across all 5 paralogs per sample.
# Sum CPM across all 5 paralogs for each sample
jmj_total <- jmj_expr %>%
group_by(sample, genotype, tissue) %>%
summarise(
total_cpm = sum(cpm, na.rm = TRUE),
total_counts = sum(est_counts, na.rm = TRUE),
n_paralogs = n(),
.groups = "drop"
)
cat("=== TOTAL CLUSTER EXPRESSION ===\n")
## === TOTAL CLUSTER EXPRESSION ===
cat("Samples:", n_distinct(jmj_total$sample), "\n")
## Samples: 43
cat("Mean total CPM by genotype:\n")
## Mean total CPM by genotype:
jmj_total %>%
group_by(genotype) %>%
summarise(
mean_total_cpm = mean(total_cpm),
sd_total_cpm = sd(total_cpm),
.groups = "drop"
) %>%
print()
## # A tibble: 2 × 3
## genotype mean_total_cpm sd_total_cpm
## <fct> <dbl> <dbl>
## 1 B73 94.8 19.2
## 2 Inv4m 10.5 3.73
This plot shows the sum of expression across all 5 JMJ paralogs for each sample, comparing B73 (5 gene copies) vs Inv4m (1 gene copy).
p_jmj_total <- ggplot(
jmj_total,
aes(x = tissue, y = log2(total_cpm + 1), color = genotype)
) +
geom_boxplot(
width = 0.5,
linewidth = 1,
outlier.shape = NA,
position = position_dodge(width = 0.7)
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(
size = 2,
dodge.width = 0.7,
alpha = 0.7
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
scale_color_manual(values = genotype_colors, name = "Genotype") +
theme_classic(base_size = 16) +
theme(
axis.text.x = element_text(
angle = 45, hjust = 1,
color = "black", face = "bold", size = 13
),
axis.text.y = element_text(color = "black", face = "bold", size = 12),
axis.title = element_text(face = "bold", size = 14),
legend.position = "bottom",
legend.text = element_text(face = "bold", size = 12),
legend.title = element_text(face = "bold", size = 12),
panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3),
plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5, color = "grey40")
) +
labs(
title = "Total JMJ Cluster Expression",
subtitle = "Sum of 5 paralogs: B73 (5 copies) vs Inv4m (1 copy)",
x = NULL,
y = expression(bold(log[2]*"(Total CPM + 1)"))
)
print(p_jmj_total)
Same data as above but on linear scale to emphasize the magnitude of difference. The dashed line shows the expected Inv4m expression if it scaled linearly with copy number (B73 mean / 5).
# Add expected values to data for plotting
expected_df <- expected_df %>%
mutate(tissue = factor(tissue, levels = levels(factor(jmj_total$tissue))))
p_jmj_total_linear <- ggplot(
jmj_total,
aes(x = tissue, y = total_cpm, color = genotype)
) +
geom_boxplot(
width = 0.5,
linewidth = 1,
outlier.shape = NA,
position = position_dodge(width = 0.7)
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(
size = 2,
dodge.width = 0.7,
alpha = 0.7
) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
# Add expected line (B73/5) - dashed red crossbars
geom_errorbar(
data = expected_df,
aes(x = tissue, ymin = expected_inv4m, ymax = expected_inv4m),
color = "red", linetype = "dashed", linewidth = 1,
width = 0.6, inherit.aes = FALSE
) +
scale_color_manual(values = genotype_colors, name = "Genotype") +
theme_classic(base_size = 16) +
theme(
axis.text.x = element_text(
angle = 45, hjust = 1,
color = "black", face = "bold", size = 13
),
axis.text.y = element_text(color = "black", face = "bold", size = 12),
axis.title = element_text(face = "bold", size = 14),
legend.position = "bottom",
legend.text = element_text(face = "bold", size = 12),
legend.title = element_text(face = "bold", size = 12),
panel.grid.major.y = element_line(color = "grey90", linewidth = 0.3),
plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5, color = "grey40")
) +
labs(
title = "Total JMJ Cluster Expression (Linear Scale)",
subtitle = "Dashed red line = expected if Inv4m = B73/5 (copy number scaling)",
x = NULL,
y = "Total CPM"
)
print(p_jmj_total_linear)
Null hypothesis: Inv4m mean expression = B73 mean / 5 (linear copy number scaling)
# Calculate summary stats by tissue and genotype
tissue_stats <- jmj_total %>%
group_by(tissue, genotype) %>%
summarise(
mean_cpm = mean(total_cpm),
sd_cpm = sd(total_cpm),
n = n(),
values = list(total_cpm),
.groups = "drop"
)
# Get B73 stats and calculate expected
b73_stats <- tissue_stats %>%
filter(genotype == "B73") %>%
select(tissue, B73_mean = mean_cpm, B73_sd = sd_cpm, B73_n = n) %>%
mutate(expected = B73_mean / 5)
# Get Inv4m stats
inv4m_stats <- tissue_stats %>%
filter(genotype == "Inv4m") %>%
select(tissue, Inv4m_mean = mean_cpm, Inv4m_sd = sd_cpm, Inv4m_n = n,
inv4m_values = values)
# Merge and run t-tests
ttest_results <- b73_stats %>%
left_join(inv4m_stats, by = "tissue") %>%
rowwise() %>%
mutate(
ttest = list(t.test(inv4m_values, mu = expected)),
t_stat = ttest$statistic,
p_value = ttest$p.value,
ci_lower = ttest$conf.int[1],
ci_upper = ttest$conf.int[2],
pct_of_expected = (Inv4m_mean / expected) * 100
) %>%
ungroup() %>%
select(-inv4m_values, -ttest)
# Display results table
cat("=== ONE-SAMPLE T-TEST: Inv4m vs Expected (B73/5) ===\n\n")
## === ONE-SAMPLE T-TEST: Inv4m vs Expected (B73/5) ===
ttest_table <- ttest_results %>%
select(tissue, B73_mean, expected, Inv4m_mean, pct_of_expected, t_stat, p_value) %>%
mutate(
B73_mean = round(B73_mean, 1),
expected = round(expected, 1),
Inv4m_mean = round(Inv4m_mean, 1),
pct_of_expected = paste0(round(pct_of_expected, 1), "%"),
t_stat = round(t_stat, 2),
p_value = format(p_value, digits = 2, scientific = TRUE)
) %>%
rename(
Tissue = tissue,
`B73 Mean` = B73_mean,
`Expected (B73/5)` = expected,
`Inv4m Mean` = Inv4m_mean,
`% of Expected` = pct_of_expected,
`t` = t_stat,
`p-value` = p_value
)
knitr::kable(ttest_table, align = "lrrrcrr")
| Tissue | B73 Mean | Expected (B73/5) | Inv4m Mean | % of Expected | t | p-value |
|---|---|---|---|---|---|---|
| Leaf 1 | 105.7 | 21.1 | 13.1 | 62% | -4.02 | 7.0e-03 |
| Leaf 2 | 100.2 | 20.0 | 10.0 | 50.1% | -12.94 | 2.1e-04 |
| Leaf 3 | 91.5 | 18.3 | 9.8 | 53.6% | -8.65 | 3.4e-04 |
| Leaf 4 | 83.0 | 16.6 | 8.3 | 49.8% | -8.07 | 1.3e-03 |
# Overall test (pooling across tissues)
overall_B73_mean <- mean(jmj_total$total_cpm[jmj_total$genotype == "B73"])
overall_expected <- overall_B73_mean / 5
overall_Inv4m_values <- jmj_total$total_cpm[jmj_total$genotype == "Inv4m"]
overall_ttest <- t.test(overall_Inv4m_values, mu = overall_expected)
cat("\n=== OVERALL TEST (pooled across tissues) ===\n\n")
##
## === OVERALL TEST (pooled across tissues) ===
cat("B73 mean: ", round(overall_B73_mean, 2), "CPM\n")
## B73 mean: 94.82 CPM
cat("Expected (B73/5): ", round(overall_expected, 2), "CPM\n")
## Expected (B73/5): 18.96 CPM
cat("Inv4m mean: ", round(mean(overall_Inv4m_values), 2), "CPM\n")
## Inv4m mean: 10.53 CPM
cat("% of expected: ", round(mean(overall_Inv4m_values) / overall_expected * 100, 1), "%\n\n")
## % of expected: 55.6 %
cat("One-sample t-test:\n")
## One-sample t-test:
cat(" t =", round(overall_ttest$statistic, 2), "\n")
## t = -10.84
cat(" df =", round(overall_ttest$parameter, 1), "\n")
## df = 22
cat(" p =", format(overall_ttest$p.value, digits = 3, scientific = TRUE), "\n")
## p = 2.73e-10
cat(" 95% CI: [", round(overall_ttest$conf.int[1], 2), ",",
round(overall_ttest$conf.int[2], 2), "]\n")
## 95% CI: [ 8.92 , 12.15 ]
jmj_total_summary <- jmj_total %>%
group_by(tissue, genotype) %>%
summarise(
mean_total_cpm = mean(total_cpm, na.rm = TRUE),
sd_total_cpm = sd(total_cpm, na.rm = TRUE),
mean_log2 = mean(log2(total_cpm + 1), na.rm = TRUE),
n = n(),
.groups = "drop"
) %>%
pivot_wider(
names_from = genotype,
values_from = c(mean_total_cpm, sd_total_cpm, mean_log2, n),
names_sep = "_"
) %>%
mutate(
log2FC = mean_log2_Inv4m - mean_log2_B73,
fold_change = 2^log2FC,
direction = ifelse(log2FC < 0, "Down", "Up")
)
cat("=== TOTAL CLUSTER EXPRESSION BY TISSUE ===\n")
## === TOTAL CLUSTER EXPRESSION BY TISSUE ===
jmj_total_summary %>%
select(tissue, mean_total_cpm_B73, mean_total_cpm_Inv4m, log2FC, fold_change, direction) %>%
print()
## # A tibble: 4 × 6
## tissue mean_total_cpm_B73 mean_total_cpm_Inv4m log2FC fold_change direction
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Leaf 1 106. 13.1 -2.97 0.128 Down
## 2 Leaf 2 100. 10.0 -3.19 0.110 Down
## 3 Leaf 3 91.5 9.81 -3.10 0.117 Down
## 4 Leaf 4 83.0 8.28 -3.19 0.109 Down
cat("\n=== OVERALL MEAN ===\n")
##
## === OVERALL MEAN ===
cat("Mean log2FC across tissues:", round(mean(jmj_total_summary$log2FC), 3), "\n")
## Mean log2FC across tissues: -3.112
cat("Mean fold change:", round(mean(jmj_total_summary$fold_change), 3), "\n")
## Mean fold change: 0.116
# Test for genotype effect on total cluster expression
cat("=== STATISTICAL ANALYSIS ===\n\n")
## === STATISTICAL ANALYSIS ===
# Overall t-test (ignoring tissue)
t_overall <- t.test(
log2(total_cpm + 1) ~ genotype,
data = jmj_total
)
cat("Overall t-test (B73 vs Inv4m):\n")
## Overall t-test (B73 vs Inv4m):
cat(" t =", round(t_overall$statistic, 3), "\n")
## t = 28.285
cat(" p =", format(t_overall$p.value, digits = 3, scientific = TRUE), "\n")
## p = 3.97e-28
cat(" Mean B73:", round(t_overall$estimate[1], 3), "\n")
## Mean B73: 6.553
cat(" Mean Inv4m:", round(t_overall$estimate[2], 3), "\n")
## Mean Inv4m: 3.469
# ANOVA with tissue interaction
model <- aov(log2(total_cpm + 1) ~ genotype * tissue, data = jmj_total)
cat("\n\nTwo-way ANOVA (genotype × tissue):\n")
##
##
## Two-way ANOVA (genotype × tissue):
print(summary(model))
## Df Sum Sq Mean Sq F value Pr(>F)
## genotype 1 101.70 101.70 877.692 <2e-16 ***
## tissue 3 1.29 0.43 3.702 0.0205 *
## genotype:tissue 3 0.09 0.03 0.247 0.8630
## Residuals 35 4.06 0.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
jmj_summary <- jmj_expr %>%
group_by(tissue, symbol, genotype) %>%
summarise(
mean_cpm = mean(cpm, na.rm = TRUE),
sd_cpm = sd(cpm, na.rm = TRUE),
mean_log2 = mean(log2(cpm + 1), na.rm = TRUE),
n = n(),
.groups = "drop"
) %>%
pivot_wider(
names_from = genotype,
values_from = c(mean_cpm, sd_cpm, mean_log2, n),
names_sep = "_"
) %>%
mutate(
log2FC = mean_log2_Inv4m - mean_log2_B73,
direction = ifelse(log2FC < 0, "Down", "Up")
)
cat("=== EXPRESSION SUMMARY ===\n")
## === EXPRESSION SUMMARY ===
jmj_summary %>%
select(tissue, symbol, mean_cpm_B73, mean_cpm_Inv4m, log2FC, direction) %>%
arrange(symbol, tissue) %>%
print(n = 25)
## # A tibble: 20 × 6
## tissue symbol mean_cpm_B73 mean_cpm_Inv4m log2FC direction
## <chr> <fct> <dbl> <dbl> <dbl> <chr>
## 1 Leaf 1 jmj9 38.1 5.83 -2.54 Down
## 2 Leaf 2 jmj9 35.8 3.97 -2.88 Down
## 3 Leaf 3 jmj9 31.8 4.02 -2.75 Down
## 4 Leaf 4 jmj9 29.9 3.31 -2.86 Down
## 5 Leaf 1 psi 6.36 0.751 -2.11 Down
## 6 Leaf 2 psi 6.22 0.647 -2.05 Down
## 7 Leaf 3 psi 6.37 0.821 -2.02 Down
## 8 Leaf 4 psi 5.74 0.628 -1.92 Down
## 9 Leaf 1 jmj6 16.7 2.22 -2.48 Down
## 10 Leaf 2 jmj6 16.1 1.34 -2.96 Down
## 11 Leaf 3 jmj6 14.9 1.47 -2.69 Down
## 12 Leaf 4 jmj6 14.0 1.09 -2.87 Down
## 13 Leaf 1 jmj2 20.3 0.923 -3.77 Down
## 14 Leaf 2 jmj2 18.7 0.794 -3.54 Down
## 15 Leaf 3 jmj2 17.2 0.370 -3.83 Down
## 16 Leaf 4 jmj2 15.4 0.338 -3.65 Down
## 17 Leaf 1 jmj4 24.2 3.38 -2.54 Down
## 18 Leaf 2 jmj4 23.3 3.29 -2.49 Down
## 19 Leaf 3 jmj4 21.1 3.13 -2.39 Down
## 20 Leaf 4 jmj4 18.0 2.91 -2.31 Down
jmj_summary %>%
group_by(symbol) %>%
summarise(
mean_log2FC = mean(log2FC),
.groups = "drop"
) %>%
arrange(symbol)
## # A tibble: 5 × 2
## symbol mean_log2FC
## <fct> <dbl>
## 1 jmj9 -2.76
## 2 psi -2.03
## 3 jmj6 -2.75
## 4 jmj2 -3.70
## 5 jmj4 -2.43
# Save per-paralog plot
saveRDS(p_jmj_paralogs, file.path(paths$figures, "jmj_paralogs_expression_boxplot.rds"))
ggsave(
file.path(paths$figures, "jmj_paralogs_expression_boxplot.pdf"),
p_jmj_paralogs,
width = 14, height = 5
)
ggsave(
file.path(paths$figures, "jmj_paralogs_expression_boxplot.png"),
p_jmj_paralogs,
width = 14, height = 5, dpi = 300
)
# Save total cluster expression plot (log scale)
saveRDS(p_jmj_total, file.path(paths$figures, "jmj_total_cluster_expression.rds"))
ggsave(
file.path(paths$figures, "jmj_total_cluster_expression.pdf"),
p_jmj_total,
width = 8, height = 6
)
ggsave(
file.path(paths$figures, "jmj_total_cluster_expression.png"),
p_jmj_total,
width = 8, height = 6, dpi = 300
)
# Save total cluster expression plot (linear scale)
saveRDS(p_jmj_total_linear, file.path(paths$figures, "jmj_total_cluster_expression_linear.rds"))
ggsave(
file.path(paths$figures, "jmj_total_cluster_expression_linear.pdf"),
p_jmj_total_linear,
width = 8, height = 6
)
ggsave(
file.path(paths$figures, "jmj_total_cluster_expression_linear.png"),
p_jmj_total_linear,
width = 8, height = 6, dpi = 300
)
# Save per-paralog data
write_csv(jmj_expr, file.path(paths$intermediate, "jmj_paralogs_expression_data.csv"))
write_csv(jmj_summary, file.path(paths$intermediate, "jmj_paralogs_expression_summary.csv"))
# Save total cluster data
write_csv(jmj_total, file.path(paths$intermediate, "jmj_total_cluster_expression_data.csv"))
write_csv(jmj_total_summary, file.path(paths$intermediate, "jmj_total_cluster_expression_summary.csv"))
{
cat("=== SAVED OUTPUTS ===\n")
cat("\nPer-Paralog Analysis:\n")
cat(" Figures: jmj_paralogs_expression_boxplot.pdf/png/rds\n")
cat(" Data: jmj_paralogs_expression_data.csv\n")
cat(" Data: jmj_paralogs_expression_summary.csv\n")
cat("\nTotal Cluster Analysis:\n")
cat(" Figures: jmj_total_cluster_expression.pdf/png/rds (log scale)\n")
cat(" Figures: jmj_total_cluster_expression_linear.pdf/png/rds (linear scale)\n")
cat(" Data: jmj_total_cluster_expression_data.csv\n")
cat(" Data: jmj_total_cluster_expression_summary.csv\n")
}
## === SAVED OUTPUTS ===
##
## Per-Paralog Analysis:
## Figures: jmj_paralogs_expression_boxplot.pdf/png/rds
## Data: jmj_paralogs_expression_data.csv
## Data: jmj_paralogs_expression_summary.csv
##
## Total Cluster Analysis:
## Figures: jmj_total_cluster_expression.pdf/png/rds (log scale)
## Figures: jmj_total_cluster_expression_linear.pdf/png/rds (linear scale)
## Data: jmj_total_cluster_expression_data.csv
## Data: jmj_total_cluster_expression_summary.csv
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggbeeswarm_0.7.3 ggfx_1.0.3 lubridate_1.9.4 forcats_1.0.1
## [5] stringr_1.6.0 dplyr_1.1.4 purrr_1.2.0 readr_2.1.6
## [9] tidyr_1.3.2 tibble_3.3.0 ggplot2_4.0.1 tidyverse_2.0.0
## [13] here_1.0.2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.6 sass_0.4.10 generics_0.1.4 stringi_1.8.7
## [5] hms_1.1.4 digest_0.6.39 magrittr_2.0.4 evaluate_1.0.5
## [9] grid_4.5.2 timechange_0.3.0 RColorBrewer_1.1-3 fastmap_1.2.0
## [13] rprojroot_2.1.1 jsonlite_2.0.0 scales_1.4.0 textshaping_1.0.4
## [17] jquerylib_0.1.4 cli_3.6.5 crayon_1.5.3 rlang_1.1.6
## [21] bit64_4.6.0-1 withr_3.0.2 cachem_1.1.0 yaml_2.3.12
## [25] otel_0.2.0 parallel_4.5.2 tools_4.5.2 tzdb_0.5.0
## [29] vctrs_0.6.5 R6_2.6.1 lifecycle_1.0.4 magick_2.9.0
## [33] bit_4.6.0 vroom_1.6.7 vipor_0.4.7 ragg_1.5.0
## [37] beeswarm_0.4.0 pkgconfig_2.0.3 pillar_1.11.1 bslib_0.9.0
## [41] gtable_0.3.6 glue_1.8.0 Rcpp_1.1.0 systemfonts_1.3.1
## [45] xfun_0.55 tidyselect_1.2.1 knitr_1.51 dichromat_2.0-0.1
## [49] farver_2.1.2 htmltools_0.5.9 labeling_0.4.3 rmarkdown_2.30
## [53] compiler_4.5.2 S7_0.2.1