PA2022 Spatial Analysis: SpATS Spatial Correction with GDD Integration

Overview

This notebook fits a SpATS PSANOVA surface per phenotype and returns spatially-corrected plot-level values for each phenotype, including GDD-based traits (GDDA and GDDS). The approach uses statgenHTP::fitModels() (which wraps SpATS when provided a TimePoints object) and getCorrected() to extract corrected plot-level observations. The workflow:

Load GDD lookup table for PA2022.
Load and clean data (as in prior notebook).
Create plot_data (plot-level means) and merge GDD values.
Build a TimePoints object for statgenHTP.
Loop over usable phenotypes (including GDDA, GDDS) and apply fitModels() → getCorrected().
Merge corrected values into a wide table sp_corrected.
Save corrected table for downstream analyses.

Warning: fitting many SpATS models can be computationally intensive. Run interactively on a machine with sufficient RAM/CPU.

1. Load libraries

library(tidyverse)
library(dplyr)
library(lubridate)
library(ggplot2)
library(ggtext)
library(ggfx)
library(nlme)
library(gstat)
library(rstatix)
library(ggpubr)
library(VIM)
library(knitr)
library(SpATS)
library(statgenHTP)
library(ggbeeswarm)
library(cowplot)
library(emmeans)

1b. Configuration and load GDD lookup table

environment_name <- "PSU2022"
planting_date <- mdy("5/26/2022")
gdd_lookup_file <- file.path(paths$intermediate, "gdd_lookup_table.csv")

gdd_at_phenology <- read_csv(gdd_lookup_file) %>%
  filter(environment == environment_name)

cat("Loaded GDD lookup for", environment_name, "with", 
    nrow(gdd_at_phenology), "dates\n")

## Loaded GDD lookup for PSU2022 with 101 dates

cat("GDD range:", min(gdd_at_phenology$cumulative_gdd_from_planting), "to", 
    max(gdd_at_phenology$cumulative_gdd_from_planting), "°C\n")

## GDD range: 6.28 to 1203.705 °C

2. Load and prepare data

DATA_DIR <- paths$data
plant_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field.csv")
ear_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field_ear_pheno.csv")

field_data_raw <- read.csv(plant_csv, na.strings = c("", "#N/A", "NA"))
ear_data_raw <- read.csv(ear_csv, na.strings = c("", "n/a", "NA"), skip = 1)

field_data <- field_data_raw %>%
  filter(P22. >= 3004, P22. <= 4192) %>%
  rename(
    rowid = P22.,
    Genotype = Who.What,
    PH = Height_Anthesis,
    STW40 = X40_DAP_dw,
    STW50 = X50_DAP_dw, 
    STW60 = X60_DAP_dw,
    STWHV = harvest_dw
  ) %>%
  mutate(
    Treatment = factor(Treatment, levels = c("HighP", "LowP")),
    Genotype = factor(Genotype),
    Rep = as.factor(Rep)
  ) %>%
  mutate(
    Treatment = fct_recode(Treatment, "+P" = "HighP", "-P" = "LowP"),
    Genotype = fct_recode(Genotype, "Inv4m" = "INV4M")
  )

ear_data <- ear_data_raw %>%
  select(-description, -RK, -CC, -NIR, ear_rep = rep) %>%
  rename(rowid = row) %>%
  arrange(rowid) %>%
  group_by(rowid) %>%
  select(-ear_rep) %>%
  summarise_all(mean, na.rm = TRUE) %>%
  droplevels()

field_data_complete <- field_data %>%
  left_join(ear_data, by = "rowid") %>%
  mutate(HI = TKW / STWHV) %>%
  mutate(
    Plot_Column = case_when(
      Plot == "PlotVIII" ~ Plot_Column + 10,
      Plot == "PlotVI" ~ Plot_Column,
      TRUE ~ Plot_Column
    )
  ) %>%
  filter(!is.na(PH)) %>%
  droplevels()

base_phenotypes <- c("PH", "DTA", "DTS", "STW40", "STW50", "STW60", "STWHV")
ear_phenotypes <- c("EW", "EL", "ED", "KRN", "CD", "TKW", "KW50", "TKN", "HI")
all_phenotypes <- unique(c(base_phenotypes, ear_phenotypes))

cat("Field data dimensions:", dim(field_data_complete), "\n")

## Field data dimensions: 127 39

cat("Phenotypes available:", paste(all_phenotypes, collapse = ", "), "\n")

## Phenotypes available: PH, DTA, DTS, STW40, STW50, STW60, STWHV, EW, EL, ED, KRN, CD, TKW, KW50, TKN, HI

3. Create plot-level data with GDD integration

plot_data <- field_data_complete %>%
  group_by(rowid, Rep, Plot_Row, Plot_Column, Genotype, Treatment) %>%
  summarise(
    across(all_of(all_phenotypes), ~ mean(.x, na.rm = TRUE)),
    n_plants = n(),
    .groups = 'drop'
  ) %>%
  mutate(
    across(all_of(all_phenotypes), ~ ifelse(is.nan(.x), NA, .x)),
    environment = environment_name,
    anthesis_date = planting_date + days(round(DTA)),
    silking_date = planting_date + days(round(DTS))
  ) %>%
  left_join(
    gdd_at_phenology %>% 
      rename(anthesis_date = date, GDDA = cumulative_gdd_from_planting),
    by = c("environment", "anthesis_date")
  ) %>%
  left_join(
    gdd_at_phenology %>% 
      rename(silking_date = date, GDDS = cumulative_gdd_from_planting),
    by = c("environment", "silking_date")
  ) %>%
  select(-environment, -anthesis_date, -silking_date)

plot_data$Plot_Row <- as.numeric(plot_data$Plot_Row)
plot_data$Plot_Column <- as.numeric(plot_data$Plot_Column)
plot_data$Rep <- as.factor(plot_data$Rep)

cat("Plot-level summary dimensions:", dim(plot_data), "\n")

## Plot-level summary dimensions: 127 25

cat("Average plants per plot:", round(mean(plot_data$n_plants, na.rm = TRUE), 1), "\n")

## Average plants per plot: 1

cat("GDD columns added: GDDA (anthesis), GDDS (silking)\n")

## GDD columns added: GDDA (anthesis), GDDS (silking)

cat("Plots with GDDA:", sum(!is.na(plot_data$GDDA)), "\n")

## Plots with GDDA: 127

cat("Plots with GDDS:", sum(!is.na(plot_data$GDDS)), "\n")

## Plots with GDDS: 127

4. Identify usable phenotypes (<50% missing)

all_traits <- c(all_phenotypes, "GDDA", "GDDS")

missing_summary <- plot_data %>%
  select(all_of(all_traits)) %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "phenotype", values_to = "missing_count") %>%
  mutate(
    total_obs = nrow(plot_data),
    missing_pct = round(missing_count / total_obs * 100, 1),
    available_n = total_obs - missing_count
  ) %>%
  arrange(missing_pct)

print(missing_summary)

## # A tibble: 18 × 5
##    phenotype missing_count total_obs missing_pct available_n
##    <chr>             <int>     <int>       <dbl>       <int>
##  1 PH                    0       127         0           127
##  2 DTA                   0       127         0           127
##  3 DTS                   0       127         0           127
##  4 GDDA                  0       127         0           127
##  5 GDDS                  0       127         0           127
##  6 STW40                 2       127         1.6         125
##  7 STWHV                 2       127         1.6         125
##  8 STW50                15       127        11.8         112
##  9 STW60                20       127        15.7         107
## 10 EW                   22       127        17.3         105
## 11 EL                   22       127        17.3         105
## 12 ED                   22       127        17.3         105
## 13 KRN                  22       127        17.3         105
## 14 CD                   22       127        17.3         105
## 15 TKW                  22       127        17.3         105
## 16 KW50                 22       127        17.3         105
## 17 TKN                  22       127        17.3         105
## 18 HI                   22       127        17.3         105

usable_phenotypes <- missing_summary %>%
  filter(missing_pct < 50) %>%
  pull(phenotype)

cat("Usable phenotypes (<50% missing):", paste(usable_phenotypes, collapse = ", "), "\n")

## Usable phenotypes (<50% missing): PH, DTA, DTS, GDDA, GDDS, STW40, STWHV, STW50, STW60, EW, EL, ED, KRN, CD, TKW, KW50, TKN, HI

5. Build TimePoints object for statgenHTP

single_time <- plot_data %>%
  mutate(
    Genotype = as.character(Genotype),
    Treatment = as.character(Treatment),
    Rep = as.character(Rep),
    Plot_Row = as.numeric(Plot_Row),
    Plot_Column = as.numeric(Plot_Column),
    time = as.POSIXct("2022-05-22 10:00:00")
  )

tp_data <- createTimePoints(
  dat = single_time,
  experimentName = environment_name,
  genotype = "Genotype",
  timePoint = "time",
  plotId = "rowid",
  repId = "Rep",
  rowNum = "Plot_Row",
  colNum = "Plot_Column",
  checkGenotypes = c("NUE", "B73"),
  addCheck = TRUE
)

summary(tp_data)

## tp_data contains data for experiment PSU2022.
## 
## It contains 1 time points.
## First time point: 2022-05-22 10:00:00 
## Last time point: 2022-05-22 10:00:00 
## 
## The following genotypes are defined as check genotypes: NUE, B73.

6. Fit SpATS per phenotype and extract corrected values

corrected_list <- list()
fit_info <- list()
all_models <- list()

for (phen in usable_phenotypes) {
  message("Fitting SpATS model for: ", phen)
  
  no_nas <- single_time %>%
    filter(!is.na(.data[[phen]]))
  
  if (nrow(no_nas) < 10) {
    message("Insufficient data for ", phen, " (n=", nrow(no_nas), ")")
    next
  }
  
  fit_try <- try(
    fitModels(
      TP = tp_data,
      trait = phen,
      extraFixedFactors = c("repId"),  # Treatment lives in Stage-2 lm
      timePoints = 1,
      what = "fixed"
    ),
    silent = TRUE
  )
  
  if (inherits(fit_try, "try-error")) {
    message("Fit failed for ", phen, ": ", fit_try)
    next
  }
  
  all_models[[phen]] <- fit_try
  
  corr <- try(getCorrected(fit_try), silent = TRUE)
  if (inherits(corr, "try-error") || is.null(corr)) {
    message("getCorrected() failed for ", phen)
    next
  }
  
  corr <- corr %>%
    select(-phen)
  
  corrected_list[[phen]] <- corr
  fit_info[[phen]] <- list(n_obs = nrow(no_nas), status = "success")
}

sp_corrected <- lapply(
  corrected_list,
  function(corr_df) {
    corr_df %>%
      select(timeNumber, timePoint, genotype, repId,
             rowId, colId, plotId)
  }
) %>%
  bind_rows() %>%
  arrange(plotId) %>%
  distinct()

## Treatment is no longer in SpATS extraFixedFactors (Stage-2 lm handles it)
## — re-attach it from single_time so downstream lm + emmeans has access.
sp_corrected <- sp_corrected %>%
  left_join(single_time %>%
              mutate(plotId = as.character(rowid)) %>%
              select(plotId, Treatment) %>%
              distinct(),
            by = "plotId")

for (phen in names(corrected_list)) {
  phen_corr <- paste0(phen, "_corr")
  corr_df <- corrected_list[[phen]] %>%
    select(plotId, all_of(phen_corr))
  sp_corrected <- left_join(sp_corrected, corr_df, by = "plotId")
}

colnames(sp_corrected) <- gsub("_corr", "", colnames(sp_corrected))

cat("Final sp_corrected dimensions:", dim(sp_corrected), "\n")
cat("Corrected traits:", 
    paste(intersect(usable_phenotypes, colnames(sp_corrected)), 
          collapse = ", "), "\n")
print(head(sp_corrected))

7. Spatial correction visualization (example for PH and HI)

if ("PH" %in% names(all_models)) {
  plot(all_models$PH, timePoints = 1)
  plot(all_models$PH, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}

if ("HI" %in% names(all_models)) {
  plot(all_models$HI, timePoints = 1)
  plot(all_models$HI, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}

if ("GDDA" %in% names(all_models)) {
  cat("\n### Spatial plots for GDDA\n")
  plot(all_models$GDDA, timePoints = 1)
  plot(all_models$GDDA, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}

## 
## ### Spatial plots for GDDA

if ("GDDS" %in% names(all_models)) {
  cat("\n### Spatial plots for GDDS\n")
  plot(all_models$GDDS, timePoints = 1)
  plot(all_models$GDDS, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}

## 
## ### Spatial plots for GDDS

8. Stage-2 contrasts: lm + emmeans (matches PSU2024 §8)

NIL_corrected <- sp_corrected %>%
  filter(genotype %in% c("Inv4m", "CTRL")) %>%
  mutate(genotype = factor(genotype, levels = c("CTRL", "Inv4m")),
         Treatment = factor(Treatment)) %>%
  droplevels()

## Per-trait lm + emmeans: marginal Genotype contrast (pooled over P) and
## conditional Genotype contrasts within each P_LEVEL. G x T interaction p
## stored as diagnostic (NOT in FDR pool).
fit_one_trait <- function(trait_name, data) {
  d <- data %>%
    pivot_longer(cols = all_of(trait_name),
                 names_to = "trait", values_to = "value") %>%
    filter(!is.na(value))
  if (nrow(d) < 4 || length(unique(d$genotype)) < 2 ||
      length(unique(d$Treatment)) < 2) return(NULL)

  m <- tryCatch(lm(value ~ genotype * Treatment, data = d),
                error = function(e) NULL)
  if (is.null(m)) return(NULL)

  anov  <- anova(m)
  p_int <- if ("genotype:Treatment" %in% rownames(anov))
             anov["genotype:Treatment", "Pr(>F)"] else NA_real_

  marg <- pairs(emmeans(m, ~ genotype), reverse = TRUE) %>%
    as_tibble() %>%
    transmute(trait = trait_name,
              contrast_type = "marginal",
              contrast_level = "marginal",
              estimate, SE, df, t = t.ratio, p = p.value,
              p_int_GxT = p_int)

  cond <- pairs(emmeans(m, ~ genotype | Treatment), reverse = TRUE) %>%
    as_tibble() %>%
    transmute(trait = trait_name,
              contrast_type = "conditional",
              contrast_level = paste0("Genotype|", Treatment),
              estimate, SE, df, t = t.ratio, p = p.value,
              p_int_GxT = p_int)

  bind_rows(marg, cond)
}

fitted_traits <- intersect(usable_phenotypes, names(NIL_corrected))

all_contrasts <- lapply(fitted_traits, fit_one_trait,
                        data = NIL_corrected) %>%
  bind_rows() %>%
  mutate(p_adj = p.adjust(p, method = "fdr"),
         signif = case_when(
           p_adj < 0.001 ~ "***",
           p_adj < 0.01  ~ "**",
           p_adj < 0.05  ~ "*",
           TRUE          ~ "ns"
         ))

cat("FDR pool size:", nrow(all_contrasts), "contrasts\n")

## FDR pool size: 54 contrasts

cat("Significant (FDR < 0.05):",
    sum(all_contrasts$p_adj < 0.05), "rows\n\n")

## Significant (FDR < 0.05): 19 rows

knitr::kable(all_contrasts %>% arrange(p_adj) %>%
               select(trait, contrast_type, contrast_level,
                      estimate, SE, df, t, p, p_adj, signif, p_int_GxT),
             digits = 4,
             caption = paste("All Stage-2 contrasts. p = nominal,",
                             "p_adj = BH-FDR across the pool."))

All Stage-2 contrasts. p = nominal, p_adj = BH-FDR across the pool.
trait	contrast_type	contrast_level	estimate	SE	df	t	p	p_adj	signif	p_int_GxT
PH	marginal	marginal	6.4849	1.1154	60	5.8138	0.0000	0.0000	***	0.4200
PH	conditional	Genotype\|-P	7.3906	1.5775	60	4.6851	0.0000	0.0002	***	0.4200
DTA	marginal	marginal	-1.2923	0.2648	60	-4.8811	0.0000	0.0002	***	0.6078
DTS	marginal	marginal	-1.0546	0.2221	60	-4.7475	0.0000	0.0002	***	0.1578
GDDA	marginal	marginal	-18.7391	3.9079	60	-4.7951	0.0000	0.0002	***	0.4584
GDDS	marginal	marginal	-14.9196	3.2005	60	-4.6617	0.0000	0.0002	***	0.0869
GDDS	conditional	Genotype\|+P	-20.4898	4.5261	60	-4.5270	0.0000	0.0002	***	0.0869
DTS	conditional	Genotype\|+P	-1.3724	0.3142	60	-4.3685	0.0001	0.0003	***	0.1578
GDDA	conditional	Genotype\|+P	-21.6557	5.5267	60	-3.9184	0.0002	0.0014	**	0.4584
DTA	conditional	Genotype\|+P	-1.4290	0.3744	60	-3.8163	0.0003	0.0017	**	0.6078
PH	conditional	Genotype\|+P	5.5792	1.5775	60	3.5368	0.0008	0.0039	**	0.4200
CD	conditional	Genotype\|-P	-2.2160	0.6250	49	-3.5454	0.0009	0.0039	**	0.0156
DTA	conditional	Genotype\|-P	-1.1557	0.3744	60	-3.0866	0.0031	0.0127	*	0.6078
STW40	conditional	Genotype\|+P	-3.1330	1.0726	59	-2.9209	0.0049	0.0190	*	0.0455
GDDA	conditional	Genotype\|-P	-15.8226	5.5267	60	-2.8630	0.0058	0.0208	*	0.4584
STW50	conditional	Genotype\|+P	-4.9892	1.8034	53	-2.7666	0.0078	0.0263	*	0.0330
HI	marginal	marginal	0.1151	0.0422	49	2.7241	0.0089	0.0283	*	0.5071
STWHV	conditional	Genotype\|+P	-13.3954	5.0233	58	-2.6667	0.0099	0.0297	*	0.1784
STWHV	marginal	marginal	-8.6387	3.4923	58	-2.4737	0.0163	0.0464	*	0.1784
DTS	conditional	Genotype\|-P	-0.7368	0.3142	60	-2.3454	0.0223	0.0585	ns	0.1578
CD	marginal	marginal	-1.0727	0.4562	49	-2.3514	0.0228	0.0585	ns	0.0156
HI	conditional	Genotype\|+P	0.1433	0.0616	49	2.3285	0.0241	0.0590	ns	0.5071
ED	conditional	Genotype\|-P	-1.9903	0.9195	49	-2.1645	0.0353	0.0771	ns	0.2100
KRN	conditional	Genotype\|-P	-0.6956	0.3208	49	-2.1681	0.0350	0.0771	ns	0.3742
TKW	marginal	marginal	8.0148	3.7108	49	2.1599	0.0357	0.0771	ns	0.5907
STW40	marginal	marginal	-1.5960	0.7523	59	-2.1215	0.0381	0.0791	ns	0.0455
GDDS	conditional	Genotype\|-P	-9.3495	4.5261	60	-2.0657	0.0432	0.0836	ns	0.0869
KRN	marginal	marginal	-0.4856	0.2341	49	-2.0740	0.0434	0.0836	ns	0.3742
EL	conditional	Genotype\|+P	0.8117	0.3980	49	2.0395	0.0468	0.0872	ns	0.1108
EW	marginal	marginal	8.6107	4.4427	49	1.9382	0.0584	0.1051	ns	0.7230
STW50	marginal	marginal	-2.3120	1.2230	53	-1.8905	0.0642	0.1118	ns	0.0330
TKW	conditional	Genotype\|+P	10.0238	5.4063	49	1.8541	0.0697	0.1177	ns	0.5907
ED	marginal	marginal	-1.1378	0.6711	49	-1.6955	0.0963	0.1576	ns	0.2100
EW	conditional	Genotype\|+P	10.1946	6.4726	49	1.5751	0.1217	0.1933	ns	0.7230
STW60	conditional	Genotype\|-P	7.1062	4.6379	50	1.5322	0.1318	0.2033	ns	0.1120
HI	conditional	Genotype\|-P	0.0869	0.0579	49	1.5004	0.1399	0.2099	ns	0.5071
EL	marginal	marginal	0.3680	0.2732	49	1.3472	0.1841	0.2687	ns	0.1108
TKN	marginal	marginal	33.9659	26.6347	49	1.2753	0.2082	0.2959	ns	0.9310
TKW	conditional	Genotype\|-P	6.0059	5.0845	49	1.1812	0.2432	0.3368	ns	0.5907
EW	conditional	Genotype\|-P	7.0268	6.0874	49	1.1543	0.2540	0.3428	ns	0.7230
TKN	conditional	Genotype\|+P	36.2834	38.8041	49	0.9350	0.3544	0.4667	ns	0.9310
TKN	conditional	Genotype\|-P	31.6483	36.4947	49	0.8672	0.3901	0.5015	ns	0.9310
STWHV	conditional	Genotype\|-P	-3.8821	4.8529	58	-0.7999	0.4270	0.5241	ns	0.1784
KRN	conditional	Genotype\|+P	-0.2756	0.3411	49	-0.8081	0.4230	0.5241	ns	0.3742
STW60	conditional	Genotype\|+P	-3.8955	4.9724	50	-0.7834	0.4371	0.5245	ns	0.1120
STW60	marginal	marginal	1.6054	3.3998	50	0.4722	0.6388	0.7340	ns	0.1120
KW50	conditional	Genotype\|+P	0.2554	0.5378	49	0.4749	0.6370	0.7340	ns	0.7560
KW50	marginal	marginal	0.1400	0.3691	49	0.3793	0.7061	0.7943	ns	0.7560
ED	conditional	Genotype\|+P	-0.2853	0.9777	49	-0.2918	0.7716	0.8504	ns	0.2100
STW50	conditional	Genotype\|-P	0.3652	1.6524	53	0.2210	0.8259	0.8901	ns	0.0330
EL	conditional	Genotype\|-P	-0.0756	0.3743	49	-0.2021	0.8407	0.8901	ns	0.1108
CD	conditional	Genotype\|+P	0.0707	0.6646	49	0.1064	0.9157	0.9509	ns	0.0156
STW40	conditional	Genotype\|-P	-0.0591	1.0552	59	-0.0560	0.9555	0.9613	ns	0.0455
KW50	conditional	Genotype\|-P	0.0247	0.5058	49	0.0488	0.9613	0.9613	ns	0.7560

## y.position per trait for downstream stat_pvalue_manual
y_positions <- NIL_corrected %>%
  pivot_longer(cols = any_of(fitted_traits),
               names_to = "trait", values_to = "value") %>%
  filter(!is.na(value)) %>%
  group_by(trait) %>%
  summarise(y.position = max(value, na.rm = TRUE) +
                          0.10 * diff(range(value, na.rm = TRUE)),
            .groups = "drop")

## contrast_df = marginal row per trait; drives the panel star
contrast_df <- all_contrasts %>%
  filter(contrast_type == "marginal") %>%
  left_join(y_positions, by = "trait") %>%
  mutate(group1 = "CTRL", group2 = "Inv4m",
         p.adj = p_adj,
         p.adj.signif = signif)

## Back-compat alias: downstream boxplot code references `htest` with rstatix
## column names (group1/group2/y.position/p.adj.signif).
htest <- contrast_df %>%
  select(trait, group1, group2, y.position, p.adj, p.adj.signif,
         signif, p_int_GxT,
         effect = estimate, se_diff = SE, t, p, df)

alpha <- 0.05
sig_traits <- contrast_df %>% filter(p_adj < alpha) %>%
  arrange(p_adj) %>% pull(trait)

cat("Significant traits — marginal Inv4m - CTRL (FDR < 0.05):",
    if (length(sig_traits)) paste(sig_traits, collapse = ", ") else "none",
    "\n")

## Significant traits — marginal Inv4m - CTRL (FDR < 0.05): PH, DTA, DTS, GDDA, GDDS, HI, STWHV

pheno_theme2 <- theme_classic2(base_size = 20) +
  theme(
    plot.title = element_markdown(hjust = 0.5, face = "bold"),
    axis.title.y = element_markdown(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 25, face = "bold", color = "black"),
    strip.background = element_blank(),
    strip.text = element_markdown(size = 20),
    legend.position = "none"
  )

if (length(sig_traits) == 0) {
  warning("No traits significant after FDR correction.")
} else {
  # --- Main Figure 2 Panel A: 6 traits with units ---
  main_traits <- c("PH", "DTA", "DTS", "HI", "STWHV", "CD")
  trait_units <- c(PH = "cm", DTA = "Days", DTS = "Days", HI = "", STWHV = "g", CD = "mm")

  plot_data_main <- NIL_corrected %>%
    pivot_longer(
      cols = all_of(usable_phenotypes),
      names_to = "trait",
      values_to = "value"
    ) %>%
    filter(trait %in% main_traits, !is.na(value)) %>%
    mutate(
      trait = factor(trait, levels = main_traits),
      genotype = factor(genotype, levels = c("CTRL", "Inv4m"))
    )

  test_main <- htest %>%
    filter(trait %in% main_traits)

  # Theme with bold strip text, no title, rotated x-axis with ggtext
 pheno_theme_panel <- theme_classic2(base_size = 20) +
    theme(
      plot.title = element_blank(),
      axis.title.y = element_text(size = 18),
      axis.title.x = element_blank(),
      axis.text.x = element_markdown(size = 18, face = "bold", color = "black"),
      axis.text.y = element_text(size = 16),
      strip.background = element_blank(),
      strip.text = element_text(size = 20, face = "bold"),
      legend.position = "none"
    )

  # Create individual plots with custom y-axis labels
  make_trait_plot <- function(trait_name, data, tests, units) {
    d <- filter(data, trait == trait_name)
    t <- filter(tests, trait == trait_name)
    y_label <- units[trait_name]

    p <- ggplot(d, aes(x = genotype, y = value, color = genotype)) +
      geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0) %>%
      with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
      geom_quasirandom(size = 2) %>%
      with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
      scale_color_manual(values = c("CTRL" = "gold", "Inv4m" = "purple4")) +
      scale_x_discrete(labels = c("CTRL" = "CTRL", "Inv4m" = "*Inv4m*")) +
      labs(title = trait_name, y = y_label, x = NULL) +
      stat_pvalue_manual(t, tip.length = 0.01, step.height = 0.08, size = 8, bracket.size = 0.8) +
      scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
      pheno_theme_panel +
      theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5))

    # Custom 1-day breaks for DTA/DTS
    if (trait_name %in% c("DTA", "DTS")) {
      y_range <- range(d$value, na.rm = TRUE)
      p <- p + scale_y_continuous(
        breaks = seq(floor(y_range[1]), ceiling(y_range[2]), by = 1),
        expand = expansion(mult = c(0.1, 0.15))
      )
    }
    return(p)
  }

  trait_plots <- lapply(main_traits, make_trait_plot, data = plot_data_main, tests = test_main, units = trait_units)

  main_panel <- cowplot::plot_grid(
    plotlist = trait_plots,
    nrow = 2,
    ncol = 3,
    align = "hv",
    axis = "tblr"
  )

  print(main_panel)

  # Save main panel for Figure 2 (2 rows x 3 cols)
  ggsave(
    file.path(paths$figures, "field_phenotypes_panel.pdf"),
    main_panel,
    width = 10, height = 7, dpi = 300
  )
  ggsave(
    file.path(paths$figures, "field_phenotypes_panel.png"),
    main_panel,
    width = 10, height = 7, dpi = 300
  )
  cat("Saved field phenotypes panel to:", file.path(paths$figures, "field_phenotypes_panel.png"), "\n")

  # --- Supplementary: GDDA, GDDS panel ---
  gdd_traits <- c("GDDA", "GDDS")
  gdd_traits_present <- intersect(gdd_traits, sig_traits)

  if (length(gdd_traits_present) > 0) {
    plot_data_gdd <- NIL_corrected %>%
      pivot_longer(cols = all_of(usable_phenotypes), names_to = "trait", values_to = "value") %>%
      filter(trait %in% gdd_traits_present, !is.na(value)) %>%
      mutate(trait = factor(trait, levels = gdd_traits_present))

    test_gdd <- htest %>% filter(trait %in% gdd_traits_present)

    gdd_plot <- ggplot(plot_data_gdd, aes(x = genotype, y = value, color = genotype)) +
      geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0) %>%
      with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
      geom_quasirandom(size = 2) %>%
      with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
      scale_color_manual(values = c("CTRL" = "gold", "Inv4m" = "purple4")) +
      labs(y = "GDD (°C·days)", x = NULL) +
      stat_pvalue_manual(test_gdd, tip.length = 0.01, step.height = 0.08, size = 8, bracket.size = 0.8) +
      scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
      facet_wrap(~ trait, scales = "free_y", ncol = 2) +
      pheno_theme_panel

    print(gdd_plot)

    ggsave(file.path(paths$figures, "gdd_phenotypes_panel.pdf"), gdd_plot, width = 6, height = 4, dpi = 300)
    ggsave(file.path(paths$figures, "gdd_phenotypes_panel.png"), gdd_plot, width = 6, height = 4, dpi = 300)
    cat("Saved GDD phenotypes panel to:", file.path(paths$figures, "gdd_phenotypes_panel.png"), "\n")
  }

  # --- All significant traits plot (for reference) ---
  plot_data_sig <- NIL_corrected %>%
    pivot_longer(cols = all_of(usable_phenotypes), names_to = "trait", values_to = "value") %>%
    filter(trait %in% sig_traits, !is.na(value)) %>%
    mutate(trait = as.character(trait))

  test_to_plot <- htest %>% filter(trait %in% sig_traits)

  diff_plot <- ggplot(plot_data_sig, aes(x = genotype, y = value, color = genotype)) +
    geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0) %>%
    with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
    geom_quasirandom(size = 2) %>%
    with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
    scale_color_manual(values = c("CTRL" = "gold", "Inv4m" = "purple4")) +
    labs(title = "Inv4m vs CTRL: Spatially Corrected Phenotypes (FDR-Adjusted)", y = "Corrected Value", x = "Genotype") +
    stat_pvalue_manual(test_to_plot, tip.length = 0.01, step.height = 0.08, size = 10, bracket.size = 0.8) +
    scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
    facet_wrap(~ factor(trait, levels = sig_traits), scales = "free_y", ncol = 4) +
    pheno_theme_panel

  print(diff_plot)
}

## Saved field phenotypes panel to: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/figures/field_phenotypes_panel.png

## Saved GDD phenotypes panel to: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/figures/gdd_phenotypes_panel.png

8b. Save Stage-2 contrasts (matches PSU2024 CSV naming)

write_csv(contrast_df %>% select(-y.position, -group1, -group2, -p.adj,
                                  -p.adj.signif),
          file.path(paths$intermediate,
                    paste0(environment_name,
                           "_focal_contrast_marginal.csv")))

write_csv(all_contrasts,
          file.path(paths$intermediate,
                    paste0(environment_name,
                           "_focal_contrast_all.csv")))

cat("Outputs:\n  ",
    file.path(paths$intermediate,
              paste0(environment_name, "_focal_contrast_marginal.csv")),
    "\n  ",
    file.path(paths$intermediate,
              paste0(environment_name, "_focal_contrast_all.csv")), "\n")

## Outputs:
##    /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/intermediate/PSU2022_focal_contrast_marginal.csv 
##    /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/intermediate/PSU2022_focal_contrast_all.csv

9. Summary and export

cat("\n=== Analysis Summary ===\n")

## 
## === Analysis Summary ===

cat("Total phenotypes evaluated:", length(all_traits), "\n")

## Total phenotypes evaluated: 18

cat("Usable phenotypes (<50% missing):", length(usable_phenotypes), "\n")

## Usable phenotypes (<50% missing): 18

cat("Successfully fitted models:", length(all_models), "\n")

## Successfully fitted models: 18

if (exists("sp_corrected") && nrow(sp_corrected) > 0) {
  cat("Plots with corrected data:", nrow(sp_corrected), "\n")

  output_file <- file.path(paths$intermediate, paste0(environment_name, "_spatially_corrected_phenotypes.csv"))
  write_csv(sp_corrected, output_file)
  cat("Spatially corrected data exported to:", output_file, "\n")
} else {
  cat("No corrected data to export\n")
}

## Plots with corrected data: 127 
## Spatially corrected data exported to: /Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4m/results/inversion_paper/intermediate/PSU2022_spatially_corrected_phenotypes.csv

if (length(fit_info) > 0) {
  cat("\n=== Model Fit Details ===\n")
  for (trait in names(fit_info)) {
    info <- fit_info[[trait]]
    cat(sprintf("%s: n=%d, status=%s\n", trait, info$n_obs, info$status))
  }
}

## 
## === Model Fit Details ===
## PH: n=127, status=success
## DTA: n=127, status=success
## DTS: n=127, status=success
## GDDA: n=127, status=success
## GDDS: n=127, status=success
## STW40: n=125, status=success
## STWHV: n=125, status=success
## STW50: n=112, status=success
## STW60: n=107, status=success
## EW: n=105, status=success
## EL: n=105, status=success
## ED: n=105, status=success
## KRN: n=105, status=success
## CD: n=105, status=success
## TKW: n=105, status=success
## KW50: n=105, status=success
## TKN: n=105, status=success
## HI: n=105, status=success