This notebook generates the data for Supplementary Tables S1 (Inv4m delimitation and breakpoints) and S2 (breakpoint knob repeats) from:
scripts/02_genomics_foundation/run_repeat_annotation_4genomes.sh.| Label | Assembly | Chr4 name |
|---|---|---|
| B73 | Zm-B73-REFERENCE-NAM-5.0 | chr4 |
| PT | Zm-PT-REFERENCE-HiLo-1.0 | PT04 |
| TIL18 | Zx-TIL18-REFERENCE-PanAnd-1.0 | chr4 |
| Mi21 NIL | Mi21 NIL HiFi (hifiasm + RagTag) | scf00000010_RagTag |
Breakpoint coordinates were defined by the AnchorWave alignment endpoints where reference-orientation synteny was lost. Bounding genes were identified by inspecting gene annotations (B73 NAM5 GFF3 for B73, PT HiLo-1.0 GFF3 for PT, TIL18 PanAnd-1.0 GFF3 for TIL18, and Liftoff B73-to-Mi21 GFF3 for Mi21 NIL) flanking the breakpoint segments.
Curated inversion boundaries — the first and last genes within the inverted region, as identified by AnchorWave strand analysis.
inv4m_limits <- tribble(
~genome, ~start, ~end, ~start_gene, ~end_gene,
"B73", 172883881, 188131462, "Zm00001eb190470", "Zm00001eb194800",
"PT", 186925484, 173486186, "Zm00109aa017629", "Zm00109aa018009",
"TIL18", 180365316, 193570652, "Zx00002aa015554", "Zx00002aa015905",
"Mi21", 156752807, 167620500, "Zm00001eb194800", "Zm00001eb190470"
) %>%
mutate(length = abs(end - start))
kable(inv4m_limits, format.args = list(big.mark = ","))
| genome | start | end | start_gene | end_gene | length |
|---|---|---|---|---|---|
| B73 | 172,883,881 | 188,131,462 | Zm00001eb190470 | Zm00001eb194800 | 15,247,581 |
| PT | 186,925,484 | 173,486,186 | Zm00109aa017629 | Zm00109aa018009 | 13,439,298 |
| TIL18 | 180,365,316 | 193,570,652 | Zx00002aa015554 | Zx00002aa015905 | 13,205,336 |
| Mi21 | 156,752,807 | 167,620,500 | Zm00001eb194800 | Zm00001eb190470 | 10,867,693 |
Gene-free segments flanking the inversion, defined as the gap between the last gene on the collinear side and the first gene on the inverted side.
“Upstream” and “Downstream” are defined in B73-sense orientation: upstream is the breakpoint at lower B73 coordinates (~172.9 Mb), downstream at higher (~188.1 Mb). For highland genomes carrying the inversion, the physical positions are swapped but the labels follow the B73 convention.
breakpoint_segments <- tribble(
~genome, ~up_start, ~up_end, ~down_start, ~down_end,
# Upstream/downstream follow B73-sense orientation (~172.9 Mb / ~188.1 Mb).
# For highland genomes the inversion swaps the physical positions, but
# the labels are defined by which B73 breakpoint each segment corresponds to.
"B73", 172561330, 172883881, 188131461, 188220418,
"PT", 173369064, 173486186, 186925483, 187092654,
"TIL18", 180269950, 180365316, 193570651, 193734606,
# Mi21: B73-sense upstream breakpoint maps to Mi21 ~167.6 Mb (higher coord),
# B73-sense downstream maps to Mi21 ~156.6 Mb (lower coord).
"Mi21", 167621241, 167838764, 156560061, 156750619
) %>%
mutate(
up_span = up_end - up_start,
down_span = down_end - down_start
)
kable(breakpoint_segments, format.args = list(big.mark = ","))
| genome | up_start | up_end | down_start | down_end | up_span | down_span |
|---|---|---|---|---|---|---|
| B73 | 172,561,330 | 172,883,881 | 188,131,461 | 188,220,418 | 322,551 | 88,957 |
| PT | 173,369,064 | 173,486,186 | 186,925,483 | 187,092,654 | 117,122 | 167,171 |
| TIL18 | 180,269,950 | 180,365,316 | 193,570,651 | 193,734,606 | 95,366 | 163,955 |
| Mi21 | 167,621,241 | 167,838,764 | 156,560,061 | 156,750,619 | 217,523 | 190,558 |
Genes flanking each breakpoint segment (outside → inside the inversion).
bounding_genes <- tribble(
~genome, ~up_outer, ~up_inner, ~down_inner, ~down_outer,
"B73", "Zm00001eb190450", "Zm00001eb190470", "Zm00109aa017627", "Zm00001eb194820",
"PT", "Zm00109aa017627", "Zm00109aa017629", "Zx00002aa015905", "Zm00109aa017629",
"TIL18", "Zx00002aa015905", "Zx00002aa015906", "Zx00002aa015906", "Zx00002aa015554",
"Mi21", "Zm00001eb190450", "Zm00001eb194800", "Zm00001eb190470", "Zm00001eb194820"
)
kable(bounding_genes)
| genome | up_outer | up_inner | down_inner | down_outer |
|---|---|---|---|---|
| B73 | Zm00001eb190450 | Zm00001eb190470 | Zm00109aa017627 | Zm00001eb194820 |
| PT | Zm00109aa017627 | Zm00109aa017629 | Zx00002aa015905 | Zm00109aa017629 |
| TIL18 | Zx00002aa015905 | Zx00002aa015906 | Zx00002aa015906 | Zx00002aa015554 |
| Mi21 | Zm00001eb190450 | Zm00001eb194800 | Zm00001eb190470 | Zm00001eb194820 |
apply_label <- function(df) {
df %>% mutate(genome = genome_labels[genome])
}
s1_limits <- inv4m_limits %>%
select(genome, start, end, length, start_gene, end_gene) %>%
apply_label()
s1_segments <- breakpoint_segments %>%
select(genome, up_start, up_end, up_span, down_start, down_end, down_span) %>%
apply_label()
s1_genes <- bounding_genes %>% apply_label()
write_csv(s1_limits, file.path(paths$intermediate, "table_S1_inv4m_limits.csv"))
write_csv(s1_segments, file.path(paths$intermediate, "table_S1_breakpoint_segments.csv"))
write_csv(s1_genes, file.path(paths$intermediate, "table_S1_bounding_genes.csv"))
BLAST+ megablast (outfmt 6) results from
run_repeat_annotation_4genomes.sh. Columns: qseqid, sseqid,
pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue,
bitscore.
blast_cols <- c("qseqid", "sseqid", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send",
"evalue", "bitscore")
genomes <- c("B73", "PT", "TIL18", "Mi21")
read_blast <- function(repeat_type, genome) {
f <- file.path(paths$intermediate, paste0(repeat_type, "_", genome, ".blast"))
read_tsv(f, col_names = blast_cols, col_types = cols(.default = "c")) %>%
mutate(
across(c(length, sstart, send, bitscore), as.numeric),
genome = genome,
repeat_type = repeat_type,
# Normalize subject coordinates so smin < smax
smin = pmin(sstart, send),
smax = pmax(sstart, send)
)
}
all_blast <- bind_rows(
lapply(genomes, function(g) read_blast("knob180", g)),
lapply(genomes, function(g) read_blast("TR-1", g))
)
cat(sprintf("Loaded %s BLAST hits across %d genomes\n",
format(nrow(all_blast), big.mark = ","), length(genomes)))
## Loaded 210,358 BLAST hits across 4 genomes
# Breakpoint segment coordinates per genome (B73-sense upstream/downstream)
segments <- breakpoint_segments %>%
select(genome, up_start, up_end, down_start, down_end)
# Internal knob region: between inv4m boundaries
internal_regions <- inv4m_limits %>%
mutate(
int_start = pmin(start, end),
int_end = pmax(start, end)
) %>%
select(genome, int_start, int_end)
count_in_region <- function(blast_df, genome_id, region_start, region_end) {
blast_df %>%
filter(
genome == genome_id,
repeat_type == "knob180",
(smin >= region_start & smin <= region_end) |
(smax >= region_start & smax <= region_end)
) %>%
summarise(
count = n(),
match_len = sum(length),
hit_start = ifelse(n() > 0, min(smin), NA_real_),
hit_end = ifelse(n() > 0, max(smax), NA_real_),
span = ifelse(n() > 0, max(smax) - min(smin), NA_real_),
.groups = "drop"
)
}
# Upstream breakpoint repeats
up_repeats <- lapply(seq_len(nrow(segments)), function(i) {
s <- segments[i, ]
count_in_region(all_blast, s$genome, s$up_start, s$up_end) %>%
mutate(genome = s$genome, section = "upstream",
seg_span = s$up_end - s$up_start)
}) %>% bind_rows()
# Downstream breakpoint repeats
down_repeats <- lapply(seq_len(nrow(segments)), function(i) {
s <- segments[i, ]
count_in_region(all_blast, s$genome, s$down_start, s$down_end) %>%
mutate(genome = s$genome, section = "downstream",
seg_span = s$down_end - s$down_start)
}) %>% bind_rows()
# Internal repeats
int_repeats <- lapply(seq_len(nrow(internal_regions)), function(i) {
r <- internal_regions[i, ]
count_in_region(all_blast, r$genome, r$int_start, r$int_end) %>%
mutate(genome = r$genome, section = "internal",
seg_span = r$int_end - r$int_start)
}) %>% bind_rows()
knob_repeats <- bind_rows(up_repeats, down_repeats, int_repeats) %>%
mutate(
pct_knob_span = ifelse(!is.na(span), round(span / seg_span * 100), NA),
pct_knob = ifelse(!is.na(span) & span > 0,
round(match_len / span * 100), NA),
genome_label = genome_labels[genome]
) %>%
select(genome = genome_label, section, count, match_len,
hit_start, hit_end, span, pct_knob_span, pct_knob)
knob_repeats %>% filter(section == "upstream") %>%
kable(format.args = list(big.mark = ","))
| genome | section | count | match_len | hit_start | hit_end | span | pct_knob_span | pct_knob |
|---|---|---|---|---|---|---|---|---|
| B73 | upstream | 353 | 60,552 | 172,561,781 | 172,882,488 | 320,707 | 99 | 19 |
| PT | upstream | 45 | 7,938 | 173,370,041 | 173,392,244 | 22,203 | 19 | 36 |
| TIL18 | upstream | 45 | 7,938 | 180,270,927 | 180,293,129 | 22,202 | 23 | 36 |
| Mi21 NIL | upstream | 210 | 36,316 | 167,622,041 | 167,760,234 | 138,193 | 64 | 26 |
knob_repeats %>% filter(section == "internal") %>%
select(-pct_knob_span, -pct_knob) %>%
kable(format.args = list(big.mark = ","))
| genome | section | count | match_len | hit_start | hit_end | span |
|---|---|---|---|---|---|---|
| B73 | internal | 651 | 113,511 | 184,152,381 | 184,525,209 | 372,828 |
| PT | internal | 355 | 61,995 | 177,250,062 | 177,516,479 | 266,417 |
| TIL18 | internal | 407 | 70,957 | 183,816,884 | 184,072,480 | 255,596 |
| Mi21 NIL | internal | 330 | 57,545 | 160,494,941 | 160,755,992 | 261,051 |
knob_repeats %>% filter(section == "downstream") %>%
kable(format.args = list(big.mark = ","))
| genome | section | count | match_len | hit_start | hit_end | span | pct_knob_span | pct_knob |
|---|---|---|---|---|---|---|---|---|
| B73 | downstream | 27 | 4,624 | 188,148,945 | 188,220,088 | 71,143 | 80 | 6 |
| PT | downstream | 131 | 21,945 | 186,926,848 | 187,051,496 | 124,648 | 75 | 18 |
| TIL18 | downstream | 155 | 26,350 | 193,572,019 | 193,671,226 | 99,207 | 61 | 27 |
| Mi21 NIL | downstream | 57 | 9,865 | 156,560,208 | 156,615,508 | 55,300 | 29 | 18 |
write_csv(knob_repeats, file.path(paths$intermediate, "table_S2_knob_repeats.csv"))
Compare computed values against the hardcoded values in
main.tex.
# Spot-check B73 upstream: expect count=352, match_len=59580
b73_up <- knob_repeats %>%
filter(genome == "B73", section == "upstream")
cat(sprintf("B73 upstream: count=%d (expect 352), match_len=%d (expect 59580)\n",
b73_up$count, b73_up$match_len))
## B73 upstream: count=353 (expect 352), match_len=60552 (expect 59580)
# Spot-check Mi21 downstream: expect count=57, match_len=9865
mi21_down <- knob_repeats %>%
filter(genome == "Mi21 NIL", section == "downstream")
cat(sprintf("Mi21 downstream: count=%d (expect 57), match_len=%d (expect 9865)\n",
mi21_down$count, mi21_down$match_len))
## Mi21 downstream: count=57 (expect 57), match_len=9865 (expect 9865)
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.51 readr_2.1.6 tidyr_1.3.2 dplyr_1.1.4 here_1.0.2
##
## loaded via a namespace (and not attached):
## [1] bit_4.6.0 jsonlite_2.0.0 compiler_4.5.2 crayon_1.5.3
## [5] tidyselect_1.2.1 parallel_4.5.2 jquerylib_0.1.4 yaml_2.3.12
## [9] fastmap_1.2.0 R6_2.6.1 generics_0.1.4 tibble_3.3.0
## [13] rprojroot_2.1.1 bslib_0.9.0 pillar_1.11.1 tzdb_0.5.0
## [17] rlang_1.1.6 cachem_1.1.0 xfun_0.55 sass_0.4.10
## [21] bit64_4.6.0-1 otel_0.2.0 cli_3.6.5 withr_3.0.2
## [25] magrittr_2.0.4 digest_0.6.39 vroom_1.6.7 hms_1.1.4
## [29] lifecycle_1.0.4 vctrs_0.6.5 evaluate_1.0.5 glue_1.8.0
## [33] rmarkdown_2.30 purrr_1.2.0 tools_4.5.2 pkgconfig_2.0.3
## [37] htmltools_0.5.9