Differentiation Progression Analysis

Quantifying differentiation timing differences from bulk RNA-seq time course data

Overview

PCA-based method for comparing differentiation progression between conditions in bulk RNA-seq time course data.

Dataset. Data from Martinez et al. (2024): D21 (euploid control) and T21 (trisomy 21) iPSC-derived neural progenitors at Days 6, 10, and 17 (18 samples, 3 replicates per condition).

Workflow

Mathematical Framework

Maturation Score

Each sample is projected onto $\vec{v}$ and normalized so the early centroid maps to 0 and the late centroid maps to 1:

\[ s_i = \frac{ \left(\mathbf{z}_i - \bar{\mathbf{z}}_{t_{\textrm{early}}}\right) \cdot \vec{v} }{ \lVert\vec{v}\rVert^2 } \]

Analysis

Setup

Code

library(DESeq2)
library(ggplot2)
library(dplyr)
library(tidyr)
library(tibble)
library(plotly)
library(ComplexHeatmap)
library(circlize)
library(viridis)

meta <- readRDS("dat/metadata/WC24_metadata_clean.rds")
raw_counts <- readRDS(
  "dat/counts/raw/WC24_filt_raw_counts.rds"
)
vst_counts <- readRDS(
  "dat/counts/vst/WC24_vst_counts.rds"
)

meta$timepoint <- factor(
  meta$timepoint, levels = c(6, 10, 17)
)

theme_pub <- function(base_size = 10) {
  theme_classic(base_size = base_size) +
  theme(
    text = element_text(color = "black"),
    axis.text = element_text(
      size = base_size - 2, color = "black"
    ),
    axis.title = element_text(
      size = base_size - 1, color = "black"
    ),
    axis.line = element_line(
      color = "black", linewidth = 0.3
    ),
    axis.ticks = element_line(
      linewidth = 0.3, color = "black"
    ),
    panel.grid.major = element_line(
      color = "#D9D9D9", linewidth = 0.22
    ),
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    legend.text = element_text(
      size = base_size - 2, color = "black"
    ),
    legend.key.size = unit(0.35, "cm"),
    strip.text = element_text(
      size = base_size - 1, face = "bold",
      color = "black"
    ),
    strip.background = element_blank()
  )
}

col_d21 <- "#1F78B4"
col_t21 <- "#E31A1C"

Feature Selection: Likelihood Ratio Test

Code

ctrl_meta <- meta[meta$genotype == "D21", ]
ctrl_counts <- raw_counts[, ctrl_meta$sample_id]

dds <- DESeqDataSetFromMatrix(
  countData = ctrl_counts,
  colData   = ctrl_meta,
  design    = ~ timepoint
)
dds <- DESeq(dds, test = "LRT", reduced = ~ 1)
res <- results(dds)
res <- res[order(res$padj), ]

sig_genes <- rownames(res)[
  !is.na(res$padj) & res$padj < 0.05
]
n_top <- min(1000, length(sig_genes))
top_genes <- head(rownames(res), n_top)

cat(
  "Significant genes (padj < 0.05):",
  length(sig_genes), "\n",
  "Genes used for PCA:", n_top
)

Significant genes (padj < 0.05): 8299 
 Genes used for PCA: 1000

Temporally Variable Gene Heatmap

Code

hm_mat <- as.matrix(
  vst_counts[sig_genes, ctrl_meta$sample_id]
)
hm_z <- t(scale(t(hm_mat)))
hm_z[hm_z >  2] <-  2
hm_z[hm_z < -2] <- -2

col_fun <- colorRamp2(
  seq(-2, 2, length.out = 100),
  viridis(100)
)

ht <- Heatmap(
  hm_z,
  name                   = "Z-score",
  col                    = col_fun,
  cluster_columns        = FALSE,
  cluster_rows           = TRUE,
  clustering_method_rows = "ward.D2",
  row_km                 = 4,
  row_gap                = unit(2, "mm"),
  show_row_names         = FALSE,
  show_column_names      = FALSE,
  column_split           = factor(
    ctrl_meta[colnames(hm_z), "timepoint"],
    levels = c(6, 10, 17)
  ),
  column_title    = c("Day 6", "Day 10", "Day 17"),
  column_gap      = unit(3, "mm"),
  row_title       = paste0(
    length(sig_genes), " genes"
  ),
  border          = FALSE,
  use_raster      = TRUE
)

draw(ht)

Figure 1: Z-scored expression of temporally variable genes across D21 control timepoints (k=4 row clusters, viridis palette, winsorized at +/- 2)

Temporal Expression Patterns

Six temporal profiles identified among LRT-significant genes.

Code

ctrl_vst <- vst_counts[sig_genes, ctrl_meta$sample_id]
tp_num <- as.numeric(
  as.character(ctrl_meta$timepoint)
)

mean_d6  <- rowMeans(ctrl_vst[, tp_num == 6])
mean_d10 <- rowMeans(ctrl_vst[, tp_num == 10])
mean_d17 <- rowMeans(ctrl_vst[, tp_num == 17])

# All 6 possible 3-timepoint orderings
patterns <- list(
  "Monotonic Decrease" = names(which(
    mean_d6 > mean_d10 & mean_d10 > mean_d17
  )),
  "Early Drop" = names(which(
    mean_d6 > mean_d17 & mean_d17 > mean_d10
  )),
  "U-shape" = names(which(
    mean_d17 > mean_d6 & mean_d6 > mean_d10
  )),
  "Transient Peak" = names(which(
    mean_d10 > mean_d6 & mean_d6 > mean_d17
  )),
  "Peak at Day 10" = names(which(
    mean_d10 > mean_d17 & mean_d17 > mean_d6
  )),
  "Monotonic Increase" = names(which(
    mean_d17 > mean_d10 & mean_d10 > mean_d6
  ))
)

dyn_range <- apply(
  cbind(mean_d6, mean_d10, mean_d17), 1,
  function(x) max(x) - min(x)
)

# Pick top gene per pattern
example_genes <- vapply(
  names(patterns), function(nm) {
    genes <- patterns[[nm]]
    if (length(genes) == 0) return(NA_character_)
    genes[which.max(dyn_range[genes])]
  }, character(1)
)

# Count genes per pattern
pattern_counts <- vapply(
  patterns, length, integer(1)
)

# Keep only patterns with genes
keep <- !is.na(example_genes)
example_genes <- example_genes[keep]
pattern_names <- names(example_genes)

# Build plot data
plot_df <- do.call(rbind, lapply(
  seq_along(example_genes), function(i) {
    g <- example_genes[i]
    nm <- pattern_names[i]
    data.frame(
      gene      = g,
      pattern   = nm,
      n_genes   = pattern_counts[nm],
      timepoint = tp_num,
      expr      = as.numeric(ctrl_vst[g, ]),
      sample    = ctrl_meta$sample_id
    )
  }
))

plot_df$label <- paste0(
  plot_df$pattern,
  " (n=", plot_df$n_genes, ")",
  "\n", plot_df$gene
)
plot_df$label <- factor(
  plot_df$label, levels = unique(plot_df$label)
)

plot_means <- plot_df %>%
  group_by(label, timepoint) %>%
  summarize(
    m    = mean(expr),
    ymin = min(expr),
    ymax = max(expr),
    .groups = "drop"
  )

ggplot() +
  geom_line(
    data = plot_means,
    aes(x = timepoint, y = m),
    color = col_d21, linewidth = 0.55
  ) +
  geom_linerange(
    data = plot_means,
    aes(x = timepoint, ymin = ymin,
        ymax = ymax),
    linewidth = 0.35, color = col_d21
  ) +
  geom_point(
    data = plot_df,
    aes(x = timepoint, y = expr),
    shape = 21, size = 1.2,
    stroke = 0.25, color = "black",
    fill = col_d21, alpha = 1
  ) +
  facet_wrap(~ label, nrow = 2,
             scales = "free_y") +
  scale_x_continuous(
    breaks = c(6, 10, 17),
    labels = c("6", "10", "17")
  ) +
  labs(x = "Day", y = "VST") +
  theme_pub(base_size = 10)

Figure 2: Six temporal expression patterns identified by the LRT. Each panel shows the gene with the largest dynamic range for that pattern class. Points = replicates, line = mean, bars = range.

D21 Control PC Space

Code

pca_input <- t(as.matrix(
  vst_counts[top_genes, ctrl_meta$sample_id]
))
pca_res <- prcomp(pca_input, scale. = TRUE,
                  center = TRUE)

var_pct <- round(
  summary(pca_res)$importance[2, 1:3] * 100, 1
)

all_input <- t(as.matrix(
  vst_counts[top_genes, meta$sample_id]
))
all_proj <- predict(pca_res, all_input)

pca_df <- data.frame(
  PC1       = all_proj[, 1],
  PC2       = all_proj[, 2],
  PC3       = all_proj[, 3],
  sample    = meta$sample_id,
  genotype  = meta$genotype,
  timepoint = factor(
    paste0("Day ", meta$timepoint),
    levels = c("Day 6", "Day 10", "Day 17")
  )
)

tp_num_all <- as.numeric(
  as.character(meta$timepoint)
)
ctrl_d6  <- meta$genotype == "D21" & tp_num_all == 6
ctrl_d10 <- meta$genotype == "D21" & tp_num_all == 10
ctrl_d17 <- meta$genotype == "D21" & tp_num_all == 17

centroid_d6  <- colMeans(all_proj[ctrl_d6,  1:3])
centroid_d10 <- colMeans(all_proj[ctrl_d10, 1:3])
centroid_d17 <- colMeans(all_proj[ctrl_d17, 1:3])

tp_colors <- c(
  "Day 6"  = "#2563EB",
  "Day 10" = "#059669",
  "Day 17" = "#DC2626"
)
geno_symbols <- c(
  "D21" = "circle", "T21" = "diamond"
)
scene_axes <- list(
  xaxis = list(title = paste0(
    "PC1 (", var_pct[1], "%)"
  )),
  yaxis = list(title = paste0(
    "PC2 (", var_pct[2], "%)"
  )),
  zaxis = list(title = paste0(
    "PC3 (", var_pct[3], "%)"
  ))
)

# Curved arc through the 3 centroids
t_param <- c(0, 0.5, 1)
t_interp <- seq(0, 1, length.out = 40)
arc_x <- spline(
  t_param,
  c(centroid_d6[1], centroid_d10[1],
    centroid_d17[1]),
  xout = t_interp
)$y
arc_y <- spline(
  t_param,
  c(centroid_d6[2], centroid_d10[2],
    centroid_d17[2]),
  xout = t_interp
)$y
arc_z <- spline(
  t_param,
  c(centroid_d6[3], centroid_d10[3],
    centroid_d17[3]),
  xout = t_interp
)$y

arc_dir <- c(
  arc_x[40] - arc_x[39],
  arc_y[40] - arc_y[39],
  arc_z[40] - arc_z[39]
)

Code

ctrl_pca <- pca_df[pca_df$genotype == "D21", ]

plot_ly() %>%
  add_trace(
    data = ctrl_pca,
    x = ~PC1, y = ~PC2, z = ~PC3,
    color  = ~timepoint,
    colors = tp_colors,
    type   = "scatter3d", mode = "markers",
    marker = list(size = 4, opacity = 1),
    text   = ~paste(sample, timepoint),
    hoverinfo = "text"
  ) %>%
  add_trace(
    x = arc_x, y = arc_y, z = arc_z,
    type = "scatter3d", mode = "lines",
    line = list(
      width = 5, color = "#000", dash = "solid"
    ),
    name = "Differentiation Arc",
    showlegend = TRUE
  ) %>%
  add_trace(
    type = "cone",
    x = arc_x[40], y = arc_y[40], z = arc_z[40],
    u = arc_dir[1], v = arc_dir[2],
    w = arc_dir[3],
    sizemode = "absolute", sizeref = 2,
    anchor = "tail", showscale = FALSE,
    colorscale = list(c(0, "#000"), c(1, "#000")),
    name = "", showlegend = FALSE
  ) %>%
  # Labels at centroids
  add_trace(
    x = centroid_d6[1], y = centroid_d6[2],
    z = centroid_d6[3],
    type = "scatter3d", mode = "text",
    text = "Start",
    textfont = list(
      size = 12, color = "#000", family = "Arial"
    ),
    textposition = "top center",
    showlegend = FALSE, name = ""
  ) %>%
  add_trace(
    x = centroid_d10[1], y = centroid_d10[2],
    z = centroid_d10[3],
    type = "scatter3d", mode = "text",
    text = "Middle",
    textfont = list(
      size = 12, color = "#000", family = "Arial"
    ),
    textposition = "top center",
    showlegend = FALSE, name = ""
  ) %>%
  add_trace(
    x = centroid_d17[1], y = centroid_d17[2],
    z = centroid_d17[3],
    type = "scatter3d", mode = "text",
    text = "End",
    textfont = list(
      size = 12, color = "#000", family = "Arial"
    ),
    textposition = "top center",
    showlegend = FALSE, name = ""
  ) %>%
  layout(scene = scene_axes)

Figure 3: D21 control samples in PC space (top 1000 LRT genes). Curved line traces the differentiation arc through timepoint centroids.

Choosing the Reference Vector

Three possible centroid-to-centroid vectors, each capturing a different segment of the trajectory.

Code

vec_early <- centroid_d10 - centroid_d6
vec_late  <- centroid_d17 - centroid_d10
vec_full  <- centroid_d17 - centroid_d6

mid_early <- (centroid_d6 + centroid_d10) / 2
mid_late  <- (centroid_d10 + centroid_d17) / 2
mid_full  <- (centroid_d6 + centroid_d17) / 2

plot_ly() %>%
  add_trace(
    data = ctrl_pca,
    x = ~PC1, y = ~PC2, z = ~PC3,
    color  = ~timepoint,
    colors = tp_colors,
    type   = "scatter3d", mode = "markers",
    marker = list(size = 4, opacity = 1),
    text   = ~paste(sample, timepoint),
    hoverinfo = "text"
  ) %>%
  # D6 -> D10
  add_trace(
    x = c(centroid_d6[1], centroid_d10[1]),
    y = c(centroid_d6[2], centroid_d10[2]),
    z = c(centroid_d6[3], centroid_d10[3]),
    type = "scatter3d", mode = "lines",
    line = list(width = 5, color = "#6366F1"),
    name = "D6 -> D10"
  ) %>%
  add_trace(
    type = "cone",
    x = centroid_d10[1], y = centroid_d10[2],
    z = centroid_d10[3],
    u = vec_early[1], v = vec_early[2],
    w = vec_early[3],
    sizemode = "absolute", sizeref = 2,
    anchor = "tail", showscale = FALSE,
    colorscale = list(
      c(0, "#6366F1"), c(1, "#6366F1")
    ),
    showlegend = FALSE, name = ""
  ) %>%
  add_trace(
    x = mid_early[1], y = mid_early[2],
    z = mid_early[3],
    type = "scatter3d", mode = "text",
    text = "D6->D10",
    textfont = list(size = 10, color = "#6366F1"),
    showlegend = FALSE, name = ""
  ) %>%
  # D10 -> D17
  add_trace(
    x = c(centroid_d10[1], centroid_d17[1]),
    y = c(centroid_d10[2], centroid_d17[2]),
    z = c(centroid_d10[3], centroid_d17[3]),
    type = "scatter3d", mode = "lines",
    line = list(width = 5, color = "#F59E0B"),
    name = "D10 -> D17"
  ) %>%
  add_trace(
    type = "cone",
    x = centroid_d17[1], y = centroid_d17[2],
    z = centroid_d17[3],
    u = vec_late[1], v = vec_late[2],
    w = vec_late[3],
    sizemode = "absolute", sizeref = 2,
    anchor = "tail", showscale = FALSE,
    colorscale = list(
      c(0, "#F59E0B"), c(1, "#F59E0B")
    ),
    showlegend = FALSE, name = ""
  ) %>%
  add_trace(
    x = mid_late[1], y = mid_late[2],
    z = mid_late[3],
    type = "scatter3d", mode = "text",
    text = "D10->D17",
    textfont = list(size = 10, color = "#F59E0B"),
    showlegend = FALSE, name = ""
  ) %>%
  # D6 -> D17
  add_trace(
    x = c(centroid_d6[1], centroid_d17[1]),
    y = c(centroid_d6[2], centroid_d17[2]),
    z = c(centroid_d6[3], centroid_d17[3]),
    type = "scatter3d", mode = "lines",
    line = list(width = 7, color = "#059669"),
    name = "D6 -> D17"
  ) %>%
  add_trace(
    type = "cone",
    x = centroid_d17[1] + vec_late[1] * 0.01,
    y = centroid_d17[2] + vec_late[2] * 0.01,
    z = centroid_d17[3] + vec_late[3] * 0.01,
    u = vec_full[1], v = vec_full[2],
    w = vec_full[3],
    sizemode = "absolute", sizeref = 2.5,
    anchor = "tail", showscale = FALSE,
    colorscale = list(
      c(0, "#059669"), c(1, "#059669")
    ),
    showlegend = FALSE, name = ""
  ) %>%
  add_trace(
    x = mid_full[1], y = mid_full[2],
    z = mid_full[3],
    type = "scatter3d", mode = "text",
    text = "D6->D17",
    textfont = list(size = 11, color = "#059669"),
    showlegend = FALSE, name = ""
  ) %>%
  layout(scene = scene_axes)

Figure 4: Three candidate reference vectors. The full trajectory (Day 6 to Day 17, green solid) is selected because it spans the complete differentiation arc.

All Samples in Reference PC Space

Code

plot_ly(
  pca_df,
  x = ~PC1, y = ~PC2, z = ~PC3,
  color  = ~timepoint,
  symbol = ~genotype,
  colors = tp_colors,
  symbols = geno_symbols,
  type   = "scatter3d",
  mode   = "markers",
  marker = list(size = 4, opacity = 1),
  text   = ~paste(sample, genotype, timepoint),
  hoverinfo = "text"
) %>%
  layout(scene = scene_axes)

Figure 5: All D21 and T21 samples projected into the D21-defined PC space (color = timepoint, shape = genotype)

Reference Trajectory

Code

ref_vec <- centroid_d17 - centroid_d6

plot_ly() %>%
  add_trace(
    data = pca_df,
    x = ~PC1, y = ~PC2, z = ~PC3,
    color  = ~timepoint,
    symbol = ~genotype,
    colors = tp_colors,
    symbols = geno_symbols,
    type   = "scatter3d", mode = "markers",
    marker = list(size = 4, opacity = 1),
    text   = ~paste(sample, genotype, timepoint),
    hoverinfo = "text"
  ) %>%
  add_trace(
    x = c(centroid_d6[1], centroid_d17[1]),
    y = c(centroid_d6[2], centroid_d17[2]),
    z = c(centroid_d6[3], centroid_d17[3]),
    type = "scatter3d", mode = "lines",
    line = list(width = 7, color = "#059669"),
    name = "Reference Vector (D6->D17)",
    showlegend = TRUE
  ) %>%
  add_trace(
    type = "cone",
    x = centroid_d17[1], y = centroid_d17[2],
    z = centroid_d17[3],
    u = ref_vec[1] * 0.15,
    v = ref_vec[2] * 0.15,
    w = ref_vec[3] * 0.15,
    sizemode = "absolute", sizeref = 3,
    anchor = "tail", showscale = FALSE,
    colorscale = list(
      c(0, "#059669"), c(1, "#059669")
    ),
    name = "", showlegend = FALSE
  ) %>%
  # Cluster labels at centroids
  add_trace(
    x = centroid_d6[1], y = centroid_d6[2],
    z = centroid_d6[3],
    type = "scatter3d", mode = "text",
    text = "Day 6",
    textfont = list(
      size = 11, color = tp_colors["Day 6"],
      family = "Arial"
    ),
    textposition = "top center",
    showlegend = FALSE, name = ""
  ) %>%
  add_trace(
    x = centroid_d10[1], y = centroid_d10[2],
    z = centroid_d10[3],
    type = "scatter3d", mode = "text",
    text = "Day 10",
    textfont = list(
      size = 11, color = tp_colors["Day 10"],
      family = "Arial"
    ),
    textposition = "top center",
    showlegend = FALSE, name = ""
  ) %>%
  add_trace(
    x = centroid_d17[1], y = centroid_d17[2],
    z = centroid_d17[3],
    type = "scatter3d", mode = "text",
    text = "Day 17",
    textfont = list(
      size = 11, color = tp_colors["Day 17"],
      family = "Arial"
    ),
    textposition = "top center",
    showlegend = FALSE, name = ""
  ) %>%
  layout(scene = scene_axes)

Figure 6: Reference trajectory (green) from D21 Day 6 to Day 17 centroid with arrowhead. Black diamonds = centroids.

2D PC Projections with Reference Vector

Code

# Build segment data for the reference vector
vec_seg <- data.frame(
  x = centroid_d6[1:3],
  xend = centroid_d17[1:3],
  panel = c("PC1 vs PC2", "PC1 vs PC3",
            "PC2 vs PC3")
)

# Build panel data
panel_df <- rbind(
  data.frame(
    x = pca_df$PC1, y = pca_df$PC2,
    panel = "PC1 vs PC2",
    genotype = pca_df$genotype,
    timepoint = pca_df$timepoint
  ),
  data.frame(
    x = pca_df$PC1, y = pca_df$PC3,
    panel = "PC1 vs PC3",
    genotype = pca_df$genotype,
    timepoint = pca_df$timepoint
  ),
  data.frame(
    x = pca_df$PC2, y = pca_df$PC3,
    panel = "PC2 vs PC3",
    genotype = pca_df$genotype,
    timepoint = pca_df$timepoint
  )
)
panel_df$panel <- factor(
  panel_df$panel,
  levels = c("PC1 vs PC2", "PC1 vs PC3",
             "PC2 vs PC3")
)

# Vector endpoints per panel
seg_df <- data.frame(
  x    = c(centroid_d6[1], centroid_d6[1],
           centroid_d6[2]),
  y    = c(centroid_d6[2], centroid_d6[3],
           centroid_d6[3]),
  xend = c(centroid_d17[1], centroid_d17[1],
           centroid_d17[2]),
  yend = c(centroid_d17[2], centroid_d17[3],
           centroid_d17[3]),
  panel = factor(
    c("PC1 vs PC2", "PC1 vs PC3",
      "PC2 vs PC3"),
    levels = c("PC1 vs PC2", "PC1 vs PC3",
               "PC2 vs PC3")
  )
)

# Axis labels per panel
ax_labels <- data.frame(
  panel = factor(
    c("PC1 vs PC2", "PC1 vs PC3",
      "PC2 vs PC3"),
    levels = c("PC1 vs PC2", "PC1 vs PC3",
               "PC2 vs PC3")
  ),
  xlab = c(
    paste0("PC1 (", var_pct[1], "%)"),
    paste0("PC1 (", var_pct[1], "%)"),
    paste0("PC2 (", var_pct[2], "%)")
  ),
  ylab = c(
    paste0("PC2 (", var_pct[2], "%)"),
    paste0("PC3 (", var_pct[3], "%)"),
    paste0("PC3 (", var_pct[3], "%)")
  )
)

ggplot(panel_df, aes(x = x, y = y)) +
  geom_segment(
    data = seg_df,
    aes(x = x, y = y, xend = xend, yend = yend),
    color = "#059669", linewidth = 0.8,
    arrow = arrow(
      length = unit(0.12, "cm"), type = "closed"
    ),
    inherit.aes = FALSE
  ) +
  geom_point(
    aes(fill = timepoint, shape = genotype),
    size = 1.8, stroke = 0.25, color = "black",
    alpha = 1
  ) +
  scale_fill_manual(values = tp_colors) +
  scale_shape_manual(values = c(
    "D21" = 21, "T21" = 23
  )) +
  facet_wrap(~ panel, nrow = 1,
             scales = "free") +
  labs(x = NULL, y = NULL,
       fill = "Timepoint", shape = "Genotype") +
  guides(
    fill = guide_legend(
      override.aes = list(shape = 21)
    )
  ) +
  theme_pub(base_size = 10) +
  theme(legend.position = "top")

Figure 7: Pairwise PC projections of all samples with the D6-to-D17 reference vector (green arrow). Shape = genotype, color = timepoint.

Sample Projections onto Trajectory

Code

scores <- apply(all_proj[, 1:3], 1, function(z) {
  sum((z - centroid_d6) * ref_vec) / sum(ref_vec^2)
})

score_df <- data.frame(
  sample    = meta$sample_id,
  genotype  = meta$genotype,
  timepoint = factor(
    paste0("Day ", meta$timepoint),
    levels = c("Day 6", "Day 10", "Day 17")
  ),
  score     = scores
)

Code

ggplot(
  score_df,
  aes(x = score, y = genotype, fill = genotype)
) +
  geom_vline(
    xintercept = c(0, 1),
    linetype = "dashed", color = "#000",
    linewidth = 0.3, alpha = 0.4
  ) +
  geom_point(
    shape = 21, size = 1.8,
    stroke = 0.25, color = "black", alpha = 1
  ) +
  facet_wrap(~ timepoint, ncol = 1) +
  scale_fill_manual(values = c(
    "D21" = col_d21, "T21" = col_t21
  )) +
  annotate(
    "text", x = 0, y = 0.4,
    label = "Day 6\ncentroid", size = 2.2,
    hjust = 0.5, color = "#000"
  ) +
  annotate(
    "text", x = 1, y = 0.4,
    label = "Day 17\ncentroid", size = 2.2,
    hjust = 0.5, color = "#000"
  ) +
  labs(x = "Maturation Score", y = NULL) +
  theme_pub(base_size = 10) +
  theme(
    panel.grid.major.y = element_blank(),
    legend.position = "none"
  )

Figure 8: Each sample projected onto the reference vector. Score 0 = Day 6 centroid, 1 = Day 17 centroid.

Maturation Score Comparison

Code

score_summary <- score_df %>%
  group_by(genotype, timepoint) %>%
  summarize(
    mean_score = round(mean(score), 3),
    sd         = round(sd(score), 3),
    .groups    = "drop"
  )

knitr::kable(
  score_summary,
  col.names = c(
    "Genotype", "Timepoint", "Mean Score", "SD"
  ),
  caption = "Maturation scores by condition"
)

Maturation scores by condition
Genotype	Timepoint	Mean Score	SD
D21	Day 6	0.000	0.015
D21	Day 10	0.359	0.039
D21	Day 17	1.000	0.013
T21	Day 6	0.128	0.015
T21	Day 10	0.489	0.031
T21	Day 17	0.950	0.031

Code

ggplot(
  score_df,
  aes(x = timepoint, y = score, fill = genotype)
) +
  geom_hline(
    yintercept = c(0, 1),
    linetype = "dashed", color = "#000",
    linewidth = 0.3, alpha = 0.4
  ) +
  geom_boxplot(
    width = 0.5, outlier.shape = NA,
    alpha = 1, linewidth = 0.3,
    color = "black"
  ) +
  geom_point(
    position = position_jitterdodge(
      jitter.width = 0.05, dodge.width = 0.5
    ),
    shape = 21, size = 1.2,
    stroke = 0.25, color = "black",
    alpha = 1,
    aes(fill = genotype)
  ) +
  scale_fill_manual(values = c(
    "D21" = col_d21, "T21" = col_t21
  )) +
  scale_y_continuous(
    breaks = seq(0, 1, 0.25),
    expand = expansion(mult = c(0.05, 0.05))
  ) +
  annotate(
    "text", x = 0.55, y = 0.03,
    label = "Day 6 ref", color = "#000",
    size = 2.5, hjust = 0
  ) +
  annotate(
    "text", x = 0.55, y = 1.03,
    label = "Day 17 ref", color = "#000",
    size = 2.5, hjust = 0
  ) +
  labs(x = NULL, y = "Maturation Score") +
  theme_pub(base_size = 10) +
  theme(legend.position = "top")

Figure 9: Maturation score by genotype and timepoint. Dashed lines mark Day 6 (s=0) and Day 17 (s=1) control centroids.

Genotype Comparison

Code

d21_means <- score_summary %>%
  filter(genotype == "D21") %>%
  select(timepoint, d21 = mean_score)
t21_means <- score_summary %>%
  filter(genotype == "T21") %>%
  select(timepoint, t21 = mean_score)

diff_df <- inner_join(
  d21_means, t21_means, by = "timepoint"
) %>%
  mutate(
    difference = round(t21 - d21, 3),
    pct_diff   = paste0(
      round(difference * 100, 1), "%"
    )
  )

knitr::kable(
  diff_df,
  col.names = c(
    "Timepoint", "D21 Score", "T21 Score",
    "Difference", "% Difference"
  ),
  caption = "Maturation score differences (T21 - D21)"
)

Maturation score differences (T21 - D21)
Timepoint	D21 Score	T21 Score	Difference	% Difference
Day 6	0.000	0.128	0.128	12.8%
Day 10	0.359	0.489	0.130	13%
Day 17	1.000	0.950	-0.050	-5%

Leave-One-Out Validation

To validate that the reference trajectory accurately classifies samples by differentiation stage, we hold out one replicate from each D21 timepoint, build the trajectory from the remaining $n = 2$ replicates, and score the held-out samples. No T21 samples are used.

Code

# Hold out replicate 1 from each timepoint
holdout_ids <- c(
  ctrl_meta$sample_id[
    ctrl_meta$timepoint == 6][1],
  ctrl_meta$sample_id[
    ctrl_meta$timepoint == 10][1],
  ctrl_meta$sample_id[
    ctrl_meta$timepoint == 17][1]
)
train_ids <- setdiff(
  ctrl_meta$sample_id, holdout_ids
)

cat(
  "Training samples (n=6):",
  paste(train_ids, collapse = ", "), "\n",
  "Held-out samples (n=3):",
  paste(holdout_ids, collapse = ", ")
)

Training samples (n=6): BA2D6_S2, BA3D6_S3, BA2D10_S5, BA3D10_S6, BA2D17_S8, BA3D17_S9 
 Held-out samples (n=3): BA1D6_S1, BA1D10_S4, BA1D17_S7

Code

# LRT on training set
train_meta <- ctrl_meta[
  ctrl_meta$sample_id %in% train_ids, ]
train_counts <- raw_counts[, train_ids]

dds_loo <- DESeqDataSetFromMatrix(
  countData = train_counts,
  colData   = train_meta,
  design    = ~ timepoint
)
dds_loo <- DESeq(
  dds_loo, test = "LRT", reduced = ~ 1
)
res_loo <- results(dds_loo)
res_loo <- res_loo[order(res_loo$padj), ]

sig_loo <- rownames(res_loo)[
  !is.na(res_loo$padj) & res_loo$padj < 0.05
]
n_top_loo <- min(1000, length(sig_loo))
top_loo <- head(rownames(res_loo), n_top_loo)

cat(
  "\nLRT significant genes:", length(sig_loo),
  "\nGenes used for PCA:", n_top_loo
)


LRT significant genes: 5584 
Genes used for PCA: 1000

Code

# PCA on training samples
pca_loo <- prcomp(
  t(as.matrix(vst_counts[top_loo, train_ids])),
  scale. = TRUE, center = TRUE
)

# Project all D21 samples (train + holdout)
all_d21_proj <- predict(
  pca_loo,
  t(as.matrix(
    vst_counts[top_loo, ctrl_meta$sample_id]
  ))
)

# Centroids from training samples only
tp_train <- as.numeric(
  as.character(train_meta$timepoint)
)
c_d6  <- colMeans(
  all_d21_proj[train_ids[tp_train == 6],  1:3]
)
c_d17 <- colMeans(
  all_d21_proj[train_ids[tp_train == 17], 1:3]
)
rv <- c_d17 - c_d6

# Score all D21 samples
loo_scores <- apply(
  all_d21_proj[, 1:3], 1, function(z) {
    sum((z - c_d6) * rv) / sum(rv^2)
  }
)

loo_df <- data.frame(
  sample    = ctrl_meta$sample_id,
  timepoint = factor(
    paste0("Day ", ctrl_meta$timepoint),
    levels = c("Day 6", "Day 10", "Day 17")
  ),
  score     = loo_scores[ctrl_meta$sample_id],
  set       = ifelse(
    ctrl_meta$sample_id %in% holdout_ids,
    "Held-out", "Training"
  )
)

Code

ggplot(
  loo_df,
  aes(x = score, y = timepoint)
) +
  geom_vline(
    xintercept = c(0, 1),
    linetype = "dashed", color = "#000",
    linewidth = 0.3, alpha = 0.4
  ) +
  geom_point(
    aes(fill = timepoint, shape = set),
    size = 2.5, stroke = 0.4, color = "black",
    alpha = 1
  ) +
  scale_fill_manual(values = tp_colors) +
  scale_shape_manual(
    values = c("Training" = 1, "Held-out" = 21)
  ) +
  annotate(
    "text", x = 0, y = 0.5,
    label = "Day 6\ncentroid", size = 2.2,
    hjust = 0.5, color = "#000"
  ) +
  annotate(
    "text", x = 1, y = 0.5,
    label = "Day 17\ncentroid", size = 2.2,
    hjust = 0.5, color = "#000"
  ) +
  labs(
    x = "Maturation Score", y = NULL,
    shape = NULL, fill = NULL
  ) +
  guides(
    fill = "none",
    shape = guide_legend(
      override.aes = list(size = 2)
    )
  ) +
  theme_pub(base_size = 10) +
  theme(
    panel.grid.major.y = element_blank(),
    legend.position = "top"
  )

Figure 10: Leave-one-out validation. Reference trajectory built from n=2 D21 replicates per timepoint (open circles). Held-out replicates (filled) are scored along the trajectory. Accurate classification = held-out samples land near their expected position.

Code

knitr::kable(
  loo_df %>%
    select(sample, timepoint, set, score) %>%
    mutate(score = round(score, 3)) %>%
    arrange(timepoint, set),
  col.names = c(
    "Sample", "Timepoint", "Set", "Score"
  ),
  caption = paste(
    "Leave-one-out validation scores.",
    "Held-out samples should score near 0",
    "(Day 6), ~0.5 (Day 10), and ~1 (Day 17)."
  )
)

Leave-one-out validation scores. Held-out samples should score near 0 (Day 6), ~0.5 (Day 10), and ~1 (Day 17).
	Sample	Timepoint	Set	Score
BA1D6_S1	BA1D6_S1	Day 6	Held-out	-0.004
BA2D6_S2	BA2D6_S2	Day 6	Training	-0.015
BA3D6_S3	BA3D6_S3	Day 6	Training	0.015
BA1D10_S4	BA1D10_S4	Day 10	Held-out	0.346
BA2D10_S5	BA2D10_S5	Day 10	Training	0.317
BA3D10_S6	BA3D10_S6	Day 10	Training	0.390
BA1D17_S7	BA1D17_S7	Day 17	Held-out	0.994
BA2D17_S8	BA2D17_S8	Day 17	Training	1.007
BA3D17_S9	BA3D17_S9	Day 17	Training	0.993

Data: Martinez et al. (2024). Front Cell Neurosci 18: 1341141.

--- title: "Differentiation Progression Analysis" subtitle: > Quantifying differentiation timing differences from bulk RNA-seq time course data format: html: code-fold: true code-tools: true toc: true toc-depth: 3 fig-width: 8 fig-height: 6 fig-dpi: 150 execute: warning: false message: false --- ## Overview PCA-based method for comparing differentiation progression between conditions in bulk RNA-seq time course data. **Dataset.** Data from [Martinez et al. (2024)](https://doi.org/10.3389/fncel.2024.1341141): D21 (euploid control) and T21 (trisomy 21) iPSC-derived neural progenitors at Days 6, 10, and 17 (18 samples, 3 replicates per condition). ## Workflow ![](assets/workflow-flowchart.svg){width="55%" fig-align="center"} ## Mathematical Framework ### Maturation Score Each sample is projected onto $\vec{v}$ and normalized so the early centroid maps to 0 and the late centroid maps to 1: $$ s_i = \frac{ \left(\mathbf{z}_i - \bar{\mathbf{z}}_{t_{\textrm{early}}}\right) \cdot \vec{v} }{ \lVert\vec{v}\rVert^2 } $$ --- ## Analysis ### Setup ```{r} #| label: setup library(DESeq2) library(ggplot2) library(dplyr) library(tidyr) library(tibble) library(plotly) library(ComplexHeatmap) library(circlize) library(viridis) meta <- readRDS("dat/metadata/WC24_metadata_clean.rds") raw_counts <- readRDS( "dat/counts/raw/WC24_filt_raw_counts.rds" ) vst_counts <- readRDS( "dat/counts/vst/WC24_vst_counts.rds" ) meta$timepoint <- factor( meta$timepoint, levels = c(6, 10, 17) ) theme_pub <- function(base_size = 10) { theme_classic(base_size = base_size) + theme( text = element_text(color = "black"), axis.text = element_text( size = base_size - 2, color = "black" ), axis.title = element_text( size = base_size - 1, color = "black" ), axis.line = element_line( color = "black", linewidth = 0.3 ), axis.ticks = element_line( linewidth = 0.3, color = "black" ), panel.grid.major = element_line( color = "#D9D9D9", linewidth = 0.22 ), panel.grid.minor = element_blank(), legend.title = element_blank(), legend.text = element_text( size = base_size - 2, color = "black" ), legend.key.size = unit(0.35, "cm"), strip.text = element_text( size = base_size - 1, face = "bold", color = "black" ), strip.background = element_blank() ) } col_d21 <- "#1F78B4" col_t21 <- "#E31A1C" ``` ### Feature Selection: Likelihood Ratio Test ```{r} #| label: lrt #| cache: true ctrl_meta <- meta[meta$genotype == "D21", ] ctrl_counts <- raw_counts[, ctrl_meta$sample_id] dds <- DESeqDataSetFromMatrix( countData = ctrl_counts, colData = ctrl_meta, design = ~ timepoint ) dds <- DESeq(dds, test = "LRT", reduced = ~ 1) res <- results(dds) res <- res[order(res$padj), ] sig_genes <- rownames(res)[ !is.na(res$padj) & res$padj < 0.05 ] n_top <- min(1000, length(sig_genes)) top_genes <- head(rownames(res), n_top) cat( "Significant genes (padj < 0.05):", length(sig_genes), "\n", "Genes used for PCA:", n_top ) ``` ### Temporally Variable Gene Heatmap ```{r} #| label: fig-heatmap #| fig-cap: > #| Z-scored expression of temporally variable genes #| across D21 control timepoints (k=4 row clusters, #| viridis palette, winsorized at +/- 2) #| fig-height: 4.5 #| fig-width: 4 hm_mat <- as.matrix( vst_counts[sig_genes, ctrl_meta$sample_id] ) hm_z <- t(scale(t(hm_mat))) hm_z[hm_z > 2] <- 2 hm_z[hm_z < -2] <- -2 col_fun <- colorRamp2( seq(-2, 2, length.out = 100), viridis(100) ) ht <- Heatmap( hm_z, name = "Z-score", col = col_fun, cluster_columns = FALSE, cluster_rows = TRUE, clustering_method_rows = "ward.D2", row_km = 4, row_gap = unit(2, "mm"), show_row_names = FALSE, show_column_names = FALSE, column_split = factor( ctrl_meta[colnames(hm_z), "timepoint"], levels = c(6, 10, 17) ), column_title = c("Day 6", "Day 10", "Day 17"), column_gap = unit(3, "mm"), row_title = paste0( length(sig_genes), " genes" ), border = FALSE, use_raster = TRUE ) draw(ht) ``` ### Temporal Expression Patterns Six temporal profiles identified among LRT-significant genes. ```{r} #| label: fig-patterns #| fig-cap: > #| Six temporal expression patterns identified #| by the LRT. Each panel shows the gene with the #| largest dynamic range for that pattern class. #| Points = replicates, line = mean, bars = range. #| fig-height: 4.5 #| fig-width: 8 ctrl_vst <- vst_counts[sig_genes, ctrl_meta$sample_id] tp_num <- as.numeric( as.character(ctrl_meta$timepoint) ) mean_d6 <- rowMeans(ctrl_vst[, tp_num == 6]) mean_d10 <- rowMeans(ctrl_vst[, tp_num == 10]) mean_d17 <- rowMeans(ctrl_vst[, tp_num == 17]) # All 6 possible 3-timepoint orderings patterns <- list( "Monotonic Decrease" = names(which( mean_d6 > mean_d10 & mean_d10 > mean_d17 )), "Early Drop" = names(which( mean_d6 > mean_d17 & mean_d17 > mean_d10 )), "U-shape" = names(which( mean_d17 > mean_d6 & mean_d6 > mean_d10 )), "Transient Peak" = names(which( mean_d10 > mean_d6 & mean_d6 > mean_d17 )), "Peak at Day 10" = names(which( mean_d10 > mean_d17 & mean_d17 > mean_d6 )), "Monotonic Increase" = names(which( mean_d17 > mean_d10 & mean_d10 > mean_d6 )) ) dyn_range <- apply( cbind(mean_d6, mean_d10, mean_d17), 1, function(x) max(x) - min(x) ) # Pick top gene per pattern example_genes <- vapply( names(patterns), function(nm) { genes <- patterns[[nm]] if (length(genes) == 0) return(NA_character_) genes[which.max(dyn_range[genes])] }, character(1) ) # Count genes per pattern pattern_counts <- vapply( patterns, length, integer(1) ) # Keep only patterns with genes keep <- !is.na(example_genes) example_genes <- example_genes[keep] pattern_names <- names(example_genes) # Build plot data plot_df <- do.call(rbind, lapply( seq_along(example_genes), function(i) { g <- example_genes[i] nm <- pattern_names[i] data.frame( gene = g, pattern = nm, n_genes = pattern_counts[nm], timepoint = tp_num, expr = as.numeric(ctrl_vst[g, ]), sample = ctrl_meta$sample_id ) } )) plot_df$label <- paste0( plot_df$pattern, " (n=", plot_df$n_genes, ")", "\n", plot_df$gene ) plot_df$label <- factor( plot_df$label, levels = unique(plot_df$label) ) plot_means <- plot_df %>% group_by(label, timepoint) %>% summarize( m = mean(expr), ymin = min(expr), ymax = max(expr), .groups = "drop" ) ggplot() + geom_line( data = plot_means, aes(x = timepoint, y = m), color = col_d21, linewidth = 0.55 ) + geom_linerange( data = plot_means, aes(x = timepoint, ymin = ymin, ymax = ymax), linewidth = 0.35, color = col_d21 ) + geom_point( data = plot_df, aes(x = timepoint, y = expr), shape = 21, size = 1.2, stroke = 0.25, color = "black", fill = col_d21, alpha = 1 ) + facet_wrap(~ label, nrow = 2, scales = "free_y") + scale_x_continuous( breaks = c(6, 10, 17), labels = c("6", "10", "17") ) + labs(x = "Day", y = "VST") + theme_pub(base_size = 10) ``` ### D21 Control PC Space ```{r} #| label: pca-compute pca_input <- t(as.matrix( vst_counts[top_genes, ctrl_meta$sample_id] )) pca_res <- prcomp(pca_input, scale. = TRUE, center = TRUE) var_pct <- round( summary(pca_res)$importance[2, 1:3] * 100, 1 ) all_input <- t(as.matrix( vst_counts[top_genes, meta$sample_id] )) all_proj <- predict(pca_res, all_input) pca_df <- data.frame( PC1 = all_proj[, 1], PC2 = all_proj[, 2], PC3 = all_proj[, 3], sample = meta$sample_id, genotype = meta$genotype, timepoint = factor( paste0("Day ", meta$timepoint), levels = c("Day 6", "Day 10", "Day 17") ) ) tp_num_all <- as.numeric( as.character(meta$timepoint) ) ctrl_d6 <- meta$genotype == "D21" & tp_num_all == 6 ctrl_d10 <- meta$genotype == "D21" & tp_num_all == 10 ctrl_d17 <- meta$genotype == "D21" & tp_num_all == 17 centroid_d6 <- colMeans(all_proj[ctrl_d6, 1:3]) centroid_d10 <- colMeans(all_proj[ctrl_d10, 1:3]) centroid_d17 <- colMeans(all_proj[ctrl_d17, 1:3]) tp_colors <- c( "Day 6" = "#2563EB", "Day 10" = "#059669", "Day 17" = "#DC2626" ) geno_symbols <- c( "D21" = "circle", "T21" = "diamond" ) scene_axes <- list( xaxis = list(title = paste0( "PC1 (", var_pct[1], "%)" )), yaxis = list(title = paste0( "PC2 (", var_pct[2], "%)" )), zaxis = list(title = paste0( "PC3 (", var_pct[3], "%)" )) ) # Curved arc through the 3 centroids t_param <- c(0, 0.5, 1) t_interp <- seq(0, 1, length.out = 40) arc_x <- spline( t_param, c(centroid_d6[1], centroid_d10[1], centroid_d17[1]), xout = t_interp )$y arc_y <- spline( t_param, c(centroid_d6[2], centroid_d10[2], centroid_d17[2]), xout = t_interp )$y arc_z <- spline( t_param, c(centroid_d6[3], centroid_d10[3], centroid_d17[3]), xout = t_interp )$y arc_dir <- c( arc_x[40] - arc_x[39], arc_y[40] - arc_y[39], arc_z[40] - arc_z[39] ) ``` ```{r} #| label: fig-d21-pca #| fig-cap: > #| D21 control samples in PC space (top 1000 LRT #| genes). Curved line traces the differentiation #| arc through timepoint centroids. ctrl_pca <- pca_df[pca_df$genotype == "D21", ] plot_ly() %>% add_trace( data = ctrl_pca, x = ~PC1, y = ~PC2, z = ~PC3, color = ~timepoint, colors = tp_colors, type = "scatter3d", mode = "markers", marker = list(size = 4, opacity = 1), text = ~paste(sample, timepoint), hoverinfo = "text" ) %>% add_trace( x = arc_x, y = arc_y, z = arc_z, type = "scatter3d", mode = "lines", line = list( width = 5, color = "#000", dash = "solid" ), name = "Differentiation Arc", showlegend = TRUE ) %>% add_trace( type = "cone", x = arc_x[40], y = arc_y[40], z = arc_z[40], u = arc_dir[1], v = arc_dir[2], w = arc_dir[3], sizemode = "absolute", sizeref = 2, anchor = "tail", showscale = FALSE, colorscale = list(c(0, "#000"), c(1, "#000")), name = "", showlegend = FALSE ) %>% # Labels at centroids add_trace( x = centroid_d6[1], y = centroid_d6[2], z = centroid_d6[3], type = "scatter3d", mode = "text", text = "Start", textfont = list( size = 12, color = "#000", family = "Arial" ), textposition = "top center", showlegend = FALSE, name = "" ) %>% add_trace( x = centroid_d10[1], y = centroid_d10[2], z = centroid_d10[3], type = "scatter3d", mode = "text", text = "Middle", textfont = list( size = 12, color = "#000", family = "Arial" ), textposition = "top center", showlegend = FALSE, name = "" ) %>% add_trace( x = centroid_d17[1], y = centroid_d17[2], z = centroid_d17[3], type = "scatter3d", mode = "text", text = "End", textfont = list( size = 12, color = "#000", family = "Arial" ), textposition = "top center", showlegend = FALSE, name = "" ) %>% layout(scene = scene_axes) ``` ### Choosing the Reference Vector Three possible centroid-to-centroid vectors, each capturing a different segment of the trajectory. ```{r} #| label: fig-vector-options #| fig-cap: > #| Three candidate reference vectors. The full #| trajectory (Day 6 to Day 17, green solid) is #| selected because it spans the complete #| differentiation arc. vec_early <- centroid_d10 - centroid_d6 vec_late <- centroid_d17 - centroid_d10 vec_full <- centroid_d17 - centroid_d6 mid_early <- (centroid_d6 + centroid_d10) / 2 mid_late <- (centroid_d10 + centroid_d17) / 2 mid_full <- (centroid_d6 + centroid_d17) / 2 plot_ly() %>% add_trace( data = ctrl_pca, x = ~PC1, y = ~PC2, z = ~PC3, color = ~timepoint, colors = tp_colors, type = "scatter3d", mode = "markers", marker = list(size = 4, opacity = 1), text = ~paste(sample, timepoint), hoverinfo = "text" ) %>% # D6 -> D10 add_trace( x = c(centroid_d6[1], centroid_d10[1]), y = c(centroid_d6[2], centroid_d10[2]), z = c(centroid_d6[3], centroid_d10[3]), type = "scatter3d", mode = "lines", line = list(width = 5, color = "#6366F1"), name = "D6 -> D10" ) %>% add_trace( type = "cone", x = centroid_d10[1], y = centroid_d10[2], z = centroid_d10[3], u = vec_early[1], v = vec_early[2], w = vec_early[3], sizemode = "absolute", sizeref = 2, anchor = "tail", showscale = FALSE, colorscale = list( c(0, "#6366F1"), c(1, "#6366F1") ), showlegend = FALSE, name = "" ) %>% add_trace( x = mid_early[1], y = mid_early[2], z = mid_early[3], type = "scatter3d", mode = "text", text = "D6->D10", textfont = list(size = 10, color = "#6366F1"), showlegend = FALSE, name = "" ) %>% # D10 -> D17 add_trace( x = c(centroid_d10[1], centroid_d17[1]), y = c(centroid_d10[2], centroid_d17[2]), z = c(centroid_d10[3], centroid_d17[3]), type = "scatter3d", mode = "lines", line = list(width = 5, color = "#F59E0B"), name = "D10 -> D17" ) %>% add_trace( type = "cone", x = centroid_d17[1], y = centroid_d17[2], z = centroid_d17[3], u = vec_late[1], v = vec_late[2], w = vec_late[3], sizemode = "absolute", sizeref = 2, anchor = "tail", showscale = FALSE, colorscale = list( c(0, "#F59E0B"), c(1, "#F59E0B") ), showlegend = FALSE, name = "" ) %>% add_trace( x = mid_late[1], y = mid_late[2], z = mid_late[3], type = "scatter3d", mode = "text", text = "D10->D17", textfont = list(size = 10, color = "#F59E0B"), showlegend = FALSE, name = "" ) %>% # D6 -> D17 add_trace( x = c(centroid_d6[1], centroid_d17[1]), y = c(centroid_d6[2], centroid_d17[2]), z = c(centroid_d6[3], centroid_d17[3]), type = "scatter3d", mode = "lines", line = list(width = 7, color = "#059669"), name = "D6 -> D17" ) %>% add_trace( type = "cone", x = centroid_d17[1] + vec_late[1] * 0.01, y = centroid_d17[2] + vec_late[2] * 0.01, z = centroid_d17[3] + vec_late[3] * 0.01, u = vec_full[1], v = vec_full[2], w = vec_full[3], sizemode = "absolute", sizeref = 2.5, anchor = "tail", showscale = FALSE, colorscale = list( c(0, "#059669"), c(1, "#059669") ), showlegend = FALSE, name = "" ) %>% add_trace( x = mid_full[1], y = mid_full[2], z = mid_full[3], type = "scatter3d", mode = "text", text = "D6->D17", textfont = list(size = 11, color = "#059669"), showlegend = FALSE, name = "" ) %>% layout(scene = scene_axes) ``` ### All Samples in Reference PC Space ```{r} #| label: fig-all-pca #| fig-cap: > #| All D21 and T21 samples projected into the #| D21-defined PC space (color = timepoint, #| shape = genotype) plot_ly( pca_df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~timepoint, symbol = ~genotype, colors = tp_colors, symbols = geno_symbols, type = "scatter3d", mode = "markers", marker = list(size = 4, opacity = 1), text = ~paste(sample, genotype, timepoint), hoverinfo = "text" ) %>% layout(scene = scene_axes) ``` ### Reference Trajectory ```{r} #| label: fig-ref-trajectory #| fig-cap: > #| Reference trajectory (green) from D21 Day 6 #| to Day 17 centroid with arrowhead. #| Black diamonds = centroids. ref_vec <- centroid_d17 - centroid_d6 plot_ly() %>% add_trace( data = pca_df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~timepoint, symbol = ~genotype, colors = tp_colors, symbols = geno_symbols, type = "scatter3d", mode = "markers", marker = list(size = 4, opacity = 1), text = ~paste(sample, genotype, timepoint), hoverinfo = "text" ) %>% add_trace( x = c(centroid_d6[1], centroid_d17[1]), y = c(centroid_d6[2], centroid_d17[2]), z = c(centroid_d6[3], centroid_d17[3]), type = "scatter3d", mode = "lines", line = list(width = 7, color = "#059669"), name = "Reference Vector (D6->D17)", showlegend = TRUE ) %>% add_trace( type = "cone", x = centroid_d17[1], y = centroid_d17[2], z = centroid_d17[3], u = ref_vec[1] * 0.15, v = ref_vec[2] * 0.15, w = ref_vec[3] * 0.15, sizemode = "absolute", sizeref = 3, anchor = "tail", showscale = FALSE, colorscale = list( c(0, "#059669"), c(1, "#059669") ), name = "", showlegend = FALSE ) %>% # Cluster labels at centroids add_trace( x = centroid_d6[1], y = centroid_d6[2], z = centroid_d6[3], type = "scatter3d", mode = "text", text = "Day 6", textfont = list( size = 11, color = tp_colors["Day 6"], family = "Arial" ), textposition = "top center", showlegend = FALSE, name = "" ) %>% add_trace( x = centroid_d10[1], y = centroid_d10[2], z = centroid_d10[3], type = "scatter3d", mode = "text", text = "Day 10", textfont = list( size = 11, color = tp_colors["Day 10"], family = "Arial" ), textposition = "top center", showlegend = FALSE, name = "" ) %>% add_trace( x = centroid_d17[1], y = centroid_d17[2], z = centroid_d17[3], type = "scatter3d", mode = "text", text = "Day 17", textfont = list( size = 11, color = tp_colors["Day 17"], family = "Arial" ), textposition = "top center", showlegend = FALSE, name = "" ) %>% layout(scene = scene_axes) ``` ### 2D PC Projections with Reference Vector ```{r} #| label: fig-pc-panels #| fig-cap: > #| Pairwise PC projections of all samples with #| the D6-to-D17 reference vector (green arrow). #| Shape = genotype, color = timepoint. #| fig-height: 3 #| fig-width: 9 # Build segment data for the reference vector vec_seg <- data.frame( x = centroid_d6[1:3], xend = centroid_d17[1:3], panel = c("PC1 vs PC2", "PC1 vs PC3", "PC2 vs PC3") ) # Build panel data panel_df <- rbind( data.frame( x = pca_df$PC1, y = pca_df$PC2, panel = "PC1 vs PC2", genotype = pca_df$genotype, timepoint = pca_df$timepoint ), data.frame( x = pca_df$PC1, y = pca_df$PC3, panel = "PC1 vs PC3", genotype = pca_df$genotype, timepoint = pca_df$timepoint ), data.frame( x = pca_df$PC2, y = pca_df$PC3, panel = "PC2 vs PC3", genotype = pca_df$genotype, timepoint = pca_df$timepoint ) ) panel_df$panel <- factor( panel_df$panel, levels = c("PC1 vs PC2", "PC1 vs PC3", "PC2 vs PC3") ) # Vector endpoints per panel seg_df <- data.frame( x = c(centroid_d6[1], centroid_d6[1], centroid_d6[2]), y = c(centroid_d6[2], centroid_d6[3], centroid_d6[3]), xend = c(centroid_d17[1], centroid_d17[1], centroid_d17[2]), yend = c(centroid_d17[2], centroid_d17[3], centroid_d17[3]), panel = factor( c("PC1 vs PC2", "PC1 vs PC3", "PC2 vs PC3"), levels = c("PC1 vs PC2", "PC1 vs PC3", "PC2 vs PC3") ) ) # Axis labels per panel ax_labels <- data.frame( panel = factor( c("PC1 vs PC2", "PC1 vs PC3", "PC2 vs PC3"), levels = c("PC1 vs PC2", "PC1 vs PC3", "PC2 vs PC3") ), xlab = c( paste0("PC1 (", var_pct[1], "%)"), paste0("PC1 (", var_pct[1], "%)"), paste0("PC2 (", var_pct[2], "%)") ), ylab = c( paste0("PC2 (", var_pct[2], "%)"), paste0("PC3 (", var_pct[3], "%)"), paste0("PC3 (", var_pct[3], "%)") ) ) ggplot(panel_df, aes(x = x, y = y)) + geom_segment( data = seg_df, aes(x = x, y = y, xend = xend, yend = yend), color = "#059669", linewidth = 0.8, arrow = arrow( length = unit(0.12, "cm"), type = "closed" ), inherit.aes = FALSE ) + geom_point( aes(fill = timepoint, shape = genotype), size = 1.8, stroke = 0.25, color = "black", alpha = 1 ) + scale_fill_manual(values = tp_colors) + scale_shape_manual(values = c( "D21" = 21, "T21" = 23 )) + facet_wrap(~ panel, nrow = 1, scales = "free") + labs(x = NULL, y = NULL, fill = "Timepoint", shape = "Genotype") + guides( fill = guide_legend( override.aes = list(shape = 21) ) ) + theme_pub(base_size = 10) + theme(legend.position = "top") ``` ### Sample Projections onto Trajectory ```{r} #| label: scores-compute scores <- apply(all_proj[, 1:3], 1, function(z) { sum((z - centroid_d6) * ref_vec) / sum(ref_vec^2) }) score_df <- data.frame( sample = meta$sample_id, genotype = meta$genotype, timepoint = factor( paste0("Day ", meta$timepoint), levels = c("Day 6", "Day 10", "Day 17") ), score = scores ) ``` ```{r} #| label: fig-projection-strip #| fig-cap: > #| Each sample projected onto the reference vector. #| Score 0 = Day 6 centroid, 1 = Day 17 centroid. #| fig-height: 3 #| fig-width: 6 ggplot( score_df, aes(x = score, y = genotype, fill = genotype) ) + geom_vline( xintercept = c(0, 1), linetype = "dashed", color = "#000", linewidth = 0.3, alpha = 0.4 ) + geom_point( shape = 21, size = 1.8, stroke = 0.25, color = "black", alpha = 1 ) + facet_wrap(~ timepoint, ncol = 1) + scale_fill_manual(values = c( "D21" = col_d21, "T21" = col_t21 )) + annotate( "text", x = 0, y = 0.4, label = "Day 6\ncentroid", size = 2.2, hjust = 0.5, color = "#000" ) + annotate( "text", x = 1, y = 0.4, label = "Day 17\ncentroid", size = 2.2, hjust = 0.5, color = "#000" ) + labs(x = "Maturation Score", y = NULL) + theme_pub(base_size = 10) + theme( panel.grid.major.y = element_blank(), legend.position = "none" ) ``` ### Maturation Score Comparison ```{r} #| label: score-table score_summary <- score_df %>% group_by(genotype, timepoint) %>% summarize( mean_score = round(mean(score), 3), sd = round(sd(score), 3), .groups = "drop" ) knitr::kable( score_summary, col.names = c( "Genotype", "Timepoint", "Mean Score", "SD" ), caption = "Maturation scores by condition" ) ``` ```{r} #| label: fig-scores #| fig-cap: > #| Maturation score by genotype and timepoint. #| Dashed lines mark Day 6 (s=0) and Day 17 #| (s=1) control centroids. #| fig-height: 3.5 #| fig-width: 4.5 ggplot( score_df, aes(x = timepoint, y = score, fill = genotype) ) + geom_hline( yintercept = c(0, 1), linetype = "dashed", color = "#000", linewidth = 0.3, alpha = 0.4 ) + geom_boxplot( width = 0.5, outlier.shape = NA, alpha = 1, linewidth = 0.3, color = "black" ) + geom_point( position = position_jitterdodge( jitter.width = 0.05, dodge.width = 0.5 ), shape = 21, size = 1.2, stroke = 0.25, color = "black", alpha = 1, aes(fill = genotype) ) + scale_fill_manual(values = c( "D21" = col_d21, "T21" = col_t21 )) + scale_y_continuous( breaks = seq(0, 1, 0.25), expand = expansion(mult = c(0.05, 0.05)) ) + annotate( "text", x = 0.55, y = 0.03, label = "Day 6 ref", color = "#000", size = 2.5, hjust = 0 ) + annotate( "text", x = 0.55, y = 1.03, label = "Day 17 ref", color = "#000", size = 2.5, hjust = 0 ) + labs(x = NULL, y = "Maturation Score") + theme_pub(base_size = 10) + theme(legend.position = "top") ``` ### Genotype Comparison ```{r} #| label: comparison d21_means <- score_summary %>% filter(genotype == "D21") %>% select(timepoint, d21 = mean_score) t21_means <- score_summary %>% filter(genotype == "T21") %>% select(timepoint, t21 = mean_score) diff_df <- inner_join( d21_means, t21_means, by = "timepoint" ) %>% mutate( difference = round(t21 - d21, 3), pct_diff = paste0( round(difference * 100, 1), "%" ) ) knitr::kable( diff_df, col.names = c( "Timepoint", "D21 Score", "T21 Score", "Difference", "% Difference" ), caption = "Maturation score differences (T21 - D21)" ) ``` --- ## Leave-One-Out Validation To validate that the reference trajectory accurately classifies samples by differentiation stage, we hold out one replicate from each D21 timepoint, build the trajectory from the remaining $n = 2$ replicates, and score the held-out samples. No T21 samples are used. ```{r} #| label: loo-compute #| cache: true # Hold out replicate 1 from each timepoint holdout_ids <- c( ctrl_meta$sample_id[ ctrl_meta$timepoint == 6][1], ctrl_meta$sample_id[ ctrl_meta$timepoint == 10][1], ctrl_meta$sample_id[ ctrl_meta$timepoint == 17][1] ) train_ids <- setdiff( ctrl_meta$sample_id, holdout_ids ) cat( "Training samples (n=6):", paste(train_ids, collapse = ", "), "\n", "Held-out samples (n=3):", paste(holdout_ids, collapse = ", ") ) # LRT on training set train_meta <- ctrl_meta[ ctrl_meta$sample_id %in% train_ids, ] train_counts <- raw_counts[, train_ids] dds_loo <- DESeqDataSetFromMatrix( countData = train_counts, colData = train_meta, design = ~ timepoint ) dds_loo <- DESeq( dds_loo, test = "LRT", reduced = ~ 1 ) res_loo <- results(dds_loo) res_loo <- res_loo[order(res_loo$padj), ] sig_loo <- rownames(res_loo)[ !is.na(res_loo$padj) & res_loo$padj < 0.05 ] n_top_loo <- min(1000, length(sig_loo)) top_loo <- head(rownames(res_loo), n_top_loo) cat( "\nLRT significant genes:", length(sig_loo), "\nGenes used for PCA:", n_top_loo ) # PCA on training samples pca_loo <- prcomp( t(as.matrix(vst_counts[top_loo, train_ids])), scale. = TRUE, center = TRUE ) # Project all D21 samples (train + holdout) all_d21_proj <- predict( pca_loo, t(as.matrix( vst_counts[top_loo, ctrl_meta$sample_id] )) ) # Centroids from training samples only tp_train <- as.numeric( as.character(train_meta$timepoint) ) c_d6 <- colMeans( all_d21_proj[train_ids[tp_train == 6], 1:3] ) c_d17 <- colMeans( all_d21_proj[train_ids[tp_train == 17], 1:3] ) rv <- c_d17 - c_d6 # Score all D21 samples loo_scores <- apply( all_d21_proj[, 1:3], 1, function(z) { sum((z - c_d6) * rv) / sum(rv^2) } ) loo_df <- data.frame( sample = ctrl_meta$sample_id, timepoint = factor( paste0("Day ", ctrl_meta$timepoint), levels = c("Day 6", "Day 10", "Day 17") ), score = loo_scores[ctrl_meta$sample_id], set = ifelse( ctrl_meta$sample_id %in% holdout_ids, "Held-out", "Training" ) ) ``` ```{r} #| label: fig-loo #| fig-cap: > #| Leave-one-out validation. Reference trajectory #| built from n=2 D21 replicates per timepoint #| (open circles). Held-out replicates (filled) #| are scored along the trajectory. Accurate #| classification = held-out samples land near #| their expected position. #| fig-height: 3.5 #| fig-width: 5.5 ggplot( loo_df, aes(x = score, y = timepoint) ) + geom_vline( xintercept = c(0, 1), linetype = "dashed", color = "#000", linewidth = 0.3, alpha = 0.4 ) + geom_point( aes(fill = timepoint, shape = set), size = 2.5, stroke = 0.4, color = "black", alpha = 1 ) + scale_fill_manual(values = tp_colors) + scale_shape_manual( values = c("Training" = 1, "Held-out" = 21) ) + annotate( "text", x = 0, y = 0.5, label = "Day 6\ncentroid", size = 2.2, hjust = 0.5, color = "#000" ) + annotate( "text", x = 1, y = 0.5, label = "Day 17\ncentroid", size = 2.2, hjust = 0.5, color = "#000" ) + labs( x = "Maturation Score", y = NULL, shape = NULL, fill = NULL ) + guides( fill = "none", shape = guide_legend( override.aes = list(size = 2) ) ) + theme_pub(base_size = 10) + theme( panel.grid.major.y = element_blank(), legend.position = "top" ) ``` ```{r} #| label: loo-table knitr::kable( loo_df %>% select(sample, timepoint, set, score) %>% mutate(score = round(score, 3)) %>% arrange(timepoint, set), col.names = c( "Sample", "Timepoint", "Set", "Score" ), caption = paste( "Leave-one-out validation scores.", "Held-out samples should score near 0", "(Day 6), ~0.5 (Day 10), and ~1 (Day 17)." ) ) ``` --- *Data: Martinez et al. (2024). Front Cell Neurosci 18: 1341141.*