Skip to contents

Overview

This article reproduces key experiments from the SONG paper using the songR package:

Senanayake, D. A., Wang, W., Naik, S. H., & Halgamuge, S. (2021). Self-Organizing Nebulous Growths for Robust and Incremental Data Visualization. IEEE TNNLS, 32(10), 4588-4602.

All figures use the viridis plasma color scale. For full-scale reproductions with large datasets, see the tutorials/ folder in the repository.

Plasma Color Helpers

# Standard scatter using plasma palette
plot_plasma <- function(emb, labels, title = "", pch = 16, cex = 0.5) {
  if (is.factor(labels) || is.character(labels)) {
    labels <- as.factor(labels)
    cols <- viridis::plasma(nlevels(labels), end = 0.92)[as.integer(labels)]
  } else {
    cols <- viridis::plasma(256)[cut(labels, 256, labels = FALSE)]
  }
  plot(emb[, 1], emb[, 2], col = cols, pch = pch, cex = cex,
       xlab = "Dim 1", ylab = "Dim 2", main = title, bty = "n")
}

Figure 3/4: Heterogeneous Increments (Fashion-MNIST / MNIST)

The paper adds 2 new classes per step and tracks how each method handles the growing embedding. SONG updates incrementally; t-SNE and UMAP refit from scratch.

Here we demonstrate the principle on the bundled songR_blobs dataset (8 clusters, 20D), adding 2 clusters per step:

data(songR_blobs)
X <- songR_blobs$data
labs <- as.integer(songR_blobs$labels)

# Define 4 incremental steps: 2 clusters each
set.seed(SEED)
cluster_order <- sample(8)
steps <- list(cluster_order[1:2], cluster_order[3:4],
              cluster_order[5:6], cluster_order[7:8])
step_names <- c("2 clusters", "4 clusters", "6 clusters", "8 clusters")

# Build data for each step
X_list <- lapply(steps, function(cls) X[labs %in% cls, , drop = FALSE])
lab_list <- lapply(steps, function(cls) labs[labs %in% cls])

# SONG incremental
song_embs <- list()
model <- NULL; X_seen <- NULL
for (s in seq_along(X_list)) {
  X_seen <- rbind(X_seen, X_list[[s]])
  if (is.null(model)) {
    model <- song(X_seen, epochs = 15L, seed = SEED, verbose = FALSE)
  } else {
    model <- update(model, X_list[[s]], epochs = 15L, verbose = FALSE)
  }
  song_embs[[s]] <- predict(model, newdata = X_seen)
}

# SONG + Reinit (refit from scratch each step)
reinit_embs <- list()
X_seen <- NULL
for (s in seq_along(X_list)) {
  X_seen <- rbind(X_seen, X_list[[s]])
  m <- song(X_seen, epochs = 15L, seed = SEED, verbose = FALSE)
  reinit_embs[[s]] <- m$embedding
}

# Cumulative labels for coloring
cum_labs <- list()
for (s in seq_along(steps)) cum_labs[[s]] <- unlist(lab_list[1:s])

par(mfrow = c(2, 4), mar = c(2, 2, 2.5, 1))
for (s in 1:4) {
  plot_plasma(song_embs[[s]], factor(cum_labs[[s]]),
              title = if (s == 1) paste0("SONG\n", step_names[s])
                      else step_names[s])
}
for (s in 1:4) {
  plot_plasma(reinit_embs[[s]], factor(cum_labs[[s]]),
              title = if (s == 1) paste0("SONG+Reinit\n", step_names[s])
                      else step_names[s])
}

Key observation: SONG preserves existing cluster positions as new clusters are added, while SONG+Reinit recomputes the entire layout.

Figure 5: Homogeneous Increments (Wong CyTOF)

Homogeneous increments add more data of the same distribution. The paper uses Wong CyTOF data colored by CCR7 expression. Here we demonstrate with blobs, adding random subsamples:

data(songR_blobs)
set.seed(SEED)
idx <- sample(nrow(songR_blobs$data))
sizes <- c(400, 800, 1200, 1600)

par(mfrow = c(2, 4), mar = c(2, 2, 2.5, 1))

# SONG incremental
model <- NULL; prev_n <- 0
for (s in seq_along(sizes)) {
  n_s <- sizes[s]
  if (prev_n == 0) {
    X_chunk <- songR_blobs$data[idx[1:n_s], ]
    model <- song(X_chunk, epochs = 15L, seed = SEED, verbose = FALSE)
  } else {
    X_chunk <- songR_blobs$data[idx[(prev_n + 1):n_s], ]
    model <- update(model, X_chunk, epochs = 15L, verbose = FALSE)
  }
  emb <- predict(model, newdata = songR_blobs$data[idx[1:n_s], ])
  cur_labs <- songR_blobs$labels[idx[1:n_s]]
  plot_plasma(emb, cur_labs,
              title = if (s == 1) paste0("SONG\nn=", n_s) else paste0("n=", n_s))
  prev_n <- n_s
}

# SONG+Reinit
for (s in seq_along(sizes)) {
  n_s <- sizes[s]
  m <- song(songR_blobs$data[idx[1:n_s], ], epochs = 15L, seed = SEED, verbose = FALSE)
  cur_labs <- songR_blobs$labels[idx[1:n_s]]
  plot_plasma(m$embedding, cur_labs,
              title = if (s == 1) paste0("SONG+Reinit\nn=", n_s) else paste0("n=", n_s))
}

Figure 6: CDY (Consecutive Displacement of Y)

CDY measures how much existing points move when new data is added. Lower CDY = more stable embedding.

data(songR_blobs)
set.seed(SEED)
idx <- sample(nrow(songR_blobs$data))
X <- songR_blobs$data[idx, ]
labs <- songR_blobs$labels[idx]

init_n <- 400
step_n <- 300
n_steps <- 4

# SONG incremental CDY
model <- song(X[1:init_n, ], epochs = 15L, seed = SEED, verbose = FALSE)
prev_emb <- predict(model, newdata = X[1:init_n, ])
song_cdy <- numeric(n_steps)
bound <- init_n

for (i in seq_len(n_steps)) {
  new_bound <- bound + step_n
  model <- update(model, X[(bound + 1):new_bound, ], epochs = 15L, verbose = FALSE)
  curr_emb <- predict(model, newdata = X[1:bound, ])
  song_cdy[i] <- mean(sqrt(rowSums((prev_emb - curr_emb)^2)))
  prev_emb <- predict(model, newdata = X[1:new_bound, ])
  bound <- new_bound
}

# SONG+Reinit CDY
model0 <- song(X[1:init_n, ], epochs = 15L, seed = SEED, verbose = FALSE)
prev_emb <- model0$embedding
reinit_cdy <- numeric(n_steps)
bound <- init_n

for (i in seq_len(n_steps)) {
  new_bound <- bound + step_n
  m <- song(X[1:new_bound, ], epochs = 15L, seed = SEED, verbose = FALSE)
  reinit_cdy[i] <- mean(sqrt(rowSums(
    (prev_emb - m$embedding[1:nrow(prev_emb), ])^2)))
  prev_emb <- m$embedding
  bound <- new_bound
}

# Plot
method_cols <- viridis::plasma(4, end = 0.92)
plot(1:n_steps, song_cdy, type = "b", pch = 16, col = method_cols[1],
     ylim = c(0, max(c(song_cdy, reinit_cdy)) * 1.1),
     xlab = "Increment", ylab = "Mean CDY",
     main = "CDY: Embedding Stability", bty = "n", lwd = 2)
lines(1:n_steps, reinit_cdy, type = "b", pch = 17, col = method_cols[3], lwd = 2)
legend("topright", c("SONG (incremental)", "SONG+Reinit"),
       col = method_cols[c(1, 3)], pch = c(16, 17), lwd = 2, bty = "n")

Key result: SONG (incremental) displaces existing points far less than reinitialized methods. In the paper, t-SNE shows CDY 10-50x higher than SONG.

Figure 7 / Table IV: Noise Tolerance (Gaussian Blobs)

The paper tests 32 configurations of Gaussian blobs (8 noise levels x 4 cluster counts, 60D). SONG maintains cluster quality even at high noise.

simulate_blobs <- function(k, noise_sd, d = 20, n_per = 100, seed = 42) {
  set.seed(seed)
  centers <- matrix(rnorm(k * d, sd = 30), ncol = d)
  data <- do.call(rbind, lapply(1:k, function(i)
    sweep(matrix(rnorm(n_per * d, sd = noise_sd), ncol = d), 2, centers[i, ], "+")))
  list(data = data, labels = factor(rep(1:k, each = n_per)))
}

# Low noise vs high noise at 10 clusters
blobs_low  <- simulate_blobs(10, 4)
blobs_high <- simulate_blobs(10, 16)

par(mfrow = c(1, 3), mar = c(2, 2, 2.5, 1))

m1 <- song(blobs_low$data, epochs = 20L, seed = SEED, verbose = FALSE)
plot_plasma(m1$embedding, blobs_low$labels, title = "SONG (std=4, k=10)")

m2 <- song(blobs_high$data, epochs = 20L, seed = SEED, verbose = FALSE)
plot_plasma(m2$embedding, blobs_high$labels, title = "SONG (std=16, k=10)")

# High noise, more clusters
blobs_many <- simulate_blobs(20, 12, n_per = 50)
m3 <- song(blobs_many$data, epochs = 20L, seed = SEED, verbose = FALSE)
plot_plasma(m3$embedding, blobs_many$labels, title = "SONG (std=12, k=20)")

The full tutorial 06_fig7_table_IV_noise_tolerance.R computes AMI scores across all 32 configurations and compares SONG, t-SNE, and UMAP. Typical results: SONG achieves AMI 85-95 across all conditions.

Figure 8: COIL-20 Topology Preservation

COIL-20 contains 20 objects photographed at 72 angles (360 degrees). The underlying topology is circular. SONG and UMAP preserve this; t-SNE distorts it into arches.

# Simulate 5 objects with circular topology
set.seed(SEED)
n_poses <- 72
sim_data <- list()
for (obj in 1:5) {
  angles <- seq(0, 2 * pi, length.out = n_poses + 1)[-(n_poses + 1)]
  b1 <- rnorm(10); b1 <- b1 / sqrt(sum(b1^2))
  b2 <- rnorm(10); b2 <- b2 - sum(b2 * b1) * b1; b2 <- b2 / sqrt(sum(b2^2))
  center <- rnorm(10, sd = 5)
  r <- runif(1, 1, 3)
  sim_data[[obj]] <- sweep(r * (outer(cos(angles), b1) + outer(sin(angles), b2)),
                            2, center, "+") + matrix(rnorm(n_poses * 10, sd = 0.1), ncol = 10)
}
X_coil <- do.call(rbind, sim_data)
labs_coil <- factor(rep(1:5, each = n_poses))

par(mfrow = c(1, 3), mar = c(2, 2, 2.5, 1))

m_song <- song(X_coil, epochs = 25L, seed = SEED, verbose = FALSE)
plot_plasma(m_song$embedding, labs_coil, title = "SONG", cex = 0.8)

if (requireNamespace("Rtsne", quietly = TRUE)) {
  set.seed(SEED)
  emb_tsne <- Rtsne::Rtsne(X_coil, dims = 2, perplexity = 20,
                             check_duplicates = FALSE, verbose = FALSE)$Y
  plot_plasma(emb_tsne, labs_coil, title = "t-SNE", cex = 0.8)
} else {
  plot.new(); text(0.5, 0.5, "Rtsne not installed")
}

if (requireNamespace("uwot", quietly = TRUE)) {
  set.seed(SEED)
  emb_umap <- uwot::umap(X_coil, n_neighbors = 15, min_dist = 0.1, verbose = FALSE)
  plot_plasma(emb_umap, labs_coil, title = "UMAP", cex = 0.8)
} else {
  plot.new(); text(0.5, 0.5, "uwot not installed")
}

Expected: Circular/elongated clusters in SONG and UMAP, arch shapes in t-SNE.

Table II / III: AMI Scores

The paper reports AMI (Adjusted Mutual Information) after k-means clustering on the embeddings. Here we compute AMI on the blobs dataset:

if (requireNamespace("aricode", quietly = TRUE)) {
  data(songR_blobs)
  m <- song(songR_blobs$data, epochs = 20L, seed = SEED, verbose = FALSE)
  km <- kmeans(m$embedding, centers = 8, nstart = 10)
  ami <- aricode::AMI(as.integer(songR_blobs$labels), km$cluster) * 100
  cat(sprintf("SONG AMI on songR_blobs: %.1f%%\n", ami))

  if (requireNamespace("uwot", quietly = TRUE)) {
    set.seed(SEED)
    emb_u <- uwot::umap(songR_blobs$data, verbose = FALSE)
    km_u <- kmeans(emb_u, centers = 8, nstart = 10)
    ami_u <- aricode::AMI(as.integer(songR_blobs$labels), km_u$cluster) * 100
    cat(sprintf("UMAP AMI on songR_blobs: %.1f%%\n", ami_u))
  }
} else {
  cat("Install aricode for AMI computation: install.packages('aricode')\n")
}
#> SONG AMI on songR_blobs: 91.3%
#> UMAP AMI on songR_blobs: 100.0%

Comparison: SONG vs t-SNE vs UMAP on Iris

X <- as.matrix(iris[, 1:4])
labs <- iris$Species

par(mfrow = c(1, 3), mar = c(2, 2, 2.5, 1))

m <- song(X, epochs = 15L, seed = SEED, verbose = FALSE)
plot_plasma(m$embedding, labs, title = "SONG")

if (requireNamespace("Rtsne", quietly = TRUE)) {
  set.seed(SEED)
  emb_t <- Rtsne::Rtsne(X, dims = 2, perplexity = 30,
                          check_duplicates = FALSE, verbose = FALSE)$Y
  plot_plasma(emb_t, labs, title = "t-SNE")
}

if (requireNamespace("uwot", quietly = TRUE)) {
  set.seed(SEED)
  emb_u <- uwot::umap(X, verbose = FALSE)
  plot_plasma(emb_u, labs, title = "UMAP")
}

Running the Full Tutorial Suite

For full-scale paper reproduction with MNIST (70k), Fashion-MNIST (70k), Wong CyTOF (1.27M), Samusik (87k), and COIL-20 (1440):

setwd("path/to/songR")

# Install dependencies and prepare data
source("tutorials/00_install_dependencies.R")
source("tutorials/01_prepare_data.R")

# Figures (output PDFs in tutorials/output/)
source("tutorials/02_fig3_fashion_mnist_heterogeneous.R")
source("tutorials/03_fig4_mnist_heterogeneous.R")
source("tutorials/04_fig5_wong_homogeneous.R")
source("tutorials/05_fig6_cdy_lines.R")
source("tutorials/06_fig7_table_IV_noise_tolerance.R")
source("tutorials/07_fig8_coil20_topology.R")

# Tables (output CSVs in tutorials/output/)
source("tutorials/08_table_II_heterogeneous_ami.R")
source("tutorials/09_table_III_homogeneous_ami.R")

All scripts have FAST_MODE <- TRUE at the top. Set to FALSE for full paper-scale experiments.

Citation

citation("songR")

Senanayake, D. A., Wang, W., Naik, S. H., & Halgamuge, S. (2021). Self-Organizing Nebulous Growths for Robust and Incremental Data Visualization. IEEE TNNLS, 32(10), 4588-4602. doi:10.1109/TNNLS.2020.3023941

Session Info

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] viridis_0.6.5     viridisLite_0.4.3 songR_0.1.0      
#> 
#> loaded via a namespace (and not attached):
#>  [1] Matrix_1.7-5       gtable_0.3.6       jsonlite_2.0.0     compiler_4.6.0    
#>  [5] Rcpp_1.1.1-1.1     FNN_1.1.4.1        gridExtra_2.3      jquerylib_0.1.4   
#>  [9] systemfonts_1.3.2  scales_1.4.0       textshaping_1.0.5  yaml_2.3.12       
#> [13] fastmap_1.2.0      uwot_0.2.4         aricode_1.0.3      lattice_0.22-9    
#> [17] ggplot2_4.0.3      R6_2.6.1           knitr_1.51         Rtsne_0.17        
#> [21] desc_1.4.3         bslib_0.10.0       RColorBrewer_1.1-3 rlang_1.2.0       
#> [25] cachem_1.1.0       xfun_0.57          fs_2.1.0           sass_0.4.10       
#> [29] S7_0.2.2           otel_0.2.0         cli_3.6.6          pkgdown_2.2.0     
#> [33] digest_0.6.39      grid_4.6.0         irlba_2.3.7        lifecycle_1.0.5   
#> [37] vctrs_0.7.3        evaluate_1.0.5     glue_1.8.1         farver_2.1.2      
#> [41] ragg_1.5.2         rmarkdown_2.31     tools_4.6.0        htmltools_0.5.9