Analysis/SpatialDE_clustering.Rmd

---
title: "SpatialDE clustering"
author: "Lukas Weber"
date: "`r format(Sys.time(), '%Y-%m-%d')`"
output: 
    html_document:
        toc: true
        toc_depth: 2
---

```{r setup, include = FALSE}
# global chunk options: show the source code of each chunk in the rendered document 
# (echo = TRUE) and cache chunk results between runs (cache = TRUE)
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
```


# SpatialDE clustering

This script contains code for several versions of data-driven clustering to reproduce the DLPFC layer structure in our dataset, using various combinations of SpatialDE genes and spatial dimensions. Clustering results are evaluated using manually annotated layers from Kristen Maynard as ground truth.

```{r, message = FALSE}
library(SingleCellExperiment)
library(scran)
library(scater)
library(readr)
library(readxl)
library(dplyr)
library(tidyr)
library(uwot)
library(mclust)
library(aricode)
library(nlme)
library(ggplot2)
library(RColorBrewer)
library(Polychrome)
library(patchwork)
library(broom)
library(magick)
```


## Load data

Load SingleCellExperiment object from Leo's analyses.

```{r}
# read the saved scran output file; the object 'sce' used below is expected to come from it
load(file = "../../data/Human_DLPFC_Visium_processedData_sce_scran.Rdata")
sce

# one name per unique sample in the column metadata, prefixed with "sample_"
sample_names <- paste0("sample_", unique(colData(sce)[["sample_name"]]))
sample_names
```


## Load SpatialDE genes

Load lists of significant SpatialDE genes (with additional filtering) for each sample. (Note: these gene lists were saved as output files from the previous script "SpatialDE_genes_analysis.Rmd".)

```{r}
# locate spreadsheets containing SpatialDE gene lists (with additional filtering): 
# one file per sample, plus one file for the pooled gene list
genes_SpatialDE_filt_files <- list.files(
    "../outputs/SpatialDE_genes_analysis", 
    pattern = "^SpatialDE_filtered_sample_[0-9]+\\.csv$", full.names = TRUE
)
genes_SpatialDE_filt_pooled_file <- list.files(
    "../outputs/SpatialDE_genes_analysis", 
    pattern = "^SpatialDE_filtered_pooled\\.csv$", full.names = TRUE
)

genes_SpatialDE_filt_files
genes_SpatialDE_filt_pooled_file

# check inputs before reading them: exactly one pooled file, and one file per sample listed in 
# the same order as 'sample_names' (the sample ID is the part of the file name after the last 
# underscore); identical() also fails on a length mismatch, whereas '==' would recycle
stopifnot(length(genes_SpatialDE_filt_pooled_file) == 1)
stopifnot(identical(
    as.character(sample_names), 
    paste0("sample_", gsub("\\.csv$", "", gsub("^.*_", "", genes_SpatialDE_filt_files)))
))

genes_SpatialDE_filt <- lapply(genes_SpatialDE_filt_files, read_csv)
genes_SpatialDE_filt_pooled <- read_csv(genes_SpatialDE_filt_pooled_file)

names(genes_SpatialDE_filt) <- sample_names

# number of genes
vapply(genes_SpatialDE_filt, nrow, integer(1))
nrow(genes_SpatialDE_filt_pooled)
```


## Load pseudobulk genes (from Leo's analyses)

```{r}
# read the table of significant genes for the pseudobulk layers (from Leo's analyses)
sig_genes <- read_csv("Layer_Guesses/sig_genes.csv")
sig_genes

# keep the two gene identifier columns and rename them to 'gene_id' / 'gene_name'
genes_pseudobulk <- setNames(sig_genes[, c("ensembl", "gene")], c("gene_id", "gene_name"))

dim(genes_pseudobulk)

# a gene can be identified for multiple layers; keep each gene only once
genes_pseudobulk <- distinct(genes_pseudobulk)

dim(genes_pseudobulk)
```


## Load known marker genes (from Kristen)

```{r}
# read the spreadsheet of known marker genes (from Kristen)
KRM_Layer_Markers <- read_xlsx("KRM_Layer_Markers.xlsx")
KRM_Layer_Markers
dim(KRM_Layer_Markers)

marker_genes_KRM <- KRM_Layer_Markers[["Gene"]]

# number of marker genes whose upper-case name is found in the "sce" object (not all are available)
sum(is.element(toupper(marker_genes_KRM), rowData(sce)$gene_name))

# table of upper-case marker gene names together with the matching gene ID from "sce" 
# (gene ID is NA where the name is not found)
genes_markers <- data.frame(gene_name = toupper(marker_genes_KRM))
genes_markers[["gene_id"]] <- rowData(sce)$gene_id[
    match(genes_markers[["gene_name"]], rowData(sce)$gene_name)
]

dim(genes_markers)
sum(is.na(genes_markers[["gene_id"]]))
```


## Load ground truth

Load ground truth cluster labels (manually annotated DLPFC layers from Kristen Maynard).

```{r}
# read the two files of KRM manual layer labels and stack them into one table
KRM_manual_layers_part1 <- read_csv("../../from_Slack/KRM_manual_layers/spatialLIBD_layerGuesses_2019-12-19 19_49_57_Merged.csv")
KRM_manual_layers_part2 <- read_csv("../../from_Slack/KRM_manual_layers/spatialLIBD_layerGuesses_2019-12-30 17_35_40_Combined2.csv")

KRM_manual_layers <- rbind(KRM_manual_layers_part1, KRM_manual_layers_part2)

table(KRM_manual_layers$layer)
table(KRM_manual_layers$sample_name, KRM_manual_layers$layer)

# re-code "Layer 2/3" as "Layer 3" (for samples 151669, 151670, 151671, 151672), 
# then replace spaces in the layer names with underscores
KRM_manual_layers$layer <- gsub(
    " ", "_", 
    replace(KRM_manual_layers$layer, KRM_manual_layers$layer == "Layer 2/3", "Layer 3")
)

KRM_manual_layers
table(KRM_manual_layers$layer)
table(KRM_manual_layers$sample_name, KRM_manual_layers$layer)
```


## Clustering

Run the following clustering analyses:

- clustering on top 50 PCs on SpatialDE significant genes (filtered) for each sample
- clustering on top 50 PCs on SpatialDE significant genes (filtered) pooled
- clustering on top 50 PCs on HVGs
- clustering on top 50 PCs on pseudobulk layer genes (from Leo's analyses; 198 genes)
- clustering on top 50 PCs on known marker genes (from Kristen; 77 out of 81 genes)

- clustering on top 10 UMAPs on top 50 PCs on SpatialDE significant genes (filtered) for each sample
- clustering on top 10 UMAPs on top 50 PCs on SpatialDE significant genes (filtered) pooled
- clustering on top 10 UMAPs on top 50 PCs on HVGs
- clustering on top 10 UMAPs on top 50 PCs on pseudobulk layer genes (from Leo's analyses; 198 genes)
- clustering on top 10 UMAPs on top 50 PCs on known marker genes (from Kristen; 77 out of 81 genes)

- clustering on top 50 PCs on SpatialDE significant genes (filtered) for each sample + 2 scaled spatial dimensions
- clustering on top 50 PCs on SpatialDE significant genes (filtered) pooled + 2 scaled spatial dimensions
- clustering on top 50 PCs on HVGs + 2 scaled spatial dimensions
- clustering on top 50 PCs on pseudobulk layer genes (from Leo's analyses; 198 genes) + 2 scaled spatial dimensions
- clustering on top 50 PCs on known marker genes (from Kristen; 77 out of 81 genes) + 2 scaled spatial dimensions

- clustering on top 10 UMAPs on top 50 PCs on SpatialDE significant genes (filtered) for each sample + 2 scaled spatial dimensions
- clustering on top 10 UMAPs on top 50 PCs on SpatialDE significant genes (filtered) pooled + 2 scaled spatial dimensions
- clustering on top 10 UMAPs on top 50 PCs on HVGs + 2 scaled spatial dimensions
- clustering on top 10 UMAPs on top 50 PCs on pseudobulk layer genes (from Leo's analyses; 198 genes) + 2 scaled spatial dimensions
- clustering on top 10 UMAPs on top 50 PCs on known marker genes (from Kristen; 77 out of 81 genes) + 2 scaled spatial dimensions


Notes:

- spatial dimensions are scaled to a range approximately comparable to the other dimensions; no z-scaling of spatial dimensions since these are physical coordinates

- clustering is performed using Bioconductor graph-based clustering, with the number of clusters equal to the known number of ground truth layers

```{r}
# parameters
# n_umap: number of UMAP components computed for each feature set
n_umap <- 10
# max_spatial: spatial coordinates are rescaled to the range [-max_spatial, max_spatial]
max_spatial <- 1
# n_neighbors: number of nearest neighbors (k) used to build the shared nearest neighbor graph
n_neighbors <- 10
# n_clus: number of clusters requested when cutting the walktrap clustering
n_clus <- 8


# data frame collecting the per-spot clustering results; rows are appended for each sample and method
d_plot <- data.frame()


# function to sort cluster labels by descending frequency (from Leo)
# 
# Relabels clusters so that the most frequent cluster becomes 1, the second most frequent 
# becomes 2, and so on (ties are broken by the order of the sorted original labels).
# 
# Arguments:
#   clusters: vector of cluster labels (numeric, character, or factor)
#   map_subset: optional logical or index vector selecting the elements used to count the 
#     cluster frequencies; by default all elements are used
# 
# Returns: factor of new labels with the same length as 'clusters'; NA for labels that do 
#   not occur in the selected subset
sort_clusters <- function(clusters, map_subset = NULL) {
    if (is.null(map_subset)) {
        map_subset <- rep(TRUE, length(clusters))
    }
    counts <- table(clusters[map_subset])
    map <- rank(length(clusters[map_subset]) - counts, ties.method = "first")
    # look up the new label by name instead of by position, so that numeric labels that are 
    # not exactly 1..k (e.g. 2, 5, 9) are also mapped correctly
    res <- map[as.character(clusters)]
    factor(res)
}


# helper: run UMAP (with a fixed random seed) on a matrix of features with spots in rows; 
# returns a matrix with 'n_umap' columns named UMAP1, UMAP2, ... and one row per spot
run_umap <- function(dims, spot_names) {
    set.seed(1234)
    out <- umap(dims, scale = TRUE, n_components = n_umap)
    stopifnot(nrow(out) == length(spot_names))
    colnames(out) <- paste0("UMAP", seq_len(n_umap))
    rownames(out) <- spot_names
    out
}

# helper: PCA (50 components) on the logcounts of the selected genes using scater::calculatePCA
run_pca_calculatePCA <- function(sce_sub, gene_ids) {
    # note scater::calculatePCA has random seed
    set.seed(1234)
    out <- calculatePCA(logcounts(sce_sub[gene_ids, ]), ncomponents = 50)
    stopifnot(nrow(out) == ncol(sce_sub))
    rownames(out) <- colnames(sce_sub)
    out
}

# helper: PCA on the logcounts of the selected genes using 'prcomp' 
# (used instead of 'calculatePCA' for gene sets with a small number of genes)
run_pca_prcomp <- function(sce_sub, gene_ids) {
    pcs <- prcomp(t(as.matrix(logcounts(sce_sub[gene_ids, ]))))$x
    # keep the top 50 PCs (or all PCs, if fewer than 50 are available)
    out <- pcs[, seq_len(min(50, ncol(pcs))), drop = FALSE]
    stopifnot(nrow(out) == ncol(sce_sub))
    rownames(out) <- colnames(sce_sub)
    out
}

# convenience function: graph-based clustering (see Bioconductor OSCA book) on a matrix of 
# features with spots in rows, followed by calculation of the Adjusted Rand Index (ARI) and 
# Normalized Mutual Information (NMI) against the ground truth labels; returns a data frame 
# with one row per spot
# note: uses some external variables, i.e. the parameters 'n_neighbors' and 'n_clus', and 
# 'ground_truth_sub', 'xy_coords', 'sample_names[i]' from the current iteration of the loop below
run_clustering_ARI <- function(input, method) {
    dims_clus <- input
    
    set.seed(1234)
    g <- buildSNNGraph(t(dims_clus), k = n_neighbors, d = ncol(dims_clus))
    g_walk <- igraph::cluster_walktrap(g)
    clus <- igraph::cut_at(g_walk, n = n_clus)
    clus <- sort_clusters(clus)
    
    stopifnot(length(clus) == nrow(dims_clus))
    stopifnot(length(clus) == nrow(ground_truth_sub))
    
    truth <- ground_truth_sub$truth
    
    # ARI and NMI are calculated using only the spots that have a ground truth label
    df_pairs <- na.omit(data.frame(clus = clus, truth = truth))
    
    out_ARI <- adjustedRandIndex(df_pairs$clus, df_pairs$truth)
    out_NMI <- NMI(df_pairs$clus, df_pairs$truth)
    
    data.frame(
        spot_name = as.character(rownames(dims_clus)), 
        xy_coords, 
        sample_name = as.character(sample_names[i]), 
        method = as.character(method), 
        cluster = as.numeric(clus), 
        truth = as.character(truth), 
        ARI = out_ARI, 
        NMI = out_NMI, 
        stringsAsFactors = FALSE
    )
}


# run once per sample
for (i in seq_along(sample_names)) {
    
    # select spots from this sample
    sce_sub <- sce[, colData(sce)$sample_name == gsub("^sample_", "", sample_names[i])]
    spot_names <- colnames(sce_sub)
    
    
    # ----------------------
    # get spatial dimensions
    # ----------------------
    
    # extract x-y coordinates of spots (note: y coordinate is reversed)
    xy_coords <- data.frame(
        x_coord = colData(sce_sub)[, c("imagecol")], 
        y_coord = -colData(sce_sub)[, c("imagerow")]
    )
    
    # scale each spatial dimension linearly to the range [-max_spatial, max_spatial]
    dims_spatial <- apply(as.matrix(xy_coords), 2, function(col) {
        (col - min(col)) / (max(col) - min(col)) * (2 * max_spatial) - max_spatial
    })
    colnames(dims_spatial) <- c("spatial_x", "spatial_y")
    rownames(dims_spatial) <- spot_names
    stopifnot(nrow(dims_spatial) == ncol(sce_sub))
    
    
    # -----------------------
    # get ground truth labels
    # -----------------------
    
    # note ground truth labels are not available for all spots (these remain NA)
    KRM_manual_layers_sub <- filter(KRM_manual_layers, sample_name == gsub("^sample_", "", sample_names[i]))
    
    ground_truth_sub <- data.frame(
        truth = rep(NA, ncol(sce_sub))
    )
    rownames(ground_truth_sub) <- spot_names
    ground_truth_sub[KRM_manual_layers_sub$spot_name, "truth"] <- KRM_manual_layers_sub$layer
    # assigning a spot name that is not in this sample would add rows; check that none were added
    stopifnot(nrow(ground_truth_sub) == ncol(sce_sub))
    
    
    # ---------------------------------------------
    # extract and calculate features (PCA and UMAP)
    # ---------------------------------------------
    
    ### HVGs: top 50 PCs stored in the object, and UMAP on these PCs
    dims_HVG_PCA <- reducedDim(sce_sub, type = "PCA")
    stopifnot(nrow(dims_HVG_PCA) == ncol(sce_sub))
    dims_HVG_UMAP <- run_umap(dims_HVG_PCA, spot_names)
    
    ### SpatialDE significant genes (sample-specific): PCA, and UMAP on the PCs
    dims_SpatialDE_PCA <- run_pca_calculatePCA(sce_sub, genes_SpatialDE_filt[[sample_names[i]]]$gene_id)
    dims_SpatialDE_UMAP <- run_umap(dims_SpatialDE_PCA, spot_names)
    
    ### SpatialDE significant genes (pooled): PCA, and UMAP on the PCs
    dims_SpatialDE_pool_PCA <- run_pca_calculatePCA(sce_sub, genes_SpatialDE_filt_pooled$gene_id)
    dims_SpatialDE_pool_UMAP <- run_umap(dims_SpatialDE_pool_PCA, spot_names)
    
    ### pseudobulk layer genes (from Leo's analyses; 198 genes): PCA, and UMAP on the PCs
    dims_pseudobulk_PCA <- run_pca_prcomp(sce_sub, genes_pseudobulk$gene_id)
    dims_pseudobulk_UMAP <- run_umap(dims_pseudobulk_PCA, spot_names)
    
    ### known marker genes (from Kristen; 77 out of 81 genes): PCA, and UMAP on the PCs
    # note: remove NAs (marker genes not available in "sce" object)
    dims_markers_PCA <- run_pca_prcomp(sce_sub, na.omit(genes_markers$gene_id))
    dims_markers_UMAP <- run_umap(dims_markers_PCA, spot_names)
    
    
    # --------------------------------------------------------------------------------------------
    # run clustering and calculate Adjusted Rand Index (ARI) / Normalized Mutual Information (NMI)
    # --------------------------------------------------------------------------------------------
    
    # feature sets without spatial dimensions (list names are the method names)
    dims_list <- list(
        HVG_PCA = dims_HVG_PCA, 
        HVG_UMAP = dims_HVG_UMAP, 
        SpatialDE_PCA = dims_SpatialDE_PCA, 
        SpatialDE_UMAP = dims_SpatialDE_UMAP, 
        SpatialDE_pool_PCA = dims_SpatialDE_pool_PCA, 
        SpatialDE_pool_UMAP = dims_SpatialDE_pool_UMAP, 
        pseudobulk_PCA = dims_pseudobulk_PCA, 
        pseudobulk_UMAP = dims_pseudobulk_UMAP, 
        markers_PCA = dims_markers_PCA, 
        markers_UMAP = dims_markers_UMAP
    )
    
    # same feature sets with the 2 scaled spatial dimensions appended as additional columns
    dims_list_spatial <- lapply(dims_list, function(dims) cbind(dims, dims_spatial))
    names(dims_list_spatial) <- paste0(names(dims_list), "_spatial")
    
    dims_all <- c(dims_list, dims_list_spatial)
    
    # run clustering for each method, then append the results for this sample in a single step 
    # (list is unnamed so that 'rbind' keeps automatic row names)
    res_sample <- Map(
        function(dims, method_name) run_clustering_ARI(dims, method = method_name), 
        dims_all, names(dims_all)
    )
    d_plot <- rbind(d_plot, do.call(rbind, unname(res_sample)))
}


# method names in the order used for plotting
method_names <- c(
    "SpatialDE_PCA", "SpatialDE_pool_PCA", "HVG_PCA", "pseudobulk_PCA", "markers_PCA", 
    "SpatialDE_UMAP", "SpatialDE_pool_UMAP", "HVG_UMAP", "pseudobulk_UMAP", "markers_UMAP", 
    "SpatialDE_PCA_spatial", "SpatialDE_pool_PCA_spatial", "HVG_PCA_spatial", "pseudobulk_PCA_spatial", "markers_PCA_spatial", 
    "SpatialDE_UMAP_spatial", "SpatialDE_pool_UMAP_spatial", "HVG_UMAP_spatial", "pseudobulk_UMAP_spatial", "markers_UMAP_spatial"
)

# a method is unsupervised if its name contains neither "pseudobulk" nor "markers"; 
# all other methods are grouped as semi-supervised
ix_unsupervised <- !grepl("pseudobulk", method_names) & !grepl("markers", method_names)
method_names_semisupervised <- method_names[!ix_unsupervised]
method_names_unsupervised <- method_names[ix_unsupervised]
```


## Export cluster labels

Export cluster labels for Shiny app.

```{r}
# write one spreadsheet per sample, with one row per spot and one column of cluster labels 
# per method
for (i in seq_along(sample_names)) {
    sample_id <- gsub("^sample_", "", sample_names[i])
    
    # rows for this sample, with a spot key (sample ID and spot name) and the ground truth as factor
    out <- filter(as_tibble(d_plot), sample_name == sample_names[i])
    out <- mutate(
        out, 
        key = paste(gsub("^sample_", "", sample_name), spot_name, sep = "_"), 
        ground_truth = factor(truth, levels = c(paste0("Layer_", 1:6), "WM"))
    )
    out <- select(out, c("key", "method", "ground_truth", "cluster"))
    
    # reshape to wide format: one column per method (in the order of 'method_names')
    out <- mutate(out, method = factor(method, levels = method_names))
    out <- spread(out, key = method, value = cluster)
    
    # check that rows are in the same order as the spots of this sample in the "sce" object
    sce_sub <- sce[, colData(sce)$sample_name == sample_id]
    stopifnot(nrow(out) == ncol(sce_sub))
    stopifnot(all(out$key == colData(sce_sub)$key))
    
    file_out <- paste0("../outputs/SpatialDE_clustering/cluster_labels_", 
                       sample_id, ".csv")
    write.csv(out, file_out, row.names = FALSE)
}
```


## Plots: clustering

```{r}
# color palette: start from the 8-color "Set1" palette and replace entries 1, 5, 6, 7, 8
library(RColorBrewer)

set1 <- brewer.pal(8, "Set1")
accent <- brewer.pal(8, "Accent")

my_palette <- replace(
    set1, 
    c(1, 5, 6, 7, 8), 
    c(accent[6], "gold", "darkorange1", "gray10", "gray40")
)

my_palette

# convert the palette (which now mixes hex codes and color names) with gplots::col2hex
library(gplots)
my_palette <- col2hex(my_palette)

my_palette
```


```{r, fig.width = 11.5, fig.height = 5}
# helper: spatial scatter plot of spots colored by cluster label, with one panel per method 
# (2 rows of panels); axes and grid lines are removed since coordinates are only positional
plot_clustering <- function(d, title, colors) {
    ggplot(d, aes(x = x_coord, y = y_coord, color = cluster)) + 
        facet_wrap(~ method, nrow = 2) + 
        geom_point(size = 0.2) + 
        coord_fixed() + 
        scale_color_manual(values = colors) + 
        ggtitle(title) + 
        guides(color = guide_legend(override.aes = list(size = 2))) + 
        theme_bw() + 
        theme(axis.title.x = element_blank(), 
              axis.title.y = element_blank(), 
              axis.text.x = element_blank(), 
              axis.text.y = element_blank(), 
              axis.ticks.x = element_blank(), 
              axis.ticks.y = element_blank(), 
              panel.grid.major = element_blank(), 
              panel.grid.minor = element_blank())
}


# separate plots for each sample
p_clustering_unsupervised <- list()
p_clustering_semisupervised <- list()

# palette colors for the first 8 clusters, gray for any additional clusters
colors <- c(my_palette, rep("gray50", 8))

for (i in seq_along(sample_names)) {
    
    sample_id <- gsub("^sample_", "", sample_names[i])
    
    d_plot_sub <- d_plot[d_plot$sample_name == sample_names[i], ]
    
    d_plot_sub$method <- factor(d_plot_sub$method, levels = method_names)
    d_plot_sub$cluster <- as.factor(d_plot_sub$cluster)
    d_plot_sub$truth <- as.factor(d_plot_sub$truth)
    
    # separate plots for unsupervised and semisupervised
    d_plot_sub_unsupervised <- d_plot_sub[d_plot_sub$method %in% method_names_unsupervised, ]
    d_plot_sub_semisupervised <- d_plot_sub[d_plot_sub$method %in% method_names_semisupervised, ]
    
    
    p_clustering_unsupervised[[i]] <- plot_clustering(
        d_plot_sub_unsupervised, 
        title = paste0("Sample ", sample_id, ": Clustering (unsupervised)"), 
        colors = colors
    )
    
    print(p_clustering_unsupervised[[i]])
    
    # pass the plot explicitly instead of relying on the most recently displayed plot
    filename <- paste0("../plots/SpatialDE_clustering/clustering_unsupervised_sample_", 
                       sample_id, ".png")
    ggsave(filename, plot = p_clustering_unsupervised[[i]], width = 11.5, height = 5)
    
    
    p_clustering_semisupervised[[i]] <- plot_clustering(
        d_plot_sub_semisupervised, 
        title = paste0("Sample ", sample_id, ": Clustering (semi-supervised and markers)"), 
        colors = colors
    )
    
    print(p_clustering_semisupervised[[i]])
    
    # narrower file than for the unsupervised methods, since there are fewer panels
    filename <- paste0("../plots/SpatialDE_clustering/clustering_semisupervised_sample_", 
                       sample_id, ".png")
    ggsave(filename, plot = p_clustering_semisupervised[[i]], width = 8, height = 5)
}
```


```{r}
# supplementary figures for manuscript (ground truth and sample 151673; copied from above)
# note: file.copy() is used instead of a shell 'cp' call so that this does not depend on the 
# operating system, and stops with an error if a file could not be copied
stopifnot(file.copy(
    "../plots/SpatialDE_clustering/clustering_unsupervised_sample_151673.png", 
    "../plots/supp_figures/supp_clustering_unsupervised_151673.png", 
    overwrite = TRUE
))
stopifnot(file.copy(
    "../plots/SpatialDE_clustering/clustering_semisupervised_sample_151673.png", 
    "../plots/supp_figures/supp_clustering_semisupervised_151673.png", 
    overwrite = TRUE
))


# supplementary figures for manuscript (all samples; formatted as 2 samples per page)
# page i contains samples 2i-1 and 2i (the last page has a single sample if the number of 
# samples is odd)
pages <- unname(split(seq_along(sample_names), ceiling(seq_along(sample_names) / 2)))

for (i in seq_along(pages)) {
    samples_page <- sample_names[pages[[i]]]
    
    # for each sample on this page: unsupervised plot followed by semisupervised plot 
    # (as.vector() on the 2-row matrix interleaves the two sets of file names)
    files_page <- as.vector(rbind(
        paste0("../plots/SpatialDE_clustering/clustering_unsupervised_", samples_page, ".png"), 
        paste0("../plots/SpatialDE_clustering/clustering_semisupervised_", samples_page, ".png")
    ))
    
    file_out <- paste0("../plots/supp_figures/supp_clustering_page_", i, ".pdf")
    image_write(image_join(lapply(files_page, image_read)), path = file_out, format = "pdf")
}


# supplementary figures for manuscript (all samples; alternative version formatted as a single file)
files_unsupervised <- 
    paste0("../plots/SpatialDE_clustering/clustering_unsupervised_", sample_names, ".png")
files_semisupervised <- 
    paste0("../plots/SpatialDE_clustering/clustering_semisupervised_", sample_names, ".png")

f <- c(files_unsupervised, files_semisupervised)

# same order as above: for each sample, unsupervised plot followed by semisupervised plot
f_interleaved <- as.vector(rbind(files_unsupervised, files_semisupervised))

file_out <- paste0("../plots/supp_figures/SFileXX_clustering.pdf")

image_write(image_join(lapply(f_interleaved, image_read)), path = file_out, format = "pdf")
```


## Plots: ground truth

```{r, fig.width = 3.75, fig.height = 3.25}
# one ground truth plot per sample: spots at their x-y coordinates, colored by manually 
# annotated layer; spots without a ground truth label are not shown
p_ground_truth <- list()

for (i in seq_along(sample_names)) {
    
    sample_id <- gsub("^sample_", "", sample_names[i])
    
    d_plot_sub <- d_plot[d_plot$sample_name == sample_names[i], ]
    
    d_plot_sub$method <- factor(d_plot_sub$method, levels = method_names)
    d_plot_sub$cluster <- as.factor(d_plot_sub$cluster)
    d_plot_sub$truth <- as.factor(d_plot_sub$truth)
    
    # ground truth is the same for each method, so use the rows of the first method only; 
    # rows containing missing values are removed
    d_plot_truth <- na.omit(d_plot_sub[d_plot_sub$method == method_names[1], ])
    
    # axes and grid lines are removed since coordinates are only positional
    blank <- element_blank()
    
    p_ground_truth[[i]] <- 
        ggplot(d_plot_truth, aes(x = x_coord, y = y_coord, color = truth)) + 
            geom_point(size = 0.6) + 
            coord_fixed() + 
            scale_color_manual(values = colors) + 
            ggtitle(paste0("Sample ", sample_id, ": Ground truth")) + 
            guides(color = guide_legend(override.aes = list(size = 2))) + 
            theme_bw() + 
            theme(legend.title = blank, 
                  axis.title.x = blank, 
                  axis.title.y = blank, 
                  axis.text.x = blank, 
                  axis.text.y = blank, 
                  axis.ticks.x = blank, 
                  axis.ticks.y = blank, 
                  panel.grid.major = blank, 
                  panel.grid.minor = blank)
    
    print(p_ground_truth[[i]])
    
    filename <- paste0("../plots/SpatialDE_clustering/ground_truth_sample_", 
                       sample_id, ".png")
    ggsave(filename, plot = p_ground_truth[[i]], width = 3.75, height = 3.25)
}
```


```{r}
# combined version for supplementary figures for manuscript (using 'patchwork')

# NOTE(review): this adds a second point layer (size = 0.1) on top of the existing layer of 
# each plot; it does not change the point size of the existing layer -- confirm this is intended
p_ground_truth <- lapply(p_ground_truth, function(p) {
   p + geom_point(size = 0.1)
})

# combine all per-sample plots into one figure with 3 rows 
# (works for any number of plots, instead of listing each plot by index)
p_truth_combined <- Reduce(`+`, p_ground_truth) + plot_layout(nrow = 3)

# pass the combined plot explicitly instead of relying on the most recently created plot
file_out <- paste0("../plots/supp_figures/supp_clustering_ground_truth.pdf")
ggsave(file_out, plot = p_truth_combined, width = 16, height = 10)

file_out <- paste0("../plots/supp_figures/supp_clustering_ground_truth.png")
ggsave(file_out, plot = p_truth_combined, width = 16, height = 10)
```


## Plots: Adjusted Rand Index (ARI)

```{r}
# order method names and types for ARI plots
method_names

method_levels <- c(
    "SpatialDE_PCA", "SpatialDE_pool_PCA", "HVG_PCA", "pseudobulk_PCA", "markers_PCA", 
    "SpatialDE_PCA_spatial", "SpatialDE_pool_PCA_spatial", "HVG_PCA_spatial", "pseudobulk_PCA_spatial", "markers_PCA_spatial", 
    "SpatialDE_UMAP", "SpatialDE_pool_UMAP", "HVG_UMAP", "pseudobulk_UMAP", "markers_UMAP", 
    "SpatialDE_UMAP_spatial", "SpatialDE_pool_UMAP_spatial", "HVG_UMAP_spatial", "pseudobulk_UMAP_spatial", "markers_UMAP_spatial"
)

# method types for plotting: derived from each method name, so that the types remain correct 
# if the ordering of 'method_names' changes

# "pseudobulk" methods are semisupervised, "markers" methods use known markers, others are unsupervised
method_types_supervision <- factor(
    ifelse(grepl("pseudobulk", method_names), "semisupervised", 
           ifelse(grepl("markers", method_names), "markers", "unsupervised")), 
    levels = c("unsupervised", "semisupervised", "markers"))
# dimension reduction used for clustering: UMAP if the name contains "_UMAP", otherwise PCA
method_types_dimension_reduction <- factor(
    ifelse(grepl("_UMAP", method_names), "UMAP", "PCA"), levels = c("PCA", "UMAP"))
# whether the method uses SpatialDE genes (name starts with "SpatialDE")
method_types_SpatialDE <- factor(
    ifelse(grepl("^SpatialDE", method_names), "SpatialDE", "other"), levels = c("SpatialDE", "other"))
# whether spatial coordinates were added as features (name ends with "_spatial")
method_types_spatial_coords <- factor(
    ifelse(grepl("_spatial$", method_names), "spatial", "nonspatial"), levels = c("nonspatial", "spatial"))

# one row per method; unnamed columns are named after the variables above
method_types <- data.frame(
    method = factor(method_names, levels = method_levels), 
    method_types_supervision, 
    method_types_dimension_reduction, 
    method_types_SpatialDE, 
    method_types_spatial_coords
)
rownames(method_types) <- method_types$method
```


```{r, fig.width = 8, fig.height = 5}
# one ARI plot per sample: ARI of each method, in panels by dimension reduction (rows) and 
# type of supervision (columns); point shape shows whether spatial coordinates were used
p_ARI <- list()

# method names with PCA and UMAP removed (these are shown as panels instead), in plotting order
levels_keep <- c(
    "SpatialDE", "SpatialDE_spatial", "SpatialDE_pool", "SpatialDE_pool_spatial", "HVG", "HVG_spatial", 
    "pseudobulk", "pseudobulk_spatial", "markers", "markers_spatial"
)

# columns kept for the ARI plots (ARI has a single value per sample and method)
cols_ARI <- c(
    "sample_name", "method", "ARI", 
    "method_types_supervision", "method_types_dimension_reduction", 
    "method_types_SpatialDE", "method_types_spatial_coords"
)

for (i in seq_along(sample_names)) {
    
    sample_id <- gsub("^sample_", "", sample_names[i])
    
    # rows for this sample, with the method types added as columns
    d_plot_sub <- merge(
        d_plot[d_plot$sample_name == sample_names[i], ], method_types, 
        by = "method", all = FALSE, sort = FALSE
    )
    d_plot_sub$method <- factor(d_plot_sub$method, levels = method_levels)
    
    # remove PCA and UMAP from method names and re-order levels
    d_plot_sub$method <- factor(
        gsub("_UMAP", "", gsub("_PCA", "", d_plot_sub$method)), 
        levels = levels_keep
    )
    
    # reduce from one row per spot to one row per method
    d_plot_ARI <- distinct(d_plot_sub[, cols_ARI])
    
    p_ARI[[i]] <- 
        ggplot(d_plot_ARI, aes(x = method, y = ARI, 
                               color = method, shape = method_types_spatial_coords)) + 
            facet_grid(method_types_dimension_reduction ~ method_types_supervision, 
                       scales = "free", space = "free_x") + 
            geom_point(stroke = 1.5) + 
            scale_color_manual(values = brewer.pal(10, "Paired")) + 
            scale_shape_manual(values = c(19, 4), 
                               labels = c("without spatial coordinates", "with spatial coordinates")) + 
            ylim(c(-0.1, 0.5)) + 
            ggtitle(paste0("Adjusted Rand Index: sample ", sample_id)) + 
            theme_bw() + 
            theme(axis.text.x = element_blank(), 
                  axis.title.x = element_blank(), 
                  axis.ticks.x = element_blank(), 
                  legend.title = element_blank())
    
    print(p_ARI[[i]])
    
    filename <- paste0("../plots/SpatialDE_clustering/ARI_sample_", 
                       sample_id, ".pdf")
    ggsave(filename, plot = p_ARI[[i]], width = 8, height = 5)
}
```


```{r}
# for final plot: combined boxplots showing all samples for each method
d_plot_ARI_all <- merge(d_plot, method_types, by = "method", all = FALSE, sort = FALSE)
d_plot_ARI_all$method <- factor(d_plot_ARI_all$method, levels = method_levels)

# NOTE(review): this directory ("../outputs/clustering") differs from the directory used for 
# the cluster label spreadsheets ("../outputs/SpatialDE_clustering") -- confirm this is intended
save(d_plot_ARI_all, file = "../outputs/clustering/ARI.RData")

#load("../outputs/clustering/ARI.RData")


# linear model testing for differences in ARI with vs. without spatial coords

# one row per sample and method (ARI has a single value per sample and method)
res_ARI <- distinct(
    d_plot_ARI_all[, c("method", "sample_name", "ARI", "method_types_spatial_coords")]
)
res_ARI$sample_name <- as.factor(res_ARI$sample_name)
# remove the "_spatial" suffix, so that the versions of each method with and without spatial 
# coordinates share the same method name
res_ARI$method <- factor(gsub("_spatial$", "", res_ARI$method), 
                         levels = levels(res_ARI$method)[!grepl("_spatial$", levels(res_ARI$method))])
str(res_ARI)

# experimental design ('location' refers to spatially adjacent replicates)
# note: assumes 12 samples, with the sorted sample names grouped as 3 donors with 4 samples 
# each, and 2 locations with 2 samples each per donor
design <- tibble(
    sample_name = levels(res_ARI$sample_name), 
    donor = as.factor(paste0("donor_", rep(1:3, each = 4))), 
    location = as.factor(paste0("location_", rep(c(1, 1, 2, 2), 3)))
)
design$donor_location <- as.factor(paste(design$donor, design$location, sep = "_"))
design

res_ARI_merged <- merge(res_ARI, design, by = "sample_name")

# fit linear models and calculate tests: one model per method, giving the p-value for the 
# effect of including spatial coordinates, adjusted for donor and location
methods_pvals <- levels(res_ARI_merged$method)
res_ARI_pvals <- setNames(rep(NA_real_, length(methods_pvals)), methods_pvals)

for (m in seq_along(methods_pvals)) {
    d_test <- filter(res_ARI_merged, method == methods_pvals[m])
    lm_tidy <- tidy(
        lm(ARI ~ method_types_spatial_coords + donor_location, data = d_test)
    )
    res_ARI_pvals[m] <- lm_tidy %>% 
        filter(term == "method_types_spatial_coordsspatial") %>% 
        pull("p.value")
    
    # alternatively: linear mixed models using 'nlme' (gives very similar p-values)
    #lme_out <- lme(ARI ~ method_types_spatial_coords, random = ~1|donor/location, data = d_test)
    #res_ARI_pvals[m] <- summary(lme_out)$tTable["method_types_spatial_coordsspatial", "p-value"]
}

res_ARI_pvals

# overall model (across all methods)
lm_tidy_overall <- tidy(
    lm(ARI ~ method_types_spatial_coords + donor_location + method, data = res_ARI_merged)
)
res_ARI_pval_overall <- lm_tidy_overall %>% 
    filter(term == "method_types_spatial_coordsspatial") %>% 
    pull("p.value")
res_ARI_pval_overall

# alternatively: linear mixed models using 'nlme' (gives very similar p-values)
#lme_out_overall <- lme(ARI ~ method_types_spatial_coords + method, random = ~1|donor/location, data = res_ARI_merged)
#res_ARI_pval_overall <- summary(lme_out_overall)$tTable["method_types_spatial_coordsspatial", "p-value"]
#res_ARI_pval_overall
```


```{r, fig.width = 8.75, fig.height = 5.25}
# plot: without p-value annotation

# remove PCA and UMAP from method names/levels (put in legend instead) and re-order levels
levels_keep <- c(
    "SpatialDE", "SpatialDE_spatial", "SpatialDE_pool", "SpatialDE_pool_spatial", "HVG", "HVG_spatial", 
    "pseudobulk", "pseudobulk_spatial", "markers", "markers_spatial"
)
d_plot_ARI_all$method <- gsub("_UMAP", "", gsub("_PCA", "", d_plot_ARI_all$method))
d_plot_ARI_all$method <- factor(d_plot_ARI_all$method, levels = levels_keep)

# keep only the columns needed for plotting and drop duplicated rows
d_plot_ARI_all <- distinct(
    d_plot_ARI_all[, c("sample_name", "method", "ARI", 
                       "method_types_supervision", "method_types_dimension_reduction", 
                       "method_types_SpatialDE", "method_types_spatial_coords")])

# line break in the facet label so that the strip text fits the narrow facet
d_plot_ARI_all$method_types_supervision <- recode(
    d_plot_ARI_all$method_types_supervision, semisupervised = "semi-\nsupervised")

# seed set before drawing so that the jittered point positions are reproducible
set.seed(123)
p_ARI_all <- 
    ggplot(d_plot_ARI_all, aes(x = method, y = ARI, 
                               color = method, shape = method_types_spatial_coords)) + 
        facet_grid(method_types_dimension_reduction ~ method_types_supervision, 
                   scales = "free", space = "free_x") + 
        geom_boxplot(width = 0.65, outlier.shape = NA, show.legend = FALSE) + 
        geom_jitter(stroke = 1.5, size = 0.5, width = 0.15, height = 0) + 
        scale_color_manual(values = brewer.pal(10, "Paired")) + 
        scale_shape_manual(values = c(19, 4), 
                           labels = c("without spatial coordinates", "with spatial coordinates")) + 
        ylim(c(-0.1, 0.5)) + 
        ylab("Adjusted Rand Index") + 
        guides(color = guide_legend(override.aes = list(size = 2.5)), 
               shape = guide_legend(override.aes = list(size = 2.5))) + 
        theme_bw() + 
        theme(strip.text.x = element_text(size = 14), 
              strip.text.y = element_text(size = 14), 
              axis.title.y = element_text(size = 14), 
              axis.text.y = element_text(size = 12), 
              axis.text.x = element_blank(), 
              axis.title.x = element_blank(), 
              axis.ticks.x = element_blank(), 
              legend.title = element_blank(), 
              legend.text = element_text(size = 11))

print(p_ARI_all)

# pass the plot explicitly instead of relying on ggplot2's "last plot" state
filename <- "../plots/SpatialDE_clustering/ARI_all_samples.pdf"
ggsave(filename, plot = p_ARI_all, width = 8.75, height = 5.25)

# save copy of final figure for manuscript
# (file.copy() instead of shelling out to 'cp': works without a Unix shell, and a 
# failed copy is reported instead of being silently ignored)
filename_figure <- "../plots/figure/Fig7_spatialDE_ARI.pdf"
copied <- file.copy(filename, filename_figure, overwrite = TRUE)
if (!copied) {
    warning("could not copy ", filename, " to ", filename_figure, call. = FALSE)
}
```


```{r, fig.width = 8.75, fig.height = 5.25}
# plot: add p-value annotation

# one annotation per facet/method position: 5 methods (without the "_spatial" 
# suffix) for each of the 2 dimension reduction types
ann_text <- tibble(
    method_types_supervision = factor(
        rep(c(rep("unsupervised", 3), "semi-\nsupervised", "markers"), 2), 
        levels = c("unsupervised", "semi-\nsupervised", "markers")), 
    method_types_dimension_reduction = factor(
        rep(c("PCA", "UMAP"), each = 5), 
        levels = c("PCA", "UMAP")), 
    method = factor(rep(c("SpatialDE", "SpatialDE_pool", "HVG", "pseudobulk", "markers"), 2), 
                    levels = c("SpatialDE", "SpatialDE_pool", "HVG", "pseudobulk", "markers"))
)

# match p-values to annotation rows by name ("<method>_<PCA|UMAP>") instead of by 
# position, so that a p-value cannot end up in the wrong facet if the order of the 
# method levels differs from the order of the rows above
pval_keys <- paste(ann_text$method, ann_text$method_types_dimension_reduction, sep = "_")
if (all(pval_keys %in% names(res_ARI_pvals))) {
    pvals_ordered <- res_ARI_pvals[pval_keys]
} else {
    # names do not follow the expected pattern: fall back to positional matching
    warning("p-value names do not match '<method>_<PCA|UMAP>'; ", 
            "matching p-values to facets by position", call. = FALSE)
    pvals_ordered <- res_ARI_pvals
}

p_vals_fmt <- paste0("p = ", sprintf("%.3f", pvals_ordered))
ann_text$text <- p_vals_fmt

# 'shape = NULL' removes the shape aesthetic inherited from the main plot, since 
# the annotation data has no spatial coordinates column
p_ARI_annot <- 
    p_ARI_all + 
        geom_text(data = ann_text, 
                  aes(x = method, y = 0.46, shape = NULL, label = text), 
                  nudge_x = 0.5, 
                  color = "black", 
                  size = 4.5)

print(p_ARI_annot)

# pass the plot explicitly instead of relying on ggplot2's "last plot" state
filename <- "../plots/SpatialDE_clustering/ARI_all_samples_pvals.pdf"
ggsave(filename, plot = p_ARI_annot, width = 8.75, height = 5.25)

# save copy of final figure for manuscript
# (file.copy() instead of shelling out to 'cp': works without a Unix shell, and a 
# failed copy is reported instead of being silently ignored)
filename_figure <- "../plots/figure/Fig7_spatialDE_ARI_pvals.pdf"
copied <- file.copy(filename, filename_figure, overwrite = TRUE)
if (!copied) {
    warning("could not copy ", filename, " to ", filename_figure, call. = FALSE)
}
```


## Plots: Normalized Mutual Information (NMI)

```{r, fig.width = 8, fig.height = 5}
# separate plots for each sample
p_NMI <- vector("list", length(sample_names))

# remove PCA and UMAP from method names/levels (put in legend instead) and re-order levels
# (identical for every sample, so defined once outside the loop)
levels_keep <- c(
    "SpatialDE", "SpatialDE_spatial", "SpatialDE_pool", "SpatialDE_pool_spatial", "HVG", "HVG_spatial", 
    "pseudobulk", "pseudobulk_spatial", "markers", "markers_spatial"
)

for (i in seq_along(sample_names)) {
    
    d_plot_sub <- d_plot[d_plot$sample_name == sample_names[i], ]
    d_plot_sub <- merge(d_plot_sub, method_types, by = "method", all = FALSE, sort = FALSE)
    d_plot_sub$method <- factor(d_plot_sub$method, levels = method_levels)
    
    d_plot_sub$method <- gsub("_UMAP", "", gsub("_PCA", "", d_plot_sub$method))
    d_plot_sub$method <- factor(d_plot_sub$method, levels = levels_keep)
    
    # keep only the columns needed for plotting and drop duplicated rows
    d_plot_NMI <- distinct(
        d_plot_sub[, c("sample_name", "method", "NMI", 
                       "method_types_supervision", "method_types_dimension_reduction", 
                       "method_types_SpatialDE", "method_types_spatial_coords")]
    )
    
    p_NMI[[i]] <- 
        ggplot(d_plot_NMI, aes(x = method, y = NMI, 
                               color = method, shape = method_types_spatial_coords)) + 
            facet_grid(method_types_dimension_reduction ~ method_types_supervision, 
                       scales = "free", space = "free_x") + 
            geom_point(stroke = 1.5) + 
            scale_color_manual(values = brewer.pal(10, "Paired")) + 
            scale_shape_manual(values = c(19, 4), 
                               labels = c("without spatial coordinates", "with spatial coordinates")) + 
            ylim(c(0, 0.6)) + 
            ggtitle(paste0("Normalized Mutual Information: sample ", gsub("^sample_", "", sample_names[i]))) + 
            theme_bw() + 
            theme(axis.text.x = element_blank(), 
                  axis.title.x = element_blank(), 
                  axis.ticks.x = element_blank(), 
                  legend.title = element_blank())
    
    # per-sample plots are saved to file only, not displayed in the report 
    # (a bare 'p_NMI[[i]]' inside a loop does not print, so it has been removed)
    #print(p_NMI[[i]])
    
    # pass the plot explicitly instead of relying on ggplot2's "last plot" state
    filename <- paste0("../plots/SpatialDE_clustering/NMI_sample_", 
                       gsub("^sample_", "", sample_names[i]), ".pdf")
    ggsave(filename, plot = p_NMI[[i]], width = 8, height = 5)
}
```


```{r, fig.width = 8, fig.height = 5}
# combined boxplots showing all samples for each method

# attach the method type annotations and fix the order of the method levels
d_plot_NMI_all <- merge(d_plot, method_types, by = "method", all = FALSE, sort = FALSE)
d_plot_NMI_all$method <- factor(d_plot_NMI_all$method, levels = method_levels)

# saved before the method names are shortened for plotting
save(d_plot_NMI_all, file = "../outputs/clustering/NMI.RData")

# shorten method names: "_PCA" / "_UMAP" are shown as facets instead, 
# and the levels are re-ordered for the x-axis
levels_keep <- c(
    "SpatialDE", "SpatialDE_spatial", "SpatialDE_pool", "SpatialDE_pool_spatial", "HVG", "HVG_spatial", 
    "pseudobulk", "pseudobulk_spatial", "markers", "markers_spatial"
)
method_short <- gsub("_PCA", "", d_plot_NMI_all$method)
method_short <- gsub("_UMAP", "", method_short)
d_plot_NMI_all$method <- factor(method_short, levels = levels_keep)

# columns needed for the plot; duplicated rows are dropped
cols_NMI <- c(
    "sample_name", "method", "NMI", 
    "method_types_supervision", "method_types_dimension_reduction", 
    "method_types_SpatialDE", "method_types_spatial_coords"
)
d_plot_NMI_all <- distinct(d_plot_NMI_all[, cols_NMI])

# plot components
shape_labels_NMI <- c("without spatial coordinates", "with spatial coordinates")
legend_keys_NMI <- guides(
    color = guide_legend(override.aes = list(size = 2.5)), 
    shape = guide_legend(override.aes = list(size = 2.5))
)
theme_NMI <- theme(
    axis.text.x = element_blank(), 
    axis.title.x = element_blank(), 
    axis.ticks.x = element_blank(), 
    legend.title = element_blank()
)

# seed set before drawing so that the jittered point positions are reproducible
set.seed(123)
p_NMI_all <- ggplot(
    d_plot_NMI_all, 
    aes(x = method, y = NMI, color = method, shape = method_types_spatial_coords)
) + 
    facet_grid(
        method_types_dimension_reduction ~ method_types_supervision, 
        scales = "free", space = "free_x"
    ) + 
    geom_boxplot(width = 0.65, outlier.shape = NA, show.legend = FALSE) + 
    geom_jitter(stroke = 1.5, size = 0.5, width = 0.15, height = 0) + 
    scale_color_manual(values = brewer.pal(10, "Paired")) + 
    scale_shape_manual(values = c(19, 4), labels = shape_labels_NMI) + 
    ylim(c(0, 0.6)) + 
    ggtitle("Normalized Mutual Information: all samples") + 
    legend_keys_NMI + 
    theme_bw() + 
    theme_NMI

print(p_NMI_all)

filename <- "../plots/SpatialDE_clustering/NMI_all_samples.pdf"
ggsave(filename, plot = p_NMI_all, width = 8, height = 5)
```


## Plots: summary plot

Summary plot for sample 151673 showing:

- gold standard (Kristen's manual annotation)
- clustering using marker genes (best result from this set of methods) (from Kristen; 77 out of 81 genes)
- unsupervised clustering (best result from this set of methods, i.e. SpatialDE_PCA_spatial)
- semisupervised clustering (best result from this set of methods) (from Leo's analyses; 198 genes)

```{r, fig.width = 13, fig.height = 3.5}
# color palette
# NOTE(review): 'my_palette' must already exist when this chunk runs; the only 
# definition visible in this part of the file is in the later "color palette" 
# section -- TODO confirm it is also defined earlier in the file
colors <- my_palette

# select sample 151673
i <- 9

d_plot_sub <- d_plot[d_plot$sample_name == sample_names[i], ]

d_plot_sub$method <- factor(d_plot_sub$method, levels = method_names)
d_plot_sub$cluster <- as.factor(d_plot_sub$cluster)
d_plot_sub$truth <- as.factor(d_plot_sub$truth)

# select methods
d_plot_sub_semisupervised_best <- d_plot_sub[d_plot_sub$method == "pseudobulk_PCA_spatial", ]
d_plot_sub_unsupervised_best <- d_plot_sub[d_plot_sub$method == "SpatialDE_PCA_spatial", ]
d_plot_sub_markers_best <- d_plot_sub[d_plot_sub$method == "markers_PCA_spatial", ]

# ground truth panel: rows of a single method are used so that each spot appears 
# once; rows with missing values are removed and the facet label is replaced
d_plot_truth_combined <- d_plot_sub[d_plot_sub$method == "pseudobulk_PCA_spatial", ]
d_plot_truth_combined <- na.omit(d_plot_truth_combined)
d_plot_truth_combined$method <- factor("ground_truth")


# helper: spot-level plot in x-y coordinates with axes and grid lines removed
# - d: data frame containing the columns used in 'mapping' and a 'method' column
# - mapping: aesthetic mapping created with aes()
# - palette: vector of colors passed to scale_color_manual()
# - title, subtitle: plot title and subtitle
plot_spots_summary <- function(d, mapping, palette, title, subtitle) {
    ggplot(d, mapping) + 
        facet_wrap(~ method) + 
        geom_point(size = 0.3, alpha = 1) + 
        coord_fixed() + 
        scale_color_manual(values = palette) + 
        ggtitle(title, subtitle = subtitle) + 
        theme_bw() + 
        theme(axis.title.x = element_blank(), 
              axis.title.y = element_blank(), 
              axis.text.x = element_blank(), 
              axis.text.y = element_blank(), 
              axis.ticks.x = element_blank(), 
              axis.ticks.y = element_blank(), 
              panel.grid.major = element_blank(), 
              panel.grid.minor = element_blank())
}

subtitle_sample <- paste0("sample ", gsub("^sample_", "", sample_names[i]))

# re-build individual plots without facets
# (palette entries are re-ordered separately for each clustering result; 
# alternative: the same indices applied to brewer.pal(8, "Paired"))
p_clustering_semisupervised_best <- plot_spots_summary(
    d_plot_sub_semisupervised_best, 
    aes(x = x_coord, y = y_coord, color = cluster), 
    colors[c(4, 2, 3, 5, 8, 7, 6, 1)], 
    "Semisupervised clustering", subtitle_sample
)

p_clustering_unsupervised_best <- plot_spots_summary(
    d_plot_sub_unsupervised_best, 
    aes(x = x_coord, y = y_coord, color = cluster), 
    colors[c(3, 4, 2, 1, 8, 6, 5, 7)], 
    "Unsupervised clustering", subtitle_sample
)

p_clustering_markers_best <- plot_spots_summary(
    d_plot_sub_markers_best, 
    aes(x = x_coord, y = y_coord, color = cluster), 
    colors[c(3, 5, 7, 6, 1, 2, 4, 8)], 
    "Markers clustering", subtitle_sample
)

p_ground_truth_combined <- plot_spots_summary(
    d_plot_truth_combined, 
    aes(x = x_coord, y = y_coord, color = truth), 
    colors, 
    "Ground truth", subtitle_sample
)


# combine plots using 'patchwork' package
p_combined <- p_ground_truth_combined | p_clustering_semisupervised_best | p_clustering_unsupervised_best | p_clustering_markers_best

print(p_combined)

# pass the combined plot explicitly instead of relying on ggplot2's "last plot" state
filename <- paste0("../plots/SpatialDE_clustering/clustering_summary_sample_", 
                   gsub("^sample_", "", sample_names[i]), ".png")
ggsave(filename, plot = p_combined, width = 13, height = 3.5)
```


## Plots: manuscript

Plots with additional formatting for manuscript.

```{r, fig.width = 3.75, fig.height = 3}
# save panels for final figure (with custom formatting)

# legend labels and theme settings for panel a
panel_a_labels <- c(paste0("Layer ", 1:6), "WM", "NA")
panel_a_theme <- theme(
    plot.title = element_text(size = 16), 
    legend.text = element_text(size = 12), 
    legend.title = element_blank(), 
    strip.background = element_blank(), 
    strip.text.x = element_blank()
)

# ground truth plot with facets removed, smaller points added on top, 
# and enlarged legend keys
panel_a_ground_truth <- p_ground_truth_combined + 
    facet_wrap(NULL) + 
    geom_point(size = 0.15) + 
    scale_color_manual(values = colors, labels = panel_a_labels) + 
    ggtitle("Ground truth", subtitle = NULL) + 
    guides(color = guide_legend(override.aes = list(size = 2.5))) + 
    panel_a_theme

print(panel_a_ground_truth)
ggsave(
    "../plots/figure/Fig7_spatialDE_ground_truth.pdf", 
    plot = panel_a_ground_truth, width = 3.75, height = 3
)
```


```{r, fig.width = 3.4, fig.height = 9.25}
# panel b: the three clustering results stacked vertically

# theme settings shared by all three sub-panels: title size, no facet strip
panel_b_theme <- theme(
    plot.title = element_text(size = 15), 
    strip.background = element_blank(), 
    strip.text.x = element_blank()
)
panel_b_no_legend <- theme(legend.position = "none")

# (i) unsupervised: no legend
panel_b_i <- p_clustering_unsupervised_best + 
    geom_point(size = 0.15) + 
    ggtitle("Unsupervised", subtitle = NULL) + 
    panel_b_theme + 
    panel_b_no_legend

# (ii) semi-supervised: the only sub-panel with a legend; legend keys are drawn 
# with the palette in its original order
panel_b_ii <- p_clustering_semisupervised_best + 
    geom_point(size = 0.15) + 
    ggtitle("Semi-supervised", subtitle = NULL) + 
    guides(color = guide_legend(override.aes = list(color = colors, size = 2.5))) + 
    panel_b_theme + 
    theme(legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12))

# (iii) markers: no legend
panel_b_iii <- p_clustering_markers_best + 
    geom_point(size = 0.15) + 
    ggtitle("Markers", subtitle = NULL) + 
    panel_b_theme + 
    panel_b_no_legend

# using 'patchwork' package
panel_b_clustering <- panel_b_i / panel_b_ii / panel_b_iii
print(panel_b_clustering)
ggsave(
    "../plots/figure/Fig7_spatialDE_clustering.pdf", 
    plot = panel_b_clustering, width = 3.4, height = 9.25
)
```


```{r, fig.width = 2.25, fig.height = 2.5}
# figure for schematic: ground truth (panel a without title and legend)
p_schematic_ground_truth <- panel_a_ground_truth + 
    theme(plot.title = element_blank(), legend.position = "none")
print(p_schematic_ground_truth)
ggsave(
    "../plots/figure/schematic/schematic_ground_truth.pdf", 
    plot = p_schematic_ground_truth, width = 2.25, height = 2.5
)
```


```{r, fig.width = 2.25, fig.height = 2.5}
# figure for schematic: clustering (one layer only)
# color scale replaced so that entries 4 and 5 are red and all others light gray
colors_one_layer <- rep(c("gray90", "red", "gray90"), times = c(3, 2, 2))
p_schematic_one_layer <- panel_a_ground_truth + 
    scale_color_manual(values = colors_one_layer) + 
    theme(plot.title = element_blank(), legend.position = "none")
print(p_schematic_one_layer)
ggsave(
    "../plots/figure/schematic/schematic_clustering_one_layer.pdf", 
    plot = p_schematic_one_layer, width = 2.25, height = 2.5
)
```


```{r, fig.width = 2.25, fig.height = 2.5}
# figure for schematic: clustering (semi-supervised panel without title and legend)
p_schematic_clustering <- panel_b_ii + 
    theme(plot.title = element_blank(), legend.position = "none")
print(p_schematic_clustering)
ggsave(
    "../plots/figure/schematic/schematic_clustering.pdf", 
    plot = p_schematic_clustering, width = 2.25, height = 2.5
)
```


## Plots: color palette

Examples of color palettes for clustering plots.

```{r, fig.width = 2.25, fig.height = 2.5}
# color palette examples

# helper: replace the color scale of plot 'p', remove titles and legend, 
# display the result, and save it as a 2.25 x 2.5 PDF
show_palette_example <- function(p, values, labels, file) {
    p_example <- p + 
        scale_color_manual(values = values, labels = labels) + 
        theme(title = element_blank(), 
              legend.position = "none")
    print(p_example)
    ggsave(file, plot = p_example, width = 2.25, height = 2.5)
    invisible(p_example)
}

# legend labels for the ground truth panel
labels_layers <- c(paste0("Layer ", 1:6), "White matter", "NA")

# built-in RColorBrewer palettes
show_palette_example(
    panel_a_ground_truth, brewer.pal(8, "Accent"), labels_layers, 
    "../plots/color_palette/color_palette_Accent.pdf"
)
show_palette_example(
    panel_a_ground_truth, brewer.pal(8, "Set1"), labels_layers, 
    "../plots/color_palette/color_palette_Set1.pdf"
)


# custom palette
library(RColorBrewer)

set1 <- brewer.pal(8, "Set1")
accent <- brewer.pal(8, "Accent")

# start from "Set1" and replace entries 1, 5, 6, 7, and 8
my_palette <- set1
my_palette[c(1, 5, 6, 7, 8)] <- c(accent[6], "gold", "darkorange1", "gray10", "gray40")

my_palette

# convert the named colors to hex codes
library(gplots)
my_palette <- col2hex(my_palette)

my_palette


# custom palette applied to the ground truth panel ...
show_palette_example(
    panel_a_ground_truth, my_palette, labels_layers, 
    "../plots/color_palette/palette_7colors.pdf"
)

# ... and, re-ordered, to the semi-supervised clustering panel
show_palette_example(
    panel_b_ii, my_palette[c(4, 2, 3, 5, 8, 7, 6, 1)], c(paste0("cluster", 1:8)), 
    "../plots/color_palette/palette_8colors.pdf"
)
```


## Session information

```{r}
# print R version, platform, and attached/loaded package versions for reproducibility
sessionInfo()
```