ST MOB data analysis

This vignette introduces the CAESAR.Suite workflow for the analysis of MOB ST spatial transcriptomics dataset. In this vignette, the workflow of CAESAR.Suite consists of four steps

Load and quality control both reference and target data

We demonstrate the use of CAESAR to MOB data, which can be downloaded and load to the current working path by the following command:

githubURL <- "https://github.com/XiaoZhangryy/CAESAR.Suite/blob/master/vignettes_data/MOB_ST.rda?raw=true"
MOB_ST_file <- file.path(tempdir(), "MOB_ST.rda")
download.file(githubURL, MOB_ST_file, mode='wb')
load(MOB_ST_file)

print(MOB_ST)

githubURL <- "https://github.com/XiaoZhangryy/CAESAR.Suite/blob/master/vignettes_data/MOB_scRNA.rda?raw=true"
MOB_scRNA_file <- file.path(tempdir(), "MOB_scRNA.rda")
download.file(githubURL, MOB_scRNA_file, mode='wb')
load(MOB_scRNA_file)

print(MOB_scRNA)

The package can be loaded with the command:

set.seed(1) # set a random seed for reproducibility.
library(CAESAR.Suite) # load the package of CAESAR method

##

##

library(Seurat)

## Loading required package: SeuratObject

## Loading required package: sp

## 
## Attaching package: 'SeuratObject'

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(ggplot2)

Users can perform appropriate quality control on the reference dataset and target data. Genes expressed in fewer than one cell were removed. Other quality control steps can be set by the user according to the data quality. Here, cells with less than five genes and genes expressed in less than one cell were excluded.

MOB_ST <- CreateSeuratObject(
    counts = MOB_ST@assays$RNA@counts,
    meta.data = MOB_ST@meta.data,
    min.features = 5,
    min.cells = 1
)

print(MOB_ST)

MOB_scRNA <- CreateSeuratObject(
    counts = MOB_scRNA@assays$RNA@counts,
    meta.data = MOB_scRNA@meta.data,
    min.features = 5,
    min.cells = 1
)

print(MOB_scRNA)

Preprocessing and match reference and target data

First, we normalize the data and select the variable genes. We matched the genes and variable genes between the reference and target datasets.

# match genes
common_genes <- intersect(rownames(MOB_ST), rownames(MOB_scRNA))
MOB_ST <- MOB_ST[common_genes, ]
MOB_scRNA <- MOB_scRNA[common_genes, ]

print(length(common_genes))

MOB_ST <- NormalizeData(MOB_ST)
MOB_ST <- FindVariableFeatures(MOB_ST, nfeatures = 2000)

MOB_scRNA <- NormalizeData(MOB_scRNA)
MOB_scRNA <- FindVariableFeatures(MOB_scRNA, nfeatures = 2000)

common_vgs <- intersect(VariableFeatures(MOB_ST), VariableFeatures(MOB_scRNA))
VariableFeatures(MOB_ST) <- common_vgs
VariableFeatures(MOB_scRNA) <- common_vgs

print(length(common_vgs))

Detect signature genes for each cell type using scRNA-seq reference data

We introduce how to use CAESAR to detect signature genes form scRNA-seq reference data. First, we calculate the co-embeddings.

MOB_scRNA <- CAESAR.coembedding(MOB_scRNA, q = 50)

Then, we detect signature genes.

# calculate cell-gene distance and identify signature genes
print(table(MOB_scRNA$CellType))

Idents(MOB_scRNA) <- MOB_scRNA$CellType
sg_sc_List <- find.sig.genes(MOB_scRNA, reduction.name = "caesar")

str(sg_sc_List)

Finally, select marker genes for each cell type from the signature gene list.

marker <- marker.select(sg_sc_List, overlap.max = 1)
print(marker)

Annotate the MOB ST data using CAESAR and marker genes from scRNA-seq reference data

Similarly, we first calculate co-embeddings for MOB ST dataset. The difference is that spatial transcriptome data has spatial coordinates information, so we can obtain spatial aware co-embeddings.

# the spatial coordinates
pos <- MOB_ST@meta.data[, c("x", "y")]
print(head(pos))

MOB_ST <- CAESAR.coembedding(MOB_ST, pos, reduction.name = "caesar", q = 50)
print(MOB_ST)

Subsequently, the CAESAR co-embeddings and marker genes from scRNA-seq reference data are used to annotate the MOB ST data.

# convert marker list to marker frequency matrix
marker.freq <- markerList2mat(list(marker))

# perform annotation using CAESAR and save results to Seurat object
print(colnames(MOB_ST@meta.data))

MOB_ST <- CAESAR.annotation(MOB_ST, marker.freq, reduction.name = "caesar", add.to.meta = TRUE)
print(colnames(MOB_ST@meta.data))

Downstream analysis

In the following, we visualize the CAESAR annotation results.

# set up colors
cols_manual <- setNames(
    c(
        "#4374A5", "#FCDDDE", "#2AB67F", "#F08A21", "#737373"
    ),
    c(
        "GCL", "MCL", "ONL", "GL", "Unknown"
    )
)

celltypes_manual <- c("GCL", "MCL", "ONL", "GL", "Unknown")

cols <- setNames(
    c(
        "#4374A5", "#FCDDDE", "#2AB673", "#F08A21", "#E04D50", "#737373"
    ),
    c(
        "GC", "M/TC", "OSNs", "PGC", "EPL-IN", "unassigned"
    )
)

celltypes <- c("GC", "M/TC", "OSNs", "PGC", "EPL-IN", "unassigned")

colnames(pos) <- paste0("pos", 1:2)
MOB_ST@reductions[["pos"]] <- CreateDimReducObject(
    embeddings = as.matrix(pos),
    key = paste0("pos", "_"), assay = "RNA"
)

First, we visualize the manual annotation.

Idents(MOB_ST) <- factor(MOB_ST$manual_annotation, levels = celltypes_manual)
DimPlot(MOB_ST, reduction = "pos", cols = cols_manual, pt.size = 8)

Then, we visualize the CAESAR annotation without account for ‘unassigned’.

Idents(MOB_ST) <- factor(MOB_ST$CAESAR, levels = celltypes)
DimPlot(MOB_ST, reduction = "pos", cols = cols, pt.size = 8)

And visualize the CAESAR annotation account for ‘unassigned’.

Idents(MOB_ST) <- factor(MOB_ST$CAESARunasg, levels = celltypes)
DimPlot(MOB_ST, reduction = "pos", cols = cols, pt.size = 8)

The confidence level of the CAESAR annotation can be visualized by

FeaturePlot(
    MOB_ST,
    reduction = "pos", features = "CAESARconf", pt.size = 8,
    cols = c("blue", "lightgrey"), min.cutoff = 0.0, max.cutoff = 1.0
)

CAESAR provides the cell mixing proportion for each cell type, which can be visualized by

caesar_prob <- colnames(MOB_ST@meta.data)[15:19]
print(caesar_prob)

plots <- lapply(caesar_prob, function(feature) {
    FeaturePlot(MOB_ST, features = feature, reduction = "pos", pt.size = 3.5) +
        scale_color_gradientn(
            colors = c("#f6eff7", "#feebe2", "#f768a1", "#7a0177", "#6e016b"),
            values = scales::rescale(c(0.0, 0.125, 0.25, 0.375, 0.50)),
            limits = c(0.0, 0.50)
        ) + labs(title = feature)
})

cowplot::plot_grid(plotlist = plots, ncol = 2)

The annotation accuracy is calculated by

acc_st <- function(manual_annotation, pred) {
    manual_annotation <- as.character(manual_annotation)
    pred <- as.character(pred)
    manual_annotation[manual_annotation == "GCL"] <- "GC"
    manual_annotation[manual_annotation == "MCL"] <- "M/TC"
    manual_annotation[manual_annotation == "ONL"] <- "OSNs"
    manual_annotation[manual_annotation == "GL"] <- "PGC"
    return(mean(manual_annotation == pred))
}

print(paste0(
    "The ACC of CAESAR annotation is ",
    acc_st(MOB_ST$manual_annotation, MOB_ST$CAESARunasg)
))

Next, we detect and visualize the signature genes for each cell type.

Idents(MOB_ST) <- factor(MOB_ST$CAESARunasg, celltypes)
sg_List <- find.sig.genes(MOB_ST)

str(sg_List)

We visualize the top three signature genes by a dot plot.

# obtain the top three signature genes
celltypes_plot <- setdiff(names(sg_List), "unassigned")
top3sgs <- Intsg(list(sg_List), 3)[celltypes_plot]
print(top3sgs)

sg_features <- unname(unlist(top3sgs))

DotPlot(
    MOB_ST,
    idents = celltypes_plot, col.min = -1, col.max = 2, dot.scale = 7,
    features = sg_features, scale.min = 0, scale.max = 30
) + theme(axis.text.x = element_text(face = "italic", angle = 45, vjust = 1, hjust = 1))

Next, we calculate the UMAP projections of co-embeddings of cells and the selected signature genes.

# calculate coumap
MOB_ST <- CoUMAP(
    MOB_ST, reduction = "caesar", reduction.name = "caesarUMAP",
    gene.set = sg_features
)

df_gene_label <- data.frame(
    gene = unlist(top3sgs),
    label = rep(names(top3sgs), each = 3)
)

CoUMAP.plot(
    MOB_ST, reduction = "caesarUMAP", gene_txtdata = df_gene_label,
    cols = c("gene" = "#000000", cols)
)

Session Info

sessionInfo()

## R Under development (unstable) (2025-02-28 r87848 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.0
## 
## locale:
## [1] LC_COLLATE=C                               
## [2] LC_CTYPE=Chinese (Simplified)_China.utf8   
## [3] LC_MONETARY=Chinese (Simplified)_China.utf8
## [4] LC_NUMERIC=C                               
## [5] LC_TIME=Chinese (Simplified)_China.utf8    
## 
## time zone: Asia/Shanghai
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.5.1      Seurat_5.2.1       SeuratObject_5.0.2 sp_2.2-0          
## [5] CAESAR.Suite_0.3.0
## 
## loaded via a namespace (and not attached):
##   [1] matrixStats_1.5.0           spatstat.sparse_3.1-0      
##   [3] httr_1.4.7                  RColorBrewer_1.1-3         
##   [5] tools_4.5.0                 sctransform_0.4.1          
##   [7] backports_1.5.0             R6_2.6.1                   
##   [9] lazyeval_0.2.2              uwot_0.2.3                 
##  [11] withr_3.0.2                 prettyunits_1.2.0          
##  [13] gridExtra_2.3               progressr_0.15.1           
##  [15] cli_3.6.4                   Biobase_2.67.0             
##  [17] spatstat.explore_3.3-4      fastDummies_1.7.5          
##  [19] sass_0.4.9                  mvtnorm_1.3-3              
##  [21] spatstat.data_3.1-4         proxy_0.4-27               
##  [23] ggridges_0.5.6              pbapply_1.7-2              
##  [25] harmony_1.2.3               scater_1.35.1              
##  [27] parallelly_1.42.0           readxl_1.4.3               
##  [29] rstudioapi_0.17.1           RSQLite_2.3.9              
##  [31] generics_0.1.3              gtools_3.9.5               
##  [33] ica_1.0-3                   spatstat.random_3.3-2      
##  [35] car_3.1-3                   dplyr_1.1.4                
##  [37] Matrix_1.7-2                ggbeeswarm_0.7.2           
##  [39] DescTools_0.99.59           S4Vectors_0.45.4           
##  [41] abind_1.4-8                 lifecycle_1.0.4            
##  [43] yaml_2.3.10                 CompQuadForm_1.4.3         
##  [45] carData_3.0-5               SummarizedExperiment_1.37.0
##  [47] SparseArray_1.7.6           BiocFileCache_2.15.1       
##  [49] Rtsne_0.17                  grid_4.5.0                 
##  [51] blob_1.2.4                  promises_1.3.2             
##  [53] crayon_1.5.3                GiRaF_1.0.1                
##  [55] miniUI_0.1.1.1              lattice_0.22-6             
##  [57] haven_2.5.4                 beachmat_2.23.6            
##  [59] cowplot_1.1.3               KEGGREST_1.47.0            
##  [61] pillar_1.10.1               knitr_1.49                 
##  [63] ProFAST_1.4                 GenomicRanges_1.59.1       
##  [65] boot_1.3-31                 gld_2.6.7                  
##  [67] future.apply_1.11.3         codetools_0.2-20           
##  [69] glue_1.8.0                  spatstat.univar_3.1-1      
##  [71] data.table_1.17.0           vctrs_0.6.5                
##  [73] png_0.1-8                   spam_2.11-1                
##  [75] org.Mm.eg.db_3.20.0         cellranger_1.1.0           
##  [77] gtable_0.3.6                cachem_1.1.0               
##  [79] xfun_0.51                   S4Arrays_1.7.3             
##  [81] mime_0.12                   survival_3.8-3             
##  [83] SingleCellExperiment_1.29.1 fitdistrplus_1.2-2         
##  [85] ROCR_1.0-11                 nlme_3.1-167               
##  [87] bit64_4.6.0-1               progress_1.2.3             
##  [89] filelock_1.0.3              RcppAnnoy_0.0.22           
##  [91] GenomeInfoDb_1.43.4         bslib_0.9.0                
##  [93] irlba_2.3.5.1               vipor_0.4.7                
##  [95] KernSmooth_2.23-26          colorspace_2.1-1           
##  [97] BiocGenerics_0.53.6         DBI_1.2.3                  
##  [99] ade4_1.7-23                 Exact_3.3                  
## [101] tidyselect_1.2.1            DR.SC_3.4                  
## [103] bit_4.5.0.1                 compiler_4.5.0             
## [105] curl_6.2.1                  httr2_1.1.0                
## [107] BiocNeighbors_2.1.2         expm_1.0-0                 
## [109] xml2_1.3.6                  DelayedArray_0.33.6        
## [111] plotly_4.10.4               scales_1.3.0               
## [113] lmtest_0.9-40               rappdirs_0.3.3             
## [115] stringr_1.5.1               digest_0.6.37              
## [117] goftest_1.2-3               spatstat.utils_3.1-2       
## [119] rmarkdown_2.29              XVector_0.47.2             
## [121] htmltools_0.5.8.1           pkgconfig_2.0.3            
## [123] MatrixGenerics_1.19.1       dbplyr_2.5.0               
## [125] fastmap_1.2.0               rlang_1.1.5                
## [127] htmlwidgets_1.6.4           ggthemes_5.1.0             
## [129] UCSC.utils_1.3.1            shiny_1.10.0               
## [131] farver_2.1.2                jquerylib_0.1.4            
## [133] zoo_1.8-13                  jsonlite_1.8.9             
## [135] BiocParallel_1.41.2         mclust_6.1.1               
## [137] BiocSingular_1.23.0         magrittr_2.0.3             
## [139] Formula_1.2-5               scuttle_1.17.0             
## [141] GenomeInfoDbData_1.2.13     dotCall64_1.2              
## [143] patchwork_1.3.0             munsell_0.5.1              
## [145] Rcpp_1.0.14                 viridis_0.6.5              
## [147] reticulate_1.41.0           furrr_0.3.1                
## [149] stringi_1.8.4               rootSolve_1.8.2.4          
## [151] MASS_7.3-65                 org.Hs.eg.db_3.20.0        
## [153] plyr_1.8.9                  parallel_4.5.0             
## [155] PRECAST_1.6.5               listenv_0.9.1              
## [157] ggrepel_0.9.6               forcats_1.0.0              
## [159] lmom_3.2                    deldir_2.0-4               
## [161] Biostrings_2.75.4           splines_4.5.0              
## [163] tensor_1.5                  hms_1.1.3                  
## [165] igraph_2.1.4                ggpubr_0.6.0               
## [167] spatstat.geom_3.3-5         ggsignif_0.6.4             
## [169] RcppHNSW_0.6.0              reshape2_1.4.4             
## [171] biomaRt_2.63.1              stats4_4.5.0               
## [173] ScaledMatrix_1.15.0         evaluate_1.0.3             
## [175] httpuv_1.6.15               RANN_2.6.2                 
## [177] tidyr_1.3.1                 purrr_1.0.4                
## [179] polyclip_1.10-7             future_1.34.0              
## [181] scattermore_1.2             rsvd_1.0.5                 
## [183] broom_1.0.7                 xtable_1.8-4               
## [185] e1071_1.7-16                RSpectra_0.16-2            
## [187] rstatix_0.7.2               later_1.4.1                
## [189] viridisLite_0.4.2           class_7.3-23               
## [191] tibble_3.2.1                memoise_2.0.1              
## [193] beeswarm_0.4.0              AnnotationDbi_1.69.0       
## [195] IRanges_2.41.3              cluster_2.1.8              
## [197] globals_0.16.3