UCell signature enrichment analysis

In this demo, we will apply UCell to evaluate gene signatures in single-cell dataset from 31 tumor biopsies of melanoma patients, sequenced using the Smart-seq2 protocol Jerby-Arnon et al. (2018) Cell.

Installation

if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("UCell")
library(UCell)
set.seed(123)

scRNA-seq data preparation

Download the gene expression matrix for this study from Gene Expression Omnibus - GSE115978

library(GEOquery)
cached.object <- "demodata.rds"

if (!file.exists(cached.object)) {
    geo_acc <- "GSE115978"
    options(timeout = max(1000, getOption("timeout")))

    gse <- getGEO(geo_acc)
    getGEOSuppFiles(geo_acc)

    exp.mat <- read.csv(sprintf("%s/GSE115978_counts.csv.gz", geo_acc), header = T,
        row.names = 1, sep = ",")

    saveRDS(exp.mat, cached.object)
} else {
    exp.mat <- readRDS(cached.object)
}

Define gene signatures

Here we define some simple gene sets based on the “Human Cell Landscape” signatures Han et al. (2020) Nature. You may edit existing signatures, or add new one as elements in a list.

signatures <- list(Immune = c("PTPRC"), Macrophage = c("CTSB", "C1QB", "LAPTM5",
    "TYROBP", "PSAP", "C1QA", "HLA-DRA", "CTSD", "NPC2", "FCER1G"), Tcell = c("CD3D",
    "CD3E", "CD3G", "CD2"), Bcell = c("MS4A1", "CD79A", "CD79B", "CD19", "BANK1"),
    Myeloid_cell = c("CD14", "LYZ", "CSF1R", "FCER1G", "SPI1", "LCK-"), Stromal = c("MMP2",
        "COL1A1", "COL1A2", "COL3A1", "LUM", "DCN"))

Run UCell

Run ScoreSignatures_UCell and get directly signature scores for all cells

u.scores <- ScoreSignatures_UCell(exp.mat, features = signatures)
u.scores[1:8, 1:2]
                                                 Immune_UCell Macrophage_UCell
cy78_CD45_neg_1_B04_S496_comb                               0        0.3286333
cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb                    0        0.2744000
CY88_5_B10_S694_comb                                        0        0.2673333
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb             0        0.3904667
cy78_CD45_neg_3_H06_S762_comb                               0        0.3680333
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_G01_S73_comb             0        0.3810000
cy79_p1_CD45_neg_PDL1_neg_AS_C4_R1_D09_S141_comb            0        0.4197000
CY88_3_D02_S614_comb                                        0        0.3159667

Show the distribution of predicted scores

library(reshape2)
library(ggplot2)
melted <- reshape2::melt(u.scores)
colnames(melted) <- c("Cell", "Signature", "UCell_score")
p <- ggplot(melted, aes(x = Signature, y = UCell_score)) + geom_violin(aes(fill = Signature),
    scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0) + theme_bw() +
    theme(axis.text.x = element_blank())
p

Pre-calculating gene rankings

The time- and memory-demanding step in UCell is the calculation of gene rankings for each individual cell. If we plan to experiment with signatures, editing them or adding new cell subtypes, it is possible to pre-calculate the gene rankings once and for all and then apply new signatures over these pre-calculated ranks. Run the StoreRankings_UCell function to pre-calculate gene rankings over a dataset:

set.seed(123)
ranks <- StoreRankings_UCell(exp.mat)
ranks[1:5, 1:5]
5 x 5 sparse Matrix of class "dgCMatrix"
         cy78_CD45_neg_1_B04_S496_comb cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb
C9orf152                           .                                        .  
RPS11                             91.5                                   1498.5
ELMO2                              .                                        .  
CREB3L1                            .                                        .  
PNMA1                            942.5                                      .  
         CY88_5_B10_S694_comb cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb
C9orf152                  .                                               .  
RPS11                   183.5                                          1000.0
ELMO2                     .                                             758.5
CREB3L1                   .                                               .  
PNMA1                     .                                               .  
         cy78_CD45_neg_3_H06_S762_comb
C9orf152                           .  
RPS11                            122.5
ELMO2                              .  
CREB3L1                            .  
PNMA1                            649.5

Then, we can apply our signature set, or any other new signature to the pre-calculated ranks. The calculations will be considerably faster.

set.seed(123)
u.scores.2 <- ScoreSignatures_UCell(features = signatures, precalc.ranks = ranks)

melted <- reshape2::melt(u.scores.2)
colnames(melted) <- c("Cell", "Signature", "UCell_score")
p <- ggplot(melted, aes(x = Signature, y = UCell_score)) + geom_violin(aes(fill = Signature),
    scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0) + theme_bw() +
    theme(axis.text.x = element_blank())
p

new.signatures <- list(Mast.cell = c("TPSAB1", "TPSB2", "CPA3", "SRGN", "RGS2", "RGS1",
    "NFKBIA", "GLUL", "VIM", "ANXA1"), Erythroid.cell = c("HBA2", "HBG2", "HBA1",
    "HBB", "HBG1", "AHSP", "ALAS2", "SLC25A37", "HBM"))

u.scores.3 <- ScoreSignatures_UCell(features = new.signatures, precalc.ranks = ranks)
melted <- reshape2::melt(u.scores.3)
colnames(melted) <- c("Cell", "Signature", "UCell_score")
p <- ggplot(melted, aes(x = Signature, y = UCell_score)) + geom_violin(aes(fill = Signature),
    scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0) + theme_bw() +
    theme(axis.text.x = element_blank())
p

Multi-core processing

If your machine has multi-core capabilities and enough RAM, running UCell in parallel can speed up considerably your analysis. In this example we will use 4 parallel cores:

u.scores <- ScoreSignatures_UCell(exp.mat, features = signatures, ncores = 4)

melted <- reshape2::melt(u.scores)
colnames(melted) <- c("Cell", "Signature", "Uscore")
p <- ggplot(melted, aes(x = Signature, y = Uscore)) + geom_violin(aes(fill = Signature),
    scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0) + theme_bw() +
    theme(axis.text.x = element_blank())
p

Interacting with SingleCellExperiment or Seurat

UCell + SingleCellExperiment

The function ScoreSignatures_UCell() allows operating directly on sce objects. UCell scores are returned in a altExp object (altExp(sce, 'UCell'))

library(SingleCellExperiment)

sce <- SingleCellExperiment(list(counts = exp.mat))
sce <- ScoreSignatures_UCell(sce, features = signatures, assay = "counts", name = NULL,
    ncores = 4)
altExp(sce, "UCell")
class: SummarizedExperiment 
dim: 6 7186 
metadata(0):
assays(1): UCell
rownames(6): Immune Macrophage ... Myeloid_cell Stromal
rowData names(0):
colnames(7186): cy78_CD45_neg_1_B04_S496_comb
  cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb ...
  CY75_1_CD45_CD8_8__S289_comb_BCD8 CY75_1_CD45_CD8_8__S351_comb_BCD8
colData names(0):

Dimensionality reduction and visualization

library(scater)
library(patchwork)
# PCA
sce <- logNormCounts(sce)
sce <- runPCA(sce, scale = T, ncomponents = 20)

# UMAP
set.seed(1234)
sce <- runUMAP(sce, dimred = "PCA")

Visualize UCell scores on low-dimensional representation (UMAP)

pll <- lapply(names(signatures), function(x) {
    plotUMAP(sce, colour_by = x, by_exprs_values = "UCell", text_size = 10)
})
wrap_plots(pll)

UCell + Seurat

The function AddModuleScore_UCell() allows operating directly on Seurat objects. UCell scores are returned as metadata columns in the Seurat object. To see how this function differs from Seurat’s own AddModuleScore() (not based on per-cell ranks) see this vignette

library(Seurat)
seurat.object <- CreateSeuratObject(counts = exp.mat, project = "JerbyArnon")
seurat.object <- AddModuleScore_UCell(seurat.object, features = signatures, name = NULL,
    ncores = 4)
head(seurat.object@meta.data)
                                                orig.ident nCount_RNA
cy78_CD45_neg_1_B04_S496_comb                         cy78     357919
cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb              cy79       5727
CY88_5_B10_S694_comb                                  CY88     139218
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb       cy79      73996
cy78_CD45_neg_3_H06_S762_comb                         cy78     380341
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_G01_S73_comb       cy79      92485
                                                nFeature_RNA Immune Macrophage
cy78_CD45_neg_1_B04_S496_comb                           8094      0  0.3286333
cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb                2022      0  0.2744000
CY88_5_B10_S694_comb                                    5141      0  0.2673333
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb         5517      0  0.3904667
cy78_CD45_neg_3_H06_S762_comb                           7216      0  0.3680333
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_G01_S73_comb         6822      0  0.3810000
                                                Tcell Bcell Myeloid_cell
cy78_CD45_neg_1_B04_S496_comb                       0     0       0.0000
cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb            0     0       0.0000
CY88_5_B10_S694_comb                                0     0       0.0408
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb     0     0       0.0000
cy78_CD45_neg_3_H06_S762_comb                       0     0       0.0000
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_G01_S73_comb     0     0       0.0000
                                                   Stromal
cy78_CD45_neg_1_B04_S496_comb                   0.00000000
cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb        0.00000000
CY88_5_B10_S694_comb                            0.00000000
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb 0.00000000
cy78_CD45_neg_3_H06_S762_comb                   0.08355556
cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_G01_S73_comb 0.00000000

Generate PCA and UMAP embeddings

seurat.object <- NormalizeData(seurat.object)
seurat.object <- FindVariableFeatures(seurat.object, selection.method = "vst", nfeatures = 500)

seurat.object <- ScaleData(seurat.object)
seurat.object <- RunPCA(seurat.object, features = seurat.object@assays$RNA@var.features,
    npcs = 20)
seurat.object <- RunUMAP(seurat.object, reduction = "pca", dims = 1:20, seed.use = 123)

Visualize UCell scores on low-dimensional representation (UMAP)

FeaturePlot(seurat.object, reduction = "umap", features = names(signatures), ncol = 3,
    order = T)

Further reading

To see how UCell can interact directly with Seurat to analyse human PBMCs see THIS VIGNETTE

The code and the package are available at the UCell GitHub repository; more demos available at UCell demo repository