#renv::restore()

BiocManager::install("UCell")
#remotes::install_github("carmonalab/UCell", ref="dev_local")  #development version

library(UCell)
library(Seurat)

#Object can be downloaded from: https://doi.org/10.6084/m9.figshare.28891538.v1

#Direct download
options(timeout = 3000)
url <- "https://figshare.com/ndownloader/files/54054143"
download.file(url, destfile = "data/pbmc_multimodal.Tcells_ds9k.rds")

pbmc.Tcell <- readRDS("data/pbmc_multimodal.Tcells_ds9k.rds")
pbmc.Tcell <- RenameAssays(pbmc.Tcell, assay.name = "SCT", new.assay.name = "RNA")
## Renaming default assay from SCT to RNA
## Warning: Key 'sct_' taken, using 'rna_' instead
DimPlot(object = pbmc.Tcell, reduction = "wnn.umap", group.by = "celltype.l2", label = TRUE, label.size = 3, repel = TRUE)

# Basic functionalities

data.matrix <- pbmc.Tcell@assays$RNA@data

dim(data.matrix)
## [1] 20729 20000
data.matrix[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
##            L4_CTGTAGATCTGGAGAG E2L6_AATGCCATCAGAGCGA E2L2_GAGGCAACAACCTATG
## AL627309.1           .                             .                     .
## AL669831.5           .                             .                     .
## LINC00115            .                             .                     .
## FAM41C               .                             .                     .
## NOC2L                0.6931472                     .                     .
##            E2L8_ATCCGTCTCTCCTGAC L2_GCCATTCAGTAAACTG
## AL627309.1                     .                   .
## AL669831.5                     .                   .
## LINC00115                      .                   .
## FAM41C                         .                   .
## NOC2L                          .                   .

Calculate on the fly

set.seed(123)
basic.sign <- list( Tcell_signature = c("CD2","CD3E","CD3D"), Myeloid_signature = c("SPI1","FCER1G","CSF1R","LCK-"))

scores <- ScoreSignatures_UCell(data.matrix, features=basic.sign)
scores[1:5,]
##                       Tcell_signature_UCell Myeloid_signature_UCell
## L4_CTGTAGATCTGGAGAG               0.6515354                       0
## E2L6_AATGCCATCAGAGCGA             0.5282599                       0
## E2L2_GAGGCAACAACCTATG             0.8772808                       0
## E2L8_ATCCGTCTCTCCTGAC             0.4979973                       0
## L2_GCCATTCAGTAAACTG               0.2725857                       0

Show distribution of predicted scores

library(reshape2)
library(ggplot2)
melted <- melt(scores)
colnames(melted) <- c("Cell","Signature","Uscore")
p <- ggplot(melted, aes(x=Signature, y=Uscore)) + geom_violin(aes(fill=Signature), scale = "width") +
       geom_boxplot(width=0.1) + theme_bw()
p

Pre-compute ranks

set.seed(123)
ranks <- StoreRankings_UCell(data.matrix)
ranks[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
##            L4_CTGTAGATCTGGAGAG E2L6_AATGCCATCAGAGCGA E2L2_GAGGCAACAACCTATG
## AL627309.1                 .                       .                     .
## AL669831.5                 .                       .                     .
## LINC00115                  .                       .                     .
## FAM41C                     .                       .                     .
## NOC2L                   1006.5                     .                     .
##            E2L8_ATCCGTCTCTCCTGAC L2_GCCATTCAGTAAACTG
## AL627309.1                     .                   .
## AL669831.5                     .                   .
## LINC00115                      .                   .
## FAM41C                         .                   .
## NOC2L                          .                   .

Now quickly evaluate signatures from pre-stored ranks

set.seed(123)
scores.2 <- ScoreSignatures_UCell(data.matrix, features=basic.sign, precalc.ranks = ranks)

scores.2[1:5,]
##                       Tcell_signature_UCell Myeloid_signature_UCell
## L4_CTGTAGATCTGGAGAG               0.6516517                       0
## E2L6_AATGCCATCAGAGCGA             0.5284173                       0
## E2L2_GAGGCAACAACCTATG             0.8773218                       0
## E2L8_ATCCGTCTCTCCTGAC             0.4980536                       0
## L2_GCCATTCAGTAAACTG               0.2726059                       0
plot(scores[,1], scores.2[,1])

new.sign <- list( CD4_Tcell = c("CD4","CD40LG"), CD8_Tcell = c("CD8A","CD8B"), Treg = c("FOXP3","IL2RA"))
scores.new <- ScoreSignatures_UCell(data.matrix, features=new.sign, precalc.ranks = ranks)

scores.new[1:5,]
##                       CD4_Tcell_UCell CD8_Tcell_UCell Treg_UCell
## L4_CTGTAGATCTGGAGAG          0.329553       0.0000000          0
## E2L6_AATGCCATCAGAGCGA        0.000000       0.0000000          0
## E2L2_GAGGCAACAACCTATG        0.000000       0.0000000          0
## E2L8_ATCCGTCTCTCCTGAC        0.000000       0.2024683          0
## L2_GCCATTCAGTAAACTG          0.000000       0.5675450          0
melted <- melt(scores.new)
colnames(melted) <- c("Cell","Signature","Uscore")
p <- ggplot(melted, aes(x=Signature, y=Uscore)) + geom_violin(aes(fill=Signature), scale = "width") +
       geom_boxplot(width=0.1) + theme_bw()
p

Run in parallel with multiple cores

scores <- ScoreSignatures_UCell(data.matrix, features=basic.sign, ncores=4)

melted <- melt(scores)
colnames(melted) <- c("Cell","Signature","Uscore")
p <- ggplot(melted, aes(x=Signature, y=Uscore)) + geom_violin(aes(fill=Signature), scale = "width") +
       geom_boxplot(width=0.1) + theme_bw()
p

Easy interaction with Seurat

Define human T cell level 2 signatues

signatures.T <- list()
signatures.T$Tcell_CD4 <- c("CD4","CD40LG")
signatures.T$Tcell_CD8 <- c("CD8A","CD8B")
signatures.T$Tcell_Treg <- c("FOXP3","IL2RA")
signatures.T$Tcell_MAIT <- c("SLC4A10", "TRAV1-2")
signatures.T$Tcell_gd <- c("TRDC", "TRGC1", "TRGC2", "TRDV1","TRAC-","TRBC1-","TRBC2-")
signatures.T$Tcell_NK <- c("FGFBP2", "SPON2", "KLRF1", "FCGR3A", "KLRD1", "TRDC","CD3E-","CD3G-")
Idents(pbmc.Tcell) <- "celltype.l1"
pbmc.Tcell.CD8 <- subset(pbmc.Tcell, idents = c("CD8 T"))
table(pbmc.Tcell.CD8@active.ident)
## 
## CD8 T 
##  7007
DimPlot(object = pbmc.Tcell.CD8, reduction = "wnn.umap", group.by = "celltype.l2", label = TRUE, label.size = 3, repel = TRUE) + NoLegend()

AddModuleScore from Seurat is very fast, but the score is highly dependent on the composition of the dataset

gene.sets <- list(Tcell_CD8=c("CD8A","CD8B"))

pbmc.Tcell.CD8 <- AddModuleScore(pbmc.Tcell.CD8, features = gene.sets, name="Tcell_CD8_Seurat", seed=123)
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p1 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_Seurat1", pt.size=0)
## Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
pbmc.Tcell <- AddModuleScore(pbmc.Tcell, features = gene.sets, name="Tcell_CD8_Seurat", seed=123)
p2 <- VlnPlot(pbmc.Tcell, features = "Tcell_CD8_Seurat1", pt.size=0)

p1 | p2

UCell score is based on gene rankings and therefore is not affected by the composition of the query dataset

pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = gene.sets)
p1 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)

pbmc.Tcell <- AddModuleScore_UCell(pbmc.Tcell, features = gene.sets)
p2 <- VlnPlot(pbmc.Tcell, features = "Tcell_CD8_UCell", pt.size=0)

p1 | p2

Evaluate more signatures for all flavors of T cells

pbmc.Tcell.ss  <- AddModuleScore_UCell(pbmc.Tcell, features = signatures.T)
sign.names <- paste0(names(signatures.T),"_UCell")

VlnPlot(pbmc.Tcell.ss, features = sign.names, pt.size=0)

VlnPlot(pbmc.Tcell.ss, features = sign.names, group.by = "celltype.l2", pt.size=0)

How do signatures compare to original annotations

Idents(pbmc.Tcell.ss) <- "celltype.l2"
DimPlot(object = pbmc.Tcell.ss, reduction = "wnn.umap", group.by = "celltype.l2", label.size = 3, repel = TRUE, label = T)

FeaturePlot(pbmc.Tcell.ss, reduction = "wnn.umap", features = sign.names, ncol=3, order=T)

See behavior with missing or non-existing genes

pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = list(Tcell_CD8=c("CD8A","CD8B","notagene")))
## Warning: The following genes were not found and will be
##                         imputed to exp=0:
## * notagene
p1 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)

pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = list(Tcell_CD8=c("CD8A","CD8B","notagene","notagene2","notagene3")))
## Warning: Over half of genes (60%) in specified signatures are missing from
## data. Check the integrity of your dataset.
## Warning: The following genes were not found and will be
##                         imputed to exp=0:
## * notagene,notagene2,notagene3
p2 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)

pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = list(Tcell_CD8=c("notagene","notagene2","notagene3")))
## Warning: Over half of genes (100%) in specified signatures are missing from data. Check the integrity of your dataset.
## Warning: The following genes were not found and will be
##                         imputed to exp=0:
## * notagene,notagene2,notagene3
p3 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of Tcell_CD8_UCell.
p1 | p2 | p3

Recalculate with the Cd8 T cell simple signature

gene.sets <- list(Tcell_CD8=c("CD8A","CD8B"))
pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = gene.sets)
VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)

plot(pbmc.Tcell.CD8$Tcell_CD8_Seurat1, pbmc.Tcell.CD8$Tcell_CD8_UCell)

plot(pbmc.Tcell$Tcell_CD8_Seurat1, pbmc.Tcell$Tcell_CD8_UCell)

Compare to AUCell

library(AUCell)
cells_rankings <- AUCell_buildRankings(pbmc.Tcell@assays$RNA@data,nCores = 1, plotStats = F)
## Warning in .AUCell_buildRankings(exprMat = exprMat, featureType = featureType,
## : nCores is no longer used. It will be deprecated in the next AUCell version.
cells_AUC <- AUCell_calcAUC(signatures.T, cells_rankings, aucMaxRank=1000)
## Genes in the gene sets NOT available in the dataset: 
##  Tcell_gd:   3 (43% of 7)
##  Tcell_NK:   2 (25% of 8)
cells_AUC.num <- as.data.frame(t(getAUC(cells_AUC)))
colnames(cells_AUC.num) <- paste0(colnames(cells_AUC.num), "_AUCell")

pbmc.Tcell <- AddMetaData(pbmc.Tcell, cells_AUC.num)


cells_rankings.cd8 <- AUCell_buildRankings(pbmc.Tcell.CD8@assays$RNA@data,nCores = 1, plotStats = T)
## Warning in .AUCell_buildRankings(exprMat = exprMat, featureType = featureType,
## : nCores is no longer used. It will be deprecated in the next AUCell version.

## Quantiles for the number of genes detected by cell: 
## (Non-detected genes are shuffled at the end of the ranking. Keep it in mind when choosing the threshold for calculating the AUC).
##     min      1%      5%     10%     50%    100% 
##  807.00 1047.06 1224.30 1313.60 1779.00 3757.00
cells_AUC.cd8 <- AUCell_calcAUC(signatures.T, cells_rankings.cd8, aucMaxRank=1000)
## Genes in the gene sets NOT available in the dataset: 
##  Tcell_gd:   3 (43% of 7)
##  Tcell_NK:   2 (25% of 8)
cells_AUC.num.cd8 <- as.data.frame(t(getAUC(cells_AUC.cd8)))
colnames(cells_AUC.num.cd8) <- paste0(colnames(cells_AUC.num.cd8), "_AUCell")

pbmc.Tcell.CD8 <- AddMetaData(pbmc.Tcell.CD8, cells_AUC.num.cd8)


p1 <- VlnPlot(pbmc.Tcell, features = "Tcell_CD8_AUCell", pt.size=0)
p2 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_AUCell", pt.size=0)
p1 | p2

#Seurat ModuleScore vs UCell
plot(pbmc.Tcell.CD8$Tcell_CD8_Seurat1, pbmc.Tcell.CD8$Tcell_CD8_UCell)

#AUCell vs UCell
plot(pbmc.Tcell.CD8$Tcell_CD8_AUCell, pbmc.Tcell.CD8$Tcell_CD8_UCell)

Test ranks storing functionality

t1 <- Sys.time()
pbmc.Tcell.UcellPrecomp <- AddModuleScore_UCell(pbmc.Tcell, features = gene.sets, storeRanks = T)
t2 <- Sys.time()
t2 - t1

VlnPlot(pbmc.Tcell.UcellPrecomp, features = "Tcell_CD8_UCell")

Check that scores calculate with rank pre-calculation are the same as those calculated “on the fly”

t1 <- Sys.time()
pbmc.Tcell.UcellPrecomp <- AddModuleScore_UCell(pbmc.Tcell.UcellPrecomp, features = list(Tcell_CD8_copy=c("CD8A","CD8B")))
t2 <- Sys.time()
t2 - t1

VlnPlot(pbmc.Tcell.UcellPrecomp, features = "Tcell_CD8_copy_UCell")

plot(pbmc.Tcell.UcellPrecomp$Tcell_CD8_UCell,pbmc.Tcell.UcellPrecomp$Tcell_CD8_copy_UCell)

t1 <- Sys.time()
pbmc.Tcell.UcellPrecomp <- AddModuleScore_UCell(pbmc.Tcell.UcellPrecomp, features = signatures.T)
t2 <- Sys.time()
t2 - t1

VlnPlot(pbmc.Tcell.UcellPrecomp, features = paste0(names(signatures.T),"_UCell"))

Gene ranks are stored here:

head(pbmc.Tcell.UcellPrecomp@assays$UCellRanks@data[,1:10])
head(pbmc.Tcell.UcellPrecomp@assays$UCellRanks@data["CD8A",1:10])
hist(as.numeric(pbmc.Tcell.UcellPrecomp@assays$UCellRanks@data["CD8A",]))

Test ties methods

set.seed(123)
basic.sign <- list( Tcell_signature = c("CD2","CD3E","CD3D"), Myeloid_signature = c("SPI1","FCER1G","CSF1R"))

#Using random for ties
system.time(scores <- ScoreSignatures_UCell(data.matrix,features=basic.sign, ties.method = "random"))
##    user  system elapsed 
## 110.801   3.527 114.391
scores[1:5,]
##                       Tcell_signature_UCell Myeloid_signature_UCell
## L4_CTGTAGATCTGGAGAG               0.6208278              0.00000000
## E2L6_AATGCCATCAGAGCGA             0.5792167              0.00000000
## E2L2_GAGGCAACAACCTATG             0.8809524              0.00000000
## E2L8_ATCCGTCTCTCCTGAC             0.5008901              0.01335113
## L2_GCCATTCAGTAAACTG               0.2843792              0.00000000
#Using average for rank ties
system.time(scores <- ScoreSignatures_UCell(data.matrix,features=basic.sign, ties.method = "average"))
##    user  system elapsed 
##  40.165   3.307  43.471
scores[1:5,]
##                       Tcell_signature_UCell Myeloid_signature_UCell
## L4_CTGTAGATCTGGAGAG               0.6515354              0.00000000
## E2L6_AATGCCATCAGAGCGA             0.5282599              0.00000000
## E2L2_GAGGCAACAACCTATG             0.8772808              0.00000000
## E2L8_ATCCGTCTCTCCTGAC             0.4979973              0.03382287
## L2_GCCATTCAGTAAACTG               0.2725857              0.00000000
scores.rand <- list()
scores.ave <- list()
for (seed in c(1:3)) {
   set.seed(seed)
   t1 <- system.time(scores.rand[[seed]] <- ScoreSignatures_UCell(data.matrix,features=basic.sign, ties.method = "random"))
   t2 <- system.time(scores.ave[[seed]] <- ScoreSignatures_UCell(data.matrix,features=basic.sign, ties.method = "ave"))
   print(t1)
   print(t2)
}
##    user  system elapsed 
## 110.682   3.628 114.431 
##    user  system elapsed 
##  39.851   3.079  42.957 
##    user  system elapsed 
## 110.708   3.501 114.240 
##    user  system elapsed 
##  40.624   3.136  43.792 
##    user  system elapsed 
## 112.207   3.762 117.000 
##    user  system elapsed 
##  41.327   3.227  44.561
##Reproducibility
#Rand vs rand
plot(scores.rand[[1]][,1], scores.rand[[2]][,1])

plot(scores.rand[[1]][,1], scores.rand[[3]][,1])

#ave vs ave
plot(scores.ave[[1]][,1], scores.ave[[2]][,1])

plot(scores.ave[[1]][,1], scores.ave[[3]][,1])

#Rand vs. ave
plot(scores.rand[[1]][,1], scores.ave[[1]][,1])

plot(scores.rand[[2]][,1], scores.ave[[2]][,1])


  1. ↩︎

  2. ↩︎