#renv::restore()
BiocManager::install("UCell")
#remotes::install_github("carmonalab/UCell", ref="dev_local") #development version
library(UCell)
library(Seurat)
#Object can be downloaded from: https://doi.org/10.6084/m9.figshare.28891538.v1
#Direct download
options(timeout = 3000)
url <- "https://figshare.com/ndownloader/files/54054143"
download.file(url, destfile = "data/pbmc_multimodal.Tcells_ds9k.rds")
pbmc.Tcell <- readRDS("data/pbmc_multimodal.Tcells_ds9k.rds")
pbmc.Tcell <- RenameAssays(pbmc.Tcell, assay.name = "SCT", new.assay.name = "RNA")
## Renaming default assay from SCT to RNA
## Warning: Key 'sct_' taken, using 'rna_' instead
DimPlot(object = pbmc.Tcell, reduction = "wnn.umap", group.by = "celltype.l2", label = TRUE, label.size = 3, repel = TRUE)
# Basic functionalities
data.matrix <- pbmc.Tcell@assays$RNA@data
dim(data.matrix)
## [1] 20729 20000
data.matrix[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## L4_CTGTAGATCTGGAGAG E2L6_AATGCCATCAGAGCGA E2L2_GAGGCAACAACCTATG
## AL627309.1 . . .
## AL669831.5 . . .
## LINC00115 . . .
## FAM41C . . .
## NOC2L 0.6931472 . .
## E2L8_ATCCGTCTCTCCTGAC L2_GCCATTCAGTAAACTG
## AL627309.1 . .
## AL669831.5 . .
## LINC00115 . .
## FAM41C . .
## NOC2L . .
Calculate on the fly
set.seed(123)
basic.sign <- list( Tcell_signature = c("CD2","CD3E","CD3D"), Myeloid_signature = c("SPI1","FCER1G","CSF1R","LCK-"))
scores <- ScoreSignatures_UCell(data.matrix, features=basic.sign)
scores[1:5,]
## Tcell_signature_UCell Myeloid_signature_UCell
## L4_CTGTAGATCTGGAGAG 0.6515354 0
## E2L6_AATGCCATCAGAGCGA 0.5282599 0
## E2L2_GAGGCAACAACCTATG 0.8772808 0
## E2L8_ATCCGTCTCTCCTGAC 0.4979973 0
## L2_GCCATTCAGTAAACTG 0.2725857 0
Show distribution of predicted scores
library(reshape2)
library(ggplot2)
melted <- melt(scores)
colnames(melted) <- c("Cell","Signature","Uscore")
p <- ggplot(melted, aes(x=Signature, y=Uscore)) + geom_violin(aes(fill=Signature), scale = "width") +
geom_boxplot(width=0.1) + theme_bw()
p
Pre-compute ranks
set.seed(123)
ranks <- StoreRankings_UCell(data.matrix)
ranks[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## L4_CTGTAGATCTGGAGAG E2L6_AATGCCATCAGAGCGA E2L2_GAGGCAACAACCTATG
## AL627309.1 . . .
## AL669831.5 . . .
## LINC00115 . . .
## FAM41C . . .
## NOC2L 1006.5 . .
## E2L8_ATCCGTCTCTCCTGAC L2_GCCATTCAGTAAACTG
## AL627309.1 . .
## AL669831.5 . .
## LINC00115 . .
## FAM41C . .
## NOC2L . .
Now quickly evaluate signatures from pre-stored ranks
set.seed(123)
scores.2 <- ScoreSignatures_UCell(data.matrix, features=basic.sign, precalc.ranks = ranks)
scores.2[1:5,]
## Tcell_signature_UCell Myeloid_signature_UCell
## L4_CTGTAGATCTGGAGAG 0.6516517 0
## E2L6_AATGCCATCAGAGCGA 0.5284173 0
## E2L2_GAGGCAACAACCTATG 0.8773218 0
## E2L8_ATCCGTCTCTCCTGAC 0.4980536 0
## L2_GCCATTCAGTAAACTG 0.2726059 0
plot(scores[,1], scores.2[,1])
new.sign <- list( CD4_Tcell = c("CD4","CD40LG"), CD8_Tcell = c("CD8A","CD8B"), Treg = c("FOXP3","IL2RA"))
scores.new <- ScoreSignatures_UCell(data.matrix, features=new.sign, precalc.ranks = ranks)
scores.new[1:5,]
## CD4_Tcell_UCell CD8_Tcell_UCell Treg_UCell
## L4_CTGTAGATCTGGAGAG 0.329553 0.0000000 0
## E2L6_AATGCCATCAGAGCGA 0.000000 0.0000000 0
## E2L2_GAGGCAACAACCTATG 0.000000 0.0000000 0
## E2L8_ATCCGTCTCTCCTGAC 0.000000 0.2024683 0
## L2_GCCATTCAGTAAACTG 0.000000 0.5675450 0
melted <- melt(scores.new)
colnames(melted) <- c("Cell","Signature","Uscore")
p <- ggplot(melted, aes(x=Signature, y=Uscore)) + geom_violin(aes(fill=Signature), scale = "width") +
geom_boxplot(width=0.1) + theme_bw()
p
Run in parallel with multiple cores
scores <- ScoreSignatures_UCell(data.matrix, features=basic.sign, ncores=4)
melted <- melt(scores)
colnames(melted) <- c("Cell","Signature","Uscore")
p <- ggplot(melted, aes(x=Signature, y=Uscore)) + geom_violin(aes(fill=Signature), scale = "width") +
geom_boxplot(width=0.1) + theme_bw()
p
Define human T cell level 2 signatues
signatures.T <- list()
signatures.T$Tcell_CD4 <- c("CD4","CD40LG")
signatures.T$Tcell_CD8 <- c("CD8A","CD8B")
signatures.T$Tcell_Treg <- c("FOXP3","IL2RA")
signatures.T$Tcell_MAIT <- c("SLC4A10", "TRAV1-2")
signatures.T$Tcell_gd <- c("TRDC", "TRGC1", "TRGC2", "TRDV1","TRAC-","TRBC1-","TRBC2-")
signatures.T$Tcell_NK <- c("FGFBP2", "SPON2", "KLRF1", "FCGR3A", "KLRD1", "TRDC","CD3E-","CD3G-")
Idents(pbmc.Tcell) <- "celltype.l1"
pbmc.Tcell.CD8 <- subset(pbmc.Tcell, idents = c("CD8 T"))
table(pbmc.Tcell.CD8@active.ident)
##
## CD8 T
## 7007
DimPlot(object = pbmc.Tcell.CD8, reduction = "wnn.umap", group.by = "celltype.l2", label = TRUE, label.size = 3, repel = TRUE) + NoLegend()
AddModuleScore from Seurat is very fast, but the score is highly dependent on the composition of the dataset
gene.sets <- list(Tcell_CD8=c("CD8A","CD8B"))
pbmc.Tcell.CD8 <- AddModuleScore(pbmc.Tcell.CD8, features = gene.sets, name="Tcell_CD8_Seurat", seed=123)
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p1 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_Seurat1", pt.size=0)
## Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
pbmc.Tcell <- AddModuleScore(pbmc.Tcell, features = gene.sets, name="Tcell_CD8_Seurat", seed=123)
p2 <- VlnPlot(pbmc.Tcell, features = "Tcell_CD8_Seurat1", pt.size=0)
p1 | p2
UCell score is based on gene rankings and therefore is not affected by the composition of the query dataset
pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = gene.sets)
p1 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)
pbmc.Tcell <- AddModuleScore_UCell(pbmc.Tcell, features = gene.sets)
p2 <- VlnPlot(pbmc.Tcell, features = "Tcell_CD8_UCell", pt.size=0)
p1 | p2
Evaluate more signatures for all flavors of T cells
pbmc.Tcell.ss <- AddModuleScore_UCell(pbmc.Tcell, features = signatures.T)
sign.names <- paste0(names(signatures.T),"_UCell")
VlnPlot(pbmc.Tcell.ss, features = sign.names, pt.size=0)
VlnPlot(pbmc.Tcell.ss, features = sign.names, group.by = "celltype.l2", pt.size=0)
How do signatures compare to original annotations
Idents(pbmc.Tcell.ss) <- "celltype.l2"
DimPlot(object = pbmc.Tcell.ss, reduction = "wnn.umap", group.by = "celltype.l2", label.size = 3, repel = TRUE, label = T)
FeaturePlot(pbmc.Tcell.ss, reduction = "wnn.umap", features = sign.names, ncol=3, order=T)
See behavior with missing or non-existing genes
pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = list(Tcell_CD8=c("CD8A","CD8B","notagene")))
## Warning: The following genes were not found and will be
## imputed to exp=0:
## * notagene
p1 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)
pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = list(Tcell_CD8=c("CD8A","CD8B","notagene","notagene2","notagene3")))
## Warning: Over half of genes (60%) in specified signatures are missing from
## data. Check the integrity of your dataset.
## Warning: The following genes were not found and will be
## imputed to exp=0:
## * notagene,notagene2,notagene3
p2 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)
pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = list(Tcell_CD8=c("notagene","notagene2","notagene3")))
## Warning: Over half of genes (100%) in specified signatures are missing from data. Check the integrity of your dataset.
## Warning: The following genes were not found and will be
## imputed to exp=0:
## * notagene,notagene2,notagene3
p3 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of Tcell_CD8_UCell.
p1 | p2 | p3
Recalculate with the Cd8 T cell simple signature
gene.sets <- list(Tcell_CD8=c("CD8A","CD8B"))
pbmc.Tcell.CD8 <- AddModuleScore_UCell(pbmc.Tcell.CD8, features = gene.sets)
VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_UCell", pt.size=0)
plot(pbmc.Tcell.CD8$Tcell_CD8_Seurat1, pbmc.Tcell.CD8$Tcell_CD8_UCell)
plot(pbmc.Tcell$Tcell_CD8_Seurat1, pbmc.Tcell$Tcell_CD8_UCell)
Compare to AUCell
library(AUCell)
cells_rankings <- AUCell_buildRankings(pbmc.Tcell@assays$RNA@data,nCores = 1, plotStats = F)
## Warning in .AUCell_buildRankings(exprMat = exprMat, featureType = featureType,
## : nCores is no longer used. It will be deprecated in the next AUCell version.
cells_AUC <- AUCell_calcAUC(signatures.T, cells_rankings, aucMaxRank=1000)
## Genes in the gene sets NOT available in the dataset:
## Tcell_gd: 3 (43% of 7)
## Tcell_NK: 2 (25% of 8)
cells_AUC.num <- as.data.frame(t(getAUC(cells_AUC)))
colnames(cells_AUC.num) <- paste0(colnames(cells_AUC.num), "_AUCell")
pbmc.Tcell <- AddMetaData(pbmc.Tcell, cells_AUC.num)
cells_rankings.cd8 <- AUCell_buildRankings(pbmc.Tcell.CD8@assays$RNA@data,nCores = 1, plotStats = T)
## Warning in .AUCell_buildRankings(exprMat = exprMat, featureType = featureType,
## : nCores is no longer used. It will be deprecated in the next AUCell version.
## Quantiles for the number of genes detected by cell:
## (Non-detected genes are shuffled at the end of the ranking. Keep it in mind when choosing the threshold for calculating the AUC).
## min 1% 5% 10% 50% 100%
## 807.00 1047.06 1224.30 1313.60 1779.00 3757.00
cells_AUC.cd8 <- AUCell_calcAUC(signatures.T, cells_rankings.cd8, aucMaxRank=1000)
## Genes in the gene sets NOT available in the dataset:
## Tcell_gd: 3 (43% of 7)
## Tcell_NK: 2 (25% of 8)
cells_AUC.num.cd8 <- as.data.frame(t(getAUC(cells_AUC.cd8)))
colnames(cells_AUC.num.cd8) <- paste0(colnames(cells_AUC.num.cd8), "_AUCell")
pbmc.Tcell.CD8 <- AddMetaData(pbmc.Tcell.CD8, cells_AUC.num.cd8)
p1 <- VlnPlot(pbmc.Tcell, features = "Tcell_CD8_AUCell", pt.size=0)
p2 <- VlnPlot(pbmc.Tcell.CD8, features = "Tcell_CD8_AUCell", pt.size=0)
p1 | p2
#Seurat ModuleScore vs UCell
plot(pbmc.Tcell.CD8$Tcell_CD8_Seurat1, pbmc.Tcell.CD8$Tcell_CD8_UCell)
#AUCell vs UCell
plot(pbmc.Tcell.CD8$Tcell_CD8_AUCell, pbmc.Tcell.CD8$Tcell_CD8_UCell)
t1 <- Sys.time()
pbmc.Tcell.UcellPrecomp <- AddModuleScore_UCell(pbmc.Tcell, features = gene.sets, storeRanks = T)
t2 <- Sys.time()
t2 - t1
VlnPlot(pbmc.Tcell.UcellPrecomp, features = "Tcell_CD8_UCell")
Check that scores calculate with rank pre-calculation are the same as those calculated “on the fly”
t1 <- Sys.time()
pbmc.Tcell.UcellPrecomp <- AddModuleScore_UCell(pbmc.Tcell.UcellPrecomp, features = list(Tcell_CD8_copy=c("CD8A","CD8B")))
t2 <- Sys.time()
t2 - t1
VlnPlot(pbmc.Tcell.UcellPrecomp, features = "Tcell_CD8_copy_UCell")
plot(pbmc.Tcell.UcellPrecomp$Tcell_CD8_UCell,pbmc.Tcell.UcellPrecomp$Tcell_CD8_copy_UCell)
t1 <- Sys.time()
pbmc.Tcell.UcellPrecomp <- AddModuleScore_UCell(pbmc.Tcell.UcellPrecomp, features = signatures.T)
t2 <- Sys.time()
t2 - t1
VlnPlot(pbmc.Tcell.UcellPrecomp, features = paste0(names(signatures.T),"_UCell"))
Gene ranks are stored here:
head(pbmc.Tcell.UcellPrecomp@assays$UCellRanks@data[,1:10])
head(pbmc.Tcell.UcellPrecomp@assays$UCellRanks@data["CD8A",1:10])
hist(as.numeric(pbmc.Tcell.UcellPrecomp@assays$UCellRanks@data["CD8A",]))
set.seed(123)
basic.sign <- list( Tcell_signature = c("CD2","CD3E","CD3D"), Myeloid_signature = c("SPI1","FCER1G","CSF1R"))
#Using random for ties
system.time(scores <- ScoreSignatures_UCell(data.matrix,features=basic.sign, ties.method = "random"))
## user system elapsed
## 110.801 3.527 114.391
scores[1:5,]
## Tcell_signature_UCell Myeloid_signature_UCell
## L4_CTGTAGATCTGGAGAG 0.6208278 0.00000000
## E2L6_AATGCCATCAGAGCGA 0.5792167 0.00000000
## E2L2_GAGGCAACAACCTATG 0.8809524 0.00000000
## E2L8_ATCCGTCTCTCCTGAC 0.5008901 0.01335113
## L2_GCCATTCAGTAAACTG 0.2843792 0.00000000
#Using average for rank ties
system.time(scores <- ScoreSignatures_UCell(data.matrix,features=basic.sign, ties.method = "average"))
## user system elapsed
## 40.165 3.307 43.471
scores[1:5,]
## Tcell_signature_UCell Myeloid_signature_UCell
## L4_CTGTAGATCTGGAGAG 0.6515354 0.00000000
## E2L6_AATGCCATCAGAGCGA 0.5282599 0.00000000
## E2L2_GAGGCAACAACCTATG 0.8772808 0.00000000
## E2L8_ATCCGTCTCTCCTGAC 0.4979973 0.03382287
## L2_GCCATTCAGTAAACTG 0.2725857 0.00000000
scores.rand <- list()
scores.ave <- list()
for (seed in c(1:3)) {
set.seed(seed)
t1 <- system.time(scores.rand[[seed]] <- ScoreSignatures_UCell(data.matrix,features=basic.sign, ties.method = "random"))
t2 <- system.time(scores.ave[[seed]] <- ScoreSignatures_UCell(data.matrix,features=basic.sign, ties.method = "ave"))
print(t1)
print(t2)
}
## user system elapsed
## 110.682 3.628 114.431
## user system elapsed
## 39.851 3.079 42.957
## user system elapsed
## 110.708 3.501 114.240
## user system elapsed
## 40.624 3.136 43.792
## user system elapsed
## 112.207 3.762 117.000
## user system elapsed
## 41.327 3.227 44.561
##Reproducibility
#Rand vs rand
plot(scores.rand[[1]][,1], scores.rand[[2]][,1])
plot(scores.rand[[1]][,1], scores.rand[[3]][,1])
#ave vs ave
plot(scores.ave[[1]][,1], scores.ave[[2]][,1])
plot(scores.ave[[1]][,1], scores.ave[[3]][,1])
#Rand vs. ave
plot(scores.rand[[1]][,1], scores.ave[[1]][,1])
plot(scores.rand[[2]][,1], scores.ave[[2]][,1])