TCRanker preparation

Preparing input data set of TILs from PMEL mice for TCRanker

Introduction

In this study, We used the paired scRNA-seq and scTCR-seq of CD8+ TILs from C57BL/6 (n=4) and PMEL (n=3) mice bearing B16-F10 melanoma tumor from the study by Carmona et al., OncoImmunology (2020)


Preparation

We’ll need some packages to complete the workflow:

library(GEOquery)
library(Seurat)
library(scRepertoire)

And let’s make the files tidy.

setwd(getwd())
inDir <- paste0(getwd(),'/input') # Directory for input files
outDir <- paste0(getwd(),'/output') # Director for output files
dir.create(inDir)
dir.create(outDir)

Data Set

The corresponding GEO id of our data set is GSE116390.

projectID <- 'Carmona.B16'
id <- 'GSE116390'
dir.create(paste0(inDir, '/', id))

supp <- getGEOSuppFiles(id, baseDir = inDir, fetch_files = F)
supp$fname
## [1] "GSE116390_RAW.tar"                          
## [2] "GSE116390_VDJ_annotations.csv.tar.gz"       
## [3] "GSE116390_aggregated_filtered_matrix.tar.gz"
## [4] "GSE116390_aggregated_matrix.tar.gz"

Among what we have in the supplementary files, what we need are the expression matrix and TCR matrix:

supp <- supp[c(2,4),]
supp
##                                  fname
## 2 GSE116390_VDJ_annotations.csv.tar.gz
## 4   GSE116390_aggregated_matrix.tar.gz
##                                                                                                       url
## 2 https://ftp.ncbi.nlm.nih.gov/geo/series/GSE116nnn/GSE116390/suppl//GSE116390_VDJ_annotations.csv.tar.gz
## 4   https://ftp.ncbi.nlm.nih.gov/geo/series/GSE116nnn/GSE116390/suppl//GSE116390_aggregated_matrix.tar.gz

scRNA-seq

Firstly, we load the expression matrix:

for(i in seq_along(supp)){
    fileDir <- paste0(inDir, '/', id, '/', supp$fname[i])
    download.file(url = supp$url[i], destfile = fileDir)
    fileFolder <- sub(".tar.gz", "", fileDir)
    untar(fileDir, exdir = fileFolder)
    rownames(supp)[i] <- fileFolder
} #Downloading and decompressing the file, might be different for other entries with different directory structures.

expMat <- Read10X(sub(".tar.*", "", rownames(supp)[2]))

query.b16 <- CreateSeuratObject(counts = expMat, project = projectID, 
                                    min.cells = 3, min.features = 50)
table(substring(colnames(query.b16), 18))
## 
##    1    2    3    4    5    6    7 
##  519  686 1756  609  552  801 2251

Table 1

Now we could assign the group name by the sample code. From the GEO description page we could find the correspondence of sample code as followed:

sample code 1 2 3 4 5 6 7
group PMEL1 WT1 WT3 PMEL3 PMEL2 WT2 WT4
Table 2


libIDtoSampleID <- c('PMEL1', 'WT1', 'WT3', 'PMEL3', 'PMEL2', 'WT2', 'WT4')
names(libIDtoSampleID) <- 1:7

query.b16$Sample <- as.factor(libIDtoSampleID[substring(colnames(query.b16), 18)])

table(query.b16$Sample)
## 
## PMEL1 PMEL2 PMEL3   WT1   WT2   WT3   WT4 
##   519   552   609   686   801  1756  2251

Table 3

Comparing three tables above, we know that the sample ID assignments were correct.

scTCR-seq

Secondly, we load the TCR matrix:

vdj.list <- list()
for (sampleID in 1:7) {
    sample <- libIDtoSampleID[sampleID]
    vdj <- read.csv(sprintf("%s/%s_VDJ_annotations.csv", 
                            rownames(supp)[1], sample), as.is = T)
    vdj$barcode <- paste0(sub("\\d", "", vdj$barcode), sampleID)
    
    vdj$raw_clonotype_id <- paste0(vdj$raw_clonotype_id, "-", sampleID) #Reassign sample ID to the barcodes
    vdj.list[[sampleID]] <- vdj
}

#Combine α and β chains (cdr3 segments)
combined <- combineTCR(vdj.list, ID = 1:7, sample=libIDtoSampleID, cells = "T-AB", removeNA = T, filterMulti=T)

#Strip barcodes
for (i in seq_along(combined)) {
    combined[[i]] <- stripBarcode(combined[[i]], column = 1, connector = "_", num_connects = 3)
}

Now we can combine the scTCR-seq data into the Seurat object

query.b16 <- combineExpression(combined, query.b16, cloneCall = "gene", groupBy = "none")
query.b16$clonotype <- query.b16$CTaa

query.b16
## An object of class Seurat 
## 14445 features across 7174 samples within 1 assay 
## Active assay: RNA (14445 features, 0 variable features)
colnames(query.b16@meta.data)
##  [1] "orig.ident"   "nCount_RNA"   "nFeature_RNA" "Sample"       "barcode"     
##  [6] "CTgene"       "CTnt"         "CTaa"         "CTstrict"     "Frequency"   
## [11] "cloneType"    "clonotype"
#Save the RDS for later usage
saveRDS(query.b16, file = paste0(outDir,'/query.b16.RDS'))

Now we have the data set prepared in the format of Seurat object.

For the demonstration of how to use the TCRanker package, you can check TCRanker demonstration.