Quantification of ClickTag Counts with kallisto-bustools (Stimulation Data)
Download Data¶
In [ ]:
#Install kallisto and bustools
!wget --quiet https://github.com/pachterlab/kallisto/releases/download/v0.46.2/kallisto_linux-v0.46.2.tar.gz
!tar -xf kallisto_linux-v0.46.2.tar.gz
!cp kallisto/kallisto /usr/local/bin/
!wget --quiet https://github.com/BUStools/bustools/releases/download/v0.40.0/bustools_linux-v0.40.0.tar.gz
!tar -xf bustools_linux-v0.40.0.tar.gz
!cp bustools/bustools /usr/local/bin/
In [ ]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
In [ ]:
#Get HiSeq/MiSeq sequencing of ClickTags
download_file('10.22002/D1.1793','.tar.gz')
!tar -xf D1.1793.tar.gz
In [ ]:
# Get ClickTag Barcodes/Sequences
download_file('10.22002/D1.1831','.gz')
#Get saved embedding of ClickTags
download_file('10.22002/D1.1838','.gz')
!gunzip *.gz
In [ ]:
!pip install --quiet anndata
!pip install --quiet scanpy==1.6.0
!pip3 install --quiet leidenalg
!pip install --quiet louvain
!pip3 install --quiet biopython
Import Packages¶
In [ ]:
import pandas as pd
import copy
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
import scipy
import numpy as np
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
from collections import OrderedDict
from Bio import SeqIO
import os
from scipy import io
import scipy.io as sio
import time
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
How ClickTag counts and Clustering Are Generated¶
Use kallisto-bustools to align reads to kallisto index of ClickTag barcode whitelist consisting of sequences hamming distance 1 away from original barcodes
In [ ]:
## Set parameters - below are parameters for 10x 3' v3 chemistry
## The cell hashing method uses tags of length 12, we included the variable B (T,C,G) in the end to make it lenght 13
cell_barcode_length = 16
UMI_length = 12
sample_tag_length=11
"""
This function returns all sample tags and and their single base mismatches (hamming distance 1).
ClickTag sequences are provided as a fasta file, and the script indexes the barcode region of the fasta
"""
"parse the tags file and output the set of tag sequences"
def parse_tags(filename):
odict = OrderedDict()
print('Read the following tags:')
for record in SeqIO.parse(filename, "fasta"):
counter=0
print(record.seq)
odict[record.name] = str(record.seq)[26:26+sample_tag_length]
for pos in range(sample_tag_length):
letter =str(record.seq)[26+pos]
barcode=list(str(record.seq)[26:26+sample_tag_length])
if letter=='A':
barcode[pos]='T'
odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
barcode[pos]='G'
odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
barcode[pos]='C'
odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
elif letter=='G':
barcode[pos]='T'
odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
barcode[pos]='A'
odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
barcode[pos]='C'
odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
elif letter=='C':
barcode[pos]='T'
odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
barcode[pos]='G'
odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
barcode[pos]='A'
odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
else:
barcode[pos]='A'
odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
barcode[pos]='G'
odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
barcode[pos]='C'
odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
return odict
In [ ]:
# Make kallisto index from ClickTag sequences
!mv D1.1831 70BPbarcodes.fa
tags_file_path = ("70BPbarcodes.fa") #70BPbarcodes.fa
tag_map=parse_tags(tags_file_path)
work_folder = ''
data_folder = ''#/home/jgehring/scRNAseq/clytia/20190405/all_tag_fastqs//
write_folder = ''
#Write the list of barcodes as a fasta, then make a kallisto index
!mkdir barcode_corrected_tags
tagmap_file_path = "barcode_corrected_tags/70BPbarcodes_tagmap.fa"
tagmap_file = open(tagmap_file_path, "w+")
for i in list(tag_map.keys()):
tagmap_file.write(">" + i + "\n" +tag_map[i] + "\n")
tagmap_file.close()
!kallisto index -i {tagmap_file_path}.idx -k 11 {tagmap_file_path}
!kallisto inspect {tagmap_file_path}.idx
Run kallisto bus on ClickTag fastqs
In [ ]:
#Run kallisto bus on ClickTag fastqs
!mkdir jelly4stim_1_tags_HiSeq
write_folder = 'jelly4stim_1_tags_HiSeq'
!kallisto bus -i {tagmap_file_path}.idx -o {write_folder} -x 10xv3 -t 20 {os.path.join(data_folder,'jelly4stim1MiSeqHiSeq_tags_R1.fastq')} {os.path.join(data_folder,'jelly4stim1MiSeqHiSeq_tags_R2.fastq')}
#sort bus file
!bustools sort -o {os.path.join(write_folder,'output_sorted.bus')} {os.path.join(write_folder,'output.bus')}
# convert the sorted busfile to txt
!bustools text -o {os.path.join(write_folder,'output_sorted.txt')} {os.path.join(write_folder,'output_sorted.bus')}
!mkdir jelly4stim_2_tags_HiSeq
write_folder = 'jelly4stim_2_tags_HiSeq'
!kallisto bus -i {tagmap_file_path}.idx -o {write_folder} -x 10xv3 -t 20 {os.path.join(data_folder,'jelly4stim2MiSeqHiSeq_tags_R1.fastq')} {os.path.join(data_folder,'jelly4stim2MiSeqHiSeq_tags_R2.fastq')} \
#sort bus file
!bustools sort -o {os.path.join(write_folder,'output_sorted.bus')} {os.path.join(write_folder,'output.bus')}
# convert the sorted busfile to txt
!bustools text -o {os.path.join(write_folder,'output_sorted.txt')} {os.path.join(write_folder,'output_sorted.bus')}
In [ ]:
write_folder = '' #/home/jgehring/scRNAseq/clytia/20190405/
bus_data_jelly1 = pd.read_csv(os.path.join(write_folder,'jelly4stim_1_tags_HiSeq/output_sorted.txt'), delimiter='\t', header=None, names = ['barcode', 'umi', 'tag_eqclass', 'multiplicity'])
bus_data_jelly1.head()
bus_data_jelly1
bus_data_jelly2 = pd.read_csv(os.path.join(write_folder,'jelly4stim_2_tags_HiSeq/output_sorted.txt'), delimiter='\t', header=None, names = ['barcode', 'umi', 'tag_eqclass', 'multiplicity'])
bus_data_jelly2.head()
tag_map_df = pd.DataFrame.from_dict(tag_map, orient = 'index').reset_index()
tag_map_df.columns=['tagname','tagseq']
tag_map_df['ClickTag'] = tag_map_df['tagname'].str.split('-').str.get(0)
tag_map_df.head()
bus_data_jelly1['tag']= bus_data_jelly1['tag_eqclass'].map(tag_map_df['ClickTag'])
bus_data_jelly1.head()
bus_data_jelly2['tag']= bus_data_jelly2['tag_eqclass'].map(tag_map_df['ClickTag'])
bus_data_jelly2.head()
print('Counting UMIs')
counted_data_jelly1 = bus_data_jelly1.groupby(['barcode', 'tag'])['umi'].count().reset_index()
counted_data_jelly1.rename(columns={'umi':'umi_counts'}, inplace = True)
counted_data_jelly1.head()
print('Counting UMIs')
counted_data_jelly2 = bus_data_jelly2.groupby(['barcode', 'tag'])['umi'].count().reset_index()
counted_data_jelly2.rename(columns={'umi':'umi_counts'}, inplace = True)
counted_data_jelly2.head()
Out[ ]:
In [ ]:
!mkdir counted_tag_data
In [ ]:
data_dict={'counted_data_jelly1':counted_data_jelly1, 'counted_data_jelly2':counted_data_jelly2}
counted_data_jelly1['barcode']=[x+'-1' for x in counted_data_jelly1['barcode'].values]
counted_data_jelly2['barcode']=[x+'-2' for x in counted_data_jelly2['barcode'].values]
for counted_data in data_dict:
le_barcode = LabelEncoder()
barcode_idx =le_barcode.fit_transform(data_dict[counted_data]['barcode'].values)
print('Barcode index shape:', barcode_idx.shape)
le_umi = LabelEncoder()
umi_idx = le_umi.fit_transform(data_dict[counted_data]['umi_counts'].values)
print('UMI index shape:', umi_idx.shape)
le_tag = LabelEncoder()
tag_idx = le_tag.fit_transform(data_dict[counted_data]['tag'].values)
print('Tag index shape:', tag_idx.shape)
# convert data to csr matrix
csr_matrix_data = scipy.sparse.csr_matrix((data_dict[counted_data]['umi_counts'].values,(barcode_idx,tag_idx)))
scipy.io.mmwrite(os.path.join(write_folder,'counted_tag_data/' + counted_data + '.mtx'),csr_matrix_data)
print('Saved sparse csr matrix')
pd.DataFrame(le_tag.classes_).to_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_tag_labels.csv'), index = False, header = False)
pd.DataFrame(le_barcode.classes_).to_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_barcode_labels.csv'), index = False, header = False)
print('Saved cell barcode and hashtag labels')
print('Number of unique cell barcodes seen:', len(le_barcode.classes_))
In [ ]:
# Clicktag for both 10x lanes concatenated
ClickTagCountsmat=scipy.io.mmread(os.path.join(write_folder,'counted_tag_data/counted_data_jelly1.mtx'))
ClickTagCounts=pd.DataFrame(ClickTagCountsmat.toarray(),
index=list(pd.read_csv(os.path.join(write_folder,'counted_tag_data/counted_data_jelly1_ClickTag_barcode_labels.csv'), header=None).loc[:,0]),
columns=list(pd.read_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_tag_labels.csv'), header=None).loc[:,0]))
ClickTagCountsmat=scipy.io.mmread(os.path.join(write_folder,'counted_tag_data/counted_data_jelly2.mtx'))
ClickTagCounts=ClickTagCounts.append(pd.DataFrame(ClickTagCountsmat.toarray(),
index=list(pd.read_csv(os.path.join(write_folder,'counted_tag_data/counted_data_jelly2_ClickTag_barcode_labels.csv'), header=None).loc[:,0]),
columns=list(pd.read_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_tag_labels.csv'), header=None).loc[:,0])))
ClickTagCounts
Out[ ]:
Use UMI count knee-plot to filter for high-quality ClickTagged cells
In [ ]:
ClickTag_counts_sorted = copy.deepcopy(ClickTagCounts.T.sum())
ClickTag_counts_sorted = ClickTag_counts_sorted.sort_values(ascending=False)
plt.plot(ClickTag_counts_sorted.apply(np.log10),np.log10(range(len(ClickTag_counts_sorted))))
plt.axhline(np.log10(50000))
plt.show()
In [ ]:
ClickTag_counts_filtered=ClickTagCounts.loc[list(ClickTag_counts_sorted[:50000].index)]
ClickTag_counts_filtered=ClickTag_counts_filtered.iloc[:,4:]
filtered_ClickTag_counts=ClickTag_counts_filtered.T
filtered_ClickTag_counts
hits = 0
counter = 1
for barcode in filtered_ClickTag_counts.index:
for i in filtered_ClickTag_counts:
if filtered_ClickTag_counts[i].idxmax() == barcode:
if hits ==0:
sortedheatmap_dtf=pd.DataFrame({counter:filtered_ClickTag_counts[i]})
hits+=1
counter+=1
else:
sortedheatmap_dtf = sortedheatmap_dtf.assign(i = filtered_ClickTag_counts[i])
sortedheatmap_dtf.rename(columns = {'i':counter}, inplace = True)
counter+=1
filtered_ClickTag_counts=copy.deepcopy(sortedheatmap_dtf)
percentClickTags_counts = copy.deepcopy(filtered_ClickTag_counts)
for i in filtered_ClickTag_counts:
percentClickTags_counts[i] = filtered_ClickTag_counts[i]/filtered_ClickTag_counts[i].sum()
In [ ]:
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(np.log1p(filtered_ClickTag_counts), cmap='viridis')
plt.show()
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(percentClickTags_counts, cmap='viridis')
plt.show()
filtered_ClickTag_counts
Out[ ]:
Cluster cells to find clear ClickTag 'expression'
In [ ]:
adata = sc.AnnData(ClickTag_counts_filtered)
adata.var_names= list(ClickTag_counts_filtered.columns)
adata.obs_names= list(ClickTag_counts_filtered[:50000].index)
#pd.read_csv(os.path.join(write_folder,'counted_tag_data/counted_data_jelly2_ClickTag_barcode_labels.csv'), header=None, sep=',')[:15000][0].tolist()
In [ ]:
adata
Out[ ]:
In [ ]:
adata
sc.pp.filter_genes(adata, min_counts=0)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000)
adata.obs['n_countslog']=np.log(adata.obs['n_counts'])
sc.pp.log1p(adata)
sc.pp.regress_out(adata, ['n_counts'])
sc.tl.tsne(adata, perplexity=30, use_rep=None)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.louvain(adata, resolution=2)
sc.pl.tsne(adata, color='n_countslog')
sc.set_figure_params(dpi=120)
sc.pl.tsne(adata,color=['louvain'])
# sc.pl.tsne(adata, color=adata.var_names)
# for i in range(20):
# sc.pl.violin(adata[adata.obs['louvain'].isin([str(i)])], keys=adata.var_names)
# sc.pl.tsne(adata, color=['louvain'],palette='rainbow',legend_loc='on data')
In [ ]:
sc.pl.umap(adata, color='louvain')
sc.pl.umap(adata, color='n_countslog')
sc.pl.umap(adata, color=adata.var_names)