Initial Quantification of ClickTag Counts with Cell Ranger (Starvation Data)
In [ ]:
!date
Download Data¶
In [ ]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
In [ ]:
#70BPbarcodes (ClickTag sequences)
download_file('10.22002/D1.1831','.gz')
#tags1.bam
download_file('10.22002/D1.1815','.gz')
#tags2.bam
download_file('10.22002/D1.1816','.gz')
Out[ ]:
In [ ]:
!gunzip *.gz
In [ ]:
!pip install --quiet pysam
!pip install --quiet scanpy==1.6.0
!pip install --quiet fuzzywuzzy
!pip install --quiet biopython
Import Packages¶
In [ ]:
import pysam
import pickle
import os
import csv
import pandas
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from Bio import SeqIO
import time
import copy
import matplotlib.pyplot as plt
import scipy.io
import seaborn as sns
from collections import Counter
from collections import defaultdict
from collections import OrderedDict
from itertools import islice
from itertools import combinations
import pandas as pd
import time
import locale
#import Levenshtein
import re
import scanpy as sc
import multiprocessing
In [ ]:
from google.colab import drive
drive.mount('/drive')
In [ ]:
## Set parameters - below are parameters for 10x 3' v2 chemistry
cell_barcode_length = 16
UMI_length = 10
!mv D1.1831 70BPbarcodes.fa
!mv D1.1815 tags1.bam
!mv D1.1816 tags2.bam
tags = "70BPbarcodes.fa"
CellRangerOut = "tags1.bam"
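For orientation, here is a minimal sketch (using a made-up read, not data from the BAM files) of how these parameters partition each concatenated cell barcode + UMI + read string that the worker below slices apart:
In [ ]:
# Hypothetical example: 16 bp cell barcode + 10 bp UMI + ClickTag read sequence
example_line = "AAACCTGAGAAACCAT" + "ACGTACGTAC" + "TCGTCGGCAGCGTCAGATGTGTAACGTACGT"
cell_barcode = example_line[:cell_barcode_length]
UMI = example_line[cell_barcode_length:cell_barcode_length + UMI_length]
TAG_seq = example_line[cell_barcode_length + UMI_length:]
print(cell_barcode, UMI, TAG_seq)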
In [ ]:
def parse_tags(filename):
odict = OrderedDict()
for record in SeqIO.parse(filename, "fasta"):
#odict[row[0].encode('utf-8')] = row[1]
odict[record.name] = str(record.seq)[28:36]
return odict
def worker(procnum, unique_lines_full, start, end, celltags, cellbarcodes, return_list):
"""worker function"""
full_dataframe=pd.DataFrame(index=ab_map.keys(), columns=set([x[:16] for x in unique_lines_full]))
full_dataframe.fillna(0, inplace=True)
#variables
res_table = defaultdict(lambda : defaultdict(int))
n=0
for line in unique_lines_full[start:end]:
cell_barcode = line[0:cell_barcode_length]
UMI = line[cell_barcode_length:cell_barcode_length+UMI_length]
BC_UMI = cell_barcode + UMI
TAG_seq = line[len(BC_UMI):]
tagpositions = [TAG_seq[i:(i+23)] for i in range(0,10)]
fuzzpos = process.extractOne("TCGTCGGCAGCGTCAGATGTGTA", tagpositions)
if fuzzpos[1] > 85:
pos = TAG_seq.find(fuzzpos[0])
fuzzbc = process.extractOne(TAG_seq[pos+23:pos+31], list(ab_map.values()))
if fuzzbc[1] > 85:
best = list(ab_map.keys())[list(ab_map.values()).index(fuzzbc[0])]
BC_UMI_TAG = BC_UMI + best
if BC_UMI_TAG not in UMI_reduce:
#print("got one")
res_table[cell_barcode][best]+=1
full_dataframe.loc[best,cell_barcode]+=1
#print(full_dataframe.loc[best,cell_barcode])
UMI_reduce.add(BC_UMI_TAG)
n += 1
#if(n>5):break
if(n%20000==0):
print(n)
print("elapsed time " + str(time.time()-start))
if procnum < 5:
print('hello_world '+cell_barcode)
print(full_dataframe.loc[:,cell_barcode])
#put res_table into dataframe
"""worker function"""
print('worker '+str(procnum) + '\t' + 'start '+str(start) + '\t' + 'end '+str(end))
print(pd.DataFrame(res_table))
print(' ')
return_list.append(full_dataframe)
return
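The two fuzzy-matching steps inside worker() are easiest to see on a toy sequence. Below is a minimal sketch (with a hypothetical two-entry barcode dictionary standing in for ab_map) of how process.extractOne first locates the constant anchor TCGTCGGCAGCGTCAGATGTGTA and then classifies the 8 bp ClickTag barcode that follows it:
In [ ]:
from fuzzywuzzy import process

anchor = "TCGTCGGCAGCGTCAGATGTGTA"                         # constant sequence upstream of the barcode
toy_barcodes = {"TAG_A": "ACGTACGT", "TAG_B": "TTTTAAAA"}  # hypothetical stand-in for ab_map
toy_tag_seq = "AG" + anchor + "ACGTACGT" + "TTTTTT"        # anchor offset by 2 bp, then an 8 bp barcode

# Step 1: scan the first 10 start positions for the window that best matches the anchor
windows = [toy_tag_seq[i:i + 23] for i in range(0, 10)]
best_window, window_score = process.extractOne(anchor, windows)

# Step 2: if the anchor match passes the score threshold, classify the 8 bp barcode just downstream
if window_score > 85:
    pos = toy_tag_seq.find(best_window)
    candidate = toy_tag_seq[pos + 23:pos + 31]
    best_bc, bc_score = process.extractOne(candidate, list(toy_barcodes.values()))
    print(best_window, window_score, best_bc, bc_score)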
In [ ]:
ab_map=parse_tags(tags)
In [ ]:
ab_map
Out[ ]:
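ab_map maps each ClickTag name to the 8 bp barcode at positions 28-36 of its 70 bp oligo. A minimal sketch with a single made-up FASTA record (the real names and sequences come from 70BPbarcodes.fa) of what parse_tags() produces:
In [ ]:
from io import StringIO
from collections import OrderedDict
from Bio import SeqIO

# One made-up 70 bp record; the 8 bp barcode sits at positions 28-36
toy_fasta = StringIO(">TAG_example\n" + "A" * 28 + "ACGTACGT" + "A" * 34 + "\n")
toy_map = OrderedDict()
for record in SeqIO.parse(toy_fasta, "fasta"):
    toy_map[record.name] = str(record.seq)[28:36]
print(toy_map)   # OrderedDict([('TAG_example', 'ACGTACGT')])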
Count lane 1 ClickTags first¶
This step requires multiple Colab sessions to complete.
In [ ]:
"Create a set for UMI reduction. Fast way to collapse UMIs"
UMI_reduce=set()
#Create result table
res_table=defaultdict(lambda : defaultdict(int))
#set counter
n=0
#set number of reads to process
top_n = None
In [ ]:
"""
This section of code processes the entire genome BAM file.
Proccessing is slow due to multiple fuzzy matching steps
Currently configured for 10x v2 and sample tags used in Gehring et. al 2018
The script iterates through the genome BAM, identifies quality barcode sequences with fuzzywuzzy score > 85,
then classifies them.
fuzzpos is a constant sequence just upstream of the barcode
fuzzbc is the barcode sequence extracted for classification
"""
#Load TAGS barcodes
ab_map = parse_tags(tags)
#Create a set for UMI reduction. Fast way to check if it already exists
UMI_reduce = set()
#Create result table
res_table = defaultdict(lambda : defaultdict(int))
res_table_sum = defaultdict(lambda : defaultdict(int))
# set counter
n = 0
top_n = None
unique_lines = set()
start = time.time()
samfile = pysam.AlignmentFile(CellRangerOut, "rb")
for read in samfile.fetch(until_eof=True):
if read.has_tag('CB'):
line = str(read.get_tag('CB'))[:-2] + read.get_tag('UR') + str(read.query_sequence)
unique_lines.add(line)
if top_n:
if top_n < n: break
n += 1
if(n%1000000==0):print(n)
if(n%1000000==0):
print('elapsed time: ' + str(time.time()-start))
samfile.close()
print(str(n) + ' reads loaded')
print(str(len(unique_lines)) + ' unique reads loaded')
print("runtime " + str(time.time() - start))
In [ ]:
# unique_lines = list(unique_lines)
# #unique_lines
Run for the first half of sequences only (in one Colab session)¶
In [ ]:
unique_lines = list(unique_lines)
jobs=[]
manager = multiprocessing.Manager()
return_list = manager.list()
n_threads=4 #4
slicestart=0
celltags=list(ab_map.keys())
cellbarcodes=list(set([x[:16] for x in unique_lines]))
for job in range(2):
#divide up the unique_lines by the number of threads
#call the worker function, passing positions in unique_lines as input
sliceend=slicestart+len(unique_lines)//n_threads #len(unique_lines)
p = multiprocessing.Process(target=worker, args=(job, unique_lines,slicestart, sliceend, celltags, cellbarcodes, return_list))
jobs.append(p)
p.start()
slicestart=sliceend
for p in jobs:
p.join()
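The loop above launches only two of the four possible worker processes, so each Colab session covers roughly half of unique_lines. A minimal sketch (toy list, not the real data) of the slicing scheme, which hands each process one contiguous [start, end) chunk of length len(unique_lines)//n_threads:
In [ ]:
toy_lines = list(range(10))            # stand-in for unique_lines
toy_threads = 4
chunk = len(toy_lines) // toy_threads
toy_start = 0
for job in range(2):                   # two jobs per session, as above
    toy_end = toy_start + chunk
    print('job', job, 'slice', toy_start, toy_end, toy_lines[toy_start:toy_end])
    toy_start = toy_end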
In [ ]:
return_list = list(return_list)
print(return_list)
In [ ]:
print(slicestart)
print(sliceend)
In [ ]:
with open('/drive/MyDrive/listfile_half.data', 'wb') as filehandle:
# store the data as binary data stream
pickle.dump(return_list, filehandle)
Run for the second half of sequences and produce the output file (in a new Colab session)¶
The previous code takes 6+ hours and cannot be completed within a single 12 hr Colab session.
In [ ]:
with open('/drive/MyDrive/listfile_half.data', 'rb') as filehandle:
# read the data as binary data stream
return_list_half = pickle.load(filehandle)
In [ ]:
unique_lines = list(unique_lines)
jobs=[]
manager = multiprocessing.Manager()
return_list = manager.list()
n_threads=4
slicestart=4813282 #resume from the sliceend printed at the end of the first-half session
celltags=list(ab_map.keys())
cellbarcodes=list(set([x[:16] for x in unique_lines]))
for job in range(2):
#divide up the unique_lines by the number of threads
#call the worker function, passing positions in unique_lines as input
sliceend=slicestart+len(unique_lines)//n_threads #len(unique_lines)
p = multiprocessing.Process(target=worker, args=(job, unique_lines,slicestart, sliceend, celltags, cellbarcodes, return_list))
jobs.append(p)
p.start()
slicestart=sliceend
for p in jobs:
p.join()
In [ ]:
pd.DataFrame(res_table)
Out[ ]:
In [ ]:
final_df=pd.DataFrame(index=ab_map.keys(), columns=set([x[:16] for x in unique_lines]))
final_df.fillna(0,inplace=True)
In [ ]:
final_df = return_list[0].fillna(0) + return_list_half[0].fillna(0)
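Because each worker builds its full_dataframe over the full set of cell barcodes, the two half-run results share the same index and columns, so fillna(0) followed by elementwise addition merges them. A minimal sketch with toy counts:
In [ ]:
a = pd.DataFrame({"CELL1": [1, 0], "CELL2": [0, 2]}, index=["TAG_A", "TAG_B"])  # first-half counts
b = pd.DataFrame({"CELL1": [0, 3], "CELL2": [1, 0]}, index=["TAG_A", "TAG_B"])  # second-half counts
print(a.fillna(0) + b.fillna(0))   # aligned on index/columns, counts add elementwise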
In [ ]:
final_df.head()
Out[ ]:
In [ ]:
res_matrix=copy.deepcopy(final_df)
sortedREAP_dtf = copy.deepcopy(res_matrix.sum())
sortedREAP_dtf = sortedREAP_dtf.sort_values(ascending=False)
plt.plot(np.log10(range(len(sortedREAP_dtf))), sortedREAP_dtf.apply(np.log10))
plt.show()
In [ ]:
"""
Have a look at the data
Columns of the matrix are cells
Rows are tag counts for each cell
Only 8 of twenty possible tags were used in this experiment (2 tags for each of 10 samples)
"""
res_matrix.fillna(0, inplace=True)
res_matrix.loc[:,sortedREAP_dtf.index[6000:6020]]
Out[ ]:
In [ ]:
res_matrix.to_csv(path_or_buf="jelly3tags1counts.csv")
Analysis of lane 2 ClickTags, following the same protocol as lane 1¶
In [ ]:
CellRangerOut = "tags2.bam"
In [ ]:
"Create a set for UMI reduction. Fast way to collapse UMIs"
UMI_reduce=set()
#Create result table
res_table=defaultdict(lambda : defaultdict(int))
#set counter
n=0
#set number of reads to process
top_n = None
In [ ]:
"""
This section of code processes the entire genome BAM file. Runtime is about one hour for 3 million reads.
Proccessing is slow due to multiple fuzzy matching steps
Currently configured for 10x v2 and sample tags used in Gehring et. al 2018
The script iterates through the genome BAM, identifies quality barcode sequences with fuzzywuzzy score > 85,
then classifies them.
fuzzpos is a constant sequence just upstream of the barcode
fuzzbc is the barcode sequence extracted for classification
"""
#Load TAGS barcodes
ab_map = parse_tags(tags)
#Create a set for UMI reduction. Fast way to check if it already exists
UMI_reduce = set()
#Create result table
res_table = defaultdict(lambda : defaultdict(int))
res_table_sum = defaultdict(lambda : defaultdict(int))
# set counter
n = 0
top_n = None
unique_lines = set()
start = time.time()
samfile = pysam.AlignmentFile(CellRangerOut, "rb")
for read in samfile.fetch(until_eof=True):
if read.has_tag('CB'):
line = str(read.get_tag('CB'))[:-2] + read.get_tag('UR') + str(read.query_sequence)
unique_lines.add(line)
if top_n:
if top_n < n: break
n += 1
if(n%1000000==0):print(n)
if(n%1000000==0):
print('elapsed time: ' + str(time.time()-start))
samfile.close()
print(str(n) + ' reads loaded')
print(str(len(unique_lines)) + ' unique reads loaded')
print("runtime " + str(time.time() - start))
Run for the first half of sequences only (in one Colab session)¶
In [ ]:
unique_lines = list(unique_lines)
jobs=[]
manager = multiprocessing.Manager()
return_list = manager.list()
n_threads=4 #4
slicestart=0
celltags=list(ab_map.keys())
cellbarcodes=list(set([x[:16] for x in unique_lines]))
for job in range(2):
#divide up the unique_lines by the number of threads
#call the worker function, passing positions in unique_lines as input
sliceend=slicestart+len(unique_lines)//n_threads #len(unique_lines)
p = multiprocessing.Process(target=worker, args=(job, unique_lines,slicestart, sliceend, celltags, cellbarcodes, return_list))
jobs.append(p)
p.start()
slicestart=sliceend
for p in jobs:
p.join()
In [ ]:
return_list = list(return_list)
print(return_list)
In [ ]:
print(slicestart)
print(sliceend)
In [ ]:
with open('/drive/MyDrive/listfile_half_tags2.data', 'wb') as filehandle:
# store the data as binary data stream
pickle.dump(return_list, filehandle)
Run for the second half of sequences and produce the output file (in a new Colab session)¶
The previous code takes 6+ hours and cannot be completed within a single 12 hr Colab session.
In [ ]:
with open('/drive/MyDrive/listfile_half_tags2.data', 'rb') as filehandle:
# read the data as binary data stream
return_list_half = pickle.load(filehandle)
In [ ]:
unique_lines = list(unique_lines)
jobs=[]
manager = multiprocessing.Manager()
return_list = manager.list()
n_threads=4
slicestart=4813282 #resume from the sliceend printed at the end of the first-half session
celltags=list(ab_map.keys())
cellbarcodes=list(set([x[:16] for x in unique_lines]))
for job in range(2):
#divide up the unique_lines by the number of threads
#call the worker function, passing positions in unique_lines as input
sliceend=slicestart+len(unique_lines)//n_threads #len(unique_lines)
p = multiprocessing.Process(target=worker, args=(job, unique_lines,slicestart, sliceend, celltags, cellbarcodes, return_list))
jobs.append(p)
p.start()
slicestart=sliceend
for p in jobs:
p.join()
In [ ]:
pd.DataFrame(res_table)
Out[ ]:
In [ ]:
final_df=pd.DataFrame(index=ab_map.keys(), columns=set([x[:16] for x in unique_lines]))
final_df.fillna(0,inplace=True)
In [ ]:
final_df = return_list[0].fillna(0) + return_list_half[0].fillna(0)
In [ ]:
final_df.head()
Out[ ]:
In [ ]:
res_matrix=copy.deepcopy(final_df)
sortedREAP_dtf = copy.deepcopy(res_matrix.sum())
sortedREAP_dtf = sortedREAP_dtf.sort_values(ascending=False)
plt.plot(np.log10(range(len(sortedREAP_dtf))), sortedREAP_dtf.apply(np.log10))
plt.show()
In [ ]:
"""
Have a look at the data
Columns of the matrix are cells
Rows are tag counts for each cell
Only 8 of twenty possible tags were used in this experiment (2 tags for each of 4 samples)
"""
res_matrix.fillna(0, inplace=True)
res_matrix.loc[:,sortedREAP_dtf.index[5000:5200]]
In [ ]:
res_matrix.to_csv(path_or_buf="jelly3tags2counts.csv")
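If the two lane-level count matrices are needed in a later session, they can be reloaded directly from the CSVs written above (a minimal sketch; the first column holds the ClickTag names):
In [ ]:
lane1_counts = pd.read_csv("jelly3tags1counts.csv", index_col=0)
lane2_counts = pd.read_csv("jelly3tags2counts.csv", index_col=0)
print(lane1_counts.shape, lane2_counts.shape)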