Quantification of cDNA Counts with kallisto-bustools (All Data)
In [ ]:
!date
Download Data¶
In [ ]:
#Install kallisto and bustools
!wget --quiet https://github.com/pachterlab/kallisto/releases/download/v0.46.2/kallisto_linux-v0.46.2.tar.gz
!tar -xf kallisto_linux-v0.46.2.tar.gz
!cp kallisto/kallisto /usr/local/bin/
!wget --quiet https://github.com/BUStools/bustools/releases/download/v0.40.0/bustools_linux-v0.40.0.tar.gz
!tar -xf bustools_linux-v0.40.0.tar.gz
!cp bustools/bustools /usr/local/bin/
In [ ]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
In [ ]:
#Get reference data () fastq's)
#Trinity Transcripts
download_file('10.22002/D1.1825','.gz')
#Gff3 (Trinity)
download_file('10.22002/D1.1824','.gz')
Out[ ]:
In [ ]:
# Get doi links for all Starvation cDNA fastq.gz files
starvFiles = []
dois = ['10.22002/D1.1840','10.22002/D1.1841','10.22002/D1.1842','10.22002/D1.1843',
'10.22002/D1.1844','10.22002/D1.1845','10.22002/D1.1846','10.22002/D1.1847',
'10.22002/D1.1848','10.22002/D1.1849','10.22002/D1.1850','10.22002/D1.1851',
'10.22002/D1.1852','10.22002/D1.1853','10.22002/D1.1854','10.22002/D1.1855'] #16 doi numbers
for doi in dois:
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
starvFiles += [netcdf_url]
s1 = starvFiles[0]
s2 = starvFiles[1]
s3 = starvFiles[2]
s4 = starvFiles[3]
s5 = starvFiles[4]
s6 = starvFiles[5]
s7 = starvFiles[6]
s8 = starvFiles[7]
s9 = starvFiles[8]
s10 = starvFiles[9]
s11 = starvFiles[10]
s12 = starvFiles[11]
s13 = starvFiles[12]
s14 = starvFiles[13]
s15 = starvFiles[14]
s16 = starvFiles[15]
In [ ]:
# Get doi links for all Stimulation cDNA fastq.gz files
stimFiles = []
dois = ['10.22002/D1.1860','10.22002/D1.1863','10.22002/D1.1864','10.22002/D1.1865',
'10.22002/D1.1866','10.22002/D1.1868','10.22002/D1.1870','10.22002/D1.1871'] #8 numbers
for doi in dois:
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
stimFiles += [netcdf_url]
stim1 = stimFiles[0]
stim2 = stimFiles[1]
stim3 = stimFiles[2]
stim4 = stimFiles[3]
stim5 = stimFiles[4]
stim6 = stimFiles[5]
stim7 = stimFiles[6]
stim8 = stimFiles[7]
In [ ]:
#Get original, CellRanger clustered starvation adata
download_file('10.22002/D1.1798','.gz')
#Cell barcodes selected from Stim ClickTags
download_file('10.22002/D1.1817','.gz')
Out[ ]:
In [ ]:
!gunzip *.gz
In [ ]:
!pip install --quiet anndata
!pip install --quiet scanpy==1.6.0
!pip install --quiet louvain
Import Packages¶
In [ ]:
import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt
%matplotlib inline
sc.set_figure_params(dpi=125)
Run kallisto bus on data with Starvation cDNA data¶
In [ ]:
#Make Kallisto index (referene https://www.kallistobus.tools/getting_started)
!mv D1.1825 transcripts.fa
!kallisto index -i clytia_trin.idx -k 31 transcripts.fa
Run kallisto for one set of samples
In [ ]:
#Create BUS files from fastq's, can't do separate lines
!mkfifo R1.gz R2.gz R1_02.gz R2_02.gz R1_03.gz R2_03.gz R1_04.gz R2_04.gz; curl -Ls $s1 > R1.gz & curl -Ls $s2 > R2.gz & curl -Ls $s3 > R1_02.gz & curl -Ls $s4 > R2_02.gz & curl -Ls $s5 > R1_03.gz & curl -Ls $s6 > R2_03.gz & curl -Ls $s7 > R1_04.gz & curl -Ls $s8 > R2_04.gz & kallisto bus -i clytia_trin.idx -o bus_output/ -x 10xv2 -t 2 R1.gz R2.gz R1_02.gz R2_02.gz R1_03.gz R2_03.gz R1_04.gz R2_04.gz
In [ ]:
#Generate gene-count matrices
!wget --quiet https://github.com/bustools/getting_started/releases/download/getting_started/10xv2_whitelist.txt
#Make t2g file
!mv D1.1824 trinity.gff3
!awk '{ print $12"\t"$10}' trinity.gff3 > t2g_rough.txt
!sed 's/[";]//g' t2g_rough.txt > t2g_trin.txt
In [ ]:
#!cd bus_output/
!mkdir bus_output/genecount/ bus_output/tmp/
!bustools correct -w 10xv2_whitelist.txt -p bus_output/output.bus | bustools sort -T bus_output/tmptmp/ -t 2 -p - | bustools count -o bus_output/genecount/genes -g t2g_trin.txt -e bus_output/matrix.ec -t bus_output/transcripts.txt --genecounts -
Run kallisto for other sample set
In [ ]:
#Create BUS files from fastq's
!mkfifo R1_new.gz R2_new.gz R1_02_new.gz R2_02_new.gz R1_03_new.gz R2_03_new.gz R1_04_new.gz R2_04_new.gz; curl -Ls $s9 > R1_new.gz & curl -Ls $s10 > R2_new.gz & curl -Ls $s11 > R1_02_new.gz & curl -Ls $s12 > R2_02_new.gz & curl -Ls $s13 > R1_03_new.gz & curl -Ls $s14 > R2_03_new.gz & curl -Ls $s15 > R1_04_new.gz & curl -Ls $s16 > R2_04_new.gz & kallisto bus -i clytia_trin.idx -o bus_output_02/ -x 10xv2 -t 2 R1_new.gz R2_new.gz R1_02_new.gz R2_02_new.gz R1_03_new.gz R2_03_new.gz R1_04_new.gz R2_04_new.gz
In [ ]:
#Generate gene-count matrices
!cd bus_output_02/
!mkdir bus_output_02/genecount/ bus_output_02/tmp/
!bustools correct -w 10xv2_whitelist.txt -p bus_output_02/output.bus | bustools sort -T bus_output_02/tmp/ -t 2 -p - | bustools count -o bus_output_02/genecount/genes -g t2g_trin.txt -e bus_output_02/matrix.ec -t bus_output_02/transcripts.txt --genecounts -
Merge matrices (Add -1 to first and -2 to second dataset)
In [ ]:
path = "bus_output/genecount/"
jelly_adata_01 = sc.read(path+'genes.mtx', cache=True)
jelly_adata_01.var_names = pd.read_csv(path+'genes.genes.txt', header=None)[0]
jelly_adata_01.obs_names = pd.read_csv(path+'genes.barcodes.txt', header=None)[0]
jelly_adata_01.obs_names = [i+"-1" for i in jelly_adata_01.obs_names]
In [ ]:
path = "bus_output_02/genecount/"
jelly_adata_02 = sc.read(path+'genes.mtx', cache=True)
jelly_adata_02.var_names = pd.read_csv(path+'genes.genes.txt', header=None)[0]
jelly_adata_02.obs_names = pd.read_csv(path+'genes.barcodes.txt', header=None)[0]
jelly_adata_02.obs_names = [i+"-2" for i in jelly_adata_02.obs_names]
In [ ]:
jelly_adata = jelly_adata_01.concatenate(jelly_adata_02,join='outer', index_unique=None)
Filter cell barcodes by previous selection done with ClickTag counts
In [ ]:
# Filter barcodes by 'real' cells
cellR = anndata.read('D1.1798')
cells = list(cellR.obs_names)
jelly_adata = jelly_adata[cells,:]
jelly_adata
Out[ ]:
In [ ]:
jelly_adata.write('fedStarved_raw.h5ad')
Run kallisto bus on data with Stimulation cDNA data¶
Run kallisto for one set of samples
In [ ]:
#Create BUS files from fastq's, can't do separate lines
!mkfifo R1_stim.gz R2_stim.gz R1_02_stim.gz R2_02_stim.gz ; curl -Ls $stim1 > R1_stim.gz & curl -Ls $stim2 > R2_stim.gz & curl -Ls $stim3 > R1_02_stim.gz & curl -Ls $stim4 > R2_02_stim.gz & kallisto bus -i clytia_trin.idx -o bus_output/ -x 10xv3 -t 2 R1_stim.gz R2_stim.gz R1_02_stim.gz R2_02_stim.gz
In [ ]:
#Generate gene-count matrices
!wget --quiet https://github.com/bustools/getting_started/releases/download/getting_started/10xv3_whitelist.txt
In [ ]:
#!cd bus_output/
!mkdir bus_output/genecount/ bus_output/tmp/
!bustools correct -w 10xv3_whitelist.txt -p bus_output/output.bus | bustools sort -T bus_output/tmptmp/ -t 2 -p - | bustools count -o bus_output/genecount/genes -g t2g_trin.txt -e bus_output/matrix.ec -t bus_output/transcripts.txt --genecounts -
In [ ]:
!ls test
Run kallisto for other sample set
In [ ]:
#Create BUS files from fastq's
!mkfifo R1_stim2.gz R2_stim2.gz R1_02_stim2.gz R2_02_stim2.gz ; curl -Ls $stim5 > R1_stim2.gz & curl -Ls $stim6 > R2_stim2.gz & curl -Ls $stim7 > R1_02_stim2.gz & curl -Ls $stim8 > R2_02_stim2.gz & kallisto bus -i clytia_trin.idx -o bus_output_02/ -x 10xv3 -t 2 R1_stim2.gz R2_stim2.gz R1_02_stim2.gz R2_02_stim2.gz
In [ ]:
#Generate gene-count matrices
!cd bus_output_02/
!mkdir bus_output_02/genecount/ bus_output_02/tmp/
!bustools correct -w 10xv3_whitelist.txt -p bus_output_02/output.bus | bustools sort -T bus_output_02/tmp/ -t 2 -p - | bustools count -o bus_output_02/genecount/genes -g t2g_trin.txt -e bus_output_02/matrix.ec -t bus_output_02/transcripts.txt --genecounts -
Merge matrices (Add -1 to first and -2 to second dataset)
In [ ]:
path = "bus_output/genecount/"
jelly_adata_01 = sc.read(path+'genes.mtx', cache=True)
jelly_adata_01.var_names = pd.read_csv(path+'genes.genes.txt', header=None)[0]
jelly_adata_01.obs_names = pd.read_csv(path+'genes.barcodes.txt', header=None)[0]
jelly_adata_01.obs_names = [i+"-1" for i in jelly_adata_01.obs_names]
In [ ]:
path = "bus_output_02/genecount/"
jelly_adata_02 = sc.read(path+'genes.mtx', cache=True)
jelly_adata_02.var_names = pd.read_csv(path+'genes.genes.txt', header=None)[0]
jelly_adata_02.obs_names = pd.read_csv(path+'genes.barcodes.txt', header=None)[0]
jelly_adata_02.obs_names = [i+"-2" for i in jelly_adata_02.obs_names]
In [ ]:
jelly_adata = jelly_adata_01.concatenate(jelly_adata_02,join='outer', index_unique=None)
Filter cell barcodes by previous filtering done with ClickTag counts
In [ ]:
# Filter barcodes by 'real' cells
!mv D1.1817 jelly4stim_individs_tagCells_50k.mat
barcodes_list = sio.loadmat('jelly4stim_individs_tagCells_50k.mat')
barcodes_list.pop('__header__', None)
barcodes_list.pop('__version__', None)
barcodes_list.pop('__globals__', None)
# Add all cell barcodes for each individual
barcodes = []
for b in barcodes_list:
if barcodes_list[b] != "None":
barcodes.append(b)
print(len(barcodes))
barcodes = [s.replace('-1', '-3') for s in barcodes]
barcodes = [s.replace('-2', '-1') for s in barcodes]
barcodes = [s.replace('-3', '-2') for s in barcodes]
In [ ]:
jelly_adata = jelly_adata[barcodes,:]
In [ ]:
jelly_adata.write('stimulation_raw.h5ad')
In [ ]: