Skip to content

Quantification of ClickTag Counts with kallisto-bustools (Starvation Data)

Open In Colab

In [ ]:
!date

Download Data

In [1]:
#Install kallisto and bustools

!wget --quiet https://github.com/pachterlab/kallisto/releases/download/v0.46.2/kallisto_linux-v0.46.2.tar.gz
!tar -xf kallisto_linux-v0.46.2.tar.gz
!cp kallisto/kallisto /usr/local/bin/

!wget --quiet https://github.com/BUStools/bustools/releases/download/v0.40.0/bustools_linux-v0.40.0.tar.gz
!tar -xf bustools_linux-v0.40.0.tar.gz
!cp bustools/bustools /usr/local/bin/
In [2]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
    url = 'https://api.datacite.org/dois/'+doi+'/media'
    r = requests.get(url).json()
    netcdf_url = r['data'][0]['attributes']['url']
    r = requests.get(netcdf_url,stream=True)
    #Set file name
    fname = doi.split('/')[-1]+ext
    #Download file with progress bar
    if r.status_code == 403:
        print("File Unavailable")
    if 'content-length' not in r.headers:
        print("Did not get file")
    else:
        with open(fname, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            pbar = tnrange(int(total_length/1024), unit="B")
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update()
                    f.write(chunk)
        return fname
In [3]:
# Get ClickTag Barcodes/Sequences
download_file('10.22002/D1.1831','.gz')

#All ClickTag MiSeq fastqs
download_file('10.22002/D1.1856','.tar.gz')

# jelly3tags1counts.csv
download_file('10.22002/D1.1799','.gz')

# jelly3tags2counts.csv
download_file('10.22002/D1.1800','.gz')

# cellRanger fs/h5ad
download_file('10.22002/D1.1798','.gz')
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`

Out[3]:
'D1.1798.gz'
In [4]:
!tar -xf D1.1856.tar.gz
In [5]:
!gunzip D1.1831.gz D1.1799.gz D1.1800.gz D1.1798.gz
In [6]:
!pip install --quiet anndata
!pip install --quiet scanpy==1.7.0rc1
!pip install --quiet louvain
!pip3 install --quiet biopython
     |████████████████████████████████| 133kB 6.7MB/s 
     |████████████████████████████████| 10.2MB 6.6MB/s 
     |████████████████████████████████| 71kB 11.8MB/s 
     |████████████████████████████████| 71kB 10.9MB/s 
  Building wheel for umap-learn (setup.py) ... done
  Building wheel for sinfo (setup.py) ... done
     |████████████████████████████████| 2.2MB 8.4MB/s 
     |████████████████████████████████| 3.2MB 37.6MB/s 
     |████████████████████████████████| 2.3MB 7.0MB/s 

Import Packages

In [7]:
import pandas as pd
import copy
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
import scipy
import numpy as np
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc

from collections import OrderedDict
from Bio import SeqIO
import os
from scipy import io
import scipy.io as sio
import time


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Quantify ClickTags with kallisto-bustools for Selecting High-Quality (Tagged) Cells

Create map of all ClickTag barcodes and sequences Hamming distance 1 away from these barcodes

In [8]:
## Set parameters - below are parameters for 10x 3' v2 chemistry
## The cell hashing method uses tags of length 12, we included the variable B (T,C,G) in the end to make it lenght 13
cell_barcode_length = 16
UMI_length = 12
sample_tag_length=11

"""
This function returns all sample tags and and their single base mismatches (hamming distance 1).
ClickTag sequences are provided as a fasta file, and the script indexes the barcode region of the fasta
"""

"parse the tags file and output the set of tag sequences"

def parse_tags(filename):
    odict = OrderedDict()
    print('Read the following tags:')
    for record in SeqIO.parse(filename, "fasta"):
        counter=0
        print(record.seq)
        odict[record.name] = str(record.seq)[26:26+sample_tag_length]
        for pos in range(sample_tag_length):
            letter =str(record.seq)[26+pos]
            barcode=list(str(record.seq)[26:26+sample_tag_length])
            if letter=='A':
                barcode[pos]='T'
                odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
                barcode[pos]='G'
                odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
                barcode[pos]='C'
                odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
            elif letter=='G':
                barcode[pos]='T'
                odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
                barcode[pos]='A'
                odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
                barcode[pos]='C'
                odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
            elif letter=='C':
                barcode[pos]='T'
                odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
                barcode[pos]='G'
                odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
                barcode[pos]='A'
                odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
            else:
                barcode[pos]='A'
                odict[record.name+'-'+str(pos)+'-1'] = "".join(barcode)
                barcode[pos]='G'
                odict[record.name+'-'+str(pos)+'-2'] = "".join(barcode)
                barcode[pos]='C'
                odict[record.name+'-'+str(pos)+'-3'] = "".join(barcode)
                
    return odict
In [9]:
# Make kallisto index from ClickTag sequences

!mv D1.1831 70BPbarcodes.fa
tags_file_path = ("70BPbarcodes.fa") #70BPbarcodes.fa
tag_map=parse_tags(tags_file_path)


work_folder = ''
data_folder = ''
write_folder = ''

#Write the list of barcodes as a fasta, then make a kallisto index
!mkdir barcode_corrected_tags
tagmap_file_path = "barcode_corrected_tags/70BPbarcodes_tagmap.fa"

tagmap_file = open(tagmap_file_path, "w+")
for i in list(tag_map.keys()):
    tagmap_file.write(">" + i + "\n" +tag_map[i] + "\n")
tagmap_file.close()

!kallisto index -i {tagmap_file_path}.idx -k 11 {tagmap_file_path}
!kallisto inspect {tagmap_file_path}.idx
Read the following tags:
CTGGGTCGTCGGCAGCGTCAGATGTGTAAGCAGTTACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTACTTGTACCCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTAGAACCCGGCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTATCGTAGATCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTAACGCGGAACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTACGCTATCCCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTAGTTGCATGCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTATAAATCGTCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTAATCGCCATCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTACATAAAGGCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTATCACGGTACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTACACTCAACCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTAGCTGTGTACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTATTGCGTCGCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTAATATGAGACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTACACCTCAGCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTAGCTACTTCCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTATGGGAGCTCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTAATCCGGCACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
CTGGGTCGTCGGCAGCGTCAGATGTGTACCGTTATGCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

[build] loading fasta file barcode_corrected_tags/70BPbarcodes_tagmap.fa
[build] k-mer length: 11
[build] counting k-mers ... done.
[build] building target de Bruijn graph ...  done 
[build] creating equivalence classes ...  done
[build] target de Bruijn graph has 680 contigs and contains 680 k-mers 

[index] k-mer length: 11
[index] number of targets: 680
[index] number of k-mers: 680
[index] number of equivalence classes: 680
#[inspect] Index version number = 10
#[inspect] k = 11
#[inspect] number of targets = 680
#[inspect] number of equivalence classes = 680
#[inspect] number of contigs = 680
#[inspect] Number of k-mers in index = 680
#EC.size	Num.targets
1	680


#EC.size	Num.kmers
1	680

Quantify ClickTags with kallisto bustools and barcode whitelist

In [10]:
#Run kallisto bus on ClickTag fastqs

!mkdir jelly3fedstarved_1_tags_HiSeq
write_folder = 'jelly3fedstarved_1_tags_HiSeq'

R1_1='Jelly3tags1_S1_L001_R1_001.fastq.gz'
R2_1='Jelly3tags1_S1_L001_R2_001.fastq.gz'

!kallisto bus -i {tagmap_file_path}.idx -o {write_folder} -x 10xv2 -t 20 {R1_1} {R2_1}

#sort bus file
!bustools sort -o {os.path.join(write_folder,'output_sorted.bus')} {os.path.join(write_folder,'output.bus')}

# convert the sorted busfile to txt
!bustools text -o {os.path.join(write_folder,'output_sorted.txt')} {os.path.join(write_folder,'output_sorted.bus')}


!mkdir jelly3fedstarved_2_tags_HiSeq
write_folder = 'jelly3fedstarved_2_tags_HiSeq'

R1_2='Jelly3tags2_S2_L001_R1_001.fastq.gz'
R2_2='Jelly3tags2_S2_L001_R2_001.fastq.gz'

!kallisto bus -i {tagmap_file_path}.idx -o {write_folder} -x 10xv2 -t 20 {R1_2} {R2_2}

#sort bus file
!bustools sort -o {os.path.join(write_folder,'output_sorted.bus')} {os.path.join(write_folder,'output.bus')}

# convert the sorted busfile to txt
!bustools text -o {os.path.join(write_folder,'output_sorted.txt')} {os.path.join(write_folder,'output_sorted.bus')}
Warning: you asked for 20, but only 2 cores on the machine
[index] k-mer length: 11
[index] number of targets: 680
[index] number of k-mers: 680
[index] number of equivalence classes: 680
[quant] will process sample 1: Jelly3tags1_S1_L001_R1_001.fastq.gz
                               Jelly3tags1_S1_L001_R2_001.fastq.gz
[quant] finding pseudoalignments for the reads ... done
[quant] processed 11,225,060 reads, 10,004,853 reads pseudoaligned

Read in 10004853 BUS records
Read in 9946557 BUS records

Warning: you asked for 20, but only 2 cores on the machine
[index] k-mer length: 11
[index] number of targets: 680
[index] number of k-mers: 680
[index] number of equivalence classes: 680
[quant] will process sample 1: Jelly3tags2_S2_L001_R1_001.fastq.gz
                               Jelly3tags2_S2_L001_R2_001.fastq.gz
[quant] finding pseudoalignments for the reads ... done
[quant] processed 10,090,654 reads, 8,879,356 reads pseudoaligned

Read in 8879356 BUS records
Read in 8826794 BUS records
In [11]:
write_folder = '' 

bus_data_jelly1 = pd.read_csv(os.path.join(write_folder,'jelly3fedstarved_1_tags_HiSeq/output_sorted.txt'), delimiter='\t', header=None, names = ['barcode', 'umi', 'tag_eqclass', 'multiplicity'])
bus_data_jelly1.head()

bus_data_jelly1

bus_data_jelly2 = pd.read_csv(os.path.join(write_folder,'jelly3fedstarved_2_tags_HiSeq/output_sorted.txt'), delimiter='\t', header=None, names = ['barcode', 'umi', 'tag_eqclass', 'multiplicity'])
bus_data_jelly2.head()

tag_map_df = pd.DataFrame.from_dict(tag_map, orient = 'index').reset_index()
tag_map_df.columns=['tagname','tagseq']
tag_map_df['ClickTag'] = tag_map_df['tagname'].str.split('-').str.get(0)
tag_map_df.head()

bus_data_jelly1['tag']= bus_data_jelly1['tag_eqclass'].map(tag_map_df['ClickTag'])
bus_data_jelly1.head()

bus_data_jelly2['tag']= bus_data_jelly2['tag_eqclass'].map(tag_map_df['ClickTag'])
bus_data_jelly2.head()



print('Counting UMIs')
counted_data_jelly1 = bus_data_jelly1.groupby(['barcode', 'tag'])['umi'].count().reset_index()
counted_data_jelly1.rename(columns={'umi':'umi_counts'}, inplace = True)
counted_data_jelly1.head()

print('Counting UMIs')
counted_data_jelly2 = bus_data_jelly2.groupby(['barcode', 'tag'])['umi'].count().reset_index()
counted_data_jelly2.rename(columns={'umi':'umi_counts'}, inplace = True)
counted_data_jelly2.head()
Counting UMIs
Counting UMIs
Out[11]:
barcode tag umi_counts
0 AAAAAAAAAAAAAAAA BC_23 1
1 AAAAAAAAAAAAAAAA BC_32 1
2 AAAAAAAAAAAAAAAA BC_34 1
3 AAAAAAAAAAAAAAAA BC_40 1
4 AAAAAAAAAAAAAAAC BC_34 1
In [12]:
!mkdir counted_tag_data
In [13]:
data_dict={'counted_data_jelly1':counted_data_jelly1, 'counted_data_jelly2':counted_data_jelly2}
counted_data_jelly1['barcode']=[x+'-1' for x in counted_data_jelly1['barcode'].values]
counted_data_jelly2['barcode']=[x+'-2' for x in counted_data_jelly2['barcode'].values]



for counted_data in data_dict:

    le_barcode = LabelEncoder()
    barcode_idx =le_barcode.fit_transform(data_dict[counted_data]['barcode'].values)
    print('Barcode index shape:', barcode_idx.shape)

    le_umi = LabelEncoder()
    umi_idx = le_umi.fit_transform(data_dict[counted_data]['umi_counts'].values)
    print('UMI index shape:', umi_idx.shape)

    le_tag = LabelEncoder()
    tag_idx = le_tag.fit_transform(data_dict[counted_data]['tag'].values)
    print('Tag index shape:', tag_idx.shape)
    
    # convert data to csr matrix
    csr_matrix_data = scipy.sparse.csr_matrix((data_dict[counted_data]['umi_counts'].values,(barcode_idx,tag_idx)))
    
    
    scipy.io.mmwrite(os.path.join(write_folder,'counted_tag_data/' + counted_data + '.mtx'),csr_matrix_data)
    print('Saved sparse csr matrix')
    
    pd.DataFrame(le_tag.classes_).to_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_tag_labels.csv'), index = False, header = False)
    pd.DataFrame(le_barcode.classes_).to_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_barcode_labels.csv'), index = False, header = False)
    print('Saved cell barcode and hashtag labels')
    print('Number of unique cell barcodes seen:', len(le_barcode.classes_))
Barcode index shape: (2019238,)
UMI index shape: (2019238,)
Tag index shape: (2019238,)
Saved sparse csr matrix
Saved cell barcode and hashtag labels
Number of unique cell barcodes seen: 463514
Barcode index shape: (1836152,)
UMI index shape: (1836152,)
Tag index shape: (1836152,)
Saved sparse csr matrix
Saved cell barcode and hashtag labels
Number of unique cell barcodes seen: 425169
In [14]:
# Clicktag for both 10x lanes concatenated

ClickTagCountsmat=scipy.io.mmread(os.path.join(write_folder,'counted_tag_data/counted_data_jelly1.mtx'))
ClickTagCounts=pd.DataFrame(ClickTagCountsmat.toarray(), 
                            index=list(pd.read_csv(os.path.join(write_folder,'counted_tag_data/counted_data_jelly1_ClickTag_barcode_labels.csv'), header=None).loc[:,0]),
                            columns=list(pd.read_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_tag_labels.csv'), header=None).loc[:,0]))

ClickTagCountsmat=scipy.io.mmread(os.path.join(write_folder,'counted_tag_data/counted_data_jelly2.mtx'))
ClickTagCounts=ClickTagCounts.append(pd.DataFrame(ClickTagCountsmat.toarray(), 
                            index=list(pd.read_csv(os.path.join(write_folder,'counted_tag_data/counted_data_jelly2_ClickTag_barcode_labels.csv'), header=None).loc[:,0]),
                            columns=list(pd.read_csv(os.path.join(write_folder,'counted_tag_data/' + counted_data + '_ClickTag_tag_labels.csv'), header=None).loc[:,0])))


ClickTagCounts
Out[14]:
BC_21 BC_22 BC_23 BC_24 BC_25 BC_26 BC_27 BC_28 BC_29 BC_30 BC_31 BC_32 BC_33 BC_34 BC_35 BC_36 BC_37 BC_38 BC_39 BC_40
AAAAAAAAAAAAAAAA-1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
AAAAAAAAAAAAAAAC-1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
AAAAAAAAAAAAAAAG-1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
AAAAAAAAAAAAAAAT-1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
AAAAAAAAAAAAAATT-1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTTTGTCGAAGCGCG-2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
TTTTTGTGTGCCTGAG-2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
TTTTTTACCTTTAGAC-2 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
TTTTTTAGGTGGTCCG-2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
TTTTTTTTTTTTTTTT-2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0

888683 rows × 20 columns

Use knee-plot to set cutoff for real cells with high-quality ClickTags

In [15]:
ClickTag_counts_sorted = copy.deepcopy(ClickTagCounts.T.sum())
ClickTag_counts_sorted = ClickTag_counts_sorted.sort_values(ascending=False)
plt.plot(ClickTag_counts_sorted.apply(np.log10),np.log10(range(len(ClickTag_counts_sorted))))
plt.axhline(np.log10(50000))
plt.show()
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: RuntimeWarning: divide by zero encountered in log10
  This is separate from the ipykernel package so we can avoid doing imports until
In [16]:
ClickTag_counts_filtered=ClickTagCounts.loc[list(ClickTag_counts_sorted[:50000].index)]
filtered_ClickTag_counts=ClickTag_counts_filtered.T
filtered_ClickTag_counts



hits = 0
counter = 1
for barcode in filtered_ClickTag_counts.index:
    for i in filtered_ClickTag_counts:    
        if filtered_ClickTag_counts[i].idxmax() == barcode:
            if hits ==0:
                sortedheatmap_dtf=pd.DataFrame({counter:filtered_ClickTag_counts[i]})
                hits+=1
                counter+=1
            else:
               sortedheatmap_dtf = sortedheatmap_dtf.assign(i = filtered_ClickTag_counts[i])
               sortedheatmap_dtf.rename(columns = {'i':counter}, inplace = True)
               counter+=1

filtered_ClickTag_counts=copy.deepcopy(sortedheatmap_dtf)

percentClickTags_counts = copy.deepcopy(filtered_ClickTag_counts)
for i in filtered_ClickTag_counts:
        percentClickTags_counts[i] = filtered_ClickTag_counts[i]/filtered_ClickTag_counts[i].sum()
In [17]:
fig, ax = plt.subplots(figsize=(10,10))  
sns.heatmap(np.log1p(filtered_ClickTag_counts), cmap='viridis')
plt.show()

fig, ax = plt.subplots(figsize=(10,10))  
sns.heatmap(percentClickTags_counts, cmap='viridis')
plt.show()

filtered_ClickTag_counts
Out[17]:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 ... 49961 49962 49963 49964 49965 49966 49967 49968 49969 49970 49971 49972 49973 49974 49975 49976 49977 49978 49979 49980 49981 49982 49983 49984 49985 49986 49987 49988 49989 49990 49991 49992 49993 49994 49995 49996 49997 49998 49999 50000
BC_21 4838 4466 4020 4029 3849 3388 3145 3134 3080 2732 2187 2348 1854 2130 2027 1845 1950 2160 1688 1762 1713 1578 1426 1373 1460 1484 1466 1264 1365 1427 1209 1335 1128 1257 1204 1084 1140 909 1164 1273 ... 3 7 7 7 7 3 4 3 3 2 5 7 1 7 2 5 8 7 6 5 3 5 3 3 8 3 4 6 5 7 5 4 8 3 0 4 6 4 4 4
BC_22 3502 3433 3298 3142 3095 2532 2738 2227 2242 2120 2052 1842 1771 1534 1609 1609 1436 1131 1468 1351 1220 1260 1368 1249 1142 1145 1131 960 1099 1029 1075 860 1036 950 976 992 972 531 932 827 ... 1 7 4 4 4 4 1 4 2 5 1 2 2 1 2 3 4 5 0 3 4 6 1 2 5 6 4 3 3 2 1 7 1 3 2 2 4 4 2 2
BC_23 9 10 10 10 9 12 13 3 7 13 4 8 4 5 6 7 5 8 7 2 8 6 2 6 2 1 3 2 3 3 1 5 3 1 5 0 7 171 1 3 ... 1 4 0 3 3 1 3 2 2 2 1 1 2 2 4 2 0 0 3 4 3 0 0 0 1 0 3 3 5 2 0 2 0 0 1 2 3 1 3 4
BC_24 25 25 22 18 17 22 19 26 16 23 19 10 8 17 15 11 13 15 6 14 10 6 12 14 21 11 7 10 8 14 5 2 5 6 14 13 13 472 9 9 ... 2 8 2 4 5 6 3 6 4 0 2 3 5 5 1 8 5 4 4 6 7 6 3 5 5 3 3 7 4 2 4 6 4 6 4 3 6 2 3 7
BC_25 31 29 22 27 30 35 30 24 13 25 20 12 23 19 13 15 13 19 20 13 17 10 18 13 22 11 9 11 11 16 18 17 15 13 10 14 10 24 7 11 ... 4 3 5 5 4 6 4 3 5 5 1 4 6 8 2 4 4 7 5 4 5 8 2 4 5 1 6 3 7 5 5 5 3 3 8 3 8 6 5 7
BC_26 30 23 23 19 29 20 21 20 20 18 9 18 16 8 10 10 11 9 10 17 14 11 15 14 12 13 17 9 13 10 9 8 10 9 8 14 10 16 13 10 ... 2 5 8 0 4 4 2 5 2 6 1 4 4 3 0 6 5 4 3 2 1 6 0 4 2 9 8 7 5 4 2 4 3 3 3 5 4 6 3 3
BC_27 15 19 10 17 11 7 10 8 17 13 10 14 14 6 8 12 5 7 12 12 6 7 2 11 11 10 12 9 11 3 9 1 10 6 6 7 1 10 4 4 ... 0 1 9 4 3 1 2 0 2 1 2 3 2 0 4 2 4 2 7 2 4 2 3 3 1 0 1 1 4 0 6 4 3 2 0 4 4 5 7 4
BC_28 19 12 15 14 17 9 12 10 13 5 10 9 12 8 7 5 5 6 6 10 6 4 4 1 5 8 8 3 6 4 10 5 3 2 2 12 7 6 6 9 ... 1 0 1 5 2 6 1 0 1 2 0 1 2 5 1 1 3 2 5 3 0 1 2 2 6 1 1 1 1 2 0 1 1 0 1 1 0 3 2 2
BC_29 9 21 22 19 13 14 14 12 20 22 12 11 10 10 12 11 10 12 15 11 13 8 12 8 10 7 11 9 9 10 8 13 9 7 6 16 10 6 11 6 ... 1 7 3 4 8 3 4 15 1 4 2 4 7 8 3 6 3 4 3 1 3 4 0 7 6 6 4 3 4 4 3 1 9 3 9 2 4 4 4 3
BC_30 13 5 17 9 13 12 11 7 5 10 11 9 6 3 5 4 8 4 7 6 8 5 9 7 6 6 7 7 5 4 6 8 5 5 4 12 4 2 9 1 ... 5 2 1 2 1 2 2 13 4 3 2 0 3 3 1 3 2 2 2 3 3 5 2 4 1 2 2 2 2 0 4 3 6 2 7 7 5 6 0 3
BC_31 9 15 9 13 14 14 19 10 18 16 14 15 22 9 13 10 9 7 12 11 11 8 7 9 16 6 9 11 11 3 8 8 10 4 4 8 8 8 11 6 ... 5 1 2 4 3 4 3 1 2 4 2 0 3 3 4 2 1 6 2 1 2 1 2 5 2 5 2 4 6 3 3 3 3 1 3 4 1 2 2 6
BC_32 23 20 10 18 14 13 9 8 13 15 5 10 20 13 12 9 14 9 12 11 7 9 12 15 8 6 5 5 7 9 13 8 12 9 7 8 7 8 5 5 ... 0 4 4 5 4 4 4 0 6 5 1 3 3 2 3 4 3 6 5 5 7 6 5 3 5 7 2 2 3 2 4 6 6 4 6 6 5 3 4 6
BC_33 15 11 13 9 13 10 17 10 12 8 11 9 7 9 9 5 7 5 8 5 7 6 6 6 4 6 3 122 7 6 7 9 5 5 5 9 10 7 3 6 ... 0 2 1 2 0 1 3 0 1 2 1 2 6 3 3 2 5 0 1 1 3 0 2 4 2 1 2 1 3 3 3 0 3 3 2 0 3 2 4 1
BC_34 19 24 29 22 23 23 19 19 20 24 14 16 21 14 7 13 23 15 16 10 14 9 14 14 7 8 9 183 7 15 13 14 14 6 10 17 8 7 8 11 ... 1 7 1 8 4 7 10 2 3 10 4 6 6 6 6 9 5 7 8 6 8 5 4 6 5 4 4 7 6 8 2 7 4 7 5 4 7 5 3 3
BC_35 18 20 16 20 9 19 23 12 24 13 14 19 25 7 12 12 9 8 11 9 28 6 9 11 7 7 5 10 9 14 10 4 9 3 7 10 5 9 10 7 ... 2 5 6 2 2 6 2 0 4 4 0 6 6 5 2 4 2 2 3 3 3 3 3 6 7 1 2 6 2 4 4 4 1 5 1 4 0 2 8 3
BC_36 21 28 22 20 23 11 22 20 19 26 18 16 25 10 15 17 15 14 15 9 45 10 14 10 6 8 10 12 15 9 15 12 15 7 11 18 8 9 12 12 ... 2 5 5 6 9 5 5 2 3 6 3 5 7 5 7 2 4 6 4 6 7 5 10 4 5 9 5 4 5 7 8 3 5 6 9 7 7 6 6 5
BC_37 20 14 18 15 17 15 16 10 10 10 16 9 16 10 8 8 6 9 7 10 10 8 6 9 13 9 8 5 5 9 10 14 4 2 4 8 4 5 2 7 ... 2 3 5 4 3 2 5 0 4 5 0 6 7 2 5 6 3 4 2 8 3 4 4 6 5 5 8 4 3 8 2 7 2 7 5 6 4 3 6 5
BC_38 17 24 36 26 16 32 26 16 23 20 15 16 17 12 21 20 16 15 14 15 13 12 13 12 16 13 9 26 13 8 13 12 8 11 8 26 8 14 9 7 ... 1 5 7 7 6 7 7 2 1 8 1 4 5 5 7 2 5 4 3 7 8 6 4 6 5 4 7 5 3 8 4 7 7 11 5 8 3 6 5 4
BC_39 12 13 8 7 9 6 6 6 7 4 8 4 6 5 4 6 6 2 2 5 7 3 4 5 2 4 4 2 2 9 3 7 2 2 2 10 2 8 4 2 ... 19 0 4 1 4 5 9 9 11 1 20 12 1 3 6 5 3 3 6 4 1 2 7 3 0 4 7 1 4 4 6 2 3 2 2 1 0 5 1 3
BC_40 14 12 10 7 17 11 9 8 7 14 11 6 9 13 7 9 8 5 8 8 15 10 8 8 4 1 2 5 5 5 8 5 7 4 8 7 1 4 5 5 ... 35 11 12 10 11 10 12 19 25 11 37 13 8 10 23 10 17 10 13 11 10 10 28 8 9 13 9 14 9 9 18 8 12 12 10 10 9 8 11 8

20 rows × 50000 columns

In [18]:
adata = sc.AnnData(ClickTag_counts_filtered)
adata.var_names= list(ClickTag_counts_filtered.columns)
adata.obs_names= list(ClickTag_counts_filtered[:50000].index)
In [19]:
adata
Out[19]:
AnnData object with n_obs × n_vars = 50000 × 20

Cluster ClickTag counts to determine high quality cells (cells strongly marked by the assigned barcodes)

In [20]:
sc.pp.filter_genes(adata, min_counts=0)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000)
adata.obs['n_countslog']=np.log(adata.obs['n_counts'])
sc.pp.log1p(adata)
sc.pp.regress_out(adata, ['n_counts'])
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [21]:
adata
sc.tl.tsne(adata, perplexity=30, use_rep=None)




sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.louvain(adata, resolution=2)
sc.pl.tsne(adata, color='n_countslog')

sc.set_figure_params(dpi=120)
sc.pl.tsne(adata,color=['louvain'])



# sc.pl.tsne(adata, color=adata.var_names)

# for i in range(20):
#     sc.pl.violin(adata[adata.obs['louvain'].isin([str(i)])], keys=adata.var_names)

# sc.pl.tsne(adata, color=['louvain'],palette='rainbow',legend_loc='on data')
WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.
/usr/local/lib/python3.7/dist-packages/numba/np/ufunc/parallel.py:363: NumbaWarning: The TBB threading layer requires TBB version 2019.5 or later i.e., TBB_INTERFACE_VERSION >= 11005. Found TBB_INTERFACE_VERSION = 9107. The TBB threading layer is disabled.
  warnings.warn(problem)
In [ ]:
sc.pl.umap(adata, color='louvain')
sc.pl.umap(adata, color='n_countslog')
sc.pl.umap(adata, color=adata.var_names)
In [ ]:
sc.tl.louvain(adata, resolution=0.15)#0.15
sc.pl.umap(adata, color='louvain')

Select ClickTag Clusters with Clean Expression

In [ ]:
adata_sub = adata[adata.obs['louvain'].isin(['1','8','9','10','11','12','13','14','15','16'])]
adata_sub 
sc.tl.louvain(adata_sub, random_state=42,resolution=0.05, key_added='louvain_new')
sc.tl.umap(adata_sub,random_state = 42,spread=0.5 )
In [ ]:
sc.pl.umap(adata_sub, color=['louvain_new','n_countslog'])

Compare ClickTag Counts/Cell Filtering to Previous Cell Ranger-based Analysis (Found in this notebook)

In [ ]:
adata
Out[ ]:
AnnData object with n_obs × n_vars = 50000 × 20
    obs: 'n_counts', 'n_countslog', 'louvain'
    var: 'n_counts'
    uns: 'log1p', 'neighbors', 'umap', 'louvain', 'louvain_colors'
    obsm: 'X_tsne', 'X_umap'
    obsp: 'distances', 'connectivities'
In [28]:
adata.obs
Out[28]:
n_counts n_countslog
CCATGTCAGTGTTTGC-2 13584.0 9.516648
AGGCCGTCAATCGAAA-2 12926.0 9.466996
CGCTGGACAAGACGTG-1 11720.0 9.369052
CTCGAGGTCACAATGC-1 10908.0 9.297252
CACACCTGTCTGATCA-1 9881.0 9.198369
... ... ...
TGACTAGTCAAACGGG-1 83.0 4.418840
CGGACGTAGTAGCGGT-1 83.0 4.418840
AACTCTTGTTGCCTCT-1 83.0 4.418840
ACGAGGAGTAAGTTCC-1 83.0 4.418840
GGGAATGAGCGTTGCC-2 83.0 4.418840

50000 rows × 2 columns

In [29]:
#Read in clustered, filtered adata from Cell Ranger analysis of Starvation data
!mv D1.1798 D1.1798.h5ad
old=sc.read('D1.1798.h5ad')
In [30]:
old.obs
Out[30]:
n_counts n_countslog louvain orgID fed starved fed_ord starved_ord new_fed fed_neighbor_score
index
AAACCTGAGAGGGCTT-1 2188.0 3.340047 10 3 True False True False True 14
AAACCTGAGAGGTTAT-1 1565.0 3.194514 4 9 False True False True False 0
AAACCTGAGCGAGAAA-1 342.0 2.534026 3 3 True False True False True 3
AAACCTGAGCGTGTCC-1 4694.0 3.671543 15 10 False True False True False 5
AAACCTGAGGATTCGG-1 4876.0 3.688064 3 8 False True False True False 2
... ... ... ... ... ... ... ... ... ... ...
TTTGTCAGTGGACGAT-2 2625.0 3.419129 24 4 True False True False True 8
TTTGTCATCATCTGTT-2 664.0 2.822168 2 7 False True False True False 8
TTTGTCATCGCCTGAG-2 433.0 2.636488 6 7 False True False True False 4
TTTGTCATCTAACGGT-2 31328.0 4.495933 22 3 True False True False True 6
TTTGTCATCTGAGTGT-2 734.0 2.865696 0 9 False True False True False 2

13673 rows × 10 columns

Look at ClickTag Counts per condition and cell type (counts per cell in each group)

In [31]:
cells = list(set(old.obs_names).intersection(adata.obs_names))
test = adata[cells,:]
test.obs
Out[31]:
n_counts n_countslog
GTGTGCGAGAATTGTG-1 234.0 5.455321
TTGCCGTGTACAGCAG-1 148.0 4.997212
CTGCGGAAGGTGATAT-1 247.0 5.509388
GACTGCGTCTACCTGC-2 85.0 4.442651
CACACAAGTTGTGGAG-1 389.0 5.963579
... ... ...
CGTAGGCCAGTATAAG-1 224.0 5.411646
CTCTACGGTGCCTTGG-1 262.0 5.568345
TCATTACCAAGCGTAG-2 127.0 4.844187
TCTTTCCTCAATCACG-1 258.0 5.552959
GCAAACTCAGGTCCAC-1 446.0 6.100319

13657 rows × 2 columns

In [32]:
test.obs['fed'] = pd.Categorical(old[cells,:].obs['fed'])
test.obs['cellRanger_louvain'] = pd.Categorical(old[cells,:].obs['louvain'])
test.obs
Trying to set attribute `.obs` of view, copying.
Out[32]:
n_counts n_countslog fed cellRanger_louvain
GTGTGCGAGAATTGTG-1 234.0 5.455321 False 2
TTGCCGTGTACAGCAG-1 148.0 4.997212 True 0
CTGCGGAAGGTGATAT-1 247.0 5.509388 False 28
GACTGCGTCTACCTGC-2 85.0 4.442651 True 2
CACACAAGTTGTGGAG-1 389.0 5.963579 False 7
... ... ... ... ...
CGTAGGCCAGTATAAG-1 224.0 5.411646 False 4
CTCTACGGTGCCTTGG-1 262.0 5.568345 True 17
TCATTACCAAGCGTAG-2 127.0 4.844187 True 8
TCTTTCCTCAATCACG-1 258.0 5.552959 True 7
GCAAACTCAGGTCCAC-1 446.0 6.100319 False 29

13657 rows × 4 columns

In [33]:
plt.figure(figsize=(8,4))
plt.grid(False)

ax = sns.violinplot(x="fed", y="n_countslog", hue="fed",data=test.obs,alpha=0.6)

fsize = 11
plt.yticks(fontsize=fsize  )
plt.xticks(fontsize=fsize  )
plt.ylabel('ClickTag Counts (log)',fontsize=fsize )
plt.xlabel('Control (Fed)',fontsize=fsize )


plt.show()
In [34]:
oocytes = test[test.obs['cellRanger_louvain'].isin(['2','13','35'])]
oocytes

plt.figure(figsize=(8,4))
plt.grid(False)

ax = sns.violinplot(x="fed", y="n_countslog", hue="fed",data=oocytes.obs,alpha=0.6)

fsize = 11
plt.yticks(fontsize=fsize  )
plt.xticks(fontsize=fsize  )
plt.ylabel('ClickTag Counts (log)',fontsize=fsize )
plt.xlabel('Control (Fed)',fontsize=fsize )


plt.show()
In [36]:
from scipy import stats
res = stats.ttest_ind(oocytes.obs["n_countslog"][oocytes.obs ['fed'] == True],oocytes.obs["n_countslog"][oocytes.obs['fed'] == False], equal_var = False)
In [37]:
#Alternative: Control/Fed > Starved
print('p-value: ',res[1]/2)
p-value:  0.02505519761521832
In [ ]:
plt.figure(figsize=(8,4))
plt.grid(False)

ax = sns.violinplot(x="cellRanger_louvain", y="n_countslog", hue="cellRanger_louvain",data=test.obs,alpha=0.6,legend=False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.,ncol=3)

fsize = 11
plt.yticks(fontsize=fsize  )
plt.xticks(fontsize=fsize  )
plt.ylabel('ClickTag Counts (log)',fontsize=fsize )
plt.xlabel('Cluster',fontsize=fsize )


plt.show()
In [ ]:
counts = test.obs
counts = counts.groupby("cellRanger_louvain").mean()
counts.head()
Out[ ]:
n_counts n_countslog
cellRanger_louvain
0 501.967834 6.108386
1 250.082336 5.377275
2 485.516296 5.632371
3 397.085999 5.727575
4 408.277374 5.766906
In [ ]:
counts['Cluster'] = list(counts.index)
In [ ]:
#Check to see if cluster size (number of cells) related to average number of clickTags
plt.figure(figsize=(12,4))
plt.grid(False)

ax = sns.scatterplot(x="Cluster", y="n_countslog",data=counts,alpha=0.6,legend=False)
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.,ncol=3)

fsize = 11
plt.yticks(fontsize=fsize  )
plt.xticks(fontsize=fsize  )
plt.ylabel('Average ClickTag Count (log)',fontsize=fsize )
plt.xlabel('Cluster',fontsize=fsize )


plt.show()

Compare counts between cellRanger and Kallisto analysis

In [ ]:
adata['AAACCTGAGAGGGCTT-1'].obs
Out[ ]:
n_counts n_countslog louvain
AAACCTGAGAGGGCTT-1 258.0 5.552959 13
In [ ]:
adata['AAACCTGAGCGAGAAA-1'].obs
Out[ ]:
n_counts n_countslog louvain
AAACCTGAGCGAGAAA-1 279.0 5.631212 13
In [ ]:
adata['TTTGTCATCTAACGGT-2'].obs
Out[ ]:
n_counts n_countslog louvain
TTTGTCATCTAACGGT-2 1815.0 7.503841 13
In [ ]:
counter=0
for x in old[old.obs['orgID']=='2'].obs_names:
  print(adata[x].obs['louvain'][0])
  counter+=1
  if counter==100:break
11
11
11
11
11
1
0
4
2
11
11
11
11
11
1
2
1
11
1
4
2
11
11
11
11
11
1
1
11
1
2
11
1
11
2
11
11
1
11
11
11
1
4
1
11
4
11
2
1
11
11
11
1
4
11
1
2
11
1
11
11
11
11
4
4
11
11
1
2
11
4
1
11
11
11
11
2
11
11
11
1
11
4
11
11
2
1
1
2
11
11
11
11
2
11
0
1
11
11
1
In [ ]:
old[old.obs['orgID']==3]
Out[ ]:
View of AnnData object with n_obs × n_vars = 0 × 2657
    obs: 'n_counts', 'n_countslog', 'louvain', 'orgID', 'fed', 'starved', 'fed_ord', 'starved_ord', 'new_fed', 'fed_neighbor_score'
    var: 'n_counts', 'n_cells'
    uns: 'fed_ord_colors', 'louvain', 'louvain_colors', 'louvain_sizes', 'neighbors', 'new_fed_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'starved_ord_colors'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [ ]:
adata[x].obs['louvain'][0]
Out[ ]:
'13'
In [ ]:
counter=0
for x in old.obs_names:
  if x not in adata.obs_names:
    print(x)
AACTGGTAGGGAAACA-2
AAGGTTCCACGAAACG-2
ACCTTTATCCGGCACA-2
ACGCCGACAGCTCGCA-2
ATAGACCGTTAAGGGC-2
CCCTCCTCATCGTCGG-2
CCTCAGTCAGGAATGC-2
CGATTGACATACGCTA-2
CGGAGTCGTTTGGCGC-2
CTGGTCTGTCAGAGGT-2
GACAGAGCAAGCCGCT-2
GATCGTATCCATGAAC-2
TACTCATCAAGGTTTC-2
TGGTTAGCATGTCGAT-2
TTAGTTCTCTCCAACC-2
TTTGTCAAGGGCATGT-2
In [ ]:
#ClickTag lane 1 counts estimated from Cell Ranger analysis
jelly1tags=pd.read_csv('D1.1799')
jelly1tags
Out[ ]:
Unnamed: 0 ACACCCTGTCATCGGC CGATTGAGTACTCTCC GACACGCTCCGAACGC AGATCTGCAGGACCCT CGATCGGGTGTCGCTG CACACCTCATTGTGCA CTCCTAGCATCTCGCT CCACTACCATATACGC ACTATCTCAAGCCGTC CCACCTACAAATACAG ATAAGAGCAATTCCTT CAGAGAGGTACATGTC AAAGCAAAGAAAGTGG CAGCTAATCCGTTGCT AGTGGGAAGATGTGGC CAGCCGAAGCGATAGC ATCATCTTCGGAGGTA GATCGATAGGGTCGAT CGTGAGCCACAAGCCC TTGGCAACAGTAAGCG ACATCAGAGACTGGGT AGTCTTTTCTACTTAC CTGGTCTCAAGGTTCT GTTACAGCAGCGTAAG AGCTCTCCATCACAAC TGCTACCGTTAAGGGC GCGCAACTCATTCACT TCCCGATCAAGTTCTG GGGTCTGAGCCTCGTG GCATACAGTTAGGGTG TCAGGATGTGGTAACG CGAATGTGTTGCGCAC GGGTCTGCACAGACAG TCACAAGCAGCCAATT GCGAGAAAGCAGCGTA CTCACACCATTACCTT CAGTAACCAATCCGAT ATCGAGTCATTGGTAC GTCCTCACATCACAAC ... GCGCAACGTTATCGGT AGTGAGGAGCCAGGAT CGAACATAGGGTATCG TAGCCGGTCGATGAGG GACTACAAGCACGCCT CATGCCTAGAAACGAG CGAGCCAAGTGTACGG ACATGGTAGCTAAACA GTAACGTCAAAGTGCG ATCCGAACAACACCTA TAAACCGAGGTGTTAA GAATGAAAGGCCATAG AGCGTATCATGCGCAC CCATGTCAGCTGAACG TGCGTGGCATAAAGGT ACTGTCCGTAGAGTGC GTACGTAAGTTGTAGA ACAGCCGCATTTCAGG ATTCTACTCCATGCTC GGACAAGTCGTTTGCC TGACAACAGGACATTA CAAGAAAAGTACGTAA TGACTTTCATGCAACT CTTACCGGTGCAGACA TGACTTTGTGCCTGCA TGCGCAGCAACCGCCA GGACATTAGGGCTCTC ACGATGTGTTAGAACA TAAACCGTCAACGCTA TGTGGTATCAACACGT GTTACAGCACCAGGCT GTATCTTAGCTGCAAG GTACGTAGTATATCCG CGTAGCGTCCTATGTT TGTCCCAGTCTACCTC ATGCGATAGATCCCAT AGTGGGATCTGATACG TCGGGACGTTTACTCT TTTGGTTCATGTCTCC CTTTGCGGTAGTACCT
0 BC_21 0 5 0 1 0 3 7 0 0 4 0 8 0 0 0 4 4 0 5 0 0 0 5 0 0 4 0 0 8 0 0 0 0 1 0 0 0 0 1 ... 0 9 0 0 0 0 0 0 0 7 0 0 0 0 0 0 3 3 0 0 0 0 0 2 5 0 0 4 5 0 0 3 0 0 3 9 0 0 0 0
1 BC_22 0 1 0 4 0 1 2 0 0 2 0 7 0 0 0 7 6 0 7 0 0 0 3 0 0 6 0 0 4 0 0 0 0 2 0 1 0 0 0 ... 0 3 0 0 0 0 0 0 0 1 0 0 0 0 0 1 2 1 0 0 0 0 1 4 2 0 0 6 9 0 0 1 0 0 7 3 0 0 0 0
2 BC_23 0 1 0 7 0 1 5 0 0 0 0 2 0 0 0 4 2 0 2 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 ... 0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 4 4 1 0 0 0 0 2 5 0 0 3 1 0 0 5 0 0 0 2 0 0 0 0
3 BC_24 0 6 0 1 0 5 8 0 0 2 0 3 0 0 0 4 4 0 1 0 0 0 1 0 0 7 0 0 3 0 0 0 0 1 0 0 0 0 0 ... 0 1 0 1 1 0 0 0 2 7 0 0 0 0 0 0 5 1 0 0 0 0 0 3 11 0 0 4 5 0 0 5 0 0 7 4 1 0 0 0
4 BC_25 0 13 0 6 0 4 2 0 0 9 0 16 0 0 0 8 3 1 6 0 0 0 4 0 0 10 0 0 2 0 0 0 0 2 0 0 0 0 0 ... 0 7 0 0 0 0 0 0 0 3 0 0 0 0 0 0 4 5 0 0 0 0 0 6 5 0 0 8 6 0 0 4 0 0 6 7 0 0 0 0
5 BC_26 1 2 0 5 0 5 3 0 0 6 0 7 0 0 0 5 1 0 4 1 0 0 1 0 0 9 0 0 3 0 0 0 0 5 0 0 0 0 0 ... 0 5 0 0 0 0 0 0 0 1 0 0 0 0 0 0 5 9 0 0 0 0 0 8 8 0 0 4 3 0 0 8 0 1 2 4 0 1 1 0
6 BC_27 0 4 0 2 0 1 3 0 0 1 0 2 0 0 0 2 1 0 3 0 0 0 0 0 0 5 0 0 0 0 0 0 0 1 0 0 0 0 0 ... 0 1 1 0 0 0 1 0 0 5 0 0 0 0 0 0 1 4 0 1 0 0 0 2 2 1 0 4 5 0 0 1 0 0 3 3 0 0 0 0
7 BC_28 0 2 0 2 0 0 5 0 0 3 0 4 0 0 0 2 0 0 6 0 0 0 2 0 0 2 0 0 4 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 3 1 0 0 0 0 0 2 3 0 0 2 1 0 0 3 0 0 1 1 0 0 0 0
8 BC_29 0 7 1 1 0 1 9 1 0 2 0 7 0 0 0 6 3 0 5 0 0 0 3 1 0 8 0 0 2 0 0 0 0 2 0 0 0 0 0 ... 0 3 0 0 0 0 0 0 0 1 0 0 0 0 0 0 6 4 0 0 0 0 0 4 8 0 0 1 5 0 1 4 0 0 6 8 0 0 0 0
9 BC_30 0 4 0 0 0 3 3 0 0 5 0 3 1 0 0 4 1 0 3 0 0 0 0 0 0 1 0 0 2 1 0 0 1 2 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 3 0 0 0 0 0 0 2 1 0 0 0 0 0 1 2 0 0 3 2 0 0 4 0 0 0 5 0 0 0 0
10 BC_31 0 7 0 4 0 2 2 0 0 5 0 5 0 0 0 4 2 0 8 0 0 0 0 0 0 2 0 0 1 0 0 0 0 82 0 0 0 0 0 ... 0 5 0 0 0 0 0 0 0 4 0 0 0 0 1 0 2 3 0 0 0 1 0 3 5 0 0 4 2 0 0 2 0 0 1 5 0 0 0 0
11 BC_32 1 2 0 5 0 2 2 0 1 5 0 5 0 1 0 3 11 0 5 0 0 0 0 0 0 5 0 0 3 0 1 0 0 109 0 0 1 0 0 ... 0 3 0 0 0 0 0 0 0 7 0 0 0 0 0 0 4 5 0 0 0 0 0 4 6 0 0 4 4 0 0 3 0 0 0 10 0 0 0 0
12 BC_33 0 1 0 0 0 3 2 0 0 2 0 1 0 0 0 2 3 0 1 0 0 0 2 0 0 4 0 0 6 0 0 0 0 4 1 0 0 0 0 ... 0 2 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 4 0 0 0 0 0 1 1 0 0 4 3 0 0 4 0 0 4 4 0 0 0 0
13 BC_34 0 11 0 6 0 8 3 0 0 7 0 4 0 0 1 5 5 0 7 0 1 0 4 0 0 5 1 0 2 0 0 0 0 4 0 0 0 0 0 ... 0 3 0 0 1 0 0 0 0 3 1 0 0 0 0 0 5 4 0 0 0 0 0 7 4 0 0 3 5 0 0 2 1 0 6 7 0 0 1 0
14 BC_35 0 6 0 2 0 2 2 0 0 2 1 1 0 0 0 5 1 0 5 0 0 0 1 0 0 1 0 0 1 0 0 0 0 2 0 0 0 1 0 ... 0 8 0 0 0 0 0 0 0 6 0 0 0 0 0 0 3 2 0 0 0 0 0 3 3 0 1 2 5 0 0 1 0 0 3 6 0 0 0 0
15 BC_36 0 4 0 6 0 7 5 0 1 8 0 8 0 0 0 6 8 0 13 0 0 0 3 0 0 3 0 1 5 0 0 0 0 8 0 0 0 0 0 ... 0 4 0 0 0 1 0 0 0 8 0 1 0 0 0 0 3 7 0 0 1 0 0 4 5 0 0 8 7 0 0 6 0 0 0 8 0 0 0 1
16 BC_37 0 6 0 4 0 1 5 0 0 5 0 3 0 0 0 3 5 0 6 0 0 0 13 0 0 1 0 0 4 0 0 0 0 3 0 0 0 0 0 ... 0 7 0 0 0 0 0 1 0 4 0 0 1 0 0 0 2 2 0 0 0 0 0 8 1 0 0 6 6 0 0 15 0 0 7 1 0 0 0 0
17 BC_38 0 10 0 6 0 4 6 0 0 8 0 4 0 0 0 8 10 0 12 0 0 1 12 0 0 7 0 0 3 0 0 1 0 4 0 0 0 0 0 ... 0 7 0 0 0 0 0 0 0 2 0 0 1 0 0 0 7 5 0 0 0 0 0 3 5 0 0 6 6 0 0 18 0 0 8 2 0 0 0 0
18 BC_39 0 0 0 1 0 2 0 0 0 0 0 1 0 0 0 2 0 0 3 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 3 2 1 0 1 1 0 0 2 0 0 4 1 0 0 0 0
19 BC_40 0 2 0 0 0 3 7 0 0 4 0 2 0 0 0 2 2 0 4 0 0 0 1 0 1 1 0 0 2 0 0 0 0 4 0 0 0 0 0 ... 0 4 0 0 0 0 0 0 0 2 0 0 0 0 0 0 4 0 0 0 0 0 0 4 1 0 0 2 1 1 0 4 0 0 2 1 0 0 0 0

20 rows × 234269 columns

In [ ]:
#ClickTag lane 2 counts estimated from Cell Ranger analysis
jelly2tags=pd.read_csv('D1.1800')
jelly2tags
Out[ ]:
Unnamed: 0 GATTCAGAGGCATGGT GCAGCCAAGTGGGCTA CTAATGGGTGCTGTAT CAAGGCCAGAAGGGTA CGCTTCAGTATCAGTC AACACGTGTTGCGTTA CTAGAGTCACCAGGCT CCATGTCCACAAGCCC GGCTGGTGTCTCTTAT TCACAAGCACCCATTC TTCTACAAGCCTCGTG GCGAGAACAAGGTTCT TGGGCGTAGAGTGAGA AGTTGGTGTGTGGCTC GCTCCTAGTGATAAGT GGATTACCAGTATCTG GGTGCGTCAATGGAAT ATCCACCCACCAGTTA TCGAGGCAGCCACTAT AAACCTGTCCTGTACC CAGCTAACACGGTAGA GTTACAGCACAACGTT CGTGTAACAGTGGAGT CCTATTATCACCATAG CGAGAAGGTCGAAAGC CAAGGCCGTGTGGTTT CGCTGGACAATCCGAT GAAATGATCATTATCC CAGTCCTTCCTTGGTC TTGCCGTTCGCATGGC CTCGAGGAGTCATCCA GGCTCGAGTGCCTGTG CAGCCGACAGTGGAGT CACACTCTCTAACCGA GTGGGTCCACCGCTAG TCAACGAAGGTCATCT ACGAGGAGTATCTGCA CTCACACAGCCTATGT CGTCTACGTAGAGTGC ... AAAGCAAAGCGGATCA CGAGCACAGGGTATCG TGCGCAGCACAACGTT TTTCCTCCAGATGGCA CTCCTAGCAGCATACT AGCTCCTGTTACGACT TCAGGATTCCCTCTTT ATTACTCGTTTGCATG ATCATGGTCTTGGGTA GATCAGTAGAACAATC CAGCTAAGTTTGTGTG CGTGTCTTCCCTGACT TCTCTAAGTGGTCCGT ACGTCAAGTCTTCTCG CCTAGCTGTTTGTTGG CTGAAACAGCGATTCT CGCCAAGTCCACGTTC GACTACAGTAGGGTAC GTAGTCAGTAGAAAGG AGGGTGACAGATGGGT CCTATTAGTCGCTTCT CTTCTCTAGCGCCTCA CCGTACTCACAGACTT AAACGGGCACTTCGAA AACCGCGGTCAGTGGA CTGGTCTTCCATGAAC CGAGCACAGCCACGTC CTCTAATGTAAACCTC ACAGCTAAGGCAAAGA CTCTACGGTGGTGTAG TCACGAAGTGAGGGAG CTGATCCGTCTGCCAG CTACACCTCCGCGCAA AAAGCAAGTCCGAGTC ATTGGTGAGAGACGAA ATGGGAGAGGCTCATT CGATCGGCACACCGAC AGCGTATTCTGTCTCG CGCGTTTCAGCATACT ACGCCGACAAGAGTCG
0 BC_21 0 0 0 5 0 0 4 6 0 0 0 0 0 4 11 0 0 4 1 4 0 0 0 0 0 11 9 6 3 5 7 0 0 0 0 0 8 0 0 ... 4 0 7 0 0 0 4 1 1 0 0 0 0 0 4 0 0 0 1 1 2 0 8 0 0 8 0 7 0 0 0 0 0 0 0 0 0 1 0 0
1 BC_22 0 1 0 5 0 0 1 4 0 0 0 0 0 2 9 0 0 5 4 1 0 0 0 1 0 2 3 6 5 2 2 0 0 0 0 0 4 0 0 ... 3 0 4 0 0 0 3 0 0 1 0 0 0 0 3 0 0 0 0 3 0 0 2 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
2 BC_23 0 0 0 4 0 0 1 0 0 0 0 0 0 2 1 0 0 2 1 2 0 0 0 0 0 6 2 1 4 0 0 0 0 0 0 0 3 0 0 ... 2 0 3 0 0 1 1 0 0 0 0 0 0 0 3 0 0 1 0 4 1 1 1 1 1 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0
3 BC_24 0 0 0 0 0 0 4 4 0 1 0 0 1 4 2 0 0 10 3 6 0 0 0 0 0 6 4 4 2 7 6 0 0 0 0 0 4 0 0 ... 1 0 2 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 6 0 0 4 0 3 0 0 0 0 0 0 0 0 0 0 0 0
4 BC_25 0 0 0 6 0 0 9 3 0 0 0 0 0 3 5 0 0 5 4 7 1 0 0 0 0 8 7 8 1 3 3 0 0 0 1 0 3 0 1 ... 4 0 2 1 0 0 8 0 0 0 0 0 0 0 6 0 0 0 0 2 6 0 12 0 0 6 0 8 0 0 0 0 0 1 0 0 0 0 1 0
5 BC_26 0 0 0 5 0 0 2 1 0 0 0 0 0 3 7 0 0 3 6 5 0 0 0 0 0 6 3 8 4 9 1 0 0 0 0 0 2 0 0 ... 3 0 2 0 1 0 4 0 0 0 0 0 0 1 4 0 0 0 0 2 2 0 1 0 0 3 0 2 0 0 1 0 0 0 0 0 0 0 0 0
6 BC_27 0 0 0 5 0 0 2 2 0 0 1 0 0 0 1 0 0 1 5 5 0 0 0 0 0 5 4 7 3 3 4 0 0 0 0 0 3 0 0 ... 4 0 3 0 0 0 5 0 0 0 0 0 0 0 3 0 0 0 1 4 1 0 2 0 0 2 1 4 0 0 0 0 0 0 0 0 0 0 0 0
7 BC_28 0 0 0 2 0 0 0 0 0 0 0 0 0 3 2 0 0 2 0 0 0 0 0 0 0 2 2 4 2 3 2 0 0 0 0 0 2 0 0 ... 2 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0
8 BC_29 0 0 0 4 0 0 6 4 1 0 0 1 0 2 7 0 0 4 2 4 0 0 0 0 0 5 5 8 6 6 2 0 0 0 0 0 1 0 0 ... 8 0 3 0 0 0 2 0 0 0 0 0 0 0 4 0 0 0 0 5 4 0 7 0 0 3 0 3 0 0 0 1 1 0 0 0 0 0 0 0
9 BC_30 0 0 0 3 1 0 2 3 0 0 0 0 0 1 2 0 0 2 2 4 0 0 0 0 0 5 1 2 3 3 5 0 0 1 0 1 4 0 0 ... 4 0 1 0 0 0 2 0 0 0 0 0 0 0 2 1 0 0 0 1 4 0 4 0 0 3 0 5 0 0 0 0 0 0 0 1 0 0 0 0
10 BC_31 0 1 0 2 0 0 3 3 0 0 0 0 0 3 5 1 0 3 2 3 0 0 0 0 0 9 3 3 3 4 3 0 0 0 0 0 6 0 0 ... 2 0 5 0 0 0 2 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 5 0 0 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0
11 BC_32 0 0 0 2 0 0 7 5 0 0 0 0 0 3 3 0 0 4 1 10 0 0 0 0 0 9 4 9 5 3 8 0 0 0 0 0 4 0 0 ... 4 0 2 0 0 0 5 0 0 0 0 0 1 0 5 0 0 0 0 6 6 0 5 0 0 4 0 7 0 0 0 0 0 0 0 0 0 0 0 0
12 BC_33 1 0 0 1 0 0 2 1 0 0 0 0 0 3 2 0 0 1 2 1 0 1 0 0 0 3 6 3 1 3 2 0 0 0 0 0 1 0 0 ... 4 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 3 0 2 0 0 4 0 6 0 0 0 0 0 0 0 0 0 0 0 0
13 BC_34 0 0 0 6 0 0 10 4 0 0 0 0 0 7 9 0 0 5 8 9 0 0 0 1 0 9 8 4 3 3 11 0 0 0 0 0 6 0 0 ... 7 1 7 0 0 0 1 0 0 0 0 0 0 0 6 0 0 0 0 8 1 0 4 0 0 5 0 6 1 0 0 0 0 0 1 0 0 0 0 0
14 BC_35 0 0 0 5 0 0 3 3 0 0 0 0 0 4 5 0 0 2 2 5 0 0 1 0 0 5 3 4 5 8 1 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 4 0 0 0 0 0 0 0 5 0 0 0 0 2 2 0 1 0 0 7 0 3 0 0 0 0 0 0 0 0 0 0 0 0
15 BC_36 0 0 0 5 0 0 7 7 0 0 0 0 0 7 6 0 0 9 6 10 0 0 0 0 0 22 5 10 7 6 4 0 0 0 0 0 3 0 0 ... 8 0 7 0 0 0 12 0 0 0 0 0 0 0 11 0 0 0 0 4 1 0 7 0 0 6 0 9 0 0 0 0 0 0 0 0 0 0 0 0
16 BC_37 0 0 0 6 0 0 9 3 0 0 0 0 0 8 9 0 0 3 4 7 0 0 0 0 0 6 7 5 4 2 2 0 0 0 0 0 3 0 0 ... 2 0 2 0 0 1 71 0 0 0 0 0 0 0 1 0 0 0 0 2 7 0 5 0 0 7 0 3 0 0 0 0 0 0 0 0 1 0 0 0
17 BC_38 0 0 0 5 0 0 5 8 0 0 0 0 0 13 8 0 1 3 3 5 0 0 0 0 0 12 5 10 6 6 7 0 0 0 0 0 10 1 0 ... 4 0 3 0 0 0 116 0 0 0 1 0 0 0 6 0 1 0 0 6 5 0 11 0 0 10 0 10 0 0 0 0 0 0 0 0 0 0 0 0
18 BC_39 0 0 0 4 0 0 0 21 0 0 0 0 0 0 2 0 0 2 1 8 0 0 0 0 0 3 0 1 0 1 0 0 0 0 0 0 0 0 0 ... 4 0 1 0 0 0 2 0 0 0 0 0 0 0 4 0 0 0 0 0 2 0 2 0 0 2 0 52 0 0 0 0 0 0 0 0 0 0 0 0
19 BC_40 0 0 0 3 0 0 4 32 0 0 0 0 0 6 3 0 0 1 1 8 0 0 0 0 0 5 3 4 2 3 7 0 1 0 0 0 4 0 0 ... 2 0 2 0 0 0 2 0 0 0 0 0 0 0 6 0 0 0 0 2 1 0 2 1 0 1 0 98 0 0 0 0 0 0 0 0 0 0 0 1

20 rows × 217582 columns

In [ ]:
jelly2tags['CCATGTCAGTGTTTGC']
Out[ ]:
0       77
1       39
2       15
3       75
4     7114
5     5928
6       35
7       45
8       45
9       25
10      45
11      51
12      27
13      49
14      23
15      53
16      38
17      55
18      20
19      28
Name: CCATGTCAGTGTTTGC, dtype: int64
In [ ]:
adata['CCATGTCAGTGTTTGC-2'].X
Out[ ]:
ArrayView([[ 7.832645 ,  6.9506545,  6.153693 ,  8.8333645, 13.878383 ,
            13.557319 ,  9.487391 ,  7.2928925,  7.6783485,  6.5176826,
             7.2424545,  8.551719 ,  7.2883577,  9.619349 ,  6.4227967,
             7.5293713,  8.727089 ,  9.155742 ,  5.0387535,  7.7183394]],
          dtype=float32)
In [ ]:
ClickTag_counts_filtered.T['CCATGTCAGTGTTTGC-2']
Out[ ]:
BC_21      73
BC_22      40
BC_23      14
BC_24      74
BC_25    6992
BC_26    5863
BC_27      36
BC_28      45
BC_29      41
BC_30      25
BC_31      43
BC_32      50
BC_33      26
BC_34      48
BC_35      24
BC_36      50
BC_37      38
BC_38      56
BC_39      19
BC_40      27
Name: CCATGTCAGTGTTTGC-2, dtype: int64
In [ ]:
ClickTag_counts_filtered
Out[ ]:
BC_21 BC_22 BC_23 BC_24 BC_25 BC_26 BC_27 BC_28 BC_29 BC_30 BC_31 BC_32 BC_33 BC_34 BC_35 BC_36 BC_37 BC_38 BC_39 BC_40
CCATGTCAGTGTTTGC-2 73 40 14 74 6992 5863 36 45 41 25 43 50 26 48 24 50 38 56 19 27
AGGCCGTCAATCGAAA-2 35 23 18 36 52 25 30 18 33 29 47 26 4530 7795 46 58 25 43 26 31
CGCTGGACAAGACGTG-1 30 28 20 24 36 38 14 42 18 34 16 43 15 30 5000 6260 18 32 8 14
CTCGAGGTCACAATGC-1 35 19 53 135 41 23 20 14 34 35 51 36 4344 5855 29 54 33 47 15 35
CACACCTGTCTGATCA-1 72 37 14 58 5111 3898 31 51 31 23 33 25 137 199 27 32 33 32 18 19
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
TGACTAGTCAAACGGG-1 7 6 1 4 8 1 5 4 8 4 2 5 1 6 4 4 5 4 3 1
CGGACGTAGTAGCGGT-1 7 6 3 5 5 5 2 1 8 4 2 4 4 6 6 5 3 1 3 3
AACTCTTGTTGCCTCT-1 5 5 3 4 4 3 5 1 10 3 3 4 3 5 2 7 4 9 1 2
ACGAGGAGTAAGTTCC-1 10 5 3 5 6 2 5 1 3 3 3 5 1 4 3 7 7 4 4 2
GGGAATGAGCGTTGCC-2 4 5 2 10 7 6 3 2 3 2 2 6 2 5 3 7 4 6 1 3

50000 rows × 20 columns

In [ ]:
ClickTag_counts_filtered.T.sum()
Out[ ]:
CCATGTCAGTGTTTGC-2    13584
AGGCCGTCAATCGAAA-2    12926
CGCTGGACAAGACGTG-1    11720
CTCGAGGTCACAATGC-1    10908
CACACCTGTCTGATCA-1     9881
                      ...  
TGACTAGTCAAACGGG-1       83
CGGACGTAGTAGCGGT-1       83
AACTCTTGTTGCCTCT-1       83
ACGAGGAGTAAGTTCC-1       83
GGGAATGAGCGTTGCC-2       83
Length: 50000, dtype: int64
In [ ]:
old.obs_names[:][:]
Out[ ]:
Index(['AAACCTGAGAGGGCTT-1', 'AAACCTGAGAGGTTAT-1', 'AAACCTGAGCGAGAAA-1',
       'AAACCTGAGCGTGTCC-1', 'AAACCTGAGGATTCGG-1', 'AAACCTGCAAAGTGCG-1',
       'AAACCTGCAAGCTGGA-1', 'AAACCTGCAATCACAC-1', 'AAACCTGCATAACCTG-1',
       'AAACCTGCATGCAACT-1',
       ...
       'TTTGTCACACTTAACG-2', 'TTTGTCAGTACCTACA-2', 'TTTGTCAGTATGGTTC-2',
       'TTTGTCAGTCATTAGC-2', 'TTTGTCAGTCTCCACT-2', 'TTTGTCAGTGGACGAT-2',
       'TTTGTCATCATCTGTT-2', 'TTTGTCATCGCCTGAG-2', 'TTTGTCATCTAACGGT-2',
       'TTTGTCATCTGAGTGT-2'],
      dtype='object', name='index', length=13673)
In [ ]:
counter=0
for x in old.obs_names[:]:
  if x[-1]=='2':
    print(counter)
    break
  counter+=1
6853
In [ ]:
old.obs_names[6850:6860]
Out[ ]:
Index(['TTTGTCAGTGTGAAAT-1', 'TTTGTCAGTTCCACAA-1', 'TTTGTCATCCGGGTGT-1',
       'AAACCTGAGCAGCGTA-2', 'AAACCTGAGCTGAACG-2', 'AAACCTGCAAGCCATT-2',
       'AAACCTGCAGGGAGAG-2', 'AAACCTGCATGAGCGA-2', 'AAACCTGGTCTGCCAG-2',
       'AAACCTGTCAATCTCT-2'],
      dtype='object', name='index')
In [ ]:
kallistocounts=[]
cellrangercounts=[]
counter=0
for x in old.obs_names[:6853]:
  kallistocounts+=[ClickTag_counts_filtered.T[x]]
  cellrangercounts+=[jelly1tags[x[:-2]]]
  counter+=1
In [ ]:
counter
Out[ ]:
6853
In [ ]:
for tag in list(range(10)):
  plt.scatter(np.log10([x[tag] for x in kallistocounts]), np.log10([x[tag] for x in cellrangercounts]), alpha=0.3)
  plt.show()
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: RuntimeWarning: divide by zero encountered in log10
  
In [ ]: