Comparative Analysis of Cell Atlas Concordance and Perturbation Responses

In [ ]:

!date

Mon May 10 17:03:01 UTC 2021

Download Data¶

In [1]:

import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
    url = 'https://api.datacite.org/dois/'+doi+'/media'
    r = requests.get(url).json()
    netcdf_url = r['data'][0]['attributes']['url']
    r = requests.get(netcdf_url,stream=True)
    #Set file name
    fname = doi.split('/')[-1]+ext
    #Download file with progress bar
    if r.status_code == 403:
        print("File Unavailable")
    if 'content-length' not in r.headers:
        print("Did not get file")
    else:
        with open(fname, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            pbar = tnrange(int(total_length/1024), unit="B")
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update()
                    f.write(chunk)
        return fname

In [3]:

#Kallisto bus clustered starvation data, h5ad
download_file('10.22002/D1.1796','.gz')

#CellRanger Starvation h5ad data
download_file('10.22002/D1.1798','.gz')

#Import previously saved, clustered, & filtered stimulation data 

download_file('10.22002/D1.1821','.gz')

In [ ]:

#Starvation h5ad data, all nonzero genes included, filtered for 'real cells' from de-multiplexing
download_file('10.22002/D1.1797','.gz')

#Import raw, unclustered stimulation data
download_file('10.22002/D1.1814','.gz')

In [ ]:

#Import merged data with knn clusters
download_file('10.22002/D1.1823','.gz')

In [5]:

!gunzip *.gz

In [6]:

!pip install --quiet anndata
!pip install --quiet scanpy==1.6.0
!pip install --quiet louvain

     |████████████████████████████████| 133kB 24.4MB/s 
     |████████████████████████████████| 7.7MB 20.0MB/s 
     |████████████████████████████████| 81kB 9.4MB/s 
     |████████████████████████████████| 71kB 7.7MB/s 
     |████████████████████████████████| 1.2MB 33.4MB/s 
  Building wheel for sinfo (setup.py) ... done
  Building wheel for umap-learn (setup.py) ... done
  Building wheel for pynndescent (setup.py) ... done
     |████████████████████████████████| 2.2MB 25.8MB/s 
     |████████████████████████████████| 3.2MB 43.0MB/s

Import packages¶

In [4]:

import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
sc.set_figure_params(dpi=125)

import seaborn as sns
sns.set(style="whitegrid")

Do Distance Based Analysis for Inter- and Intra- Cluster Distances¶

Read in previously saved data

In [5]:

# Description: bus_fs_combo data (filtered + clustered starvation (fed/starved --> fs) data)
# Description: bus_combo_noZero (filtered + clustered stimulation data) from Kallisto bus

#bus_fs_combo
bus_fs_combo = anndata.read("D1.1796")

print(bus_fs_combo)

bus_combo_noZero = anndata.read('D1.1821') 
print(bus_combo_noZero)

#Previously saved overlap, merged data (from both experiments)
overlap_combo = anndata.read("D1.1823")
overlap_combo

AnnData object with n_obs × n_vars = 13673 × 8696
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub'
    var: 'n_counts', 'mean', 'std'
    uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 18921 × 10260
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'condition', 'orgID', 'cellRanger_louvain'
    var: 'n_counts', 'mean', 'std'
    uns: 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', 'condition_colors', 'louvain', 'neighbors', 'paga', 'pca', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

Out[5]:

AnnData object with n_obs × n_vars = 32594 × 6756
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus'
    var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std'
    uns: 'pca'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'

In [15]:

#How raw overlap adata was made (no genes filtered out)
bus_fs_raw = anndata.read("D1.1797")

bus_stim_raw = anndata.read("D1.1814")

#Merge datasets
bus_fs_raw.obs_names = [i+'_fs' for i in bus_fs_raw.obs_names]
bus_stim_raw.obs_names = [i+'_ieg' for i in bus_stim_raw.obs_names]

raw_overlap_combo = bus_fs_raw.concatenate(bus_stim_raw,join='outer', index_unique=None)

Predict labels for stimulation data

In [ ]:

#Make X_pca with merged, use top n PC's
def getPredLabels(overlap_fs,overlap_combo, n_PC):
    
    numFS = len(overlap_fs.obs_names)
    X_train = overlap_combo.obsm['X_pca'][0:numFS,0:n_PC]

    #X_pca at stim rows = X_test
    X_test = overlap_combo.obsm['X_pca'][numFS:,0:n_PC]

    #y_train is f/s louvain labels
    y_train = overlap_fs.obs['cellRanger_louvain']

    #Fit and predict
    classifier = KNeighborsClassifier(n_neighbors=15)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    
    labels = list(y_train)+list(y_pred)
    
    print(len(labels),' labels created')
    
    return labels

How merged dataset (joint-PC space) with stimulation and starvation experiments was made (= overlap_combo)¶

In [ ]:

#Get intersection of var names between the two (making overlap_combo)
overlap = list(set(bus_combo_noZero.var_names).intersection(bus_fs_combo.var_names))

overlap_fs = bus_fs_combo[:,overlap]

overlap_stim = bus_combo_noZero[:,overlap]


#Merge datasets
overlap_fs.obs_names = [i+'_fs' for i in overlap_fs.obs_names]
overlap_stim.obs_names = [i+'_ieg' for i in overlap_stim.obs_names]

origin = list(np.repeat('FS',len(overlap_fs.obs_names))) + list(np.repeat('Stim',len(overlap_stim.obs_names)))
overlap_combo = overlap_fs.concatenate(overlap_stim,join='outer', index_unique=None)

overlap_combo.obs['cell_origin'] = pd.Categorical(origin)
sc.pp.scale(overlap_combo, max_value=10)

#Do PCA on merged dataset + plot variance explained
sc.tl.pca(overlap_combo, n_comps=60)
#sc.pl.pca_variance_ratio(overlap_combo, log=True)

#overlap_combo

labels = getPredLabels(overlap_fs,overlap_combo,n_PC = 60)

overlap_combo.obs['knn_clus'] = pd.Categorical(labels)

32594  labels created

In [ ]:

sc.pp.neighbors(overlap_combo,n_neighbors=150, n_pcs=60,random_state=42)
sc.tl.paga(overlap_combo, groups='knn_clus')
sc.pl.paga(overlap_combo, color=['knn_clus'])

In [ ]:

sc.tl.tsne(overlap_combo,random_state=42,n_pcs=60,early_exaggeration=5)

WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.

In [ ]:

sc.tl.umap(overlap_combo,random_state=42,spread=2.5,min_dist = 0.8,init_pos='paga')

In [ ]:

# sc.pl.tsne(overlap_combo,color=['knn_clus'])

In [ ]:

sc.pl.umap(overlap_combo,color=['knn_clus'],color_map='viridis')

In [ ]:

#overlap_combo.write('overlap_combo.h5ad')

Plot percent of K neighbors for each stim cell that are in one unique cluster¶

In [ ]:

from sklearn.neighbors import NearestNeighbors
import random 

#Read in previously saved data
overlap_combo = anndata.read("D1.1823")

n = 15

neigh = NearestNeighbors(n_neighbors=n)
n_PC = 60


numFS = len(overlap_fs.obs_names)
X_train = overlap_combo.obsm['X_pca'][0:numFS,0:n_PC]

#X_pca at stim rows = X_test
X_test = overlap_combo.obsm['X_pca'][numFS:,0:n_PC]

#y_train is f/s louvain labels
y_train = overlap_fs.obs['cellRanger_louvain']

#Fit and predict
#neigh.fit(X_train)


#Subsample 70% of fed/starved data for training, test on remaining 30% and stim data
numSamp = np.int(0.7*X_train.shape[0])
ind = random.sample(range(0,X_train.shape[0]),numSamp)
allInd = range(0,X_train.shape[0])
rest = [i for i in allInd if i not in ind]

#Fit and predict
neigh.fit(X_train[ind,:])

X_rest = X_train[rest,:]



fsNeigh = neigh.kneighbors(X_rest)
stimNeigh = neigh.kneighbors(X_test)

In [ ]:

y_train = y_train[ind]

In [ ]:

stim_neighbors = stimNeigh[1]
fs_neighbors = fsNeigh[1]

#y_train = y_train[ind]
stim_perc = np.zeros(stim_neighbors.shape[0])
for i in range(0,len(stim_perc)): 
  #How many of top neighbors come from same cluster in fed/starved data (out of n=15)
  stim_perc[i] = np.max(y_train[stim_neighbors[i]].value_counts())/n 

fs_perc = np.zeros(fs_neighbors.shape[0])
#y_train = y_train[ind]
for i in range(0,len(fs_perc)): 
  fs_perc[i] = np.max(y_train[fs_neighbors[i]].value_counts())/n

In [ ]:

plt.hist(stim_perc,density=False,range=(0,1),alpha=0.6 ,bins=15,weights=np.ones(len(stim_perc))/len(stim_perc),label='stim-neighbors')
plt.hist(fs_perc,density=False,range=(0,1),alpha=0.6 ,bins=15 ,weights=np.ones(len(fs_perc))/len(fs_perc),label='fs-neighbors')
plt.legend(loc='upper left')
plt.title('Histogram of Nearest Neighbor Specificity',fontsize=11)
plt.grid(b=None)
plt.xlabel("Fraction of n=15 nearest neighbors in unique cluster",fontsize=10)
plt.ylabel("Frequency",fontsize=10)
plt.show()

Determine Inter and Intra Cluster Distances (perturbation effect vs cell type distance)¶

In [ ]:

#Plot distances between centroid of stim-starved and fed-starved for each cluster
def changeBool(fed):
    if fed == 'True':
        return 'fed'
    elif fed == 'False':
        return 'starved' 
    else:
        return 'stim'

def addConds(overlap_combo):
    fs_conds = [changeBool(i) for i in overlap_combo.obs['fed']]
    overlap_combo.obs['centroid_conds'] = pd.Categorical(fs_conds)

    overlap_combo.obs['centroid_conds']

In [ ]:

from sklearn.metrics import pairwise_distances
#Centroids of cell atlas/defined clusters
def getClusCentroids(overlap_combo,pcs=60,clusType='knn_clus'):
    clusters = np.unique(overlap_combo.obs[clusType])
    centroids = np.zeros((len(clusters),pcs))
    
    for c in clusters:
        
        sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
        pca_embed = sub_data.obsm['X_pca'][:,0:pcs]
        centroid = pca_embed.mean(axis=0)
        
        centroids[c,:] = list(centroid)
        
    return centroids

In [ ]:

#Distance between conditions within clusters
def getCentroidDists(overlap_combo,pcs=60,clusType='knn_clus'):
    fedStarv = []
    fsStim = []

    cluster = []

    #Initialize output matrix
    addConds(overlap_combo)
    clusters = np.unique(overlap_combo.obs[clusType])
    centroid_dist = pd.DataFrame(columns =['fedStarv','fsStim','cluster','clus_color']) 

    for c in clusters:
        #Get cells in cluster and their conditions
        sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
        centroid_conds = sub_data.obs['centroid_conds'] 

        #Get 2D tsne embedding
        pca_embed = sub_data.obsm['X_pca'][:,0:pcs] #PCA

        #Get location of condition cells
        stim_pos = list(centroid_conds == 'stim')
        allFS_pos = list(centroid_conds != 'stim')
        fed_pos = list(centroid_conds == 'fed')
        starved_pos = list(centroid_conds == 'starved')

        #Get column means for x,y coords
        meanFed = pca_embed[fed_pos,].mean(axis=0)

        meanStarv = pca_embed[starved_pos,].mean(axis=0)

        meanStim = pca_embed[stim_pos,].mean(axis=0)

        meanFS = pca_embed[allFS_pos,].mean(axis=0)


        cluster += [c]

        #Dist between fed and starved
        fs_dist = np.linalg.norm(meanFed - meanStarv,1)

        #Dist between (all) starved experiment and stim

        starvStim_dist = np.linalg.norm(meanFS - meanStim,1)

        fedStarv += [fs_dist]
        fsStim += [starvStim_dist]

    centroid_dist['fedStarv'] = fedStarv
    centroid_dist['fsStim'] = fsStim

    centroid_dist['cluster'] = cluster

    return centroid_dist

In [ ]:

#Distance between conditions within clusters
def getStimCentroidDists(overlap_combo,pcs=60,clusType='knn_clus'):
    swKCl = []
    swDI = []

    cluster = []

    #Initialize output matrix
    #addConds(overlap_combo)
    clusters = np.unique(overlap_combo.obs[clusType])
    centroid_dist = pd.DataFrame(columns =['swKCl','swDI','cluster','clus_color']) 

    for c in clusters:
        #Get cells in cluster and their conditions
        sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
        centroid_conds = sub_data.obs['condition'] 

        #Get 2D tsne embedding
        pca_embed = sub_data.obsm['X_pca'][:,0:pcs] #PCA

        #Get location of condition cells
        sw_pos = list(centroid_conds == 'SW')
        kcl_pos = list(centroid_conds == 'KCl')
        di_pos = list(centroid_conds == 'DI')
        

        #Get column means for x,y coords
        meanSW = pca_embed[sw_pos,].mean(axis=0)

        meanKCl = pca_embed[kcl_pos,].mean(axis=0)

        meanDI = pca_embed[di_pos,].mean(axis=0)




        cluster += [c]

        #Dist between fed and starved
        swKCl_dist = np.linalg.norm(meanSW - meanKCl,1)

        #Dist between (all) starved experiment and stim

        swDI_dist = np.linalg.norm(meanSW - meanDI,1)

        swKCl += [swKCl_dist]
        swDI += [swDI_dist]

    centroid_dist['swKCl'] = swKCl 
    centroid_dist['swDI'] =  swDI

    centroid_dist['cluster'] = cluster

    return centroid_dist

Fed/Starved L1 Analysis¶

Cell type v state plots

In [ ]:

withinFS_Dists = getCentroidDists(bus_fs_combo,60,'cellRanger_louvain')
withinFS_Dists.head()

Out[ ]:

	fedStarv	fsStim	cluster	clus_color
0	11.357661	NaN	0	NaN
1	17.776861	NaN	1	NaN
2	12.411180	NaN	2	NaN
3	18.126976	NaN	3	NaN
4	13.799874	NaN	4	NaN

In [ ]:

#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(bus_fs_combo,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
print(np.mean(pairCentroid_dists))

150.34457080272998

In [ ]:

np.unique(bus_fs_combo.obs['cellRanger_louvain'])

Out[ ]:

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35])

In [ ]:

#Early cnidocytes, germ cells, stem cells have distances that overlap with intra-F/S distances for 14&19
np.where(pairCentroid_dists < 62)

Out[ ]:

(array([ 0,  0,  1,  2,  2,  3,  3,  4,  5,  6,  7,  7,  8,  9, 10, 11, 12,
        12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 26, 27,
        28, 29, 30, 31, 32, 33, 34, 35]),
 array([ 0, 12,  1,  2, 20,  3,  7,  4,  5,  6,  3,  7,  8,  9, 10, 11,  0,
        12, 13, 14, 15, 16, 17, 18, 19,  2, 20, 21, 22, 23, 24, 25, 26, 27,
        28, 29, 30, 31, 32, 33, 34, 35]))

In [ ]:

max(pairCentroid_dists.reshape((36*36,)))
reshape_noZero = [i for i in pairCentroid_dists.reshape((36*36,)) if i != 0]
print(np.min(reshape_noZero))

matplotlib.rc('axes',edgecolor='black')

plt.figure(figsize=(16,10))
plt.hist(withinFS_Dists['fedStarv'],color='r',bins=15,alpha=0.6,label='fed-starved',density=True)

plt.hist(reshape_noZero, bins=36,color = 'orange',alpha=0.6,label='pairwise clusters',density=True)
plt.legend(loc='upper right')
plt.title('Comparison to pairwise distribution (pairwise on clusters with fed/starved cells)')
plt.grid(b=None)
plt.xlabel("L1 Distance with 60 PC's")
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

49.54310559108853

In [ ]:

#Area where line between cell state/type is blurred seems to be only in digestive endodermal clusters 14,19 (strong metabolic responses to starvation)
outlier_clus = withinFS_Dists[withinFS_Dists.fedStarv> 46.69]['cluster']
outlier_clus

Out[ ]:

14    14
19    19
Name: cluster, dtype: int64

Look at median genes per cell across types

In [16]:

bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_combo.obs['cellRanger_louvain'])
sc.pp.filter_genes(bus_fs_raw, min_counts=3)
sc.pp.filter_cells(bus_fs_raw, min_counts=1)
bus_fs_raw

Out[16]:

AnnData object with n_obs × n_vars = 13673 × 22454
    obs: 'batch', 'cellRanger_louvain', 'n_counts'
    var: 'n_counts'

In [12]:

# raw_overlap_combo[raw_overlap_combo.obs['cellRanger_louvain'].isin([0])].X

Out[12]:

<0x40157 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [17]:

bus_fs_raw.obs['cellRanger_louvain']

Out[17]:

AAACCTGAGAGGGCTT-1_fs    10
AAACCTGAGAGGTTAT-1_fs     4
AAACCTGAGCGAGAAA-1_fs     3
AAACCTGAGCGTGTCC-1_fs    15
AAACCTGAGGATTCGG-1_fs     3
                         ..
TTTGTCAGTGGACGAT-2_fs    24
TTTGTCATCATCTGTT-2_fs     2
TTTGTCATCGCCTGAG-2_fs     6
TTTGTCATCTAACGGT-2_fs    22
TTTGTCATCTGAGTGT-2_fs     0
Name: cellRanger_louvain, Length: 13673, dtype: category
Categories (36, int64): [0, 1, 2, 3, ..., 32, 33, 34, 35]

In [18]:

clusters = np.unique(bus_fs_raw.obs['cellRanger_louvain'])
len(clusters)

Out[18]:

In [48]:

clusters = np.unique(bus_fs_raw.obs['cellRanger_louvain'])

medians = []
clus = []

X = bus_fs_raw.X.todense()
#For each cell type
for cl in clusters:
  data = bus_fs_raw[bus_fs_raw.obs['cellRanger_louvain'].isin([cl])]

  #Get if for a gene count is > 0 (True/False)
  subX = X[bus_fs_raw.obs['cellRanger_louvain'].isin([cl]),:]
  nonzero = subX > 0


  #Sum each row (cell)
  sum = np.sum(nonzero,axis=1)
  s = np.median(np.array(sum.tolist())) #np.reshape(sum, sum.shape[0])


  #Take median
  med = np.median(s,axis = None)

  clus += [cl]
  medians += [med]

meds = pd.DataFrame()
meds['Median'] = medians
meds['Louvain Cluster'] = clus
meds.head()

Out[48]:

	Median	Louvain Cluster
0	1587.0	0
1	533.0	1
2	313.0	2
3	531.0	3
4	668.0	4

In [52]:

meds['Median'] = np.log(meds['Median'])
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="Louvain Cluster", y="Median", data=meds,linewidth=0.5,s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Median Genes per Cell (log)')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(meds["Louvain Cluster"]))  
plt.show()

Compare internal distances to differences in numbers of fed & starved cells (Spearman Rank correlation)

In [ ]:

#Composition of broad cell types between fed and starved , cell counts
counts = pd.DataFrame(columns =['count', 'orgID','condition','cluster']) 
clusters = np.unique(bus_fs_combo.obs['cellRanger_louvain'])

c = []
org = []
cond = []
clus = []

orgs = np.unique(bus_fs_combo.obs['orgID'])
conds = ['True', 'False']
for cl in clusters:
    data = bus_fs_combo[bus_fs_combo.obs['cellRanger_louvain'].isin([cl])]
    
    for cd in conds:
        #c_data = data[data.obs['condition'].isin([cd])]
        
        for o in orgs:
            pos = (data.obs['fed'].isin([cd])) & (data.obs['orgID'].isin([o]))
            org_data = data[pos]
            c += [org_data.n_obs]
            org += [o]
            if cd == 'True':
              cond += ['fed']
            else:
              cond += ['starved']
            clus += [cl]
            
                
print(len(c))


   
    
counts['count'] = c
counts['orgID'] = org
counts['condition'] = cond
counts['cluster'] = clus

In [ ]:

counts = counts[counts['count']> 0 ]
counts.head()

Out[ ]:

	count	orgID	condition
0	118	1	fed
2	83	2	fed
3	102	3	fed
4	102	4	fed
5	118	5	fed

In [ ]:

mean = []
var = []
clus = []
for i in np.unique(counts['cluster']):

  sub = counts[counts['cluster']==i]
  mean += [np.mean(sub['count'])]
  var += [np.var(sub['count'],ddof=1)]
  clus += [i]

vars = pd.DataFrame()
vars['mean'] = mean
vars['var'] = var
vars['cluster'] = clus

vars.head()

Out[ ]:

	mean	var	cluster
0	127.5	1116.055556	0
1	125.2	4871.066667	1
2	89.4	1530.266667	2
3	85.0	221.777778	3
4	68.5	535.388889	4

In [ ]:

#from sklearn.metrics import r2_score
import pylab as pl

vars['cluster'] = [str(i) for i in vars['cluster']] 
vars['logmean'] = np.log10(vars['mean'])
vars['logvar'] = np.log10(vars['var'])
ax = sns.scatterplot(x="logmean", y="logvar", hue="cluster", data=vars,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Variance in Cell Counts (log10)')
ax.set(xlabel='Average Cell Count (log10)')


poly = pl.polyfit(vars['logmean'], vars['logvar'], 1)

def poly2latex(poly, variable="x", width=2):
    t = ["{0:0.{width}f}"]
    t.append(t[-1] + " {variable}")
    t.append(t[-1] + "^{1}")

    def f():
        for i, v in enumerate(reversed(poly)):
            idx = i if i < 2 else 2
            yield t[idx].format(v, i, variable=variable, width=width)

    return "${}$".format("+".join(f()))

x2 = np.linspace(0, 2.5, 100)
y2 = np.polyval(poly, x2)
plt.plot(x2, y2, "b--",lw=2)
pl.text(x2[5]+0.1, y2[5], poly2latex(poly), fontsize=11)


r2 = np.corrcoef(np.polyval(poly, vars['logmean']),vars['logvar'])
plt.text(x2[5]+0.1, y2[5]-0.25, "r: "+str(np.round(r2[0,1],3)), fontsize=11)

plt.grid(False)

In [ ]:

vars['cluster'] = [str(i) for i in vars['cluster']] 
vars['logmean'] = np.log10(vars['mean'])
vars['logvar'] = np.log10(vars['var'])
ax = sns.scatterplot(x="mean", y="var", hue="cluster", data=vars,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Variance in Cell Counts ')
ax.set(xlabel='Average Cell Count ')
plt.grid(False)

In [ ]:

#All animals have >0 cells in all cell types
numOrgs = []
for i in np.unique(counts.cluster):
  numOrgs += [len(np.unique(counts.orgID[counts['cluster'] == i ]))]
np.unique(numOrgs)

Out[ ]:

array([10])

In [ ]:

individSum = counts.groupby('orgID').sum()
frac = []
for i in np.unique(counts.cluster):
  frac += list(counts['count'][counts['cluster'] == i ]/sum(counts['count'][counts['cluster'] == i ]))
counts['frac'] = frac
counts.head()

Out[ ]:

	count	orgID	condition	frac
0	118	1	fed	0.092549
2	83	2	fed	0.065098
3	102	3	fed	0.080000
4	102	4	fed	0.080000
5	118	5	fed	0.092549

In [ ]:

print(individSum)
np.mean(individSum['count'])

       count  cluster
orgID                
1       1235      630
10      1276      630
2       1305      630
3       1274      630
4        990      630
5       1807      630
6       1064      630
7       1467      630
8       1470      630
9       1785      630

Out[ ]:

1367.3

In [ ]:

#plt.plot(counts['count'],counts['frac'],'o')
counts['cluster'] = [str(i) for i in counts['cluster']] 
ax = sns.scatterplot(x="count", y="frac", hue="cluster",style="orgID", data=counts,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Fraction of Cell Type')
ax.set(xlabel='Cell Count')
scipy.stats.spearmanr(counts['count'],counts['frac'])

Out[ ]:

SpearmanrResult(correlation=0.4322226529501913, pvalue=8.084348794253579e-18)

In [ ]:

counts['cluster'] = [int(i) for i in counts['cluster']] 
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="cluster", y="count", hue="orgID",style="orgID", data=counts,linewidth=0.5,s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(counts.cluster))  
plt.show()

In [ ]:

counts['cluster'] = [int(i) for i in counts['cluster']] 
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="cluster", y="count", hue="condition",style="orgID", data=counts,linewidth=0.5,palette=["darkorange", "b"],s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(counts.cluster))  
plt.show()

In [ ]:

#One-way anova for each cluster
pvalsFTest = {}

for cl in clusters:
  sub = counts[counts.cluster == cl]
  subFed = sub[sub.condition == 'fed']
  subStarv = sub[sub.condition == 'starved']

  fedVals = list(subFed['count'])
  starvVals = list(subStarv['count'])

  pvalsFTest[cl] = scipy.stats.f_oneway(fedVals,starvVals)

In [ ]:

import statsmodels.stats as sm
pvals = []
# sizes = [] 
for c in clusters:
  pvals += [pvalsFTest[c][1]] 
  # sizes += [len(bus_fs_combo[bus_fs_combo.obs['cellRanger_louvain'].isin([c])].obs_names)]

pvals = sm.multitest.fdrcorrection(pvals,alpha=0.05, method='indep')
pvals

Out[ ]:

(array([False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False]),
 array([1.64253441e-01, 9.35522778e-01, 5.21386720e-01, 8.07361578e-01,
        9.97826979e-01, 3.31498471e-01, 8.07361578e-01, 5.68508228e-01,
        6.91908915e-01, 5.34369937e-01, 4.49408286e-04, 1.99877010e-01,
        9.63535360e-01, 7.78106033e-01, 2.25187596e-01, 4.82880690e-01,
        6.42595657e-01, 4.19543898e-01, 6.87698796e-01, 9.35522778e-01,
        3.31498471e-01, 9.63535360e-01, 9.63535360e-01, 3.31498471e-01,
        3.31498471e-01, 9.97826979e-01, 4.82880690e-01, 1.64253441e-01,
        4.82880690e-01, 2.09527206e-01, 4.82880690e-01, 3.31498471e-01,
        9.63535360e-01, 1.64253441e-01, 9.35522778e-01, 1.00000000e+00]))

In [ ]:

#counts = counts[counts['count']> 0 ]
#https://stackoverflow.com/questions/36578458/how-does-one-insert-statistical-annotations-stars-or-p-values-into-matplotlib/37518947#37518947

counts['cluster'] = [int(i) for i in counts['cluster']] 
plt.figure(figsize=(16, 6))
ax = sns.boxplot(x="cluster", y="count", hue="condition", data=counts,linewidth=0.5,palette=["darkorange", "b"])
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')

# statistical annotation
# alpha < 0.05
for i in [11]: 
  w = 0.5
  x1, x2 = i-w,i+w  

  y, h, col = counts['count'][counts['cluster'] == i].max() + 2, 4, 'k'
  plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
  plt.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)


plt.show()

In [ ]:

internalDist = list(withinFS_Dists['fedStarv'])

In [ ]:

scipy.stats.spearmanr(internalDist,pvals[1])

Out[ ]:

SpearmanrResult(correlation=-0.050871795043468955, pvalue=0.768261688241116)

In [ ]:

plt.scatter(internalDist,pvals[1])
plt.xlabel('Internal Distances')
plt.ylabel('P-value: Cell Count Difference')
plt.grid(None)

In [ ]:

counts.to_csv('individ_cellCounts.csv',index=None)

Repeat for Stimulation Data¶

In [ ]:

overlap_combo = anndata.read("D1.1823")
bus_combo_noZero.obs['cellRanger_louvain'] = pd.Categorical(overlap_combo[overlap_combo.obs['cell_origin'] == 'Stim'].obs['knn_clus'])

In [ ]:

withinStim_Dists = getStimCentroidDists(bus_combo_noZero,60,'cellRanger_louvain')
withinStim_Dists

Out[ ]:

	swKCl	swDI	cluster	clus_color
0	7.350214	8.503921	0	NaN
1	21.065245	36.732548	1	NaN
2	20.948408	20.759415	2	NaN
3	13.442171	34.598927	3	NaN
4	14.205286	25.923903	4	NaN
5	22.041147	31.476810	5	NaN
6	14.477929	25.217993	6	NaN
7	11.867729	17.921000	7	NaN
8	12.292484	11.644725	8	NaN
9	22.288679	22.918928	9	NaN
10	10.736828	11.939169	10	NaN
11	9.275630	8.065190	11	NaN
12	5.801665	22.793827	12	NaN
13	31.335329	31.087091	13	NaN
14	25.940889	28.084023	14	NaN
15	10.881435	18.185766	15	NaN
16	25.211300	17.230129	16	NaN
17	22.916479	12.188016	17	NaN
18	21.876759	14.847662	18	NaN
19	23.442184	24.017313	19	NaN
20	56.649334	65.165184	20	NaN
21	24.023872	16.950150	21	NaN
22	13.108215	19.507177	22	NaN
23	6.248736	12.076812	23	NaN
24	32.759808	45.112389	24	NaN
25	11.500583	18.344841	25	NaN
26	34.523087	26.619263	26	NaN
27	25.315067	31.565296	27	NaN
28	46.271626	31.329525	28	NaN
29	13.501001	10.425589	29	NaN
30	18.720173	38.476334	30	NaN
31	38.197033	57.892250	31	NaN
32	31.816713	17.769464	32	NaN
33	40.618649	44.847408	33	NaN
34	11.309300	12.301980	34	NaN
35	16.585102	11.396259	35	NaN

In [ ]:

#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(bus_combo_noZero,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
print(np.mean(pairCentroid_dists))

173.6400492134319

In [ ]:

#withinStim_Dists['swDI']
print(np.max(withinStim_Dists))

swKCl         56.649334
swDI          65.165184
cluster       35.000000
clus_color          NaN
dtype: float64

In [ ]:

max(pairCentroid_dists.reshape((36*36,)))
reshape_noZero = [i for i in pairCentroid_dists.reshape((36*36,)) if i != 0]
print(np.min(reshape_noZero))

plt.figure(figsize=(16,10))
plt.hist(withinStim_Dists['swKCl'],color='r',bins=15,alpha=0.6,label='SW-KCl',density=True)
plt.hist(withinStim_Dists['swDI'],color='b',bins=15,alpha=0.6,label='SW-DI',density=True)

plt.hist(reshape_noZero, bins=36,color = 'orange',alpha=0.6,label='pairwise clusters',density=True)
plt.legend(loc='upper right')
plt.title('Comparison to pairwise distribution (pairwise on clusters with SW/KCl/DI cells)')

plt.xlabel("L1 Distance with 60 PC's")
plt.ylabel("Normalized Frequency")
plt.grid(b=None)
plt.show()

65.46425357810222

In [ ]:

print(np.where(pairCentroid_dists < 65.46))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35]))

Organismal Distances, Fed/Starved + Stimulation¶

Batch effect plots

In [ ]:

#condition -- Stim, fed -- Fed/Starved data
overlap_combo = anndata.read("D1.1823")
overlap_combo

Out[ ]:

AnnData object with n_obs × n_vars = 32594 × 6756
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus'
    var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std'
    uns: 'pca'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'

In [ ]:

#Use overlap_combo

#Go through each clustee
clus = np.unique(overlap_combo.obs['knn_clus'])

fed_data = overlap_combo[overlap_combo.obs['fed'] == 'True']
sw_data = overlap_combo[overlap_combo.obs['condition'] == 'SW']

fed_orgs = np.unique(fed_data.obs['orgID'])
sw_orgs = np.unique(sw_data.obs['orgID'])

pcs = 60

org_metadata = pd.DataFrame(columns =['orgID','cluster','condition']) #centroid = row mean for all cells in organism
num_points = (len(fed_orgs)+len(sw_orgs))*len(clus)

#Get centroid in PCA space in each cluster for each organism
org_centroids = np.zeros((num_points,pcs))

condition = []
cluster = []
orgID = []

count = 0
toRemove = []
for c in clus:
  #Get all cells in chosen condition
  fed_sub = fed_data[fed_data.obs['knn_clus'] == c]
  fed_pca_embed = fed_sub.obsm['X_pca'][:,0:pcs] #PCA matrix

  sw_sub = sw_data[sw_data.obs['knn_clus'] == c]
  sw_pca_embed = sw_sub.obsm['X_pca'][:,0:pcs] #PCA matrix

  #For each org, get centroid of cells (X_pca)
  for sw in sw_orgs:
    condition += ['SW']
    cluster += [c]
    orgID += [sw]

    #Get location of org cells
    org_pos = list(sw_sub.obs['orgID'] == sw)

    #Sum down rows of cells for each PC
    if len(sw_pca_embed[org_pos,]) == 0:
      toRemove += [count]

    org_centroids[count,:] = list(sw_pca_embed[org_pos,].mean(axis=0))

    count += 1

  for fed in fed_orgs:
    condition += ['Fed']
    cluster += [c]
    orgID += [fed]

    #Get location of org cells
    org_pos = list(fed_sub.obs['orgID'] == fed)
    #Sum down rows of cells for each PC
    if len(fed_pca_embed[org_pos,]) == 0:
      toRemove += [count]

    org_centroids[count,:] = list(fed_pca_embed[org_pos,].mean(axis=0))

    count += 1    

  #For each org, get centroid of cells (X_pca)

  #Average pairwise distance between org centroid

org_metadata['orgID'] = orgID
org_metadata['cluster'] = cluster
org_metadata['condition'] = condition

In [ ]:

toRemove

Out[ ]:

[]

In [ ]:

org_metadata = org_metadata.drop(org_metadata.index[toRemove])
org_metadata.head()

org_centroids= np.delete(org_centroids, toRemove, 0)
org_centroids[0,:]

Out[ ]:

array([-2.25471210e+00, -1.51177585e+00, -1.93993020e+00, -9.54584032e-02,
       -2.38718843e+00, -1.51001954e+00, -2.81468654e+00,  1.05100930e+00,
        1.54143572e+00, -7.58547902e-01, -1.50774598e+00, -2.47600722e+00,
        7.53088653e-01, -9.25398111e-01,  5.30199647e-01, -2.36053783e-02,
        9.48955193e-02, -6.50646329e-01,  1.05686748e+00, -2.67228413e+00,
        2.98014688e+00,  5.93579225e-02,  5.69488704e-01, -1.05491745e+00,
        1.12574625e+00, -1.49573892e-01,  5.68022847e-01, -1.29234409e+00,
        1.40452191e-01, -3.84621084e-01,  2.95532733e-01,  1.13393039e-01,
        2.88706899e-01, -5.50938368e-01, -1.17623888e-01, -1.51531205e-01,
       -7.78403163e-01, -7.72583902e-01,  9.31857646e-01,  9.21139657e-01,
       -4.21358228e-01,  9.92068797e-02, -3.57610792e-01,  1.40223965e-01,
        4.30851281e-02,  2.75345147e-01,  5.04794896e-01, -1.59878790e-01,
        1.80910379e-01,  2.61838585e-01, -2.14510873e-01,  1.25861779e-01,
       -3.76004130e-01,  2.04274207e-01, -1.61955715e-03, -9.25957318e-03,
        5.98372109e-02, -2.70651639e-01,  2.62535036e-01,  1.59801412e-02])

In [ ]:

org_pairDists = pd.DataFrame(columns =['cluster','pair_fed','pair_sw','pair_mix'])
cluster = []
pair_fed = []
pair_sw = []
pair_mix = []

for c in clus:

  #Fed org centroids
  where_fed = (org_metadata.cluster == c) & (org_metadata.condition == 'Fed')
  fed_mat = org_centroids[where_fed,:]

  pair_fed_dist = pairwise_distances(fed_mat, metric = 'l1')
  pair_fed_mean = np.mean(pair_fed_dist)

  #SW org centroids
  where_sw = (org_metadata.cluster == c) & (org_metadata.condition == 'SW')
  sw_mat = org_centroids[where_sw,:]

  pair_sw_dist = pairwise_distances(sw_mat, metric = 'l1')
  pair_sw_mean = np.mean(pair_sw_dist)

  pair_mix_dist = pairwise_distances(fed_mat,sw_mat, metric = 'l1')
  pair_mix_mean = np.mean(pair_mix_dist)

  cluster += [c]
  pair_fed += [pair_fed_mean]
  pair_sw += [pair_sw_mean]
  pair_mix += [pair_mix_mean]

org_pairDists['cluster'] = cluster
org_pairDists['pair_fed'] = pair_fed
org_pairDists['pair_sw'] = pair_sw
org_pairDists['pair_mix'] = pair_mix

In [ ]:

org_pairDists.head()

Out[ ]:

	cluster	pair_fed	pair_sw	pair_mix
0	0	8.005131	7.995117	23.570756
1	1	15.629137	32.140367	75.364971
2	2	9.598802	14.187616	49.332659
3	3	19.421792	22.127487	42.742088
4	4	16.030268	26.287836	36.652727

In [ ]:

plt.figure(figsize=(16,10))
plt.hist(org_pairDists['pair_fed'],color='r',bins=15,alpha=0.6,label='control org dist',density=True)
plt.hist(org_pairDists['pair_sw'],color='blue',bins=15,alpha=0.6,label='sw org dist',density=True)
plt.hist(org_pairDists['pair_mix'], bins=15,color = 'orange',alpha=0.6,label='mix dist',density=True)
plt.legend(loc='upper right')
plt.title('Inter-Organism Distances (L1)')
plt.grid(b=None)
plt.xlabel("Average L1 Distance with 60 PC's")
plt.ylabel("Normalized Frequency")

plt.show()

In [ ]:

overlap_combo

Out[ ]:

AnnData object with n_obs × n_vars = 32594 × 6756
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus'
    var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std'
    uns: 'pca'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'

In [ ]:

overlap_combo.write('overlap_combo.h5ad')

... storing 'louvain' as categorical
... storing 'leiden' as categorical
... storing 'orgID' as categorical
... storing 'fed' as categorical
... storing 'starved' as categorical
... storing 'cellRanger_louvain' as categorical
... storing 'annos' as categorical
... storing 'new_cellRanger_louvain' as categorical
... storing 'annosSub' as categorical
... storing 'condition' as categorical

In [ ]: