Skip to content

Comparative Analysis of Cell Atlas Concordance and Perturbation Responses

Open In Colab

In [ ]:
!date
Mon May 10 17:03:01 UTC 2021

Download Data

In [1]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
    url = 'https://api.datacite.org/dois/'+doi+'/media'
    r = requests.get(url).json()
    netcdf_url = r['data'][0]['attributes']['url']
    r = requests.get(netcdf_url,stream=True)
    #Set file name
    fname = doi.split('/')[-1]+ext
    #Download file with progress bar
    if r.status_code == 403:
        print("File Unavailable")
    if 'content-length' not in r.headers:
        print("Did not get file")
    else:
        with open(fname, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            pbar = tnrange(int(total_length/1024), unit="B")
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update()
                    f.write(chunk)
        return fname
In [3]:
#Kallisto bus clustered starvation data, h5ad
download_file('10.22002/D1.1796','.gz')

#CellRanger Starvation h5ad data
download_file('10.22002/D1.1798','.gz')

#Import previously saved, clustered, & filtered stimulation data 

download_file('10.22002/D1.1821','.gz')
In [ ]:
#Starvation h5ad data, all nonzero genes included, filtered for 'real cells' from de-multiplexing
download_file('10.22002/D1.1797','.gz')

#Import raw, unclustered stimulation data
download_file('10.22002/D1.1814','.gz')
In [ ]:
#Import merged data with knn clusters
download_file('10.22002/D1.1823','.gz')
In [5]:
!gunzip *.gz
In [6]:
!pip install --quiet anndata
!pip install --quiet scanpy==1.6.0
!pip install --quiet louvain
     |████████████████████████████████| 133kB 24.4MB/s 
     |████████████████████████████████| 7.7MB 20.0MB/s 
     |████████████████████████████████| 81kB 9.4MB/s 
     |████████████████████████████████| 71kB 7.7MB/s 
     |████████████████████████████████| 1.2MB 33.4MB/s 
  Building wheel for sinfo (setup.py) ... done
  Building wheel for umap-learn (setup.py) ... done
  Building wheel for pynndescent (setup.py) ... done
     |████████████████████████████████| 2.2MB 25.8MB/s 
     |████████████████████████████████| 3.2MB 43.0MB/s 

Import packages

In [4]:
import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
sc.set_figure_params(dpi=125)

import seaborn as sns
sns.set(style="whitegrid")

Do Distance Based Analysis for Inter- and Intra- Cluster Distances

Read in previously saved data

In [5]:
# Description: bus_fs_combo data (filtered + clustered starvation (fed/starved --> fs) data)
# Description: bus_combo_noZero (filtered + clustered stimulation data) from Kallisto bus

#bus_fs_combo
bus_fs_combo = anndata.read("D1.1796")

print(bus_fs_combo)

bus_combo_noZero = anndata.read('D1.1821') 
print(bus_combo_noZero)

#Previously saved overlap, merged data (from both experiments)
overlap_combo = anndata.read("D1.1823")
overlap_combo
AnnData object with n_obs × n_vars = 13673 × 8696
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub'
    var: 'n_counts', 'mean', 'std'
    uns: 'annosSub_colors', 'annos_colors', 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', "dendrogram_['new_cellRanger_louvain']", 'dendrogram_new_cellRanger_louvain', 'fed_colors', 'fed_neighbor_score_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'new_cellRanger_louvain_colors', 'orgID_colors', 'paga', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 18921 × 10260
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'condition', 'orgID', 'cellRanger_louvain'
    var: 'n_counts', 'mean', 'std'
    uns: 'cellRanger_louvain_colors', 'cellRanger_louvain_sizes', 'condition_colors', 'louvain', 'neighbors', 'paga', 'pca', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
Out[5]:
AnnData object with n_obs × n_vars = 32594 × 6756
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus'
    var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std'
    uns: 'pca'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
In [15]:
#How raw overlap adata was made (no genes filtered out)
bus_fs_raw = anndata.read("D1.1797")

bus_stim_raw = anndata.read("D1.1814")

#Merge datasets
bus_fs_raw.obs_names = [i+'_fs' for i in bus_fs_raw.obs_names]
bus_stim_raw.obs_names = [i+'_ieg' for i in bus_stim_raw.obs_names]

raw_overlap_combo = bus_fs_raw.concatenate(bus_stim_raw,join='outer', index_unique=None)

Predict labels for stimulation data

In [ ]:
#Make X_pca with merged, use top n PC's
def getPredLabels(overlap_fs,overlap_combo, n_PC):
    
    numFS = len(overlap_fs.obs_names)
    X_train = overlap_combo.obsm['X_pca'][0:numFS,0:n_PC]

    #X_pca at stim rows = X_test
    X_test = overlap_combo.obsm['X_pca'][numFS:,0:n_PC]

    #y_train is f/s louvain labels
    y_train = overlap_fs.obs['cellRanger_louvain']

    #Fit and predict
    classifier = KNeighborsClassifier(n_neighbors=15)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    
    labels = list(y_train)+list(y_pred)
    
    print(len(labels),' labels created')
    
    return labels

How merged dataset (joint-PC space) with stimulation and starvation experiments was made (= overlap_combo)

In [ ]:
#Get intersection of var names between the two (making overlap_combo)
overlap = list(set(bus_combo_noZero.var_names).intersection(bus_fs_combo.var_names))

overlap_fs = bus_fs_combo[:,overlap]

overlap_stim = bus_combo_noZero[:,overlap]


#Merge datasets
overlap_fs.obs_names = [i+'_fs' for i in overlap_fs.obs_names]
overlap_stim.obs_names = [i+'_ieg' for i in overlap_stim.obs_names]

origin = list(np.repeat('FS',len(overlap_fs.obs_names))) + list(np.repeat('Stim',len(overlap_stim.obs_names)))
overlap_combo = overlap_fs.concatenate(overlap_stim,join='outer', index_unique=None)

overlap_combo.obs['cell_origin'] = pd.Categorical(origin)
sc.pp.scale(overlap_combo, max_value=10)

#Do PCA on merged dataset + plot variance explained
sc.tl.pca(overlap_combo, n_comps=60)
#sc.pl.pca_variance_ratio(overlap_combo, log=True)

#overlap_combo

labels = getPredLabels(overlap_fs,overlap_combo,n_PC = 60)

overlap_combo.obs['knn_clus'] = pd.Categorical(labels)
32594  labels created
In [ ]:
sc.pp.neighbors(overlap_combo,n_neighbors=150, n_pcs=60,random_state=42)
sc.tl.paga(overlap_combo, groups='knn_clus')
sc.pl.paga(overlap_combo, color=['knn_clus'])
In [ ]:
sc.tl.tsne(overlap_combo,random_state=42,n_pcs=60,early_exaggeration=5)
WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.
In [ ]:
sc.tl.umap(overlap_combo,random_state=42,spread=2.5,min_dist = 0.8,init_pos='paga')
In [ ]:
# sc.pl.tsne(overlap_combo,color=['knn_clus'])
In [ ]:
sc.pl.umap(overlap_combo,color=['knn_clus'],color_map='viridis')
In [ ]:
#overlap_combo.write('overlap_combo.h5ad')
Plot percent of K neighbors for each stim cell that are in one unique cluster
In [ ]:
from sklearn.neighbors import NearestNeighbors
import random 

#Read in previously saved data
overlap_combo = anndata.read("D1.1823")

n = 15

neigh = NearestNeighbors(n_neighbors=n)
n_PC = 60


numFS = len(overlap_fs.obs_names)
X_train = overlap_combo.obsm['X_pca'][0:numFS,0:n_PC]

#X_pca at stim rows = X_test
X_test = overlap_combo.obsm['X_pca'][numFS:,0:n_PC]

#y_train is f/s louvain labels
y_train = overlap_fs.obs['cellRanger_louvain']

#Fit and predict
#neigh.fit(X_train)


#Subsample 70% of fed/starved data for training, test on remaining 30% and stim data
numSamp = np.int(0.7*X_train.shape[0])
ind = random.sample(range(0,X_train.shape[0]),numSamp)
allInd = range(0,X_train.shape[0])
rest = [i for i in allInd if i not in ind]

#Fit and predict
neigh.fit(X_train[ind,:])

X_rest = X_train[rest,:]



fsNeigh = neigh.kneighbors(X_rest)
stimNeigh = neigh.kneighbors(X_test)
In [ ]:
y_train = y_train[ind]
In [ ]:
stim_neighbors = stimNeigh[1]
fs_neighbors = fsNeigh[1]

#y_train = y_train[ind]
stim_perc = np.zeros(stim_neighbors.shape[0])
for i in range(0,len(stim_perc)): 
  #How many of top neighbors come from same cluster in fed/starved data (out of n=15)
  stim_perc[i] = np.max(y_train[stim_neighbors[i]].value_counts())/n 

fs_perc = np.zeros(fs_neighbors.shape[0])
#y_train = y_train[ind]
for i in range(0,len(fs_perc)): 
  fs_perc[i] = np.max(y_train[fs_neighbors[i]].value_counts())/n
In [ ]:
plt.hist(stim_perc,density=False,range=(0,1),alpha=0.6 ,bins=15,weights=np.ones(len(stim_perc))/len(stim_perc),label='stim-neighbors')
plt.hist(fs_perc,density=False,range=(0,1),alpha=0.6 ,bins=15 ,weights=np.ones(len(fs_perc))/len(fs_perc),label='fs-neighbors')
plt.legend(loc='upper left')
plt.title('Histogram of Nearest Neighbor Specificity',fontsize=11)
plt.grid(b=None)
plt.xlabel("Fraction of n=15 nearest neighbors in unique cluster",fontsize=10)
plt.ylabel("Frequency",fontsize=10)
plt.show()

Determine Inter and Intra Cluster Distances (perturbation effect vs cell type distance)

In [ ]:
#Plot distances between centroid of stim-starved and fed-starved for each cluster
def changeBool(fed):
    if fed == 'True':
        return 'fed'
    elif fed == 'False':
        return 'starved' 
    else:
        return 'stim'

def addConds(overlap_combo):
    fs_conds = [changeBool(i) for i in overlap_combo.obs['fed']]
    overlap_combo.obs['centroid_conds'] = pd.Categorical(fs_conds)

    overlap_combo.obs['centroid_conds']
In [ ]:
from sklearn.metrics import pairwise_distances
#Centroids of cell atlas/defined clusters
def getClusCentroids(overlap_combo,pcs=60,clusType='knn_clus'):
    clusters = np.unique(overlap_combo.obs[clusType])
    centroids = np.zeros((len(clusters),pcs))
    
    for c in clusters:
        
        sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
        pca_embed = sub_data.obsm['X_pca'][:,0:pcs]
        centroid = pca_embed.mean(axis=0)
        
        centroids[c,:] = list(centroid)
        
    return centroids
In [ ]:
#Distance between conditions within clusters
def getCentroidDists(overlap_combo,pcs=60,clusType='knn_clus'):
    fedStarv = []
    fsStim = []

    cluster = []

    #Initialize output matrix
    addConds(overlap_combo)
    clusters = np.unique(overlap_combo.obs[clusType])
    centroid_dist = pd.DataFrame(columns =['fedStarv','fsStim','cluster','clus_color']) 

    for c in clusters:
        #Get cells in cluster and their conditions
        sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
        centroid_conds = sub_data.obs['centroid_conds'] 

        #Get 2D tsne embedding
        pca_embed = sub_data.obsm['X_pca'][:,0:pcs] #PCA

        #Get location of condition cells
        stim_pos = list(centroid_conds == 'stim')
        allFS_pos = list(centroid_conds != 'stim')
        fed_pos = list(centroid_conds == 'fed')
        starved_pos = list(centroid_conds == 'starved')

        #Get column means for x,y coords
        meanFed = pca_embed[fed_pos,].mean(axis=0)

        meanStarv = pca_embed[starved_pos,].mean(axis=0)

        meanStim = pca_embed[stim_pos,].mean(axis=0)

        meanFS = pca_embed[allFS_pos,].mean(axis=0)


        cluster += [c]

        #Dist between fed and starved
        fs_dist = np.linalg.norm(meanFed - meanStarv,1)

        #Dist between (all) starved experiment and stim

        starvStim_dist = np.linalg.norm(meanFS - meanStim,1)

        fedStarv += [fs_dist]
        fsStim += [starvStim_dist]

    centroid_dist['fedStarv'] = fedStarv
    centroid_dist['fsStim'] = fsStim

    centroid_dist['cluster'] = cluster

    return centroid_dist
In [ ]:
#Distance between conditions within clusters
def getStimCentroidDists(overlap_combo,pcs=60,clusType='knn_clus'):
    swKCl = []
    swDI = []

    cluster = []

    #Initialize output matrix
    #addConds(overlap_combo)
    clusters = np.unique(overlap_combo.obs[clusType])
    centroid_dist = pd.DataFrame(columns =['swKCl','swDI','cluster','clus_color']) 

    for c in clusters:
        #Get cells in cluster and their conditions
        sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
        centroid_conds = sub_data.obs['condition'] 

        #Get 2D tsne embedding
        pca_embed = sub_data.obsm['X_pca'][:,0:pcs] #PCA

        #Get location of condition cells
        sw_pos = list(centroid_conds == 'SW')
        kcl_pos = list(centroid_conds == 'KCl')
        di_pos = list(centroid_conds == 'DI')
        

        #Get column means for x,y coords
        meanSW = pca_embed[sw_pos,].mean(axis=0)

        meanKCl = pca_embed[kcl_pos,].mean(axis=0)

        meanDI = pca_embed[di_pos,].mean(axis=0)




        cluster += [c]

        #Dist between fed and starved
        swKCl_dist = np.linalg.norm(meanSW - meanKCl,1)

        #Dist between (all) starved experiment and stim

        swDI_dist = np.linalg.norm(meanSW - meanDI,1)

        swKCl += [swKCl_dist]
        swDI += [swDI_dist]

    centroid_dist['swKCl'] = swKCl 
    centroid_dist['swDI'] =  swDI

    centroid_dist['cluster'] = cluster

    return centroid_dist
Fed/Starved L1 Analysis

Cell type v state plots

In [ ]:
withinFS_Dists = getCentroidDists(bus_fs_combo,60,'cellRanger_louvain')
withinFS_Dists.head()
Out[ ]:
fedStarv fsStim cluster clus_color
0 11.357661 NaN 0 NaN
1 17.776861 NaN 1 NaN
2 12.411180 NaN 2 NaN
3 18.126976 NaN 3 NaN
4 13.799874 NaN 4 NaN
In [ ]:
#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(bus_fs_combo,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
print(np.mean(pairCentroid_dists))
150.34457080272998
In [ ]:
np.unique(bus_fs_combo.obs['cellRanger_louvain'])
Out[ ]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35])
In [ ]:
#Early cnidocytes, germ cells, stem cells have distances that overlap with intra-F/S distances for 14&19
np.where(pairCentroid_dists < 62)
Out[ ]:
(array([ 0,  0,  1,  2,  2,  3,  3,  4,  5,  6,  7,  7,  8,  9, 10, 11, 12,
        12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 21, 22, 23, 24, 25, 26, 27,
        28, 29, 30, 31, 32, 33, 34, 35]),
 array([ 0, 12,  1,  2, 20,  3,  7,  4,  5,  6,  3,  7,  8,  9, 10, 11,  0,
        12, 13, 14, 15, 16, 17, 18, 19,  2, 20, 21, 22, 23, 24, 25, 26, 27,
        28, 29, 30, 31, 32, 33, 34, 35]))
In [ ]:
max(pairCentroid_dists.reshape((36*36,)))
reshape_noZero = [i for i in pairCentroid_dists.reshape((36*36,)) if i != 0]
print(np.min(reshape_noZero))

matplotlib.rc('axes',edgecolor='black')

plt.figure(figsize=(16,10))
plt.hist(withinFS_Dists['fedStarv'],color='r',bins=15,alpha=0.6,label='fed-starved',density=True)

plt.hist(reshape_noZero, bins=36,color = 'orange',alpha=0.6,label='pairwise clusters',density=True)
plt.legend(loc='upper right')
plt.title('Comparison to pairwise distribution (pairwise on clusters with fed/starved cells)')
plt.grid(b=None)
plt.xlabel("L1 Distance with 60 PC's")
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()
49.54310559108853
In [ ]:
#Area where line between cell state/type is blurred seems to be only in digestive endodermal clusters 14,19 (strong metabolic responses to starvation)
outlier_clus = withinFS_Dists[withinFS_Dists.fedStarv> 46.69]['cluster']
outlier_clus
Out[ ]:
14    14
19    19
Name: cluster, dtype: int64

Look at median genes per cell across types

In [16]:
bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_combo.obs['cellRanger_louvain'])
sc.pp.filter_genes(bus_fs_raw, min_counts=3)
sc.pp.filter_cells(bus_fs_raw, min_counts=1)
bus_fs_raw
Out[16]:
AnnData object with n_obs × n_vars = 13673 × 22454
    obs: 'batch', 'cellRanger_louvain', 'n_counts'
    var: 'n_counts'
In [12]:
# raw_overlap_combo[raw_overlap_combo.obs['cellRanger_louvain'].isin([0])].X
Out[12]:
<0x40157 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>
In [17]:
bus_fs_raw.obs['cellRanger_louvain']
Out[17]:
AAACCTGAGAGGGCTT-1_fs    10
AAACCTGAGAGGTTAT-1_fs     4
AAACCTGAGCGAGAAA-1_fs     3
AAACCTGAGCGTGTCC-1_fs    15
AAACCTGAGGATTCGG-1_fs     3
                         ..
TTTGTCAGTGGACGAT-2_fs    24
TTTGTCATCATCTGTT-2_fs     2
TTTGTCATCGCCTGAG-2_fs     6
TTTGTCATCTAACGGT-2_fs    22
TTTGTCATCTGAGTGT-2_fs     0
Name: cellRanger_louvain, Length: 13673, dtype: category
Categories (36, int64): [0, 1, 2, 3, ..., 32, 33, 34, 35]
In [18]:
clusters = np.unique(bus_fs_raw.obs['cellRanger_louvain'])
len(clusters)
Out[18]:
36
In [48]:
clusters = np.unique(bus_fs_raw.obs['cellRanger_louvain'])

medians = []
clus = []

X = bus_fs_raw.X.todense()
#For each cell type
for cl in clusters:
  data = bus_fs_raw[bus_fs_raw.obs['cellRanger_louvain'].isin([cl])]

  #Get if for a gene count is > 0 (True/False)
  subX = X[bus_fs_raw.obs['cellRanger_louvain'].isin([cl]),:]
  nonzero = subX > 0


  #Sum each row (cell)
  sum = np.sum(nonzero,axis=1)
  s = np.median(np.array(sum.tolist())) #np.reshape(sum, sum.shape[0])


  #Take median
  med = np.median(s,axis = None)

  clus += [cl]
  medians += [med]

meds = pd.DataFrame()
meds['Median'] = medians
meds['Louvain Cluster'] = clus
meds.head()
Out[48]:
Median Louvain Cluster
0 1587.0 0
1 533.0 1
2 313.0 2
3 531.0 3
4 668.0 4
In [52]:
meds['Median'] = np.log(meds['Median'])
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="Louvain Cluster", y="Median", data=meds,linewidth=0.5,s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Median Genes per Cell (log)')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(meds["Louvain Cluster"]))  
plt.show()

Compare internal distances to differences in numbers of fed & starved cells (Spearman Rank correlation)

In [ ]:
#Composition of broad cell types between fed and starved , cell counts
counts = pd.DataFrame(columns =['count', 'orgID','condition','cluster']) 
clusters = np.unique(bus_fs_combo.obs['cellRanger_louvain'])

c = []
org = []
cond = []
clus = []

orgs = np.unique(bus_fs_combo.obs['orgID'])
conds = ['True', 'False']
for cl in clusters:
    data = bus_fs_combo[bus_fs_combo.obs['cellRanger_louvain'].isin([cl])]
    
    for cd in conds:
        #c_data = data[data.obs['condition'].isin([cd])]
        
        for o in orgs:
            pos = (data.obs['fed'].isin([cd])) & (data.obs['orgID'].isin([o]))
            org_data = data[pos]
            c += [org_data.n_obs]
            org += [o]
            if cd == 'True':
              cond += ['fed']
            else:
              cond += ['starved']
            clus += [cl]
            
                
print(len(c))


   
    
counts['count'] = c
counts['orgID'] = org
counts['condition'] = cond
counts['cluster'] = clus
720
In [ ]:
counts = counts[counts['count']> 0 ]
counts.head()
Out[ ]:
count orgID condition cluster
0 118 1 fed 0
2 83 2 fed 0
3 102 3 fed 0
4 102 4 fed 0
5 118 5 fed 0
In [ ]:
mean = []
var = []
clus = []
for i in np.unique(counts['cluster']):

  sub = counts[counts['cluster']==i]
  mean += [np.mean(sub['count'])]
  var += [np.var(sub['count'],ddof=1)]
  clus += [i]

vars = pd.DataFrame()
vars['mean'] = mean
vars['var'] = var
vars['cluster'] = clus

vars.head()
Out[ ]:
mean var cluster
0 127.5 1116.055556 0
1 125.2 4871.066667 1
2 89.4 1530.266667 2
3 85.0 221.777778 3
4 68.5 535.388889 4
In [ ]:
#from sklearn.metrics import r2_score
import pylab as pl

vars['cluster'] = [str(i) for i in vars['cluster']] 
vars['logmean'] = np.log10(vars['mean'])
vars['logvar'] = np.log10(vars['var'])
ax = sns.scatterplot(x="logmean", y="logvar", hue="cluster", data=vars,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Variance in Cell Counts (log10)')
ax.set(xlabel='Average Cell Count (log10)')


poly = pl.polyfit(vars['logmean'], vars['logvar'], 1)

def poly2latex(poly, variable="x", width=2):
    t = ["{0:0.{width}f}"]
    t.append(t[-1] + " {variable}")
    t.append(t[-1] + "^{1}")

    def f():
        for i, v in enumerate(reversed(poly)):
            idx = i if i < 2 else 2
            yield t[idx].format(v, i, variable=variable, width=width)

    return "${}$".format("+".join(f()))

x2 = np.linspace(0, 2.5, 100)
y2 = np.polyval(poly, x2)
plt.plot(x2, y2, "b--",lw=2)
pl.text(x2[5]+0.1, y2[5], poly2latex(poly), fontsize=11)


r2 = np.corrcoef(np.polyval(poly, vars['logmean']),vars['logvar'])
plt.text(x2[5]+0.1, y2[5]-0.25, "r: "+str(np.round(r2[0,1],3)), fontsize=11)

plt.grid(False)
In [ ]:
vars['cluster'] = [str(i) for i in vars['cluster']] 
vars['logmean'] = np.log10(vars['mean'])
vars['logvar'] = np.log10(vars['var'])
ax = sns.scatterplot(x="mean", y="var", hue="cluster", data=vars,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Variance in Cell Counts ')
ax.set(xlabel='Average Cell Count ')
plt.grid(False)
In [ ]:
#All animals have >0 cells in all cell types
numOrgs = []
for i in np.unique(counts.cluster):
  numOrgs += [len(np.unique(counts.orgID[counts['cluster'] == i ]))]
np.unique(numOrgs)
Out[ ]:
array([10])
In [ ]:
individSum = counts.groupby('orgID').sum()
frac = []
for i in np.unique(counts.cluster):
  frac += list(counts['count'][counts['cluster'] == i ]/sum(counts['count'][counts['cluster'] == i ]))
counts['frac'] = frac
counts.head()
Out[ ]:
count orgID condition cluster frac
0 118 1 fed 0 0.092549
2 83 2 fed 0 0.065098
3 102 3 fed 0 0.080000
4 102 4 fed 0 0.080000
5 118 5 fed 0 0.092549
In [ ]:
print(individSum)
np.mean(individSum['count'])
       count  cluster
orgID                
1       1235      630
10      1276      630
2       1305      630
3       1274      630
4        990      630
5       1807      630
6       1064      630
7       1467      630
8       1470      630
9       1785      630
Out[ ]:
1367.3
In [ ]:
#plt.plot(counts['count'],counts['frac'],'o')
counts['cluster'] = [str(i) for i in counts['cluster']] 
ax = sns.scatterplot(x="count", y="frac", hue="cluster",style="orgID", data=counts,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Fraction of Cell Type')
ax.set(xlabel='Cell Count')
scipy.stats.spearmanr(counts['count'],counts['frac'])
Out[ ]:
SpearmanrResult(correlation=0.4322226529501913, pvalue=8.084348794253579e-18)
In [ ]:
counts['cluster'] = [int(i) for i in counts['cluster']] 
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="cluster", y="count", hue="orgID",style="orgID", data=counts,linewidth=0.5,s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(counts.cluster))  
plt.show()
In [ ]:
counts['cluster'] = [int(i) for i in counts['cluster']] 
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="cluster", y="count", hue="condition",style="orgID", data=counts,linewidth=0.5,palette=["darkorange", "b"],s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(counts.cluster))  
plt.show()
In [ ]:
#One-way anova for each cluster
pvalsFTest = {}

for cl in clusters:
  sub = counts[counts.cluster == cl]
  subFed = sub[sub.condition == 'fed']
  subStarv = sub[sub.condition == 'starved']

  fedVals = list(subFed['count'])
  starvVals = list(subStarv['count'])

  pvalsFTest[cl] = scipy.stats.f_oneway(fedVals,starvVals)
In [ ]:
import statsmodels.stats as sm
pvals = []
# sizes = [] 
for c in clusters:
  pvals += [pvalsFTest[c][1]] 
  # sizes += [len(bus_fs_combo[bus_fs_combo.obs['cellRanger_louvain'].isin([c])].obs_names)]

pvals = sm.multitest.fdrcorrection(pvals,alpha=0.05, method='indep')
pvals
Out[ ]:
(array([False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False]),
 array([1.64253441e-01, 9.35522778e-01, 5.21386720e-01, 8.07361578e-01,
        9.97826979e-01, 3.31498471e-01, 8.07361578e-01, 5.68508228e-01,
        6.91908915e-01, 5.34369937e-01, 4.49408286e-04, 1.99877010e-01,
        9.63535360e-01, 7.78106033e-01, 2.25187596e-01, 4.82880690e-01,
        6.42595657e-01, 4.19543898e-01, 6.87698796e-01, 9.35522778e-01,
        3.31498471e-01, 9.63535360e-01, 9.63535360e-01, 3.31498471e-01,
        3.31498471e-01, 9.97826979e-01, 4.82880690e-01, 1.64253441e-01,
        4.82880690e-01, 2.09527206e-01, 4.82880690e-01, 3.31498471e-01,
        9.63535360e-01, 1.64253441e-01, 9.35522778e-01, 1.00000000e+00]))
In [ ]:
#counts = counts[counts['count']> 0 ]
#https://stackoverflow.com/questions/36578458/how-does-one-insert-statistical-annotations-stars-or-p-values-into-matplotlib/37518947#37518947

counts['cluster'] = [int(i) for i in counts['cluster']] 
plt.figure(figsize=(16, 6))
ax = sns.boxplot(x="cluster", y="count", hue="condition", data=counts,linewidth=0.5,palette=["darkorange", "b"])
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')

# statistical annotation
# alpha < 0.05
for i in [11]: 
  w = 0.5
  x1, x2 = i-w,i+w  

  y, h, col = counts['count'][counts['cluster'] == i].max() + 2, 4, 'k'
  plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
  plt.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)


plt.show()
In [ ]:
internalDist = list(withinFS_Dists['fedStarv'])
In [ ]:
scipy.stats.spearmanr(internalDist,pvals[1])
Out[ ]:
SpearmanrResult(correlation=-0.050871795043468955, pvalue=0.768261688241116)
In [ ]:
plt.scatter(internalDist,pvals[1])
plt.xlabel('Internal Distances')
plt.ylabel('P-value: Cell Count Difference')
plt.grid(None)
In [ ]:
counts.to_csv('individ_cellCounts.csv',index=None)
Repeat for Stimulation Data
In [ ]:
overlap_combo = anndata.read("D1.1823")
bus_combo_noZero.obs['cellRanger_louvain'] = pd.Categorical(overlap_combo[overlap_combo.obs['cell_origin'] == 'Stim'].obs['knn_clus'])
In [ ]:
withinStim_Dists = getStimCentroidDists(bus_combo_noZero,60,'cellRanger_louvain')
withinStim_Dists
Out[ ]:
swKCl swDI cluster clus_color
0 7.350214 8.503921 0 NaN
1 21.065245 36.732548 1 NaN
2 20.948408 20.759415 2 NaN
3 13.442171 34.598927 3 NaN
4 14.205286 25.923903 4 NaN
5 22.041147 31.476810 5 NaN
6 14.477929 25.217993 6 NaN
7 11.867729 17.921000 7 NaN
8 12.292484 11.644725 8 NaN
9 22.288679 22.918928 9 NaN
10 10.736828 11.939169 10 NaN
11 9.275630 8.065190 11 NaN
12 5.801665 22.793827 12 NaN
13 31.335329 31.087091 13 NaN
14 25.940889 28.084023 14 NaN
15 10.881435 18.185766 15 NaN
16 25.211300 17.230129 16 NaN
17 22.916479 12.188016 17 NaN
18 21.876759 14.847662 18 NaN
19 23.442184 24.017313 19 NaN
20 56.649334 65.165184 20 NaN
21 24.023872 16.950150 21 NaN
22 13.108215 19.507177 22 NaN
23 6.248736 12.076812 23 NaN
24 32.759808 45.112389 24 NaN
25 11.500583 18.344841 25 NaN
26 34.523087 26.619263 26 NaN
27 25.315067 31.565296 27 NaN
28 46.271626 31.329525 28 NaN
29 13.501001 10.425589 29 NaN
30 18.720173 38.476334 30 NaN
31 38.197033 57.892250 31 NaN
32 31.816713 17.769464 32 NaN
33 40.618649 44.847408 33 NaN
34 11.309300 12.301980 34 NaN
35 16.585102 11.396259 35 NaN
In [ ]:
#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(bus_combo_noZero,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
print(np.mean(pairCentroid_dists))
173.6400492134319
In [ ]:
#withinStim_Dists['swDI']
print(np.max(withinStim_Dists))
swKCl         56.649334
swDI          65.165184
cluster       35.000000
clus_color          NaN
dtype: float64
In [ ]:
max(pairCentroid_dists.reshape((36*36,)))
reshape_noZero = [i for i in pairCentroid_dists.reshape((36*36,)) if i != 0]
print(np.min(reshape_noZero))

plt.figure(figsize=(16,10))
plt.hist(withinStim_Dists['swKCl'],color='r',bins=15,alpha=0.6,label='SW-KCl',density=True)
plt.hist(withinStim_Dists['swDI'],color='b',bins=15,alpha=0.6,label='SW-DI',density=True)

plt.hist(reshape_noZero, bins=36,color = 'orange',alpha=0.6,label='pairwise clusters',density=True)
plt.legend(loc='upper right')
plt.title('Comparison to pairwise distribution (pairwise on clusters with SW/KCl/DI cells)')

plt.xlabel("L1 Distance with 60 PC's")
plt.ylabel("Normalized Frequency")
plt.grid(b=None)
plt.show()
65.46425357810222
In [ ]:
print(np.where(pairCentroid_dists < 65.46))
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35]))

Organismal Distances, Fed/Starved + Stimulation

Batch effect plots

In [ ]:
#condition -- Stim, fed -- Fed/Starved data
overlap_combo = anndata.read("D1.1823")
overlap_combo
Out[ ]:
AnnData object with n_obs × n_vars = 32594 × 6756
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus'
    var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std'
    uns: 'pca'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
In [ ]:
#Use overlap_combo

#Go through each clustee
clus = np.unique(overlap_combo.obs['knn_clus'])

fed_data = overlap_combo[overlap_combo.obs['fed'] == 'True']
sw_data = overlap_combo[overlap_combo.obs['condition'] == 'SW']

fed_orgs = np.unique(fed_data.obs['orgID'])
sw_orgs = np.unique(sw_data.obs['orgID'])

pcs = 60

org_metadata = pd.DataFrame(columns =['orgID','cluster','condition']) #centroid = row mean for all cells in organism
num_points = (len(fed_orgs)+len(sw_orgs))*len(clus)

#Get centroid in PCA space in each cluster for each organism
org_centroids = np.zeros((num_points,pcs))

condition = []
cluster = []
orgID = []

count = 0
toRemove = []
for c in clus:
  #Get all cells in chosen condition
  fed_sub = fed_data[fed_data.obs['knn_clus'] == c]
  fed_pca_embed = fed_sub.obsm['X_pca'][:,0:pcs] #PCA matrix

  sw_sub = sw_data[sw_data.obs['knn_clus'] == c]
  sw_pca_embed = sw_sub.obsm['X_pca'][:,0:pcs] #PCA matrix

  #For each org, get centroid of cells (X_pca)
  for sw in sw_orgs:
    condition += ['SW']
    cluster += [c]
    orgID += [sw]

    #Get location of org cells
    org_pos = list(sw_sub.obs['orgID'] == sw)

    #Sum down rows of cells for each PC
    if len(sw_pca_embed[org_pos,]) == 0:
      toRemove += [count]

    org_centroids[count,:] = list(sw_pca_embed[org_pos,].mean(axis=0))

    count += 1

  for fed in fed_orgs:
    condition += ['Fed']
    cluster += [c]
    orgID += [fed]

    #Get location of org cells
    org_pos = list(fed_sub.obs['orgID'] == fed)
    #Sum down rows of cells for each PC
    if len(fed_pca_embed[org_pos,]) == 0:
      toRemove += [count]

    org_centroids[count,:] = list(fed_pca_embed[org_pos,].mean(axis=0))

    count += 1    

  #For each org, get centroid of cells (X_pca)

  #Average pairwise distance between org centroid

org_metadata['orgID'] = orgID
org_metadata['cluster'] = cluster
org_metadata['condition'] = condition
In [ ]:
toRemove
Out[ ]:
[]
In [ ]:
org_metadata = org_metadata.drop(org_metadata.index[toRemove])
org_metadata.head()

org_centroids= np.delete(org_centroids, toRemove, 0)
org_centroids[0,:]
Out[ ]:
array([-2.25471210e+00, -1.51177585e+00, -1.93993020e+00, -9.54584032e-02,
       -2.38718843e+00, -1.51001954e+00, -2.81468654e+00,  1.05100930e+00,
        1.54143572e+00, -7.58547902e-01, -1.50774598e+00, -2.47600722e+00,
        7.53088653e-01, -9.25398111e-01,  5.30199647e-01, -2.36053783e-02,
        9.48955193e-02, -6.50646329e-01,  1.05686748e+00, -2.67228413e+00,
        2.98014688e+00,  5.93579225e-02,  5.69488704e-01, -1.05491745e+00,
        1.12574625e+00, -1.49573892e-01,  5.68022847e-01, -1.29234409e+00,
        1.40452191e-01, -3.84621084e-01,  2.95532733e-01,  1.13393039e-01,
        2.88706899e-01, -5.50938368e-01, -1.17623888e-01, -1.51531205e-01,
       -7.78403163e-01, -7.72583902e-01,  9.31857646e-01,  9.21139657e-01,
       -4.21358228e-01,  9.92068797e-02, -3.57610792e-01,  1.40223965e-01,
        4.30851281e-02,  2.75345147e-01,  5.04794896e-01, -1.59878790e-01,
        1.80910379e-01,  2.61838585e-01, -2.14510873e-01,  1.25861779e-01,
       -3.76004130e-01,  2.04274207e-01, -1.61955715e-03, -9.25957318e-03,
        5.98372109e-02, -2.70651639e-01,  2.62535036e-01,  1.59801412e-02])
In [ ]:
org_pairDists = pd.DataFrame(columns =['cluster','pair_fed','pair_sw','pair_mix'])
cluster = []
pair_fed = []
pair_sw = []
pair_mix = []

for c in clus:

  #Fed org centroids
  where_fed = (org_metadata.cluster == c) & (org_metadata.condition == 'Fed')
  fed_mat = org_centroids[where_fed,:]

  pair_fed_dist = pairwise_distances(fed_mat, metric = 'l1')
  pair_fed_mean = np.mean(pair_fed_dist)

  #SW org centroids
  where_sw = (org_metadata.cluster == c) & (org_metadata.condition == 'SW')
  sw_mat = org_centroids[where_sw,:]

  pair_sw_dist = pairwise_distances(sw_mat, metric = 'l1')
  pair_sw_mean = np.mean(pair_sw_dist)

  pair_mix_dist = pairwise_distances(fed_mat,sw_mat, metric = 'l1')
  pair_mix_mean = np.mean(pair_mix_dist)

  cluster += [c]
  pair_fed += [pair_fed_mean]
  pair_sw += [pair_sw_mean]
  pair_mix += [pair_mix_mean]

org_pairDists['cluster'] = cluster
org_pairDists['pair_fed'] = pair_fed
org_pairDists['pair_sw'] = pair_sw
org_pairDists['pair_mix'] = pair_mix
In [ ]:
org_pairDists.head()
Out[ ]:
cluster pair_fed pair_sw pair_mix
0 0 8.005131 7.995117 23.570756
1 1 15.629137 32.140367 75.364971
2 2 9.598802 14.187616 49.332659
3 3 19.421792 22.127487 42.742088
4 4 16.030268 26.287836 36.652727
In [ ]:
plt.figure(figsize=(16,10))
plt.hist(org_pairDists['pair_fed'],color='r',bins=15,alpha=0.6,label='control org dist',density=True)
plt.hist(org_pairDists['pair_sw'],color='blue',bins=15,alpha=0.6,label='sw org dist',density=True)
plt.hist(org_pairDists['pair_mix'], bins=15,color = 'orange',alpha=0.6,label='mix dist',density=True)
plt.legend(loc='upper right')
plt.title('Inter-Organism Distances (L1)')
plt.grid(b=None)
plt.xlabel("Average L1 Distance with 60 PC's")
plt.ylabel("Normalized Frequency")

plt.show()
In [ ]:
overlap_combo
Out[ ]:
AnnData object with n_obs × n_vars = 32594 × 6756
    obs: 'batch', 'n_counts', 'n_countslog', 'louvain', 'leiden', 'orgID', 'fed', 'starved', 'fed_neighbor_score', 'cellRanger_louvain', 'annos', 'new_cellRanger_louvain', 'annosSub', 'condition', 'cell_origin', 'knn_clus'
    var: 'n_counts-0', 'mean-0', 'std-0', 'n_counts-1', 'mean-1', 'std-1', 'mean', 'std'
    uns: 'pca'
    obsm: 'X_nca', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
In [ ]:
overlap_combo.write('overlap_combo.h5ad')
... storing 'louvain' as categorical
... storing 'leiden' as categorical
... storing 'orgID' as categorical
... storing 'fed' as categorical
... storing 'starved' as categorical
... storing 'cellRanger_louvain' as categorical
... storing 'annos' as categorical
... storing 'new_cellRanger_louvain' as categorical
... storing 'annosSub' as categorical
... storing 'condition' as categorical
In [ ]: