Comparative Analysis of Cell Atlas Concordance and Perturbation Responses
In [ ]:
!date
Download Data¶
In [1]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
In [3]:
#Kallisto bus clustered starvation data, h5ad
download_file('10.22002/D1.1796','.gz')
#CellRanger Starvation h5ad data
download_file('10.22002/D1.1798','.gz')
#Import previously saved, clustered, & filtered stimulation data
download_file('10.22002/D1.1821','.gz')
In [ ]:
#Starvation h5ad data, all nonzero genes included, filtered for 'real cells' from de-multiplexing
download_file('10.22002/D1.1797','.gz')
#Import raw, unclustered stimulation data
download_file('10.22002/D1.1814','.gz')
In [ ]:
#Import merged data with knn clusters
download_file('10.22002/D1.1823','.gz')
In [5]:
!gunzip *.gz
In [6]:
!pip install --quiet anndata
!pip install --quiet scanpy==1.6.0
!pip install --quiet louvain
Import packages¶
In [4]:
import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
sc.set_figure_params(dpi=125)
import seaborn as sns
sns.set(style="whitegrid")
Do Distance Based Analysis for Inter- and Intra- Cluster Distances¶
Read in previously saved data
In [5]:
# Description: bus_fs_combo data (filtered + clustered starvation (fed/starved --> fs) data)
# Description: bus_combo_noZero (filtered + clustered stimulation data) from Kallisto bus
#bus_fs_combo
bus_fs_combo = anndata.read("D1.1796")
print(bus_fs_combo)
bus_combo_noZero = anndata.read('D1.1821')
print(bus_combo_noZero)
#Previously saved overlap, merged data (from both experiments)
overlap_combo = anndata.read("D1.1823")
overlap_combo
Out[5]:
In [15]:
#How raw overlap adata was made (no genes filtered out)
bus_fs_raw = anndata.read("D1.1797")
bus_stim_raw = anndata.read("D1.1814")
#Merge datasets
bus_fs_raw.obs_names = [i+'_fs' for i in bus_fs_raw.obs_names]
bus_stim_raw.obs_names = [i+'_ieg' for i in bus_stim_raw.obs_names]
raw_overlap_combo = bus_fs_raw.concatenate(bus_stim_raw,join='outer', index_unique=None)
Predict labels for stimulation data
In [ ]:
#Make X_pca with merged, use top n PC's
def getPredLabels(overlap_fs,overlap_combo, n_PC):
numFS = len(overlap_fs.obs_names)
X_train = overlap_combo.obsm['X_pca'][0:numFS,0:n_PC]
#X_pca at stim rows = X_test
X_test = overlap_combo.obsm['X_pca'][numFS:,0:n_PC]
#y_train is f/s louvain labels
y_train = overlap_fs.obs['cellRanger_louvain']
#Fit and predict
classifier = KNeighborsClassifier(n_neighbors=15)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
labels = list(y_train)+list(y_pred)
print(len(labels),' labels created')
return labels
How merged dataset (joint-PC space) with stimulation and starvation experiments was made (= overlap_combo)¶
In [ ]:
#Get intersection of var names between the two (making overlap_combo)
overlap = list(set(bus_combo_noZero.var_names).intersection(bus_fs_combo.var_names))
overlap_fs = bus_fs_combo[:,overlap]
overlap_stim = bus_combo_noZero[:,overlap]
#Merge datasets
overlap_fs.obs_names = [i+'_fs' for i in overlap_fs.obs_names]
overlap_stim.obs_names = [i+'_ieg' for i in overlap_stim.obs_names]
origin = list(np.repeat('FS',len(overlap_fs.obs_names))) + list(np.repeat('Stim',len(overlap_stim.obs_names)))
overlap_combo = overlap_fs.concatenate(overlap_stim,join='outer', index_unique=None)
overlap_combo.obs['cell_origin'] = pd.Categorical(origin)
sc.pp.scale(overlap_combo, max_value=10)
#Do PCA on merged dataset + plot variance explained
sc.tl.pca(overlap_combo, n_comps=60)
#sc.pl.pca_variance_ratio(overlap_combo, log=True)
#overlap_combo
labels = getPredLabels(overlap_fs,overlap_combo,n_PC = 60)
overlap_combo.obs['knn_clus'] = pd.Categorical(labels)
In [ ]:
sc.pp.neighbors(overlap_combo,n_neighbors=150, n_pcs=60,random_state=42)
sc.tl.paga(overlap_combo, groups='knn_clus')
sc.pl.paga(overlap_combo, color=['knn_clus'])
In [ ]:
sc.tl.tsne(overlap_combo,random_state=42,n_pcs=60,early_exaggeration=5)
In [ ]:
sc.tl.umap(overlap_combo,random_state=42,spread=2.5,min_dist = 0.8,init_pos='paga')
In [ ]:
# sc.pl.tsne(overlap_combo,color=['knn_clus'])
In [ ]:
sc.pl.umap(overlap_combo,color=['knn_clus'],color_map='viridis')
In [ ]:
#overlap_combo.write('overlap_combo.h5ad')
Plot percent of K neighbors for each stim cell that are in one unique cluster¶
In [ ]:
from sklearn.neighbors import NearestNeighbors
import random
#Read in previously saved data
overlap_combo = anndata.read("D1.1823")
n = 15
neigh = NearestNeighbors(n_neighbors=n)
n_PC = 60
numFS = len(overlap_fs.obs_names)
X_train = overlap_combo.obsm['X_pca'][0:numFS,0:n_PC]
#X_pca at stim rows = X_test
X_test = overlap_combo.obsm['X_pca'][numFS:,0:n_PC]
#y_train is f/s louvain labels
y_train = overlap_fs.obs['cellRanger_louvain']
#Fit and predict
#neigh.fit(X_train)
#Subsample 70% of fed/starved data for training, test on remaining 30% and stim data
numSamp = np.int(0.7*X_train.shape[0])
ind = random.sample(range(0,X_train.shape[0]),numSamp)
allInd = range(0,X_train.shape[0])
rest = [i for i in allInd if i not in ind]
#Fit and predict
neigh.fit(X_train[ind,:])
X_rest = X_train[rest,:]
fsNeigh = neigh.kneighbors(X_rest)
stimNeigh = neigh.kneighbors(X_test)
In [ ]:
y_train = y_train[ind]
In [ ]:
stim_neighbors = stimNeigh[1]
fs_neighbors = fsNeigh[1]
#y_train = y_train[ind]
stim_perc = np.zeros(stim_neighbors.shape[0])
for i in range(0,len(stim_perc)):
#How many of top neighbors come from same cluster in fed/starved data (out of n=15)
stim_perc[i] = np.max(y_train[stim_neighbors[i]].value_counts())/n
fs_perc = np.zeros(fs_neighbors.shape[0])
#y_train = y_train[ind]
for i in range(0,len(fs_perc)):
fs_perc[i] = np.max(y_train[fs_neighbors[i]].value_counts())/n
In [ ]:
plt.hist(stim_perc,density=False,range=(0,1),alpha=0.6 ,bins=15,weights=np.ones(len(stim_perc))/len(stim_perc),label='stim-neighbors')
plt.hist(fs_perc,density=False,range=(0,1),alpha=0.6 ,bins=15 ,weights=np.ones(len(fs_perc))/len(fs_perc),label='fs-neighbors')
plt.legend(loc='upper left')
plt.title('Histogram of Nearest Neighbor Specificity',fontsize=11)
plt.grid(b=None)
plt.xlabel("Fraction of n=15 nearest neighbors in unique cluster",fontsize=10)
plt.ylabel("Frequency",fontsize=10)
plt.show()
Determine Inter and Intra Cluster Distances (perturbation effect vs cell type distance)¶
In [ ]:
#Plot distances between centroid of stim-starved and fed-starved for each cluster
def changeBool(fed):
if fed == 'True':
return 'fed'
elif fed == 'False':
return 'starved'
else:
return 'stim'
def addConds(overlap_combo):
fs_conds = [changeBool(i) for i in overlap_combo.obs['fed']]
overlap_combo.obs['centroid_conds'] = pd.Categorical(fs_conds)
overlap_combo.obs['centroid_conds']
In [ ]:
from sklearn.metrics import pairwise_distances
#Centroids of cell atlas/defined clusters
def getClusCentroids(overlap_combo,pcs=60,clusType='knn_clus'):
clusters = np.unique(overlap_combo.obs[clusType])
centroids = np.zeros((len(clusters),pcs))
for c in clusters:
sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
pca_embed = sub_data.obsm['X_pca'][:,0:pcs]
centroid = pca_embed.mean(axis=0)
centroids[c,:] = list(centroid)
return centroids
In [ ]:
#Distance between conditions within clusters
def getCentroidDists(overlap_combo,pcs=60,clusType='knn_clus'):
fedStarv = []
fsStim = []
cluster = []
#Initialize output matrix
addConds(overlap_combo)
clusters = np.unique(overlap_combo.obs[clusType])
centroid_dist = pd.DataFrame(columns =['fedStarv','fsStim','cluster','clus_color'])
for c in clusters:
#Get cells in cluster and their conditions
sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
centroid_conds = sub_data.obs['centroid_conds']
#Get 2D tsne embedding
pca_embed = sub_data.obsm['X_pca'][:,0:pcs] #PCA
#Get location of condition cells
stim_pos = list(centroid_conds == 'stim')
allFS_pos = list(centroid_conds != 'stim')
fed_pos = list(centroid_conds == 'fed')
starved_pos = list(centroid_conds == 'starved')
#Get column means for x,y coords
meanFed = pca_embed[fed_pos,].mean(axis=0)
meanStarv = pca_embed[starved_pos,].mean(axis=0)
meanStim = pca_embed[stim_pos,].mean(axis=0)
meanFS = pca_embed[allFS_pos,].mean(axis=0)
cluster += [c]
#Dist between fed and starved
fs_dist = np.linalg.norm(meanFed - meanStarv,1)
#Dist between (all) starved experiment and stim
starvStim_dist = np.linalg.norm(meanFS - meanStim,1)
fedStarv += [fs_dist]
fsStim += [starvStim_dist]
centroid_dist['fedStarv'] = fedStarv
centroid_dist['fsStim'] = fsStim
centroid_dist['cluster'] = cluster
return centroid_dist
In [ ]:
#Distance between conditions within clusters
def getStimCentroidDists(overlap_combo,pcs=60,clusType='knn_clus'):
swKCl = []
swDI = []
cluster = []
#Initialize output matrix
#addConds(overlap_combo)
clusters = np.unique(overlap_combo.obs[clusType])
centroid_dist = pd.DataFrame(columns =['swKCl','swDI','cluster','clus_color'])
for c in clusters:
#Get cells in cluster and their conditions
sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
centroid_conds = sub_data.obs['condition']
#Get 2D tsne embedding
pca_embed = sub_data.obsm['X_pca'][:,0:pcs] #PCA
#Get location of condition cells
sw_pos = list(centroid_conds == 'SW')
kcl_pos = list(centroid_conds == 'KCl')
di_pos = list(centroid_conds == 'DI')
#Get column means for x,y coords
meanSW = pca_embed[sw_pos,].mean(axis=0)
meanKCl = pca_embed[kcl_pos,].mean(axis=0)
meanDI = pca_embed[di_pos,].mean(axis=0)
cluster += [c]
#Dist between fed and starved
swKCl_dist = np.linalg.norm(meanSW - meanKCl,1)
#Dist between (all) starved experiment and stim
swDI_dist = np.linalg.norm(meanSW - meanDI,1)
swKCl += [swKCl_dist]
swDI += [swDI_dist]
centroid_dist['swKCl'] = swKCl
centroid_dist['swDI'] = swDI
centroid_dist['cluster'] = cluster
return centroid_dist
Fed/Starved L1 Analysis¶
Cell type v state plots
In [ ]:
withinFS_Dists = getCentroidDists(bus_fs_combo,60,'cellRanger_louvain')
withinFS_Dists.head()
Out[ ]:
In [ ]:
#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(bus_fs_combo,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
print(np.mean(pairCentroid_dists))
In [ ]:
np.unique(bus_fs_combo.obs['cellRanger_louvain'])
Out[ ]:
In [ ]:
#Early cnidocytes, germ cells, stem cells have distances that overlap with intra-F/S distances for 14&19
np.where(pairCentroid_dists < 62)
Out[ ]:
In [ ]:
max(pairCentroid_dists.reshape((36*36,)))
reshape_noZero = [i for i in pairCentroid_dists.reshape((36*36,)) if i != 0]
print(np.min(reshape_noZero))
matplotlib.rc('axes',edgecolor='black')
plt.figure(figsize=(16,10))
plt.hist(withinFS_Dists['fedStarv'],color='r',bins=15,alpha=0.6,label='fed-starved',density=True)
plt.hist(reshape_noZero, bins=36,color = 'orange',alpha=0.6,label='pairwise clusters',density=True)
plt.legend(loc='upper right')
plt.title('Comparison to pairwise distribution (pairwise on clusters with fed/starved cells)')
plt.grid(b=None)
plt.xlabel("L1 Distance with 60 PC's")
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()
In [ ]:
#Area where line between cell state/type is blurred seems to be only in digestive endodermal clusters 14,19 (strong metabolic responses to starvation)
outlier_clus = withinFS_Dists[withinFS_Dists.fedStarv> 46.69]['cluster']
outlier_clus
Out[ ]:
Look at median genes per cell across types
In [16]:
bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_combo.obs['cellRanger_louvain'])
sc.pp.filter_genes(bus_fs_raw, min_counts=3)
sc.pp.filter_cells(bus_fs_raw, min_counts=1)
bus_fs_raw
Out[16]:
In [12]:
# raw_overlap_combo[raw_overlap_combo.obs['cellRanger_louvain'].isin([0])].X
Out[12]:
In [17]:
bus_fs_raw.obs['cellRanger_louvain']
Out[17]:
In [18]:
clusters = np.unique(bus_fs_raw.obs['cellRanger_louvain'])
len(clusters)
Out[18]:
In [48]:
clusters = np.unique(bus_fs_raw.obs['cellRanger_louvain'])
medians = []
clus = []
X = bus_fs_raw.X.todense()
#For each cell type
for cl in clusters:
data = bus_fs_raw[bus_fs_raw.obs['cellRanger_louvain'].isin([cl])]
#Get if for a gene count is > 0 (True/False)
subX = X[bus_fs_raw.obs['cellRanger_louvain'].isin([cl]),:]
nonzero = subX > 0
#Sum each row (cell)
sum = np.sum(nonzero,axis=1)
s = np.median(np.array(sum.tolist())) #np.reshape(sum, sum.shape[0])
#Take median
med = np.median(s,axis = None)
clus += [cl]
medians += [med]
meds = pd.DataFrame()
meds['Median'] = medians
meds['Louvain Cluster'] = clus
meds.head()
Out[48]:
In [52]:
meds['Median'] = np.log(meds['Median'])
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="Louvain Cluster", y="Median", data=meds,linewidth=0.5,s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Median Genes per Cell (log)')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(meds["Louvain Cluster"]))
plt.show()
Compare internal distances to differences in numbers of fed & starved cells (Spearman Rank correlation)
In [ ]:
#Composition of broad cell types between fed and starved , cell counts
counts = pd.DataFrame(columns =['count', 'orgID','condition','cluster'])
clusters = np.unique(bus_fs_combo.obs['cellRanger_louvain'])
c = []
org = []
cond = []
clus = []
orgs = np.unique(bus_fs_combo.obs['orgID'])
conds = ['True', 'False']
for cl in clusters:
data = bus_fs_combo[bus_fs_combo.obs['cellRanger_louvain'].isin([cl])]
for cd in conds:
#c_data = data[data.obs['condition'].isin([cd])]
for o in orgs:
pos = (data.obs['fed'].isin([cd])) & (data.obs['orgID'].isin([o]))
org_data = data[pos]
c += [org_data.n_obs]
org += [o]
if cd == 'True':
cond += ['fed']
else:
cond += ['starved']
clus += [cl]
print(len(c))
counts['count'] = c
counts['orgID'] = org
counts['condition'] = cond
counts['cluster'] = clus
In [ ]:
counts = counts[counts['count']> 0 ]
counts.head()
Out[ ]:
In [ ]:
mean = []
var = []
clus = []
for i in np.unique(counts['cluster']):
sub = counts[counts['cluster']==i]
mean += [np.mean(sub['count'])]
var += [np.var(sub['count'],ddof=1)]
clus += [i]
vars = pd.DataFrame()
vars['mean'] = mean
vars['var'] = var
vars['cluster'] = clus
vars.head()
Out[ ]:
In [ ]:
#from sklearn.metrics import r2_score
import pylab as pl
vars['cluster'] = [str(i) for i in vars['cluster']]
vars['logmean'] = np.log10(vars['mean'])
vars['logvar'] = np.log10(vars['var'])
ax = sns.scatterplot(x="logmean", y="logvar", hue="cluster", data=vars,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Variance in Cell Counts (log10)')
ax.set(xlabel='Average Cell Count (log10)')
poly = pl.polyfit(vars['logmean'], vars['logvar'], 1)
def poly2latex(poly, variable="x", width=2):
t = ["{0:0.{width}f}"]
t.append(t[-1] + " {variable}")
t.append(t[-1] + "^{1}")
def f():
for i, v in enumerate(reversed(poly)):
idx = i if i < 2 else 2
yield t[idx].format(v, i, variable=variable, width=width)
return "${}$".format("+".join(f()))
x2 = np.linspace(0, 2.5, 100)
y2 = np.polyval(poly, x2)
plt.plot(x2, y2, "b--",lw=2)
pl.text(x2[5]+0.1, y2[5], poly2latex(poly), fontsize=11)
r2 = np.corrcoef(np.polyval(poly, vars['logmean']),vars['logvar'])
plt.text(x2[5]+0.1, y2[5]-0.25, "r: "+str(np.round(r2[0,1],3)), fontsize=11)
plt.grid(False)
In [ ]:
vars['cluster'] = [str(i) for i in vars['cluster']]
vars['logmean'] = np.log10(vars['mean'])
vars['logvar'] = np.log10(vars['var'])
ax = sns.scatterplot(x="mean", y="var", hue="cluster", data=vars,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Variance in Cell Counts ')
ax.set(xlabel='Average Cell Count ')
plt.grid(False)
In [ ]:
#All animals have >0 cells in all cell types
numOrgs = []
for i in np.unique(counts.cluster):
numOrgs += [len(np.unique(counts.orgID[counts['cluster'] == i ]))]
np.unique(numOrgs)
Out[ ]:
In [ ]:
individSum = counts.groupby('orgID').sum()
frac = []
for i in np.unique(counts.cluster):
frac += list(counts['count'][counts['cluster'] == i ]/sum(counts['count'][counts['cluster'] == i ]))
counts['frac'] = frac
counts.head()
Out[ ]:
In [ ]:
print(individSum)
np.mean(individSum['count'])
Out[ ]:
In [ ]:
#plt.plot(counts['count'],counts['frac'],'o')
counts['cluster'] = [str(i) for i in counts['cluster']]
ax = sns.scatterplot(x="count", y="frac", hue="cluster",style="orgID", data=counts,linewidth=0.5,s=50)
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=3)
ax.set(ylabel='Fraction of Cell Type')
ax.set(xlabel='Cell Count')
scipy.stats.spearmanr(counts['count'],counts['frac'])
Out[ ]:
In [ ]:
counts['cluster'] = [int(i) for i in counts['cluster']]
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="cluster", y="count", hue="orgID",style="orgID", data=counts,linewidth=0.5,s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(counts.cluster))
plt.show()
In [ ]:
counts['cluster'] = [int(i) for i in counts['cluster']]
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="cluster", y="count", hue="condition",style="orgID", data=counts,linewidth=0.5,palette=["darkorange", "b"],s=50) #palette=["darkorange", "b"]
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(counts.cluster))
plt.show()
In [ ]:
#One-way anova for each cluster
pvalsFTest = {}
for cl in clusters:
sub = counts[counts.cluster == cl]
subFed = sub[sub.condition == 'fed']
subStarv = sub[sub.condition == 'starved']
fedVals = list(subFed['count'])
starvVals = list(subStarv['count'])
pvalsFTest[cl] = scipy.stats.f_oneway(fedVals,starvVals)
In [ ]:
import statsmodels.stats as sm
pvals = []
# sizes = []
for c in clusters:
pvals += [pvalsFTest[c][1]]
# sizes += [len(bus_fs_combo[bus_fs_combo.obs['cellRanger_louvain'].isin([c])].obs_names)]
pvals = sm.multitest.fdrcorrection(pvals,alpha=0.05, method='indep')
pvals
Out[ ]:
In [ ]:
#counts = counts[counts['count']> 0 ]
#https://stackoverflow.com/questions/36578458/how-does-one-insert-statistical-annotations-stars-or-p-values-into-matplotlib/37518947#37518947
counts['cluster'] = [int(i) for i in counts['cluster']]
plt.figure(figsize=(16, 6))
ax = sns.boxplot(x="cluster", y="count", hue="condition", data=counts,linewidth=0.5,palette=["darkorange", "b"])
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')
# statistical annotation
# alpha < 0.05
for i in [11]:
w = 0.5
x1, x2 = i-w,i+w
y, h, col = counts['count'][counts['cluster'] == i].max() + 2, 4, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col)
plt.show()
In [ ]:
internalDist = list(withinFS_Dists['fedStarv'])
In [ ]:
scipy.stats.spearmanr(internalDist,pvals[1])
Out[ ]:
In [ ]:
plt.scatter(internalDist,pvals[1])
plt.xlabel('Internal Distances')
plt.ylabel('P-value: Cell Count Difference')
plt.grid(None)
In [ ]:
counts.to_csv('individ_cellCounts.csv',index=None)
Repeat for Stimulation Data¶
In [ ]:
overlap_combo = anndata.read("D1.1823")
bus_combo_noZero.obs['cellRanger_louvain'] = pd.Categorical(overlap_combo[overlap_combo.obs['cell_origin'] == 'Stim'].obs['knn_clus'])
In [ ]:
withinStim_Dists = getStimCentroidDists(bus_combo_noZero,60,'cellRanger_louvain')
withinStim_Dists
Out[ ]:
In [ ]:
#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(bus_combo_noZero,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
print(np.mean(pairCentroid_dists))
In [ ]:
#withinStim_Dists['swDI']
print(np.max(withinStim_Dists))
In [ ]:
max(pairCentroid_dists.reshape((36*36,)))
reshape_noZero = [i for i in pairCentroid_dists.reshape((36*36,)) if i != 0]
print(np.min(reshape_noZero))
plt.figure(figsize=(16,10))
plt.hist(withinStim_Dists['swKCl'],color='r',bins=15,alpha=0.6,label='SW-KCl',density=True)
plt.hist(withinStim_Dists['swDI'],color='b',bins=15,alpha=0.6,label='SW-DI',density=True)
plt.hist(reshape_noZero, bins=36,color = 'orange',alpha=0.6,label='pairwise clusters',density=True)
plt.legend(loc='upper right')
plt.title('Comparison to pairwise distribution (pairwise on clusters with SW/KCl/DI cells)')
plt.xlabel("L1 Distance with 60 PC's")
plt.ylabel("Normalized Frequency")
plt.grid(b=None)
plt.show()
In [ ]:
print(np.where(pairCentroid_dists < 65.46))
Organismal Distances, Fed/Starved + Stimulation¶
Batch effect plots
In [ ]:
#condition -- Stim, fed -- Fed/Starved data
overlap_combo = anndata.read("D1.1823")
overlap_combo
Out[ ]:
In [ ]:
#Use overlap_combo
#Go through each clustee
clus = np.unique(overlap_combo.obs['knn_clus'])
fed_data = overlap_combo[overlap_combo.obs['fed'] == 'True']
sw_data = overlap_combo[overlap_combo.obs['condition'] == 'SW']
fed_orgs = np.unique(fed_data.obs['orgID'])
sw_orgs = np.unique(sw_data.obs['orgID'])
pcs = 60
org_metadata = pd.DataFrame(columns =['orgID','cluster','condition']) #centroid = row mean for all cells in organism
num_points = (len(fed_orgs)+len(sw_orgs))*len(clus)
#Get centroid in PCA space in each cluster for each organism
org_centroids = np.zeros((num_points,pcs))
condition = []
cluster = []
orgID = []
count = 0
toRemove = []
for c in clus:
#Get all cells in chosen condition
fed_sub = fed_data[fed_data.obs['knn_clus'] == c]
fed_pca_embed = fed_sub.obsm['X_pca'][:,0:pcs] #PCA matrix
sw_sub = sw_data[sw_data.obs['knn_clus'] == c]
sw_pca_embed = sw_sub.obsm['X_pca'][:,0:pcs] #PCA matrix
#For each org, get centroid of cells (X_pca)
for sw in sw_orgs:
condition += ['SW']
cluster += [c]
orgID += [sw]
#Get location of org cells
org_pos = list(sw_sub.obs['orgID'] == sw)
#Sum down rows of cells for each PC
if len(sw_pca_embed[org_pos,]) == 0:
toRemove += [count]
org_centroids[count,:] = list(sw_pca_embed[org_pos,].mean(axis=0))
count += 1
for fed in fed_orgs:
condition += ['Fed']
cluster += [c]
orgID += [fed]
#Get location of org cells
org_pos = list(fed_sub.obs['orgID'] == fed)
#Sum down rows of cells for each PC
if len(fed_pca_embed[org_pos,]) == 0:
toRemove += [count]
org_centroids[count,:] = list(fed_pca_embed[org_pos,].mean(axis=0))
count += 1
#For each org, get centroid of cells (X_pca)
#Average pairwise distance between org centroid
org_metadata['orgID'] = orgID
org_metadata['cluster'] = cluster
org_metadata['condition'] = condition
In [ ]:
toRemove
Out[ ]:
In [ ]:
org_metadata = org_metadata.drop(org_metadata.index[toRemove])
org_metadata.head()
org_centroids= np.delete(org_centroids, toRemove, 0)
org_centroids[0,:]
Out[ ]:
In [ ]:
org_pairDists = pd.DataFrame(columns =['cluster','pair_fed','pair_sw','pair_mix'])
cluster = []
pair_fed = []
pair_sw = []
pair_mix = []
for c in clus:
#Fed org centroids
where_fed = (org_metadata.cluster == c) & (org_metadata.condition == 'Fed')
fed_mat = org_centroids[where_fed,:]
pair_fed_dist = pairwise_distances(fed_mat, metric = 'l1')
pair_fed_mean = np.mean(pair_fed_dist)
#SW org centroids
where_sw = (org_metadata.cluster == c) & (org_metadata.condition == 'SW')
sw_mat = org_centroids[where_sw,:]
pair_sw_dist = pairwise_distances(sw_mat, metric = 'l1')
pair_sw_mean = np.mean(pair_sw_dist)
pair_mix_dist = pairwise_distances(fed_mat,sw_mat, metric = 'l1')
pair_mix_mean = np.mean(pair_mix_dist)
cluster += [c]
pair_fed += [pair_fed_mean]
pair_sw += [pair_sw_mean]
pair_mix += [pair_mix_mean]
org_pairDists['cluster'] = cluster
org_pairDists['pair_fed'] = pair_fed
org_pairDists['pair_sw'] = pair_sw
org_pairDists['pair_mix'] = pair_mix
In [ ]:
org_pairDists.head()
Out[ ]:
In [ ]:
plt.figure(figsize=(16,10))
plt.hist(org_pairDists['pair_fed'],color='r',bins=15,alpha=0.6,label='control org dist',density=True)
plt.hist(org_pairDists['pair_sw'],color='blue',bins=15,alpha=0.6,label='sw org dist',density=True)
plt.hist(org_pairDists['pair_mix'], bins=15,color = 'orange',alpha=0.6,label='mix dist',density=True)
plt.legend(loc='upper right')
plt.title('Inter-Organism Distances (L1)')
plt.grid(b=None)
plt.xlabel("Average L1 Distance with 60 PC's")
plt.ylabel("Normalized Frequency")
plt.show()
In [ ]:
overlap_combo
Out[ ]:
In [ ]:
overlap_combo.write('overlap_combo.h5ad')
In [ ]: