Cell Atlas Clustering and Analysis from kallisto-bustools
!date
Download Data¶
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
#Starvation h5ad data, all nonzero genes included, filtered for 'real cells' from de-multiplexing
download_file('10.22002/D1.1797','.gz')
#CellRanger Starvation h5ad data
download_file('10.22002/D1.1798','.gz')
#Kallisto bus clustered starvation data, h5ad
download_file('10.22002/D1.1796','.gz')
#Human ortholog annotations
download_file('10.22002/D1.1819','.gz')
#Panther annotations
download_file('10.22002/D1.1820','.gz')
#GO Terms
download_file('10.22002/D1.1822','.gz')
#Saved DeSeq2 Results for Fed/Starved (Differentially expressed under starvation --> perturbed genes)
download_file('10.22002/D1.1810','.gz')
#Saved gene modules adata
download_file('10.22002/D1.1813','.gz')
#Gene Markers to plot (for cell atlas) --> Fig 2 heatmap
download_file('10.22002/D1.1809','.gz')
!gunzip *.gz
!pip install --quiet anndata
!pip install --quiet scanpy==1.7.0rc1
!pip install --quiet louvain
!pip3 install --quiet rpy2
Import Packages¶
import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import random
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
sc.set_figure_params(dpi=125)
import seaborn as sns
sns.set(style="whitegrid")
%load_ext rpy2.ipython
# # See version of all installed packages, last done 01/11/2021
# !pip list -v > pkg_vers_20210111.txt
#Read in annotations
from io import StringIO
hg_ortho_df = pd.read_csv(StringIO(''.join(l.replace('|', '\t') for l in open('D1.1819'))),
sep="\t",header=None,skiprows=[0,1,2,3])
hg_ortho_df[['XLOC','TCONS']] = hg_ortho_df[13].str.split(expand=True)
hg_ortho_df[['Gene','gi']] = hg_ortho_df[3].str.split(expand=True)
hg_ortho_df['Description']= hg_ortho_df[11]
panther_df = pd.read_csv('D1.1820',
sep="\t",header=None) #skiprows=[0,1,2,3]
goTerm_df = pd.read_csv('D1.1822',
sep=" ",header=None) #skiprows=[0,1,2,3]
How Gene Filtering & Clustering Was Done for Kallisto Processed Data¶
#Read in starvation data
#Kallisto bus h5ad file with no gene filtering
bus_fs_raw = anndata.read("D1.1797")
print(bus_fs_raw )
#CellRanger h5ad file with old louvain clustering
cellRanger_fs = anndata.read("D1.1798")
print(cellRanger_fs)
Apply labels from ClickTags (organism number and condition)
bus_fs_raw.obs['orgID'] = pd.Categorical(cellRanger_fs.obs['orgID'])
bus_fs_raw.obs['fed'] = pd.Categorical(cellRanger_fs.obs['fed'])
bus_fs_raw.obs['starved'] = pd.Categorical(cellRanger_fs.obs['starved'])
bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(cellRanger_fs.obs['louvain'])
bus_fs_raw
sc.pp.filter_cells(bus_fs_raw, min_counts=0) #1
sc.pp.filter_genes(bus_fs_raw, min_counts=0)
bus_fs_raw_sub = bus_fs_raw[bus_fs_raw.obs['fed'] == True]
bus_fs_raw_sub2 = bus_fs_raw[bus_fs_raw.obs['fed'] == False]
bus_fs_raw_sub2
print('Fed reads:' + str(sum(bus_fs_raw_sub.obs['n_counts'])))
print('Starved reads:' + str(sum(bus_fs_raw_sub2.obs['n_counts'])))
print('Fed cells: '+str(len(bus_fs_raw_sub.obs_names)))
print('Starved cells: '+str(len(bus_fs_raw_sub2.obs_names)))
#Median Genes/cell
nonZero = bus_fs_raw.X.todense() != 0.0
nonZeroCounts = np.sum(nonZero,axis=1)
nonZeroCounts.shape
print('Median genes/cell:' + str(np.median(list(nonZeroCounts))))
#Median UMIs/cell
print('Median UMIs/cell:' + str(np.median(bus_fs_raw.obs['n_counts'])))
Filter for highly variable genes and apply cluster labels
#Find highly variable genes from all nonzero genes
#How annotated/filtered adata was made
#sc.pp.filter_cells(bus_fs_raw, min_counts=0) #1
#sc.pp.filter_genes(bus_fs_raw, min_counts=0)
bus_fs_raw.obs['n_countslog']=np.log10(bus_fs_raw.obs['n_counts'])
bus_fs_raw.raw = sc.pp.log1p(bus_fs_raw, copy=True)
sc.pp.normalize_per_cell(bus_fs_raw, counts_per_cell_after=1e4)
filter_result = sc.pp.filter_genes_dispersion(
bus_fs_raw.X, min_mean=0.0125, max_mean=4.5, min_disp=0.2)
sc.pl.filter_genes_dispersion(filter_result)
#Filter genes from anndata
bus_fs_raw = bus_fs_raw[:, filter_result.gene_subset]
print(bus_fs_raw)
#How to get PAGA/UMAP embedding (creates bus_fs_clus)
sc.pp.scale(bus_fs_raw, max_value=10)
sc.tl.pca(bus_fs_raw, n_comps=60)
sc.pp.neighbors(bus_fs_raw,n_neighbors=150, n_pcs=60,random_state=42) #use_rep='X_nca'
sc.tl.paga(bus_fs_raw, groups='cellRanger_louvain')
sc.pl.paga(bus_fs_raw, color=['cellRanger_louvain'])
sc.tl.umap(bus_fs_raw,random_state=42,spread=2.5,min_dist = 0.8,init_pos='paga') #min_dist=0.5,spread=3, min_dist=0.3
Cell Atlas Analysis for previously saved clustered and labeled data¶
Use for Cell Atlas markers, labels, perturbation response analysis, etc
#Read in PREVIOUSLY SAVED clustered + labeled data
bus_fs_clus = anndata.read("D1.1796")
print(bus_fs_clus )
#Code here for neighbor score
#Calculate number of neighbors (out of top 15) with same starvation condition
#Input adata object and if score is for fed or starved (True or False)
def neighborScores(adata,conditionBool):
sc.pp.neighbors(adata,n_neighbors=15)
neighborDists = adata.uns['neighbors']['distances'].todense()
counts = []
for i in range(0,len(adata.obs_names)):
cellNames = adata.obs_names
#get fed observation for this cell
cellObs = adata.obs['fed'][cellNames[i]]
#get row for cell
nonZero = neighborDists[i,:]>0
l = nonZero.tolist()[0]
cellRow = neighborDists[i,:]
cellRow = cellRow[:,l]
#get 'fed' observations
obs = adata.obs['fed'][l]
# count # in 'fed' observations == cell obs
count = 0
#if cellObs == conditionBool:
for j in obs:
if j == conditionBool:
count += 1
counts += [count]
print(len(counts))
return counts
Use PAGA embedding of cells to visualize umi counts, cell types, and animal conditions/labels
#Exact values used to generate PAGA image below
# sc.pp.neighbors(bus_fs_clus,n_neighbors=150, n_pcs=60,random_state=42) #use_rep='X_nca'
# sc.tl.paga(bus_fs_clus, groups='cellRanger_louvain',)
sc.pl.paga(bus_fs_clus, color=['cellRanger_louvain'])
bus_fs_clus
# Add fed_neighbor_score to adata
# bus_fs_clus.obs['fed_neighbor_score'] = neighborScores(bus_fs_clus,'True') #Uncomment to run
#Exact parameters for umap embedding below
#sc.tl.umap(bus_fs_clus,random_state=42,spread=2.5,min_dist = 0.8,init_pos='paga') #min_dist=0.5,spread=3, min_dist=0.3
#sc.pl.umap(bus_fs_clus,color=['annos'])
sc.pl.umap(bus_fs_clus,color=['n_countslog'],color_map='viridis')
#Change colormap range
rgb_list = [tuple([67/256,114/256,196/256]),tuple([237/256,125/256,49/256])]
float_list = list(np.linspace(0,1,len(rgb_list)))
cdict = dict()
for num, col in enumerate(['red','green','blue']):
col_list = [[float_list[i],rgb_list[i][num],rgb_list[i][num]] for i in range(len(float_list))]
cdict[col] = col_list
cmp = mcolors.LinearSegmentedColormap('starv',segmentdata=cdict,N=256)
sc.pl.umap(bus_fs_clus,color=['fed_neighbor_score'],color_map=cmp,save='score.pdf')
bus_fs_clus.obs['Individual'] = pd.Categorical(bus_fs_clus.obs['orgID'])
bus_fs_clus.obs['Individual'].cat.categories
bus_fs_clus.uns['Individual_colors'] = np.array(['#B00005','#8F5DFD','#FED354',
'#7D98D3','#FD4F53','#5FA137',
'#F0A171','#BBD9BB','#D18085',
'#16A53F'])
#Assign color to orgIDs
sc.pl.umap(bus_fs_clus,color=['Individual'],save='orgID.pdf')
How Cluster/Cell Type Labels are Assigned¶
#Defining the eight main classes
def annotateLouvain(bus_fs_clus):
cluster_types = []
for i in bus_fs_clus.obs_names:
if bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [35,13,2,0]:
cluster_types += ['Stem Cell/Germ Cell']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [12,11,23,17,5,10,21]:
cluster_types += ['Nematocyte']
# elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [5,10,21]:
# cluster_types += ['Mechanosensory']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [6,9,26,31]:
cluster_types += ['Neural']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [34,22,27,32,25]:
cluster_types += ['Gland Cell']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [3,7,14,15,19,24,16,33]:
cluster_types += ['Gastroderm']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [28]:
cluster_types += ['Bioluminescent Cells']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [1,4,29,18,8,30,20]:
cluster_types += ['Epidermal/Muscle']
bus_fs_clus.obs['annos'] = pd.Categorical(cluster_types)
annotateLouvain(bus_fs_clus)
bus_fs_clus
bus_fs_clus.obs["annos"].cat.categories
bus_fs_clus.uns['annos_colors'] = ['#707B7C','#AF7AC5','#BF124F','#1A5276','#1D8348','#7DCEA0','#F1C40F'] #'#5499C7'
sc.pl.umap(bus_fs_clus,color="annos")
#Get colors for broad class annotations, and apply colors to 36 subclusters
colors = bus_fs_clus.uns['annos_colors']
colorsCellR = bus_fs_clus.uns['cellRanger_louvain_colors']
annos_names = bus_fs_clus.obs["annos"].cat.categories
cellR_names = bus_fs_clus.obs["cellRanger_louvain"].cat.categories
annos_dict = {}
annos_dict['Stem Cell/Germ Cell'] = [35,13,2,0]
annos_dict['Nematocyte'] = [12,11,23,17,5,10,21]
#annos_dict['Mechanosensory'] = [5,10,21]
annos_dict['Neural'] = [6,9,26,31]
annos_dict['Gland Cell'] = [34,22,27,32,25]
annos_dict['Gastroderm'] = [3,7,14,15,19,24,16,33]
annos_dict['Bioluminescent Cells'] = [28]
annos_dict['Epidermal/Muscle'] = [1,4,29,18,8,30,20]
#annos_dict
#Update 36 Louvain cluster colors with main class colors
for n in annos_names:
pos = annos_dict[n]
for p in pos:
#
pos_2 = np.where(annos_names == n)[0][0]
colorsCellR[p] = colors[pos_2]
#colorsCellR
colorsCopy = colorsCellR.copy()
#Append color information to adata
bus_fs_clus.obs['new_cellRanger_louvain'] = bus_fs_clus.obs['cellRanger_louvain']
bus_fs_clus.uns['new_cellRanger_louvain_colors'] = colorsCellR
#Convert labels to string format for plotting - and update order of colors
#Make dictionary with cellRanger_louvain 36 clusters and new names --> new_cellRanger_louvain
bus_fs_clus.obs['new_cellRanger_louvain'] = [str(i) for i in bus_fs_clus.obs['new_cellRanger_louvain']]
bus_fs_clus.obs['new_cellRanger_louvain'] = pd.Categorical(bus_fs_clus.obs['new_cellRanger_louvain'])
new_cats = bus_fs_clus.obs['new_cellRanger_louvain'].cat.categories
new_colors = bus_fs_clus.uns['new_cellRanger_louvain_colors']
for i in range(0,len(new_cats)):
new_colors[i] = colorsCopy[int(new_cats[i])]
bus_fs_clus.uns['new_cellRanger_louvain_colors'] = new_colors
#Create dendrogram for subclusters
sc.tl.dendrogram(bus_fs_clus,'new_cellRanger_louvain',linkage_method='ward')
bus_fs_clus.uns["dendrogram_new_cellRanger_louvain"] = bus_fs_clus.uns["dendrogram_['new_cellRanger_louvain']"]
bus_fs_clus.uns['new_cellRanger_louvain_colors']
#Make plot for 8 broad classes of cell types
colors = bus_fs_clus.uns['annos_colors']
colors
fig, ax = plt.subplots(figsize=(10,10))
c = np.unique(bus_fs_clus.obs["annos"].values)
cmap = [i+'70' for i in colors]#plt.cm.get_cmap("tab20")
names = c
for idx, (cluster, name) in enumerate(zip(c, names)):
XX = bus_fs_clus[bus_fs_clus.obs.annos == cluster,:].obsm["X_umap"]
x = XX[:,0]
y = XX[:,1]
ax.scatter(x, y, color = cmap[idx], label=cluster,s=5)
if name == 'Bioluminescent Cells':
ax.annotate(name,
(np.median(x)+7, np.median(y)),
horizontalalignment='center',
verticalalignment='bottom',
size=14, weight='bold',
color="black",
backgroundcolor=cmap[idx])
else:
ax.annotate(name,
(np.median(x), np.median(y)),
horizontalalignment='center',
verticalalignment='bottom',
size=14, weight='bold',
color="black",
backgroundcolor=cmap[idx])
ax.set_xlabel('UMAP1')
ax.set_ylabel('UMAP2')
ax.grid(False)
ax.get_xaxis().set_ticks([])
ax.get_yaxis().set_ticks([])
for edge_i in ['bottom','left']:
ax.spines[edge_i].set_edgecolor("black")
for edge_i in ['top', 'right']:
ax.spines[edge_i].set_edgecolor("white")
plt.savefig('broadAtlas.pdf')
plt.show()
#Names for all 36 clusters/cell types
def annotateLouvainSub(bus_fs_clus):
cluster_types = []
for i in bus_fs_clus.obs_names:
if bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [12]:
cluster_types += ['Nematocyte Precursors']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [17]:
cluster_types += ['Late Nematoblasts']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [11]:
cluster_types += ['Early Nematoblasts']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [23]:
cluster_types += ['Mid Nematoblasts']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [2]:
cluster_types += ['Medium Oocytes']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [13]:
cluster_types += ['Small Oocytes']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [7]:
cluster_types += ['GastroDigestive-B']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [15]:
cluster_types += ['GastroDigestive-D']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [3]:
cluster_types += ['GastroDigestive-A']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [14]:
cluster_types += ['GastroDigestive-C']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [19]:
cluster_types += ['GastroDigestive-E']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [24]:
cluster_types += ['GastroDigestive-F']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [5]:
cluster_types += ['Terminal Differentiating Nematocytes']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [10]:
cluster_types += ['Differentiating Nematocytes']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [21]:
cluster_types += ['Mature Nematocytes']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [0]:
cluster_types += ['i-Cells']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [1]:
cluster_types += ['Exumbrella Epidermis']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [4]:
cluster_types += ['Manubrium Epidermis']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [6]:
cluster_types += ['Neural Cells Early Stages']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [9]:
cluster_types += ['Neural Cells-A (incl. GLWa, MIH cells)']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [26]:
cluster_types += ['Neural Cells-B (incl. RFamide cells)']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [31]:
cluster_types += ['Neural Cells-C (incl. YFamide cells)']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [8]:
cluster_types += ['Striated Muscle of Subumbrella']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [16]:
cluster_types += ['Tentacle Bulb Distal Gastroderm']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [18]:
cluster_types += ['Radial Smooth Muscles']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [20]:
cluster_types += ['Tentacle Epidermis']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [22]:
cluster_types += ['Gland Cells-A']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [25]:
cluster_types += ['Gland Cells-C']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [27]:
cluster_types += ['Gland Cells-B']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [28]:
cluster_types += ['Tentacle GFP Cells']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [29]:
cluster_types += ['Gonad Epidermis']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [30]:
cluster_types += ['Striated Muscle of Velum']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [32]:
cluster_types += ['Gland Cells-D']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [33]:
cluster_types += ['Endodermal Plate']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [34]:
cluster_types += ['Gland Cells-E']
elif bus_fs_clus[i,:].obs['cellRanger_louvain'][0] in [35]:
cluster_types += ['Very Early Oocytes']
bus_fs_clus.obs['annosSub'] = pd.Categorical(cluster_types)
annotateLouvainSub(bus_fs_clus)
bus_fs_clus
sc.pl.umap(bus_fs_clus,color='annosSub')
colors = bus_fs_clus.uns['annosSub_colors']
colors
fig, ax = plt.subplots(figsize=(10,10))
c = np.unique(bus_fs_clus.obs["cellRanger_louvain"].values)
cmap = [i+'70' for i in colors]
names = c
for idx, (cluster, name) in enumerate(zip(c, names)):
XX = bus_fs_clus[bus_fs_clus.obs.cellRanger_louvain.isin([cluster]),:].obsm["X_umap"]
text = list(bus_fs_clus[bus_fs_clus.obs.cellRanger_louvain.isin([cluster]),:].obs.annosSub)[0]
x = XX[:,0]
y = XX[:,1]
ax.scatter(x, y, color = cmap[idx], label=str(cluster)+': '+text,s=5)
ax.annotate(name,
(np.mean(x), np.mean(y)),
horizontalalignment='center',
verticalalignment='bottom',
size=20, weight='bold',
color="black",
backgroundcolor=cmap[idx])
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False)
ax.set_axis_off()
#plt.savefig('36ClusAtlas.pdf')
plt.show()
# This is the saved output used in the notebook
bus_fs_clus.write('fedStarved_withUMAPPaga.h5ad')
Heatmaps for Top Markers¶
How marker genes are selected and annotated for the 36 cell types¶
#Get top n marker genes for each cluster
#Keep top 100 genes, 'louvain_neur' is label for neuron clusters determined using Louvain clustering algorithm
sc.tl.rank_genes_groups(bus_fs_clus, 'cellRanger_louvain',n_genes = 100,method='wilcoxon') #Using non-parametric test for significance
#Make dataframe, with 100 marker genes for each cluster + annotations
clusters = np.unique(bus_fs_clus.obs['cellRanger_louvain'])
markers = pd.DataFrame()
clus = []
markerGene = []
padj = []
orthoGene = []
orthoDescr = []
pantherNum = []
pantherDescr = []
goTerms = []
for i in clusters:
genes = bus_fs_clus.uns['rank_genes_groups']['names'][str(i)]
clus += list(np.repeat(i,len(genes)))
markerGene += list(genes)
padj += list(bus_fs_clus.uns['rank_genes_groups']['pvals_adj'][str(i)])
for g in genes:
sub_df = hg_ortho_df[hg_ortho_df.XLOC.isin([g])]
panth_df = panther_df[panther_df[0].isin([g])]
go_df = goTerm_df[goTerm_df[0].isin([g])]
if len(sub_df) > 0:
#Save first result for gene/description
orthoGene += [list(sub_df.Gene)[0]]
orthoDescr += [list(sub_df.Description)[0]]
else:
orthoGene += ['NA']
orthoDescr += ['NA']
if len(panth_df) > 0:
pantherNum += [list(panth_df[1])]
pantherDescr += [list(panth_df[2])]
else:
pantherNum += ['NA']
pantherDescr += ['NA']
if len(go_df) > 0:
goTerms += [list(go_df[1])]
else:
goTerms += ['NA']
markers['clus'] = clus
markers['markerGene'] = markerGene
markers['padj'] = padj
markers['orthoGene'] = orthoGene
markers['orthoDescr'] = orthoDescr
markers['pantherID'] = pantherNum
markers['pantherDescr'] = pantherDescr
markers['goTerms'] = goTerms
markers.head()
#list(neurons.uns['rank_genes_groups']['names']['1'])
#Write to csv
markers.to_csv('fs_marker_annotations.csv')
#Read in csv (previously saved version, uploaded to Box)
markers = pd.read_csv('fs_marker_annotations.csv',
sep=",")
markers.head()
topGenes = []
clusts = []
names = []
var_groups = []
var_labels = []
top5 = pd.DataFrame()
ind = 0
n_genes = 5
for j in np.unique(markers.clus):
sub = markers[markers.clus == j]
sub.sort_values(by='padj',ascending=True)
noDups = [i for i in sub.markerGene if i not in topGenes] #Remove duplicate genes
topGenes += list(noDups[0:n_genes])
clusts += [j]*n_genes
top5['marker'] = topGenes
top5['clus'] = clusts
top5.to_csv('top5markersAtlas.csv')
Low expression markers (using pseudo-bulk counts across cell types) Given that we have multiple biological replicates across cell types we can binarize gene expression, and look fro any genes that have 90% expression in one cell type and > 10 counts overall.
#Set threshold for expression cutoff (just using raw counts)
exprThresh = 10 # num of raw counts
percUniqThresh = 0.9
#Read in starvation data
#Kallisto bus h5ad file with no gene filtering
bus_fs_raw = anndata.read("D1.1797")
print(bus_fs_raw )
#CellRanger h5ad file with old louvain clustering
cellRanger_fs = anndata.read("D1.1798")
print(cellRanger_fs)
bus_fs_raw.obs['orgID'] = pd.Categorical(cellRanger_fs.obs['orgID'])
bus_fs_raw.obs['fed'] = pd.Categorical(cellRanger_fs.obs['fed'])
bus_fs_raw.obs['starved'] = pd.Categorical(cellRanger_fs.obs['starved'])
bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(cellRanger_fs.obs['louvain'])
sc.pp.filter_genes(bus_fs_raw, min_counts=1) #Filter for at least one count
exprMat = bus_fs_raw.X.toarray()
candidates = {}
gene_names = bus_fs_raw.var_names
for g in range(0,len(gene_names)):#
#Where is expression nonzero, etc
ind = np.where(exprMat[:,g] > exprThresh)
#Get clusters with above thresh expression
clusNames = pd.Series(bus_fs_raw.obs['cellRanger_louvain'][ind[0]])
#Highest percent of unique cluster counts
normVal = clusNames.value_counts(normalize=True)
maxPerc = np.max(normVal)
indMax = np.argmax(normVal)
if maxPerc >= percUniqThresh:
candidates[gene_names[g]] = normVal.index[indMax] #Save which cluster had expression
#save maxPerc
print(len(candidates))
#Make dataframe, with 100 marker genes for each cluster + annotations
lowExpr_markers = pd.DataFrame()
markerGene = []
orthoGene = []
orthoDescr = []
pantherNum = []
pantherDescr = []
goTerms = []
clus = []
for g in candidates.keys():
markerGene += [g]
clus += [candidates[g]]
sub_df = hg_ortho_df[hg_ortho_df.XLOC.isin([g])]
panth_df = panther_df[panther_df[0].isin([g])]
go_df = goTerm_df[goTerm_df[0].isin([g])]
if len(sub_df) > 0:
#Save first result for gene/description
orthoGene += [list(sub_df.Gene)[0]]
orthoDescr += [list(sub_df.Description)[0]]
else:
orthoGene += ['NA']
orthoDescr += ['NA']
if len(panth_df) > 0:
pantherNum += [list(panth_df[1])]
pantherDescr += [list(panth_df[2])]
else:
pantherNum += ['NA']
pantherDescr += ['NA']
if len(go_df) > 0:
goTerms += [list(go_df[1])]
else:
goTerms += ['NA']
lowExpr_markers['markerGene'] = markerGene
lowExpr_markers['clus'] = clus
lowExpr_markers['orthoGene'] = orthoGene
lowExpr_markers['orthoDescr'] = orthoDescr
lowExpr_markers['pantherID'] = pantherNum
lowExpr_markers['pantherDescr'] = pantherDescr
lowExpr_markers['goTerms'] = goTerms
lowExpr_markers.head()
#Filter out genes already in marker list
overlap = list(set(markers.markerGene).intersection(lowExpr_markers.markerGene))
lowExpr_markers = lowExpr_markers[~lowExpr_markers.markerGene.isin(overlap)]
lowExpr_markers.head()
lowExpr_markers.to_csv('fs_lowExpr_markers_atlas.csv')
Read in saved markers for figure¶
#List of top markers for Figure 2, from subset of top5markers
topMarkers = pd.read_csv('D1.1809',sep=",")
topMarkers.head()
topMarkers = topMarkers[0:51]
topGenes = []
names = []
var_groups = []
var_labels = []
ind = 0
#Add cell type labels for gene markers
for i in np.unique(topMarkers.clus):
sub = topMarkers[topMarkers.clus == i]
topGenes += list(sub.markerGene)
names += list(sub.annot)
var_groups += [(ind,ind+len(list(sub.annot))-1)]
var_labels += [str(int(i))]
ind += len(list(sub.annot))
#Add to raw data so any genes can be plotted
#Kallisto bus h5ad file with no gene filtering
bus_fs_raw = anndata.read("D1.1797")
print(bus_fs_raw )
sc.pp.filter_cells(bus_fs_raw, min_counts=0) #1
sc.pp.filter_genes(bus_fs_raw, min_counts=0)
bus_fs_raw.obs['new_cellRanger_louvain'] = bus_fs_clus.obs['new_cellRanger_louvain']
bus_fs_raw.uns["dendrogram_new_cellRanger_louvain"] = bus_fs_clus.uns["dendrogram_new_cellRanger_louvain"]
bus_fs_raw.uns['new_cellRanger_louvain_colors'] = bus_fs_clus.uns['new_cellRanger_louvain_colors']
bus_fs_raw.obs['fed'] = pd.Categorical(bus_fs_clus.obs['fed'])
bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_clus.obs['cellRanger_louvain'])
#Plot data with names attached to gene XLOCs
toPlot = bus_fs_raw[:,topGenes]
toPlot.var['names'] = names
sc.pp.log1p(toPlot)
#Subsample for clusters > 100 in size
#Subsample from full dataset, across each cluster
def subSample(adata):
groups = np.unique(adata.obs['cellRanger_louvain'])
subSample = 100
cellNames = np.array(adata.obs_names)
allCells = []
for i in groups:
cells = np.array(list(adata.obs['cellRanger_louvain'].isin([i])))
cellLocs = list(np.where(cells)[0])
if len(cellLocs) > 100:
#Take all cells if < subSample
choice = random.sample(cellLocs,subSample)
else:
choice = cellLocs
pos = list(choice)
#print(len(pos))
allCells += list(cellNames[pos])
sub = adata[allCells,:]
return sub
toPlotSub = subSample(toPlot)
toPlotSub
bus_fs_raw.uns['new_cellRanger_louvain_colors']
sc.set_figure_params(scanpy=True, fontsize=30,dpi=150)
#bus_fs_clus.obs['cellRanger_louvain'] = pd.Categorical([str(i) for i in bus_fs_clus.obs['cellRanger_louvain']])
sc.pl.heatmap(toPlotSub, names, groupby='new_cellRanger_louvain', dendrogram=True,show_gene_labels=True,
var_group_positions=var_groups,var_group_labels=var_labels,cmap='PuBuGn',gene_symbols='names',
standard_scale='var',use_raw=False,swap_axes=True,figsize = (30,30),save='cellAtlas.pdf')
DE Gene Analysis Across Clusters (After Extracting Perturbed Genes)¶
#Remove clusters with < 10 cells per condition
#Read in previously saved data
bus_fs_clus = anndata.read("D1.1796")
print(bus_fs_clus )
bus_fs_raw = anndata.read("D1.1797")
bus_fs_raw = bus_fs_raw[bus_fs_clus.obs_names,]
bus_fs_raw.obs['fed'] = bus_fs_clus.obs['fed']
bus_fs_raw.obs['cellRanger_louvain'] = bus_fs_clus.obs['cellRanger_louvain']
bus_fs_raw
Cluster DE Genes by Expression and Run TopGO Analysis¶
Use output from DeSeq2 analysis in separate notebook: Genes in each cell type with differential expression under starvation
deGenesDF = pd.read_csv('D1.1810') #deSeq2_deGenesDF_log2FCof1_singleCellReplicates_noShrinkage_subSample_annotations.csv from DeSeq2 analysis
deGenesDF.head()
deGenesDF_sig = deGenesDF[deGenesDF.padjClus < 0.05]
Cluster gene x cell matrix for 'perturbed' genes
#Filter raw count dataset for only perturbed genes
bus_fs_raw = anndata.read("D1.1797")
bus_fs_raw = bus_fs_raw [:,np.unique(deGenesDF_sig.Genes)]
bus_fs_raw =bus_fs_raw[bus_fs_clus.obs_names,:]
bus_fs_raw.obs['cellRanger_louvain'] = bus_fs_clus.obs['cellRanger_louvain']
bus_fs_raw.obs['fed'] = bus_fs_clus.obs['fed']
de_gene_adata = anndata.AnnData(X=bus_fs_raw.X.T)
de_gene_adata.var_names = bus_fs_raw.obs_names
de_gene_adata.obs_names = bus_fs_raw.var_names
de_gene_adata_orig = de_gene_adata.copy()
#de_gene_adata
numIntersects = deGenesDF['Genes'].value_counts()
num_intersect = []
for g in de_gene_adata.obs_names:
if g in list(deGenesDF['Genes']):
num_intersect += [numIntersects[g]]
else:
num_intersect += [0]
de_gene_adata.obs['numIntersects'] = pd.Categorical(num_intersect)
#de_gene_adata
de_gene_adata
#Normalize and scale data
sc.pp.filter_genes(de_gene_adata, min_counts=0)
sc.pp.normalize_per_cell(de_gene_adata, counts_per_cell_after=1e4)
de_gene_adata.raw = sc.pp.log1p(de_gene_adata, copy=True)
sc.pp.scale(de_gene_adata, max_value=10)
sc.tl.pca(de_gene_adata, n_comps=60,random_state=42)
#sc.pl.pca_variance_ratio(bus_combo, log=True)
#Determine neighbors for clustering
sc.pp.neighbors(de_gene_adata,n_neighbors=20, n_pcs=15) #n_neighbors=5, n_pcs=15, 20, n_pcs=15
sc.tl.louvain(de_gene_adata,resolution=2)
sc.tl.tsne(de_gene_adata, n_pcs=15,random_state=42)
sc.pl.tsne(de_gene_adata,color=['louvain','numIntersects'])
sc.tl.tsne(de_gene_adata, n_pcs=10,random_state=42,perplexity=15) #
sc.pl.tsne(de_gene_adata,color=['louvain'])
# Add which gene modules the pertubed genes are in
clusters = []
for g in deGenesDF.Genes:
if g in list(de_gene_adata.obs_names):
clus = de_gene_adata[g,:].obs['louvain'][0]
clusters += [clus]
else:
clusters += ['padjClus_not_sig']
deGenesDF['geneClus'] = clusters
deGenesDF.head()
deGenesDF.to_csv('atlas_deSeq2_deGenesDF_log2FCof1_singleCellReplicates_noShrinkage_subSample_annotations.csv')
deGenesDF = pd.read_csv('D1.1810')
deGenesDF.head()
Run TopGO analysis for gene modules to find GO Term enrichment
#For topGO analysis (To Assign Labels to Modules)
def returnVal(i):
if i == i:
i= i.replace("[","")
i = i.replace("]","")
i= i.replace("'","")
i = i.replace("'","")
return i
else:
return 'nan'
deGenesDF.goTerms = [returnVal(i) for i in list(deGenesDF.goTerms)]
deGenesDF = deGenesDF[deGenesDF.goTerms != 'nan']
deGenesDF = deGenesDF[deGenesDF.geneClus != 'padjClus_not_sig']
deGenesDF.to_csv('atlas_deseq2_genes_fortopGO_metadata.txt',sep='\t',columns=['Genes','goTerms','geneClus'],
header=None,index_label=False,index=False)
deGenesDF.to_csv('atlas_deseq2_genes_fortopGO.txt',sep='\t',columns=['Genes','goTerms'],
header=None,index_label=False,index=False)
%%R
install.packages('rlang')
if (!requireNamespace("BiocManager", quietly=TRUE)){
install.packages("BiocManager")
}
BiocManager::install("topGO")
%%R
library(topGO)
library(readr)
#Read in DE genes (XLOC's) with GO Terms
geneID2GO <- readMappings(file = "atlas_deseq2_genes_fortopGO.txt")
str(head(geneID2GO ))
#Add gene modules as factor
atlas_deseq2_genes_fortopGO_metadata <- read_delim("atlas_deseq2_genes_fortopGO_metadata.txt",
"\t", escape_double = FALSE, col_names = FALSE,
trim_ws = TRUE)
#Set variables
allMods = unique(atlas_deseq2_genes_fortopGO_metadata$X3)
alpha = 0.05/length(allMods) #Bonferroni correction, could correct for all pairwise comparisons?
getEnrichTerms <- function(geneID2GO, modMetadata, clus){
mods <- factor(as.integer(modMetadata$X3 == clus)) #Choose gene module to make 'interesting'
names(mods) <- names(geneID2GO)
#Get genes only in module of interest
clusGenes <- function(mods) {
return(mods == 1)
}
subMods <- clusGenes(mods)
#Make GO data
GOdata <- new("topGOdata", ontology = "BP", allGenes = mods,
geneSel = subMods, annot = annFUN.gene2GO, gene2GO = geneID2GO)
#GOdata
#sigGenes(GOdata)
resultFis <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
resultWeight <- runTest(GOdata, statistic = "fisher")
#P-values from Weight Algorithm
pvalsWeight <- score(resultWeight)
#hist(pvalsWeight, 50, xlab = "p-values")
allRes <- GenTable(GOdata, classic = resultFis, weight = resultWeight,
orderBy = "weight", ranksOf = "classic", topNodes = 20)
subRes <- allRes[as.numeric(allRes$weight) < alpha,]
#Write output
write.csv(subRes,file=paste('mod',clus,'_GOTerms.csv',sep=""))
}
#Run for all modules and write outputs
for(c in allMods){
getEnrichTerms(geneID2GO = geneID2GO,modMetadata = atlas_deseq2_genes_fortopGO_metadata, clus = c)
}
DE Genes Across Cell Types¶
We analyze clustered perturbed genes with GO term analysis output by looking at the sharing of gene modules between cell types and differential expression of these genes under starvation
deGenesDF = pd.read_csv('D1.1810')
deGenesDF.head()
#Read in saved de_gene_adata here
de_gene_adata = anndata.read('D1.1813')
de_gene_adata
sc.set_figure_params(dpi=125)
sc.pl.tsne(de_gene_adata,color=['louvain','numIntersects'])
#Based on saved topGO output (See script....)
c = np.unique(de_gene_adata.obs["louvain"].values)
print(c)
names = ['Protein Synthesis & Stress Response','Oxidative Phosphorylation','Cytoskeletal','Synaptic Transmission & Transport','NA','NA',
'Phosphate-Containing Metabolic Process','Cell Cycle & Development (Early Oocytes)','Protein Synthesis',
'Proteolysis & Cell Physiology','Cell Cycle','Protein Synthesis & Transport','Cell-Matrix Adhesion','Metabolic Process']
geneClusNames = dict(zip(c, names))
addNames = []
deGenesDF_sig = deGenesDF[deGenesDF.geneClus != 'padjClus_not_sig']
for i in deGenesDF_sig.geneClus:
addNames += [geneClusNames[i]]
deGenesDF_sig['names'] = addNames
deGenesDF_sig.head()
#https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
def draw_plots(df, variables, n_rows, n_cols):
fig=plt.figure(figsize=(20,20))
for i, var_name in enumerate(variables):
ax=fig.add_subplot(n_rows,n_cols,i+1)
name = geneClusNames[var_name]
sub = df[df['names'] == name]
sub.Cluster.value_counts().sort_values().plot(kind = 'barh',ax=ax,fontsize = 10)
#sub['Cluster'].hist(bins=10,ax=ax)
ax.set_title(var_name+': '+name+" Distribution",fontsize = 12)
ax.set_ylabel("Cell Atlas Clusters",fontsize = 12)
ax.set_xlabel("Number of Perturbed Genes from Atlas Cluster",fontsize = 12)
fig.tight_layout() # Improves appearance a bit.
plt.show()
draw_plots(deGenesDF_sig, np.unique(deGenesDF_sig.geneClus), 5, 4)
Gene Module Plots¶
Plot gene modules colored by how genes are shared between cell types
#Mark DE/Perturbed Genes
def returnDE(i,names):
if i in list(names):
return 'DE'
else:
return 'nonSig'
#Mark genes with no GO Terms
def returnVal(i):
if i == i:
i= i.replace("[","")
i = i.replace("]","")
i= i.replace("'","")
i = i.replace("'","")
return i
else:
return 'nan'
def returnGO(i,names):
if i in names:
return "withGO"
else:
return "n/a"
deGenesDF.goTerms = [returnVal(i) for i in list(deGenesDF.goTerms)]
deGenesDF_sub = deGenesDF[deGenesDF.geneClus != 'padjClus_not_sig']
deGenesDF_sub = deGenesDF_sub[deGenesDF_sub.goTerms != 'nan'] # Has GO Term
withGO_names = list(deGenesDF_sub.Genes)
print(len(np.unique(withGO_names)))
goLabels = [returnGO(i,withGO_names) for i in de_gene_adata.obs_names]
de_gene_adata.obs['withGO'] = pd.Categorical(goLabels)
de_gene_adata
clus14Genes = deGenesDF[deGenesDF.Cluster == 14]
clus19Genes = deGenesDF[deGenesDF.Cluster == 19]
clus35Genes = deGenesDF[deGenesDF.Cluster == 35]
# Label DE/notDE genes
clus35Labels = [returnDE(i,np.unique(clus35Genes.Genes)) for i in de_gene_adata.obs_names]
de_gene_adata.obs['clus35'] = pd.Categorical(clus35Labels)
clus14Labels = [returnDE(i,np.unique(clus14Genes.Genes)) for i in de_gene_adata.obs_names]
de_gene_adata.obs['clus14'] = pd.Categorical(clus14Labels)
#sc.pl.tsne(de_gene_adata,groups=['DE'],color=['clus14'])
# Label DE/notDE genes
clus19Labels = [returnDE(i,np.unique(clus19Genes.Genes)) for i in de_gene_adata.obs_names]
de_gene_adata.obs['clus19'] = pd.Categorical(clus19Labels)
#sc.pl.tsne(de_gene_adata,groups=['DE'],color=['clus19'])
c = list(np.unique(de_gene_adata.obs["louvain"].values))
c
#Add labels to each cluster in de_gene_adata (from GO terms) on tSNE plot
fig, ax = plt.subplots(figsize=(6,6))
#c = np.unique(de_gene_adata.obs["louvain"].values)
cmap = plt.cm.get_cmap("tab20")
for idx, (cluster, cluster) in enumerate(zip(c, c)):
XX = de_gene_adata[de_gene_adata.obs.louvain == cluster,:].obsm["X_tsne"]
x = XX[:,0]
y = XX[:,1]
ax.scatter(x, y, color = cmap(idx), label=cluster,s=25,alpha=0.7)
ax.annotate(cluster,
(np.median(x), np.median(y)),
horizontalalignment='right',
verticalalignment='bottom',
size=12, weight='bold',
color="black",
backgroundcolor=cmap(idx) )
#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
#ax.set_axis_off()
ax.set_xlabel('tSNE1')
ax.set_ylabel('tSNE2')
ax.grid(False)
ax.get_xaxis().set_ticks([])
ax.get_yaxis().set_ticks([])
for edge_i in ['bottom','left']:
ax.spines[edge_i].set_edgecolor("black")
for edge_i in ['top', 'right']:
ax.spines[edge_i].set_edgecolor("white")
plt.show()
# Density plot of perturbed genes with two cell types
def multDensityofDE(de_gene_adata,clusName1,clusName2,label1,label2):
s = 13
# Add labels to each cluster in de_gene_adata (from GO terms) on tSNE plot
fig, ax = plt.subplots(figsize=(3,3))
XX = de_gene_adata.obsm["X_tsne"]
de1 = np.array([de_gene_adata.obs[clusName1] == 'DE'])
de2 = np.array([de_gene_adata.obs[clusName2] == 'DE'])
overlap = list(np.where(de1 & de2)[1])
only1 = [i for i in list(np.where(de1)[1]) if i not in overlap]
only2 = [i for i in list(np.where(de2)[1]) if i not in overlap]
nonsig = [i for i in range(0,len(XX[:,0])) if i not in overlap+only1+only2]
x = XX[nonsig,0]
y = XX[nonsig,1]
ax.scatter(x, y, color = 'grey',s=25,alpha=0.1,edgecolors='none') #cmap(idx),label=cluster
x_DE1 = XX[only1,0]
y_DE1 = XX[only1,1]
ax.scatter(x_DE1, y_DE1, color = 'navy',s=25,alpha=0.3,label=label1) #label=cluster
x_DE2 = XX[only2,0]
y_DE2 = XX[only2,1]
ax.scatter(x_DE2, y_DE2, color = 'orange',s=25,alpha=0.3,label=label2) #label=cluster
x_DE3 = XX[overlap,0]
y_DE3 = XX[overlap,1]
ax.scatter(x_DE3, y_DE3, color = 'green',s=25,alpha=0.3,label='Both') #label=cluster
ax.set_axis_off()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
# Density plot of perturbed genes with three cell types
def tripleDensityofDE(de_gene_adata,clusName1,clusName2,clusName3,label1,label2):
s = 13
# Add labels to each cluster in de_gene_adata (from GO terms) on tSNE plot
fig, ax = plt.subplots(figsize=(3,3))
XX = de_gene_adata.obsm["X_tsne"]
de1 = np.array([de_gene_adata.obs[clusName1] == 'DE'])
de2 = np.array([de_gene_adata.obs[clusName2] == 'DE'])
de3 = np.array([de_gene_adata.obs[clusName3] == 'DE'])
overlap = list(np.where((de1 & de2) | (de1 & de3))[1])
only1 = [i for i in list(np.where(de1)[1]) if i not in overlap]
other = [i for i in list(np.where(de2 | de3)[1]) if i not in overlap]
nonsig = [i for i in range(0,len(XX[:,0])) if i not in overlap+only1+other]
x = XX[nonsig,0]
y = XX[nonsig,1]
ax.scatter(x, y, color = 'grey',s=25,alpha=0.1,edgecolors='none') #cmap(idx),label=cluster
x_DE1 = XX[only1,0]
y_DE1 = XX[only1,1]
ax.scatter(x_DE1, y_DE1, color = 'purple',s=25,alpha=0.3,label=label1) #label=cluster
x_DE2 = XX[other,0]
y_DE2 = XX[other,1]
ax.scatter(x_DE2, y_DE2, color = 'lightcoral',s=25,alpha=0.3,label=label2) #label=cluster
x_DE3 = XX[overlap,0]
y_DE3 = XX[overlap,1]
ax.scatter(x_DE3, y_DE3, color = 'green',s=25,alpha=0.3,label='Shared') #label=cluster
ax.set_axis_off()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
multDensityofDE(de_gene_adata,'clus14','clus19','Cluster 14','Cluster 19')
tripleDensityofDE(de_gene_adata,'clus35','clus14','clus19','Cluster 35','Digestive Types (14/19)')
#Saved version used for this analysis
#de_gene_adata.write('de_gene_clus_adata.h5ad')
For Gene Cluster SplitsTree Conversion¶
from sklearn.metrics import pairwise_distances
#Centroids of cell atlas/defined clusters
def getClusCentroids(overlap_combo,pcs=60,clusType='knn_clus'):
clusters = np.unique(overlap_combo.obs[clusType])
centroids = np.zeros((len(clusters),pcs))
for c in clusters:
sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
pca_embed = sub_data.obsm['X_pca'][:,0:pcs]
centroid = pca_embed.mean(axis=0)
centroids[int(c),:] = list(centroid)
return centroids
#Compare to pairwise distances between cell atlas clusters
centroids = getClusCentroids(de_gene_adata,60,'louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
df = pd.DataFrame(pairCentroid_dists)
clus = np.unique(de_gene_adata.obs['louvain'])
df['cluster'] = range(0,len(clus))
clusters = [int(i) for i in de_gene_adata.obs['louvain']]
names = [geneClusDict[str(i)] for i in df['cluster']]
df['annos'] = names
df.to_csv('geneClustDist_SplitsTree.csv')
df.head()
Gene expression Plots & Cluster 14 & 19 DE Gene Comparisons/Violin Plots¶
Look at expression profiles of perturbed genes from different cell types
#Kallisto bus h5ad file with no gene filtering
bus_fs_raw = anndata.read("D1.1797")
print(bus_fs_raw )
#Read in PREVIOUSLY SAVED clustered + labeled data (cells x gene)
bus_fs_clus = anndata.read("D1.1796")
print(bus_fs_clus)
bus_fs_clus.obs['Fed'] = pd.Categorical(bus_fs_clus.obs['fed'])
#sc.pl.umap(bus_fs_clus, color='Fed')
bus_fs_raw.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_clus.obs['cellRanger_louvain'])
bus_fs_raw
deGenesDF_sig = deGenesDF[deGenesDF.padjClus < 0.05]
clusGenes = list(deGenesDF_sig.Genes[deGenesDF_sig.Cluster == 3])
len(clusGenes)
#Expression profiles for perturbed genes in each cell type (all high expression or not)
cellTypes = np.unique(deGenesDF_sig.Cluster)
colors = bus_fs_clus.uns['annosSub_colors']
cmap = [i for i in colors]
vals = []
types = []
num = []
cmap_l = []
exprPlots = pd.DataFrame()
for t in cellTypes:
clusGenes = list(deGenesDF_sig.Genes[deGenesDF_sig.Cluster == t])
sub = bus_fs_raw[:,clusGenes]
sub = sub[sub.obs['cellRanger_louvain'].isin([t])]
toAdd = sub.X.todense().mean(0)
#Calculate average expression for each gene across cells in type t
means = toAdd.flatten().tolist()[0]
vals += means
types += list(np.repeat(t,len(means)))
num += list(np.repeat(len(means),len(means)))
cmap_l += [cmap[t]]
exprPlots['cellType'] = types
exprPlots['expr'] = np.log1p(vals)
exprPlots['numGenes'] = num
exprPlots.head()
palette = sns.color_palette(cmap_l)
fig, ax = plt.subplots(figsize=(10,6))
ax = sns.boxplot(x="numGenes", y="expr", hue='cellType',palette=palette, data=exprPlots,order=np.arange(190),dodge=False)
for i,artist in enumerate(ax.artists):
# Set the linecolor on the artist to the facecolor, and set the facecolor to None
col = artist.get_facecolor()
artist.set_edgecolor(col)
artist.set_facecolor('None')
# Each box has 6 associated Line2D objects (to make the whiskers, fliers, etc.)
# Loop over them here, and use the same colour as above
for j in range(i*6,i*6+6):
line = ax.lines[j]
if j == (i*6+4):
line.set_color('black')
line.set_mfc('black')
line.set_mec('black')
else:
line.set_color(col)
line.set_mfc(col)
line.set_mec(col)
ax.set_xlabel('Number of Perturbed Genes')
ax.set_ylabel('log(Expression)')
ax.grid(False)
for edge_i in ['bottom','left']:
ax.spines[edge_i].set_edgecolor("black")
for edge_i in ['top', 'right']:
ax.spines[edge_i].set_edgecolor("white")
ax.legend(loc='center left',bbox_to_anchor=(1, 0.5),prop={'size': 10},frameon=False,ncol=2,title="Cell Type")
for i in range(190):
if i % 25 != 0:
ax.xaxis.get_major_ticks()[i].draw = lambda *args:None
plt.show()
clus14 = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'].isin([14])]
clus14.obs['Condition2'] = pd.Categorical(clus14.obs['fed'])
clus19 = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'].isin([19])]
clus19.obs['Condition2'] = pd.Categorical(clus19.obs['fed'])
newOfInterest = ['XLOC_043836','XLOC_043846','XLOC_007437','XLOC_009798','XLOC_008632','XLOC_033541','XLOC_004011']
axes = sc.pl.violin(clus14, newOfInterest, groupby='Condition2',ylabel=['Oxidoreductase','Dioxygenase',
'POSTN-related','KRTAP5','Uncharacterized','NPC2','NPC2'],show=False)
for ax in axes:
ax.set_ylim(0, 5)
ax.grid(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_color('black')
ax.spines['left'].set_color('black')
axes = sc.pl.violin(clus19, newOfInterest, groupby='Condition2',ylabel=['Oxidoreductase','Dioxygenase',
'POSTN-related','KRTAP5','Uncharacterized','NPC2','NPC2'],show=False)
for ax in axes:
ax.set_ylim(0, 5)
ax.grid(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_color('black')
ax.spines['left'].set_color('black')
newOfInterest = ['XLOC_035232','XLOC_007437']
axes = sc.pl.violin(clus14, newOfInterest, groupby='Condition2',ylabel=['TGFB-like',
'POSTN-like'],show=False)
for ax in axes:
ax.set_ylim(0, 7)
ax.grid(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_color('black')
ax.spines['left'].set_color('black')
axes = sc.pl.violin(clus19, newOfInterest, groupby='Condition2',ylabel=['TGFB-like',
'POSTN-like'],show=False)
for ax in axes:
ax.set_ylim(0, 7)
ax.grid(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_color('black')
ax.spines['left'].set_color('black')
#Cluster 35, early oocytes Violin Plots
bus_fs_clus.obs['fed'] = pd.Categorical(bus_fs_clus.obs['fed'])
clus35 = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'].isin([35])]
newOfInterest = ['XLOC_012655','XLOC_004306','XLOC_037567','XLOC_006902','XLOC_016286','XLOC_037771',
'XLOC_035249','XLOC_012527','XLOC_001916','XLOC_015039']
clus35.obs['Condition2'] = pd.Categorical(clus35.obs['fed'])
#sc.pl.umap(clus35, color='Condition')
sc.pl.violin(clus35, newOfInterest, groupby='Condition2',ylabel=['DNAJ','MCM8','TF AP4','CAF-1','SPO11','MOV10L1',
'SIN3A','FAF1','CDK9','TMBIM4'])
newOfInterest = ['XLOC_016286','XLOC_012527']
#sc.pl.umap(clus35, color='Condition')
axes = sc.pl.violin(clus35, newOfInterest, groupby='Condition2',ylabel=['MEIOTIC RECOMBINATION PROTEIN','FAF1'],show=False)
for ax in axes:
ax.set_ylim(0, 5)
ax.grid(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_color('black')
ax.spines['left'].set_color('black')
Overlap with Original CellRanger Clusters¶
Use Jaccard Index to quantify overlap of markers for the 36 cell types between the Kallisto-processed and initial, Cell Ranger processed data
n=100
bus_fs_clus = anndata.read("D1.1796")
cellRanger_fs = anndata.read("D1.1798")
bus_fs_clus.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_clus.obs['cellRanger_louvain'])
sc.tl.rank_genes_groups(cellRanger_fs,groupby='louvain',n_genes=n,method='wilcoxon')
sc.tl.rank_genes_groups(bus_fs_clus,groupby='cellRanger_louvain',n_genes=n,method='wilcoxon')
#Show pairwise overlap in top 100 names between all clusters, 36x36 grid
clus = np.unique(bus_fs_clus.obs['cellRanger_louvain'])
cellR = [[]]*len(clus) #np array of top 100 genes for each of 36 clusters
busFS = [[]]*len(clus)#np array of top 100 genes for each of 36 clusters
for c in clus:
cellR[c] = list(cellRanger_fs.uns['rank_genes_groups']['names'][str(c)])
busFS[c] = list(bus_fs_clus.uns['rank_genes_groups']['names'][str(c)])
#https://stackoverflow.com/questions/52408910/python-pairwise-intersection-of-multiple-lists-then-sum-up-all-duplicates
from itertools import combinations_with_replacement
#Changed to calculate Jaccard Index
def intersect(i,j):
return len(set(cellR[i]).intersection(busFS[j]))/len(set(cellR[i]).union(busFS[j]))
def pairwise(clus):
# Initialise precomputed matrix (num of clusters, 36x36)
precomputed = np.zeros((len(clus),len(clus)), dtype='float')
# Initialise iterator over objects in X
iterator = combinations_with_replacement(range(0,len(clus)), 2)
# Perform the operation on each pair
for i,j in iterator:
precomputed[i,j] = intersect(i,j)
# Make symmetric and return
return precomputed + precomputed.T - np.diag(np.diag(precomputed))
pairCorrs = pairwise(clus)
plt.figure(figsize=(7,7))
plt.imshow(pairCorrs, cmap='viridis')
plt.colorbar()
plt.xlabel('kallisto Clusters')
plt.ylabel('Cell Ranger Clusters')
plt.xticks(np.arange(0, 36, 1),fontsize=6)
plt.yticks(np.arange(0, 36, 1),fontsize=6)
plt.grid(color='black',linewidth=0.3)
plt.show()
Cell Type Data for SplitsTree¶
print(bus_fs_clus)
from sklearn.metrics import pairwise_distances
#Centroids of cell atlas/defined clusters
def getClusCentroids(overlap_combo,pcs=60,clusType='knn_clus'):
clusters = np.unique(overlap_combo.obs[clusType])
centroids = np.zeros((len(clusters),pcs))
for c in clusters:
sub_data = overlap_combo[overlap_combo.obs[clusType] == c]
pca_embed = sub_data.obsm['X_pca'][:,0:pcs]
centroid = pca_embed.mean(axis=0)
centroids[int(c),:] = list(centroid)
return centroids
#Compare to pairwise distances between cell atlas clusters
fed_only = bus_fs_clus[bus_fs_clus.obs['fed'] == 'True']
centroids = getClusCentroids(fed_only,60,'cellRanger_louvain')
#centroids_arr = centroids['centroid'].to_numpy()
pairCentroid_dists = pairwise_distances(centroids, metric = 'l1')
pairCentroid_dists.shape
clus = np.unique(fed_only.obs['cellRanger_louvain'])
df = pd.DataFrame(pairCentroid_dists)
df['cluster'] = range(0,len(clus))
clusters = [int(i) for i in fed_only.obs['cellRanger_louvain']]
annos = fed_only.obs['annosSub']
annosDict = {}
for i in range(0,len(clusters)):
annosDict[clusters[i]] = annos[i]
names = [annosDict[i] for i in df['cluster']]
df['annos'] = names
df.head()
df.to_csv('cellTypeDist_SplitsTree.csv')
df.head()
Data for SplitsTree analysis of all cells within cell types of interest
def convertCond(fed):
if fed == 'True':
return 'Control'
else:
return 'Starved'
#Save csv for cell type 14 (dig. gastroderm type)
pcs=60
sub = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'] == 14]
pca_embed = sub.obsm['X_pca'][:,0:pcs]
cellDists = pairwise_distances(pca_embed, metric = 'l1')
df = pd.DataFrame(cellDists)
conds = [convertCond(i) for i in sub.obs['fed']]
for i in range(0,len(conds)):
conds[i] = conds[i]+'_'+str(i)
df['annos'] = conds
df['cluster'] = np.repeat(14,len(conds))
df.to_csv('cells14Dist_SplitsTree.csv')
df.head()
#Save csv for cell type 8
pcs=60
sub = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'] == 8]
pca_embed = sub.obsm['X_pca'][:,0:pcs]
cellDists = pairwise_distances(pca_embed, metric = 'l1')
df = pd.DataFrame(cellDists)
conds = [convertCond(i) for i in sub.obs['fed']]
for i in range(0,len(conds)):
conds[i] = conds[i]+'_'+str(i)
df['annos'] = conds
df['cluster'] = np.repeat(8,len(conds))
df.to_csv('cells8Dist_SplitsTree.csv')
df.head()