Analysis of Neuron Subpopulations
!date
Download Data¶
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
#Kallisto bus clustered starvation data, h5ad
download_file('10.22002/D1.1796','.gz')
#Starvation h5ad data, all nonzero genes included, filtered for 'real cells' from de-multiplexing
download_file('10.22002/D1.1797','.gz')
#Human ortholog annotations
download_file('10.22002/D1.1819','.gz')
#Panther annotations
download_file('10.22002/D1.1820','.gz')
#GO Terms
download_file('10.22002/D1.1822','.gz')
#Previously saved neuron subpopulations
download_file('10.22002/D1.1804','.gz')
#Previously saved marker genes for neurons
download_file('10.22002/D1.1837','.gz')
!gunzip *.gz
!pip install --quiet anndata
!pip install --quiet scanpy==1.7.0rc1
!pip install --quiet louvain
!pip3 install --quiet rpy2
Import Packages¶
#Install Packages
import random
import pandas as pd
import anndata
import scanpy as sc
import numpy as np
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
#import scrublet as scr
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('axes',edgecolor='black')
%matplotlib inline
sc.set_figure_params(dpi=125)
import seaborn as sns
sns.set(style="whitegrid")
%load_ext rpy2.ipython
Read in Previously Saved Data¶
#Read in h5ad file
bus_fs_combo = anndata.read("D1.1796")
#Previously saved neurons
neurons = anndata.read("D1.1804")
#Read in annotations
from io import StringIO
hg_ortho_df = pd.read_csv(StringIO(''.join(l.replace('|', '\t') for l in open('D1.1819'))),
sep="\t",header=None,skiprows=[0,1,2,3])
hg_ortho_df[['XLOC','TCONS']] = hg_ortho_df[13].str.split(expand=True)
hg_ortho_df[['Gene','gi']] = hg_ortho_df[3].str.split(expand=True)
hg_ortho_df['Description']= hg_ortho_df[11]
panther_df = pd.read_csv('D1.1820',
sep="\t",header=None) #skiprows=[0,1,2,3]
goTerm_df = pd.read_csv('D1.1822',
sep=" ",header=None) #skiprows=[0,1,2,3]
bus_fs_combo
raw_fs_combo = anndata.read('D1.1797')
raw_fs_combo = raw_fs_combo[bus_fs_combo.obs_names,]
#Transfer info from embedded version
raw_fs_combo.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_combo.obs['cellRanger_louvain'])
raw_fs_combo.obs['fed'] = pd.Categorical(bus_fs_combo.obs['fed'])
raw_fs_combo.obsm['X_tsne'] = bus_fs_combo.obsm['X_tsne']
raw_fs_combo
Generating Neuron Subpopulations¶
We begin by restricting analysis to four clusters that contain neuron subpopulations (31,26,6 and 9). After constraining the single-cell count matrices to the relevant cells, we examine the variance explained in the top principal components, and based on the "elbow plot" continue analysis with 15 principal components.
# #Neurons, start from raw counts + unfiltered genes
neurons = raw_fs_combo[raw_fs_combo.obs['cellRanger_louvain'].isin([31,26,6,9])]
sc.pp.filter_cells(neurons, min_counts=0)
sc.pp.filter_genes(neurons, min_counts=0)
sc.pp.normalize_per_cell(neurons, counts_per_cell_after=1e4)
sc.pp.log1p(neurons)
sc.pp.highly_variable_genes(neurons,n_top_genes=2000,n_bins=50)
neurons = neurons[:,neurons.var['highly_variable']]
sc.pp.scale(neurons, max_value=10)
sc.tl.pca(neurons, n_comps=60)
sc.pl.pca_variance_ratio(neurons, log=True)
sc.pp.neighbors(neurons,n_neighbors=15, n_pcs=15) #n_neighbors=5, n_pcs=15,use_rep='X_nca'
sc.tl.louvain(neurons,resolution=1,key_added='louvain_neur',random_state=42)#Clustering algorithm,resolution=0.5
Next, we perform Louvain clustering to identify distinct neuron subpopulations, and we plot them using a UMAP reduction of the count matrix. The plot on the left shows the 15 Louvain derived clusters; the plot on the right shows the same UMAP representation of the cells but colored according to the clusters obtained from clustering with all the cells. This shows that a reclustering on the restricted cell x gene matrix produces a refinement of the original clusters.
sc.tl.louvain(neurons,resolution=2.5,key_added='louvain_neur',random_state=42)#Clustering algorithm,resolution=0.5
neurons
sc.tl.tsne(neurons, n_pcs=15,perplexity=25,random_state = 42) #learning_rate=250
sc.tl.umap(neurons,random_state=42,spread=2.5, min_dist=1)
neurons.obs['cellAtlasClusters'] = pd.Categorical(neurons.obs['cellRanger_louvain'] )
sc.pl.umap(neurons, color=['louvain_neur'],color_map='viridis',size=50,legend_loc='on data')
The plots below show the cells expressing several different neuropeptides:
sc.pl.umap(neurons, color=['XLOC_040584','XLOC_019434','XLOC_042761','XLOC_017097',
'XLOC_004021'],color_map='viridis',size=50,legend_loc='on data',
title=['GRWGamide','RFamide','PRPamide','GLWamide2',
'FLFamide'],vmin=0,vmax=5)
Next, we identify marker genes for each of the 15 Louvain clusters determined for the neuron subpopulation, and we annotate them using the Panther database, gene ontology (GO) terms and via orthology searches.
#Get top n marker genes for each cluster
#Keep top 100 genes, 'louvain_neur' is label for neuron clusters determined using Louvain clustering algorithm
sc.tl.rank_genes_groups(neurons, 'louvain_neur',n_genes = 100,method='wilcoxon') #Using non-parametric test for significance
neurons
#Make dataframe, with 100 marker genes for each cluster + annotations
neuron_clusters = np.unique(neurons.obs['louvain_neur'])
neuron_markers = pd.DataFrame()
neurClus = []
markerGene = []
padj = []
orthoGene = []
orthoDescr = []
pantherNum = []
pantherDescr = []
goTerms = []
for i in neuron_clusters:
genes = neurons.uns['rank_genes_groups']['names'][str(i)]
neurClus += list(np.repeat(i,len(genes)))
markerGene += list(genes)
padj += list(neurons.uns['rank_genes_groups']['pvals_adj'][str(i)])
for g in genes:
sub_df = hg_ortho_df[hg_ortho_df.XLOC.isin([g])]
panth_df = panther_df[panther_df[0].isin([g])]
go_df = goTerm_df[goTerm_df[0].isin([g])]
if len(sub_df) > 0:
#Save first result for gene/description
orthoGene += [list(sub_df.Gene)[0]]
orthoDescr += [list(sub_df.Description)[0]]
else:
orthoGene += ['NA']
orthoDescr += ['NA']
if len(panth_df) > 0:
pantherNum += [list(panth_df[1])]
pantherDescr += [list(panth_df[2])]
else:
pantherNum += ['NA']
pantherDescr += ['NA']
if len(go_df) > 0:
goTerms += [list(go_df[1])]
else:
goTerms += ['NA']
neuron_markers['neurClus'] = neurClus
neuron_markers['markerGene'] = markerGene
neuron_markers['padj'] = padj
neuron_markers['orthoGene'] = orthoGene
neuron_markers['orthoDescr'] = orthoDescr
neuron_markers['pantherID'] = pantherNum
neuron_markers['pantherDescr'] = pantherDescr
neuron_markers['goTerms'] = goTerms
neuron_markers.head()
#list(neurons.uns['rank_genes_groups']['names']['1'])
#Write to csv
neuron_markers.to_csv('neuron_marker_annotations.csv')
#Read in csv (previously saved version, uploaded to Box)
neuron_markers = pd.read_csv('neuron_marker_annotations.csv',
sep=",")
neuron_markers.head()
neurons.write('neuron_subpops_fs.h5ad')
Neuron Subpopulation Plots¶
Hierarchical Clustering of Neuron Subpopulations
#Previously saved neurons
neurons = anndata.read("D1.1804")
print(neurons)
raw_fs_combo = anndata.read('D1.1797')
raw_fs_combo = raw_fs_combo[bus_fs_combo.obs_names,]
#Transfer info from embedded version
raw_fs_combo.obs['cellRanger_louvain'] = pd.Categorical(bus_fs_combo.obs['cellRanger_louvain'])
raw_fs_combo.obs['fed'] = pd.Categorical(bus_fs_combo.obs['fed'])
raw_fs_combo.obsm['X_tsne'] = bus_fs_combo.obsm['X_tsne']
raw_fs_combo
To examine similarities between different subpopulations we infer a dendogram based on similarities of cells in the different Louvain derived clusters:
sc.tl.dendrogram(neurons,'louvain_neur',linkage_method='ward')
neurons
#neurons.uns['dendrogram_louvain_neur'] = neurons.uns["dendrogram_['louvain_neur']"]
sc.pl.dendrogram(neurons,'louvain_neur')
sc.pl.umap(neurons,color=['louvain_neur'])
Plot marker genes
neuron_markers = pd.read_csv('D1.1837')
neuron_markers.head()
# Use raw adata with all genes, to plot any gene expression in neuron subpopulations
neurons_raw = raw_fs_combo[raw_fs_combo.obs['cellRanger_louvain'].isin([31,26,6,9])]
sc.pp.filter_cells(neurons_raw, min_counts=0)
sc.pp.filter_genes(neurons_raw, min_counts=0)
sc.pp.normalize_per_cell(neurons_raw, counts_per_cell_after=1e4)
sc.pp.log1p(neurons_raw)
#sc.pp.scale(neurons_raw,max_value=10)
neurons_raw.obs['louvain_neur'] = neurons.obs['louvain_neur']
neurons_raw.obsm['X_tsne'] = neurons.obsm['X_tsne']
neurons_raw.obsm['X_umap'] = neurons.obsm['X_umap']
neurons_raw.uns['dendrogram_louvain_neur'] = neurons.uns['dendrogram_louvain_neur']
neurons_raw.uns['louvain_neur_colors'] = neurons.uns['louvain_neur_colors']
#Plot main figure genes
#'XLOC_010412' bHLH TF 1, 'XLOC_010752' bHLH TF 3
topGenes = ['XLOC_030920','XLOC_018937','XLOC_035224','XLOC_008730' ,'XLOC_014624',
'XLOC_030120','XLOC_038155',
'XLOC_040584','XLOC_008764', 'XLOC_040209', 'XLOC_010892', 'XLOC_041402',
'XLOC_004021','XLOC_019434',
'XLOC_003339','XLOC_042761','XLOC_017097',
'XLOC_021799']
neurons_sigTop = neurons_raw[:,topGenes]
labels = ['hlh-6','Neurogenin','+PP24','+PP26','+PP27',
'+PP17','+PP20',
'PP7','PP8', '+PP19','+PP21', 'PP9',
'PP14','PP5',
'+PP15','PP1','PP11',
'+PP25']
neurons_sigTop.var['names'] = labels
toPlotLabs = ['Neurogenin','hlh-6','+PP24','PP7','PP8', '+PP19','+PP21','PP5','+PP25','+PP15','PP1','PP11','PP9','PP14','+PP26','+PP27',
'+PP17','+PP20' ]
sc.set_figure_params(scanpy=True, fontsize=10)
sc.pl.heatmap(neurons_sigTop, toPlotLabs , groupby='louvain_neur', dendrogram=True,show_gene_labels=True,
gene_symbols = 'names',swap_axes=True,
cmap='PuBuGn',standard_scale='obs',save='neuronMarkers.pdf') #var_group_positions =var_groups ,var_group_labels = var_labels
#Plot supp neurpop machinery figure genes
#PAMs, PALs, Peptidases
topGenes = ['XLOC_005044','XLOC_043907','XLOC_036126','XLOC_014167' ,'XLOC_036675',
'XLOC_033088','XLOC_033057',
'XLOC_029131','XLOC_019418', 'XLOC_038295', 'XLOC_012308', 'XLOC_016870',
'XLOC_045079','XLOC_023701',
'XLOC_012427', 'XLOC_012428', 'XLOC_006924', 'XLOC_002557',
'XLOC_036719', 'XLOC_036718', 'XLOC_038522', 'XLOC_010017' , 'XLOC_039385', 'XLOC_045849']
neurons_sigTop = neurons_raw[:,topGenes]
sc.set_figure_params(scanpy=True, fontsize=10)
sc.pl.heatmap(neurons_sigTop, topGenes , groupby='louvain_neur', dendrogram=True,show_gene_labels=True,
swap_axes=True,
cmap='PuBuGn',standard_scale='obs',save='neuronMarkers.pdf') #var_group_positions =var_groups ,var_group_labels = var_labels
#Plot supp neurpop machinery figure genes
#PALs
topGenes = ['XLOC_012427', 'XLOC_012428', 'XLOC_006924', 'XLOC_002557']
neurons_sigTop = neurons_raw[:,topGenes]
sc.set_figure_params(scanpy=True, fontsize=10)
sc.pl.heatmap(neurons_sigTop, topGenes , groupby='louvain_neur', dendrogram=True,show_gene_labels=True,
swap_axes=True,
cmap='PuBuGn',standard_scale='obs',save='neuronMarkers.pdf') #var_group_positions =var_groups ,var_group_labels = var_labels
sc.pl.umap(neurons_raw,color=['XLOC_019434','XLOC_040584','XLOC_017097'],title=['PP5','PP7','PP11'],color_map='plasma')
sc.pl.umap(neurons_raw,color=['XLOC_021799','XLOC_030120','XLOC_038155'],title=['+PP25','+PP17','+PP20'],color_map='plasma')
#Composition of broad cell types between fed and starved , cell counts
counts = pd.DataFrame(columns =['count', 'orgID','condition','cluster'])
clusters = np.unique(neurons.obs['louvain_neur'])
c = []
org = []
cond = []
clus = []
#orgs = np.unique(neurons.obs['orgID'])
conds = [True,False]#['True', 'False']
for cl in clusters:
data = neurons[neurons.obs['louvain_neur'].isin([cl])]
for cd in conds:
#c_data = data[data.obs['condition'].isin([cd])]
#for o in orgs:
pos = (data.obs['fed'].isin([cd])) #& (data.obs['orgID'].isin([o]))
org_data = data[pos]
c += [org_data.n_obs]
if cd == True:#'True':
cond += ['fed']
else:
cond += ['starved']
clus += [cl]
print(len(c))
counts['count'] = c
counts['condition'] = cond
counts['cluster'] = clus
counts = counts[counts['count']> 0 ]
counts.head()
counts['cluster'] = [int(i) for i in counts['cluster']]
plt.figure(figsize=(16, 6))
ax = sns.scatterplot(x="cluster", y="count", hue="condition", data=counts,palette=["darkorange", "b"])
ax.set(ylabel='Cell Count')
ax.set(xlabel='Louvain Cluster')
ax.xaxis.set_ticks(np.unique(counts.cluster))
plt.show()
Plots for Neuropeptide Processing Machinery
raw_fs_combo.obs['new_cellRanger_louvain'] = bus_fs_combo.obs['new_cellRanger_louvain']
raw_fs_combo.uns["dendrogram_new_cellRanger_louvain"] = bus_fs_combo.uns["dendrogram_new_cellRanger_louvain"]
raw_fs_combo.uns['new_cellRanger_louvain_colors'] = bus_fs_combo.uns['new_cellRanger_louvain_colors']
#Log values in adata
toPlot = raw_fs_combo.copy()
sc.pp.log1p(toPlot)
#Plot supp neurpop machinery figure genes
#PAMs, PALs, Peptidases
topGenes = ['XLOC_005044','XLOC_043907','XLOC_036126','XLOC_014167' ,'XLOC_036675',
'XLOC_033088','XLOC_033057',
'XLOC_029131','XLOC_019418', 'XLOC_038295', 'XLOC_012308', 'XLOC_016870',
'XLOC_045079','XLOC_023701',
'XLOC_012427', 'XLOC_012428', 'XLOC_006924', 'XLOC_002557',
'XLOC_036719', 'XLOC_036718', 'XLOC_038522', 'XLOC_010017' , 'XLOC_039385', 'XLOC_045849']
neurons_sigTop = neurons_raw[:,topGenes]
sc.set_figure_params(scanpy=True, fontsize=20)
sc.pl.heatmap(toPlot, topGenes , groupby='new_cellRanger_louvain', dendrogram=True,show_gene_labels=True,figsize = (50,10),
swap_axes=True,
cmap='PuBuGn',standard_scale='obs',save='neuronMarkersAtlas.pdf') #var_group_positions =var_groups ,var_group_labels = var_labels