Extraction of Differentially Expressed Genes Under Stimulation
In [ ]:
!date
Download Data¶
In [ ]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
In [ ]:
#Import raw, unclustered stimulation data
download_file('10.22002/D1.1814','.gz')
Out[ ]:
In [ ]:
#Import previously saved, clustered, & filtered stimulation data
download_file('10.22002/D1.1821','.gz')
Out[ ]:
In [ ]:
#Import merged data with knn clusters
download_file('10.22002/D1.1823','.gz')
Out[ ]:
In [ ]:
#Human ortholog annotations
download_file('10.22002/D1.1819','.gz')
#Panther annotations
download_file('10.22002/D1.1820','.gz')
#GO Terms
download_file('10.22002/D1.1822','.gz')
Out[ ]:
In [ ]:
#Previously saved DeSeq2 results (perturbed genes in DI and KCl conditions)
download_file('10.22002/D1.1818','.gz')
Out[ ]:
In [ ]:
!gunzip *.gz
In [ ]:
!pip install --quiet anndata
!pip install --quiet scanpy==1.6.0
!pip install --quiet louvain
In [ ]:
!pip3 install --quiet rpy2
Import Packages¶
In [ ]:
import scanpy as sc
import anndata
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse
import scipy.io as sio
import seaborn as sns
import random
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
sc.set_figure_params(dpi=125)
%load_ext rpy2.ipython
In [ ]:
#Read in annotations
from io import StringIO
hg_ortho_df = pd.read_csv(StringIO(''.join(l.replace('|', '\t') for l in open('D1.1819'))),
sep="\t",header=None,skiprows=[0,1,2,3])
hg_ortho_df[['XLOC','TCONS']] = hg_ortho_df[13].str.split(expand=True)
hg_ortho_df[['Gene','gi']] = hg_ortho_df[3].str.split(expand=True)
hg_ortho_df['Description']= hg_ortho_df[11]
panther_df = pd.read_csv('D1.1820',
sep="\t",header=None) #skiprows=[0,1,2,3]
goTerm_df = pd.read_csv('D1.1822',
sep=" ",header=None) #skiprows=[0,1,2,3]
Run DeSeq2 Analysis for Stimulation Perturbations (KCL and DI water)¶
Set up data
In [ ]:
#Remove clusters with < 10 cells per condition
bus_stim = anndata.read("D1.1814")
#Read in previously saved/clustered data with low count cells removed
bus_stim_clus = anndata.read("D1.1821")
bus_stim = bus_stim[bus_stim_clus.obs_names,]
#bus_fs_raw.obs['orgID'] = bus_fs_clus.obs['orgID']
bus_stim.obs['condition'] = bus_stim_clus.obs['condition']
bus_stim.obs['cellRanger_louvain'] = bus_stim_clus.obs['cellRanger_louvain']
bus_stim
#clusSize
Out[ ]:
In [ ]:
def clusToKeep(bus_fs_clus):
keep = []
clusSize = {}
for i in np.unique(bus_fs_clus.obs['cellRanger_louvain']):
cells = bus_fs_clus[bus_fs_clus.obs['cellRanger_louvain'].isin([i])]
kcl_cells = len(cells[cells.obs['condition']=='KCl'].obs_names)
di_cells = len(cells[cells.obs['condition']=='DI'].obs_names)
sw_cells = len(cells[cells.obs['condition']=='SW'].obs_names)
min_cells = np.min([kcl_cells,di_cells,sw_cells])
if min_cells > 10:
keep += [i]
#clusSize[i] = min_cells
return keep
In [ ]:
#Subsample from full dataset, across each cluster
def getSampled_Cluster(bus_fs_clus,bus_fs_raw,keep):
subSample = 150 #100
cellNames = np.array(bus_fs_clus.obs_names)
kcl = np.array(list(bus_fs_clus.obs['condition'] == 'KCl'))
di = np.array(list(bus_fs_clus.obs['condition'] == 'DI'))
sw = np.array(list(bus_fs_clus.obs['condition'] == 'SW'))
allCells = []
for i in keep:
#subSample = clusSize[i]
cells = np.array(list(bus_fs_clus.obs['cellRanger_louvain'].isin([i])))
kcl_cells = list(np.where(kcl & cells)[0])
di_cells = list(np.where(di & cells)[0])
sw_cells = list(np.where(sw & cells)[0])
#Take all cells if < subSample
if len(kcl_cells) >= subSample:
kcl_choice = random.sample(kcl_cells,subSample)
else:
kcl_choice = kcl_cells
if len(di_cells) >= subSample:
di_choice = random.sample(di_cells,subSample)
else:
di_choice = di_cells
if len(sw_cells) >= subSample:
sw_choice = random.sample(sw_cells,subSample)
else:
sw_choice = sw_cells
pos = list(kcl_choice)+list(di_choice)+list(sw_choice)
#print(len(pos))
allCells += list(cellNames[pos])
sub_raw = bus_fs_raw[allCells,:]
return sub_raw
Create subsampled dataset of cells x genes (subsamples across cell types)
In [ ]:
#For full dataset don't filter by highly variable
keep = clusToKeep(bus_stim_clus)
sub_raw = getSampled_Cluster(bus_stim_clus,bus_stim,keep)
sub_raw_copy = sub_raw.copy()
sc.pp.filter_cells(sub_raw, min_counts=1)
sc.pp.filter_genes(sub_raw, min_counts=1)
sub_raw
# sc.pp.normalize_per_cell(sub_raw_copy, counts_per_cell_after=1e4)
# sub_raw_copy.raw = sc.pp.log1p(sub_raw_copy, copy=True)
# sc.pp.highly_variable_genes(sub_raw_copy,n_top_genes=5000) #This is just a small example, for full data used all nonzero genes
# sub_raw = sub_raw[:,sub_raw_copy.var['highly_variable']]
Out[ ]:
In [ ]:
#Instantiate dataframe with gene names, convert to R-compatible format
def makeDF_forR(sub_raw):
fullDF = pd.DataFrame(scipy.sparse.csr_matrix.toarray(sub_raw.X).T, index = sub_raw.var_names.tolist(), columns= sub_raw.obs_names.tolist())
conds = sub_raw.obs['condition'].tolist()
#ids = sub_jelly4Raw.obs['orgID'].tolist()
clus = sub_raw.obs['cellRanger_louvain'].tolist()
reps = np.repeat(0,len(sub_raw.obs_names))
length = len(sub_raw[sub_raw.obs['condition'] == 'KCl'].obs_names)
reps[sub_raw.obs['condition'] == 'KCl'] = range(1,length+1)
length = len(sub_raw[sub_raw.obs['condition'] == 'DI'].obs_names)
reps[sub_raw.obs['condition'] == 'DI'] = range(1,length+1)
length = len(sub_raw[sub_raw.obs['condition'] == 'SW'].obs_names)
reps[sub_raw.obs['condition'] == 'SW'] = range(1,length+1)
sampleDF = pd.DataFrame({'cell_ID': fullDF.columns}) \
.assign(condition = conds) \
.assign(replicate = reps) \
.assign(cluster = clus)
sampleDF.index = sampleDF.cell_ID
sampleDF.head()
fullDF.to_csv('fullDF.csv')
sampleDF.to_csv('sampleDF.csv')
Read data into R and run DeSeq2 Models¶
In [ ]:
makeDF_forR(sub_raw)
In [ ]:
%%R
fullDF <- read.csv(file = 'fullDF.csv')
sampleDF <- read.csv(file = 'sampleDF.csv')
head(sampleDF)
In [ ]:
%%R
rownames(sampleDF) <- sampleDF$cell_ID
#Replace '.' in cell barcodes with '-'
rownames(fullDF) <- fullDF$X
colnames(fullDF) <- gsub("\\.", "-", colnames(fullDF))
fullDF <- subset(fullDF, select = -c(X) )
#head(fullDF)
sampleDF <- subset(sampleDF, select = -c(cell_ID.1) )
head(sampleDF)
sampleDF$condition <- factor(sampleDF$condition)
In [ ]:
%%R
head(sampleDF)
In [ ]:
%%R
install.packages("BiocManager")
BiocManager::install(version = "3.10")
In [ ]:
!sudo apt-get update
!sudo apt-get install libxml2-dev
!sudo apt-get install r-cran-xml
!sudo apt-get install libcurl4-openssl-dev
In [ ]:
%%R
#install.packages("DESeq2",repos = "http://cran.us.r-project.org")
BiocManager::install("DESeq2")
In [ ]:
#Make output directory
!mkdir kallistoDEAnalysis_Stim
In [ ]:
%%R
install.packages("DESeq2",repos = "http://cran.us.r-project.org")
library("DESeq2")
#library("apeglm")
#library(Rcpp)
#.libPaths()
clusters <- unique(sampleDF$cluster)
Genes <- c()
Cluster <- c()
Condition <- c()
padj <- c()
log2FC <- c()
for (i in clusters){
indices = which(sampleDF$cluster == i)
subset = fullDF[,indices]
subset_meta = subset(sampleDF,cluster == i)
dds <- DESeqDataSetFromMatrix(countData = subset, colData = subset_meta, design= ~replicate + condition)
#Set control condition
dds$condition <- relevel(dds$condition, ref = 'SW')
dds <- DESeq(dds,test="LRT", reduced=~replicate, sfType="poscounts", useT=TRUE, minmu=1e-6,
minReplicatesForReplace=Inf,betaPrior = FALSE)#parallel = TRUE
fc = 1 #usually fold change cutoff
#SW v KCl results
res <- results(dds,alpha=0.05,name="condition_KCl_vs_SW")
resLFC <- res
resLFC <- na.omit(resLFC)
resOrdered <- resLFC[resLFC$padj < .05,]
#Keep log2 fold changes < -1 or > 1
resOrdered <- resOrdered[abs(resOrdered$log2FoldChange) > fc,]
outcomes <- resOrdered[order(resOrdered$padj),]
Genes <- c(Genes,row.names(outcomes))
Cluster <- c(Cluster,rep(i,length(row.names(outcomes))))
Condition <- c(Condition,rep('KCl',length(row.names(outcomes))))
padj <- c(padj,outcomes$padj)
log2FC <- c(log2FC,outcomes$log2FoldChange)
#SW v DI results
res <- results(dds,alpha=0.05,name="condition_DI_vs_SW")
resLFC <- res
resLFC <- na.omit(resLFC)
resOrdered <- resLFC[resLFC$padj < .05,]
#Keep log2 fold changes < -1 or > 1
resOrdered <- resOrdered[abs(resOrdered$log2FoldChange) > fc,]
outcomes <- resOrdered[order(resOrdered$padj),]
Genes <- c(Genes,row.names(outcomes))
Cluster <- c(Cluster,rep(i,length(row.names(outcomes))))
Condition <- c(Condition,rep('DI',length(row.names(outcomes))))
padj <- c(padj,outcomes$padj)
log2FC <- c(log2FC,outcomes$log2FoldChange)
}
deGenesDF <- data.frame(matrix(ncol = 6, nrow = length(Genes)))
names(deGenesDF) <- c("Genes", "Cluster", "Condition","padj","padjClus","log2FC")
deGenesDF$Genes <- Genes
deGenesDF$Cluster <- Cluster
deGenesDF$Condition <- Condition
deGenesDF$padj <- padj
deGenesDF$padjClus <- padj*length(unique(Cluster))
deGenesDF$log2FC <- log2FC
write.csv(deGenesDF,'./kallistoDEAnalysis_Stim/deSeq2_deGenesDF_log2FCof1_singleCellReplicates_noShrinkage_subSample.csv')
head(deGenesDF)
Read in and Plot Previously Saved DeSeq2 Results¶
In [ ]:
%%R
install.packages("UpSetR",repos = "http://cran.us.r-project.org")
Create UpSet plot for KCl perturbed cells
In [ ]:
%%R -w 20 -h 20 --units in -r 500
#install.packages("UpSetR",repos = "http://cran.us.r-project.org")
#Cite http://people.seas.harvard.edu/~alex/papers/2014_infovis_upset.pdf
library("UpSetR")
deGenesDF <- read.csv(file = 'D1.1818') #'./kallistoDEAnalysis_Stim/deSeq2_deGenesDF_log2FCof1_singleCellReplicates_noShrinkage_subSample.csv'
#Use Bonferronni correction across clusters to filter genes
deGenesDF_toPlot = subset(deGenesDF,padjClus < .05)
kcl_toPlot = subset(deGenesDF_toPlot,Condition =='KCl')
di_toPlot = subset(deGenesDF_toPlot,Condition =='DI')
#deGenesDF_toPlot = subset(deGenesDF_toPlot,abs(log2FC) < 10) #Remove genes from plot that are inflated by zero expression in other cells
# Create empty list to store vectors
vecsToPlot <- list()
#Plot UpSet for KCl-perturbed genes
clusters = unique(kcl_toPlot$Cluster)
for (i in 1:length(clusters)){
subset = subset(kcl_toPlot,Cluster == clusters[i])
vecsToPlot[[i]] <- unique(subset$Genes)
}
names(vecsToPlot) <- clusters
upset(fromList(vecsToPlot), sets = as.character(clusters),nintersects= NA,order.by = "freq",
mainbar.y.label='Number of Genes in Intersection',
sets.x.label = 'Number of Perturbed Genes',
text.scale = c(3, 3, 3, 3, 3, 1.3),
show.numbers = "no",
point.size = 2.8,
mb.ratio= c(0.5, 0.5),
queries = list(list(query = intersects, params = list("18"), color = "firebrick",active = T),
list(query = intersects, params = list("19"), color = "firebrick",active = T),
list(query = intersects, params = list("22"), color = "firebrick",active = T),
list(query = intersects, params = list("23"), color = "firebrick",active = T),
list(query = intersects, params = list("17"), color = "firebrick",active = T),
list(query = intersects, params = list("14"), color = "firebrick",active = T),
list(query = intersects, params = list("12"), color = "firebrick",active = T),
list(query = intersects, params = list("9"), color = "firebrick",active = T),
list(query = intersects, params = list("6"), color = "firebrick",active = T),
list(query = intersects, params = list("15"), color = "firebrick",active = T),
list(query = intersects, params = list("10"), color = "firebrick",active = T),
list(query = intersects, params = list("13"), color = "firebrick",active = T),
list(query = intersects, params = list("16"), color = "firebrick",active = T),
list(query = intersects, params = list("11"), color = "firebrick",active = T),
list(query = intersects, params = list("7"), color = "firebrick",active = T),
list(query = intersects, params = list("4"), color = "firebrick",active = T),
list(query = intersects, params = list("3"), color = "firebrick",active = T),
list(query = intersects, params = list("5"), color = "firebrick",active = T),
list(query = intersects, params = list("25"), color = "firebrick",active = T),
list(query = intersects, params = list("0"), color = "firebrick",active = T),
list(query = intersects, params = list("2"), color = "firebrick",active = T))) #Add queries
Create UpSet plot for DI perturbed cells
In [ ]:
%%R -w 20 -h 20 --units in -r 500
# Create empty list to store vectors
vecsToPlot <- list()
clusters = unique(di_toPlot$Cluster)
for (i in 1:length(clusters)){
subset = subset(di_toPlot,Cluster == clusters[i])
vecsToPlot[[i]] <- unique(subset$Genes)
}
names(vecsToPlot) <- clusters
upset(fromList(vecsToPlot), sets = as.character(clusters),nintersects= NA,order.by = "freq",
mainbar.y.label='Number of Genes in Intersection',
sets.x.label = 'Number of Perturbed Genes',
text.scale = c(3, 3, 3, 3, 3, 1.3),
show.numbers = "no",
point.size = 2.8,
mb.ratio= c(0.5, 0.5),
queries = list(list(query = intersects, params = list("24"), color = "firebrick",active = T),
list(query = intersects, params = list("19"), color = "firebrick",active = T),
list(query = intersects, params = list("11"), color = "firebrick",active = T),
list(query = intersects, params = list("22"), color = "firebrick",active = T),
list(query = intersects, params = list("16"), color = "firebrick",active = T),
list(query = intersects, params = list("14"), color = "firebrick",active = T),
list(query = intersects, params = list("9"), color = "firebrick",active = T),
list(query = intersects, params = list("6"), color = "firebrick",active = T),
list(query = intersects, params = list("17"), color = "firebrick",active = T),
list(query = intersects, params = list("7"), color = "firebrick",active = T),
list(query = intersects, params = list("15"), color = "firebrick",active = T),
list(query = intersects, params = list("23"), color = "firebrick",active = T),
list(query = intersects, params = list("4"), color = "firebrick",active = T),
list(query = intersects, params = list("2"), color = "firebrick",active = T),
list(query = intersects, params = list("25"), color = "firebrick",active = T),
list(query = intersects, params = list("3"), color = "firebrick",active = T),
list(query = intersects, params = list("0"), color = "firebrick",active = T),
list(query = intersects, params = list("5"), color = "firebrick",active = T)))
#Add queries
In [ ]:
deseq_df = pd.read_csv('D1.1818',sep=",") #./kallistoDEAnalysis_Stim/deSeq2_deGenesDF_log2FCof1_singleCellReplicates_noShrinkage_subSample.csv
deseq_df.head()
Out[ ]:
How gene annotations are added
In [ ]:
orthoGene = []
orthoDescr = []
pantherNum = []
pantherDescr = []
goTerms = []
for g in deseq_df.Genes:
sub_df = hg_ortho_df[hg_ortho_df.XLOC.isin([g])]
panth_df = panther_df[panther_df[0].isin([g])]
go_df = goTerm_df[goTerm_df[0].isin([g])]
if len(sub_df) > 0:
#Save first result for gene/description
orthoGene += [list(sub_df.Gene)[0]]
orthoDescr += [list(sub_df.Description)[0]]
else:
orthoGene += ['NA']
orthoDescr += ['NA']
if len(panth_df) > 0:
#Save first result for gene/description
pantherNum += [list(panth_df[1])]
pantherDescr += [list(panth_df[2])]
else:
pantherNum += ['NA']
pantherDescr += ['NA']
if len(go_df) > 0:
#Save first result for gene/description
goTerms += [list(go_df[1])]
else:
goTerms += ['NA']
deseq_df['orthoGene'] = orthoGene
deseq_df['orthoDescr'] = orthoDescr
deseq_df['pantherID'] = pantherNum
deseq_df['pantherDescr'] = pantherDescr
deseq_df['goTerms'] = goTerms
deseq_df.head()
Out[ ]:
In [ ]:
deseq_df.to_csv('stim_deSeq2_deGenesDF_log2FCof1_singleCellReplicates_noShrinkage_subSample_annotations.csv')
# DOI: D1.1818
In [ ]: