scRNA-seq_analysis

This commit is contained in:
veghp 2019-07-08 12:22:01 +01:00
commit 82cc2d191e
188 changed files with 146184 additions and 0 deletions

View file

@ -0,0 +1,80 @@
# Command-line interface.
# The script receives ";"-separated pieces on the command line. Each piece is
# evaluated as R code further down and is expected to define one of the
# variables seurat.addr, save.at and doublet.svm
# (e.g. "seurat.addr='x.RDS';save.at='y.RDS';doublet.svm='folder'").
# All trailing arguments are glued back together before splitting on ";".
args <- commandArgs(trailingOnly = TRUE)
args <- paste(args, collapse = "")
args <- unlist(strsplit(args, ";"))
arguments.list <- "
seurat.addr.arg = args[1]
save.at.arg = args[2]
doublet.svm.arg = args[3]
"
{
  # One entry per expected argument, taken from the non-empty lines above.
  expected_arguments <- unlist(strsplit(arguments.list, "\n"))
  expected_arguments <- expected_arguments[expected_arguments != ""]
  if (length(args) != length(expected_arguments)) {
    stop('This pipeline requires these parameters: seurat.addr ; save.at (name of RDS file where processed data are saved), doublet.svm (folder name where singlet/doublet svm classifier for given organ is stored)')
  }

  # Bind seurat.addr.arg, save.at.arg and doublet.svm.arg to the raw pieces.
  eval(parse(text = arguments.list))

  # NOTE(review): every command-line piece is executed verbatim as R code;
  # only run this script with trusted input.
  for (n in seq_along(expected_arguments)) {
    argument <- expected_arguments[n]
    argument <- gsub(pattern = " ", replacement = "", x = argument, fixed = TRUE)
    argument.name <- unlist(strsplit(argument, "=", fixed = TRUE))[1]
    # Strip only the literal ".arg" suffix (the dot is escaped on purpose).
    variable.name <- sub(pattern = "\\.arg$", replacement = "", x = argument.name)
    argument.content <- get(argument.name)
    eval(parse(text = argument.content))
    if (!exists(variable.name)) {
      stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
    }
  }

  # create required folders for output and work material
  # Folder name: <working dir name without numeric prefix>_<seurat.addr>_<yymmddHHMMSS>
  output_folder <- gsub(pattern = "^\\d+_", replacement = "", x = basename(getwd()))
  output_folder <- paste(output_folder, seurat.addr, sep = "_")
  # An explicit format keeps the stamp at 12 digits regardless of time zone
  # label, midnight timestamps or fractional seconds.
  c.time <- format(Sys.time(), "%y%m%d%H%M%S")
  output_folder <- paste(output_folder, c.time, sep = "_")
  output_folder <- file.path("../../output", output_folder)
  # recursive = TRUE so a missing ../../output parent does not break the run.
  dir.create(output_folder, recursive = TRUE)
  output_folder_material <- file.path(output_folder, "material")
  dir.create(output_folder_material, recursive = TRUE)

  # Input and output live in the shared data folder; helpers come from the
  # project utility file.
  seurat.addr <- file.path("../../data", seurat.addr)
  source("../../tools/bunddle_utils.R")
  save.at <- file.path("../../data", save.at)

  # NOTE(review): plyr is attached after dplyr here, so plyr masks dplyr verbs
  # of the same name. The order is kept as-is because sourced helpers may rely
  # on it - confirm before changing.
  library(Seurat)
  library(RColorBrewer)
  library(dplyr)
  library(plyr)
}
###############################################################################
# Load data
print("Loading data ...")
seurat.obj <- readRDS(seurat.addr) # seurat.addr.arg
print("Data loaded.")

print("Identifying doublets")
# tool_addr and python.addr are not defined in this script; presumably they are
# provided by the sourced bunddle_utils.R - TODO confirm.
seurat.obj@meta.data$doublets <- Apply_Classifier_On_Seurat_Object(
  seurat.obj = seurat.obj,
  classifier.fname = doublet.svm, # doublet.svm.arg
  tool_addr = tool_addr,
  python.addr = python.addr)

print("Doublets and singlets: ")
# The per-lane breakdown is only diagnostic. Without a `lanes` column table()
# would raise an error and the classified object would never be saved, so fall
# back to the overall counts in that case.
if (is.null(seurat.obj@meta.data$lanes)) {
  print(table(seurat.obj@meta.data$doublets))
} else {
  print(table(seurat.obj@meta.data$lanes, seurat.obj@meta.data$doublets))
}

print("Saving data")
saveRDS(seurat.obj, save.at) # save.at.arg

# Remove the scratch folder created during setup.
unlink(output_folder_material, recursive = TRUE, force = TRUE)

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N apply_doublets_SVM
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=100G

# Grid Engine wrapper around apply_doublet_classifier.R.
# Takes exactly one argument: the ";"-separated argument string that is handed
# to the R script unchanged.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters"
    exit 1
fi

# Quote the argument so the shell neither word-splits it nor expands glob
# characters that may appear inside it.
Rscript apply_doublet_classifier.R "$1"
echo "End on $(date)"

View file

@ -0,0 +1,54 @@
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pickle
import sys
from os.path import join
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Train an RBF-kernel SVM on the feature matrix found in the folder given as
# the first command-line argument, and write the fitted PCA, the fitted model
# and the evaluation reports back into that same folder.
args = sys.argv
save_to = args[1]

print("Loading data ...")
# data.csv: first column is the row index, remaining columns are features.
X = pd.read_csv(join(save_to, "./data.csv"), sep = ",", index_col = 0).values
# labels.csv: only the first column is used as the class label.
y = pd.read_csv(join(save_to, "./labels.csv")).iloc[:, 0].values

# Keep as many principal components as needed to explain 80% of the variance.
# NOTE(review): the PCA is fitted on all rows before the train/test split, so
# the held-out rows influence the projection; kept as-is because the pickled
# PCA is an output of this script.
pca = PCA(n_components = .8)
X = pca.fit_transform(X)
with open(join(save_to, "pca.pickle"), "wb") as modelFile:
    print(modelFile)
    pickle.dump(pca, modelFile)
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=19)

params = {"C":[1, 10, 100, 300]}
print("Creating the model and fitting the data ...")
model = GridSearchCV(SVC(probability = False, kernel = "rbf"), params, cv=3)
model.fit(X_train, y_train)

print("Testing ...")
# Evaluate on the held-out split only; scoring rows the model was fitted on
# would overstate its accuracy.
pred = model.predict(X_test)
# labels= pins the class order so target names and matrix columns always line
# up with model.classes_.
cls_report = classification_report(y_test, pred, labels = model.classes_, target_names = model.classes_)
print(cls_report)
with open(join(save_to, "classification_report.txt"), "w") as cl_f:
    cl_f.write(cls_report)

print("Saving model and confusion matrix to disk ...")
cnf_matrix = confusion_matrix(y_test, pred, labels = model.classes_)
df = pd.DataFrame(cnf_matrix)
df.columns = model.classes_
df.to_csv(join(save_to, "confusion_matrix.csv"))
with open(join(save_to, "model.pickle"), "wb") as modelFile:
    pickle.dump(model, modelFile)
print(model.best_params_)

View file

@ -0,0 +1,119 @@
# Command-line interface.
# The script receives ";"-separated pieces on the command line. Each piece is
# evaluated as R code further down and is expected to define one of the
# variables seurat.addr and save.to
# (e.g. "seurat.addr='x.RDS';save.to='folder'").
# All trailing arguments are glued back together before splitting on ";".
args <- commandArgs(trailingOnly = TRUE)
args <- paste(args, collapse = "")
args <- unlist(strsplit(args, ";"))
arguments.list <- "
seurat.addr.arg = args[1]
save.to.arg = args[2]
"
# One entry per expected argument, taken from the non-empty lines above.
expected_arguments <- unlist(strsplit(arguments.list, "\n"))
expected_arguments <- expected_arguments[expected_arguments != ""]
if (length(args) != length(expected_arguments)) {
  # Name the expected parameters in the message. The previous sprintf() call
  # had a "%s" placeholder but no value, so it failed with "too few arguments"
  # instead of showing the usage hint.
  expected.names <- trimws(sub(pattern = "\\.arg.*$", replacement = "", x = expected_arguments))
  stop(sprintf("This pipeline requires %s parameters: %s",
               length(expected_arguments),
               paste(expected.names, collapse = " ; ")))
}

# Bind seurat.addr.arg and save.to.arg to the raw command-line pieces.
eval(parse(text = arguments.list))

# NOTE(review): every command-line piece is executed verbatim as R code;
# only run this script with trusted input.
for (n in seq_along(expected_arguments)) {
  argument <- expected_arguments[n]
  argument <- gsub(pattern = " ", replacement = "", x = argument, fixed = TRUE)
  argument.name <- unlist(strsplit(argument, "=", fixed = TRUE))[1]
  # Strip only the literal ".arg" suffix (the dot is escaped on purpose).
  variable.name <- sub(pattern = "\\.arg$", replacement = "", x = argument.name)
  argument.content <- get(argument.name)
  eval(parse(text = argument.content))
  if (!exists(variable.name)) {
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# Build a singlet/doublet training set from a Seurat object by summing the raw
# counts of randomly paired cells from the same lane, then hand the normalised
# variable-gene matrix to svm.py for training.
seurat.addr <- file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library(RColorBrewer)
library(plyr)
library(dplyr)
#######################################################################################################
# load data
print("loading data ... ")
seurat.obj <- readRDS(seurat.addr)
print("Data loaded.")
save.to <- file.path("../../resources", save.to)
if (!dir.exists(save.to)) {
  dir.create(save.to, recursive = TRUE)
}
# for HdCA samples:
#seurat.obj@meta.data$lanes <- seurat.obj@meta.data$biomaterial_id
# mix cells within lanes
lanes <- as.vector(unique(seurat.obj@meta.data$lanes))
if (length(lanes) == 0) {
  stop("seurat.obj@meta.data$lanes is missing or empty; cannot build synthetic doublets")
}
# Collect the per-lane draws in lists and flatten once at the end instead of
# growing two vectors on every iteration.
mixersA.by.lane <- vector("list", length(lanes))
mixersB.by.lane <- vector("list", length(lanes))
for (i in seq_along(lanes)) {
  lane <- lanes[i]
  print(lane)
  lane.indices <- which(seurat.obj@meta.data$lanes == lane)
  n.cells <- length(lane.indices)
  # Draw positions with sample.int() and index into lane.indices:
  # sample(x, ...) treats a single number k as 1:k, which would pick cells
  # from other lanes whenever a lane holds exactly one cell.
  mixersA.by.lane[[i]] <- lane.indices[sample.int(n.cells, size = n.cells, replace = TRUE)]
  mixersB.by.lane[[i]] <- lane.indices[sample.int(n.cells, size = n.cells, replace = TRUE)]
}
mixersA <- unlist(mixersA.by.lane)
mixersB <- unlist(mixersB.by.lane)
# create a matrix of raw data for singlets - these are the cell collected
# NOTE(review): mixersA/mixersB are row positions in meta.data and are used as
# column positions below; this assumes meta.data rows are in the same order as
# colnames(seurat.obj@data) - confirm.
singlets <- seurat.obj@raw.data[rownames(seurat.obj@data), colnames(seurat.obj@data)]
# create doublets: 1) select from the singlets matrix 2 other matrices using the mixersA and mixersB; 2) add these 2 matrices
doublets <- singlets[, mixersA] + singlets[, mixersB]
# make the cell names unique
colnames(singlets) <- paste("Singlet", seq_len(ncol(singlets)), sep = "_")
colnames(doublets) <- paste("Doublets", seq_len(ncol(doublets)), sep = "_")
# merge the singlets and doublets sparse matrices
# cbind() replaces the deprecated Matrix::cBind(); for Matrix objects it is
# the documented replacement.
merged.matrix <- cbind(singlets, doublets)
# free up some space
rm(seurat.obj, lanes, mixersA, mixersB, mixersA.by.lane, mixersB.by.lane)
# create label vector
# One synthetic doublet is made per singlet, so both halves have ncol(singlets) entries.
labels <- rep(c("Singlet", "Doublet"), each = ncol(singlets))
labels <- data.frame(Labels = labels)
# create a seurat object from singlets and doublets
seurat.obj <- CreateSeuratObject(raw.data = merged.matrix, project = "Doublet_Classifier", min.cells = 0, min.genes = 0)
print("Checkpoint 2")
# free up some space
rm(singlets, doublets, merged.matrix)
print("Checkpoint 3")
# normalize the data
seurat.obj <- NormalizeData(object = seurat.obj, normalization.method = "LogNormalize", scale.factor = 10000)
# compute variable genes
seurat.obj <- FindVariableGenes(object = seurat.obj, mean.function = ExpMean,
                                dispersion.function = LogVMR, x.low.cutoff = .0125,
                                x.high.cutoff = 3, y.cutoff = .625)
# save variable genes in the same folder as the classifier
classifier.features <- seurat.obj@var.genes
saveRDS(classifier.features, file.path(save.to, "feature_genes.RDS"))
print("Checkpoint 4")
# One permutation is applied to both labels and data so rows stay aligned.
shuffle.key <- sample.int(nrow(labels))
# save the labels and normalized data to disk
write.csv(labels[shuffle.key,], file.path(save.to, "labels.csv"), row.names = FALSE)
x.data <- as.data.frame(t(as.matrix(seurat.obj@data[classifier.features, ])))[shuffle.key,]
print(dim(x.data))
write.csv(x.data, file.path(save.to, "data.csv"), row.names = TRUE)
# python.addr is not defined in this script; presumably it is provided by the
# sourced bunddle_utils.R - TODO confirm.
# The folder is shell-quoted so unusual characters in its name survive.
command <- sprintf("%s svm.py %s", python.addr, shQuote(save.to))
svm.status <- system(command, wait = TRUE)
# Remove the temporary CSV exchange files (and any stray plot file) whether or
# not training succeeded.
file.remove(file.path(save.to, "data.csv"), file.path(save.to, "labels.csv"), './Rplots.pdf')
if (svm.status != 0) {
  stop(sprintf("svm.py exited with status %s", svm.status))
}
print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N train_doublets_SVM
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=300G

# Grid Engine wrapper around train_doublets_SVM.R.
# Takes exactly one argument: the ";"-separated argument string that is handed
# to the R script unchanged.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters"
    exit 1
fi

# Quote the argument so the shell neither word-splits it nor expands glob
# characters that may appear inside it.
Rscript train_doublets_SVM.R "$1"
echo "End on $(date)"