mirror of
https://github.com/haniffalab/scRNA-seq_analysis.git
synced 2024-10-23 08:29:24 -07:00
scRNA-seq_analysis
This commit is contained in:
commit
82cc2d191e
188 changed files with 146184 additions and 0 deletions
80
pipelines/16_train_doublets_classifier/apply_doublet_classifier.R
Executable file
80
pipelines/16_train_doublets_classifier/apply_doublet_classifier.R
Executable file
|
|
@ -0,0 +1,80 @@
|
|||
# Command-line handling and environment set-up for apply_doublet_classifier.R.
#
# The script takes ONE logical argument: a string of R assignments separated
# by ";" that define seurat.addr, save.at and doublet.svm, for example
#   "seurat.addr = 'in.RDS'; save.at = 'out.RDS'; doublet.svm = 'svm_folder'"
# The shell may hand this over as several whitespace-separated tokens, so the
# tokens are glued back together before being split on ";".
args = commandArgs(trailingOnly=TRUE)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))

# One "<name>.arg = args[i]" line per expected argument. The text is both
# evaluated (to bind each raw assignment string) and parsed (to learn which
# variable names must exist afterwards).
arguments.list = "
seurat.addr.arg = args[1]
save.at.arg = args[2]
doublet.svm.arg = args[3]
"

expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]

if(length(args) != length(expected_arguments)){
  stop('This pipeline requires these parameters: seurat.addr ; save.at (name of RDS file where processed data are saved), doublet.svm (folder name where singlet/doublet svm classifier for given organ is stored)')
}

eval(parse(text = arguments.list))

# NOTE(review): each argument is evaluated as R code, so anything placed on
# the command line is executed. Only run this with trusted input.
for(argument in expected_arguments){
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  # Strip only the literal ".arg" suffix; an unescaped ".arg" would also match
  # any other character followed by "arg" anywhere in the name.
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}

# create required folders for output and work material
# Folder name: <pipeline folder without numeric prefix>_<seurat.addr>_<yymmddHHMMSS>
output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
output_folder = paste(output_folder, seurat.addr, sep = "_")
# format() gives the compact timestamp directly and does not depend on how
# as.character() renders a date-time (time-zone suffix, fractional seconds).
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive = TRUE so a missing ../../output does not make both calls fail.
dir.create(output_folder, recursive = TRUE)

output_folder_material = file.path(output_folder, "material")
dir.create(output_folder_material, recursive = TRUE)

# Input and output RDS files are resolved relative to the shared data folder.
seurat.addr = file.path("../../data", seurat.addr)

# NOTE(review): tool_addr, python.addr and Apply_Classifier_On_Seurat_Object
# are used later in this script but not defined in it; presumably they come
# from this sourced file - confirm.
source("../../tools/bunddle_utils.R")

save.at = file.path("../../data", save.at)

library(Seurat)
library(RColorBrewer)
library(dplyr)
library(plyr)
|
||||
###############################################################################
# Main section: read the Seurat object, store the classifier's call for every
# cell in the "doublets" metadata column, print a lanes-by-call table, write
# the annotated object to save.at and delete the temporary material folder.
print("Loading data ...")
seurat.obj <- readRDS(file = seurat.addr) # path derived from seurat.addr.arg
print("Data loaded.")

print("Identifying doublets")
doublet.calls <- Apply_Classifier_On_Seurat_Object(
  seurat.obj       = seurat.obj,
  classifier.fname = doublet.svm, # doublet.svm.arg
  tool_addr        = tool_addr,
  python.addr      = python.addr
)
seurat.obj@meta.data$doublets <- doublet.calls

print("Doublets and singlets: ")
calls.by.lane <- table(seurat.obj@meta.data$lanes, seurat.obj@meta.data$doublets)
print(calls.by.lane)

print("Saving data")
saveRDS(object = seurat.obj, file = save.at) # path derived from save.at.arg

# The material folder only holds intermediate files; remove it once done.
unlink(output_folder_material, recursive = TRUE, force = TRUE)
|
||||
16
pipelines/16_train_doublets_classifier/apply_doublet_classifier.sh
Executable file
16
pipelines/16_train_doublets_classifier/apply_doublet_classifier.sh
Executable file
|
|
@ -0,0 +1,16 @@
|
|||
#!/bin/bash

#$ -cwd
#$ -N apply_doublets_SVM
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=100G

# Grid Engine wrapper around apply_doublet_classifier.R.
# Takes exactly one argument: the ";"-separated assignment string that the R
# script parses (seurat.addr, save.at, doublet.svm).
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters" >&2
    echo "Usage: apply_doublet_classifier.sh \"seurat.addr = '...'; save.at = '...'; doublet.svm = '...'\"" >&2
    exit 1
fi

# Quote the argument so the shell neither word-splits it nor expands glob
# characters inside it; the R script receives the string unchanged.
Rscript apply_doublet_classifier.R "$1"
status=$?

echo "End on $(date)"

# Propagate the R exit status so a failed run is not reported as success.
exit "$status"
|
||||
54
pipelines/16_train_doublets_classifier/svm.py
Executable file
54
pipelines/16_train_doublets_classifier/svm.py
Executable file
|
|
@ -0,0 +1,54 @@
|
|||
"""Train the singlet/doublet SVM classifier on data exported by the R pipeline.

Usage: ``python svm.py <folder>``

``<folder>`` must contain ``data.csv`` (first column is used as the row index)
and ``labels.csv`` (first column holds the class labels). The script writes
into the same folder:

* ``pca.pickle``                - the fitted PCA transform
* ``model.pickle``              - the fitted ``GridSearchCV`` wrapping the SVC
* ``classification_report.txt`` - metrics computed on the held-out split
* ``confusion_matrix.csv``      - confusion matrix on the held-out split
"""
import pickle
import sys
from os.path import join

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def main(save_to):
    """Fit PCA + SVM on the CSV files in *save_to* and store the results there."""
    print("Loading data ...")
    X = pd.read_csv(join(save_to, "data.csv"), sep=",", index_col=0).values
    y = pd.read_csv(join(save_to, "labels.csv")).values[:, 0]

    # A fractional n_components asks PCA to keep as many components as needed
    # to explain that share of the variance (here 80%).
    # NOTE(review): the PCA is fitted on all rows, before the train/test split,
    # so the saved transform has seen the evaluation rows. Kept as in the
    # original because the pickled PCA is the one shipped with the model.
    pca = PCA(n_components=.8)
    X = pca.fit_transform(X)

    with open(join(save_to, "pca.pickle"), "wb") as pca_file:
        pickle.dump(pca, pca_file)

    print(X.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=19
    )

    params = {"C": [1, 10, 100, 300]}

    print("Creating the model and fitting the data ...")
    model = GridSearchCV(SVC(probability=False, kernel="rbf"), params, cv=3)
    model.fit(X_train, y_train)

    # Evaluate on the held-out third only. Predicting on the full matrix would
    # score the model mostly on rows it was trained on and overstate accuracy.
    print("Testing ...")
    pred = model.predict(X_test)
    cls_report = classification_report(
        y_test, pred, labels=model.classes_, target_names=model.classes_
    )
    print(cls_report)
    with open(join(save_to, "classification_report.txt"), "w") as cl_f:
        cl_f.write(cls_report)

    print("Saving model and confusion matrix to disk ...")
    # Passing labels pins the row/column order to model.classes_, which is the
    # order used for the column names below.
    cnf_matrix = confusion_matrix(y_test, pred, labels=model.classes_)
    df = pd.DataFrame(cnf_matrix)
    df.columns = model.classes_
    df.to_csv(join(save_to, "confusion_matrix.csv"))

    with open(join(save_to, "model.pickle"), "wb") as model_file:
        pickle.dump(model, model_file)

    print(model.best_params_)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("Usage: svm.py <folder containing data.csv and labels.csv>")
    main(sys.argv[1])
|
||||
119
pipelines/16_train_doublets_classifier/train_doublets_SVM.R
Executable file
119
pipelines/16_train_doublets_classifier/train_doublets_SVM.R
Executable file
|
|
@ -0,0 +1,119 @@
|
|||
# Command-line handling for train_doublets_SVM.R.
#
# The script takes ONE logical argument: a string of R assignments separated
# by ";" that define seurat.addr and save.to, for example
#   "seurat.addr = 'in.RDS'; save.to = 'classifier_folder'"
# The shell may hand this over as several whitespace-separated tokens, so the
# tokens are glued back together before being split on ";".
args = commandArgs(trailingOnly=TRUE)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))

# One "<name>.arg = args[i]" line per expected argument. The text is both
# evaluated (to bind each raw assignment string) and parsed (to learn which
# variable names must exist afterwards).
arguments.list = "
seurat.addr.arg = args[1]
save.to.arg = args[2]
"

expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]

if(length(args) != length(expected_arguments)){
  # Build the usage text from the expected names. The format string needs a
  # value for every %s, otherwise sprintf itself fails with
  # "too few arguments" and the user never sees the usage message.
  expected.names = sub(pattern="\\.arg.*$", replacement="",
                       x=gsub(pattern=" ", replacement="", x=expected_arguments))
  stop(sprintf('This pipeline requires %s parameters: %s',
               as.character(length(expected_arguments)),
               paste(expected.names, collapse = " ; ")))
}

eval(parse(text = arguments.list))

# NOTE(review): each argument is evaluated as R code, so anything placed on
# the command line is executed. Only run this with trusted input.
for(argument in expected_arguments){
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  # Strip only the literal ".arg" suffix; an unescaped ".arg" would also match
  # any other character followed by "arg" anywhere in the name.
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
|
||||
|
||||
# Resolve the input RDS file relative to the shared data folder.
seurat.addr = file.path("../../data", seurat.addr)

# NOTE(review): python.addr is used later in this script but not defined in
# it; presumably it comes from this sourced file - confirm.
source("../../tools/bunddle_utils.R")

library(Seurat)
library(RColorBrewer)
library(plyr)
library(dplyr)

#######################################################################################################

# load data
print("loading data ... ")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")

# The classifier and its feature list are written under ../../resources/<save.to>.
save.to = file.path("../../resources", save.to)

# Create missing parent folders too, and stay quiet when the folder already
# exists (re-training into the same folder). Stop here if the folder is still
# missing, instead of failing later when the first result is written.
dir.create(save.to, recursive = TRUE, showWarnings = FALSE)
if (!dir.exists(save.to)){
  stop(sprintf("Could not create output folder %s", save.to))
}
|
||||
|
||||
# for HdCA samples:
#seurat.obj@meta.data$lanes <- seurat.obj@meta.data$biomaterial_id

# mix cells within lanes
# For every lane, draw two index vectors (with replacement) of the lane's own
# size; pairing them column by column below gives the simulated doublets, so
# both cells of a doublet always come from the same lane.
lanes <- as.vector(unique(seurat.obj@meta.data$lanes))
mixersA <- c(); mixersB <- c();
for (lane in lanes){
  print(lane)
  lane.indices <- which(seurat.obj@meta.data$lanes == lane)
  n.lane <- length(lane.indices)
  # Index through sample.int() rather than sample(lane.indices, ...): when a
  # lane has a single cell at position k, sample(k, ...) draws from 1:k and
  # would pull in cells from other lanes. For lanes with more than one cell
  # this draws exactly the same values as sample() did.
  mixersA <- c(mixersA, lane.indices[sample.int(n.lane, size=n.lane, replace=TRUE)])
  mixersB <- c(mixersB, lane.indices[sample.int(n.lane, size=n.lane, replace=TRUE)])
}

# create a matrix of raw data for singlets - these are the cell collected
# NOTE(review): the mixer indices are row positions in meta.data but are used
# as column positions here; this assumes meta.data rows are in the same order
# as colnames(seurat.obj@data) - confirm.
singlets <- seurat.obj@raw.data[rownames(seurat.obj@data), colnames(seurat.obj@data)]

# create doublets: 1) select from the singlets matrix 2 other matrices using the mixersA and mixersB; 2) add these 2 matrices
doublets <- singlets[, mixersA] + singlets[, mixersB]

# make the cell names unique
colnames(singlets) <- paste("Singlet", seq_len(ncol(singlets)), sep = "_")
colnames(doublets) <- paste("Doublets", seq_len(ncol(doublets)), sep = "_")

# merge the singlets and doublets sparse matrices
# base cbind() dispatches to the Matrix methods and replaces the
# deprecated/defunct Matrix::cBind(); the merged matrix is the same.
merged.matrix <- cbind(singlets, doublets)

# free up some space
rm(seurat.obj, lanes, mixersA, mixersB)

# create label vector
# One label per column of merged.matrix, in the same order. Counting each part
# separately keeps labels aligned even if the two parts differ in size.
labels <- rep(c("Singlet", "Doublet"), times = c(ncol(singlets), ncol(doublets)))
labels <- data.frame(Labels = labels)
|
||||
|
||||
# create a seurat object from singlets and doublets
seurat.obj <- CreateSeuratObject(raw.data = merged.matrix, project="Doublet_Classifier", min.cells=0, min.genes=0)

print("Checkpoint 2")
# free up some space
rm(singlets, doublets, merged.matrix)
print("Checkpoint 3")
# normalize the data
seurat.obj <- NormalizeData(object = seurat.obj, normalization.method = "LogNormalize", scale.factor = 10000)
# compute variable genes
seurat.obj <- FindVariableGenes(object = seurat.obj, mean.function = ExpMean,
                                dispersion.function = LogVMR, x.low.cutoff = .0125,
                                x.high.cutoff = 3, y.cutoff = .625)
# save variable genes in the same folder as the classifier
classifier.features <- seurat.obj@var.genes
saveRDS(classifier.features, file.path(save.to, "feature_genes.RDS"))
print("Checkpoint 4")

# One random permutation, applied to both labels and data so rows stay paired.
shuffle.key <- sample(x=1:dim(labels)[1], size=dim(labels)[1], replace=F)

# save the labels and normalized data to disk
write.csv(labels[shuffle.key,], file.path(save.to, "labels.csv"), row.names = F)
# Cells become rows and the variable genes become columns.
x.data <- as.data.frame(t(as.matrix(seurat.obj@data[classifier.features, ])))[shuffle.key,]
print(dim(x.data))
write.csv(x.data, file.path(save.to, "data.csv"), row.names = T)

# Train the classifier with svm.py, which reads the two CSV files from save.to.
# shQuote() keeps a folder name containing spaces or shell characters intact.
command = sprintf("%s svm.py %s", python.addr, shQuote(save.to))
status = system(command, wait = T)
if (status != 0){
  # Keep data.csv and labels.csv so the failed training run can be inspected
  # or repeated, and do not report success.
  stop(sprintf("svm.py failed with exit status %s; data.csv and labels.csv were kept in %s",
               status, save.to))
}

# Remove the temporary training inputs. Rplots.pdf is only present if a plot
# device was opened, so delete just the files that exist to avoid a warning.
temp.files = c(file.path(save.to, "data.csv"), file.path(save.to, "labels.csv"), './Rplots.pdf')
invisible(file.remove(temp.files[file.exists(temp.files)]))

print("Ended beautifully ... ")
|
||||
16
pipelines/16_train_doublets_classifier/train_doublets_SVM.sh
Executable file
16
pipelines/16_train_doublets_classifier/train_doublets_SVM.sh
Executable file
|
|
@ -0,0 +1,16 @@
|
|||
#!/bin/bash

#$ -cwd
#$ -N train_doublets_SVM
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=300G

# Grid Engine wrapper around train_doublets_SVM.R.
# Takes exactly one argument: the ";"-separated assignment string that the R
# script parses (seurat.addr, save.to).
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters" >&2
    echo "Usage: train_doublets_SVM.sh \"seurat.addr = '...'; save.to = '...'\"" >&2
    exit 1
fi

# Quote the argument so the shell neither word-splits it nor expands glob
# characters inside it; the R script receives the string unchanged.
Rscript train_doublets_SVM.R "$1"
status=$?

echo "End on $(date)"

# Propagate the R exit status so a failed run is not reported as success.
exit "$status"
|
||||
Loading…
Add table
Add a link
Reference in a new issue