scRNA-seq_analysis

2024-10-23 08:29:24 -07:00 · 2019-07-08 12:22:01 +01:00 · 2019-07-08 12:22:01 +01:00 · 82cc2d191e
commit 82cc2d191e
188 changed files with 146184 additions and 0 deletions
--- a/pipelines/15_train_classifier/svm.py
+++ b/pipelines/15_train_classifier/svm.py
@ -0,0 +1,58 @@
+import pandas as pd
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+from sklearn.svm import SVC
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import train_test_split
+import pickle
+
+import sys
+args = sys.argv
+material_dir = args[1]
+output_dir = args[2]
+
+from os.path import join
+
+print("Loading data ...")
+X = pd.read_csv(join(material_dir, 'data.csv'), sep = ",", index_col = 0).values
+Y = pd.read_csv(join(material_dir, 'labels.csv')).values[:, 0].reshape(-1, 1).ravel()
+
+from sklearn.decomposition import PCA
+pca = PCA(n_components = .8)
+X = pca.fit_transform(X)
+
+modelFile = open(join(output_dir, "pca.pickle"), "wb")
+print(modelFile)
+modelFile.write(pickle.dumps(pca))
+modelFile.close()
+
+print("Splitting into training and test sets...")
+(X_train, X_test, y_train, y_test) = train_test_split(X, Y, test_size = .3, random_state = 42)
+
+params = {"C":[1e-6, 1e-3, .1, 1, 10, 100, 1000],
+          "gamma": [1e-6, 1e-3, .1, 1]}
+
+# established as the best paramaters in some other work
+params = {"C":[10], "gamma": [1e-3]}
+
+print("Creating the model and fitting the data ...")
+model = GridSearchCV(SVC(probability = False, kernel = "rbf"), params, cv=5)
+model.fit(X_train, y_train)
+
+print("Testing ...")
+pred = model.predict(X_test)
+cls_report = classification_report(y_test, pred, target_names = model.classes_)
+print(cls_report)
+with open(join(output_dir, 'classification_report.txt'), "w")  as cl_f:
+    cl_f.write(cls_report)
+
+print("Saving model and confusion matrix to disk ...")
+cnf_matrix = confusion_matrix(y_test, pred)
+df = pd.DataFrame(cnf_matrix)
+df.columns = model.classes_
+df.to_csv(join(output_dir, 'confusion_matrix.csv'))
+
+modelFile = open(join(output_dir, 'model.pickle'), "wb")
+modelFile.write(pickle.dumps(model))
+modelFile.close()
+
--- a/pipelines/15_train_classifier/train_classifier.R
+++ b/pipelines/15_train_classifier/train_classifier.R
@ -0,0 +1,107 @@
+# The command-line arguments are glued back together and split on ";" so
+# that each element of `args` holds one "name = value" R expression.
+args = commandArgs(trailingOnly=T)
+args = paste(args, collapse = "")
+args = unlist(strsplit(args, ";"))
+
+# Template mapping each positional argument to a holder variable. It is
+# evaluated below, and also parsed line by line to learn the expected names.
+arguments.list = "
+seurat.addr.arg  = args[1]
+marker.genes.addr = args[2]
+save.at          = args[3]
+classifier       = args[4]
+"
+
+expected_arguments = unlist(strsplit(arguments.list, "\n"))
+expected_arguments = expected_arguments[!(expected_arguments == "")]
+
+if(length(args) != length(expected_arguments)){
+  error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments)))
+  # keep only the variable name of each template line (text before "=",
+  # without spaces and without a trailing ".arg") for the error message
+  expected_names = gsub(pattern=" ", replacement="", x=sub(pattern="=.*$", replacement="", x=expected_arguments), fixed=TRUE)
+  expected_names = sub(pattern="\\.arg$", replacement="", x=expected_names)
+  stop(sprintf('%s:\n%s', error.msg, paste(expected_names, collapse = "\n")))
+}
+
+eval(parse(text = arguments.list))
+
+# NOTE: each argument is evaluated as R code (this is how e.g.
+# "seurat.addr = 'file.RDS'" defines seurat.addr), so the script must only be
+# run with trusted command-line input.
+for(n in seq_along(expected_arguments)){
+  argument = expected_arguments[n]
+  argument = gsub(pattern=" ", replacement="", x=argument, fixed=TRUE)
+  argument.name = unlist(strsplit(argument, "=", fixed=TRUE))[1]
+  # strip a literal trailing ".arg" only (the unescaped pattern was a regex
+  # in which "." matched any character)
+  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
+  argument.content = eval(parse(text = argument.name))
+  eval(parse(text = argument.content))
+  if (!exists(variable.name)){
+    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
+  }
+}
+
+# create required folders for output and work material
+# The scratch folder gets a random 50-letter name (presumably so that runs
+# started from the same directory do not collide); it is removed at the end.
+working_dir = paste(sample(LETTERS, 50, replace=T),collapse = '')
+material_dir = file.path(working_dir, 'material')
+output_dir = file.path(working_dir, 'output')
+dir.create(working_dir)
+dir.create(material_dir)
+dir.create(output_dir)
+
+# results are published under ../../resources/<save.at>
+save.at = file.path('../../resources', save.at)
+
+# the Seurat object is read from ../../data/<seurat.addr>
+seurat.addr = file.path("../../data", seurat.addr)
+
+# Script name of the classifier. paste0 avoids the doubled dot ("name..py")
+# that paste(..., sep = '.') produced.
+# NOTE(review): the trainer call further down hard-codes svm.py, so this
+# value is not consumed by the visible code.
+classifier = paste0(classifier, '.py')
+
+source("../../tools/bunddle_utils.R")
+
+library(Seurat)
+library(RColorBrewer)
+library(plyr)
+library(dplyr)
+library(ggplot2)
+
+# read the Seurat object from disk
+print("loading data ... ")
+seurat.obj = readRDS(seurat.addr)
+print("Data loaded.")
+
+# labels of the cells flagged "Singlet" go to labels.csv (one column, Labels)
+print("Create and save label data frame ...")
+meta.data <- seurat.obj@meta.data
+singlets <- as.vector(meta.data$doublets) == "Singlet"
+singlet.labels <- as.vector(meta.data$cell.labels)[singlets]
+labels <- data.frame(Labels = singlet.labels)
+write.csv(labels, file.path(material_dir, 'labels.csv'), row.names = F)
+
+# feature genes: top 20 markers per cluster ranked by avg_logFC, de-duplicated,
+# stored in the output folder
+print("Choose features genes ...")
+marker.genes <- read.csv(file.path('../../resources/marker_genes', marker.genes))
+marker.genes <- top_n(group_by(marker.genes, cluster), 20, avg_logFC)
+classifier.features <- unique(as.vector(marker.genes$gene))
+saveRDS(classifier.features, file.path(output_dir, 'feature_genes.RDS'))
+
+# expression of the feature genes for the singlet cells, transposed so that
+# rows are cells and columns are genes, written with row names to data.csv
+print("saving training data to disk ...")
+cell.names <- names(seurat.obj@ident)[singlets]
+expression <- as.matrix(seurat.obj@data[classifier.features, cell.names])
+x.data <- as.data.frame(t(expression))
+write.csv(x.data, file.path(material_dir, 'data.csv'), row.names = T)
+
+print("initiating SVM trainer ... ")
+# python.addr is not defined in this script; presumably it comes from the
+# sourced bunddle_utils.R - verify.
+# system() returns the exit status of the command; stop here on failure
+# instead of failing later on a missing confusion_matrix.csv.
+svm.status = system(sprintf('%s svm.py %s %s', python.addr, material_dir, output_dir), wait = T)
+if (svm.status != 0){
+  stop(sprintf("svm.py exited with status %s. Stopping ... ", svm.status))
+}
+
+# plot confusion matrix
+cnf_matrix = read.csv(file.path(output_dir, 'confusion_matrix.csv'))
+# drop the first column, which holds the row index written by pandas
+cnf_matrix <- as.matrix(cnf_matrix[, -c(1)])
+# rows of the matrix are actual classes, columns are predicted classes;
+# expand.grid varies Actual fastest, matching the column-major flattening below
+confusion <- expand.grid(Actual = colnames(cnf_matrix), Predicted = colnames(cnf_matrix))
+# Divide every column by its own sum. A plain `cnf_matrix / colSums(cnf_matrix)`
+# recycles the vector down the columns and would divide row i by column sum i.
+# NOTE(review): this normalises per predicted class, as the colSums in the
+# original code suggests; use rowSums with MARGIN = 1 if per-actual-class
+# frequencies were intended.
+cnf_matrix <- sweep(cnf_matrix, 2, colSums(cnf_matrix), "/")
+confusion$freq <- as.vector(cnf_matrix)
+pdf(file.path(output_dir, 'confusion_matrix.pdf'), width = 14, height = 14)
+# print explicitly so the plot is drawn even when the script is source()d
+print(ggplot(data = confusion, aes(x = Actual, y = Predicted)) + geom_tile(aes(fill = freq)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)))
+dev.off()
+
+# replace any previous results folder with a fresh one
+unlink(save.at, recursive=T, force=T)
+dir.create(save.at, recursive=T)
+
+# move every result file from the scratch output folder to save.at
+result.files = c('feature_genes.RDS',
+                 'classification_report.txt',
+                 'confusion_matrix.csv',
+                 'confusion_matrix.pdf',
+                 'model.pickle',
+                 'pca.pickle')
+moved = file.rename(from=file.path(output_dir, result.files), to=file.path(save.at, result.files))
+
+# file.rename reports failure through its return value only; stop before the
+# scratch folder is deleted so that unmoved results are not lost
+if (!all(moved)){
+  stop(sprintf("Could not move %s from %s to %s. Stopping ... ", paste(result.files[!moved], collapse = ", "), output_dir, save.at))
+}
+
+unlink(working_dir, recursive=T, force=T)
+
+print("Ended beautifully ... ")
--- a/pipelines/15_train_classifier/train_classifier.sh
+++ b/pipelines/15_train_classifier/train_classifier.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+#$ -cwd
+#$ -N train_classifier
+#$ -V
+#$ -l h_rt=23:59:59
+#$ -l h_vmem=100G
+
+# Exactly one argument is expected: the ";"-separated list of
+# "name = value" settings that train_classifier.R parses.
+if [ "$#" -ne 1 ]; then
+    echo "Illegal number of parameters" >&2
+    exit 1
+fi
+
+# Quote the argument so the shell neither word-splits nor glob-expands it.
+Rscript train_classifier.R "$1"
+status=$?
+
+echo "End on $(date)"
+
+# Report the R script's exit status instead of the status of the echo above.
+exit "$status"