scRNA-seq_analysis

2024-10-23 08:29:24 -07:00 · 2019-07-08 12:22:01 +01:00 · 2019-07-08 12:22:01 +01:00 · 82cc2d191e
commit 82cc2d191e
188 changed files with 146184 additions and 0 deletions
--- a/pipelines/09_gene_heatmap_and_spotplot/gene_heatmap_and_spotplot.R
+++ b/pipelines/09_gene_heatmap_and_spotplot/gene_heatmap_and_spotplot.R
@ -0,0 +1,179 @@
+# The shell hands the parameter string over split on whitespace: glue the
+# pieces back together and cut the result on ";" so that every element of
+# `args` holds exactly one pipeline parameter.
+args = unlist(strsplit(paste(commandArgs(trailingOnly = TRUE), collapse = ""), ";"))
+
+# all 8 parameters are mandatory and positional
+if (length(args) != 8) {
+  stop('This pipeline requires 8 parameters: seurat.addr\n set.ident \n genes.to.plot (name of file containing genes to plot)\n cell.types (name of file containing cell types to plot or all)\n cluster.genes (boolean)\n diagonalize (boolean indicating to compute the order of the genes in such a way as to make the appearance of a diagonal on the spotplot - will override the cluster.genes boolean)\n plot.dims (4 tuple for plots dimenssion)\n save.gene.order (boolean indicate to save the genes in the order computed by clustering and/or diaganolization - can be NA or a file name)')
+}
+
+# Positional layout of the parameters. Each element of `args` is itself a piece
+# of R code of the form `name = value`; the *.arg variables keep that raw text.
+arguments.list = "
+seurat.addr.arg     = args[1]
+set.ident.arg       = args[2]
+genes.to.plot.arg   = args[3]
+cell.types.arg      = args[4]
+cluster.genes.arg   = args[5]
+diagonalize.arg     = args[6]
+plot.dims.arg       = args[7]
+save.gene.order.arg = args[8]
+"
+eval(parse(text = arguments.list))
+
+# one entry per "<name>.arg = args[i]" line, blank lines removed
+arguments.list = unlist(strsplit(arguments.list, "\n"))
+arguments.list = arguments.list[nzchar(arguments.list)]
+
+# Evaluate every parameter in turn and verify that it created the variable
+# expected at that position (seurat.addr, set.ident, ...).
+for (n in seq_along(arguments.list)) {
+  argument = gsub(pattern = " ", replacement = "", x = arguments.list[n], fixed = TRUE)
+  argument.name = unlist(strsplit(argument, "="))[1]
+  # strip only a literal, trailing ".arg"; an unescaped "." would match any character
+  variable.name = sub(pattern = "\\.arg$", replacement = "", x = argument.name)
+  argument.content = get(argument.name)
+  # NOTE(review): the parameter text is executed as R code, so this script must
+  # only ever be called with trusted command lines.
+  eval(parse(text = argument.content))
+  # inherits = FALSE: a same-named object in an attached package must not count
+  if (!exists(variable.name, inherits = FALSE)) {
+    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
+  }
+}
+
+# create required folders for output and work material
+output_folder = paste("09_gene_expression_heatmap_and_spotplot", seurat.addr, sep = "_")
+# Time stamp of the run as yymmddHHMMSS. format() is used instead of stripping
+# characters from the printed time, so the folder name does not depend on how
+# the R version at hand prints a date-time (time zone suffix, fractional seconds).
+c.time = format(Sys.time(), "%y%m%d%H%M%S")
+output_folder = paste(output_folder, c.time, sep = "_")
+output_folder = file.path("../../output", output_folder)
+# recursive = TRUE also creates ../../output itself when it does not exist yet
+dir.create(output_folder, recursive = TRUE)
+
+# inputs are addressed relative to the data and resources folders
+seurat.addr   = file.path("../../data", seurat.addr)
+genes.to.plot = file.path("../../resources", genes.to.plot)
+
+library(Seurat)
+library(dplyr)
+library(plyr)
+library(reshape)
+
+#######################################################################################################
+
+# load seurat object
+print("loading data ...")
+seurat.obj = readRDS(seurat.addr)
+print("Data loaded.")
+
+# set identities
+seurat.obj = SetAllIdent(object=seurat.obj, id=set.ident)
+
+# load genes: one gene symbol per line; surrounding white space and blank
+# lines are ignored so that they are not reported as unknown genes
+genes = trimws(readLines(genes.to.plot, warn = FALSE))
+genes = genes[nzchar(genes)]
+
+# check that all genes are in the dataset; unknown genes are reported and dropped
+genes.found = genes %in% rownames(seurat.obj@data)
+if (!all(genes.found)) {
+  not.found = genes[!genes.found]
+  print(sprintf("The following genes were not found in the data: %s", paste(not.found, collapse = ", ")))
+  genes = genes[genes.found]
+}
+
+# nothing can be plotted without at least one known gene
+if (length(genes) == 0) {
+  stop("None of the requested genes were found in the data. Stopping ... ")
+}
+
+# check for duplicates: report every gene listed more than once, then keep
+# only the first occurrence of each gene
+if (anyDuplicated(genes) > 0) {
+  gene.counts = table(genes)
+  duplicates = paste(names(gene.counts)[as.vector(gene.counts > 1)], collapse = ", ")
+  print(sprintf("Duplicates found: %s", duplicates))
+  print("This will not affect the workflow, but be aware the heat map will have fewer genes than expected.")
+  genes = unique(genes)
+}
+
+# rearrange expression matrix by the order in cell types
+# cell.types is either the literal "all" or the name of a file in the resources
+# folder listing the cell types (one per line and/or separated by ", ") in the
+# order in which they should be plotted
+if(cell.types != "all"){
+  cell.types = file.path("../../resources", cell.types)
+  cell_types_file = file(cell.types, "r")
+  cell.types      = readLines(cell_types_file, warn = FALSE)
+  close(cell_types_file)
+  cell.types = unlist(strsplit(cell.types, ", "))
+  print(cell.types)
+  print("All cell types in data set:")
+  known.cell.types = as.vector(unique(seurat.obj@ident))
+  print(table(cell.types %in% known.cell.types))
+  # A requested cell type without any cell in the object has no expression
+  # values; kept in the list it ends up as an all-NA row labelled "NA" in the
+  # spot plot, so it is reported and skipped here.
+  missing.cell.types = cell.types[!(cell.types %in% known.cell.types)]
+  if (length(missing.cell.types) > 0) {
+    print(sprintf("The following cell types were not found in the data and will be skipped: %s", paste(missing.cell.types, collapse = ", ")))
+    cell.types = cell.types[cell.types %in% known.cell.types]
+  }
+  if (length(cell.types) == 0) {
+    stop("None of the requested cell types were found in the data. Stopping ... ")
+  }
+}else{
+  # no file given: use every identity present in the object, sorted
+  cell.types = sort(as.vector(unique(seurat.obj@ident)))
+}
+
+# subset expression data matrix to the requested genes and to the cells that
+# belong to one of the requested cell types
+# (drop = FALSE keeps a genes x cells matrix even when only one gene or one
+# cell is selected)
+keep.cell.names = names(seurat.obj@ident)[seurat.obj@ident %in% cell.types]
+expression.data = data.matrix(seurat.obj@data[genes, keep.cell.names, drop = FALSE])
+
+# create a data matrix with mean expression of each marker by cell type
+# cells become rows, genes become columns, and the cell label is prepended
+expression.data = t(expression.data)
+expression.data = as.data.frame(expression.data)
+expression.data = cbind(data.frame(CellLabels = as.vector(seurat.obj@ident[keep.cell.names])), expression.data)
+# average every gene column within each cell label; the labels come back in
+# the first column (Group.1)
+expression.data = aggregate(expression.data[-1], list(expression.data$CellLabels), mean)
+# move the labels into the row names: result is cell types (rows) x genes (columns)
+rownames(expression.data) = expression.data$Group.1
+expression.data = expression.data[, -1, drop = FALSE]
+
+# cluster the genes and reorder the expression matrix
+# (hierarchical clustering needs at least two genes; a single gene is left as is)
+if (cluster.genes && ncol(expression.data) > 1){
+  expression.distance = dist(x=t(expression.data), method="euclidean")
+  gene.order = hclust(d=expression.distance, method="ward.D")$order
+  expression.data = expression.data[, gene.order, drop = FALSE]
+}
+
+# Order the genes by where along the cell type axis their expression is
+# concentrated, so that the plots show a diagonal. Runs after the clustering
+# and therefore replaces the clustering order when both options are set.
+if (diagonalize){
+  # expression-weighted mean position of one gene along the cell type axis;
+  # a gene with zero expression everywhere gives NaN and is sorted last
+  computer.vector.weight.center = function(vecn){
+    sum(seq_along(vecn) * vecn) / sum(vecn)
+  }
+  # The rows of expression.data are in the order produced by aggregate(), not
+  # in the plotting order, so the centers are computed on the rows arranged
+  # as they are drawn (cell.types order).
+  plot.row.order = intersect(cell.types, rownames(expression.data))
+  centers = vapply(expression.data[plot.row.order, , drop = FALSE], computer.vector.weight.center, numeric(1))
+  centers = order(centers)
+  print(length(centers))
+  expression.data = expression.data[, centers, drop = FALSE]
+}
+
+# plot the heatmap
+# long format: one row per (cell type, gene) pair holding the mean expression
+expression.melt = reshape::melt(data = as.matrix(expression.data))
+colnames(expression.melt) = c("CellTypes", "Genes", "Values")
+expression.melt$CellTypes = factor(as.vector(expression.melt$CellTypes), levels = cell.types)
+# genes along x in the column order of expression.data, cell types along y
+# with their levels reversed
+heatmap.plot = ggplot(expression.melt, aes(factor(Genes, levels = colnames(expression.data)), factor(CellTypes, levels = rev(cell.types)))) +
+  geom_tile(aes(fill = Values), color = "black") +
+  scale_fill_gradient(low = "lightblue", high = "darkred") +
+  theme(axis.title.x = element_blank(), axis.title.y = element_blank(),
+        axis.text.x = element_text(angle = 45, hjust = 1)) +
+  labs(fill = 'Expression')
+
+# first two entries of plot.dims: width and height of the heatmap
+heatmap.fname = file.path(output_folder, "./heatmap.pdf")
+pdf(heatmap.fname, width = plot.dims[1], height = plot.dims[2])
+print(heatmap.plot)
+dev.off()
+
+# plot diag matrix as dot plot (spot plot): one dot per (cell type, gene) pair, placed by numeric position
+expression.data.r = expression.data
+expression.data.r = expression.data.r[rev(cell.types), rev(colnames(expression.data.r))] # rows in reversed cell.types order, gene columns reversed
+expression.melt = reshape::melt(data=as.matrix(expression.data.r)) # long format: one row per (cell type, gene) pair
+colnames(expression.melt) = c("CellTypes", "Genes", "Values")
+expression.melt$X = rep(1:length(unique(expression.melt$Genes)), each=nrow(expression.data.r)) # gene position, repeated once per cell type; presumably relies on melt() listing the matrix column by column - confirm
+expression.melt$Y = rep(length(unique(expression.melt$CellTypes)):1, times=ncol(expression.data.r)) # cell type position, counting down from the number of cell types to 1 within every gene
+colnames(expression.melt) = c("CellTypes", "Genes", "Expression", "X", "Y" ) # "Values" becomes "Expression"
+
+max.expression = floor(max(expression.melt$Expression)) + 1 # smallest whole number above the largest mean expression; upper limit of both scales
+
+spot.plot = ggplot(expression.melt, aes(x = Y, y = X)) + # cell type positions on x, gene positions on y
+  geom_point(aes(size = Expression, color = Expression)) + # size and colour both encode the mean expression
+  scale_color_gradient(limits = c(0, max.expression), breaks = seq(0,max.expression, by = 1), low = "lightsteelblue1", high = "darkred") +
+  guides(color = guide_legend(), size = guide_legend()) + # legend-type guides for both colour and size
+  scale_size_continuous(limits=c(0, max.expression), breaks=seq(0, max.expression, by=1)) + # same limits and breaks as the colour scale
+  scale_y_discrete(name ="",  limits=colnames(expression.data.r)) + # gene names supplied as the limits of the y positions
+  scale_x_discrete(name ="",  limits=rev(rownames(expression.data.r))) + # cell type names for the x positions; rev() undoes the row reversal above
+  theme(axis.title.x=element_blank(), axis.title.y=element_blank(),
+        axis.text.x = element_text(angle = 45, hjust = 1))
+
+splotplot.fname = file.path(output_folder, "./spotplot.pdf")
+pdf(splotplot.fname, width = plot.dims[3], height = plot.dims[4]) # last two entries of plot.dims: width and height of the spot plot
+print(spot.plot)
+dev.off()
+
+# optionally store the final gene order (one gene per line) in the resources
+# folder; save.gene.order is either NA or the name of the file to write
+if (!is.na(save.gene.order)) {
+  save.gene.order = file.path("../../resources", save.gene.order)
+  writeLines(text = colnames(expression.data), con = save.gene.order)
+}
+
+print("Ended beautifully ... ")
--- a/pipelines/09_gene_heatmap_and_spotplot/gene_heatmap_and_spotplot.sh
+++ b/pipelines/09_gene_heatmap_and_spotplot/gene_heatmap_and_spotplot.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+#$ -cwd
+#$ -N gene_heatmap_and_spotplot
+#$ -V
+#$ -l h_rt=47:59:59
+#$ -l h_vmem=200G
+
+# Job wrapper around gene_heatmap_and_spotplot.R.
+# Takes exactly one argument: the ";"-separated parameter string that is
+# handed to the R script unchanged.
+if [ "$#" -ne 1 ]; then
+    echo "Illegal number of parameters"
+    exit 1
+fi
+
+# Quoted so that the shell neither splits the parameter string on white space
+# nor expands glob characters in it.
+Rscript gene_heatmap_and_spotplot.R "$1"
+status=$?
+
+echo "End on $(date)"
+
+# report the outcome of the R script, not that of the echo above
+exit "$status"
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/flow_cell_types.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/flow_cell_types.txt
@ -0,0 +1,28 @@
+HSC/MPP
+Pre pro B cell
+pro-B cell
+pre-B cell
+B cell
+ILC progenitor
+Early lymphoid/T lymphocyte
+NK
+Neutrophil-myeloid progenitor
+Monocyte-DC precursor
+pDC precursor
+DC1
+DC2
+Monocyte
+Mono-Mac
+Mono-NK
+Kupffer Cell
+VCAM1+ EI macrophage
+EI macrophage
+MEMP
+Mast cell
+Megakaryocyte
+Early Erythroid
+Mid  Erythroid
+Late Erythroid
+Endothelial cell
+Fibroblast
+Hepatocyte
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/flow_markers_liver.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/flow_markers_liver.txt
@ -0,0 +1,21 @@
+PTPRC
+IL3RA
+CD7
+EPCAM
+FCGR3A
+CD4
+HLA-DRA
+MS4A1
+VCAM1
+CD38
+NCAM1
+CLEC9A
+CD14
+KIT
+ESAM
+CD3E
+CD8A
+CD1C
+CD34
+GYPA
+CD79B
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/flow_markers_liver_padded.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/flow_markers_liver_padded.txt
@ -0,0 +1,38 @@
+HLA-DRA
+CD34
+SPINK2
+JCHAIN
+IGLL1
+CD79B
+TCL1A
+IGKC
+MS4A1
+LTB
+PTPRC
+CD3E
+CD7
+IL32
+CD8A
+NKG7
+XCL2
+NCAM1
+MPO
+LYZ
+PLAC8
+IL3RA
+CLEC9A
+CD1C
+S100A9
+CCL4
+CD14
+FCGR3A
+CD4
+C1QA
+VCAM1
+GYPA
+SERPINB1
+TPSAB1
+KIT
+PF4
+ESAM
+UBE2C
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/for_splot_plot_all.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/for_splot_plot_all.txt
@ -0,0 +1,48 @@
+HLA-DRA
+CD34
+SPINK2
+JCHAIN
+IGLL1
+CD79B
+TCL1A
+IGKC
+MS4A1
+CD19
+LTB
+KLRB1
+PTPRC
+CD3E
+CD7
+IL32
+CD8A
+KLRD1
+NKG7
+XCL2
+NCAM1
+MPO
+LYZ
+PLAC8
+IL3RA
+CLEC9A
+CD1C
+S100A9
+CCL4
+CD14
+FCGR3A
+CD4
+C1QA
+VCAM1
+GYPA
+SERPINB1
+TPSAB1
+KIT
+PF4
+ITGA2B
+UBE2C
+GATA1
+KLF1
+ALAS2
+HBA1
+ESAM
+ECM1
+APOA1
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/liver_all_cell_types.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/liver_all_cell_types.txt
@ -0,0 +1 @@
+HSC
pro B cell early
pro B cell
pre B cell
B cell
ILC progenitor
NK Progenitor
NK
Neut-myeloid progenitor
Monocyte-DC progenitor
pDC progenitor
DC1
DC2
Monocyte
Mono-Mac
Mono-4 like
Kupffer Cell
VCAM1+ Erythroid Macrophage
Erythroid Macrophage
MEP
Mast cell
Megakaryocyte
Early Erythroid
Mid  Erythroid
Late Erythroid
Endothelial cell
Fibroblast
Hepatocyte
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/liver_genes_heatmap.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/liver_genes_heatmap.txt
@ -0,0 +1,84 @@
+VCAM1
+FCGR3A
+CD14
+GYPA
+CD1C
+LYZ
+NKG7
+CD3D
+CTSW
+ESAM
+CD34
+MYC
+GATA2
+CLEC9A
+IL3RA
+SPIB
+IRF8
+TPSAB1
+CPA3
+PF4
+ITGA2B
+MKI67
+MS4A1
+CD79B
+EBF1
+DNTT
+SPINK2
+IGLL1
+CD7
+XCL2
+IFNG
+RORC
+MPO
+GATA1
+KLF1
+APOA1
+AHSG
+IGKC
+IGLC2
+IGLC3
+HLA-DQB1
+HLA-DPB1
+HLA-DPA1
+HLA-DRA
+CNRIP1
+DNASE1L3
+AHSP
+HBM
+HBZ
+HBA1
+HBA2
+HBG1
+APOA2
+ALB
+C1QTNF4
+IL7R
+LTB
+CD52
+C1QC
+C1QA
+C1QB
+TPSB2
+HBD
+PPBP
+UBE2C
+PRSS57
+SERPINB1
+KLRB1
+CCL4
+CCL3
+HLA-DRB1
+S100A9
+S100A8
+LGALS1
+AZU1
+PRTN3
+GZMA
+IL32
+JCHAIN
+PLAC8
+IGHM
+TCL1A
+VPREB3
+HBB
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/liver_genes_heatmap_RB.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/liver_genes_heatmap_RB.txt
@ -0,0 +1,84 @@
+SPINK2
+CD34
+C1QTNF4
+IGLL1
+EBF1
+DNTT
+LTB
+CD52
+CD79B
+VPREB3
+IGHM
+JCHAIN
+IGLC2
+TCL1A
+SPIB
+IGKC
+MS4A1
+IGLC3
+RORC
+IL7R
+KLRB1
+CD3D
+IL32
+CD7
+CTSW
+GZMA
+XCL2
+IFNG
+CCL4
+CCL3
+NKG7
+PRSS57
+SERPINB1
+APOA2
+ALB
+MPO
+AZU1
+PRTN3
+APOA1
+AHSG
+HLA-DPB1
+HLA-DPA1
+HLA-DRA
+HLA-DRB1
+CD1C
+PLAC8
+IRF8
+IL3RA
+DNASE1L3
+CLEC9A
+HLA-DQB1
+LGALS1
+LYZ
+S100A9
+S100A8
+C1QC
+C1QA
+C1QB
+HBA1
+HBA2
+HBG1
+HBB
+AHSP
+HBM
+FCGR3A
+CD14
+VCAM1
+GYPA
+HBZ
+KLF1
+MYC
+CPA3
+TPSAB1
+TPSB2
+GATA2
+CNRIP1
+ITGA2B
+PPBP
+PF4
+HBD
+ESAM
+GATA1
+MKI67
+UBE2C
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/liver_immune_cell_types.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/liver_immune_cell_types.txt
@ -0,0 +1 @@
+HSC
pro B cell early
pro B cell
pre B cell
B cell
ILC progenitor
NK Progenitor
NK
NK - proliferating
Neut-myeloid progenitor
Monocyte-DC progenitor
pDC progenitor
DC1
DC2
Monocyte
Mono-Mac
Mono-4 like
Kupffer Cell
VCAM1+ Erythroid Macrophage
Erythroid Macrophage
MEP
Mast cell
Megakaryocyte
Megakaryocyte - proliferating
--- a/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/splotplot_markers.txt
+++ b/pipelines/09_gene_heatmap_and_spotplot/inputs_must_be_in_resource_folder/splotplot_markers.txt
@ -0,0 +1,48 @@
+HLA-DRA
+CD34
+SPINK2
+JCHAIN
+IGLL1
+CD79B
+TCL1A
+IGKC
+MS4A1
+CD19
+LTB
+KLRB1
+PTPRC
+CD3E
+CD7
+IL32
+CD8A
+KLRD1
+NKG7
+XCL2
+NCAM1
+MPO
+LYZ
+PLAC8
+IL3RA
+CLEC9A
+CD1C
+S100A9
+CCL4
+CD14
+FCGR3A
+CD4
+C1QA
+VCAM1
+GYPA
+SERPINB1
+TPSAB1
+KIT
+PF4
+ITGA2B
+UBE2C
+GATA1
+KLF1
+ALAS2
+HBA1
+ESAM
+ECM1
+APOA1
				`@ -0,0 +1 @@`
				`HSC pro B cell early pro B cell pre B cell B cell ILC progenitor NK Progenitor NK Neut-myeloid progenitor Monocyte-DC progenitor pDC progenitor DC1 DC2 Monocyte Mono-Mac Mono-4 like Kupffer Cell VCAM1+ Erythroid Macrophage Erythroid Macrophage MEP Mast cell Megakaryocyte Early Erythroid Mid Erythroid Late Erythroid Endothelial cell Fibroblast Hepatocyte`