scRNA-seq_analysis

2024-10-23 08:29:24 -07:00 · 2019-07-08 12:22:01 +01:00 · 2019-07-08 12:22:01 +01:00 · 82cc2d191e
commit 82cc2d191e
188 changed files with 146184 additions and 0 deletions
--- a/pipelines/22_cell_comparison/cell_comparison.R
+++ b/pipelines/22_cell_comparison/cell_comparison.R
@ -0,0 +1,463 @@
+args = commandArgs(trailingOnly=T)
+args = paste(args, collapse = "")
+args = unlist(strsplit(args, ";"))
+
+arguments.list = "
+seurat.query.addr.arg = args[1]
+seurat.ref.addr.arg   = args[2]
+set.ident.query.arg   = args[3]
+set.ident.ref.arg     = args[4]
+cell.types.query.arg  = args[5]
+cell.types.ref.arg    = args[6]
+dims.plot.arg         = args[7]
+compute.DEGs.arg      = args[8]
+"
+
+DEG.number.limit = 20
+
+expected_arguments = unlist(strsplit(arguments.list, "\n"))
+expected_arguments = expected_arguments[!(expected_arguments == "")]
+
+if(length(args) != length(expected_arguments)){
+  error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments)))
+  expected_arguments = paste(unlist(lapply(strsplit(expected_arguments, ".arg"), "[", 1)), collapse = "\n")
+  stop(sprintf('This pipeline requires %s parameters: '))
+}
+
+eval(parse(text = arguments.list))
+
+for(n in 1:length(expected_arguments)){
+  argument = expected_arguments[n]
+  argument = gsub(pattern=" ", replacement="", x=argument)
+  argument.name = unlist(strsplit(argument, "="))[1]
+  variable.name = gsub(pattern=".arg", replacement="", argument.name)
+  argument.content = eval(parse(text = argument.name))
+  eval(parse(text = argument.content))
+  if (!exists(variable.name)){
+    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
+  }
+}
+
+# create required folders for output and work material
+output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
+label1 = gsub(pattern="\\.RDS", replacement="", x=seurat.query.addr)
+label2 = gsub(pattern="\\.RDS", replacement="", x=seurat.ref.addr)
+output_folder = paste(output_folder, label1, label2, sep = "_")
+c.time = Sys.time()
+c.time = gsub(pattern=" BST", replacement="", x=c.time)
+c.time = gsub(pattern=":", replacement="", x=c.time)
+c.time = gsub(pattern=" ", replacement="", x=c.time)
+c.time = gsub(pattern="-", replacement="", x=c.time)
+c.time = substr(x=c.time, start=3, stop=nchar(c.time))
+output_folder = paste(output_folder, c.time, sep = "_")
+output_folder = file.path("../../output", output_folder)
+dir.create(output_folder)
+
+output_folder_material = file.path(output_folder, "material")
+dir.create(output_folder_material)
+
+seurat.query.addr = file.path("../../data", seurat.query.addr)
+seurat.ref.addr   = file.path("../../data", seurat.ref.addr)
+
+source("../../tools/bunddle_utils.R")
+
+gene_mean_expression = function(seurat.obj){
+  no.genes = nrow(seurat.obj@data)
+  start_index = 1
+  while (start_index < no.genes){
+    end_index = start_index + 999
+    end_index = min(end_index, no.genes)
+    expression.data_ = data.matrix(seurat.obj@data[start_index:end_index, ])
+    expression.data_ = t(expression.data_)
+    expression.data_ = as.data.frame(expression.data_)
+    expression.data_ = cbind(data.frame(CellLabels = as.vector(seurat.obj@ident)), expression.data_)
+    expression.data_ = aggregate(expression.data_[2:dim(expression.data_)[2]], list(expression.data_$CellLabels), mean)
+    expression.data_ = cbind(data.frame(CellType = expression.data_$Group.1), expression.data_[, 2:dim(expression.data_)[2]])
+    rownames(expression.data_) = expression.data_$CellType
+    expression.data_ = expression.data_[, 2:ncol(expression.data_)]
+    print(start_index)
+    if (start_index == 1){
+      expression.data = expression.data_
+    }else{
+      expression.data = cbind(expression.data, expression.data_)
+    }
+    start_index = start_index + 1000
+  }
+  expression.data
+}
+
+make_heatmap = function(table.data, measure.name, low.col = "darkslateblue", mid.col = "floralwhite", high.col = "goldenrod1"){
+  expression.melt = reshape::melt(data=as.matrix(table.data))
+  colnames(expression.melt) = c("CellTypesQuery", "CellTypesReference", measure.name)
+  expression.melt$CellTypesQuery     = factor(as.vector(expression.melt$CellTypesQuery),     levels=rev(query.labels))
+  expression.melt$CellTypesReference = factor(as.vector(expression.melt$CellTypesReference), levels = ref.labels)
+  heatmap.plot = ggplot(expression.melt, aes(CellTypesReference, CellTypesQuery, fill = expression.melt[, 3])) +
+    geom_tile(color = "black") +
+    scale_fill_gradientn(colours = c(low.col, mid.col, high.col), values = c(0, .5, 1.0)) +
+    theme(axis.title.x=element_blank(), axis.title.y=element_blank(),
+                                      axis.text.x = element_text(angle = 45, hjust = 1)) +
+    labs(fill=measure.name)
+  heatmap.plot
+}
+
+plot_DEGs = function(query.pop){
+  print("running plot_degs function")
+  vln.grobs = c()
+  first_time = T
+  for (i in seq_along(ref.labels)){
+    ref.pop = ref.labels[i]
+    # compute DEGs for query vs refs
+    if (query.pop != ref.pop){
+      print("printing query pop")
+      print(query.pop)
+      print("printing ref.pop")
+      print(ref.pop)
+      #DEGs = FindMarkers(object=seurat.obj, ident.1=query.pop, ident.2=ref.pop, genes.use=rownames(seurat.obj@raw.data), only.pos=F, max.cells.per.ident = 500)
+      DEGs = FindMarkers(object=seurat.obj, ident.1=query.pop, ident.2=ref.pop, features=rownames(seurat.obj@raw.data), only.pos=F, max.cells.per.ident = 500)
+      print("printing DEGs that are found straight after findmarkers is run")
+      print(DEGs)
+      DEGs$Gene = rownames(DEGs)
+      DEGs$Reference_Population = gsub(pattern="R::", replacement="", x=ref.pop)
+      DEGs$Query_Population     = gsub(pattern="Q::", replacement="", x=query.pop)
+      if (first_time){
+        DEG.matrix = DEGs
+        first_time = F
+      }else{
+        DEG.matrix = rbind(DEG.matrix, DEGs)
+      }
+      
+      print("printing DEGs after if loop to add cols")
+      print(DEGs)
+      pos.markers = DEGs %>% top_n(deg.span, avg_logFC)
+      pos.markers = as.vector(pos.markers$Gene)
+      neg.markers = DEGs %>% top_n(-deg.span, avg_logFC)
+      neg.markers = rev(as.vector(neg.markers$Gene))
+      markers     = c(pos.markers, neg.markers)
+      
+      # make marker data - used downstream for plotting
+      markers.data.query = t(as.matrix(seurat.obj@data[markers, as.vector(seurat.obj@ident) == query.pop]))
+      markers.data.query = reshape::melt(markers.data.query)
+      colnames(markers.data.query) = c("Population", "Gene", "Expression")
+      markers.data.query$Population = "Query_Population"
+      
+      markers.data.ref   = t(as.matrix(seurat.obj@data[markers, as.vector(seurat.obj@ident) == ref.pop]))
+      markers.data.ref = reshape::melt(markers.data.ref)
+      colnames(markers.data.ref) = c("Population", "Gene", "Expression")
+      markers.data.ref$Population = "Reference_Population"
+      
+      markers.data = as.data.frame(rbind(markers.data.query, markers.data.ref))
+      markers.data$Gene = factor(as.vector(markers.data$Gene), levels = markers)
+            
+      # make violin plot object
+      vln.plot = ggplot(markers.data, aes(x = Population, y = Expression, col = Population, fill = Population))
+      vln.plot = vln.plot + geom_jitter(size = .3)
+      vln.plot = vln.plot + facet_wrap(~Gene, nrow = 1)
+      vln.plot = vln.plot + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), legend.position="none")
+      vln.plot = vln.plot + theme(axis.title.y = element_text(angle = 90, hjust = 1)) + ylab(ref.pop)
+      
+      unique_vln_plot = paste("VlnPlot", i, sep = "_")
+      eval(parse(text = sprintf('%s = vln.plot', unique_vln_plot)))
+      vln.grobs = c(vln.grobs, unique_vln_plot)
+    }
+  }
+  vln.grobs = paste(vln.grobs, collapse = ", ")
+  eval(parse(text = sprintf('vln.grobs = list(%s)', vln.grobs)))
+  
+  DEG.fname = gsub(pattern="/", replacement='_', x=query.pop, fixed = T)
+  DEG.fname = gsub(pattern="Q::", replacement="", x=DEG.fname)
+  DEG.fname = paste(DEG.fname, ".png", sep = "")
+  DEG.fname = file.path(deg.folder, DEG.fname)
+  
+  print(DEG.fname)
+  
+  png(DEG.fname, width = plot.W, height = plot.H)
+  grid.arrange(grobs = vln.grobs, ncol = 1)
+  dev.off()
+  
+  DEG.fname = gsub(pattern="/", replacement='_', x=query.pop, fixed = T)
+  DEG.fname = gsub(pattern="Q::", replacement="", x=DEG.fname)
+  DEG.fname = paste(DEG.fname, ".csv", sep = "")
+  DEG.fname = file.path(deg.folder, DEG.fname)
+  
+  print(DEG.fname)
+  
+  write.csv(DEG.matrix, DEG.fname)
+}
+
+DownSample = function(seurat.obj, limit){
+  pop.types = as.vector(unique(seurat.obj@ident))
+  to.keep = c()
+  for(k in seq_along(pop.types)){
+    pop.type = pop.types[k]
+    pop.names = names(seurat.obj@ident)[as.vector(seurat.obj@ident) == pop.type]
+    if (length(pop.names) > limit){
+      pop.names = sample(x=pop.names, size=limit, replace=FALSE)
+    }
+    to.keep = c(to.keep, pop.names)
+  }
+  SubsetData(object=seurat.obj, cells.use=to.keep)
+}
+
+library(Seurat)
+library(RColorBrewer)
+library(plyr)
+library(dplyr)
+library(gridExtra)
+library(BiocParallel)
+
+#######################################################################################################
+
+# load data
+print("loading query data set ... ")
+seurat.query.obj = readRDS(seurat.query.addr)
+seurat.query.obj = SetAllIdent(object=seurat.query.obj, id=set.ident.query)
+if (ncol(seurat.query.obj@data) > 100000){
+  print("Query data too big. Downsampling query data ...")
+  seurat.query.obj = DownSample(seurat.query.obj, limit = 2000)
+}
+print("Query data loaded.")
+
+if(seurat.ref.addr != seurat.query.addr){
+  print("loading reference data set ... ")
+  seurat.ref.obj = readRDS(seurat.ref.addr)
+  seurat.ref.obj   = SetAllIdent(object=seurat.ref.obj,   id=set.ident.ref)
+  if(ncol(seurat.ref.obj@data) > 100000){
+    print("Reference data too big. Downsampling reference data ...")
+    seurat.ref.obj = DownSample(seurat.ref.obj, limit = 2000)
+  }
+  print("Reference data loaded.")
+}else{
+  print("Query and reference data sets are the same. Copying the query to make the reference data sets ...")
+  if (set.ident.query != set.ident.ref){
+    stop("The reference and query data sets are the same but the set identities are different. Terminating script ...")
+  }
+  seurat.ref.obj = seurat.query.obj
+  seurat.obj     = seurat.query.obj
+}
+
+# load lists of cell types
+if (is.na(cell.types.query)){
+  cell.types.query = sort(as.vector(unique(seurat.query.obj@ident)))
+}else{
+  # open file connection
+  cell.types.query.file = file.path("../../resources", cell.types.query)
+  cell.types.query.file = file(cell.types.query.file, "r")
+  cell.types.query = readLines(cell.types.query.file)
+  close(cell.types.query.file)
+}
+
+if(is.na(cell.types.ref)){
+  cell.types.ref = sort(as.vector(unique(seurat.ref.obj@ident)))
+}else{
+  # open file connection
+  cell.types.ref.file = file.path("../../resources/", cell.types.ref)
+  cell.types.ref.file = file(cell.types.ref.file, "r")
+  cell.types.ref = readLines(cell.types.ref.file)
+  close(cell.types.ref.file)
+}
+
+# subset seurat objects
+if (!all(as.vector(unique(seurat.query.obj@ident)) %in% cell.types.query)){
+  print("Subsetting the query data ...")
+  print("as.vector(unique(seurat.query.obj@ident)")
+  print(as.vector(unique(seurat.query.obj@ident)))
+  print("cell types query")
+  print(cell.types.query)
+  seurat.query.obj = SubsetData(object=seurat.query.obj, ident.use=cell.types.query, do.clean=T, subset.raw=T)
+}
+if(!all(as.vector(unique(seurat.ref.obj@ident)) %in% cell.types.ref)){
+  print("Subsetting the reference data ...")
+  seurat.ref.obj   = SubsetData(object=seurat.ref.obj,   ident.use=cell.types.ref,   do.clean=T, subset.raw=T)
+}
+
+#remove any cell type represented by to few cells to allow a statistical test of DEGs
+if (compute.DEGs){
+  
+cell.types.query.fail = names(table(seurat.query.obj@ident)[table(seurat.query.obj@ident) < DEG.number.limit])
+cell.types.ref.fail   = names(table(seurat.ref.obj@ident)[table(seurat.ref.obj@ident) < DEG.number.limit])
+
+if (length(cell.types.query.fail) > 0){
+  print(sprintf("The query data has cell population represented by too few cells to allow DEG statistical test. These are: '%s';", paste(cell.types.query.fail, collapse = ",")))
+  print("Removing these cell types from query data ...")
+  cell.types.query = cell.types.query[!(cell.types.query %in% cell.types.query.fail)]
+  seurat.query.obj = SubsetData(object=seurat.query.obj, ident.use=cell.types.query, do.clean=T, subset.raw=T)
+}
+
+if (length(cell.types.ref.fail) > 0){
+  print(sprintf("The reference data has cell population represented by too few cells to allow DEG statistical test. These are:' %s'; And will be removed.", paste(cell.types.ref.fail, collapse = ",")))
+  print("Removing these cell types from reference data ...")
+  cell.types.ref = cell.types.ref[!(cell.types.ref %in% cell.types.ref.fail)]
+  seurat.ref.obj   = SubsetData(object=seurat.ref.obj,   ident.use=cell.types.ref,   do.clean=T, subset.raw=T)
+}
+}
+
+# merge the seurat objects
+if(seurat.ref.addr == seurat.query.addr){
+  seurat.obj = SubsetData(object=seurat.obj, ident.use=unique(c(cell.types.query, cell.types.ref)), do.clean=T, subset.raw=T)
+}else{
+  seurat.query.obj@meta.data$comparison_labels = eval(parse(text = sprintf("seurat.query.obj@meta.data$%s", set.ident.query)))
+  seurat.query.obj@meta.data$comparison_labels = paste("Q::", as.vector(seurat.query.obj@meta.data$comparison_labels), sep = "")
+  seurat.ref.obj@meta.data$comparison_labels   = eval(parse(text = sprintf("seurat.ref.obj@meta.data$%s", set.ident.ref)))
+  seurat.ref.obj@meta.data$comparison_labels   = paste("R::", as.vector(seurat.ref.obj@meta.data$comparison_labels), sep = "")
+  
+  seurat.obj = MergeSeurat(object1=seurat.ref.obj, object2=seurat.query.obj, min.cells=0, min.genes=0, do.normalize=T, add.cell.id1="R", add.cell.id2="Q")
+  seurat.obj = SetAllIdent(object=seurat.obj, id="comparison_labels")
+}
+
+# compute gene x cell type mean gene expression matrix for the query and reference data sets
+query.expression.data = gene_mean_expression(seurat.query.obj)
+ref.expression.data   = gene_mean_expression(seurat.ref.obj)
+
+query.expression.data = query.expression.data[cell.types.query, ]
+ref.expression.data   = ref.expression.data[cell.types.ref,     ]
+
+# process and merge the expression matrices
+if(seurat.ref.addr == seurat.query.addr){
+  query.labels = cell.types.query
+  ref.labels   = cell.types.ref
+}else{
+  query.labels = paste("Q::", rownames(query.expression.data), sep = "")
+  ref.labels   = paste("R::", rownames(ref.expression.data),   sep = "")
+}
+
+rownames(query.expression.data) = query.labels
+rownames(ref.expression.data)   = ref.labels
+
+# must compute variable genes in both seurat objects, otherwise the might be no shared variable genes
+print("Computing variable genes ... ")
+seurat.query.obj = NormalizeData(object = seurat.query.obj, normalization.method = "LogNormalize", scale.factor = 10000)
+print("Computing variable genes ... ")
+seurat.query.obj = FindVariableGenes(object = seurat.query.obj, mean.function = ExpMean, 
+                                     dispersion.function = LogVMR, x.low.cutoff = .0125, 
+                                     x.high.cutoff = 3, y.cutoff = .625)
+seurat.ref.obj = NormalizeData(object = seurat.ref.obj, normalization.method = "LogNormalize", scale.factor = 10000)
+print("Computing variable genes ... ")
+seurat.ref.obj = FindVariableGenes(object = seurat.ref.obj, mean.function = ExpMean, 
+                                   dispersion.function = LogVMR, x.low.cutoff = .0125, 
+                                   x.high.cutoff = 3, y.cutoff = .625)
+print("Finished computing the variable genes.")
+
+shared.genes = c(seurat.query.obj@var.genes, seurat.ref.obj@var.genes)
+shared.genes = intersect(shared.genes, colnames(query.expression.data))
+shared.genes = intersect(shared.genes, colnames(ref.expression.data))
+
+query.expression.data = query.expression.data[, shared.genes]
+ref.expression.data   = ref.expression.data[, shared.genes]
+
+expression.data = rbind(query.expression.data, ref.expression.data)
+
+# compute correlation distances
+cor.dist = cor(t(expression.data), method="spearman")
+cor.dist = cor.dist[query.labels, ref.labels]
+
+p.values = c()
+for(i in seq_along(1:nrow(expression.data))){
+  for(j in seq_along(1:nrow(expression.data))){
+    p.values = c(p.values, cor.test(x = as.numeric(expression.data[i, ]), y = as.numeric(expression.data[j, ]), method="spearman")$p.value)
+  }
+}
+p.values = matrix(p.values, nrow=nrow(expression.data))
+rownames(p.values) = rownames(expression.data)
+colnames(p.values) = rownames(expression.data)
+p.values = p.values[query.labels, ref.labels]
+write.csv(p.values, file.path(output_folder, "correlation_p_values.csv"))
+
+# make correlation plot
+cor.plot = make_heatmap(table.data=cor.dist, measure.name="Correlation")
+cor.plot.fname = file.path(output_folder, "correlation_matrix.pdf")
+pdf(cor.plot.fname, width=dims.plot[1], height = dims.plot[2])
+print(cor.plot)
+dev.off()
+
+####### compute AGA scores ######
+print("Computing AGA scores")
+print("... but first need to under-sample cell populations that are more than 500 in numbers in order to make a smaller seurat object.")
+seurat.obj = DownSample(seurat.obj, limit = 2000)
+print("Done sampling, back to AGA score computation ...")
+
+AGA_folder = file.path(output_folder, "AGA_folder")
+dir.create(AGA_folder)
+# write cell labels to disk
+write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)
+
+# save raw data to disk
+raw_data = seurat.obj@raw.data
+raw_data = raw_data[rownames(seurat.obj@data), colnames(seurat.obj@data)]
+
+writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx"))
+# save gene names
+gene_names = rownames(raw_data)
+write.csv(data.frame(Genes = gene_names), file.path(output_folder_material, "genenames.csv"))
+# save cell names
+cell_names = colnames(raw_data)
+write.csv(data.frame(Cells = cell_names), file.path(output_folder_material, "cellnames.csv"))
+# write cell labels to disk
+write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)
+
+command = file.path(tool_addr, "AGA/AGA_from_Seurat.py")
+command = paste(paste(python.addr, command, sep = " "), output_folder, sep = " ")
+command = paste(command, "AGA_scores", sep =" ")
+system(command, wait = T)
+
+# read the AGA output
+connectivities = read.csv(file.path(AGA_folder, "connectivities.csv"), row.names = 1)
+colnames(connectivities) = rownames(connectivities)
+connectivities = connectivities[query.labels, ref.labels] 
+print("printing connectivities from line 394")
+print(connectivities)
+
+# make AGA plot
+AGA.plot = make_heatmap(table.data=connectivities, measure.name="AGA score", "cadetblue1", "#928c91", "brown4")
+AGA.plot.fname = file.path(output_folder, "AGA_scores_matrix.pdf")
+pdf(AGA.plot.fname, width = dims.plot[3], height = dims.plot[4])
+print(AGA.plot)
+dev.off()
+
+# delete unused folders
+unlink(output_folder_material,              recursive=T, force=T)
+unlink(AGA_folder,                          recursive=T, force=T)
+#unlink(file.path(output_folder, "figures"), recursive=T, force=T)
+
+# clear up some memory
+rm(seurat.query.obj, seurat.ref.obj, query.expression.data, ref.expression.data, expression.data, cor.dist, raw_data)
+
+###### compute DEGs #######
+if(compute.DEGs){
+  print("Computing DEGs ...")
+  deg.span = 50
+  padW = 2
+  padH = 2
+  bitW = .6
+  bitH = 1.1
+  plot.W = 120 * (padW + 2 * bitW * deg.span)
+  plot.H = 130 * (padH + bitH * length(ref.labels))
+  deg.folder = file.path(output_folder, "DEGs")
+  dir.create(deg.folder)
+  
+  print("Plotting DEGs")
+  #bplapply(query.labels, plot_DEGs, BPPARAM=MulticoreParam(3))
+  print("printing query labels")
+  print(query.labels)
+  for (i in 1:length(query.labels)){
+    plot_DEGs(query.labels[i])
+  }
+  file.remove("Rplots.pdf")
+
+  print("removed rplots")
+  
+  # merge all DEG files
+  DEG_folder    = file.path(output_folder, "DEGs")
+  print("made deg folder")
+  DEGs.fnames   = list.files(DEG_folder)
+  DEG.csv.file.paths = DEGs.fnames[unlist(lapply(DEGs.fnames, function(fname){substr(x=fname, start=nchar(fname) - 3, stop=nchar(fname)) == '.csv'}))]
+  DEG.csv.file.paths = file.path(DEG_folder, DEG.csv.file.paths)
+  print("made DEG.csv.file.paths")
+  DEG.csv.files = lapply(DEG.csv.file.paths, read.csv, row.names = 1)
+  DEG.csv.files = Reduce(f=rbind, x=DEG.csv.files)
+  print("DEG.csv.files")
+  write.csv(DEG.csv.files, file.path(DEG_folder, "DEGs.csv"))
+  file.remove(DEG.csv.file.paths)
+}
+
+print("Ended beautifully ... ")
+