args = commandArgs(trailingOnly=T) args = paste(args, collapse = "") args = unlist(strsplit(args, ";")) arguments.list = " seurat.query.addr.arg = args[1] seurat.ref.addr.arg = args[2] set.ident.query.arg = args[3] set.ident.ref.arg = args[4] cell.types.query.arg = args[5] cell.types.ref.arg = args[6] dims.plot.arg = args[7] compute.DEGs.arg = args[8] " DEG.number.limit = 20 expected_arguments = unlist(strsplit(arguments.list, "\n")) expected_arguments = expected_arguments[!(expected_arguments == "")] if(length(args) != length(expected_arguments)){ error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments))) expected_arguments = paste(unlist(lapply(strsplit(expected_arguments, ".arg"), "[", 1)), collapse = "\n") stop(sprintf('This pipeline requires %s parameters: ')) } eval(parse(text = arguments.list)) for(n in 1:length(expected_arguments)){ argument = expected_arguments[n] argument = gsub(pattern=" ", replacement="", x=argument) argument.name = unlist(strsplit(argument, "="))[1] variable.name = gsub(pattern=".arg", replacement="", argument.name) argument.content = eval(parse(text = argument.name)) eval(parse(text = argument.content)) if (!exists(variable.name)){ stop(sprintf("Argument %s not passed. Stopping ... ", variable.name)) } } # create required folders for output and work material output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd())) label1 = gsub(pattern="\\.RDS", replacement="", x=seurat.query.addr) label2 = gsub(pattern="\\.RDS", replacement="", x=seurat.ref.addr) output_folder = paste(output_folder, label1, label2, sep = "_") c.time = Sys.time() c.time = gsub(pattern=" BST", replacement="", x=c.time) c.time = gsub(pattern=":", replacement="", x=c.time) c.time = gsub(pattern=" ", replacement="", x=c.time) c.time = gsub(pattern="-", replacement="", x=c.time) c.time = substr(x=c.time, start=3, stop=nchar(c.time)) output_folder = paste(output_folder, c.time, sep = "_") output_folder = file.path("../../output", output_folder) dir.create(output_folder) output_folder_material = file.path(output_folder, "material") dir.create(output_folder_material) seurat.query.addr = file.path("../../data", seurat.query.addr) seurat.ref.addr = file.path("../../data", seurat.ref.addr) source("../../tools/bunddle_utils.R") gene_mean_expression = function(seurat.obj){ no.genes = nrow(seurat.obj@data) start_index = 1 while (start_index < no.genes){ end_index = start_index + 999 end_index = min(end_index, no.genes) expression.data_ = data.matrix(seurat.obj@data[start_index:end_index, ]) expression.data_ = t(expression.data_) expression.data_ = as.data.frame(expression.data_) expression.data_ = cbind(data.frame(CellLabels = as.vector(seurat.obj@ident)), expression.data_) expression.data_ = aggregate(expression.data_[2:dim(expression.data_)[2]], list(expression.data_$CellLabels), mean) expression.data_ = cbind(data.frame(CellType = expression.data_$Group.1), expression.data_[, 2:dim(expression.data_)[2]]) rownames(expression.data_) = expression.data_$CellType expression.data_ = expression.data_[, 2:ncol(expression.data_)] print(start_index) if (start_index == 1){ expression.data = expression.data_ }else{ expression.data = cbind(expression.data, expression.data_) } start_index = start_index + 1000 } expression.data } make_heatmap = function(table.data, measure.name, low.col = "darkslateblue", mid.col = "floralwhite", high.col = "goldenrod1"){ expression.melt = reshape::melt(data=as.matrix(table.data)) colnames(expression.melt) = c("CellTypesQuery", "CellTypesReference", measure.name) expression.melt$CellTypesQuery = factor(as.vector(expression.melt$CellTypesQuery), levels=rev(query.labels)) expression.melt$CellTypesReference = factor(as.vector(expression.melt$CellTypesReference), levels = ref.labels) heatmap.plot = ggplot(expression.melt, aes(CellTypesReference, CellTypesQuery, fill = expression.melt[, 3])) + geom_tile(color = "black") + scale_fill_gradientn(colours = c(low.col, mid.col, high.col), values = c(0, .5, 1.0)) + theme(axis.title.x=element_blank(), axis.title.y=element_blank(), axis.text.x = element_text(angle = 45, hjust = 1)) + labs(fill=measure.name) heatmap.plot } plot_DEGs = function(query.pop){ print("running plot_degs function") vln.grobs = c() first_time = T for (i in seq_along(ref.labels)){ ref.pop = ref.labels[i] # compute DEGs for query vs refs if (query.pop != ref.pop){ print("printing query pop") print(query.pop) print("printing ref.pop") print(ref.pop) #DEGs = FindMarkers(object=seurat.obj, ident.1=query.pop, ident.2=ref.pop, genes.use=rownames(seurat.obj@raw.data), only.pos=F, max.cells.per.ident = 500) DEGs = FindMarkers(object=seurat.obj, ident.1=query.pop, ident.2=ref.pop, features=rownames(seurat.obj@raw.data), only.pos=F, max.cells.per.ident = 500) print("printing DEGs that are found straight after findmarkers is run") print(DEGs) DEGs$Gene = rownames(DEGs) DEGs$Reference_Population = gsub(pattern="R::", replacement="", x=ref.pop) DEGs$Query_Population = gsub(pattern="Q::", replacement="", x=query.pop) if (first_time){ DEG.matrix = DEGs first_time = F }else{ DEG.matrix = rbind(DEG.matrix, DEGs) } print("printing DEGs after if loop to add cols") print(DEGs) pos.markers = DEGs %>% top_n(deg.span, avg_logFC) pos.markers = as.vector(pos.markers$Gene) neg.markers = DEGs %>% top_n(-deg.span, avg_logFC) neg.markers = rev(as.vector(neg.markers$Gene)) markers = c(pos.markers, neg.markers) # make marker data - used downstream for plotting markers.data.query = t(as.matrix(seurat.obj@data[markers, as.vector(seurat.obj@ident) == query.pop])) markers.data.query = reshape::melt(markers.data.query) colnames(markers.data.query) = c("Population", "Gene", "Expression") markers.data.query$Population = "Query_Population" markers.data.ref = t(as.matrix(seurat.obj@data[markers, as.vector(seurat.obj@ident) == ref.pop])) markers.data.ref = reshape::melt(markers.data.ref) colnames(markers.data.ref) = c("Population", "Gene", "Expression") markers.data.ref$Population = "Reference_Population" markers.data = as.data.frame(rbind(markers.data.query, markers.data.ref)) markers.data$Gene = factor(as.vector(markers.data$Gene), levels = markers) # make violin plot object vln.plot = ggplot(markers.data, aes(x = Population, y = Expression, col = Population, fill = Population)) vln.plot = vln.plot + geom_jitter(size = .3) vln.plot = vln.plot + facet_wrap(~Gene, nrow = 1) vln.plot = vln.plot + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), legend.position="none") vln.plot = vln.plot + theme(axis.title.y = element_text(angle = 90, hjust = 1)) + ylab(ref.pop) unique_vln_plot = paste("VlnPlot", i, sep = "_") eval(parse(text = sprintf('%s = vln.plot', unique_vln_plot))) vln.grobs = c(vln.grobs, unique_vln_plot) } } vln.grobs = paste(vln.grobs, collapse = ", ") eval(parse(text = sprintf('vln.grobs = list(%s)', vln.grobs))) DEG.fname = gsub(pattern="/", replacement='_', x=query.pop, fixed = T) DEG.fname = gsub(pattern="Q::", replacement="", x=DEG.fname) DEG.fname = paste(DEG.fname, ".png", sep = "") DEG.fname = file.path(deg.folder, DEG.fname) print(DEG.fname) png(DEG.fname, width = plot.W, height = plot.H) grid.arrange(grobs = vln.grobs, ncol = 1) dev.off() DEG.fname = gsub(pattern="/", replacement='_', x=query.pop, fixed = T) DEG.fname = gsub(pattern="Q::", replacement="", x=DEG.fname) DEG.fname = paste(DEG.fname, ".csv", sep = "") DEG.fname = file.path(deg.folder, DEG.fname) print(DEG.fname) write.csv(DEG.matrix, DEG.fname) } DownSample = function(seurat.obj, limit){ pop.types = as.vector(unique(seurat.obj@ident)) to.keep = c() for(k in seq_along(pop.types)){ pop.type = pop.types[k] pop.names = names(seurat.obj@ident)[as.vector(seurat.obj@ident) == pop.type] if (length(pop.names) > limit){ pop.names = sample(x=pop.names, size=limit, replace=FALSE) } to.keep = c(to.keep, pop.names) } SubsetData(object=seurat.obj, cells.use=to.keep) } library(Seurat) library(RColorBrewer) library(plyr) library(dplyr) library(gridExtra) library(BiocParallel) ####################################################################################################### # load data print("loading query data set ... ") seurat.query.obj = readRDS(seurat.query.addr) seurat.query.obj = SetAllIdent(object=seurat.query.obj, id=set.ident.query) if (ncol(seurat.query.obj@data) > 100000){ print("Query data too big. Downsampling query data ...") seurat.query.obj = DownSample(seurat.query.obj, limit = 2000) } print("Query data loaded.") if(seurat.ref.addr != seurat.query.addr){ print("loading reference data set ... ") seurat.ref.obj = readRDS(seurat.ref.addr) seurat.ref.obj = SetAllIdent(object=seurat.ref.obj, id=set.ident.ref) if(ncol(seurat.ref.obj@data) > 100000){ print("Reference data too big. Downsampling reference data ...") seurat.ref.obj = DownSample(seurat.ref.obj, limit = 2000) } print("Reference data loaded.") }else{ print("Query and reference data sets are the same. Copying the query to make the reference data sets ...") if (set.ident.query != set.ident.ref){ stop("The reference and query data sets are the same but the set identities are different. Terminating script ...") } seurat.ref.obj = seurat.query.obj seurat.obj = seurat.query.obj } # load lists of cell types if (is.na(cell.types.query)){ cell.types.query = sort(as.vector(unique(seurat.query.obj@ident))) }else{ # open file connection cell.types.query.file = file.path("../../resources", cell.types.query) cell.types.query.file = file(cell.types.query.file, "r") cell.types.query = readLines(cell.types.query.file) close(cell.types.query.file) } if(is.na(cell.types.ref)){ cell.types.ref = sort(as.vector(unique(seurat.ref.obj@ident))) }else{ # open file connection cell.types.ref.file = file.path("../../resources/", cell.types.ref) cell.types.ref.file = file(cell.types.ref.file, "r") cell.types.ref = readLines(cell.types.ref.file) close(cell.types.ref.file) } # subset seurat objects if (!all(as.vector(unique(seurat.query.obj@ident)) %in% cell.types.query)){ print("Subsetting the query data ...") print("as.vector(unique(seurat.query.obj@ident)") print(as.vector(unique(seurat.query.obj@ident))) print("cell types query") print(cell.types.query) seurat.query.obj = SubsetData(object=seurat.query.obj, ident.use=cell.types.query, do.clean=T, subset.raw=T) } if(!all(as.vector(unique(seurat.ref.obj@ident)) %in% cell.types.ref)){ print("Subsetting the reference data ...") seurat.ref.obj = SubsetData(object=seurat.ref.obj, ident.use=cell.types.ref, do.clean=T, subset.raw=T) } #remove any cell type represented by to few cells to allow a statistical test of DEGs if (compute.DEGs){ cell.types.query.fail = names(table(seurat.query.obj@ident)[table(seurat.query.obj@ident) < DEG.number.limit]) cell.types.ref.fail = names(table(seurat.ref.obj@ident)[table(seurat.ref.obj@ident) < DEG.number.limit]) if (length(cell.types.query.fail) > 0){ print(sprintf("The query data has cell population represented by too few cells to allow DEG statistical test. These are: '%s';", paste(cell.types.query.fail, collapse = ","))) print("Removing these cell types from query data ...") cell.types.query = cell.types.query[!(cell.types.query %in% cell.types.query.fail)] seurat.query.obj = SubsetData(object=seurat.query.obj, ident.use=cell.types.query, do.clean=T, subset.raw=T) } if (length(cell.types.ref.fail) > 0){ print(sprintf("The reference data has cell population represented by too few cells to allow DEG statistical test. These are:' %s'; And will be removed.", paste(cell.types.ref.fail, collapse = ","))) print("Removing these cell types from reference data ...") cell.types.ref = cell.types.ref[!(cell.types.ref %in% cell.types.ref.fail)] seurat.ref.obj = SubsetData(object=seurat.ref.obj, ident.use=cell.types.ref, do.clean=T, subset.raw=T) } } # merge the seurat objects if(seurat.ref.addr == seurat.query.addr){ seurat.obj = SubsetData(object=seurat.obj, ident.use=unique(c(cell.types.query, cell.types.ref)), do.clean=T, subset.raw=T) }else{ seurat.query.obj@meta.data$comparison_labels = eval(parse(text = sprintf("seurat.query.obj@meta.data$%s", set.ident.query))) seurat.query.obj@meta.data$comparison_labels = paste("Q::", as.vector(seurat.query.obj@meta.data$comparison_labels), sep = "") seurat.ref.obj@meta.data$comparison_labels = eval(parse(text = sprintf("seurat.ref.obj@meta.data$%s", set.ident.ref))) seurat.ref.obj@meta.data$comparison_labels = paste("R::", as.vector(seurat.ref.obj@meta.data$comparison_labels), sep = "") seurat.obj = MergeSeurat(object1=seurat.ref.obj, object2=seurat.query.obj, min.cells=0, min.genes=0, do.normalize=T, add.cell.id1="R", add.cell.id2="Q") seurat.obj = SetAllIdent(object=seurat.obj, id="comparison_labels") } # compute gene x cell type mean gene expression matrix for the query and reference data sets query.expression.data = gene_mean_expression(seurat.query.obj) ref.expression.data = gene_mean_expression(seurat.ref.obj) query.expression.data = query.expression.data[cell.types.query, ] ref.expression.data = ref.expression.data[cell.types.ref, ] # process and merge the expression matrices if(seurat.ref.addr == seurat.query.addr){ query.labels = cell.types.query ref.labels = cell.types.ref }else{ query.labels = paste("Q::", rownames(query.expression.data), sep = "") ref.labels = paste("R::", rownames(ref.expression.data), sep = "") } rownames(query.expression.data) = query.labels rownames(ref.expression.data) = ref.labels # must compute variable genes in both seurat objects, otherwise the might be no shared variable genes print("Computing variable genes ... ") seurat.query.obj = NormalizeData(object = seurat.query.obj, normalization.method = "LogNormalize", scale.factor = 10000) print("Computing variable genes ... ") seurat.query.obj = FindVariableGenes(object = seurat.query.obj, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = .0125, x.high.cutoff = 3, y.cutoff = .625) seurat.ref.obj = NormalizeData(object = seurat.ref.obj, normalization.method = "LogNormalize", scale.factor = 10000) print("Computing variable genes ... ") seurat.ref.obj = FindVariableGenes(object = seurat.ref.obj, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = .0125, x.high.cutoff = 3, y.cutoff = .625) print("Finished computing the variable genes.") shared.genes = c(seurat.query.obj@var.genes, seurat.ref.obj@var.genes) shared.genes = intersect(shared.genes, colnames(query.expression.data)) shared.genes = intersect(shared.genes, colnames(ref.expression.data)) query.expression.data = query.expression.data[, shared.genes] ref.expression.data = ref.expression.data[, shared.genes] expression.data = rbind(query.expression.data, ref.expression.data) # compute correlation distances cor.dist = cor(t(expression.data), method="spearman") cor.dist = cor.dist[query.labels, ref.labels] p.values = c() for(i in seq_along(1:nrow(expression.data))){ for(j in seq_along(1:nrow(expression.data))){ p.values = c(p.values, cor.test(x = as.numeric(expression.data[i, ]), y = as.numeric(expression.data[j, ]), method="spearman")$p.value) } } p.values = matrix(p.values, nrow=nrow(expression.data)) rownames(p.values) = rownames(expression.data) colnames(p.values) = rownames(expression.data) p.values = p.values[query.labels, ref.labels] write.csv(p.values, file.path(output_folder, "correlation_p_values.csv")) # make correlation plot cor.plot = make_heatmap(table.data=cor.dist, measure.name="Correlation") cor.plot.fname = file.path(output_folder, "correlation_matrix.pdf") pdf(cor.plot.fname, width=dims.plot[1], height = dims.plot[2]) print(cor.plot) dev.off() ####### compute AGA scores ###### print("Computing AGA scores") print("... but first need to under-sample cell populations that are more than 500 in numbers in order to make a smaller seurat object.") seurat.obj = DownSample(seurat.obj, limit = 2000) print("Done sampling, back to AGA score computation ...") AGA_folder = file.path(output_folder, "AGA_folder") dir.create(AGA_folder) # write cell labels to disk write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F) # save raw data to disk raw_data = seurat.obj@raw.data raw_data = raw_data[rownames(seurat.obj@data), colnames(seurat.obj@data)] writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx")) # save gene names gene_names = rownames(raw_data) write.csv(data.frame(Genes = gene_names), file.path(output_folder_material, "genenames.csv")) # save cell names cell_names = colnames(raw_data) write.csv(data.frame(Cells = cell_names), file.path(output_folder_material, "cellnames.csv")) # write cell labels to disk write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F) command = file.path(tool_addr, "AGA/AGA_from_Seurat.py") command = paste(paste(python.addr, command, sep = " "), output_folder, sep = " ") command = paste(command, "AGA_scores", sep =" ") system(command, wait = T) # read the AGA output connectivities = read.csv(file.path(AGA_folder, "connectivities.csv"), row.names = 1) colnames(connectivities) = rownames(connectivities) connectivities = connectivities[query.labels, ref.labels] print("printing connectivities from line 394") print(connectivities) # make AGA plot AGA.plot = make_heatmap(table.data=connectivities, measure.name="AGA score", "cadetblue1", "#928c91", "brown4") AGA.plot.fname = file.path(output_folder, "AGA_scores_matrix.pdf") pdf(AGA.plot.fname, width = dims.plot[3], height = dims.plot[4]) print(AGA.plot) dev.off() # delete unused folders unlink(output_folder_material, recursive=T, force=T) unlink(AGA_folder, recursive=T, force=T) #unlink(file.path(output_folder, "figures"), recursive=T, force=T) # clear up some memory rm(seurat.query.obj, seurat.ref.obj, query.expression.data, ref.expression.data, expression.data, cor.dist, raw_data) ###### compute DEGs ####### if(compute.DEGs){ print("Computing DEGs ...") deg.span = 50 padW = 2 padH = 2 bitW = .6 bitH = 1.1 plot.W = 120 * (padW + 2 * bitW * deg.span) plot.H = 130 * (padH + bitH * length(ref.labels)) deg.folder = file.path(output_folder, "DEGs") dir.create(deg.folder) print("Plotting DEGs") #bplapply(query.labels, plot_DEGs, BPPARAM=MulticoreParam(3)) print("printing query labels") print(query.labels) for (i in 1:length(query.labels)){ plot_DEGs(query.labels[i]) } file.remove("Rplots.pdf") print("removed rplots") # merge all DEG files DEG_folder = file.path(output_folder, "DEGs") print("made deg folder") DEGs.fnames = list.files(DEG_folder) DEG.csv.file.paths = DEGs.fnames[unlist(lapply(DEGs.fnames, function(fname){substr(x=fname, start=nchar(fname) - 3, stop=nchar(fname)) == '.csv'}))] DEG.csv.file.paths = file.path(DEG_folder, DEG.csv.file.paths) print("made DEG.csv.file.paths") DEG.csv.files = lapply(DEG.csv.file.paths, read.csv, row.names = 1) DEG.csv.files = Reduce(f=rbind, x=DEG.csv.files) print("DEG.csv.files") write.csv(DEG.csv.files, file.path(DEG_folder, "DEGs.csv")) file.remove(DEG.csv.file.paths) } print("Ended beautifully ... ")