fork_scRNAseq_analysis/pipelines/22_cell_comparison/cell_comparison.R

args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))

arguments.list = "
seurat.query.addr.arg = args[1]
seurat.ref.addr.arg   = args[2]
set.ident.query.arg   = args[3]
set.ident.ref.arg     = args[4]
cell.types.query.arg  = args[5]
cell.types.ref.arg    = args[6]
dims.plot.arg         = args[7]
compute.DEGs.arg      = args[8]
"

DEG.number.limit = 20

expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]

if(length(args) != length(expected_arguments)){
  error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments)))
  expected_arguments = paste(unlist(lapply(strsplit(expected_arguments, ".arg"), "[", 1)), collapse = "\n")
  stop(sprintf('This pipeline requires %s parameters: '))
}

eval(parse(text = arguments.list))

for(n in 1:length(expected_arguments)){
  argument = expected_arguments[n]
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  variable.name = gsub(pattern=".arg", replacement="", argument.name)
  argument.content = eval(parse(text = argument.name))
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}

# create required folders for output and work material
output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
label1 = gsub(pattern="\\.RDS", replacement="", x=seurat.query.addr)
label2 = gsub(pattern="\\.RDS", replacement="", x=seurat.ref.addr)
output_folder = paste(output_folder, label1, label2, sep = "_")
c.time = Sys.time()
c.time = gsub(pattern=" BST", replacement="", x=c.time)
c.time = gsub(pattern=":", replacement="", x=c.time)
c.time = gsub(pattern=" ", replacement="", x=c.time)
c.time = gsub(pattern="-", replacement="", x=c.time)
c.time = substr(x=c.time, start=3, stop=nchar(c.time))
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
dir.create(output_folder)

output_folder_material = file.path(output_folder, "material")
dir.create(output_folder_material)

seurat.query.addr = file.path("../../data", seurat.query.addr)
seurat.ref.addr   = file.path("../../data", seurat.ref.addr)

source("../../tools/bunddle_utils.R")

gene_mean_expression = function(seurat.obj){
  no.genes = nrow(seurat.obj@data)
  start_index = 1
  while (start_index < no.genes){
    end_index = start_index + 999
    end_index = min(end_index, no.genes)
    expression.data_ = data.matrix(seurat.obj@data[start_index:end_index, ])
    expression.data_ = t(expression.data_)
    expression.data_ = as.data.frame(expression.data_)
    expression.data_ = cbind(data.frame(CellLabels = as.vector(seurat.obj@ident)), expression.data_)
    expression.data_ = aggregate(expression.data_[2:dim(expression.data_)[2]], list(expression.data_$CellLabels), mean)
    expression.data_ = cbind(data.frame(CellType = expression.data_$Group.1), expression.data_[, 2:dim(expression.data_)[2]])
    rownames(expression.data_) = expression.data_$CellType
    expression.data_ = expression.data_[, 2:ncol(expression.data_)]
    print(start_index)
    if (start_index == 1){
      expression.data = expression.data_
    }else{
      expression.data = cbind(expression.data, expression.data_)
    }
    start_index = start_index + 1000
  }
  expression.data
}

make_heatmap = function(table.data, measure.name, low.col = "darkslateblue", mid.col = "floralwhite", high.col = "goldenrod1"){
  expression.melt = reshape::melt(data=as.matrix(table.data))
  colnames(expression.melt) = c("CellTypesQuery", "CellTypesReference", measure.name)
  expression.melt$CellTypesQuery     = factor(as.vector(expression.melt$CellTypesQuery),     levels=rev(query.labels))
  expression.melt$CellTypesReference = factor(as.vector(expression.melt$CellTypesReference), levels = ref.labels)
  heatmap.plot = ggplot(expression.melt, aes(CellTypesReference, CellTypesQuery, fill = expression.melt[, 3])) +
    geom_tile(color = "black") +
    scale_fill_gradientn(colours = c(low.col, mid.col, high.col), values = c(0, .5, 1.0)) +
    theme(axis.title.x=element_blank(), axis.title.y=element_blank(),
                                      axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(fill=measure.name)
  heatmap.plot
}

plot_DEGs = function(query.pop){
  print("running plot_degs function")
  vln.grobs = c()
  first_time = T
  for (i in seq_along(ref.labels)){
    ref.pop = ref.labels[i]
    # compute DEGs for query vs refs
    if (query.pop != ref.pop){
      print("printing query pop")
      print(query.pop)
      print("printing ref.pop")
      print(ref.pop)
      #DEGs = FindMarkers(object=seurat.obj, ident.1=query.pop, ident.2=ref.pop, genes.use=rownames(seurat.obj@raw.data), only.pos=F, max.cells.per.ident = 500)
      DEGs = FindMarkers(object=seurat.obj, ident.1=query.pop, ident.2=ref.pop, features=rownames(seurat.obj@raw.data), only.pos=F, max.cells.per.ident = 500)
      print("printing DEGs that are found straight after findmarkers is run")
      print(DEGs)
      DEGs$Gene = rownames(DEGs)
      DEGs$Reference_Population = gsub(pattern="R::", replacement="", x=ref.pop)
      DEGs$Query_Population     = gsub(pattern="Q::", replacement="", x=query.pop)
      if (first_time){
        DEG.matrix = DEGs
        first_time = F
      }else{
        DEG.matrix = rbind(DEG.matrix, DEGs)
      }

      print("printing DEGs after if loop to add cols")
      print(DEGs)
      pos.markers = DEGs %>% top_n(deg.span, avg_logFC)
      pos.markers = as.vector(pos.markers$Gene)
      neg.markers = DEGs %>% top_n(-deg.span, avg_logFC)
      neg.markers = rev(as.vector(neg.markers$Gene))
      markers     = c(pos.markers, neg.markers)

      # make marker data - used downstream for plotting
      markers.data.query = t(as.matrix(seurat.obj@data[markers, as.vector(seurat.obj@ident) == query.pop]))
      markers.data.query = reshape::melt(markers.data.query)
      colnames(markers.data.query) = c("Population", "Gene", "Expression")
      markers.data.query$Population = "Query_Population"

      markers.data.ref   = t(as.matrix(seurat.obj@data[markers, as.vector(seurat.obj@ident) == ref.pop]))
      markers.data.ref = reshape::melt(markers.data.ref)
      colnames(markers.data.ref) = c("Population", "Gene", "Expression")
      markers.data.ref$Population = "Reference_Population"

      markers.data = as.data.frame(rbind(markers.data.query, markers.data.ref))
      markers.data$Gene = factor(as.vector(markers.data$Gene), levels = markers)

      # make violin plot object
      vln.plot = ggplot(markers.data, aes(x = Population, y = Expression, col = Population, fill = Population))
      vln.plot = vln.plot + geom_jitter(size = .3)
      vln.plot = vln.plot + facet_wrap(~Gene, nrow = 1)
      vln.plot = vln.plot + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), legend.position="none")
      vln.plot = vln.plot + theme(axis.title.y = element_text(angle = 90, hjust = 1)) + ylab(ref.pop)

      unique_vln_plot = paste("VlnPlot", i, sep = "_")
      eval(parse(text = sprintf('%s = vln.plot', unique_vln_plot)))
      vln.grobs = c(vln.grobs, unique_vln_plot)
    }
  }
  vln.grobs = paste(vln.grobs, collapse = ", ")
  eval(parse(text = sprintf('vln.grobs = list(%s)', vln.grobs)))

  DEG.fname = gsub(pattern="/", replacement='_', x=query.pop, fixed = T)
  DEG.fname = gsub(pattern="Q::", replacement="", x=DEG.fname)
  DEG.fname = paste(DEG.fname, ".png", sep = "")
  DEG.fname = file.path(deg.folder, DEG.fname)

  print(DEG.fname)

  png(DEG.fname, width = plot.W, height = plot.H)
  grid.arrange(grobs = vln.grobs, ncol = 1)
  dev.off()

  DEG.fname = gsub(pattern="/", replacement='_', x=query.pop, fixed = T)
  DEG.fname = gsub(pattern="Q::", replacement="", x=DEG.fname)
  DEG.fname = paste(DEG.fname, ".csv", sep = "")
  DEG.fname = file.path(deg.folder, DEG.fname)

  print(DEG.fname)

  write.csv(DEG.matrix, DEG.fname)
}

DownSample = function(seurat.obj, limit){
  pop.types = as.vector(unique(seurat.obj@ident))
  to.keep = c()
  for(k in seq_along(pop.types)){
    pop.type = pop.types[k]
    pop.names = names(seurat.obj@ident)[as.vector(seurat.obj@ident) == pop.type]
    if (length(pop.names) > limit){
      pop.names = sample(x=pop.names, size=limit, replace=FALSE)
    }
    to.keep = c(to.keep, pop.names)
  }
  SubsetData(object=seurat.obj, cells.use=to.keep)
}

library(Seurat)
library(RColorBrewer)
library(plyr)
library(dplyr)
library(gridExtra)
library(BiocParallel)

#######################################################################################################

# load data
print("loading query data set ... ")
seurat.query.obj = readRDS(seurat.query.addr)
seurat.query.obj = SetAllIdent(object=seurat.query.obj, id=set.ident.query)
if (ncol(seurat.query.obj@data) > 100000){
  print("Query data too big. Downsampling query data ...")
  seurat.query.obj = DownSample(seurat.query.obj, limit = 2000)
}
print("Query data loaded.")

if(seurat.ref.addr != seurat.query.addr){
  print("loading reference data set ... ")
  seurat.ref.obj = readRDS(seurat.ref.addr)
  seurat.ref.obj   = SetAllIdent(object=seurat.ref.obj,   id=set.ident.ref)
  if(ncol(seurat.ref.obj@data) > 100000){
    print("Reference data too big. Downsampling reference data ...")
    seurat.ref.obj = DownSample(seurat.ref.obj, limit = 2000)
  }
  print("Reference data loaded.")
}else{
  print("Query and reference data sets are the same. Copying the query to make the reference data sets ...")
  if (set.ident.query != set.ident.ref){
    stop("The reference and query data sets are the same but the set identities are different. Terminating script ...")
  }
  seurat.ref.obj = seurat.query.obj
  seurat.obj     = seurat.query.obj
}

# load lists of cell types
if (is.na(cell.types.query)){
  cell.types.query = sort(as.vector(unique(seurat.query.obj@ident)))
}else{
  # open file connection
  cell.types.query.file = file.path("../../resources", cell.types.query)
  cell.types.query.file = file(cell.types.query.file, "r")
  cell.types.query = readLines(cell.types.query.file)
  close(cell.types.query.file)
}

if(is.na(cell.types.ref)){
  cell.types.ref = sort(as.vector(unique(seurat.ref.obj@ident)))
}else{
  # open file connection
  cell.types.ref.file = file.path("../../resources/", cell.types.ref)
  cell.types.ref.file = file(cell.types.ref.file, "r")
  cell.types.ref = readLines(cell.types.ref.file)
  close(cell.types.ref.file)
}

# subset seurat objects
if (!all(as.vector(unique(seurat.query.obj@ident)) %in% cell.types.query)){
  print("Subsetting the query data ...")
  print("as.vector(unique(seurat.query.obj@ident)")
  print(as.vector(unique(seurat.query.obj@ident)))
  print("cell types query")
  print(cell.types.query)
  seurat.query.obj = SubsetData(object=seurat.query.obj, ident.use=cell.types.query, do.clean=T, subset.raw=T)
}
if(!all(as.vector(unique(seurat.ref.obj@ident)) %in% cell.types.ref)){
  print("Subsetting the reference data ...")
  seurat.ref.obj   = SubsetData(object=seurat.ref.obj,   ident.use=cell.types.ref,   do.clean=T, subset.raw=T)
}

#remove any cell type represented by to few cells to allow a statistical test of DEGs
if (compute.DEGs){

cell.types.query.fail = names(table(seurat.query.obj@ident)[table(seurat.query.obj@ident) < DEG.number.limit])
cell.types.ref.fail   = names(table(seurat.ref.obj@ident)[table(seurat.ref.obj@ident) < DEG.number.limit])

if (length(cell.types.query.fail) > 0){
  print(sprintf("The query data has cell population represented by too few cells to allow DEG statistical test. These are: '%s';", paste(cell.types.query.fail, collapse = ",")))
  print("Removing these cell types from query data ...")
  cell.types.query = cell.types.query[!(cell.types.query %in% cell.types.query.fail)]
  seurat.query.obj = SubsetData(object=seurat.query.obj, ident.use=cell.types.query, do.clean=T, subset.raw=T)
}

if (length(cell.types.ref.fail) > 0){
  print(sprintf("The reference data has cell population represented by too few cells to allow DEG statistical test. These are:' %s'; And will be removed.", paste(cell.types.ref.fail, collapse = ",")))
  print("Removing these cell types from reference data ...")
  cell.types.ref = cell.types.ref[!(cell.types.ref %in% cell.types.ref.fail)]
  seurat.ref.obj   = SubsetData(object=seurat.ref.obj,   ident.use=cell.types.ref,   do.clean=T, subset.raw=T)
}
}

# merge the seurat objects
if(seurat.ref.addr == seurat.query.addr){
  seurat.obj = SubsetData(object=seurat.obj, ident.use=unique(c(cell.types.query, cell.types.ref)), do.clean=T, subset.raw=T)
}else{
  seurat.query.obj@meta.data$comparison_labels = eval(parse(text = sprintf("seurat.query.obj@meta.data$%s", set.ident.query)))
  seurat.query.obj@meta.data$comparison_labels = paste("Q::", as.vector(seurat.query.obj@meta.data$comparison_labels), sep = "")
  seurat.ref.obj@meta.data$comparison_labels   = eval(parse(text = sprintf("seurat.ref.obj@meta.data$%s", set.ident.ref)))
  seurat.ref.obj@meta.data$comparison_labels   = paste("R::", as.vector(seurat.ref.obj@meta.data$comparison_labels), sep = "")

  seurat.obj = MergeSeurat(object1=seurat.ref.obj, object2=seurat.query.obj, min.cells=0, min.genes=0, do.normalize=T, add.cell.id1="R", add.cell.id2="Q")
  seurat.obj = SetAllIdent(object=seurat.obj, id="comparison_labels")
}

# compute gene x cell type mean gene expression matrix for the query and reference data sets
query.expression.data = gene_mean_expression(seurat.query.obj)
ref.expression.data   = gene_mean_expression(seurat.ref.obj)

query.expression.data = query.expression.data[cell.types.query, ]
ref.expression.data   = ref.expression.data[cell.types.ref,     ]

# process and merge the expression matrices
if(seurat.ref.addr == seurat.query.addr){
  query.labels = cell.types.query
  ref.labels   = cell.types.ref
}else{
  query.labels = paste("Q::", rownames(query.expression.data), sep = "")
  ref.labels   = paste("R::", rownames(ref.expression.data),   sep = "")
}

rownames(query.expression.data) = query.labels
rownames(ref.expression.data)   = ref.labels

# must compute variable genes in both seurat objects, otherwise the might be no shared variable genes
print("Computing variable genes ... ")
seurat.query.obj = NormalizeData(object = seurat.query.obj, normalization.method = "LogNormalize", scale.factor = 10000)
print("Computing variable genes ... ")
seurat.query.obj = FindVariableGenes(object = seurat.query.obj, mean.function = ExpMean,
                                     dispersion.function = LogVMR, x.low.cutoff = .0125,
                                     x.high.cutoff = 3, y.cutoff = .625)
seurat.ref.obj = NormalizeData(object = seurat.ref.obj, normalization.method = "LogNormalize", scale.factor = 10000)
print("Computing variable genes ... ")
seurat.ref.obj = FindVariableGenes(object = seurat.ref.obj, mean.function = ExpMean,
                                   dispersion.function = LogVMR, x.low.cutoff = .0125,
                                   x.high.cutoff = 3, y.cutoff = .625)
print("Finished computing the variable genes.")

shared.genes = c(seurat.query.obj@var.genes, seurat.ref.obj@var.genes)
shared.genes = intersect(shared.genes, colnames(query.expression.data))
shared.genes = intersect(shared.genes, colnames(ref.expression.data))

query.expression.data = query.expression.data[, shared.genes]
ref.expression.data   = ref.expression.data[, shared.genes]

expression.data = rbind(query.expression.data, ref.expression.data)

# compute correlation distances
cor.dist = cor(t(expression.data), method="spearman")
cor.dist = cor.dist[query.labels, ref.labels]

p.values = c()
for(i in seq_along(1:nrow(expression.data))){
  for(j in seq_along(1:nrow(expression.data))){
    p.values = c(p.values, cor.test(x = as.numeric(expression.data[i, ]), y = as.numeric(expression.data[j, ]), method="spearman")$p.value)
  }
}
p.values = matrix(p.values, nrow=nrow(expression.data))
rownames(p.values) = rownames(expression.data)
colnames(p.values) = rownames(expression.data)
p.values = p.values[query.labels, ref.labels]
write.csv(p.values, file.path(output_folder, "correlation_p_values.csv"))

# make correlation plot
cor.plot = make_heatmap(table.data=cor.dist, measure.name="Correlation")
cor.plot.fname = file.path(output_folder, "correlation_matrix.pdf")
pdf(cor.plot.fname, width=dims.plot[1], height = dims.plot[2])
print(cor.plot)
dev.off()

####### compute AGA scores ######
print("Computing AGA scores")
print("... but first need to under-sample cell populations that are more than 500 in numbers in order to make a smaller seurat object.")
seurat.obj = DownSample(seurat.obj, limit = 2000)
print("Done sampling, back to AGA score computation ...")

AGA_folder = file.path(output_folder, "AGA_folder")
dir.create(AGA_folder)
# write cell labels to disk
write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)

# save raw data to disk
raw_data = seurat.obj@raw.data
raw_data = raw_data[rownames(seurat.obj@data), colnames(seurat.obj@data)]

writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx"))
# save gene names
gene_names = rownames(raw_data)
write.csv(data.frame(Genes = gene_names), file.path(output_folder_material, "genenames.csv"))
# save cell names
cell_names = colnames(raw_data)
write.csv(data.frame(Cells = cell_names), file.path(output_folder_material, "cellnames.csv"))
# write cell labels to disk
write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)

command = file.path(tool_addr, "AGA/AGA_from_Seurat.py")
command = paste(paste(python.addr, command, sep = " "), output_folder, sep = " ")
command = paste(command, "AGA_scores", sep =" ")
system(command, wait = T)

# read the AGA output
connectivities = read.csv(file.path(AGA_folder, "connectivities.csv"), row.names = 1)
colnames(connectivities) = rownames(connectivities)
connectivities = connectivities[query.labels, ref.labels]
print("printing connectivities from line 394")
print(connectivities)

# make AGA plot
AGA.plot = make_heatmap(table.data=connectivities, measure.name="AGA score", "cadetblue1", "#928c91", "brown4")
AGA.plot.fname = file.path(output_folder, "AGA_scores_matrix.pdf")
pdf(AGA.plot.fname, width = dims.plot[3], height = dims.plot[4])
print(AGA.plot)
dev.off()

# delete unused folders
unlink(output_folder_material,              recursive=T, force=T)
unlink(AGA_folder,                          recursive=T, force=T)
#unlink(file.path(output_folder, "figures"), recursive=T, force=T)

# clear up some memory
rm(seurat.query.obj, seurat.ref.obj, query.expression.data, ref.expression.data, expression.data, cor.dist, raw_data)

###### compute DEGs #######
if(compute.DEGs){
  print("Computing DEGs ...")
  deg.span = 50
  padW = 2
  padH = 2
  bitW = .6
  bitH = 1.1
  plot.W = 120 * (padW + 2 * bitW * deg.span)
  plot.H = 130 * (padH + bitH * length(ref.labels))
  deg.folder = file.path(output_folder, "DEGs")
  dir.create(deg.folder)

  print("Plotting DEGs")
  #bplapply(query.labels, plot_DEGs, BPPARAM=MulticoreParam(3))
  print("printing query labels")
  print(query.labels)
  for (i in 1:length(query.labels)){
    plot_DEGs(query.labels[i])
  }
  file.remove("Rplots.pdf")

  print("removed rplots")

  # merge all DEG files
  DEG_folder    = file.path(output_folder, "DEGs")
  print("made deg folder")
  DEGs.fnames   = list.files(DEG_folder)
  DEG.csv.file.paths = DEGs.fnames[unlist(lapply(DEGs.fnames, function(fname){substr(x=fname, start=nchar(fname) - 3, stop=nchar(fname)) == '.csv'}))]
  DEG.csv.file.paths = file.path(DEG_folder, DEG.csv.file.paths)
  print("made DEG.csv.file.paths")
  DEG.csv.files = lapply(DEG.csv.file.paths, read.csv, row.names = 1)
  DEG.csv.files = Reduce(f=rbind, x=DEG.csv.files)
  print("DEG.csv.files")
  write.csv(DEG.csv.files, file.path(DEG_folder, "DEGs.csv"))
  file.remove(DEG.csv.file.paths)
}

print("Ended beautifully ... ")