fork_scRNAseq_analysis/pipelines/13_pseudotime/pseudotime.R
2019-07-08 12:22:01 +01:00

399 lines
17 KiB
R
Executable file

args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))
args = gsub(pattern = '@@', replacement = ' ', x = args)
arguments.list = "
seurat.addr.arg = args[1]
set.ident.arg = args[2]
cell.types.arg = args[3]
root_cell_type.arg = args[4]
var.genes.arg = args[5]
type.to.colours.arg = args[6]
"
expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]
if(length(args) != length(expected_arguments)){
error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments)))
expected_arguments = paste(unlist(lapply(strsplit(expected_arguments, ".arg"), "[", 1)), collapse = "\n")
stop(sprintf('This pipeline requires %s parameters: ', length(expected_arguments)))
}
eval(parse(text = arguments.list))
for(n in 1:length(expected_arguments)){
argument = expected_arguments[n]
#argument = gsub(pattern=" ", replacement="", x=argument)
argument.name = unlist(strsplit(argument, "="))[1]
variable.name = gsub(pattern=".arg", replacement="", argument.name)
variable.name = gsub(pattern=" ", replacement="", argument.name)
argument.content = eval(parse(text = argument.name))
eval(parse(text = argument.content))
if (!exists(variable.name)){
stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
}
}
# create required folders for output and work material
output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
output_folder = paste(output_folder, seurat.addr, sep = "_")
c.time = Sys.time()
c.time = gsub(pattern=" BST", replacement="", x=c.time)
c.time = gsub(pattern=":", replacement="", x=c.time)
c.time = gsub(pattern=" ", replacement="", x=c.time)
c.time = gsub(pattern="-", replacement="", x=c.time)
c.time = substr(x=c.time, start=3, stop=nchar(c.time))
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
dir.create(output_folder)
output_folder_material = file.path(output_folder, "material")
dir.create(output_folder_material)
seurat.addr = file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library(ggplot2)
library(RColorBrewer)
library(plyr)
library(monocle)
library(dplyr)
library(reshape2)
#######################################################################################################
###########
print("printing cell.types")
print(cell.types)
print("printing root_cell_type")
print(root_cell_type)
ma = function(arr, kernel = 50){
res = arr
n = 2 * kernel
for(i in 1:length(arr)){
start_index = max(1, i - kernel)
stop_index = min(length(arr), i + kernel)
res[i] = mean(arr[start_index:stop_index])
}
res
}
adaptive.moving_average = function(arr, kernel = 10, minim_kernel = 10, range.factor = 5){
res = arr
n = 2 * kernel
for(i in 1:length(arr)){
start_index = max(1, i - kernel)
stop_index = min(length(arr), i + kernel)
local_sd = sd(arr[start_index:stop_index])
local_kernel = minim_kernel + round(range.factor / (local_sd + .1))
start_index = max(1, i - local_kernel)
stop_index = min(length(arr), i + local_kernel)
res[i] = mean(arr[start_index:stop_index])
}
res
}
###########
#######################################################################################################
print("Loading data ...")
seurat.obj = readRDS(seurat.addr)
seurat.obj = SetAllIdent(object=seurat.obj, id=set.ident)
print("Data loaded.")
print("Subseting data on singlets and required cell populations")
if(cell.types == "all"){
cell.types = as.vector(unique(seurat.obj@ident))
}
print(table(seurat.obj@ident))
print("Subseting data ...")
to.keep = names(seurat.obj@ident)[as.vector(seurat.obj@ident) %in% cell.types]
seurat.obj = SubsetData(object=seurat.obj, cells.use=to.keep)
seurat.obj@ident = factor(seurat.obj@ident, levels = cell.types)
print(table(seurat.obj@ident))
print("Writing data to disk ...")
# save raw data to disk
raw_data = seurat.obj@raw.data
raw_data = raw_data[rownames(seurat.obj@data), colnames(seurat.obj@data)]
# decomment the next lines if there is a list of genes that you need to exclude
to_exclude = readRDS('fca_cellcycle_genes.RDS')
genes_to_keep = rownames(raw_data)
genes_to_keep = genes_to_keep[!(genes_to_keep %in% to_exclude)]
raw_data = raw_data[genes_to_keep, colnames(seurat.obj@data)]
writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx"))
# save gene names
gene_names = rownames(raw_data)
write.csv(data.frame(Genes = gene_names), file.path(output_folder_material, "genenames.csv"))
# save cell names
cell_names = colnames(raw_data)
write.csv(data.frame(Cells = cell_names), file.path(output_folder_material, "cellnames.csv"))
# write cell labels to disk
write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)
print("Computing pseudotime using pdt.scanpy.py...")
# compute pseudotime in python scanpy
command = sprintf("%s pdt_scanpy.py %s %s", python.addr, root_cell_type, output_folder)
system(command, wait=T)
print("finished running .py")
# get cell labels and colours
if (!is.na(type.to.colours)){
type.to.colours = file.path("../../resources", type.to.colours)
type.to.colour = read.csv(type.to.colours)
print("printing type.to.colour after it is loaded in")
print(type.to.colour)
print("printing seurat obj idents which the typetocol arg will be compared against in next lines")
print(as.vector(unique(seurat.obj@ident)))
filter.key = type.to.colour$CellTypes %in% as.vector(unique(seurat.obj@ident))
cell.labels = as.vector(type.to.colour$CellTypes[filter.key])
cell.colours = as.vector(type.to.colour$Colours[filter.key])
}else{
cell.labels = sort(as.vector(unique(seurat.obj@ident)))
cell.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(cell.labels)))
}
print("printing cell.labels")
print(cell.labels)
print("printing cell.colours")
print(cell.colours)
# load pseudotime
print('reading pseudotime values')
pseudotime = read.csv(file.path(output_folder_material, "pseudotime.csv"), row.names = 1, header = F)
print("Are the cells in the same order in both pseudotime and seurat object? ")
print(all(rownames(pseudotime) == names(seurat.obj@ident)))
pseudotime$CellTypes = seurat.obj@ident
colnames(pseudotime) = c("Pseudotime", "CellType")
pseudotime$Color = mapvalues(x=pseudotime$CellType, from=cell.labels, to=cell.colours)
pseudotime$Color = factor(as.vector(pseudotime$Color), levels = cell.colours)
pseudotime$CellType = factor(as.vector(pseudotime$CellType), levels = cell.labels)
colnames(pseudotime) = c("Pseudotime", "Cell Type", "Color")
# making sure that there are no inf values in pdt column
#pseudotime["Pseudotime"][pseudotime["Pseudotime"] == "Inf"] <- 1
plot.density = ggplot(data = pseudotime, aes(x = Pseudotime, color = `Cell Type`, fill = `Cell Type`)) + geom_density(alpha = .7)
plot.density = plot.density + scale_x_continuous(position = "top", limits = c(.0, 1.0), expand = c(0.0, .0))
plot.density = plot.density + scale_color_manual(values = cell.colours)
plot.density = plot.density + scale_fill_manual(values = cell.colours)
plot.density = plot.density + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
axis.title.x = element_text(size = 25),
legend.position = c(0, 1),
legend.justification = c(0, 1))
print("printing cell.colours which is used for scale_colour_manual for plot.density plot")
print(cell.colours)
# compute diff genes
print("Computing var genes by cell type...")
cds = newCellDataSet(cellData = as.matrix(raw_data), phenoData=NULL, featureData=NULL, expressionFamily = negbinomial.size())
print("printing cds made using newCellDataSet function")
print(cds)
pData(cds)$Cluster = as.vector(seurat.obj@ident)
print("printing cds after adding cluster to pdata")
print(cds)
print("running estimatesizefactors for cds")
cds = estimateSizeFactors(cds)
pData(cds)$Pseudotime = pseudotime$Pseudotime
if (is.na(var.genes)){
var.genes.total = c()
print('Computing variable genes ... ')
for (j in 1:length(cell.labels)){
print(sprintf("Choice %s out of %s ... ", as.character(j), as.character(length(cell.labels))))
choices = pseudotime$`Cell Type` == cell.labels[j]
var.genes = differentialGeneTest(cds[, choices], fullModelFormulaStr = "~sm.ns(Pseudotime)")
var.genes = cbind(var.genes, data.frame(gene_id = rownames(var.genes)))
var.genes.ch = var.genes %>% arrange(qval)
var.genes.ch = as.vector(var.genes.ch$gene_id[1:100])
var.genes.total = union(var.genes.total, var.genes.ch)
}
print("Computing var genes globally...")
var.genes = differentialGeneTest(cds, fullModelFormulaStr = "~sm.ns(Pseudotime)")
var.genes = cbind(var.genes, data.frame(gene_id = rownames(var.genes)))
var.genes.ch = var.genes %>% arrange(qval)
var.genes.ch = as.vector(var.genes.ch$gene_id[1:100])
var.genes.total = union(var.genes.total, var.genes.ch)
MT_genes = var.genes.total[grep("^MT-", x=var.genes.total, ignore.case=T)]
var.genes.total = setdiff(var.genes.total, MT_genes)
}else{
var.genes.file = file.path('../../resources', var.genes)
var.genes.file = file(var.genes.file)
var.genes.total = readLines(var.genes.file)
var.genes.total = as.vector(unique(var.genes.total))
var.genes.total = var.genes.total[var.genes.total != '']
close(var.genes.file)
}
# saving the genes to disk
print("Heavy computing finished. Next saving to output...")
print("calculating var_gene_expression")
# cluster genes based on their min-max normalized values
var_gene_expression = as.matrix(seurat.obj@data[var.genes.total, order(pseudotime$Pseudotime)])
var_gene_expression = t(apply(var_gene_expression, 1, adaptive.moving_average, kernel = 15, minim_kernel = 1, range.factor=15))
# min-max normalization
var_gene_min = apply(var_gene_expression, 1, min)
var_gene_expression = var_gene_expression - var_gene_min
var_gene_genes_max = apply(var_gene_expression, 1, max)
var_gene_expression = var_gene_expression / var_gene_genes_max
print("clustering genes by level of expression")
# actual clustering of genes
d_matrix = as.dist(1.0 - cor(t(as.matrix(var_gene_expression)), method="spearman"))
genes_clust = hclust(d=d_matrix, method="ward.D2")
genes.in.order = var.genes.total[genes_clust$order]
# plot min-max normalized expression
###################################################################################################
raw_data_genes = as.matrix(seurat.obj@data[rev(genes.in.order), order(pseudotime$Pseudotime)])
raw_data_genes = t(apply(raw_data_genes, 1, adaptive.moving_average, kernel = 15, minim_kernel = 1, range.factor=15))
# min-max normalization
raw_data_genes_min = apply(raw_data_genes, 1, min)
raw_data_genes = raw_data_genes - raw_data_genes_min
raw_data_genes_max = apply(raw_data_genes, 1, max)
raw_data_genes = raw_data_genes / raw_data_genes_max
print("group genes by pdt")
# group by pdt
pdt = range(pseudotime$Pseudotime)
pdt = seq(pdt[1], pdt[2], length.out=100)
pdt_data = c()
for (k in 1:nrow(raw_data_genes)){
for(j in 1:length(pdt)){
local_pdt = pdt[j]
pdt_index = abs(pseudotime$Pseudotime[order(pseudotime$Pseudotime)] - local_pdt)
pdt_index = which(pdt_index == min(pdt_index))
pdt_data = c(pdt_data, raw_data_genes[k, pdt_index])
}
}
pdt_data = matrix(data=pdt_data, nrow=nrow(raw_data_genes), byrow=T)
print("printing nrow pdt_data")
nrow(pdt_data)
print("printing ncol pdt_data")
ncol(pdt_data)
rownames(pdt_data) = rownames(raw_data_genes)
colnames(pdt_data) = paste("PDT", 1:100, sep = "")
#colnames(pdt_data) = paste("PDT", 1:ncol(pdt_data), sep = "")
# smooth a bit the pdt_data matrx
pdt_data = t(apply(pdt_data, 1, ma, kernel = 7))
pdt_data = pdt_data - apply(pdt_data, 1, min)
pdt_data = pdt_data / apply(pdt_data, 1, max)
beautiful_result_norm = reshape2::melt(data=pdt_data)
colnames(beautiful_result_norm) = c("GeneNames", "Pseudotime", "ExpressionValue")
print("preparing to plot genes by expression level")
plot.genes = ggplot(data = beautiful_result_norm, aes(x = Pseudotime, y = GeneNames))
plot.genes = plot.genes + geom_tile(aes(fill = ExpressionValue), width=1.001, height=1.001)
plot.genes = plot.genes + scale_fill_gradient2(low = "deepskyblue", high = "firebrick3", mid = "darkolivegreen3", midpoint = 0.5, name = "Minmax normalized gene expression")
plot.genes = plot.genes + theme(legend.position = "bottom", legend.text = element_text(size = 25, angle = 90),
legend.title = element_text(size = 25),
legend.key.width = unit(2, "cm"),
axis.text.x = element_blank(), axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = 0), axis.text.y = element_text(size = 8))
pdf(file.path(output_folder, "expression_vs_norm_expression.pdf"), width = 13, height = 35)
plot_grid(plot.density, plot.genes, nrow = 2, align = "v", rel_heights = c(1/9, 8/9))
dev.off()
print("plotted expression_vs_norm_expression.pdf in output folder")
# plot non-normalized expression
###################################################################################################
raw_data_genes = as.matrix(seurat.obj@data[rev(genes.in.order), order(pseudotime$Pseudotime)])
print("made raw_data_genes matrix")
raw_data_genes = t(apply(raw_data_genes, 1, adaptive.moving_average, kernel = 15, minim_kernel = 1, range.factor=15))
print("raw_data_genes matrix has applied apply function and t")
# group by pdt
pdt = range(pseudotime$Pseudotime)
pdt = seq(pdt[1], pdt[2], length.out=100)
pdt_data = c()
for (k in 1:nrow(raw_data_genes)){
for(j in 1:length(pdt)){
local_pdt = pdt[j]
pdt_index = abs(pseudotime$Pseudotime[order(pseudotime$Pseudotime)] - local_pdt)
pdt_index = which(pdt_index == min(pdt_index))
pdt_data = c(pdt_data, raw_data_genes[k, pdt_index])
}
}
pdt_data = matrix(data=pdt_data, nrow=nrow(raw_data_genes), byrow=T)
rownames(pdt_data) = rownames(raw_data_genes)
#colnames(pdt_data) = paste("PDT", 1:ncol(pdt_data), sep = "")
colnames(pdt_data) = paste("PDT", 1:100, sep = "")
# smooth a bit the pdt_data matrx
pdt_data = t(apply(pdt_data, 1, ma, kernel = 7))
beautiful_result_nonnorm = reshape2::melt(data=pdt_data)
colnames(beautiful_result_nonnorm) = c("GeneNames", "Pseudotime", "ExpressionValue")
plot.genes = ggplot(data = beautiful_result_nonnorm, aes(x = Pseudotime, y = GeneNames))
plot.genes = plot.genes + geom_tile(aes(fill = ExpressionValue), width=1.001, height=1.001)
plot.genes = plot.genes + scale_fill_gradient2(low = "deepskyblue", high = "firebrick3", mid = "darkolivegreen3", midpoint = mean(range(pdt_data)), name = "Gene expression")
plot.genes = plot.genes + theme(legend.position = "bottom", legend.text = element_text(size = 25, angle = 90),
legend.title = element_text(size = 25),
legend.key.width = unit(2, "cm"),
axis.text.x = element_blank(), axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = 0), axis.text.y = element_text(size = 8))
pdf(file.path(output_folder, "expression_vs_nonnorm_expression.pdf"), width = 13, height = 35)
plot_grid(plot.density, plot.genes, nrow = 2, align = "v", rel_heights = c(1/9, 8/9))
dev.off()
print("plotted genes by expression_vs_nonnorm_expression.pdf")
# save diffusion map coordinates and expression data for found genes
by.pdt.order = order(pseudotime$Pseudotime)
dm.df = read.csv(file.path(output_folder_material, "dm.csv"), row.names = 1, header = F)
dm.df = as.data.frame(dm.df[, 1:3])
dm.df$Labels = factor(seurat.obj@ident, levels = cell.labels)
dm.df$Colours = mapvalues(x = dm.df$Labels, from = cell.labels, to = cell.colours)
dm.df = dm.df[by.pdt.order, ]
colnames(dm.df) = c("DM1", "DM2", "DM3", "Labels", "Colours")
print("writing pdt_and_expression.csv")
expression_data_and_pdt = as.data.frame(t(as.matrix(seurat.obj@data[rev(genes.in.order), by.pdt.order])))
pdt.data = data.frame(Pseudotime = pseudotime[by.pdt.order, c(1)])
pdt.data = cbind(dm.df, pdt.data, expression_data_and_pdt)
pdt.data.fp = file.path(output_folder, "pdt_and_expression.csv")
write.csv(pdt.data, pdt.data.fp, row.names = F)
# make interactive diffusion map
command = sprintf("%s html_3D_viewer_and_plotter.py %s %s", python.addr, file.path(output_folder, "Interactive_Pseudotime.html"), pdt.data.fp)
system(command, wait = T)
# save the plotting material, just in case
plot.data.objects = list(pseudotime = pseudotime, beautiful_result_norm = beautiful_result_norm, beautiful_result_nonnorm = beautiful_result_nonnorm)
saveRDS(plot.data.objects, file.path(output_folder, "ploting_material.RDS"))
unlink(output_folder_material, recursive=T, force=T)
print("Ended beautifully ... ")