mirror of
https://github.com/haniffalab/scRNA-seq_analysis.git
synced 2024-10-23 08:29:24 -07:00
354 lines
17 KiB
R
Executable file
354 lines
17 KiB
R
Executable file
tool_addr = "../../tools"
|
|
python.addr = "python3.6"
|
|
|
|
# a function to compute force-directed graph; return coordinates as a data frame
|
|
# arguments:
|
|
# pca.df: pca data as a data frame
|
|
# snn : a nearest-neighbor graph as a sparse data matrix
|
|
runFDG = function(pca.df, snn, iterations = 600, tool_addr, python.addr){
|
|
current.wd = getwd()
|
|
setwd(file.path(tool_addr, "force_abstract_graph_2D"))
|
|
# generate unique name for pca data file
|
|
pca.data.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
|
|
pca.data.fname = paste(pca.data.fname, ".csv", sep = "")
|
|
# generate unique name for snn file
|
|
snn.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
|
|
snn.fname = paste(snn.fname, ".smm", sep = "")
|
|
# generate unique name for fdg coordinates
|
|
fdg.coordinates.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
|
|
fdg.coordinates.fname = paste(fdg.coordinates.fname, ".csv", sep = "")
|
|
write.csv(pca.df, pca.data.fname)
|
|
writeMM(obj=snn, file=snn.fname)
|
|
command = gsub(pattern="ITER", replacement=as.character(iterations), paste(python.addr, "./make_fdg.py ITER", sep = " "))
|
|
command = paste(command, paste(c(pca.data.fname, snn.fname, fdg.coordinates.fname), collapse = " "), sep = " ")
|
|
system(command, wait = T)
|
|
fdg_coordinates = read.csv(fdg.coordinates.fname, header = FALSE)
|
|
colnames(fdg_coordinates) = c("X", "Y")
|
|
rownames(fdg_coordinates) = rownames(pca.df)
|
|
file.remove(c(pca.data.fname, snn.fname, fdg.coordinates.fname))
|
|
setwd(current.wd)
|
|
return(fdg_coordinates)
|
|
}
|
|
|
|
# a function to perform UMAP on a seurat object using PCA coordinates
|
|
RunUMAP = function(pca.df, tool_addr, python.addr){
|
|
current.wd = getwd()
|
|
setwd(file.path(tool_addr, "umap"))
|
|
print("writting pca data to disk...")
|
|
# generate unique name for pca data file
|
|
pca.data.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
|
|
pca.data.fname = paste(pca.data.fname, ".csv", sep = "")
|
|
# generate unique name for umap coordinates data file
|
|
umap.coordinates.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
|
|
umap.coordinates.fname = paste(umap.coordinates.fname, ".csv", sep = "")
|
|
write.csv(pca.df, pca.data.fname)
|
|
print("perfoming UMAP")
|
|
command = paste(python.addr, "umap_compute.py", sep = " ")
|
|
command = paste(command, pca.data.fname, sep = ' ')
|
|
command = paste(command, umap.coordinates.fname, ' ')
|
|
system(command, wait = T)
|
|
print("Reading results...")
|
|
umap.coordinates = read.csv(umap.coordinates.fname, stringsAsFactors = F)
|
|
file.remove(c(pca.data.fname, umap.coordinates.fname))
|
|
setwd(current.wd)
|
|
umap.coordinates = umap.coordinates[, c("UMAPx", "UMAPy")]
|
|
return(umap.coordinates)
|
|
}
|
|
|
|
# a function that load a SVM models and make predictions like cell types or doublet.singlets
|
|
Apply_Classifier_On_Seurat_Object = function(seurat.obj, classifier.fname, tool_addr, python.addr){
|
|
current.wd = getwd()
|
|
setwd(file.path(tool_addr, 'predict_by_classifier'))
|
|
predictor.addr = file.path("../../resources", classifier.fname, sep = "")
|
|
print(predictor.addr)
|
|
if(!dir.exists(predictor.addr)){
|
|
print("classifier does not exists")
|
|
available.classifiers = list.dirs("../../resources", full.names=F)
|
|
available.classifiers = available.classifiers[grepl("classifier_", available.classifiers)]
|
|
print(paste("Available doublets identifiers: ", paste(available.classifiers, collapse = ", "), sep = ""))
|
|
setwd(current.wd)
|
|
return(NULL)
|
|
}
|
|
tryCatch({
|
|
OK = FALSE
|
|
print("reading feature genes ...")
|
|
feature.genes = readRDS(file.path(predictor.addr, "feature_genes.RDS"))
|
|
features.present = feature.genes[feature.genes %in% rownames(seurat.obj@data)]
|
|
features.not.present = feature.genes[!(feature.genes %in% rownames(seurat.obj@data))]
|
|
expr.data = as.data.frame(t(as.matrix(seurat.obj@data[features.present, ])))
|
|
if (length(features.not.present) > 0){
|
|
zeros.data = as.data.frame(t(as.matrix(seurat.obj@data[1:length(features.not.present),])))
|
|
zeros.data[] = 0
|
|
colnames(zeros.data) = features.not.present
|
|
expr.data=cbind(expr.data, zeros.data)
|
|
expr.data = expr.data[, feature.genes]
|
|
}
|
|
print("Writting data to disk ... ")
|
|
data.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
|
|
data.fname = paste(data.fname, ".csv", sep = '')
|
|
write.csv(expr.data, data.fname)
|
|
|
|
print(sprintf("Dims are: %s", dim(expr.data)))
|
|
print("Copying pickle file ... ")
|
|
model.fname = paste(sample(LETTERS, 20, T), collapse = '')
|
|
model.fname = paste(model.fname, '.pickle', sep = "")
|
|
file.copy(from=file.path(predictor.addr, "model.pickle"), to=model.fname)
|
|
|
|
pca.fname = paste(sample(LETTERS, 20, T), collapse = '')
|
|
pca.fname = paste(pca.fname, '.pickle', sep = "")
|
|
file.copy(from=file.path(predictor.addr, "pca.pickle"), to=pca.fname)
|
|
|
|
print("Running classifier in Python ... ")
|
|
predictions.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
|
|
predictions.fname = paste(predictions.fname, ".csv", sep = "")
|
|
command = paste(python.addr, "predict.py", sep = " ")
|
|
command = paste(command, paste(c(model.fname, data.fname, predictions.fname, pca.fname), collapse = " "), sep = " ")
|
|
system(command, wait = T)
|
|
predictions = read.csv(predictions.fname)
|
|
OK = TRUE
|
|
},
|
|
warning = function(warning_conditions){print("")},
|
|
error = function(error_condition){
|
|
print("Errors occured. Cleaning up and then returning NULL ...")
|
|
file.remove(c(model.fname, data.fname, predictions.fname, pca.fname))
|
|
setwd(current.wd)
|
|
})
|
|
if(OK){
|
|
if(length(features.not.present) > 0){
|
|
print('Everything went well except the fact that some of the features genes were not present in the data. These are:')
|
|
print(paste(features.not.present, collapse = ", "))
|
|
}else{
|
|
print("Everything went smooth. Cleaning up and returning the predictions ... ")
|
|
}
|
|
file.remove(c(model.fname, data.fname, predictions.fname, pca.fname))
|
|
setwd(current.wd)
|
|
return(predictions$X0)
|
|
}
|
|
return(NULL)
|
|
}
|
|
|
|
# a function that takes 3D coordinates (e.g. from diffusion map) and generates an
|
|
# interactive html page
|
|
make_3D_interactive_page = function(data_frame_3D, tool_addr, python.addr, save.to){
|
|
data.frame.fname = paste(sample(LETTERS, 20, T), collapse = '')
|
|
data.frame.fname = paste(data.frame.fname, ".csv", sep = "")
|
|
save.to = file.path(getwd(), save.to)
|
|
command = gsub(pattern="save.to", replacement=save.to, x=paste(python.addr, './html_WebGL_3D_viewer.py save.to', sep = " "))
|
|
command = paste(command, data.frame.fname, sep = " ")
|
|
current.wd = getwd()
|
|
setwd(file.path(tool_addr, "interactive_3D_viewer"))
|
|
write.csv(data_frame_3D, data.frame.fname, row.names = F)
|
|
system(command, wait=T)
|
|
file.remove(data.frame.fname)
|
|
setwd(current.wd)
|
|
}
|
|
|
|
# a function that takes 2D coordinates and generates an interactive html page
|
|
make_2D_interactive_page = function(data_frame_2D, tool_addr, python.addr, save.to="./"){
|
|
data.frame.fname = paste(sample(LETTERS, 20, T), collapse = '')
|
|
data.frame.fname = paste(data.frame.fname, ".csv", sep = "")
|
|
save.to = file.path(getwd(), save.to)
|
|
command = gsub(pattern="save.to", replacement=save.to, x=paste(python.addr, './html_WebGL_2D_viewer.py save.to', sep = " "))
|
|
command = paste(command, data.frame.fname, sep = " ")
|
|
current.wd = getwd()
|
|
setwd(file.path(tool_addr, "interactive_2D_viewer"))
|
|
write.csv(data_frame_2D, data.frame.fname, row.names = F)
|
|
system(command, wait=T)
|
|
file.remove(data.frame.fname)
|
|
setwd(current.wd)
|
|
}
|
|
|
|
# a function that create interactic html pages to explore gene expression in data parsed by different categories
|
|
create_gene_expression_viewer_apps = function(seurat.obj, dim.type = 'umap', save.to, tool_addr, python.addr, categories.colours=NA){
|
|
categories = c("cell.labels", "fetal.ids", "sort.ids", "lanes", "stages", "gender", "doublets")
|
|
if(is.na(categories.colours)){
|
|
categories.colours = rep(NA, length(categories))
|
|
}
|
|
categories.data = as.data.frame(seurat.obj@meta.data[names(seurat.obj@ident), categories])
|
|
for(j in 1:length(categories)){
|
|
category = categories[j]
|
|
category.colour.scheme = categories.colours[j]
|
|
if (!is.na(category.colour.scheme)){
|
|
category.colour.scheme = read.csv(category.colour.scheme)
|
|
category.colour.scheme = mapvalues(x=categories.data[, category], from=as.vector(unique(category.colour.scheme$CellTypes)), to=as.vector(unique(category.colour.scheme$Colours)))
|
|
}else{
|
|
category.colour.scheme = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(as.vector(unique(categories.data[, category])))))
|
|
category.colour.scheme = mapvalues(x=categories.data[, category], from=as.vector(unique(categories.data[, category])), to=category.colour.scheme)
|
|
}
|
|
category = paste(category, "colours", sep = "_")
|
|
categories.data[, category] = category.colour.scheme
|
|
}
|
|
eval(parse(text = sprintf("dim.data = seurat.obj@dr$%s@cell.embeddings[names(seurat.obj@ident), 1:2]", dim.type)))
|
|
n.categories = length(categories)
|
|
|
|
genes.by.file = round( 20 * 26149846 / ncol(seurat.obj@data))
|
|
approx.no.of.files = round(nrow(seurat.obj@data) / genes.by.file)
|
|
gene.names = sort(rownames(seurat.obj@data))
|
|
gene.names = sort(seurat.obj@var.genes)
|
|
gene.splits = split(gene.names, sort(1:length(gene.names) %% approx.no.of.files))
|
|
curdir = getwd()
|
|
unlink(x=save.to, recursive=T)
|
|
dir.create(save.to)
|
|
setwd(file.path(tool_addr, 'gene_expression_viewer_apps'))
|
|
folder_name = paste(sample(LETTERS, 20, T), collapse = "")
|
|
dir.create(folder_name)
|
|
for (l in 1:length(gene.splits)){
|
|
gene.split = unlist(gene.splits[[l]])
|
|
expression.data = as.data.frame(as.matrix(t(seurat.obj@data[gene.split, names(seurat.obj@ident)])))
|
|
expression.data = cbind(dim.data, categories.data, expression.data)
|
|
first_gene = gene.split[1]
|
|
last_gene = gene.split[length(gene.split)]
|
|
print(sprintf("Creating app for: %s", paste(c(first_gene, "to", last_gene), collapse = "_")))
|
|
expression.data.fname = paste(c(first_gene, "to", last_gene), collapse = "_")
|
|
expression.data.fname = paste(expression.data.fname, ".csv", sep = "")
|
|
expression.data.fname = file.path(folder_name, expression.data.fname)
|
|
save_to = paste(c(first_gene, "to", last_gene), collapse = "_")
|
|
save_to = paste(save_to, ".html", sep = "")
|
|
save_to = file.path(file.path(curdir, save.to), save_to)
|
|
command = sprintf("%s gene_expression_viewer_apps.py %s %s %s", python.addr, save_to, expression.data.fname, n.categories)
|
|
write.csv(expression.data, expression.data.fname, row.names = F)
|
|
system(command, wait = T)
|
|
}
|
|
unlink(x=folder_name, recursive=T, force=T)
|
|
setwd(curdir)
|
|
}
|
|
|
|
# a plotting function for indexed legend
|
|
plot.indexed.legend = function(label.vector, color.vector, ncols = 2, left.limit = 3.4, symbol.size = 8, text.size = 10, padH = 1, padV = 1, padRight = 0){
|
|
if (length(label.vector) != length(color.vector)){
|
|
stop("number of labels is different from number colors\nAdvice: learn to count!")
|
|
}
|
|
if (length(ncol) > length(label.vector)){
|
|
stop("You cannot have more columns than labels\nSolution: Learn to count")
|
|
}
|
|
indices.vector = 1:length(label.vector)
|
|
label.no = length(label.vector)
|
|
nrows = ceiling(label.no / ncols)
|
|
legend.frame = data.frame(X = rep(0, label.no), Y = rep(0, label.no), CS = color.vector, Txt = label.vector)
|
|
legend.frame$X = rep(1:ncols, each=nrows)[1:nrow(legend.frame)]
|
|
legend.frame$Y = rep(nrows:1, times = ncols)[1:nrow(legend.frame)]
|
|
Xrange = range(legend.frame$X)
|
|
Yrange = range(legend.frame$Y)
|
|
plot.obj = ggplot(data = legend.frame, aes(x = X, y = Y))
|
|
plot.obj = plot.obj + geom_point(size = symbol.size, colour = color.vector)
|
|
plot.obj = plot.obj + scale_x_continuous(limits = c(Xrange[1] - padRight, Xrange[2] + padH))
|
|
plot.obj = plot.obj + scale_y_continuous(limits = c(Yrange[1] - padV, Yrange[2] + padV))
|
|
plot.obj = plot.obj + theme_void()
|
|
|
|
plot.obj = plot.obj + annotate("text", x=legend.frame$X, y = legend.frame$Y, label = indices.vector, size = text.size)
|
|
plot.obj = plot.obj + annotate("text", x=legend.frame$X+.1, y = legend.frame$Y, label=legend.frame$Txt, hjust = 0, size = text.size)
|
|
return(plot.obj)
|
|
}
|
|
|
|
# plotting function for dimensionaly-reduced data to label population by a round indexed label
|
|
dr.plot = function(point.labels, dr1, dr2, dr1.name, dr2.name, no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2, use.cols = NULL, use.labels = NULL, limits = NULL, annotate.plot = T, index.map = NA){
|
|
if(!is.na(overlay.data)){
|
|
df.dr = data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2, overlay.data=factor(df$overlay.data))
|
|
}
|
|
else{
|
|
df.dr = data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2)
|
|
}
|
|
if(is.null(use.labels)){
|
|
p.labels = sort(unique(as.vector(point.labels)))
|
|
}
|
|
else{
|
|
p.labels = use.labels
|
|
}
|
|
df.dr$Cell.Labels = factor(df.dr$Cell.Labels, levels=p.labels)
|
|
p.labels.medians = aggregate(df.dr[, 2:3], list(df.dr$Cell.Labels), median)
|
|
df.dr$Cell.Labels = mapvalues(x = df.dr$Cell.Labels, from = p.labels, to = paste(1:length(p.labels), p.labels, sep = " "))
|
|
if(is.null(use.cols)){
|
|
set.seed(random_state)
|
|
plt.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(p.labels)))
|
|
}else{
|
|
plt.colours = use.cols
|
|
}
|
|
if(is.na(index.map)){
|
|
index.map = 1:length(p.labels)
|
|
}
|
|
if(!is.na(overlay.data)){
|
|
plot.obj = ggplot(data = df.dr, aes(x = DR1, y = DR2, color = Cell.Labels, shape = factor(overlay.data, levels=overlay.data.ordered)))
|
|
print("levels=overlay.data.ordered")
|
|
print(overlay.data.ordered)
|
|
}
|
|
else{
|
|
plot.obj = ggplot(data = df.dr, aes(x = DR1, y = DR2, color = Cell.Labels))
|
|
}
|
|
# this line should give different sizes for different values in overlay.data metadata column
|
|
if(!is.na(overlay.data)){
|
|
plot.obj = plot.obj + geom_point(size=pt.size)
|
|
plot.obj = plot.obj + geom_point(data = subset(df.dr, overlay.data == overlay.data.ordered[2]))
|
|
print("overlay.data.ordered[2]")
|
|
print(overlay.data.ordered[2])
|
|
}
|
|
else{
|
|
plot.obj = plot.obj + geom_point(size=pt.size)
|
|
}
|
|
plot.obj = plot.obj + scale_color_manual(values=plt.colours)
|
|
if(annotate.plot){
|
|
if(!is.na(overlay.data)){
|
|
plot.obj = plot.obj + geom_point(data=p.labels.medians,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = plt.colours, alpha = .5, pch = 21, shape=factor(df.dr$overlay.data, levels=overlay.data.ordered))
|
|
}
|
|
else{
|
|
plot.obj = plot.obj + geom_point(data=p.labels.medians,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = plt.colours, alpha = .5, pch = 21)
|
|
}
|
|
plot.obj = plot.obj + annotate("text", x=p.labels.medians$DR1, y = p.labels.medians$DR2, label = index.map, size = txt.lb.size)
|
|
|
|
}
|
|
if (no.legend){
|
|
plot.obj = plot.obj + theme(legend.position="none")
|
|
}else{
|
|
plot.obj = plot.obj + guides(color = guide_legend(override.aes = list(size=5)))
|
|
}
|
|
plot.obj = plot.obj + xlab(dr1.name) + ylab(dr2.name)
|
|
if(!is.null(limits)){
|
|
X0 = limits[1]; X1 = limits[2]; Y0 = limits[3]; Y1 = limits[4];
|
|
plot.obj = plot.obj + scale_x_continuous(limits = c(X0, X1))
|
|
plot.obj = plot.obj + scale_y_continuous(limits = c(Y0, Y1))
|
|
}
|
|
return(plot.obj)
|
|
}
|
|
|
|
|
|
# plotting function for dimensionaly-reduced data to label population by a round indexed label
|
|
dr.plot.numerical = function(point.labels, dr1, dr2, dr1.name, dr2.name, no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2, use.cols = NULL, use.labels = NULL, limits = NULL, annotate.plot = T, index.map = NA){
|
|
df.dr = data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2)
|
|
if(is.null(use.labels)){
|
|
p.labels = sort(unique(as.vector(point.labels)))
|
|
}
|
|
else{
|
|
p.labels = use.labels
|
|
}
|
|
df.dr$Cell.Labels = factor(df.dr$Cell.Labels, levels=p.labels)
|
|
p.labels.medians = aggregate(df.dr[, 2:3], list(df.dr$Cell.Labels), median)
|
|
#df.dr$Cell.Labels = mapvalues(x = df.dr$Cell.Labels, from = p.labels, to = paste(1:length(p.labels), p.labels, sep = " "))
|
|
if(is.null(use.cols)){
|
|
set.seed(random_state)
|
|
plt.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(p.labels)))
|
|
}else{
|
|
plt.colours = use.cols
|
|
}
|
|
if(is.na(index.map)){
|
|
#index.map = 1:length(p.labels)
|
|
index.map = p.labels
|
|
}
|
|
plot.obj = ggplot(data = df.dr, aes(x = DR1, y = DR2, color = Cell.Labels))
|
|
plot.obj = plot.obj + geom_point(size = pt.size)
|
|
plot.obj = plot.obj + scale_color_manual(values=plt.colours)
|
|
if(annotate.plot){
|
|
plot.obj = plot.obj + geom_point(data=p.labels.medians,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = plt.colours, alpha = .5, pch = 21)
|
|
plot.obj = plot.obj + annotate("text", x=p.labels.medians$DR1, y = p.labels.medians$DR2, label = index.map, size = txt.lb.size)
|
|
}
|
|
if (no.legend){
|
|
plot.obj = plot.obj + theme(legend.position="none")
|
|
}else{
|
|
plot.obj = plot.obj + guides(color = guide_legend(override.aes = list(size=5)))
|
|
}
|
|
plot.obj = plot.obj + xlab(dr1.name) + ylab(dr2.name)
|
|
if(!is.null(limits)){
|
|
X0 = limits[1]; X1 = limits[2]; Y0 = limits[3]; Y1 = limits[4];
|
|
plot.obj = plot.obj + scale_x_continuous(limits = c(X0, X1))
|
|
plot.obj = plot.obj + scale_y_continuous(limits = c(Y0, Y1))
|
|
}
|
|
return(plot.obj)
|
|
}
|