scRNA-seq_analysis

This commit is contained in:
veghp 2019-07-08 12:22:01 +01:00
commit 82cc2d191e
188 changed files with 146184 additions and 0 deletions

354
tools/bunddle_utils.R Executable file
View file

@ -0,0 +1,354 @@
tool_addr = "../../tools"
python.addr = "python3.6"
# a function to compute force-directed graph; return coordinates as a data frame
# arguments:
# pca.df: pca data as a data frame
# snn : a nearest-neighbor graph as a sparse data matrix
runFDG = function(pca.df, snn, iterations = 600, tool_addr, python.addr){
current.wd = getwd()
setwd(file.path(tool_addr, "force_abstract_graph_2D"))
# generate unique name for pca data file
pca.data.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
pca.data.fname = paste(pca.data.fname, ".csv", sep = "")
# generate unique name for snn file
snn.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
snn.fname = paste(snn.fname, ".smm", sep = "")
# generate unique name for fdg coordinates
fdg.coordinates.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
fdg.coordinates.fname = paste(fdg.coordinates.fname, ".csv", sep = "")
write.csv(pca.df, pca.data.fname)
writeMM(obj=snn, file=snn.fname)
command = gsub(pattern="ITER", replacement=as.character(iterations), paste(python.addr, "./make_fdg.py ITER", sep = " "))
command = paste(command, paste(c(pca.data.fname, snn.fname, fdg.coordinates.fname), collapse = " "), sep = " ")
system(command, wait = T)
fdg_coordinates = read.csv(fdg.coordinates.fname, header = FALSE)
colnames(fdg_coordinates) = c("X", "Y")
rownames(fdg_coordinates) = rownames(pca.df)
file.remove(c(pca.data.fname, snn.fname, fdg.coordinates.fname))
setwd(current.wd)
return(fdg_coordinates)
}
# a function to perform UMAP on a seurat object using PCA coordinates
RunUMAP = function(pca.df, tool_addr, python.addr){
current.wd = getwd()
setwd(file.path(tool_addr, "umap"))
print("writting pca data to disk...")
# generate unique name for pca data file
pca.data.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
pca.data.fname = paste(pca.data.fname, ".csv", sep = "")
# generate unique name for umap coordinates data file
umap.coordinates.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
umap.coordinates.fname = paste(umap.coordinates.fname, ".csv", sep = "")
write.csv(pca.df, pca.data.fname)
print("perfoming UMAP")
command = paste(python.addr, "umap_compute.py", sep = " ")
command = paste(command, pca.data.fname, sep = ' ')
command = paste(command, umap.coordinates.fname, ' ')
system(command, wait = T)
print("Reading results...")
umap.coordinates = read.csv(umap.coordinates.fname, stringsAsFactors = F)
file.remove(c(pca.data.fname, umap.coordinates.fname))
setwd(current.wd)
umap.coordinates = umap.coordinates[, c("UMAPx", "UMAPy")]
return(umap.coordinates)
}
# a function that load a SVM models and make predictions like cell types or doublet.singlets
Apply_Classifier_On_Seurat_Object = function(seurat.obj, classifier.fname, tool_addr, python.addr){
current.wd = getwd()
setwd(file.path(tool_addr, 'predict_by_classifier'))
predictor.addr = file.path("../../resources", classifier.fname, sep = "")
print(predictor.addr)
if(!dir.exists(predictor.addr)){
print("classifier does not exists")
available.classifiers = list.dirs("../../resources", full.names=F)
available.classifiers = available.classifiers[grepl("classifier_", available.classifiers)]
print(paste("Available doublets identifiers: ", paste(available.classifiers, collapse = ", "), sep = ""))
setwd(current.wd)
return(NULL)
}
tryCatch({
OK = FALSE
print("reading feature genes ...")
feature.genes = readRDS(file.path(predictor.addr, "feature_genes.RDS"))
features.present = feature.genes[feature.genes %in% rownames(seurat.obj@data)]
features.not.present = feature.genes[!(feature.genes %in% rownames(seurat.obj@data))]
expr.data = as.data.frame(t(as.matrix(seurat.obj@data[features.present, ])))
if (length(features.not.present) > 0){
zeros.data = as.data.frame(t(as.matrix(seurat.obj@data[1:length(features.not.present),])))
zeros.data[] = 0
colnames(zeros.data) = features.not.present
expr.data=cbind(expr.data, zeros.data)
expr.data = expr.data[, feature.genes]
}
print("Writting data to disk ... ")
data.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
data.fname = paste(data.fname, ".csv", sep = '')
write.csv(expr.data, data.fname)
print(sprintf("Dims are: %s", dim(expr.data)))
print("Copying pickle file ... ")
model.fname = paste(sample(LETTERS, 20, T), collapse = '')
model.fname = paste(model.fname, '.pickle', sep = "")
file.copy(from=file.path(predictor.addr, "model.pickle"), to=model.fname)
pca.fname = paste(sample(LETTERS, 20, T), collapse = '')
pca.fname = paste(pca.fname, '.pickle', sep = "")
file.copy(from=file.path(predictor.addr, "pca.pickle"), to=pca.fname)
print("Running classifier in Python ... ")
predictions.fname = paste(sample(LETTERS, 20, TRUE), collapse = "")
predictions.fname = paste(predictions.fname, ".csv", sep = "")
command = paste(python.addr, "predict.py", sep = " ")
command = paste(command, paste(c(model.fname, data.fname, predictions.fname, pca.fname), collapse = " "), sep = " ")
system(command, wait = T)
predictions = read.csv(predictions.fname)
OK = TRUE
},
warning = function(warning_conditions){print("")},
error = function(error_condition){
print("Errors occured. Cleaning up and then returning NULL ...")
file.remove(c(model.fname, data.fname, predictions.fname, pca.fname))
setwd(current.wd)
})
if(OK){
if(length(features.not.present) > 0){
print('Everything went well except the fact that some of the features genes were not present in the data. These are:')
print(paste(features.not.present, collapse = ", "))
}else{
print("Everything went smooth. Cleaning up and returning the predictions ... ")
}
file.remove(c(model.fname, data.fname, predictions.fname, pca.fname))
setwd(current.wd)
return(predictions$X0)
}
return(NULL)
}
# a function that takes 3D coordinates (e.g. from diffusion map) and generates an
# interactive html page
make_3D_interactive_page = function(data_frame_3D, tool_addr, python.addr, save.to){
data.frame.fname = paste(sample(LETTERS, 20, T), collapse = '')
data.frame.fname = paste(data.frame.fname, ".csv", sep = "")
save.to = file.path(getwd(), save.to)
command = gsub(pattern="save.to", replacement=save.to, x=paste(python.addr, './html_WebGL_3D_viewer.py save.to', sep = " "))
command = paste(command, data.frame.fname, sep = " ")
current.wd = getwd()
setwd(file.path(tool_addr, "interactive_3D_viewer"))
write.csv(data_frame_3D, data.frame.fname, row.names = F)
system(command, wait=T)
file.remove(data.frame.fname)
setwd(current.wd)
}
# a function that takes 2D coordinates and generates an interactive html page
make_2D_interactive_page = function(data_frame_2D, tool_addr, python.addr, save.to="./"){
data.frame.fname = paste(sample(LETTERS, 20, T), collapse = '')
data.frame.fname = paste(data.frame.fname, ".csv", sep = "")
save.to = file.path(getwd(), save.to)
command = gsub(pattern="save.to", replacement=save.to, x=paste(python.addr, './html_WebGL_2D_viewer.py save.to', sep = " "))
command = paste(command, data.frame.fname, sep = " ")
current.wd = getwd()
setwd(file.path(tool_addr, "interactive_2D_viewer"))
write.csv(data_frame_2D, data.frame.fname, row.names = F)
system(command, wait=T)
file.remove(data.frame.fname)
setwd(current.wd)
}
# a function that create interactic html pages to explore gene expression in data parsed by different categories
create_gene_expression_viewer_apps = function(seurat.obj, dim.type = 'umap', save.to, tool_addr, python.addr, categories.colours=NA){
categories = c("cell.labels", "fetal.ids", "sort.ids", "lanes", "stages", "gender", "doublets")
if(is.na(categories.colours)){
categories.colours = rep(NA, length(categories))
}
categories.data = as.data.frame(seurat.obj@meta.data[names(seurat.obj@ident), categories])
for(j in 1:length(categories)){
category = categories[j]
category.colour.scheme = categories.colours[j]
if (!is.na(category.colour.scheme)){
category.colour.scheme = read.csv(category.colour.scheme)
category.colour.scheme = mapvalues(x=categories.data[, category], from=as.vector(unique(category.colour.scheme$CellTypes)), to=as.vector(unique(category.colour.scheme$Colours)))
}else{
category.colour.scheme = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(as.vector(unique(categories.data[, category])))))
category.colour.scheme = mapvalues(x=categories.data[, category], from=as.vector(unique(categories.data[, category])), to=category.colour.scheme)
}
category = paste(category, "colours", sep = "_")
categories.data[, category] = category.colour.scheme
}
eval(parse(text = sprintf("dim.data = seurat.obj@dr$%s@cell.embeddings[names(seurat.obj@ident), 1:2]", dim.type)))
n.categories = length(categories)
genes.by.file = round( 20 * 26149846 / ncol(seurat.obj@data))
approx.no.of.files = round(nrow(seurat.obj@data) / genes.by.file)
gene.names = sort(rownames(seurat.obj@data))
gene.names = sort(seurat.obj@var.genes)
gene.splits = split(gene.names, sort(1:length(gene.names) %% approx.no.of.files))
curdir = getwd()
unlink(x=save.to, recursive=T)
dir.create(save.to)
setwd(file.path(tool_addr, 'gene_expression_viewer_apps'))
folder_name = paste(sample(LETTERS, 20, T), collapse = "")
dir.create(folder_name)
for (l in 1:length(gene.splits)){
gene.split = unlist(gene.splits[[l]])
expression.data = as.data.frame(as.matrix(t(seurat.obj@data[gene.split, names(seurat.obj@ident)])))
expression.data = cbind(dim.data, categories.data, expression.data)
first_gene = gene.split[1]
last_gene = gene.split[length(gene.split)]
print(sprintf("Creating app for: %s", paste(c(first_gene, "to", last_gene), collapse = "_")))
expression.data.fname = paste(c(first_gene, "to", last_gene), collapse = "_")
expression.data.fname = paste(expression.data.fname, ".csv", sep = "")
expression.data.fname = file.path(folder_name, expression.data.fname)
save_to = paste(c(first_gene, "to", last_gene), collapse = "_")
save_to = paste(save_to, ".html", sep = "")
save_to = file.path(file.path(curdir, save.to), save_to)
command = sprintf("%s gene_expression_viewer_apps.py %s %s %s", python.addr, save_to, expression.data.fname, n.categories)
write.csv(expression.data, expression.data.fname, row.names = F)
system(command, wait = T)
}
unlink(x=folder_name, recursive=T, force=T)
setwd(curdir)
}
# a plotting function for indexed legend
plot.indexed.legend = function(label.vector, color.vector, ncols = 2, left.limit = 3.4, symbol.size = 8, text.size = 10, padH = 1, padV = 1, padRight = 0){
if (length(label.vector) != length(color.vector)){
stop("number of labels is different from number colors\nAdvice: learn to count!")
}
if (length(ncol) > length(label.vector)){
stop("You cannot have more columns than labels\nSolution: Learn to count")
}
indices.vector = 1:length(label.vector)
label.no = length(label.vector)
nrows = ceiling(label.no / ncols)
legend.frame = data.frame(X = rep(0, label.no), Y = rep(0, label.no), CS = color.vector, Txt = label.vector)
legend.frame$X = rep(1:ncols, each=nrows)[1:nrow(legend.frame)]
legend.frame$Y = rep(nrows:1, times = ncols)[1:nrow(legend.frame)]
Xrange = range(legend.frame$X)
Yrange = range(legend.frame$Y)
plot.obj = ggplot(data = legend.frame, aes(x = X, y = Y))
plot.obj = plot.obj + geom_point(size = symbol.size, colour = color.vector)
plot.obj = plot.obj + scale_x_continuous(limits = c(Xrange[1] - padRight, Xrange[2] + padH))
plot.obj = plot.obj + scale_y_continuous(limits = c(Yrange[1] - padV, Yrange[2] + padV))
plot.obj = plot.obj + theme_void()
plot.obj = plot.obj + annotate("text", x=legend.frame$X, y = legend.frame$Y, label = indices.vector, size = text.size)
plot.obj = plot.obj + annotate("text", x=legend.frame$X+.1, y = legend.frame$Y, label=legend.frame$Txt, hjust = 0, size = text.size)
return(plot.obj)
}
# plotting function for dimensionaly-reduced data to label population by a round indexed label
dr.plot = function(point.labels, dr1, dr2, dr1.name, dr2.name, no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2, use.cols = NULL, use.labels = NULL, limits = NULL, annotate.plot = T, index.map = NA){
if(!is.na(overlay.data)){
df.dr = data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2, overlay.data=factor(df$overlay.data))
}
else{
df.dr = data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2)
}
if(is.null(use.labels)){
p.labels = sort(unique(as.vector(point.labels)))
}
else{
p.labels = use.labels
}
df.dr$Cell.Labels = factor(df.dr$Cell.Labels, levels=p.labels)
p.labels.medians = aggregate(df.dr[, 2:3], list(df.dr$Cell.Labels), median)
df.dr$Cell.Labels = mapvalues(x = df.dr$Cell.Labels, from = p.labels, to = paste(1:length(p.labels), p.labels, sep = " "))
if(is.null(use.cols)){
set.seed(random_state)
plt.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(p.labels)))
}else{
plt.colours = use.cols
}
if(is.na(index.map)){
index.map = 1:length(p.labels)
}
if(!is.na(overlay.data)){
plot.obj = ggplot(data = df.dr, aes(x = DR1, y = DR2, color = Cell.Labels, shape = factor(overlay.data, levels=overlay.data.ordered)))
print("levels=overlay.data.ordered")
print(overlay.data.ordered)
}
else{
plot.obj = ggplot(data = df.dr, aes(x = DR1, y = DR2, color = Cell.Labels))
}
# this line should give different sizes for different values in overlay.data metadata column
if(!is.na(overlay.data)){
plot.obj = plot.obj + geom_point(size=pt.size)
plot.obj = plot.obj + geom_point(data = subset(df.dr, overlay.data == overlay.data.ordered[2]))
print("overlay.data.ordered[2]")
print(overlay.data.ordered[2])
}
else{
plot.obj = plot.obj + geom_point(size=pt.size)
}
plot.obj = plot.obj + scale_color_manual(values=plt.colours)
if(annotate.plot){
if(!is.na(overlay.data)){
plot.obj = plot.obj + geom_point(data=p.labels.medians,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = plt.colours, alpha = .5, pch = 21, shape=factor(df.dr$overlay.data, levels=overlay.data.ordered))
}
else{
plot.obj = plot.obj + geom_point(data=p.labels.medians,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = plt.colours, alpha = .5, pch = 21)
}
plot.obj = plot.obj + annotate("text", x=p.labels.medians$DR1, y = p.labels.medians$DR2, label = index.map, size = txt.lb.size)
}
if (no.legend){
plot.obj = plot.obj + theme(legend.position="none")
}else{
plot.obj = plot.obj + guides(color = guide_legend(override.aes = list(size=5)))
}
plot.obj = plot.obj + xlab(dr1.name) + ylab(dr2.name)
if(!is.null(limits)){
X0 = limits[1]; X1 = limits[2]; Y0 = limits[3]; Y1 = limits[4];
plot.obj = plot.obj + scale_x_continuous(limits = c(X0, X1))
plot.obj = plot.obj + scale_y_continuous(limits = c(Y0, Y1))
}
return(plot.obj)
}
# plotting function for dimensionaly-reduced data to label population by a round indexed label
dr.plot.numerical = function(point.labels, dr1, dr2, dr1.name, dr2.name, no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2, use.cols = NULL, use.labels = NULL, limits = NULL, annotate.plot = T, index.map = NA){
df.dr = data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2)
if(is.null(use.labels)){
p.labels = sort(unique(as.vector(point.labels)))
}
else{
p.labels = use.labels
}
df.dr$Cell.Labels = factor(df.dr$Cell.Labels, levels=p.labels)
p.labels.medians = aggregate(df.dr[, 2:3], list(df.dr$Cell.Labels), median)
#df.dr$Cell.Labels = mapvalues(x = df.dr$Cell.Labels, from = p.labels, to = paste(1:length(p.labels), p.labels, sep = " "))
if(is.null(use.cols)){
set.seed(random_state)
plt.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(p.labels)))
}else{
plt.colours = use.cols
}
if(is.na(index.map)){
#index.map = 1:length(p.labels)
index.map = p.labels
}
plot.obj = ggplot(data = df.dr, aes(x = DR1, y = DR2, color = Cell.Labels))
plot.obj = plot.obj + geom_point(size = pt.size)
plot.obj = plot.obj + scale_color_manual(values=plt.colours)
if(annotate.plot){
plot.obj = plot.obj + geom_point(data=p.labels.medians,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = plt.colours, alpha = .5, pch = 21)
plot.obj = plot.obj + annotate("text", x=p.labels.medians$DR1, y = p.labels.medians$DR2, label = index.map, size = txt.lb.size)
}
if (no.legend){
plot.obj = plot.obj + theme(legend.position="none")
}else{
plot.obj = plot.obj + guides(color = guide_legend(override.aes = list(size=5)))
}
plot.obj = plot.obj + xlab(dr1.name) + ylab(dr2.name)
if(!is.null(limits)){
X0 = limits[1]; X1 = limits[2]; Y0 = limits[3]; Y1 = limits[4];
plot.obj = plot.obj + scale_x_continuous(limits = c(X0, X1))
plot.obj = plot.obj + scale_y_continuous(limits = c(Y0, Y1))
}
return(plot.obj)
}