scRNA-seq_analysis

This commit is contained in:
veghp 2019-07-08 12:22:01 +01:00
commit 82cc2d191e
188 changed files with 146184 additions and 0 deletions

View file

@ -0,0 +1,69 @@
# Parse the command line.
# All trailing words are glued back together and then cut at ";", so that each
# element of `args` is one R assignment such as "seurat.addr = 'file.RDS'".
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))
# Template mapping each positional argument to a "<name>.arg" holder variable.
arguments.list = "
seurat.addr.arg = args[1]
"
expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]
if(length(args) != length(expected_arguments)){
  # Report how many parameters are needed and list their names.
  # (The previous sprintf() call had a %s placeholder but no value for it, so
  # the script died with a formatting error instead of this usage message.)
  error.msg = sprintf('This pipeline requires %s parameters: ', as.character(length(expected_arguments)))
  expected_names = paste(unlist(lapply(strsplit(expected_arguments, ".arg", fixed = TRUE), "[", 1)), collapse = "\n")
  stop(paste0(error.msg, "\n", expected_names))
}
# Creates the "<name>.arg" variables, each holding the raw assignment text.
eval(parse(text = arguments.list))
for(n in seq_along(expected_arguments)){
  argument = expected_arguments[n]
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  # strip only the literal trailing ".arg" suffix
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  # NOTE(review): this evaluates text taken from the command line as R code;
  # only run the pipeline with trusted arguments.
  eval(parse(text = argument.content))
  # inherits = FALSE: the variable must have been created by the line above,
  # a same-named function in an attached package does not count.
  if (!exists(variable.name, inherits = FALSE)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# Create the folders used for output and intermediate ("material") files.
# Folder name = <working dir name without numeric prefix>_<seurat.addr>_<time stamp>
run.label = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
run.label = paste(run.label, seurat.addr, sep = "_")
# Turn the current time into a compact stamp by deleting separators one by one,
# then dropping the first two characters.
c.time = Sys.time()
for (separator in c(" BST", ":", " ", "-")){
  c.time = gsub(pattern=separator, replacement="", x=c.time)
}
c.time = substr(x=c.time, start=3, stop=nchar(c.time))
output_folder = file.path("../../output", paste(run.label, c.time, sep = "_"))
dir.create(output_folder)
output_folder_material = file.path(output_folder, "material")
dir.create(output_folder_material)
# From here on seurat.addr is the path of the input file inside the data folder.
seurat.addr = file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library(RColorBrewer)
library(dplyr)
library(plyr)
#######################################################################################################
# load data
# Reads the serialized object stored at seurat.addr (an RDS file under ../../data).
print("loading data ... ")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")
# Delete the "material" work folder and everything in it; the output folder itself is kept.
unlink(output_folder_material, recursive=T, force=T)
print("Ended beautifully ... ")

View file

@ -0,0 +1,285 @@
# Read the command line: the trailing words are concatenated and split at ";",
# giving one "name = value" R assignment per element.
args = unlist(strsplit(paste(commandArgs(trailingOnly=T), collapse = ""), ";"))
if(length(args) != 8){
  stop('This pipeline requires 8 parameters: organ, ProjectName, save.at (name of RDS file where processed data are saved), sequencing.types (normal, 5GEX or VDJ), annotate.cells (boolean), identify.doublets (boolean), cell.type.SVM (folder name where cell type svm classifier for given organ is stored), doublet.svm (folder name where singlet/doublet svm classifier for given organ is stored);')
}
# Template binding every positional argument to a "<name>.arg" holder variable.
arguments.list = "
organ.arg = args[1]
ProjectName.arg = args[2]
save.at.arg = args[3]
sequencing.types.arg = args[4]
annotate.cells.arg = args[5]
identify.doublets.arg = args[6]
cell.type.SVM.arg = args[7]
doublet.svm.arg = args[8]
"
eval(parse(text = arguments.list))
# Keep the non-empty template lines, one per expected argument.
arguments.list = unlist(strsplit(arguments.list, "\n"))
arguments.list = arguments.list[!(arguments.list == "")]
for(template.line in arguments.list){
  # left-hand side of the template line, e.g. "organ.arg"
  compact.line = gsub(pattern=" ", replacement="", x=template.line)
  argument.name = strsplit(compact.line, "=")[[1]][1]
  variable.name = gsub(pattern=".arg", replacement="", argument.name)
  # the holder variable contains the assignment text passed by the user; run it
  argument.content = eval(parse(text = argument.name))
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# Create the output folder for this run.
# Folder name = <working dir name without numeric prefix>_<save.at>_<time stamp>
run.label = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
run.label = paste(run.label, save.at, sep = "_")
# Compact time stamp: delete the separators one by one, then drop the first two characters.
c.time = Sys.time()
for (separator in c(" BST", ":", " ", "-")){
  c.time = gsub(pattern=separator, replacement="", x=c.time)
}
c.time = substr(x=c.time, start=3, stop=nchar(c.time))
output_folder = file.path("../../output", paste(run.label, c.time, sep = "_"))
dir.create(output_folder)
source("../../tools/bunddle_utils.R")
# sequencing types arrive as one "-" separated string
sequencing.types = unlist(strsplit(sequencing.types, "-"))
# from here on save.at is the path of the RDS file inside the data folder
save.at = file.path("../../data", save.at)
key.fname = "../../resources/key.csv"
sample.key.fname = "../../resources/sample_key.csv"
library(Seurat)
library(plyr)
library(dplyr)
# Load 10x data sequentially from several samples and merge them into one seurat object.
#   folders    - vector of supplier sample names (matched against key$SUPPLIER.SAMPLE.NAME)
#   inside     - sub-path appended to each sample folder
#   key        - data frame with SUPPLIER.SAMPLE.NAME, SANGER.SAMPLE.ID and Prelabel columns
#   sample.key - accepted for interface compatibility; not used in the body
# Returns the merged seurat object. No filtering is applied.
load_10x_data_from_folders = function(folders, inside = "filtered/GRCh38/", key, sample.key){
  # Look a sample up in the key, announce it and read its count matrix.
  # If reading <folder>/<inside> fails, retry with only the first two path components.
  read.sample = function(sample.label){
    in.key = key$SUPPLIER.SAMPLE.NAME == sample.label
    prelabel = key$Prelabel[in.key]
    folder = file.path(key$SANGER.SAMPLE.ID[in.key], inside)
    print(sprintf("Loading sample %s from %s", prelabel, folder))
    counts = tryCatch({
      Read10X(folder)
    }, error = function(e){
      path.parts = unlist(strsplit(folder, '/'))
      Read10X(file.path(path.parts[1], path.parts[2]))
    })
    list(prelabel = prelabel, counts = counts)
  }
  nfolders = length(folders)
  # first sample: the prelabel is prepended to the cell names by hand
  first = read.sample(folders[1])
  first.counts = first$counts
  colnames(first.counts) = paste(first$prelabel, colnames(first.counts), sep = "_")
  seurat.obj = CreateSeuratObject(raw.data = first.counts, min.cells = 0, min.genes = 0, project = "")
  # a single sample needs no merging
  if (nfolders == 1){
    return(seurat.obj)
  }
  # remaining samples: AddSamples attaches the prelabel through add.cell.id
  for (sample.label in folders[-1]){
    print(sample.label)
    nxt = read.sample(sample.label)
    seurat.obj = AddSamples(object=seurat.obj, new.data=nxt$counts, project=ProjectName, min.cells=0, min.genes=0, do.normalize=F, do.scale=F, do.center=F, add.cell.id=nxt$prelabel)
    print(seurat.obj)
  }
  # sequential merging leaves repeated underscores in the cell names:
  # split on "_", drop the empty pieces and glue the rest back together
  name.parts = strsplit(colnames(seurat.obj@raw.data), "_")
  cell.names = unlist(lapply(name.parts, function(parts){
    paste(parts[parts != ""], collapse="_")
  }))
  colnames(seurat.obj@data) = cell.names
  colnames(seurat.obj@raw.data) = cell.names
  names(seurat.obj@ident) = cell.names
  return(seurat.obj)
}
# Add meta data to a seurat object by parsing its cell names.
# Cell names are split on "_"; fields 1-4 give fetal id, tissue, sort id and lane.
# Stage, sample type and display name come from sample.key; the anatomical part
# comes from key. Gender is the second "_" field of the mapped fetal name.
add.meta.data = function(seurat.obj, sample.key, key){
  name.parts = strsplit(colnames(seurat.obj@data), "_")
  # factor built from the n-th "_" separated field of every cell name
  name.field = function(position){
    as.factor(unlist(lapply(name.parts, "[", position)))
  }
  fetal.ids = name.field(1)
  tissue    = name.field(2)
  sort.ids  = name.field(3)
  lanes     = name.field(4)
  # rows of the sample key that describe samples present in this object
  key.key = which(sample.key$Sample %in% levels(fetal.ids))
  known.samples = sample.key$Sample[key.key]
  # stage and sample type are looked up with the raw ids, so do this before renaming
  stages      = plyr::mapvalues(x=fetal.ids, from=known.samples, to = sample.key$Stage[key.key])
  sample.type = plyr::mapvalues(x=fetal.ids, from=known.samples, to = sample.key$Type[key.key])
  fetal.ids   = plyr::mapvalues(x=fetal.ids, from=known.samples, to = sample.key$Name[key.key])
  # gender is the second field of the mapped fetal name
  gender = as.factor(unlist(lapply(strsplit(as.vector(fetal.ids), "_"), "[", 2)))
  # anatomical part, looked up by lane (sanger sample id)
  unique.key = key[key$SANGER.SAMPLE.ID %in% as.vector(unique(lanes)), ]
  AnnatomicalPart = plyr::mapvalues(x=lanes, from=unique.key$SANGER.SAMPLE.ID, to=unique.key$AnnatomicalPart)
  # attach the new columns in a fixed order
  new.columns = list(fetal.ids = fetal.ids, sort.ids = sort.ids, tissue = tissue, lanes = lanes,
                     stages = stages, sample.type = sample.type, gender = gender,
                     AnnatomicalPart = AnnatomicalPart)
  for (column in names(new.columns)){
    seurat.obj@meta.data[[column]] = new.columns[[column]]
  }
  return(seurat.obj)
}
# Apply the project-wide filtering to a seurat object so that every dataset is
# filtered with the same criteria:
#   1. rebuild the object from raw data with min.cells / min.genes thresholds
#   2. drop cells whose mitochondrial fraction exceeds mito.genes.treshold
# The meta data after each step is saved into the global output_folder.
# project.name is accepted but not used in the body.
filter.seurat = function(seurat.obj, min.cells = 3, min.genes = 200, project.name = "", mito.genes.treshold = .2){
  print("Filtering on cell and gene numbers ... ")
  seurat.obj = CreateSeuratObject(raw.data = seurat.obj@raw.data, min.cells = min.cells,
                                  min.genes = min.genes, project = "")
  saveRDS(seurat.obj@meta.data, file.path(output_folder, 'meta_data_mingenes.RDS'))
  # fraction of counts that come from genes whose name starts with "MT-"
  raw.counts = seurat.obj@raw.data
  mito.genes = grep(pattern = "^MT-", x = rownames(x = seurat.obj@data), value = TRUE)
  percent.mito = Matrix::colSums(raw.counts[mito.genes, ])/Matrix::colSums(raw.counts)
  seurat.obj = AddMetaData(object = seurat.obj, metadata = percent.mito, col.name = "percent.mito")
  # keep cells with percent.mito up to the threshold (no lower bound)
  print("Filtering on mitochondrial genes")
  seurat.obj = FilterCells(object = seurat.obj, subset.names = c("percent.mito"), low.thresholds = c(-Inf),
                           high.thresholds = c(mito.genes.treshold))
  saveRDS(seurat.obj@meta.data, file.path(output_folder, 'meta_data_mitogenes.RDS'))
  return(seurat.obj)
}
# Load the key (one row per sequenced sample, tab separated).
# Tip: View(key) lets you look up the datasets you need.
key = read.csv(file = key.fname, stringsAsFactors = FALSE, header=T, sep="\t")
# fetus id = "F" followed by two digits inside the supplier sample name
fetus.matches = gregexpr(pattern="F[0-9]{2}", text=key$SUPPLIER.SAMPLE.NAME)
key$Fetus = unlist(regmatches(key$SUPPLIER.SAMPLE.NAME, fetus.matches))
key = key[key$Organ != "other", ]
# load the sample key and report whether it covers every fetus in the key
sample.key = read.csv(sample.key.fname, stringsAsFactors = F, sep = "\t")
print(paste("All sample names in the key are also in the sample_key: ", all(key$Fetus %in% sample.key$Sample), sep = ""))
# Build the prelabel that is attached to the cell names before each barcode:
#   <fetus>_<organ>_<gate>_<sanger sample id>
# The gate is read from the 3rd "_" field of the supplier sample name.
# NOTE(review): an earlier comment here said the gate sits at position 4 - confirm.
gate = as.vector(unlist(lapply(strsplit(key$SUPPLIER.SAMPLE.NAME, split="_"), "[", 3)))
gate = plyr::mapvalues(x = gate, from = c("CD45P", "CD45N", "TOT"),
                       to = c("CD45+", "CD45-", "Total"))
prelabel = paste(key$Fetus, key$Organ, gate, key$SANGER.SAMPLE.ID, sep = "_")
key$Prelabel = prelabel
##########################################################################################
# Keep only samples of the requested organ and sequencing type(s) that passed QC
##########################################################################################
selected = (key$Organ == organ & key$Sequencing %in% sequencing.types ) & key$Passed
data.folders = key$SUPPLIER.SAMPLE.NAME[selected]
key = key[selected, ]
print("Next is the key: ")
print(key)
# Load the count matrices. The folder names in the key are relative to the
# count-matrix directory, so switch into it for the duration of the load.
cur.dir = getwd()
setwd("../../data/sc_count_matrices/")
seurat.obj = load_10x_data_from_folders(folders=data.folders, key = key, sample.key = sample.key)
setwd(cur.dir)
# parse meta data from cell names
seurat.obj = add.meta.data(seurat.obj, sample.key = sample.key, key = key)
# Before-filtering tallies. The "lanes" table now really tabulates lanes
# (it used to repeat the per-sample table under the lanes heading).
print("Number of cells by lanes and gates before filtering:")
print(table(seurat.obj@meta.data$lanes, seurat.obj@meta.data$sort.ids))
print('Cells by samples and gates before filtering:')
print(table(seurat.obj@meta.data$fetal.ids, seurat.obj@meta.data$sort.ids))
# apply filtering
seurat.obj = filter.seurat(seurat.obj=seurat.obj, project.name="")
# filter.seurat rebuilds the object, which drops the meta data; parse it again
seurat.obj = add.meta.data(seurat.obj, sample.key = sample.key, key=key)
print('Cells by samples and gates after filtering:')
print(table(seurat.obj@meta.data$fetal.ids, seurat.obj@meta.data$sort.ids))
print('Cells by lanes and gates after filtering:')
print(table(seurat.obj@meta.data$lanes, seurat.obj@meta.data$sort.ids))
# Store externally computed coordinates as a dimensional reduction called `name`
# (sets both the cell embeddings and the key of the reduction).
store.embedding = function(object, name, coordinates){
  object = SetDimReduction(object=object, reduction.type=name, slot="cell.embeddings", new.data=as.matrix(coordinates))
  SetDimReduction(object=object, reduction.type=name, slot="key", new.data=name)
}
# normalize data
print("Normalizing data ...")
seurat.obj = NormalizeData(object = seurat.obj, normalization.method = "LogNormalize", scale.factor = 10000)
# find variable genes and report which share of all genes they are
print("Computing variable genes ...")
seurat.obj = FindVariableGenes(object = seurat.obj, mean.function = ExpMean,
                               dispersion.function = LogVMR, x.low.cutoff = .0125,
                               x.high.cutoff = 3, y.cutoff = .625)
variable.share = round(100 * length(seurat.obj@var.genes) / nrow(seurat.obj@data), digits = 2)
print(paste("Percentage of variable genes:", variable.share, sep = " "))
# scale data, otherwise pca is not possible
print("Scaling data ...")
seurat.obj = ScaleData(object=seurat.obj)
# PCA on the variable genes
print("Performing PCA ...")
seurat.obj = RunPCA(object = seurat.obj, pc.genes = seurat.obj@var.genes, do.print = FALSE, pcs.print = 1:20, genes.print = 10)
# tSNE on the first 20 components
print("Performing TSNE")
seurat.obj = RunTSNE(object=seurat.obj, dims.use=1:20, seed.use=42, do.fast=TRUE)
# UMAP is computed from the PCA embeddings and stored as reduction "umap"
print("running UMAP")
umap.coordinates = RunUMAP(pca.df=seurat.obj@dr$pca@cell.embeddings, tool_addr=tool_addr, python.addr=python.addr)
rownames(umap.coordinates) = names(seurat.obj@ident)
seurat.obj = store.embedding(seurat.obj, "umap", umap.coordinates)
# force-directed graph from the PCA embeddings and the SNN graph, stored as "fdg"
print("Running force directed graph")
seurat.obj = BuildSNN(object=seurat.obj, reduction.type="pca", dims.use=1:20, plot.SNN=F)
fdg_coordinates = runFDG(pca.df=seurat.obj@dr$pca@cell.embeddings, snn=seurat.obj@snn, iterations=600, tool_addr=tool_addr, python.addr=python.addr)
seurat.obj = store.embedding(seurat.obj, "fdg", fdg_coordinates)
# optional classifier-based annotations
if(annotate.cells){
  print("Annotating cells ... ")
  seurat.obj@meta.data$cell.labels = Apply_Classifier_On_Seurat_Object(seurat.obj=seurat.obj, classifier.fname=cell.type.SVM, tool_addr=tool_addr, python.addr=python.addr)
}
if (identify.doublets){
  print("identifying doublets")
  seurat.obj@meta.data$doublets = Apply_Classifier_On_Seurat_Object(seurat.obj=seurat.obj, classifier.fname=doublet.svm, tool_addr=tool_addr, python.addr=python.addr)
}
# save first, report afterwards
print("saving data")
saveRDS(seurat.obj, save.at)
if (identify.doublets){
  print("Doublets and singlets: ")
  print(table(seurat.obj@meta.data$fetal.ids, seurat.obj@meta.data$doublets))
}
print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N seurat_from_count_tables
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=300G
# Grid-engine job wrapper around seurat_from_count_tables.R.
# Usage: <this script> "<arg1>;<arg2>;..."   (exactly one argument: the
# ";"-separated argument string, passed on to the R script unchanged)
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters"
    exit 1
fi
# Quote the argument so the shell neither word-splits nor glob-expands it
# before it reaches Rscript.
Rscript seurat_from_count_tables.R "$1"
echo "End on $(date)"

View file

@ -0,0 +1,83 @@
from pptx import Presentation
from pptx.util import Inches, Pt
from os import listdir
from os.path import isfile
import pandas as pd
# Annotations from a previous clustering are optional: when the CSV exists it
# is loaded and shown next to each cluster on its slide.
previousAnnotation = isfile("./graphs/previous_clusters.csv")
if previousAnnotation:
    previousAnnData = pd.read_csv("./graphs/previous_clusters.csv", index_col=0)

# The number of clusters is the number of files in ./graphs whose name starts
# with "cluster".
clusters = sum(1 for fname in listdir("./graphs") if fname.startswith("cluster"))

# Start the presentation; layout 6 is used for every slide.
prs = Presentation()
black_slide_layout = prs.slide_layouts[6]


def _write_paragraph(text_frame, text, size, bold=False, level=None, first=False):
    """Fill one paragraph of *text_frame* and return it.

    With ``first=True`` the frame's existing first paragraph is reused,
    otherwise a new paragraph is appended. ``level`` and ``bold`` are only
    touched when requested.
    """
    paragraph = text_frame.paragraphs[0] if first else text_frame.add_paragraph()
    paragraph.text = text
    if level is not None:
        paragraph.level = level
    if bold:
        paragraph.font.bold = True
    paragraph.font.size = Pt(size)
    return paragraph


# Slides 1 and 2: the dimensional-reduction plots by cluster and by sample.
for overview_path in ("./graphs/dr.png", "./graphs/dr_sample.png"):
    slide = prs.slides.add_slide(black_slide_layout)
    slide.shapes.add_picture(overview_path, Inches(.1), Inches(.5), height=Inches(5.7))

# One slide per cluster: dr plot, tally plot and a text skeleton to fill in.
for cluster in range(clusters):
    slide = prs.slides.add_slide(black_slide_layout)
    slide.shapes.add_picture(
        "./graphs/cluster_dr_{cluster}.png".format(cluster=cluster),
        Inches(7), Inches(.7), height=Inches(6.7),
    )
    slide.shapes.add_picture(
        "./graphs/tally_{cluster}.png".format(cluster=cluster),
        Inches(2.2), Inches(0), height=Inches(.77),
    )

    # annotation skeleton in the top-left corner
    tf = slide.shapes.add_textbox(Inches(0), Inches(0), Inches(6), Inches(3)).text_frame
    tf.clear()
    _write_paragraph(tf, "Cluster {cluster}: ...".format(cluster=int(cluster)), 24,
                     bold=True, first=True)
    _write_paragraph(tf, "Defining markers:", 12)
    _write_paragraph(tf, "...", 10, level=1)
    _write_paragraph(tf, "Indentity: ...", 12, bold=True)
    _write_paragraph(tf, "Justification: ...", 12, bold=True)

    # previous annotation of this cluster, one "; "-separated entry per line
    if previousAnnotation:
        tf = slide.shapes.add_textbox(Inches(4), Inches(1), Inches(2), Inches(3)).text_frame
        tf.clear()
        p = tf.paragraphs[0]
        p.text = "\n".join(previousAnnData.loc[cluster].values[0].split("; "))
        p.font.size = Pt(12)

prs.save('annotation_template.pptx')

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,106 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 7 11:42:51 2018
@author: doru
"""
import sys
# Positional command-line arguments:
#   1: path of the HTML file to write
#   2: path of the CSV file with coordinates, categories, colours and expression
#   3: number of category columns in that CSV
args = sys.argv
save_to, expression_data_fname = args[1], args[2]
no_of_categories = int(args[3])
import pandas as pd
import numpy as np
# Load the input table. The rest of the script reads it by column position:
# columns 0-1 are the 2D coordinates, the next `no_of_categories` columns are
# category labels, the following `no_of_categories` columns are the matching
# hex colours, and every remaining column is one gene's expression.
data = pd.read_csv(expression_data_fname, index_col = None)
# convert colours to r, g, b values, then to floats in [0, 1]
def hexdec_to_1floats(hexdec):
    """Convert a hex colour string to an array of three floats in [0, 1].

    Parameters
    ----------
    hexdec : str
        Colour as ``"#RRGGBB"``. The leading ``#`` is optional, and any
        characters after the first six hex digits (e.g. an alpha channel)
        are ignored.

    Returns
    -------
    numpy.ndarray
        ``[r, g, b]``, each channel divided by 255.

    Raises
    ------
    ValueError
        If fewer than six hex digits are supplied or they are not valid hex.
    """
    # Drop the "#" only when it is really there; unconditionally skipping the
    # first character silently corrupts colours written without the prefix.
    digits = hexdec[1:] if hexdec[:1] == "#" else hexdec
    if len(digits) < 6:
        raise ValueError("expected a colour like '#RRGGBB', got {!r}".format(hexdec))
    return np.array([int(digits[i:(i + 2)], 16) for i in (0, 2, 4)]) / 255.0
# Gene columns start after the 2 coordinate columns and the 2 * no_of_categories
# label/colour columns.
first_gene_col = 2 + 2 * no_of_categories
gene_names = list(data.columns[first_gene_col:])
raw_expression = data.values[:, first_gene_col:]
# Per-gene maximum: one value per column, because it is looked up by gene index
# below. (It used to be taken along axis 1, i.e. per cell, so the value emitted
# for a gene was really the maximum of an unrelated cell. A division of
# raw_expression by that maximum whose result was thrown away has been removed.)
max_expression = raw_expression.max(axis=0)
gene_options = []
gene_expression_colour_coded = []
max_expression_string = []
for index, gene_name in enumerate(gene_names):
    # every value is truncated to at most 4 characters to keep the page small
    gene_expression = ",".join(str(value)[:4] for value in raw_expression[:, index])
    gene_expression_colour_coded.append(
        "gene_expression['{gn}'] = [{ge}]".format(gn=gene_name, ge=gene_expression)
    )
    gene_options.append("<option value='{gn}'>{gn}</option>".format(gn=gene_name))
    max_expression_string.append(
        "max_expression['{gene}'] = {val}".format(gene=gene_name, val=max_expression[index])
    )
# collapse the pieces into the strings that are pasted into the template
gene_options = "".join(gene_options)
gene_expression_colour_coded = ";".join(gene_expression_colour_coded)
max_expression_string = ";".join(max_expression_string)
# Build the coordinates data string.
coordinates = data.values[:, 0:2].astype('float32')
# Centre the cloud on the midpoint of the (1st, 98th) percentile range of each
# axis, with that range widened by a factor of 1.2 ...
Xrange = np.percentile(coordinates[:, 0], q=[1, 98]) * 1.2
Yrange = np.percentile(coordinates[:, 1], q=[1, 98]) * 1.2
center = np.array((np.mean(Xrange), np.mean(Yrange)))
coordinates = coordinates - center
# ... then rescale so that the largest widened percentile bound has magnitude 1.
x_extent = np.abs(np.percentile(coordinates[:, 0], q=[1, 98]) * 1.2).max()
y_extent = np.abs(np.percentile(coordinates[:, 1], q=[1, 98]) * 1.2).max()
ratio = 1.0 / max(x_extent, y_extent)
coordinates = coordinates * ratio
# flatten row by row; every number is truncated to at most 6 characters
coordinates = ",".join(str(value)[:6] for value in coordinates.ravel())
# Category names are the headers of the label columns, with "." shown as " ".
categories = [str(value).replace(".", " ") for value in data.columns[2:(2 + no_of_categories)]]
categories_options = "".join(
    "<option value='{cat}'>{cat}</option>".format(cat=cat) for cat in categories
)
categories_colours = []
categories_indices = []
for cat_index in range(no_of_categories):
    label_col = 2 + cat_index
    colour_col = label_col + no_of_categories
    category_name = data.columns[label_col].replace(".", " ")

    # per-cell colours: r,g,b floats, each truncated to at most 4 characters
    category_colours = ",".join(
        ",".join(str(value)[:4] for value in hexdec_to_1floats(colour))
        for colour in data.values[:, colour_col]
    )
    categories_colours.append(
        "categories_colours['{cn}'] = [{cc}]".format(cn=category_name, cc=category_colours)
    )

    # for every distinct label of this category, the row indices of its cells
    labels = data.values[:, label_col]
    categories_indices.append("categories_indices['{cn}'] = []".format(cn=category_name))
    cat_indices = []
    for t_name in np.unique(labels):
        indices = ",".join(str(value) for value in np.where(labels == t_name)[0])
        cat_indices.append(
            "categories_indices['{cn}']['{tn}'] = [{ind}]".format(
                cn=category_name, tn=t_name, ind=indices
            )
        )
    categories_indices.append("\n".join(cat_indices))
categories_indices = "\n".join(categories_indices)
categories_colours = "\n".join(categories_colours)
# Gene families: the file is pasted into the page verbatim, and the family
# names (the quoted text left of each "=") become the options of a drop-down.
with open("./gene_families.txt", "r") as gene_families_file:
    gene_families = gene_families_file.read()
family_names = [
    line.split("=")[0].split("\'")[1]
    for line in gene_families.split("\n")
    if line != ""
]
geneFams = "".join(
    "<option value='{cat}'>{cat}</option>".format(cat=cat) for cat in family_names
)

# Fill the placeholders of the HTML template, in this order, and save the page.
with open('template.html', "r") as f:
    template_str = f.read()
substitutions = (
    ('gene_options_here', gene_options),
    ('gene_expression_colour_coded', gene_expression_colour_coded),
    ('coordinates_data_here', coordinates),
    ('category_options_here', categories_options),
    ('categories_colours_data_here', categories_colours),
    ('categories_indices_data_here', categories_indices),
    ('gene_families_options_here', gene_families),
    ('feature_family_option_here', geneFams),
    ('max_expression_here', max_expression_string),
)
for placeholder, payload in substitutions:
    template_str = template_str.replace(placeholder, payload)
with open(save_to, 'w') as result:
    result.write(template_str)

View file

@ -0,0 +1,339 @@
# Parse the command line.
# All trailing words are glued back together and then cut at ";", so that each
# element of `args` is one R assignment such as "clustering.res = 30".
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))
# Template mapping each positional argument to a "<name>.arg" holder variable.
arguments.list = "
seurat.addr.arg = args[1]
clustering.res.arg = args[2]
DE.downsample.arg = args[3]
sample.arg = args[4]
"
expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]
if(length(args) != length(expected_arguments)){
  # Report how many parameters are needed and list their names.
  # (The previous sprintf() call had a %s placeholder but no value for it, so
  # the script died with a formatting error instead of this usage message.)
  error.msg = sprintf('This pipeline requires %s parameters: ', as.character(length(expected_arguments)))
  expected_names = paste(unlist(lapply(strsplit(expected_arguments, ".arg", fixed = TRUE), "[", 1)), collapse = "\n")
  stop(paste0(error.msg, "\n", expected_names))
}
# Creates the "<name>.arg" variables, each holding the raw assignment text.
eval(parse(text = arguments.list))
for(n in seq_along(expected_arguments)){
  argument = expected_arguments[n]
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  # strip only the literal trailing ".arg" suffix
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  # NOTE(review): this evaluates text taken from the command line as R code;
  # only run the pipeline with trusted arguments.
  eval(parse(text = argument.content))
  # inherits = FALSE: the variable must have been created by the line above.
  # Without it the check can never fail for `sample`, because exists() would
  # find the base function of the same name.
  if (!exists(variable.name, inherits = FALSE)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# Create the folders used for output and intermediate ("material") files.
# Folder name = <working dir name without numeric prefix>_<seurat.addr>_<time stamp>
run.label = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
run.label = paste(run.label, seurat.addr, sep = "_")
# Turn the current time into a compact stamp by deleting separators one by one,
# then dropping the first two characters.
c.time = Sys.time()
for (separator in c(" BST", ":", " ", "-")){
  c.time = gsub(pattern=separator, replacement="", x=c.time)
}
c.time = substr(x=c.time, start=3, stop=nchar(c.time))
output_folder = file.path("../../output", paste(run.label, c.time, sep = "_"))
dir.create(output_folder)
output_folder_material = file.path(output_folder, "material")
dir.create(output_folder_material)
# From here on seurat.addr is the path of the input file inside the data folder.
seurat.addr = file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library(plyr)
library(dplyr)
library(reshape2)
library(RColorBrewer)
library(gridExtra)
library(grid)
library(BiocParallel)
# Scatter plot of a 2D embedding coloured by (numeric) cluster label.
# Draws density contours and marks the median position of every cluster with
# a labelled disc.
#   point.labels       - cluster label of every cell (convertible to numbers)
#   dr1, dr2           - coordinates of every cell
#   dr1.name, dr2.name - axis titles
#   no.legend          - hide the legend when TRUE
#   plt.lb.sz          - size of the median discs
#   txt.lb.size        - size of the text drawn on the discs
#   pt.size            - size of the cell points
#   random_state       - seed used for shuffling the colour palette
# Returns the ggplot object.
dr.plot.indexed.clusters <- function(point.labels, dr1, dr2, dr1.name, dr2.name, no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2){
  cells <- data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2)
  # distinct cluster ids in numeric order
  cluster.ids <- as.character(sort(as.numeric(unique(as.vector(point.labels)))))
  # median position of every cluster
  centres <- aggregate(cells[, 2:3], list(cells$Cell.Labels), median)
  # one colour per cluster, shuffled reproducibly
  set.seed(random_state)
  cluster.colours <- sample(colorRampPalette(brewer.pal(12, "Paired"))(length(cluster.ids)))
  legend.layer <- if (no.legend){
    theme(legend.position="none")
  }else{
    guides(color = guide_legend(override.aes = list(size=5)))
  }
  plot.obj <- ggplot(data = cells, aes(x = DR1, y = DR2, color = Cell.Labels)) +
    geom_point(size = pt.size) +
    scale_color_manual(values=cluster.colours) +
    stat_density2d(geom="density2d", aes(x=DR1, y=DR2,alpha=5/10), size=.2, contour=TRUE,bins=7,h=1.5) +
    geom_point(data=centres,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = cluster.colours, alpha = .5, pch = 21) +
    annotate("text", x=centres$DR1, y = centres$DR2, label = as.vector(centres$Group.1), size = txt.lb.size) +
    legend.layer +
    xlab(dr1.name) + ylab(dr2.name)
  return(plot.obj)
}
# Scatter plot of a 2D embedding coloured by an arbitrary cell label.
# Labels are sorted and shown in the legend as "<index> <label>"; the median
# position of every label is marked with a gray disc carrying an index.
#   point.labels       - label of every cell
#   dr1, dr2           - coordinates of every cell
#   dr1.name, dr2.name - axis titles
#   no.legend          - hide the legend when TRUE
#   plt.lb.sz          - size of the median discs
#   txt.lb.size        - size of the text drawn on the discs
#   pt.size            - size of the cell points
#   random_state       - seed for the shuffled palette (only when use.cols is logical)
#   use.cols           - logical: generate a palette; otherwise the colours to use
#   index.map          - text drawn on the discs when use.cols supplies colours
# Returns the ggplot object.
dr.plot <- function(point.labels, dr1, dr2, dr1.name, dr2.name, no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2, use.cols = FALSE, index.map = c()){
  cells <- data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2)
  label.set <- sort(unique(as.vector(point.labels)))
  cells$Cell.Labels <- factor(cells$Cell.Labels, levels=label.set)
  # medians are computed before the labels get their index prefix
  centres <- aggregate(cells[, 2:3], list(cells$Cell.Labels), median)
  indexed.labels <- paste(1:length(label.set), label.set, sep = " ")
  cells$Cell.Labels <- mapvalues(x = cells$Cell.Labels, from = label.set, to = indexed.labels)
  if(is.logical(use.cols)){
    # no colours supplied: shuffled palette and running indices
    set.seed(random_state)
    label.colours <- sample(colorRampPalette(brewer.pal(12, "Paired"))(length(label.set)))
    index.map <- 1:length(label.set)
  }else{
    label.colours <- use.cols
  }
  legend.layer <- if (no.legend){
    theme(legend.position="none")
  }else{
    guides(color = guide_legend(override.aes = list(size=5)))
  }
  plot.obj <- ggplot(data = cells, aes(x = DR1, y = DR2, color = Cell.Labels)) +
    geom_point(size = pt.size) +
    scale_color_manual(values=label.colours) +
    geom_point(data=centres,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = "gray", alpha = .5, pch = 21) +
    annotate("text", x=centres$DR1, y = centres$DR2, label = index.map, size = txt.lb.size) +
    legend.layer +
    xlab(dr1.name) + ylab(dr2.name)
  return(plot.obj)
}
# Plot a 2-D embedding with a single group highlighted.
#
# Points whose label equals group.name are drawn in dark teal, all other
# points in pale blue. The legend is hidden and group.name is the title.
#
# Args:
#   point.labels: label of each point.
#   dr1, dr2: coordinates of each point.
#   dr1.name, dr2.name: axis titles.
#   group.name: the label to highlight.
#   pt.size: size of the data points.
#
# Returns:
#   The ggplot object.
dr.plot.group <- function(point.labels, dr1, dr2, dr1.name, dr2.name, group.name, pt.size = .4){
  label.levels <- sort(unique(as.vector(point.labels)))
  embedding <- data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2)
  embedding$Cell.Labels <- factor(embedding$Cell.Labels, levels = label.levels)
  # one colour per level: background everywhere, accent on the chosen group
  group.colours <- rep("#bae1ff", length(label.levels))
  group.colours[which(label.levels == group.name)] <- "#0D7D75"
  highlight.plot <- ggplot(data = embedding, aes(x = DR1, y = DR2, color = Cell.Labels)) +
    geom_point(size = pt.size) +
    scale_color_manual(values = group.colours) +
    theme(legend.position = "none") +
    xlab(dr1.name) + ylab(dr2.name) + ggtitle(group.name)
  return(highlight.plot)
}
# Build, print and save one contingency table per cluster.
#
# For every level of seurat.obj@ident the cells of that cluster are
# cross-tabulated by two meta.data columns; row and column totals are appended
# and the table is rendered with grid.table() to "tally_<cluster>.pdf" or
# "tally_<cluster>.png" inside save.at.
#
# Args:
#   seurat.obj: Seurat object (the @ident, @data and @meta.data slots are read).
#   slot1, slot2: names of the meta.data columns used for rows and columns.
#   save.at: directory receiving the image files.
#   width, height: passed unchanged to pdf() or png().
#   saveas.pdf: TRUE writes PDF files, FALSE writes PNG files.
tabulate.seurat.by.cluster <- function(seurat.obj, slot1, slot2, save.at, width, height, saveas.pdf = F){
  all.cells <- colnames(seurat.obj@data)
  for (cluster in levels(seurat.obj@ident)){
    cells.cluster <- all.cells[seurat.obj@ident == cluster]
    cells.indices <- match(cells.cluster, all.cells)
    # index the columns directly instead of assembling code from the caller's
    # variable name and evaluating it
    tally <- table(seurat.obj@meta.data[[slot1]][cells.indices],
                   seurat.obj@meta.data[[slot2]][cells.indices])
    tally.rez <- cbind(tally, rowSums(tally))
    colnames(tally.rez)[ncol(tally.rez)] <- sprintf("Total by %s", slot1)
    tally.rez <- rbind(tally.rez, c(colSums(tally), length(cells.cluster)))
    rownames(tally.rez)[nrow(tally.rez)] <- sprintf("Total by %s", slot2)
    print(tally.rez)
    extension <- if (saveas.pdf) ".pdf" else ".png"
    filename <- file.path(save.at, paste("tally_", cluster, extension, sep = ""))
    if (saveas.pdf){
      pdf(filename, width = width, height = height)
    } else {
      png(filename, width = width, height = height)
    }
    # always close the device, even when drawing the table fails
    tryCatch(grid.table(tally.rez), finally = dev.off())
  }
}
# Differential expression of one cluster against all remaining cells.
#
# Reads the globals DE.downsample, seurat.obj and seurat.obj_d: the
# downsampled object is used when DE.downsample is TRUE, the full object
# otherwise.
#
# Args:
#   markers.for: identity (cluster) passed to FindMarkers as ident.1.
#
# Returns:
#   The FindMarkers result with two extra columns: `cluster` (markers.for)
#   and `gene` (the row names).
FindMarker.wrapper <- function(markers.for){
  source.obj <- if (DE.downsample) seurat.obj_d else seurat.obj
  de.table <- FindMarkers(source.obj, ident.1 = markers.for, only.pos = F, min.pct = 0.25,
                          genes.use = rownames(source.obj@data), thresh.use = 0.25,
                          test.use = "wilcox", random.seed = 42, print.bar = T, do.print = T)
  de.table$cluster <- markers.for
  de.table$gene <- rownames(de.table)
  return(de.table)
}
# load data
print("loading data ... ")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")
# process the data: only normalisation and clustering are run here
print("Normalizing data ... ")
seurat.obj <- NormalizeData(object = seurat.obj, normalization.method = "LogNormalize", scale.factor = 10000)
# NOTE(review): the message below is printed but no variable-gene step follows
# in this script - presumably the object already carries that result; confirm.
print("Computing variable genes ... ")
# find all clusters
# NOTE(review): clustering uses the "pca" reduction, which is not computed in
# this part of the script - it is assumed to be stored in the loaded object.
print("Clustering data ... ")
seurat.obj <- FindClusters(object = seurat.obj, reduction.type = "pca",
                           dims.use = 1:20, resolution = clustering.res, save.SNN = T, algorithm=1)
# report the count once (a nested print() used to emit it twice)
print(paste("Number of clusters: ", length(levels(seurat.obj@ident)), sep = ""))
# keep a copy of the cluster assignment in the meta data
seurat.obj@meta.data$LouvainClustering = as.vector(seurat.obj@ident)
print(table(seurat.obj@meta.data$LouvainClustering))
# the clustered object overwrites the input file
print("Saving Seurat object")
saveRDS(seurat.obj, seurat.addr)
print('Seurat object saved')
# writing marker genes to disk
if (DE.downsample){
  # cap every cluster at 300 randomly chosen cells before testing
  cluster.ids <- unique(as.vector(seurat.obj@ident))
  cells.to.keep <- c()
  for (cluster.id in cluster.ids){
    cell.ids <- names(seurat.obj@ident)[seurat.obj@ident == cluster.id]
    cell.ids <- which(names(seurat.obj@ident) %in% cell.ids)
    # sample positions, not values: sample(x) with a single number x draws from
    # 1:x, which picked an unrelated cell for clusters holding exactly one cell
    picked <- cell.ids[sample.int(length(cell.ids), size = min(300, length(cell.ids)), replace = F)]
    cells.to.keep <- c(picked, cells.to.keep)
  }
  seurat.obj_d <- SubsetData(object=seurat.obj, cells.use=names(seurat.obj@ident)[cells.to.keep])
  seurat.obj_d <- NormalizeData(object = seurat.obj_d, normalization.method = "LogNormalize", scale.factor = 10000)
  clusters.for.de <- sort(as.vector(unique(seurat.obj_d@meta.data$LouvainClustering)))
}else{
  clusters.for.de <- sort(as.vector(unique(seurat.obj@meta.data$LouvainClustering)))
}
print("Calculating marker genes: finished subseting, currently actually calculating the markers ... ")
# one differential-expression run per cluster, spread over 5 worker processes
Markers <- bplapply(clusters.for.de, FindMarker.wrapper, BPPARAM=MulticoreParam(5))
# stack the per-cluster tables into one data frame
marker.genes = Reduce(f=rbind, x=Markers)
print("Saving marker genes ... ")
write.csv(marker.genes, file.path(output_folder, "all_markers.csv"))
# Start building the annotation table: the top markers of every cluster,
# decorated with descriptive columns taken from two lookup files that are read
# from the current working directory.
print('Creating and saving to disk annotation marker genes')
# gene description table; rows are made addressable by gene symbol
gene_db = read.csv('./gene_info.csv')
rownames(gene_db) = as.vector(gene_db$gene.symbol)
# within each cluster keep the markers ranked in the top 50 by avg_logFC
marker.genes.top = marker.genes %>% group_by(cluster) %>% top_n(50, avg_logFC)
# headerless, tab-separated table; its two columns are named Gene and Population
gene_to_pop = read.csv("./gene_to_pop.tsv", sep = '\t', header = F)
colnames(gene_to_pop) = c('Gene', 'Population')
marker.genes.unique = unique(as.vector(marker.genes.top$gene))
# row lookup by symbol; a symbol that is absent from gene_db yields a row of NA
gene_info = gene_db[marker.genes.unique, ]
# get gene name
gene.name = mapvalues(x=as.vector(marker.genes.top$gene), from=as.vector(gene_info$gene.symbol),
to=as.vector(gene_info$gene.name))
marker.genes.top = cbind(as.data.frame(marker.genes.top), data.frame(GeneName = gene.name))
# get also present in
# Lists the clusters in which a gene is a top marker.
# Reads the global marker.genes.top (columns `gene` and `cluster`).
# Returns the clusters joined by ', ' when the gene occurs in more than one
# row of that table, otherwise an empty string.
also_present_in = function(gene.sym){
  hits = as.vector(marker.genes.top$cluster[as.vector(marker.genes.top$gene) == gene.sym])
  if (length(hits) <= 1){
    return('')
  }
  return(paste(hits, collapse = ', '))
}
# for every row: the clusters sharing that marker gene ('' when it is unique)
present_in = unlist(lapply(as.list(marker.genes.unique), also_present_in))
present_in = mapvalues(x=as.vector(marker.genes.top$gene), from=as.vector(marker.genes.unique), to=as.vector(present_in))
marker.genes.top = cbind(marker.genes.top, data.frame(AlsoPresentInClusters = present_in))
# get cell type flag
# Exactly one value per row: the populations of a gene are joined with ', '.
# Appending them one by one produced a vector longer than the table (and a
# failing cbind) as soon as a gene was listed for several populations.
known.genes = as.vector(gene_to_pop$Gene)
known.pops = as.vector(gene_to_pop$Population)
cell_type_flag = vapply(as.vector(marker.genes.top$gene), function(gene.sym){
  paste(known.pops[known.genes == gene.sym], collapse = ', ')
}, character(1), USE.NAMES = FALSE)
marker.genes.top = cbind(marker.genes.top, data.frame(CellTypeFlag = cell_type_flag))
# get gene summary
gene.summary = mapvalues(x=as.vector(marker.genes.top$gene), from=as.vector(gene_info$gene.symbol),
                         to=as.vector(gene_info$gene.summary))
marker.genes.top = cbind(as.data.frame(marker.genes.top), data.frame(Summary = gene.summary))
# get Reactome pathway
reactome.pathway = mapvalues(x=as.vector(marker.genes.top$gene), from=as.vector(gene_info$gene.symbol),
                             to=as.vector(gene_info$reactome.pathway))
marker.genes.top = cbind(as.data.frame(marker.genes.top), data.frame(Reactom = reactome.pathway))
# get gene family
gene.family = mapvalues(x=as.vector(marker.genes.top$gene), from=as.vector(gene_info$gene.symbol),
                        to=as.vector(gene_info$gene.family))
marker.genes.top = cbind(as.data.frame(marker.genes.top), data.frame(GeneFamily = gene.family))
write.csv(marker.genes.top, file.path(output_folder, "annotation_markers.csv"))
# Template in which each cluster gets an identity; every entry starts as "None".
update.template <- data.frame(Cluster = sort(as.vector(unique(seurat.obj@ident))), Identity = rep("None", length(unique(seurat.obj@ident))))
write.csv(update.template, file.path(output_folder, "update_template.csv"), row.names = F)
print("compiling the template")
# Per-cell table used by all plots below: cluster, tSNE/UMAP coordinates, sample.
df <- data.frame(CellNames = names(seurat.obj@ident),
ClusterIndex = as.vector(seurat.obj@ident),
tSNEx = seurat.obj@dr$tsne@cell.embeddings[, 1],
tSNEy = seurat.obj@dr$tsne@cell.embeddings[, 2],
UMAPx = seurat.obj@dr$umap@cell.embeddings[, 1],
UMAPy = seurat.obj@dr$umap@cell.embeddings[, 2],
# NOTE(review): `sample` is unquoted. Unless a variable of that name is defined
# earlier in the file this is base::sample (a function) and the indexing fails;
# a column name such as "sample" was probably intended - confirm.
Sample = seurat.obj@meta.data[, sample])
# transfer the compile.py file to the output
file.copy(from='compile_template.py', to=file.path(output_folder, 'compile_template.py'))
# Work inside the output folder from here on; the previous working directory is
# restored at the very end of the script.
CURDIR = getwd()
setwd(output_folder)
dir.create("graphs")
# make tsne and umap plots by clusters
plot.tsne <- dr.plot.indexed.clusters(point.labels=df$ClusterIndex, dr1=df$tSNEx, dr2=df$tSNEy, dr1.name="tSNE-x", dr2.name="tSNE-y", no.legend = T, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2)
plot.umap <- dr.plot.indexed.clusters(point.labels=df$ClusterIndex, dr1=df$UMAPx, dr2=df$UMAPy, dr1.name="UMAP-x", dr2.name="UMAP-y", no.legend = T, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2)
png("./graphs/dr.png", width = 1200, height = 700)
# NOTE(review): not wrapped in print(); this relies on top-level auto-printing
# to draw into the open device (the loop further down prints explicitly).
plot_grid(plot.tsne, plot.umap)
dev.off()
# make tsne and umap plots by sample
plot.tsne <- dr.plot(point.labels=df$Sample, dr1=df$tSNEx, dr2=df$tSNEy, dr1.name="tSNE-x", dr2.name="tSNE-y", no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = 1)
plot.umap <- dr.plot(point.labels=df$Sample, dr1=df$UMAPx, dr2=df$UMAPy, dr1.name="UMAP-x", dr2.name="UMAP-y", no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = 1)
png("./graphs/dr_sample.png", width = 1200, height = 700)
plot_grid(plot.tsne, plot.umap)
dev.off()
# create cell tally plots, tsne plots and umap plots
no.clusters <- length(levels(seurat.obj@ident))
# one image per cluster: tSNE on top of UMAP with that cluster highlighted
for (i in 1:no.clusters){
cluster.name <- levels(seurat.obj@ident)[i]
plot.tsne <- dr.plot.group(point.labels=df$ClusterIndex, dr1=df$tSNEx, dr2=df$tSNEy, dr1.name="tSNE1", dr2.name="tSNE2", group.name=cluster.name, pt.size = .4)
plot.umap <- dr.plot.group(point.labels=df$ClusterIndex, dr1=df$UMAPx, dr2=df$UMAPy, dr1.name="UMAP1", dr2.name="UMAP2", group.name=cluster.name, pt.size = .4)
graph.addr <- paste(paste("cluster_dr_", cluster.name, sep = ""), ".png", sep = "")
graph.addr <- file.path("graphs", graph.addr)
png(graph.addr, width = 400, height = 900)
print(plot_grid(plot.tsne, plot.umap, nrow = 2))
dev.off()
print(cluster.name)
}
# plot cell numbers by sample and gate for all the clusters ("sort.ids", "fetal.ids")
# (the same column, "tissue", is used for both table dimensions)
tabulate.seurat.by.cluster(seurat.obj, "tissue", "tissue", save.at="./graphs", width=1110, height=110, saveas.pdf = F)
# compile template annotation powerpoint
# runs the copied compile_template.py with the interpreter given by python.addr
system(paste(python.addr, "compile_template.py", sep = " "), wait = T)
setwd(CURDIR)
print('Finished.')

View file

@ -0,0 +1,17 @@
#!/bin/bash
#$ -cwd
#$ -N make_annotation_template
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=200G
#$ -pe smp 6
# Grid Engine wrapper: forwards its single argument to the R script.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters"
    exit 1
fi
# Quoted so the argument reaches R as one string: unquoted, the shell would
# split it on whitespace and expand any glob characters it contains.
Rscript make_annotation_template.R "$1"
echo "End on $(date)"

View file

@ -0,0 +1,577 @@
<!doctype html>
<html lang='en'>
<head>
<meta charset='utf-8'>
<title>3D viewer</title>
<meta name='description' content='The HTML5 Herald'>
<meta name='author' content='Dorin-Mirel Popescu'>
</head>
<body>
<table>
<tr>
<td align='left'>
<form>
  <fieldset>
    <legend><b>Visualisation options</b></legend>
    <!-- a label's "for" has to match the id (not the name) of its control,
         so every control carries an id and the two radios get distinct ones -->
    <label for='particleSizeBar'>Particle size: </label>
    <input type='range' id='particleSizeBar' name='particleSizeBar' min=1 max=14 step=0.1 oninput='setParticleSize(value)' value=2 /><br />
    <label for='alphaInput'>Transparency: </label>
    <input type='range' id='alphaInput' name='alphaInput' min=0 max=1000 oninput='setAlpha(value)' value=1000 /><br />
    <label for='canvasSizeInput'>Canvas size: </label>
    <input type='range' id='canvasSizeInput' name='canvasSizeInput' min=200 max=2000 oninput='setCanvasSize(value)' value=500 /><br />
    <label for='bgInputDark'>Dark background: </label>
    <input type='radio' id='bgInputDark' name='bgInput' oninput='setBackground(value)' value='dark' />
    <label for='bgInputWhite'>White background: </label>
    <input type='radio' id='bgInputWhite' name='bgInput' oninput='setBackground(value)' value='white' checked />
    <br />
  </fieldset>
</form>
</td>
<td style='vertical-align: top' rowspan='2'>
<form>
  <fieldset>
    <legend><b>Colour by:</b></legend>
    <!-- the radio labels used to point at a non-existent id, which detaches a
         label from the control it wraps; each radio now has its own id -->
    <table>
      <tr>
        <td>
          Choose gene family:
        </td>
        <td>
          <label for='familyGeneSelector'><select name='familyGeneSelector' id='familyGeneSelector' onchange='selectFeatureFamily()'>feature_family_option_here</select></label>
        </td>
      </tr>
      <tr>
        <td>
          <label for='colourTypeGene'><input type='radio' id='colourTypeGene' name='colourType' onchange='setColourBy(value)' value='gene_expression' />Gene expression: </label>
        </td>
        <td>
          <label for='geneSelector'><select name='geneSelector' id='geneSelector' onchange='selectFeature()'>gene_options_here</select></label>
        </td>
      </tr>
      <tr>
        <td colspan='2' align='center'>
          <canvas id='canvasColorScale' width=200 height=40></canvas>
        </td>
      </tr>
      <tr>
        <td>
          <label for='colourTypeCategory'><input type='radio' id='colourTypeCategory' name='colourType' checked onchange='setColourBy(value)' value='category' />Category:</label>
        </td>
        <td>
          <label for='categorySelector'><select name='categorySelector' id='categorySelector' onchange='setCategory()'>category_options_here</select></label>
        </td>
      </tr>
    </table>
  </fieldset>
</form>
<br />
<div>
<fieldset>
<legend><b>Cell types:</b></legend>
<label for='toggleRadio'><input type='checkbox' name = 'toggleRadio' id='toggleRadio' onchange='toggleAllTypes()' checked />Show all:</label>
<form id = 'typesControlPanel'>
</form>
</fieldset>
</div>
</td>
</tr>
<tr>
<td style='vertical-align: text-top' >
<canvas id='canvas' width=600 height=600></canvas>
</td>
</tr>
</table>
<script id='vertex-shader' type='x-shader/x-fragment'>
// Vertex shader: places one point per vertex and chooses its colour.
// a_Position: vertex position, passed through unchanged.
// a_Color: per-vertex RGB; in feature-scale mode only the red channel is read.
// u_basePointSize: point size applied to every vertex.
// u_Alpha: opacity applied to every vertex.
// u_PaintFeatureScale: 0 = use a_Color as is, otherwise map a_Color.r to a ramp.
attribute vec4 a_Position;
attribute vec3 a_Color;
uniform float u_basePointSize;
uniform float u_Alpha;
uniform int u_PaintFeatureScale;
varying vec4 v_Color;
void main() {
gl_Position = a_Position;
gl_PointSize = u_basePointSize;
if (u_PaintFeatureScale == 0){
v_Color = vec4(a_Color, u_Alpha);
}
else{
// Three-colour ramp driven by a_Color.r: blue at 0.0, green at 0.5, red at 1.0.
float r = 0.0;
float g = 0.0;
float b = 0.0;
// red rises linearly over the upper half of the range
r = max(0.0, 2.0 * a_Color.r - 1.0);
// blue falls linearly over the lower half of the range
b = max(0.0, 2.0 * (1.0 - a_Color.r) - 1.0);
// green peaks in the middle and reaches zero at both ends
g = 1.0 - 2.0 * abs(a_Color.r - 0.5);
v_Color = vec4(r, g, b, u_Alpha);
}
}
</script>
<script id ='fragment-shader' type='x-shader/x-fragment'>
// Fragment shader: draws every point as a filled disc in the interpolated colour.
precision mediump float;
varying vec4 v_Color;
void main() {
float r = 0.0;
// rescale the point-sprite coordinate from [0, 1] to [-1, 1] on both axes
vec2 cxy = 2.0 * gl_PointCoord - 1.0;
// squared distance from the centre of the sprite
r = dot(cxy, cxy);
// drop fragments outside the unit circle so the square sprite looks round
if (r > 1.0){
discard;
}
gl_FragColor = v_Color;
}
</script>
<script type = 'text/javascript'>
// 4x4 matrix stored as 16 floats in `this.elements`.
// opt_src (optional): an object owning an `elements` property; its first 16
// values are copied. Without it the matrix starts as the identity.
var Matrix4 = function(opt_src) {
var i, s, d;
if (opt_src && typeof opt_src === 'object' && opt_src.hasOwnProperty('elements')) {
s = opt_src.elements;
d = new Float32Array(16);
// copy element by element so both matrices own separate storage
for (i = 0; i < 16; ++i) {
d[i] = s[i];
}
this.elements = d;
} else {
this.elements = new Float32Array([1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1]);
}
};
// Overwrite this matrix with a pure translation by (x, y, z).
// The offsets land in elements 12, 13 and 14; returns this for chaining.
Matrix4.prototype.setTranslate = function(x, y, z) {
  var target = this.elements,
      values = [1, 0, 0, 0,
                0, 1, 0, 0,
                0, 0, 1, 0,
                x, y, z, 1],
      idx;
  for (idx = 0; idx < 16; idx++) {
    target[idx] = values[idx];
  }
  return this;
};
// Overwrite this matrix with a viewing transform.
// eyeX/eyeY/eyeZ: position of the viewer.
// centerX/centerY/centerZ: point being looked at.
// upX/upY/upZ: up direction.
// Returns this. No guard exists for eye == center or for an up vector parallel
// to the viewing direction; both lead to a division by zero below.
Matrix4.prototype.setLookAt = function(eyeX, eyeY, eyeZ, centerX, centerY, centerZ, upX, upY, upZ) {
var e, fx, fy, fz, rlf, sx, sy, sz, rls, ux, uy, uz;
// f: direction from the eye towards the centre
fx = centerX - eyeX;
fy = centerY - eyeY;
fz = centerZ - eyeZ;
// Normalize f.
rlf = 1 / Math.sqrt(fx*fx + fy*fy + fz*fz);
fx *= rlf;
fy *= rlf;
fz *= rlf;
// Calculate cross product of f and up.
sx = fy * upZ - fz * upY;
sy = fz * upX - fx * upZ;
sz = fx * upY - fy * upX;
// Normalize s.
rls = 1 / Math.sqrt(sx*sx + sy*sy + sz*sz);
sx *= rls;
sy *= rls;
sz *= rls;
// Calculate cross product of s and f.
ux = sy * fz - sz * fy;
uy = sz * fx - sx * fz;
uz = sx * fy - sy * fx;
// Set to this.
// s, u and -f are written as the first three rows of the rotation part
e = this.elements;
e[0] = sx;
e[1] = ux;
e[2] = -fx;
e[3] = 0;
e[4] = sy;
e[5] = uy;
e[6] = -fy;
e[7] = 0;
e[8] = sz;
e[9] = uz;
e[10] = -fz;
e[11] = 0;
e[12] = 0;
e[13] = 0;
e[14] = 0;
e[15] = 1;
// Translate.
// moves the eye position to the origin
return this.translate(-eyeX, -eyeY, -eyeZ);
};
// Combine this matrix with a translation by (x, y, z), in place.
// Only elements 12 to 15 change; returns this for chaining.
Matrix4.prototype.translate = function(x, y, z) {
  var m = this.elements,
      row;
  for (row = 0; row < 4; row++) {
    m[12 + row] += m[row] * x + m[4 + row] * y + m[8 + row] * z;
  }
  return this;
};
// Overwrite this matrix with a perspective projection.
// fovy: vertical field of view, in degrees.
// aspect: ratio applied to the horizontal scale (ct / aspect).
// near, far: clipping distances; both must be positive and must differ.
// Throws a string when the frustum is degenerate (near === far, aspect === 0,
// a zero field of view) or when near or far is not positive.
// Returns this.
Matrix4.prototype.setPerspective = function(fovy, aspect, near, far) {
var e, rd, s, ct;
if (near === far || aspect === 0) {
throw 'null frustum';
}
if (near <= 0) {
throw 'near <= 0';
}
if (far <= 0) {
throw 'far <= 0';
}
// half of the field of view, converted from degrees to radians
fovy = Math.PI * fovy / 180 / 2;
s = Math.sin(fovy);
if (s === 0) {
throw 'null frustum';
}
// rd: reciprocal of the depth range, ct: cotangent of the half angle
rd = 1 / (far - near);
ct = Math.cos(fovy) / s;
e = this.elements;
e[0] = ct / aspect;
e[1] = 0;
e[2] = 0;
e[3] = 0;
e[4] = 0;
e[5] = ct;
e[6] = 0;
e[7] = 0;
e[8] = 0;
e[9] = 0;
e[10] = -(far + near) * rd;
e[11] = -1;
e[12] = 0;
e[13] = 0;
e[14] = -2 * near * far * rd;
e[15] = 0;
return this;
};
</script>
<script type='text/javascript'>
// Rebuild the "Cell types" panel for the category chosen in the selector.
// Sets the globals category_type and current_indices, then creates one checked
// checkbox per class of that category. Each checkbox uses the class name as
// its id (updateBuffer() reads it back) and sits on a row tinted with the
// colour of the first cell of the class.
function buildCategoryRadioButtons(){
  category_type = categorySelector.options[categorySelector.selectedIndex].value;
  current_indices = indices_all;
  typesControlPanel.innerHTML = "";
  var classes = categories_indices[category_type],
      colours = categories_colours[category_type],
      label, firstIndex, rgb, hex, c, row, box, caption;
  for (label in classes){
    firstIndex = classes[label][0];
    rgb = colours.slice(3 * firstIndex, 3 * firstIndex + 3);
    hex = "#";
    for (c = 0; c < rgb.length; c++){
      hex += Math.round(255 * rgb[c]).toString(16).padStart(2, '0');
    }
    // Nodes are created through the DOM rather than by concatenating markup,
    // so class names containing quotes, '<' or '&' cannot break the panel.
    row = document.createElement('div');
    row.style.backgroundColor = hex;
    box = document.createElement('input');
    box.type = 'checkbox';
    box.id = label;
    box.checked = true;
    box.style.cssFloat = 'left';
    box.onchange = toggleCategoryAction;
    caption = document.createElement('label');
    caption.htmlFor = label;
    caption.style.cssFloat = 'left';
    caption.textContent = label + ": ";
    row.appendChild(box);
    row.appendChild(caption);
    row.appendChild(document.createElement('br'));
    typesControlPanel.appendChild(row);
  }
}
// Change handler of the class checkboxes: refill the vertex data and repaint.
function toggleCategoryAction(){
  updateBuffer();
  draw();
}
// Change handler of the category selector: rebuild the class checkboxes,
// refill the vertex data and repaint.
function setCategory(){
  buildCategoryRadioButtons();
  updateBuffer();
  draw();
}
// Switch between colouring by category and colouring by gene expression.
// value: 'category' selects the per-cell colours, anything else the
// expression ramp computed in the vertex shader.
function setColourBy(value){
  colour_by = value;
  PaintFeatureScale = (colour_by == 'category') ? 0 : 1;
  gl_context.uniform1i(u_PaintFeatureScale, PaintFeatureScale);
  updateBuffer();
  draw();
}
// Handler of the "Show all" checkbox: give every class checkbox the same
// state, then refill the vertex data and repaint.
function toggleAllTypes(){
  // declared locally; they used to leak out as implicit globals
  var boxes = typesControlPanel.elements,
      b;
  for (b = 0; b < boxes.length; b++){
    boxes[b].checked = toggleRadio.checked;
  }
  updateBuffer();
  draw();
}
// Change handler of the gene selector: store the chosen gene in the global
// `feature`, refill and repaint, then redraw the legend with that gene's
// maximum.
function selectFeature(){
  feature = geneSelector.value;
  updateBuffer();
  draw();
  var legendMax = max_expression[feature];
  drawScale(legendMax);
  console.log('selected features');
}
// Clear the canvas to the current background colour, upload the packed
// vertex data and draw the n points.
function draw(){
  // white for "white", black for any other value
  var shade = (bg_color == "white") ? 1 : 0;
  gl_context.clearColor(shade, shade, shade, 1);
  gl_context.clear(gl_context.COLOR_BUFFER_BIT);
  gl_context.bufferData(gl_context.ARRAY_BUFFER, buffer_data_array, gl_context.STATIC_DRAW);
  gl_context.drawArrays(gl_context.POINTS, 0, n);
}
// Rebuild the packed vertex data for the cells of every ticked class.
// Writes the globals current_indices, buffer_data_array and n. Each cell takes
// five floats: x, y and three colour values. In gene-expression mode the three
// colour slots all carry the expression value of the selected gene.
function updateBuffer(){
  // locals only: the loop counters used to leak out as implicit globals
  var boxes = typesControlPanel.elements,
      selected = [],
      b, k, cell, base, level;
  // collect the cell indices of every ticked class, in panel order
  for (b = 0; b < boxes.length; b++){
    if (boxes[b].checked){
      selected = selected.concat(categories_indices[category_type][boxes[b].id]);
    }
  }
  current_indices = selected;
  // the final size is known, so fill a typed array directly
  var packed = new Float32Array(5 * selected.length);
  if (colour_by == 'gene_expression'){
    var expression = gene_expression[feature];
    for (k = 0; k < selected.length; k++){
      cell = selected[k];
      base = 5 * k;
      level = expression[cell];
      packed[base] = coordinates_data[2 * cell];
      packed[base + 1] = coordinates_data[2 * cell + 1];
      packed[base + 2] = level;
      packed[base + 3] = level;
      packed[base + 4] = level;
    }
  } else {
    var colours = categories_colours[category_type];
    for (k = 0; k < selected.length; k++){
      cell = selected[k];
      base = 5 * k;
      packed[base] = coordinates_data[2 * cell];
      packed[base + 1] = coordinates_data[2 * cell + 1];
      packed[base + 2] = colours[3 * cell];
      packed[base + 3] = colours[3 * cell + 1];
      packed[base + 4] = colours[3 * cell + 2];
    }
  }
  buffer_data_array = packed;
  n = selected.length;
}
// Input handler of the particle-size slider.
// value: the slider value as a string.
function setParticleSize(value){
  // The slider moves in steps of 0.1 and the uniform is a float, so keep the
  // fractional part (parseInt used to truncate it).
  particleSize = parseFloat(value);
  gl_context.uniform1f(u_basePointSize, particleSize);
  updateBuffer();
  draw();
}
// Input handler of the transparency slider.
// value: the slider value as a string; it is divided by 1000 to obtain the
// opacity sent to the shader.
function setAlpha(value){
  var sliderPosition = parseInt(value);
  alphaValue = sliderPosition / 1000;
  gl_context.uniform1f(u_Alpha, alphaValue);
  updateBuffer();
  draw();
}
// Input handler of the canvas-size slider: resize the square canvas and repaint.
// value: the slider value as a string (side length in pixels).
function setCanvasSize(value){
  var side = parseInt(value, 10);
  canvas.width = side;
  canvas.height = side;
  // A resized canvas keeps its WebGL context, program, buffer and uniforms, so
  // only the viewport has to follow. Recompiling the shaders and creating a
  // new program and buffer on every slider event leaked GPU objects.
  gl_context.viewport(0, 0, canvas.width, canvas.height);
  draw();
}
// Handler of the background radio buttons: remember the choice and repaint.
// value: 'white' or 'dark'.
function setBackground(value){
  bg_color = value;
  draw();
}
// Compile the shader whose source is the text of a <script> element.
// gl: WebGL context.
// ID: id of the script element holding the source.
// type: gl.VERTEX_SHADER or gl.FRAGMENT_SHADER.
// Returns the shader object. A compilation failure is reported on the console
// together with the driver's log instead of passing unnoticed.
function shadersFromScriptElement(gl, ID, type){
  var shaderScript = document.getElementById(ID),
      source = '',
      node = shaderScript.firstChild;
  // concatenate the text nodes (nodeType 3) of the element
  while (node){
    if (node.nodeType == 3){
      source += node.textContent;
    }
    node = node.nextSibling;
  }
  var shader = gl.createShader(type);
  gl.shaderSource(shader, source);
  gl.compileShader(shader);
  if (!gl.getShaderParameter(shader, gl.COMPILE_STATUS)){
    console.error("Shader '" + ID + "' failed to compile: " + gl.getShaderInfoLog(shader));
  }
  return shader;
}
// Obtain a WebGL context for a canvas and install the shader program.
// canvasWidget: the <canvas> element.
// Returns the context; the linked program is also stored as gl.program.
// Throws an Error when the browser offers none of the known context names.
function getContext(canvasWidget){
  var names = ['webgl', 'experimental-webgl', 'webkit-3d', 'moz-webgl'],
      gl = null,
      i;
  // try the context names in order and stop at the first that works
  for (i = 0; i < names.length && !gl; i++){
    try{
      gl = canvasWidget.getContext(names[i]);
    }catch(e){
      gl = null;
    }
  }
  if (!gl){
    throw new Error('WebGL is not available in this browser');
  }
  var vshader = shadersFromScriptElement(gl, 'vertex-shader', gl.VERTEX_SHADER),
      fshader = shadersFromScriptElement(gl, 'fragment-shader', gl.FRAGMENT_SHADER),
      program = gl.createProgram();
  gl.attachShader(program, vshader);
  gl.attachShader(program, fshader);
  gl.linkProgram(program);
  if (!gl.getProgramParameter(program, gl.LINK_STATUS)){
    console.error('Shader program failed to link: ' + gl.getProgramInfoLog(program));
  }
  gl.useProgram(program);
  gl.program = program;
  return gl;
}
// Prepare a context for drawing: create and bind the vertex buffer, describe
// the layout of the packed data, push the current uniform values and set the
// blending and clear state.
// gl: context returned by getContext(). Returns the same context.
// Writes the globals n, u_basePointSize, u_Alpha and u_PaintFeatureScale.
function initContext(gl){
  // every vertex is five floats: x, y followed by three colour values
  var floatsPerVertex = 5;
  n = buffer_data_array.length / floatsPerVertex;
  var pointBuffer = gl.createBuffer();
  gl.bindBuffer(gl.ARRAY_BUFFER, pointBuffer);
  var floatBytes = buffer_data_array.BYTES_PER_ELEMENT,
      strideBytes = floatBytes * floatsPerVertex;
  var positionSlot = gl.getAttribLocation(gl.program, 'a_Position');
  gl.vertexAttribPointer(positionSlot, 2, gl.FLOAT, false, strideBytes, 0);
  gl.enableVertexAttribArray(positionSlot);
  // the colour values start after the two position floats
  var colourSlot = gl.getAttribLocation(gl.program, 'a_Color');
  gl.vertexAttribPointer(colourSlot, 3, gl.FLOAT, false, strideBytes, 2 * floatBytes);
  gl.enableVertexAttribArray(colourSlot);
  u_basePointSize = gl.getUniformLocation(gl.program, 'u_basePointSize');
  gl.uniform1f(u_basePointSize, particleSize);
  u_Alpha = gl.getUniformLocation(gl.program, "u_Alpha");
  gl.uniform1f(u_Alpha, alphaValue);
  u_PaintFeatureScale = gl.getUniformLocation(gl.program, 'u_PaintFeatureScale');
  gl.uniform1i(u_PaintFeatureScale, PaintFeatureScale);
  if (bg_color == "dark"){
    gl.clearColor(0, 0, 0, 1);
  } else {
    gl.clearColor(1, 1, 1, 1);
  }
  gl.disable(gl.DEPTH_TEST);
  gl.enable(gl.BLEND);
  gl.blendFunc(gl.SRC_ALPHA, gl.ONE_MINUS_SRC_ALPHA);
  gl.clear(gl.COLOR_BUFFER_BIT);
  return gl;
}
// handles on the page controls
var categorySelector = document.getElementById('categorySelector'),
    geneSelector = document.getElementById('geneSelector'),
    typesControlPanel = document.getElementById('typesControlPanel'),
    toggleRadio = document.getElementById('toggleRadio'),
    familyGeneSelector = document.getElementById("familyGeneSelector")
// rendering state; the initial values mirror the defaults of the page controls
// (particleSize used to be listed twice, as 5 and then 2; only 2 took effect)
var canvas = document.getElementById('canvas'),
    alphaValue = 1.0,
    bg_color = "white",
    n = 0,
    particleSize = 2,
    PaintFeatureScale = 0,
    currentMaxExpression = 0;
// NOTE(review): the bare identifiers in the statements below are not defined
// anywhere in this page; they look like markers that the generating script
// replaces with data before the page is used - confirm against that script,
// and keep them byte-for-byte intact.
coordinates_data = [coordinates_data_here]
gene_expression = []; gene_expression_colour_coded;
categories_colours = []
categories_colours_data_here
categories_indices = []
categories_indices_data_here
var gene_families = []
gene_families_options_here
var max_expression=[]
max_expression_here
// Change handler of the gene-family selector: fill the gene selector with the
// genes of the chosen family and display the first one.
// value: unused; kept so existing callers stay valid.
function selectFeatureFamily(value){
  var genes = gene_families[familyGeneSelector.value] || [],
      i;
  // Options are created as nodes, so gene names never pass through an HTML
  // parser. The per-gene debug logging was dropped.
  geneSelector.options.length = 0;
  for (i = 0; i < genes.length; i++){
    geneSelector.add(new Option(genes[i], genes[i]));
  }
  // an unknown or empty family leaves nothing to select
  if (genes.length > 0){
    selectFeature();
  }
}
// initialize flags
// when toggling between gene expression and category, do not slice data i.e. do not recompute index data
// when choosing a category always re-initiate index data
var colour_by = 'category', // the other option is 'gene_expression'
    category_types = [],
    category_type = '',
    features = [],
    feature = '';
// A dedicated loop variable is used below: looping with the global `name`
// overwrites window.name, a browser property that outlives the page.
var key;
// set category
for(key in categories_colours){category_types.push(key)}
category_type = category_types[0]
// set feature
for(key in gene_expression){features.push(key)}
feature = features[0];
// create global data holders
var indices_all = [],
    current_indices = [],
    current_colours = [],
    buffer_data_array = [];
// one index per cell: the colour table holds three values per cell
for(var j=0;j<categories_colours[category_type].length/3;j++){indices_all.push(j)}
// build the categories buttons for the first time
buildCategoryRadioButtons()
updateBuffer()
// create the renderer
var gl_context = getContext(canvas);
gl_context = initContext(gl_context)
// now draw
draw()
// draw the scale
var canvasColorScale = document.getElementById('canvasColorScale'),
    canvas_ctx = canvasColorScale.getContext('2d'),
    scale_gradient = canvas_ctx.createLinearGradient(0, 0, 200, 0);
// Paint the colour legend: white background, gradient bar from y = 20
// downwards, '0' at the left end and the maximum at the right end.
// maxVal: value shown at the right end, truncated to one decimal.
function drawScale(maxVal){
  var legendWidth = canvasColorScale.width,
      legendHeight = canvasColorScale.height,
      maxLabel = parseInt(10 * maxVal) / 10;
  canvas_ctx.fillStyle = 'white';
  canvas_ctx.fillRect(0, 0, legendWidth, legendHeight);
  canvas_ctx.fillStyle = scale_gradient;
  canvas_ctx.fillRect(0, 20, legendWidth, legendHeight);
  canvas_ctx.fillStyle = 'black';
  canvas_ctx.fillText('0', 10, 10);
  canvas_ctx.fillText(maxLabel, 180, 10);
}
// Legend stops mirror the ramp computed in the vertex shader. The shader's
// midpoint is full-intensity green, rgb(0, 255, 0), which CSS calls 'lime';
// the CSS keyword 'green' is the darker rgb(0, 128, 0) and made the legend
// disagree with the plotted colours.
scale_gradient.addColorStop(0, 'blue');
scale_gradient.addColorStop(0.5, 'lime');
scale_gradient.addColorStop(1, 'red');
// paint the legend for the initially selected gene
selectFeature()
</script>
</body>
</html>

View file

@ -0,0 +1,44 @@
library(Seurat)
# can split RDS by doublets, or any other metadata column in the seurat object
sort.by <- "doublets"
seurat.addrs <- "/home/b8058304/single_cell_data_analysis_bundle/data/test_yolk_sac_all.RDS"
#######################################################################################################
#######################################################################################################
#######################################################################################################
# load the seurat object
print("Loading the data ... ")
seurat.obj <- readRDS(seurat.addrs)
# get categories of sort.by
# (direct column access; no code has to be assembled and evaluated for this)
cats <- seurat.obj@meta.data[[sort.by]]
cats.unique <- unique(as.vector(cats))
print(cats.unique)
# set the ident slot to the sort.by meta.data
seurat.obj <- SetAllIdent(object=seurat.obj, id=sort.by)
# write one RDS file per category into the working directory
for (category in cats.unique){
  cat.object <- SubsetData(object=seurat.obj, ident.use=category, subset.raw=T, do.clean = T)
  # filter out cells you do not want to keep
  cells.to.keep <- names(cat.object@ident)[cat.object@meta.data[[sort.by]] == category]
  cat.object <- SubsetData(object=cat.object, cells.use=cells.to.keep)
  print(paste("Doing: ", category, sep = ""))
  print("Check point 3")
  print(dim(cat.object@data))
  print(dim(cat.object@raw.data))
  # spaces and path separators are replaced so the category is a valid file name
  file.name <- gsub(pattern="[ /]", replacement="_", x=category)
  file.name <- paste(file.name, ".RDS", sep = "")
  saveRDS(cat.object, file.name)
}
print("Ended beautifully")

View file

@ -0,0 +1,11 @@
#!/bin/bash
#$ -cwd
#$ -N split_seurat_by_category
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=400G
# Grid Engine wrapper: runs the split script, then reports the finish time.
Rscript split_seurat_by_category.R
echo "End on $(date)"

View file

@ -0,0 +1,90 @@
# Command-line handling: all arguments are glued together and then split on
# ";", giving exactly five fragments. Each fragment is itself R code that is
# evaluated further down.
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))
if(length(args) != 5){
stop('This pipeline requires 5 parameters: seurat.addrs (list of file names)\nappend_tag (boolean)\ntags_to_append (list of tags)\nappend_tags_at (list of meta.data columns where to append the tags)\nsave (file name to save the data at)')
}
# Maps the positional fragments onto "<name>.arg" variables.
arguments.list = "
seurat.addrs.arg = args[1]
append_tag.arg = args[2]
tags_to_append.arg = args[3]
append_tags_at.arg = args[4]
save.at.arg = args[5]
"
eval(parse(text = arguments.list))
arguments.list = unlist(strsplit(arguments.list, "\n"))
arguments.list = arguments.list[!(arguments.list == "")]
# For every "<name>.arg" variable: evaluate the text it holds and check that a
# variable called <name> exists afterwards.
# NOTE(review): this executes arbitrary R code taken from the command line;
# only run the script with trusted arguments.
for(n in 1:length(arguments.list)){
argument = arguments.list[n]
argument = gsub(pattern=" ", replacement="", x=argument)
argument.name = unlist(strsplit(argument, "="))[1]
# ".arg" is used as a regular expression here, so the dot matches any character
variable.name = gsub(pattern=".arg", replacement="", argument.name)
argument.content = eval(parse(text = argument.name))
eval(parse(text = argument.content))
if (!exists(variable.name)){
stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
}
}
# input and output file names are resolved relative to ../../data
for(n in 1:length(seurat.addrs)){
seurat.addrs[n] = file.path("../../data", seurat.addrs[n])
}
save.at = file.path("../../data", save.at)
# when tagging is requested there must be exactly one tag per input file
if(append_tag){
if ( length(tags_to_append) != length(seurat.addrs)){
stop("Number of tags is different from number of data objects. Stopping ... ")
}
}
library(Seurat)
#######################################################################################################
# init empty list to store the seurat objects
store = list()
# for each .RDS filename read the data and append object to the store list
print("loading the datasets")
for (i in seq_along(seurat.addrs)){
  seurat.obj = readRDS(seurat.addrs[i])
  # sprintf() on its own prints nothing inside a loop, and the message belongs
  # after the file has actually been read
  print(sprintf("Loaded %d out of %d", i, length(seurat.addrs)))
  if(append_tag){
    own.tag = tags_to_append[i]
    # suffix the requested meta.data columns with the tag of this dataset
    for(md.name in append_tags_at){
      seurat.obj@meta.data[[md.name]] = paste(seurat.obj@meta.data[[md.name]], own.tag, sep = '_')
    }
  }
  store[[i]] = seurat.obj
  print(store[[i]])
}
# a single input needs no merging and is passed through as it is
seurat.obj = store[[1]]
# merge first 2 objects
if(length(store) >= 2){
  print("Merging the first 2 datasets")
  seurat.obj = MergeSeurat(object1=store[[1]], object2=store[[2]], project="None", min.cells=0, min.genes=0)
}
# add the rest of the seurat objects
if(length(store) > 2){
  for (j in 3:length(store)){
    print(sprintf("Adding dataset %d", j))
    seurat.obj = MergeSeurat(object1=seurat.obj, object2=store[[j]], project="None", min.cells=0, min.genes=0)
  }
}
# release the individual objects before the memory-hungry steps below
rm(store)
print("normalizing data ... ")
seurat.obj = NormalizeData(object = seurat.obj, normalization.method = "LogNormalize", scale.factor = 10000)
print("Computing variable genes ... ")
seurat.obj = FindVariableGenes(object = seurat.obj, mean.function = ExpMean,
                               dispersion.function = LogVMR, x.low.cutoff = .0125,
                               x.high.cutoff = 3, y.cutoff = .625)
print("Scaling data ...")
seurat.obj = ScaleData(object=seurat.obj)
print("Saving data ... ")
saveRDS(seurat.obj, save.at)
print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N merge_seurat_objects
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=200G
# Grid Engine wrapper: forwards its single argument to the R script.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters"
    exit 1
fi
# Quoted so the argument reaches R as one string: unquoted, the shell would
# split it on whitespace and expand any glob characters it contains.
Rscript merge_seurat_objects.R "$1"
echo "End on $(date)"

View file

@ -0,0 +1,86 @@
# import libraries
library(Seurat)
# Settings: input object, output path, processing switches and the filter.
# filter.args maps a meta data column name to the values that are kept in that column.
seurat.obj.addr = "../../data/test_yolk_sac_subset.RDS"
save.at = "../../data/test_yolk_sac_subset_cluster_1.RDS"
process = T
add.dr = T
filter.args = list("cell.labels" = c("HSC/MPP" , "Neutrophil-myeloid progenitor", "Monocyte-DC precursor", "DC2"), "gender" = c("male"))
#######################################################################################################
#######################################################################################################
#######################################################################################################
# load the seurat object
print("Loading the data ... ")
seurat.obj = readRDS(seurat.obj.addr)
print("Data loaded")
source("../../tools/bunddle_utils.R")
# A cell is kept only if it satisfies every filter (logical AND across columns).
cells.to.keep = rep(T, length(seurat.obj@ident))
# Iterating over the names keeps an empty filter list from deselecting all cells
# (1:length(list()) would run with indices 1 and 0) and avoids shadowing base::cat.
for(md.column in names(filter.args)){
  if(!(md.column %in% colnames(seurat.obj@meta.data))){
    stop(sprintf("Meta data column %s not found. Stopping ... ", md.column))
  }
  accepted.values = as.vector(unlist(filter.args[[md.column]]))
  satisfy = seurat.obj@meta.data[, md.column] %in% accepted.values
  cells.to.keep = cells.to.keep & satisfy
}
# Subset to the selected cells, optionally re-process them, optionally add
# dimensionality reductions, then save. Nothing is saved when no cell passes the filter.
if(any(cells.to.keep)){
  selected.cells = names(seurat.obj@ident)[cells.to.keep]
  seurat.obj = SubsetData(object=seurat.obj, cells.use=selected.cells, subset.raw=TRUE, do.clean=TRUE)
  # add processing
  if (process){
    # normalize data
    print("Normalizing data ...")
    seurat.obj = NormalizeData(object = seurat.obj,
                               normalization.method = "LogNormalize",
                               scale.factor = 10000)
    # find variable genes
    print("Computing variable genes ...")
    seurat.obj = FindVariableGenes(object = seurat.obj,
                                   mean.function = ExpMean,
                                   dispersion.function = LogVMR,
                                   x.low.cutoff = .0125,
                                   x.high.cutoff = 3,
                                   y.cutoff = .625)
    # report which share of all genes was flagged as variable
    variable.share = round(100 * length(seurat.obj@var.genes) / dim(seurat.obj@data)[1], digits = 2)
    print(paste("Percentage of variable genes:", variable.share, sep = " "))
    # scale data in variable genes, otherwise pca is not possible
    print("Scaling data ...")
    seurat.obj = ScaleData(object=seurat.obj)
    # run PCA on the variable genes
    print("Performing PCA ...")
    seurat.obj = RunPCA(object = seurat.obj,
                        pc.genes = seurat.obj@var.genes,
                        do.print = TRUE,
                        pcs.print = 1:20,
                        genes.print = 10)
  }
  if(add.dr){
    # tSNE on the first 20 principal components
    print("Performing TSNE")
    seurat.obj = RunTSNE(object=seurat.obj, dims.use=1:20, seed.use=42, do.fast=TRUE)
    # UMAP is computed from the PCA embeddings and stored as a new reduction
    # NOTE(review): tool_addr and python.addr are not defined in this script; presumably bunddle_utils.R provides them - confirm
    print("running UMAP")
    umap.coordinates = RunUMAP(pca.df=seurat.obj@dr$pca@cell.embeddings,
                               tool_addr=tool_addr,
                               python.addr=python.addr)
    rownames(umap.coordinates) = names(seurat.obj@ident)
    seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="umap", slot="cell.embeddings",
                                 new.data=as.matrix(umap.coordinates))
    seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="umap", slot="key", new.data="umap")
    # force-directed graph layout built from the PCA embeddings and the SNN graph
    print("Running force directed graph")
    seurat.obj = BuildSNN(object=seurat.obj, reduction.type="pca", dims.use=1:20,
                          plot.SNN=FALSE, force.recalc=TRUE, prune.SNN=.1)
    fdg_coordinates = runFDG(pca.df=seurat.obj@dr$pca@cell.embeddings,
                             snn=seurat.obj@snn,
                             iterations=2000,
                             tool_addr=tool_addr,
                             python.addr=python.addr)
    seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="fdg", slot="cell.embeddings",
                                 new.data=as.matrix(fdg_coordinates))
    seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="fdg", slot = "key", new.data = "fdg")
  }
  # save seurat object
  print(sprintf("saving data at: %s", save.at))
  saveRDS(seurat.obj, save.at)
}else{
  print("No cells have been selected. Relax the conditions.")
}
# remove the Rplots.pdf file from the working directory
file.remove("Rplots.pdf")
print("Ended beautifully ... ")

View file

@ -0,0 +1,11 @@
#!/bin/bash
#$ -cwd
#$ -N subset_seurat
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=300G
# Job script: runs subset_seurat.R (all settings live inside the R script) and logs the end time.
Rscript subset_seurat.R
echo "End on $(date)"

130
pipelines/06_add_dr/add_dr.R Executable file
View file

@ -0,0 +1,130 @@
# Parse the command line: all arguments are glued together and split on ";".
# Each piece must be an R assignment for one of the variables listed below,
# e.g. seurat.addr='file.RDS'.
# NOTE: the pieces are evaluated as R code, so only pass trusted input.
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))
arguments.list = "
seurat.addr.arg = args[1]
do.normalize.arg = args[2]
add.PCA.arg = args[3]
add.TSNE.arg = args[4]
add.UMAP.arg = args[5]
add.FDG.arg = args[6]
save.dr.arg = args[7]
"
expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]
if(length(args) != length(expected_arguments)){
  error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments)))
  expected_arguments = paste(unlist(lapply(strsplit(expected_arguments, ".arg", fixed = TRUE), "[", 1)), collapse = "\n")
  # the previous stop(sprintf('... %s ...')) had no value for %s and failed with
  # "too few arguments" instead of showing the intended message
  stop(sprintf('%s:\n%s', error.msg, expected_arguments))
}
# bind every *.arg variable to its raw command-line text
eval(parse(text = arguments.list))
for(n in seq_along(expected_arguments)){
  argument = expected_arguments[n]
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  # strip only the literal ".arg" suffix to obtain the name of the target variable
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  # run the user supplied assignment; it is expected to create variable.name
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# create required folders for output and work material
# folder name: <pipeline folder without numeric prefix>_<seurat file name>_<time stamp>
output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
output_folder = paste(output_folder, seurat.addr, sep = "_")
# yymmddHHMMSS time stamp; an explicit format is independent of how the running
# R version converts Sys.time() to character (fractional seconds, midnight)
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive so a missing ../../output folder is created as well
dir.create(output_folder, recursive = TRUE)
seurat.addr = file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library("sva")
library(plyr)
library(dplyr)
library(reshape)
#######################################################################################################
# load data
print("loading data ... ")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")
# Each of the following steps is switched on or off by its command-line flag.
if(do.normalize){
  # normalize, select variable genes and scale
  print("Normalizing data ... ")
  seurat.obj = NormalizeData(object = seurat.obj,
                             normalization.method = "LogNormalize",
                             scale.factor = 10000)
  seurat.obj = FindVariableGenes(object = seurat.obj,
                                 mean.function = ExpMean,
                                 dispersion.function = LogVMR,
                                 x.low.cutoff = .0125,
                                 x.high.cutoff = 3,
                                 y.cutoff = .625)
  print("Scaling data ... ")
  seurat.obj = ScaleData(object=seurat.obj)
}
if(add.PCA){
  # PCA restricted to the variable genes
  print("Performing PCA")
  seurat.obj = RunPCA(object = seurat.obj, pc.genes = seurat.obj@var.genes, do.print = FALSE)
}
if (add.TSNE){
  # tSNE on the first 20 principal components
  print("Performing tSNE")
  seurat.obj = RunTSNE(object=seurat.obj, dims.use=1:20, seed.use=42, do.fast=TRUE)
}
if (add.UMAP){
  # UMAP is computed from the PCA embeddings and stored as reduction "umap"
  # NOTE(review): tool_addr and python.addr are presumably defined by bunddle_utils.R - confirm
  print("running UMAP")
  umap.coordinates = RunUMAP(pca.df=seurat.obj@dr$pca@cell.embeddings,
                             tool_addr=tool_addr,
                             python.addr=python.addr)
  rownames(umap.coordinates) = names(seurat.obj@ident)
  seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="umap", slot="cell.embeddings",
                               new.data=as.matrix(umap.coordinates))
  seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="umap", slot="key", new.data="umap")
}
if (add.FDG){
  # force-directed graph layout from the PCA embeddings and the SNN graph, stored as "fdg"
  print("Running force directed graph")
  seurat.obj = BuildSNN(object=seurat.obj, reduction.type="pca", dims.use=1:20,
                        plot.SNN=FALSE, force.recalc=TRUE)
  fdg_coordinates = runFDG(pca.df=seurat.obj@dr$pca@cell.embeddings,
                           snn=seurat.obj@snn,
                           iterations=2000,
                           tool_addr=tool_addr,
                           python.addr=python.addr)
  seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="fdg", slot="cell.embeddings",
                               new.data=as.matrix(fdg_coordinates))
  seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="fdg", slot = "key", new.data = "fdg")
}
# The updated object is written back to the file it was loaded from.
print("Saving Seurat object")
saveRDS(seurat.obj, seurat.addr)
if(!save.dr){
  # nothing to export, so the output folder created at start-up is removed again
  unlink(output_folder, recursive=TRUE, force=TRUE)
}else{
  # export cell names, all embeddings and the meta data as one table
  reductions = seurat.obj@dr
  tSNEdata = reductions$tsne@cell.embeddings
  UMAPdata = reductions$umap@cell.embeddings
  FDGdata = reductions$fdg@cell.embeddings
  PCAdata = reductions$pca@cell.embeddings
  colnames(tSNEdata) = c("tSNEx", "tSNEy")
  colnames(UMAPdata) = c("UMAPx", "UMAPy")
  colnames(FDGdata) = c("FDGx", "FDGy")
  dr_md_df = cbind(data.frame(CellNames = as.vector(names(seurat.obj@ident))),
                   tSNEdata, UMAPdata, FDGdata, PCAdata, seurat.obj@meta.data)
  write.csv(dr_md_df, file.path(output_folder, "dr_and_metadata.csv"))
}
# remove the Rplots.pdf file from the working directory
file.remove("Rplots.pdf")
print("Ended beautifully ... ")

16
pipelines/06_add_dr/add_dr.sh Executable file
View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N add_dr
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=300G
# Job script: runs add_dr.R with the single argument string given to this script.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters"
    exit 1
fi
# Quote the argument so the shell neither splits it on whitespace nor expands glob characters.
Rscript add_dr.R "$1"
echo "End on $(date)"

View file

@ -0,0 +1,159 @@
# Parse the command line: all arguments are glued together and split on ";".
# Each piece must be an R assignment for one of the variables listed below,
# e.g. seurat.addr='file.RDS'.
# NOTE: the pieces are evaluated as R code, so only pass trusted input.
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))
arguments.list = "
seurat.addr.arg = args[1]
do.normalize.arg = args[2]
add.PCA.arg = args[3]
add.TSNE.arg = args[4]
add.UMAP.arg = args[5]
add.FDG.arg = args[6]
save.dr.arg = args[7]
"
expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]
if(length(args) != length(expected_arguments)){
  error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments)))
  expected_arguments = paste(unlist(lapply(strsplit(expected_arguments, ".arg", fixed = TRUE), "[", 1)), collapse = "\n")
  # the previous stop(sprintf('... %s ...')) had no value for %s and failed with
  # "too few arguments" instead of showing the intended message
  stop(sprintf('%s:\n%s', error.msg, expected_arguments))
}
# bind every *.arg variable to its raw command-line text
eval(parse(text = arguments.list))
for(n in seq_along(expected_arguments)){
  argument = expected_arguments[n]
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  # strip only the literal ".arg" suffix to obtain the name of the target variable
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  # run the user supplied assignment; it is expected to create variable.name
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# create required folders for output and work material
# folder name: <pipeline folder without numeric prefix>_<seurat file name>_<time stamp>
output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
output_folder = paste(output_folder, seurat.addr, sep = "_")
# yymmddHHMMSS time stamp; an explicit format is independent of how the running
# R version converts Sys.time() to character (fractional seconds, midnight)
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive so a missing ../../output folder is created as well
dir.create(output_folder, recursive = TRUE)
seurat.addr = file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library("sva")
library(plyr)
library(dplyr)
library(reshape)
#######################################################################################################
# load data
print("loading data ... ")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")
if(do.normalize){
  print("Normalizing data ... ")
  seurat.obj = NormalizeData(object = seurat.obj, normalization.method = "LogNormalize", scale.factor = 10000)
  print("Applying COMBAT ...")
  # Batch correction is delegated to combat.py: expression values and batch codes are
  # written to csv files in output_folder, and the corrected matrix is read back.
  # NOTE(review): seurat.obj@var.genes is not computed in this script; presumably it is
  # already stored in the loaded object - confirm
  expression.data = as.matrix(seurat.obj@data[seurat.obj@var.genes, ])
  # Integer batch code per cell. Going through as.factor() keeps character ids from
  # turning into NA (data.frame() no longer converts strings to factors since R 4.0).
  batch = as.numeric(as.factor(seurat.obj@meta.data$fetal.ids))
  # "+" and "-" in cell names are replaced for the csv round trip and restored below
  colnames(expression.data) = gsub(pattern="CD45[+]", replacement="CD45Pos", x=colnames(expression.data))
  colnames(expression.data) = gsub(pattern="CD45[-]", replacement="CD45Neg", x=colnames(expression.data))
  write.csv(expression.data, file.path(output_folder, "data.csv"), row.names = T)
  batch = data.frame(Batch = batch)
  rownames(batch) = colnames(expression.data)
  write.csv(batch, file.path(output_folder, "batch.csv"))
  # NOTE(review): python.addr is presumably defined by bunddle_utils.R - confirm
  command = sprintf("%s combat.py %s", python.addr, shQuote(output_folder))
  exit.status = system(command, wait = T)
  if(exit.status != 0){
    # fail here with a clear message instead of at read.csv() on a missing file
    stop(sprintf("combat.py failed with exit status %s. Stopping ... ", as.character(exit.status)))
  }
  rm(expression.data)
  # check.names = FALSE keeps the cell names exactly as they were written
  combat_data = read.csv(file.path(output_folder, "combat.csv"), sep = ",", row.names = 1, check.names = FALSE)
  print("COMBAT data loaded.")
  colnames(combat_data) = gsub(pattern="CD45Pos", replacement="CD45+", x=colnames(combat_data))
  colnames(combat_data) = gsub(pattern="CD45Neg", replacement="CD45-", x = colnames(combat_data))
  combat_data = as(as.matrix(combat_data), "dgCMatrix")
  # uncorrected genes + corrected genes, in the original gene order
  genes.not = rownames(seurat.obj@data)[!(rownames(seurat.obj@data) %in% rownames(combat_data))]
  all.expression = seurat.obj@data[genes.not, ]
  all.expression = rbind(all.expression, combat_data)
  all.expression = all.expression[rownames(seurat.obj@data), ]
  # NOTE(review): all.expression is assembled above but never used; only the corrected
  # variable genes are stored in the object. Confirm whether all.expression was intended here.
  seurat.obj@data = combat_data
  print("Scaling data ... ")
  seurat.obj = ScaleData(object=seurat.obj)
}
# Each dimensionality reduction below is switched on or off by its command-line flag.
if(add.PCA){
  # PCA restricted to the variable genes
  print("Performing PCA")
  seurat.obj = RunPCA(object = seurat.obj, pc.genes = seurat.obj@var.genes, do.print = FALSE)
}
if (add.TSNE){
  # tSNE on the first 20 principal components
  print("Performing tSNE")
  seurat.obj = RunTSNE(object=seurat.obj, dims.use=1:20, seed.use=42, do.fast=TRUE)
}
if (add.UMAP){
  # UMAP is computed from the PCA embeddings and stored as reduction "umap"
  # NOTE(review): tool_addr and python.addr are presumably defined by bunddle_utils.R - confirm
  print("running UMAP")
  umap.coordinates = RunUMAP(pca.df=seurat.obj@dr$pca@cell.embeddings,
                             tool_addr=tool_addr,
                             python.addr=python.addr)
  rownames(umap.coordinates) = names(seurat.obj@ident)
  seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="umap", slot="cell.embeddings",
                               new.data=as.matrix(umap.coordinates))
  seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="umap", slot="key", new.data="umap")
}
if (add.FDG){
  # force-directed graph layout from the PCA embeddings and the SNN graph, stored as "fdg"
  print("Running force directed graph")
  seurat.obj = BuildSNN(object=seurat.obj, reduction.type="pca", dims.use=1:20,
                        plot.SNN=FALSE, force.recalc=TRUE)
  fdg_coordinates = runFDG(pca.df=seurat.obj@dr$pca@cell.embeddings,
                           snn=seurat.obj@snn,
                           iterations=2000,
                           tool_addr=tool_addr,
                           python.addr=python.addr)
  seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="fdg", slot="cell.embeddings",
                               new.data=as.matrix(fdg_coordinates))
  seurat.obj = SetDimReduction(object=seurat.obj, reduction.type="fdg", slot = "key", new.data = "fdg")
}
# The updated object is written back to the file it was loaded from.
print("Saving Seurat object")
saveRDS(seurat.obj, seurat.addr)
if(!save.dr){
  # nothing to export, so the output folder (including the COMBAT csv files) is removed
  unlink(output_folder, recursive=TRUE, force=TRUE)
}else{
  # export cell names, all embeddings and the meta data as one table
  reductions = seurat.obj@dr
  tSNEdata = reductions$tsne@cell.embeddings
  UMAPdata = reductions$umap@cell.embeddings
  FDGdata = reductions$fdg@cell.embeddings
  PCAdata = reductions$pca@cell.embeddings
  colnames(tSNEdata) = c("tSNEx", "tSNEy")
  colnames(UMAPdata) = c("UMAPx", "UMAPy")
  colnames(FDGdata) = c("FDGx", "FDGy")
  dr_md_df = cbind(data.frame(CellNames = as.vector(names(seurat.obj@ident))),
                   tSNEdata, UMAPdata, FDGdata, PCAdata, seurat.obj@meta.data)
  write.csv(dr_md_df, file.path(output_folder, "dr_and_metadata.csv"))
}
# remove the Rplots.pdf file from the working directory
file.remove("Rplots.pdf")
print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N add_dr
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=300G
# Job script: runs add_dr_COMBAT.R with the single argument string given to this script.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters"
    exit 1
fi
# Quote the argument so the shell neither splits it on whitespace nor expands glob characters.
Rscript add_dr_COMBAT.R "$1"
echo "End on $(date)"

208
pipelines/06_add_dr/combat.py Executable file
View file

@ -0,0 +1,208 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 1 18:37:16 2019
@author: doru
"""
import pandas as pd
import patsy
import sys
import numpy.linalg as la
import numpy as np
def adjust_nums(numerical_covariates, drop_idxs):
    """Shift covariate column indices to account for dropped columns.

    Every index in ``numerical_covariates`` is reduced by the number of
    dropped indices that are smaller than it, i.e. by the number of columns
    removed in front of it.

    Parameters
    ----------
    numerical_covariates : list of int or None
        Column indices before any column was dropped.
    drop_idxs : list of int
        Indices of the dropped columns.

    Returns
    -------
    list of int
        The adjusted indices. When ``numerical_covariates`` is None,
        ``drop_idxs`` is returned unchanged (behaviour kept from the original).
    """
    if numerical_covariates is None:
        return drop_idxs
    # Only dropped columns with a smaller index shift a covariate to the left;
    # the former ``nc < di`` test counted the dropped columns to its right.
    return [nc - sum(di < nc for di in drop_idxs) for nc in numerical_covariates]
def design_mat(mod, numerical_covariates, batch_levels):
    """Build the design matrix used by ``combat``.

    Parameters
    ----------
    mod : pandas.DataFrame
        Model frame; must contain a ``batch`` column.
    numerical_covariates : iterable of int or None
        Positions (within ``mod`` after the ``batch`` column is dropped) of
        the numerical covariates. None is treated as "no numerical covariates".
    batch_levels : list
        Batch labels, in the order the batch indicator columns should appear.

    Returns
    -------
    pandas.DataFrame
        One indicator column per batch, followed by the remaining columns of
        ``mod`` and then the numerical covariates.
    """
    # require levels to make sure they are in the same order as we use in the
    # rest of the script.
    design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels),
                           mod, return_type="dataframe")

    mod = mod.drop(["batch"], axis=1)
    # list(None) would raise before any later None check could run
    numerical_covariates = [] if numerical_covariates is None else list(numerical_covariates)
    sys.stderr.write("found %i batches\n" % design.shape[1])

    # every column that is not a numerical covariate is appended as-is
    other_cols = [c for i, c in enumerate(mod.columns)
                  if i not in numerical_covariates]
    factor_matrix = mod[other_cols]
    design = pd.concat((design, factor_matrix), axis=1)

    sys.stderr.write("found %i numerical covariates...\n"
                     % len(numerical_covariates))
    for nC in numerical_covariates:
        cname = mod.columns[nC]
        sys.stderr.write("\t{0}\n".format(cname))
        design[cname] = mod[cname]

    sys.stderr.write("found %i categorical variables:" % len(other_cols))
    # str() so non-string column labels do not break the join
    sys.stderr.write("\t" + ", ".join(map(str, other_cols)) + '\n')
    return design
def combat(data, batch, model=None, numerical_covariates=None):
    """Correct for batch effects in a dataset

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_features, n_samples) dataframe of the expression or methylation
        data to batch correct
    batch : pandas.Series
        A column corresponding to the batches in the data, with index same as
        the columns that appear in ``data``
    model : patsy.design_info.DesignMatrix, optional
        A model matrix describing metadata on the samples which could be
        causing batch effects. If not provided, then will attempt to coarsely
        correct just from the information provided in ``batch``.
        Note that only a ``pandas.DataFrame`` is actually used; any other
        value is ignored and the model is built from ``batch`` alone.
    numerical_covariates : list-like
        List of covariates in the model which are numerical, rather than
        categorical

    Returns
    -------
    corrected : pandas.DataFrame
        A (n_features, n_samples) dataframe of the batch-corrected data
    """
    if isinstance(numerical_covariates, str):
        numerical_covariates = [numerical_covariates]
    if numerical_covariates is None:
        numerical_covariates = []

    if model is not None and isinstance(model, pd.DataFrame):
        # work on a copy so the caller's frame does not gain a "batch" column
        model = model.copy()
        model["batch"] = list(batch)
    else:
        model = pd.DataFrame({'batch': batch})

    # group the sample labels by batch
    batch_items = model.groupby("batch").groups.items()
    batch_levels = [k for k, v in batch_items]
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # drop intercept (columns that are 1 everywhere)
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed
    drop_cols = [cname for cname, inter in (model == 1).all().items() if inter]
    model = model[[c for c in model.columns if c not in drop_cols]]
    numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
                            for c in numerical_covariates if c not in drop_cols]

    design = design_mat(model, numerical_covariates, batch_levels)

    sys.stderr.write("Standardizing Data across genes.\n")
    # least-squares fit of the design to every feature
    B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T)
    grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch, :])
    var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((int(n_array), 1)) / int(n_array))

    stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array))))
    # add the contribution of the non-batch columns to the mean
    tmp = np.array(design.copy())
    tmp[:, :n_batch] = 0
    stand_mean += np.dot(tmp, B_hat).T

    s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array)))))

    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    gamma_hat = np.dot(np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T), s_data.T)

    # per-batch variance of every feature
    delta_hat = [s_data[batch_idxs].var(axis=1) for batch_idxs in batch_info]

    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)

    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        temp = it_sol(s_data[batch_idxs], gamma_hat[i],
                      delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i])
        gamma_star.append(temp[0])
        delta_star.append(temp[1])

    sys.stdout.write("Adjusting data\n")
    # bayesdata is the same object as s_data; it is adjusted in place batch by batch
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    for j, batch_idxs in enumerate(batch_info):
        dsq = np.sqrt(delta_star[j, :])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T)
        bayesdata[batch_idxs] = numer / denom

    # undo the standardization
    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean

    return bayesdata
def it_sol(sdat, g_hat, d_hat, g_bar, t2, a, b, conv=0.0001):
    """Iterate ``postmean`` / ``postvar`` until the estimates stop changing.

    Parameters
    ----------
    sdat : pandas.DataFrame
        Data of one batch; NaN entries are not counted as observations.
    g_hat, d_hat
        Starting estimates, one value per row of ``sdat``.
    g_bar, t2, a, b
        Prior parameters handed through to ``postmean`` and ``postvar``.
    conv : float
        Iteration stops once the largest relative change of both estimates
        is no larger than this value.

    Returns
    -------
    tuple
        ``(g_new, d_new)``, the converged estimates. If ``conv`` is so large
        that no iteration runs, the starting estimates are returned.
    """
    # number of non-missing observations per row
    n = (1 - np.isnan(sdat)).sum(axis=1)
    g_old = g_hat.copy()
    d_old = d_hat.copy()
    # defined up front so the return value exists even when the loop body never runs
    g_new, d_new = g_old, d_old

    change = 1
    while change > conv:
        g_new = postmean(g_hat, g_bar, n, d_old, t2)
        # squared deviations of every observation from the current mean estimate
        sum2 = ((sdat - np.dot(g_new.values.reshape((g_new.shape[0], 1)), np.ones((1, sdat.shape[1])))) ** 2).sum(axis=1)
        d_new = postvar(sum2, n, a, b)

        # largest relative change of either estimate
        change = max((abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max())
        g_old = g_new
        d_old = d_new
    return (g_new, d_new)
def aprior(gamma_hat):
    """Return ``(2 * var + mean**2) / var`` of the given values.

    ``gamma_hat`` must provide ``mean()`` and ``var()`` (e.g. a numpy array
    or a pandas Series).
    """
    mean = gamma_hat.mean()
    variance = gamma_hat.var()
    numerator = 2 * variance + mean ** 2
    return numerator / variance
def bprior(gamma_hat):
    """Return ``(mean * var + mean**3) / var`` of the given values.

    ``gamma_hat`` must provide ``mean()`` and ``var()`` (e.g. a numpy array
    or a pandas Series).
    """
    mean = gamma_hat.mean()
    variance = gamma_hat.var()
    numerator = mean * variance + mean ** 3
    return numerator / variance
def postmean(g_hat, g_bar, n, d_star, t2):
    """Return the average of ``g_hat`` and ``g_bar`` weighted by ``t2 * n`` and ``d_star``."""
    weighted_sum = t2 * n * g_hat + d_star * g_bar
    total_weight = t2 * n + d_star
    return weighted_sum / total_weight
def postvar(sum2, n, a, b):
    """Return ``(0.5 * sum2 + b) / (n / 2 + a - 1)``."""
    numerator = 0.5 * sum2 + b
    denominator = n / 2.0 + a - 1.0
    return numerator / denominator
# get arguments
# Script part: reads data.csv and batch.csv from the working folder, runs combat()
# and writes the corrected matrix to combat.csv in the same folder.
from os.path import join

# The folder is the first command-line argument. "COMBAT" used to be assigned and
# immediately overwritten; it now serves as the fallback when no argument is given.
output_folder = sys.argv[1] if len(sys.argv) > 1 else "COMBAT"

data_file = join(output_folder, "data.csv")
batch_file = join(output_folder, "batch.csv")

# both files carry their row labels in the first column
data = pd.read_csv(data_file, index_col=0)
batch = pd.read_csv(batch_file, index_col=0)

combat_data = combat(data, batch['Batch'])
combat_data.to_csv(join(output_folder, "combat.csv"), sep=",")

View file

@ -0,0 +1,42 @@
# seurat object address
seurat.addrs  <- "../../data/dummydata.RDS"
# csv file the marker gene table is written to
save.to       <- "../../resources/dummydata_markergenes.csv"
DE.downsample <- F # flag to indicate to downsample by cluster before computing DE genes
category      <- "Annotation_5" # categories to calculate DE genes for
library(Seurat)
library(dplyr)
# load the seurat object and use `category` as cell identity
print("Loading seurat object ...")
seurat.obj <- readRDS(seurat.addrs)
seurat.obj <- SetAllIdent(object=seurat.obj, id=category)
# writing marker genes to disk
if (DE.downsample){
  # keep at most 200 randomly chosen cells per cluster
  cluster.ids <- unique(as.vector(seurat.obj@ident))
  cells.to.keep <- c()
  for (cluster.id in cluster.ids){
    cell.ids <- names(seurat.obj@ident)[seurat.obj@ident == cluster.id]
    cell.ids <- which(names(seurat.obj@ident) %in% cell.ids)
    # Draw positions and index into cell.ids: sample(cell.ids, ...) would sample from
    # 1:cell.ids whenever a cluster consists of a single cell.
    picked <- cell.ids[sample.int(n=length(cell.ids), size=min(200, length(cell.ids)), replace=F)]
    cells.to.keep <- c(picked, cells.to.keep)
  }
  seurat.obj_d <- SubsetData(object=seurat.obj, cells.use=names(seurat.obj@ident)[cells.to.keep])
  seurat.obj_d <- NormalizeData(object = seurat.obj_d, normalization.method = "LogNormalize", scale.factor = 10000)
  print("Calculating marker genes: finished subseting, currently actually calculating the markers ... ")
  marker.genes <- FindAllMarkers(object = seurat.obj_d, only.pos = F, min.pct = 0.25, thresh.use = 0.25,
                                 genes.use = rownames(seurat.obj@data), test.use = "wilcox",
                                 random.seed = 42, print.bar=T, do.print=T, max.cells.per.ident = 200)
}else{
  print("Calculating marker genes ... ")
  marker.genes <- FindAllMarkers(object = seurat.obj, only.pos = F, min.pct = 0.25, thresh.use = 0.25,
                                 genes.use = rownames(seurat.obj@data), test.use = "wilcox",
                                 random.seed = 42, print.bar=T, do.print=T, max.cells.per.ident = 200)
}
print("Saving marker genes ... ")
print(save.to)
write.csv(marker.genes, save.to)
print("Finished!")

View file

@ -0,0 +1,11 @@
#!/bin/bash
#$ -cwd
#$ -N compute_DEG
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=200G
# Job script: runs compute_DEG.R (all settings live inside the R script) and logs the end time.
Rscript compute_DEG.R
echo "End on $(date)"

View file

@ -0,0 +1,115 @@
# Parse the command line: all arguments are glued together and split on ";".
# Each piece must be an R assignment for one of the seven variables listed below.
args = unlist(strsplit(paste(commandArgs(trailingOnly=T), collapse = ""), ";"))
if(length(args) != 7){
  stop('This pipeline requires 7 parameters: seurat.addr, set.ident (from meta.data on which to set identity of cells,
type.to.colours (file holding the type to colour key), cell.labels (list of categories or all), plot.width,
plot.height, features.file (name of file that holds the list of genes, should be placed in the resource folder)')
}
arguments.list = "
seurat.addr.arg = args[1]
set.ident.arg = args[2]
type.to.colours.arg = args[3]
cell.labels.arg = args[4]
plot.width.arg = args[5]
plot.height.arg = args[6]
features.file.arg = args[7]
"
# bind every *.arg variable to its raw command-line text
eval(parse(text = arguments.list))
arguments.list = unlist(strsplit(arguments.list, "\n"))
arguments.list = arguments.list[arguments.list != ""]
for(argument in arguments.list){
  # left-hand side of the line, e.g. "seurat.addr.arg"
  argument.name = unlist(strsplit(gsub(pattern=" ", replacement="", x=argument), "="))[1]
  variable.name = gsub(pattern=".arg", replacement="", argument.name)
  # run the user supplied assignment; it is expected to create variable.name
  eval(parse(text = eval(parse(text = argument.name))))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# create required folders for output and work material
# folder name: 08_violin_plots_<seurat file name>_<time stamp>
output_folder = paste("08_violin_plots", seurat.addr, sep = "_")
# yymmddHHMMSS time stamp; an explicit format is independent of how the running
# R version converts Sys.time() to character (fractional seconds, midnight)
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive so a missing ../../output folder is created as well
dir.create(output_folder, recursive = TRUE)
seurat.addr = file.path("../../data", seurat.addr)
# read the features to plot, one per line, from the resources folder
features.file = file.path("../../resources", features.file)
features.file = file(features.file, "r")
features = readLines(features.file)
close(features.file)
source("../../tools/bunddle_utils.R")
library(Seurat)
library(plyr)
library(dplyr)
library(reshape)
library(ggplot2)
library(RColorBrewer)
#######################################################################################################
# Load the object, set the identity column and keep only features present in the data.
print("Loading data ...")
seurat.obj <- readRDS(seurat.addr)
print("Data loaded.")
seurat.obj <- SetAllIdent(object=seurat.obj, id=set.ident)
print("Data loaded.")
print("Features present in data:")
known.genes <- rownames(seurat.obj@data)
print(table(features %in% known.genes))
features = intersect(features, known.genes)
# cell.labels is either the keyword "all" or the name of a file in the resources folder
if(cell.labels != "all"){
  cell.labels = file.path("../../resources", cell.labels)
  cell.labels.file = file(cell.labels, "r")
  cell.labels = readLines(cell.labels.file)
  close(cell.labels.file)
}else{
  cell.labels = as.vector(unique(seurat.obj@ident))
}
# colours: random palette when no key file is given, otherwise looked up in the key file
if (is.na(type.to.colours)){
  cell.colours <- sample(colorRampPalette(brewer.pal(12, "Paired"))(length(cell.labels)))
}else{
  type.to.colours = file.path("../../resources", type.to.colours)
  type.to.colour <- read.csv(type.to.colours)
  match.key <- match(cell.labels, type.to.colour$CellTypes)
  cell.colours <- as.vector(type.to.colour$Colours[match.key])
}
# restrict the object to the requested cell labels
label.selected = as.vector(seurat.obj@ident) %in% cell.labels
to.keep = names(seurat.obj@ident)[label.selected]
seurat.obj = SubsetData(object=seurat.obj, cells.use=to.keep)
# Build a ggplot object with one violin panel per feature, stacked in one column.
#   seurat.obj    - Seurat object; expression is read from @data, labels from @ident
#   class.order   - cell labels in the order they appear on the x axis
#   class.colours - fill colours handed to scale_fill_manual
#   features      - names of the genes to plot (rows of seurat.obj@data)
# Returns the plot object; nothing is drawn here.
plot_violins <- function(seurat.obj, class.order, class.colours, features){
  # drop = FALSE keeps the matrix shape when a single feature is requested; otherwise
  # the selection collapses to a vector and cells and gene are mixed up after melting
  expression.data <- t(as.data.frame(as.matrix(seurat.obj@data[features, , drop = FALSE])))
  # long format: one row per (cell, gene) pair
  expression.data <- melt(expression.data)
  colnames(expression.data) <- c("CellLabels", "Gene", "Expression")
  # replace cell names by the identity of each cell
  expression.data$CellLabels <- mapvalues(x=expression.data$CellLabels, from=names(seurat.obj@ident), to=as.vector(seurat.obj@ident))
  expression.data$CellLabels <- factor(as.vector(expression.data$CellLabels), levels = class.order)
  plot.obj <- ggplot(data=expression.data, aes(x=CellLabels, y = Expression, fill = CellLabels))
  plot.obj <- plot.obj + geom_violin(scale = 'width')
  plot.obj <- plot.obj + facet_wrap(~Gene, ncol=1)
  plot.obj <- plot.obj + theme(axis.text.x = element_text(angle = 90), legend.position = "none")
  plot.obj <- plot.obj + scale_fill_manual(values=class.colours)
  plot.obj
}
# Draw the violin plots into features.pdf inside the output folder.
f.name = file.path(output_folder, "features.pdf")
pdf(f.name, width = plot.width, height = plot.height)
violin.plot = plot_violins(seurat.obj, class.order=cell.labels, class.colours=cell.colours, features)
print(violin.plot)
dev.off()
print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N violin_plots
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=200G
# Job script: runs violin_plots.R with the single argument string given to this script.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters"
    exit 1
fi
# Quote the argument so the shell neither splits it on whitespace nor expands glob characters.
Rscript violin_plots.R "$1"
echo "End on $(date)"

View file

@ -0,0 +1,179 @@
# Parse the command line: all arguments are glued together and split on ";".
# Each piece must be an R assignment for one of the eight variables listed below.
args = unlist(strsplit(paste(commandArgs(trailingOnly=T), collapse = ""), ";"))
if(length(args) != 8){
  stop('This pipeline requires 8 parameters: seurat.addr\n set.ident \n genes.to.plot (name of file containing genes to plot)\n cell.types (name of file containing cell types to plot or all)\n cluster.genes (boolean)\n diagonalize (boolean indicating to compute the order of the genes in such a way as to make the appearance of a diagonal on the spotplot - will override the cluster.genes boolean)\n plot.dims (4 tuple for plots dimenssion)\n save.gene.order (boolean indicate to save the genes in the order computed by clustering and/or diaganolization - can be NA or a file name)')
}
arguments.list = "
seurat.addr.arg = args[1]
set.ident.arg = args[2]
genes.to.plot.arg = args[3]
cell.types.arg = args[4]
cluster.genes.arg = args[5]
diagonalize.arg = args[6]
plot.dims.arg = args[7]
save.gene.order.arg = args[8]
"
# bind every *.arg variable to its raw command-line text
eval(parse(text = arguments.list))
arguments.list = unlist(strsplit(arguments.list, "\n"))
arguments.list = arguments.list[arguments.list != ""]
for(argument in arguments.list){
  # left-hand side of the line, e.g. "seurat.addr.arg"
  argument.name = unlist(strsplit(gsub(pattern=" ", replacement="", x=argument), "="))[1]
  variable.name = gsub(pattern=".arg", replacement="", argument.name)
  # run the user supplied assignment; it is expected to create variable.name
  eval(parse(text = eval(parse(text = argument.name))))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# create required folders for output and work material
# folder name: 09_gene_expression_heatmap_and_spotplot_<seurat file name>_<time stamp>
output_folder = paste("09_gene_expression_heatmap_and_spotplot", seurat.addr, sep = "_")
# yymmddHHMMSS time stamp; an explicit format is independent of how the running
# R version converts Sys.time() to character (fractional seconds, midnight)
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive so a missing ../../output folder is created as well
dir.create(output_folder, recursive = TRUE)
seurat.addr = file.path("../../data", seurat.addr)
genes.to.plot = file.path("../../resources", genes.to.plot)
library(Seurat)
library(dplyr)
library(plyr)
library(reshape)
#######################################################################################################
# load seurat object
print("loading data ...")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")
# set identities
seurat.obj = SetAllIdent(object=seurat.obj, id=set.ident)
# load genes, one per line
genes.to.plot = file(genes.to.plot, "r")
genes = readLines(genes.to.plot, warn=FALSE)
close(genes.to.plot)
genes = unlist(strsplit(genes, "\n"))
# report and drop genes that are not in the dataset
gene.is.known = genes %in% rownames(seurat.obj@data)
if(!all(gene.is.known)){
  not.found = genes[!gene.is.known]
  print(sprintf("The following genes were not found in the data: %s", paste(not.found, collapse = ", ")))
  genes = genes[gene.is.known]
}
# report and drop duplicated genes
if(length(genes) != length(unique(genes))){
  gene.counts = table(genes)
  duplicates = paste(names(gene.counts[gene.counts > 1]), collapse = ", ")
  print(sprintf("Duplicates found: %s", duplicates))
  print("This will not affect the workflow, but be aware the heat map will have fewer genes than expected.")
  genes = unique(genes)
}
# cell types: either every identity (sorted) or the ", " separated list read from a resources file
if(cell.types == "all"){
  cell.types = sort(as.vector(unique(seurat.obj@ident)))
}else{
  cell.types = file.path("../../resources", cell.types)
  cell_types_file = file(cell.types, "r")
  cell.types = readLines(cell_types_file, warn = FALSE)
  close(cell_types_file)
  cell.types = unlist(strsplit(cell.types, ", "))
  print(cell.types)
  print("All cell types in data set:")
  print(table(cell.types %in% as.vector(unique(seurat.obj@ident))))
}
# subset expression data matrix to the selected genes and cells
keep.cell.names = names(seurat.obj@ident)[seurat.obj@ident %in% cell.types]
expression.data = data.matrix(seurat.obj@data[genes, keep.cell.names])
# mean expression of each gene per cell type: cells in rows, label column first
expression.data = as.data.frame(t(expression.data))
cell.label.column = data.frame(CellLabels = as.vector(seurat.obj@ident[keep.cell.names]))
expression.data = cbind(cell.label.column, expression.data)
expression.data = aggregate(expression.data[2:dim(expression.data)[2]], list(expression.data$CellLabels), mean)
# move the group column into the row names so only gene columns remain
rownames(expression.data) = expression.data$Group.1
expression.data = expression.data[, 2:ncol(expression.data)]
# cluster the genes and reorder the expression matrix
if (cluster.genes){
  expression.distance = dist(x=t(expression.data), method="euclidian")
  gene.order = hclust(d=expression.distance, method="ward.D")$order
  expression.data = expression.data[, gene.order]
}
# order genes by the expression-weighted mean row position, which gives the diagonal look
if (diagonalize){
  weighted.center = function(vecn){
    positions = 1:length(vecn)
    sum(positions * vecn) / sum(vecn)
  }
  centers = order(apply(X=expression.data, MARGIN=2, FUN=weighted.center))
  print(length(centers))
  expression.data = expression.data[, centers]
}
# plot the heatmap: genes on the x axis, cell types (reversed) on the y axis
expression.melt = reshape::melt(data=as.matrix(expression.data))
colnames(expression.melt) = c("CellTypes", "Genes", "Values")
expression.melt$CellTypes = factor(as.vector(expression.melt$CellTypes), levels = cell.types)
heatmap.plot = ggplot(expression.melt,
                      aes(factor(Genes, levels = colnames(expression.data)),
                          factor(CellTypes, levels = rev(cell.types)))) +
  geom_tile(aes(fill = Values), color = "black") +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(fill='Expression')
# the first two entries of plot.dims are width and height of the heatmap
heatmap.fname = file.path(output_folder, "./heatmap.pdf")
pdf(heatmap.fname, width = plot.dims[1], height = plot.dims[2])
print(heatmap.plot)
dev.off()
# plot diag matrix as dot plot (spot plot); rows and columns are reversed first
expression.data.r = expression.data[rev(cell.types), rev(colnames(expression.data))]
expression.melt = reshape::melt(data=as.matrix(expression.data.r))
colnames(expression.melt) = c("CellTypes", "Genes", "Expression")
# numeric grid position of every (cell type, gene) pair
n.genes = length(unique(expression.melt$Genes))
n.cell.types = length(unique(expression.melt$CellTypes))
expression.melt$X = rep(1:n.genes, each=nrow(expression.data.r))
expression.melt$Y = rep(n.cell.types:1, times=ncol(expression.data.r))
# upper limit of the colour and size scales
max.expression = floor(max(expression.melt$Expression)) + 1
scale.breaks = seq(0, max.expression, by = 1)
spot.plot = ggplot(expression.melt, aes(x = Y, y = X)) +
  geom_point(aes(size = Expression, color = Expression)) +
  scale_color_gradient(limits = c(0, max.expression), breaks = scale.breaks, low = "lightsteelblue1", high = "darkred") +
  guides(color = guide_legend(), size = guide_legend()) +
  scale_size_continuous(limits=c(0, max.expression), breaks=scale.breaks) +
  scale_y_discrete(name ="", limits=colnames(expression.data.r)) +
  scale_x_discrete(name ="", limits=rev(rownames(expression.data.r))) +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))
# the last two entries of plot.dims are width and height of the spot plot
splotplot.fname = file.path(output_folder, "./spotplot.pdf")
pdf(splotplot.fname, width = plot.dims[3], height = plot.dims[4])
print(spot.plot)
dev.off()
# optionally write the final gene order to a file in the resources folder
if(!is.na(save.gene.order)){
  save.gene.order = file.path("../../resources", save.gene.order)
  save.gene.order = file(save.gene.order, "w")
  writeLines(colnames(expression.data), save.gene.order)
  close(save.gene.order)
}
print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N gene_heatmap_and_spotplot
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=200G
# Job wrapper around gene_heatmap_and_spotplot.R.
# NOTE(review): the "#$" lines above look like Grid Engine (qsub) directives -
# run from the submission directory, job name, export the environment, and
# run-time / memory requests; confirm against the cluster's scheduler docs.
#
# Usage: gene_heatmap_and_spotplot.sh "<argument string>"
# Exactly one argument is accepted; it is handed to the R script.
if [ "$#" -ne 1 ]; then
echo "Illegal number of parameters"
exit 1
fi
# $1 is deliberately left as written (unquoted): the shell word-splits it, so
# Rscript receives it as several pieces. NOTE(review): presumably the R script
# concatenates them again like the other pipeline scripts do - quoting $1 would
# change what the R side sees, so verify before "fixing" this.
Rscript gene_heatmap_and_spotplot.R $1
# Printed regardless of whether Rscript succeeded.
echo "End on `date`"

View file

@ -0,0 +1,28 @@
HSC/MPP
Pre pro B cell
pro-B cell
pre-B cell
B cell
ILC progenitor
Early lymphoid/T lymphocyte
NK
Neutrophil-myeloid progenitor
Monocyte-DC precursor
pDC precursor
DC1
DC2
Monocyte
Mono-Mac
Mono-NK
Kupffer Cell
VCAM1+ EI macrophage
EI macrophage
MEMP
Mast cell
Megakaryocyte
Early Erythroid
Mid Erythroid
Late Erythroid
Endothelial cell
Fibroblast
Hepatocyte

View file

@ -0,0 +1,21 @@
PTPRC
IL3RA
CD7
EPCAM
FCGR3A
CD4
HLA-DRA
MS4A1
VCAM1
CD38
NCAM1
CLEC9A
CD14
KIT
ESAM
CD3E
CD8A
CD1C
CD34
GYPA
CD79B

View file

@ -0,0 +1,38 @@
HLA-DRA
CD34
SPINK2
JCHAIN
IGLL1
CD79B
TCL1A
IGKC
MS4A1
LTB
PTPRC
CD3E
CD7
IL32
CD8A
NKG7
XCL2
NCAM1
MPO
LYZ
PLAC8
IL3RA
CLEC9A
CD1C
S100A9
CCL4
CD14
FCGR3A
CD4
C1QA
VCAM1
GYPA
SERPINB1
TPSAB1
KIT
PF4
ESAM
UBE2C

View file

@ -0,0 +1,48 @@
HLA-DRA
CD34
SPINK2
JCHAIN
IGLL1
CD79B
TCL1A
IGKC
MS4A1
CD19
LTB
KLRB1
PTPRC
CD3E
CD7
IL32
CD8A
KLRD1
NKG7
XCL2
NCAM1
MPO
LYZ
PLAC8
IL3RA
CLEC9A
CD1C
S100A9
CCL4
CD14
FCGR3A
CD4
C1QA
VCAM1
GYPA
SERPINB1
TPSAB1
KIT
PF4
ITGA2B
UBE2C
GATA1
KLF1
ALAS2
HBA1
ESAM
ECM1
APOA1

View file

@ -0,0 +1 @@
HSC pro B cell early pro B cell pre B cell B cell ILC progenitor NK Progenitor NK Neut-myeloid progenitor Monocyte-DC progenitor pDC progenitor DC1 DC2 Monocyte Mono-Mac Mono-4 like Kupffer Cell VCAM1+ Erythroid Macrophage Erythroid Macrophage MEP Mast cell Megakaryocyte Early Erythroid Mid Erythroid Late Erythroid Endothelial cell Fibroblast Hepatocyte

View file

@ -0,0 +1,84 @@
VCAM1
FCGR3A
CD14
GYPA
CD1C
LYZ
NKG7
CD3D
CTSW
ESAM
CD34
MYC
GATA2
CLEC9A
IL3RA
SPIB
IRF8
TPSAB1
CPA3
PF4
ITGA2B
MKI67
MS4A1
CD79B
EBF1
DNTT
SPINK2
IGLL1
CD7
XCL2
IFNG
RORC
MPO
GATA1
KLF1
APOA1
AHSG
IGKC
IGLC2
IGLC3
HLA-DQB1
HLA-DPB1
HLA-DPA1
HLA-DRA
CNRIP1
DNASE1L3
AHSP
HBM
HBZ
HBA1
HBA2
HBG1
APOA2
ALB
C1QTNF4
IL7R
LTB
CD52
C1QC
C1QA
C1QB
TPSB2
HBD
PPBP
UBE2C
PRSS57
SERPINB1
KLRB1
CCL4
CCL3
HLA-DRB1
S100A9
S100A8
LGALS1
AZU1
PRTN3
GZMA
IL32
JCHAIN
PLAC8
IGHM
TCL1A
VPREB3
HBB

View file

@ -0,0 +1,84 @@
SPINK2
CD34
C1QTNF4
IGLL1
EBF1
DNTT
LTB
CD52
CD79B
VPREB3
IGHM
JCHAIN
IGLC2
TCL1A
SPIB
IGKC
MS4A1
IGLC3
RORC
IL7R
KLRB1
CD3D
IL32
CD7
CTSW
GZMA
XCL2
IFNG
CCL4
CCL3
NKG7
PRSS57
SERPINB1
APOA2
ALB
MPO
AZU1
PRTN3
APOA1
AHSG
HLA-DPB1
HLA-DPA1
HLA-DRA
HLA-DRB1
CD1C
PLAC8
IRF8
IL3RA
DNASE1L3
CLEC9A
HLA-DQB1
LGALS1
LYZ
S100A9
S100A8
C1QC
C1QA
C1QB
HBA1
HBA2
HBG1
HBB
AHSP
HBM
FCGR3A
CD14
VCAM1
GYPA
HBZ
KLF1
MYC
CPA3
TPSAB1
TPSB2
GATA2
CNRIP1
ITGA2B
PPBP
PF4
HBD
ESAM
GATA1
MKI67
UBE2C

View file

@ -0,0 +1 @@
HSC pro B cell early pro B cell pre B cell B cell ILC progenitor NK Progenitor NK NK - proliferating Neut-myeloid progenitor Monocyte-DC progenitor pDC progenitor DC1 DC2 Monocyte Mono-Mac Mono-4 like Kupffer Cell VCAM1+ Erythroid Macrophage Erythroid Macrophage MEP Mast cell Megakaryocyte Megakaryocyte - proliferating

View file

@ -0,0 +1,48 @@
HLA-DRA
CD34
SPINK2
JCHAIN
IGLL1
CD79B
TCL1A
IGKC
MS4A1
CD19
LTB
KLRB1
PTPRC
CD3E
CD7
IL32
CD8A
KLRD1
NKG7
XCL2
NCAM1
MPO
LYZ
PLAC8
IL3RA
CLEC9A
CD1C
S100A9
CCL4
CD14
FCGR3A
CD4
C1QA
VCAM1
GYPA
SERPINB1
TPSAB1
KIT
PF4
ITGA2B
UBE2C
GATA1
KLF1
ALAS2
HBA1
ESAM
ECM1
APOA1

View file

@ -0,0 +1,64 @@
# Build the interactive gene-expression viewer apps for a saved Seurat object.
#
# Parameters arrive on the command line as R assignments separated by ';',
# one per expected argument and in the order given in arguments.list, e.g.
#   "seurat.addr = '<file in ../../data>'; dim.type = '<value for dim.type>'"
# NOTE(review): every assignment is executed with eval(parse()), i.e. the
# command line is run as R code - only pass trusted input.

# commandArgs() yields one element per shell word; concatenate them without a
# separator and split the result into one assignment per ';'.
args = commandArgs(trailingOnly=TRUE)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))

# one line per expected argument: <name>.arg receives the raw assignment text
arguments.list = "
seurat.addr.arg = args[1]
dim.type.arg = args[2]
"

expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]

# wrong number of arguments: stop with a message naming the expected arguments
if(length(args) != length(expected_arguments)){
  expected_names = unlist(lapply(strsplit(expected_arguments, ".arg", fixed = TRUE), "[", 1))
  stop(sprintf("This pipeline requires %s parameters:\n%s",
               as.character(length(expected_arguments)),
               paste(expected_names, collapse = "\n")))
}

# bind the raw assignment strings (seurat.addr.arg, dim.type.arg) ...
eval(parse(text = arguments.list))
# ... then execute each of them so that the actual variables get defined
for(argument in expected_arguments){
  argument = gsub(pattern=" ", replacement="", x=argument, fixed = TRUE)
  argument.name = unlist(strsplit(argument, "="))[1]
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}

# create required folders for output and work material
# folder name: <pipeline folder without numeric prefix>_<seurat file>_<yymmddHHMMSS>
output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
output_folder = paste(output_folder, seurat.addr, sep = "_")
# format() gives the same stamp the gsub()/substr() chain used to derive, but
# does not depend on how as.character() prints a POSIXct (newer R versions add
# fractional seconds there)
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive: also create ../../output when it does not exist yet
dir.create(output_folder, recursive = TRUE)

seurat.addr = file.path("../../data", seurat.addr)

source("../../tools/bunddle_utils.R")

library(Seurat)
library(RColorBrewer)
library(plyr)
library(dplyr)

#######################################################################################################

print('Loading data ...')
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")

print('Data loaded, creating the apps ..')
# NOTE(review): tool_addr and python.addr are not defined in this script;
# presumably they come from bunddle_utils.R sourced above - confirm.
create_gene_expression_viewer_apps(seurat.obj=seurat.obj, dim.type=dim.type, save.to=output_folder, tool_addr=tool_addr, python.addr=python.addr)

print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N seurat_to_interactive_gene_expression
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=200G
# Job wrapper around seurat_to_interactive_gene_expression.R.
# NOTE(review): the "#$" lines above look like Grid Engine (qsub) directives -
# run from the submission directory, job name, export the environment, and
# run-time / memory requests; confirm against the cluster's scheduler docs.
#
# Usage: seurat_to_interactive_gene_expression.sh "<name = value; name = value>"
# Exactly one argument is accepted; it is handed to the R script.
if [ "$#" -ne 1 ]; then
echo "Illegal number of parameters"
exit 1
fi
# $1 is deliberately left as written (unquoted): the shell word-splits it and
# the R script concatenates the pieces again without a separator before
# splitting on ';'. Quoting $1 would keep the whitespace and change what the R
# side parses, so verify before "fixing" this.
Rscript seurat_to_interactive_gene_expression.R $1
# Printed regardless of whether Rscript succeeded.
echo "End on `date`"

64
pipelines/11_plot_dr/dm.py Executable file
View file

@ -0,0 +1,64 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 14 15:01:36 2018
@author: doru

Compute a diffusion map with scanpy.

Usage: dm.py <working_folder>

Reads raw_data.mtx, genenames.csv, cellnames.csv and cell_labels.csv from
<working_folder> and writes the diffusion-map coordinates to
<working_folder>/dm.csv.
"""
import sys
args = sys.argv
# the only command-line argument: folder holding the input files and the output
working_folder = args[1]
# select the Agg backend before scanpy is imported
import matplotlib; matplotlib.use('Agg');
import scanpy.api as sc;
import pandas as pd
sc.settings.verbosity = 3
# the matrix is transposed right after reading
# NOTE(review): presumably the file is genes x cells and scanpy wants cells x genes - confirm
scObj = sc.read("{CW}/raw_data.mtx".format(CW = working_folder), cache = False).T
# load gene names (second CSV column; the first is taken to be a row index)
scObj.var_names = pd.read_csv("{CW}/genenames.csv".format(CW = working_folder)).iloc[:, 1]
# load cell names (second CSV column; the first is taken to be a row index)
scObj.obs_names = pd.read_csv("{CW}/cellnames.csv".format(CW = working_folder)).iloc[:, 1]
# add cell labels (first CSV column becomes the index)
cell_labels = pd.read_csv("{CW}/cell_labels.csv".format(CW = working_folder), index_col = 0)
scObj.obs["cell_labels"] = cell_labels
# filter out genes present in less than 3 cells
sc.pp.filter_genes(scObj, min_cells=3)
# keep a log1p-transformed copy of the filtered data in .raw, then normalize
# the working matrix per cell (the working matrix is not logged yet)
scObj.raw = sc.pp.log1p(scObj, copy=True)
sc.pp.normalize_per_cell(scObj, counts_per_cell_after=1e4)
# variable genes, selected on the normalized, not yet logged matrix
filter_result = sc.pp.filter_genes_dispersion(
scObj.X, min_mean=0.0125, max_mean=3, min_disp=0.5)
# subset data on variable genes
scObj = scObj[:, filter_result.gene_subset]
# log-transform the working matrix itself (so far only the .raw copy was logged)
# NOTE(review): the original author marked this step "not sure?" - confirm it is intended
sc.pp.log1p(scObj)
# scale the data
sc.pp.scale(scObj, max_value=10)
# run pca
sc.tl.pca(scObj)
# compute neighborhood graph
sc.pp.neighbors(scObj, n_neighbors = 15, n_pcs = 20, knn = True, random_state = 10, method = "gauss")
# compute diffusion map
sc.tl.diffmap(scObj, n_comps = 20)
# save diffusion map to disk: no header row; the default integer row index is
# written as the first column
dm = scObj.obsm["X_diffmap"]
dm = pd.DataFrame(data = dm, index = None, columns = None)
dm.to_csv("{CW}/dm.csv".format(CW = working_folder), columns = None, header = None)

View file

@ -0,0 +1,90 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 22 11:03:12 2018
@author: doru

Build the interactive AGA linkage-map page for one annotation category.

Usage: make_AGA_app.py <output_folder> <category>

Reads ``coordinates.csv``, ``connectivities.csv`` and, when present,
``colours.csv`` from ``<output_folder>/AGA_folder``, renders them as
JavaScript assignments, puts them in place of the ``// insert data here``
marker of ``template_for_AGA_app.html`` (looked up in the current working
directory) and writes ``<output_folder>/AGAlinkage_map_<category>.html``.
"""
import random
import sys
from os.path import join

import pandas as pd

# factor applied to the coordinate bounds before positions are normalised
SCALE = 1.4
TEMPLATE_FNAME = 'template_for_AGA_app.html'
DATA_MARKER = '// insert data here'


def _random_colours(cell_types):
    """Return a colour table indexed by cell type with one random hex colour each."""
    rows = []
    for cell_type in cell_types:
        red, green, blue = (random.randint(0, 255) for _ in range(3))
        rows.append({'CellTypes': cell_type,
                     'Colours': '#%02X%02X%02X' % (red, green, blue)})
    # explicit columns keep set_index working for an empty cell type list
    colours = pd.DataFrame(rows, columns=['CellTypes', 'Colours'])
    return colours.set_index('CellTypes')


def _load_colours(colors_fname, cell_types):
    """Read the colour table; fall back to random colours when the file is missing."""
    try:
        return pd.read_csv(colors_fname, index_col=0, header=0)
    except FileNotFoundError:
        return _random_colours(cell_types)


def _render_data(coordinates, connectivities, colors):
    """Render coordinates, edge weights and cell names as JavaScript statements."""
    lower = coordinates.min()
    upper = coordinates.max()
    # bounds are taken by position (first and second column), as before;
    # .iloc avoids the deprecated integer lookup on a label-indexed Series
    # NOTE(review): multiplying the bounds only pads the range when it
    # straddles zero - confirm the coordinates are centred.
    min_x, min_y = lower.iloc[0] * SCALE, lower.iloc[1] * SCALE
    max_x, max_y = upper.iloc[0] * SCALE, upper.iloc[1] * SCALE

    # largest populations first, so that big bubbles are drawn before (and
    # therefore below) the small ones
    sizes = coordinates.Size.tolist()
    names = list(coordinates.index)
    ordered = [name for _, name in sorted(zip(sizes, names), reverse=True)]

    coordinate_lines = []
    for name in ordered:
        row = coordinates.loc[name]
        x_pos = (row.X - min_x) / (max_x - min_x)
        y_pos = (row.Y - min_y) / (max_y - min_y)
        coordinate_lines.append(
            'data_coordinates["{cell_name}"] = [{X}, {Y}, {R}, "{C}"]'.format(
                cell_name=name, X=x_pos, Y=y_pos, R=row.Size,
                C=colors.loc[name].Colours))

    # edge weights of every cell type towards all others, in the same order
    edge_lines = []
    for name in ordered:
        weights = ','.join(str(w) for w in connectivities[name][ordered].tolist())
        edge_lines.append('data_edges["{cell_name}"] = [{indata}]'.format(
            cell_name=name, indata=weights))

    quoted = ','.join('"{cell_name}"'.format(cell_name=name) for name in ordered)
    names_line = 'cell_names = [{cell_names}]'.format(cell_names=quoted)

    return '\n'.join(['\n'.join(coordinate_lines), '\n'.join(edge_lines), names_line])


def main(argv=None):
    """Create the linkage-map page; ``argv`` defaults to ``sys.argv``."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit('Usage: make_AGA_app.py <output_folder> <category>')
    output_folder, category = argv[1], argv[2]

    # file names
    material_folder = join(output_folder, "AGA_folder")
    save_to = join(output_folder, 'AGAlinkage_map_{cat}.html'.format(cat=category))
    colors_fname = join(material_folder, 'colours.csv')
    connectivities_fname = join(material_folder, 'connectivities.csv')
    coordinates_fname = join(material_folder, 'coordinates.csv')

    # read data from files in csv format
    connectivities = pd.read_csv(connectivities_fname, index_col=0, header=0)
    coordinates = pd.read_csv(coordinates_fname, index_col=0, header=0)
    colors = _load_colours(colors_fname, list(connectivities.columns))

    data = _render_data(coordinates, connectivities, colors)

    # insert data in template and save the interactive page
    with open(TEMPLATE_FNAME, 'r') as template_fobj:
        template = template_fobj.read()
    with open(save_to, 'w') as save_fobj:
        save_fobj.write(template.replace(DATA_MARKER, data))


if __name__ == '__main__':
    main()

330
pipelines/11_plot_dr/plot_dr.R Executable file
View file

@ -0,0 +1,330 @@
# Parameters arrive on the command line as R assignments separated by ';',
# one per expected argument and in the order given in arguments.list.
# NOTE(review): every assignment is executed with eval(parse()), i.e. the
# command line is run as R code - only pass trusted input.

# commandArgs() yields one element per shell word; concatenate them without a
# separator and split the result into one assignment per ';'.
args = commandArgs(trailingOnly=TRUE)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))

# one line per expected argument: <name>.arg receives the raw assignment text
arguments.list = "
seurat.addr.arg = args[1]
plot.by.arg = args[2]
type.to.colours.arg = args[3]
runDiffusionMap.arg = args[4]
runAGA.arg = args[5]
overlay.data.arg = args[6]
overlay.data.ordered.arg = args[7]
"

# width and height handed to pdf() for the dimensionality-reduction plots
plotW = 8
plotH = 8

expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]

# wrong number of arguments: stop with a message naming the expected arguments
if(length(args) != length(expected_arguments)){
  expected_names = unlist(lapply(strsplit(expected_arguments, ".arg", fixed = TRUE), "[", 1))
  stop(sprintf("This pipeline requires %s parameters:\n%s",
               as.character(length(expected_arguments)),
               paste(expected_names, collapse = "\n")))
}

# bind the raw assignment strings (<name>.arg) ...
eval(parse(text = arguments.list))
# ... then execute each of them so that the actual variables get defined
for(argument in expected_arguments){
  argument = gsub(pattern=" ", replacement="", x=argument, fixed = TRUE)
  argument.name = unlist(strsplit(argument, "="))[1]
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# create required folders for output and work material
# folder name: 11_plot_dr_<seurat file>_<yymmddHHMMSS>
output_folder = paste("11_plot_dr", seurat.addr, sep = "_")
# format() gives the same stamp the gsub()/substr() chain used to derive, but
# does not depend on how as.character() prints a POSIXct (newer R versions add
# fractional seconds there)
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive: also create ../../output when it does not exist yet
dir.create(output_folder, recursive = TRUE)
# scratch folder for the files exchanged with the python helpers
output_folder_material = file.path(output_folder, "material")
dir.create(output_folder_material)
# from here on seurat.addr is the path of the input file, not just its name
seurat.addr = file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library(RColorBrewer)
library(plyr)
library(dplyr)
library(destiny)
#######################################################################################################
# read the Seurat object from disk
print("loading data ... ")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")

# raw data restricted to the genes (rows) and cells (columns) of the data slot
raw_data = seurat.obj@raw.data[rownames(seurat.obj@data), colnames(seurat.obj@data)]

# gene names and cell names are written to the material folder
gene_names = rownames(raw_data)
cell_names = colnames(raw_data)
write.csv(data.frame(Genes = gene_names), file.path(output_folder_material, "genenames.csv"))
write.csv(data.frame(Cells = cell_names), file.path(output_folder_material, "cellnames.csv"))

# current identity of every cell, written without row names
cell_labels = data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident)
write.csv(cell_labels, file.path(output_folder_material, "cell_labels.csv"), row.names = F)

# the diffusion map is computed by dm.py, which works on the material folder
if(runDiffusionMap){
  print("Writing .mtx file for diffusion map ...")
  writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx"))

  print("Running the diffusion map ... ")
  command = sprintf("%s ./dm.py %s", python.addr, output_folder_material)
  system(command, wait = T)

  # read back the result; the first column holds the row names
  dm = read.csv(file.path(output_folder_material, "dm.csv"), row.names = 1, header = F)
  #dm = DiffusionMap(seurat.obj@dr$pca@cell.embeddings, k = 100, density_norm=F, n_eigs = 20)
  #dm = data.frame(DC1 = dm$DC1, DC2 = dm$DC2, DC2 = dm$DC3)
}

# FDG plot limits: 1st and 99th percentile of each axis, scaled by 1.15
print("Computing FDG limits ...")
fdg.embedding = seurat.obj@dr$fdg@cell.embeddings
fdg.x = fdg.embedding[, 1]
fdg.y = fdg.embedding[, 2]
fdg.limits = 1.15 * c(quantile(fdg.x, c(.01, .99)), quantile(fdg.y, c(.01, .99)))
print("Making the plots ...")
for (index in 1:length(plot.by)){
caty = plot.by[index]
seurat.obj = SetAllIdent(object=seurat.obj, id=caty)
# Pick labels and colours for the current annotation: from the colour key file
# when one was given for this plot.by entry, otherwise a random palette.
if (!is.na(type.to.colours[index])){
  type.to.colour = read.csv(file.path("../../resources", type.to.colours[index]))
  # keep only the key entries whose cell type occurs in this annotation
  filter.key = type.to.colour$CellTypes %in% as.vector(unique(seurat.obj@ident))
  cell.labels = as.vector(type.to.colour$CellTypes[filter.key])
  cell.colours = as.vector(type.to.colour$Colours[filter.key])
  # report mismatches between the annotation and the colour key in both directions
  ident.levels = levels(factor(seurat.obj@ident))
  key.levels = levels(factor(type.to.colour$CellTypes))
  all_celltypes_missing <- ident.levels[!ident.levels %in% key.levels]
  all_colours_missing <- key.levels[!key.levels %in% ident.levels]
  if (length(all_celltypes_missing)>0){
    cat(all_celltypes_missing, "have not been found in your type.to.cols .csv \n", sep="\n")
  }
  if (length(all_colours_missing)>0){
    cat(all_colours_missing, "have not been found in your selected metadata column", sep="\n")
  }
  # only a match when neither side has leftovers (the old test checked
  # all_colours_missing twice and ignored all_celltypes_missing)
  if (length(all_celltypes_missing)==0 && length(all_colours_missing)==0){
    print("All colours and annotations match")
  }
}else{
  # no colour key: alphabetical labels with a shuffled palette interpolated
  # from the 12 "Paired" brewer colours
  cell.labels = sort(as.vector(unique(seurat.obj@ident)))
  cell.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(cell.labels)))
}
caty = gsub(pattern="\\.", replacement="_", caty)
# file paths for annotated graphs
tsne.file.name = file.path(output_folder, paste("tsne", paste(caty, "pdf", sep = "."), sep = "_"))
umap.file.name = file.path(output_folder, paste("umap", paste(caty, "pdf", sep = "."), sep = "_"))
fdg.file.name = file.path(output_folder, paste("fdg", paste(caty, "pdf", sep = "."), sep = "_"))
legend.file.name = file.path(output_folder, paste("legend", paste(caty, "pdf", sep = "."), sep = "_"))
AGA.file.name = file.path(output_folder, paste("AGA", paste(caty, "pdf", sep = "."), sep = "_"))
# file paths for unannotated plots
tsne.file.name_unlabeled = file.path(output_folder, paste("tsne_unlabeled", paste(caty, "pdf", sep = "."), sep = "_"))
umap.file.name_unlabeled = file.path(output_folder, paste("umap_unlabeled", paste(caty, "pdf", sep = "."), sep = "_"))
fdg.file.name_unlabeled = file.path(output_folder, paste("fdg_unlabeled", paste(caty, "pdf", sep = "."), sep = "_"))
# preparing data frame
if(!is.na(overlay.data)){
df = data.frame(CellType = as.vector(seurat.obj@ident),
tSNEx = seurat.obj@dr$tsne@cell.embeddings[, 1],
tSNEy = seurat.obj@dr$tsne@cell.embeddings[, 2],
UMAPx = seurat.obj@dr$umap@cell.embeddings[, 1],
UMAPy = seurat.obj@dr$umap@cell.embeddings[, 2],
FDGx = seurat.obj@dr$fdg@cell.embeddings[, 1],
FDGy = seurat.obj@dr$fdg@cell.embeddings[, 2],
overlay.data = seurat.obj@meta.data[overlay.data])
colnames(df)[8] <- "overlay.data"
}
else
{
df = data.frame(CellType = as.vector(seurat.obj@ident),
tSNEx = seurat.obj@dr$tsne@cell.embeddings[, 1],
tSNEy = seurat.obj@dr$tsne@cell.embeddings[, 2],
UMAPx = seurat.obj@dr$umap@cell.embeddings[, 1],
UMAPy = seurat.obj@dr$umap@cell.embeddings[, 2],
FDGx = seurat.obj@dr$fdg@cell.embeddings[, 1],
FDGy = seurat.obj@dr$fdg@cell.embeddings[, 2])
}
print("printing header of df made at beginning od r script")
print(head(df))
interactive_plot_df = data.frame(X = seurat.obj@dr$tsne@cell.embeddings[, 1],
Y = seurat.obj@dr$tsne@cell.embeddings[, 2])
interactive_plot_df$Labels = factor(seurat.obj@ident, levels = cell.labels)
interactive_plot_df$Colours = mapvalues(x = interactive_plot_df$Labels, from = cell.labels, to = cell.colours)
# make interartive tsne
interactive_tsne_filename = file.path(output_folder, paste(paste("Interactive_tSNE", caty, sep = "_"), "html", sep = "."))
make_2D_interactive_page(data_frame_2D=interactive_plot_df, tool_addr=tool_addr, python.addr=python.addr, save.to=interactive_tsne_filename)
# make interactive UMAP
interactive_plot_df$X = seurat.obj@dr$umap@cell.embeddings[, 1]
interactive_plot_df$Y = seurat.obj@dr$umap@cell.embeddings[, 2]
interactive_umap_filename = file.path(output_folder, paste(paste("Interactive_UMAP", caty, sep = "_"), "html", sep = "."))
make_2D_interactive_page(data_frame_2D=interactive_plot_df, tool_addr=tool_addr, python.addr=python.addr, save.to=interactive_umap_filename)
# make interactive FDG
interactive_plot_df$X = seurat.obj@dr$fdg@cell.embeddings[, 1]
interactive_plot_df$Y = seurat.obj@dr$fdg@cell.embeddings[, 2]
interactive_fdg_filename = file.path(output_folder, paste(paste("Interactive_FDG", caty, sep = "_"), "html", sep = "."))
make_2D_interactive_page(data_frame_2D=interactive_plot_df, tool_addr=tool_addr, python.addr=python.addr, save.to=interactive_fdg_filename)
n.cols = min(2, length(cell.labels))
n.rows = ceiling(length(cell.labels) / n.cols)
# making the plots
print("making the plots")
# annotated plots
print("making annotated plots")
plot.tsne = dr.plot(point.labels=df$CellType, dr1=df$tSNEx, dr2=df$tSNEy, dr1.name="tSNE-x", dr2.name="tSNE-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, use.labels=cell.labels)
plot.umap = dr.plot(point.labels=df$CellType, dr1=df$UMAPx, dr2=df$UMAPy, dr1.name="UMAP-x", dr2.name="UMAP-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, use.labels=cell.labels)
plot.fdg = dr.plot(point.labels=df$CellType, dr1=df$FDGx, dr2=df$FDGy, dr1.name="FDG-x", dr2.name="FDG-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, limits=fdg.limits, use.labels=cell.labels)
plot.legend = plot.indexed.legend(label.vector=cell.labels, color.vector=cell.colours, ncols=n.cols, left.limit=.2, symbol.size=10, text.size=6, padH=.6, padV=.6)
# unannotated plots
print("making un-annotated plots")
plot.tsne_unlabeled = dr.plot(point.labels=df$CellType, dr1=df$tSNEx, dr2=df$tSNEy, dr1.name="tSNE-x", dr2.name="tSNE-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, use.labels=cell.labels, annotate.plot = F)
plot.umap_unlabeled = dr.plot(point.labels=df$CellType, dr1=df$UMAPx, dr2=df$UMAPy, dr1.name="UMAP-x", dr2.name="UMAP-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, use.labels=cell.labels, annotate.plot = F)
plot.fdg_unlabeled = dr.plot(point.labels=df$CellType, dr1=df$FDGx, dr2=df$FDGy, dr1.name="FDG-x", dr2.name="FDG-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, limits=fdg.limits, use.labels=cell.labels, annotate.plot = F, pt.size=1)
# print the annotated plots
print("printing annotated plots")
pdf(tsne.file.name, width = plotW, height = plotH)
print(plot.tsne)
dev.off()
pdf(umap.file.name, width = plotW, height = plotH)
print(plot.umap)
dev.off()
pdf(fdg.file.name, width = plotW, height = plotH)
print(plot.fdg)
dev.off()
pdf(legend.file.name, width = 1.5 + .15 * n.cols * max(unlist(lapply(cell.labels, nchar))), height = .5 + n.rows * .35)
print(plot.legend)
dev.off()
# print the unannotated plots
print("printing un-annotated plots")
pdf(tsne.file.name_unlabeled, width = plotW, height = plotH)
print(plot.tsne_unlabeled)
dev.off()
pdf(umap.file.name_unlabeled, width = plotW, height = plotH)
print(plot.umap_unlabeled)
dev.off()
pdf(fdg.file.name_unlabeled, width = plotW, height = plotH)
print(plot.fdg_unlabeled)
dev.off()
# run diffusion map
if(runDiffusionMap){
df = as.data.frame(dm[, 1:3])
df$Labels = factor(seurat.obj@ident, levels = cell.labels)
df$Colours = mapvalues(x = df$Labels, from = cell.labels, to = cell.colours)
dm.file.name = file.path(output_folder_material, paste(paste("dm_data", caty, sep="_"), "csv", sep = "."))
write.csv(df, dm.file.name, row.names = F)
dm.file.name = file.path(output_folder, paste(paste("DiffusionMap_3D", caty, sep = "_"), "html", sep = "."))
make_3D_interactive_page(data_frame_3D=df, tool_addr=tool_addr, python.addr=python.addr, save.to=dm.file.name)
}
if(runAGA){
if(runDiffusionMap==F){
print("Writing .mtx file for AGA map ...")
writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx"))
}
print("running AGA ...")
AGA_folder = file.path(output_folder, "AGA_folder")
dir.create(AGA_folder)
# write cell labels to disk
write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)
# running AGA
command =file.path(tool_addr, "AGA/AGA_from_Seurat.py")
command = paste(paste(python.addr, command, sep = " "), output_folder, sep = " ")
command = paste(command, caty, sep =" ")
system(command, wait = T)
# read the AGA output
coordinates = read.csv(file.path(AGA_folder, "coordinates.csv"), row.names = 1)
connectivities = read.csv(file.path(AGA_folder, "connectivities.csv"), row.names = 1)
# plot AGA
coordinates = coordinates[cell.labels, ]
coordinates$Colours = cell.colours
label.order = match(cell.labels, rownames(connectivities))
connectivities = connectivities[label.order, label.order]
plot.obj = ggplot(data=coordinates, aes(x = X, y = Y))
plot.obj = plot.obj + theme_void() + theme(legend.position="none")
xi = c(); xf = c(); yi = c(); yf = c(); vs = c();
for(i in 1:dim(connectivities)[1]){
for(j in i:dim(connectivities)[2]){
v = connectivities[i, j]
if(v > 0){
xi = c(xi, coordinates$X[i])
xf = c(xf, coordinates$X[j])
yi = c(yi, coordinates$Y[i])
yf = c(yf, coordinates$Y[j])
vs = c(vs, v)
}
}
}
lineDF = data.frame(Xi = xi, Yi = yi, Xf = xf, Yf = yf, Vs = vs)
plot.obj = plot.obj + geom_segment(data = lineDF, aes(x = Xi, y = Yi, xend = Xf, yend = Yf), color = "black", size = 3 * lineDF$Vs)
plot.obj = plot.obj + geom_point(size = 2 * log(coordinates$Size), color = coordinates$Colours)
plot.obj = plot.obj + annotate("text", x=coordinates$X, y=coordinates$Y, label = 1:dim(coordinates)[1])
pdf(AGA.file.name, width = 10, height = 10)
print(plot.obj)
dev.off()
######## now make the interactive AGA app
#########################################
print("Making the AGA app ... ")
# save colours
colours.df = data.frame(CellTypes = cell.labels, Colours = cell.colours)
write.csv(colours.df, file.path(AGA_folder, "colours.csv"), row.names = F)
# run python to built the AGA app
command = sprintf("%s make_AGA_app.py %s %s", python.addr, output_folder, caty)
system(command, wait = T)
}
}
# remove the intermediate folders: the material folder always, the AGA work
# folder and the 'figures' folder only when AGA was run
unlink(output_folder_material, recursive=T, force=T)
if (runAGA){
  aga.leftovers = c(AGA_folder, file.path(output_folder, 'figures'))
  unlink(aga.leftovers, recursive=T, force=T)
}

print("Ended beautifully ... ")

16
pipelines/11_plot_dr/plot_dr.sh Executable file
View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N plot_dr
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=200G
# Job wrapper around plot_dr.R.
# NOTE(review): the "#$" lines above look like Grid Engine (qsub) directives -
# run from the submission directory, job name, export the environment, and
# run-time / memory requests; confirm against the cluster's scheduler docs.
#
# Usage: plot_dr.sh "<name = value; name = value; ...>"
# Exactly one argument is accepted; it is handed to the R script.
if [ "$#" -ne 1 ]; then
echo "Illegal number of parameters"
exit 1
fi
# $1 is deliberately left as written (unquoted): the shell word-splits it and
# plot_dr.R concatenates the pieces again without a separator before splitting
# on ';'. Quoting $1 would keep the whitespace and change what the R side
# parses, so verify before "fixing" this.
Rscript plot_dr.R $1
# Printed regardless of whether Rscript succeeded.
echo "End on `date`"

View file

@ -0,0 +1,289 @@
# Parameters arrive on the command line as R assignments separated by ';',
# one per expected argument and in the order given in arguments.list.
# NOTE(review): every assignment is executed with eval(parse()), i.e. the
# command line is run as R code - only pass trusted input.

# commandArgs() yields one element per shell word; concatenate them without a
# separator and split the result into one assignment per ';'.
args = commandArgs(trailingOnly=TRUE)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))

# one line per expected argument: <name>.arg receives the raw assignment text
arguments.list = "
seurat.addr.arg = args[1]
plot.by.arg = args[2]
type.to.colours.arg = args[3]
runDiffusionMap.arg = args[4]
runAGA.arg = args[5]
"

# width and height handed to pdf() for the dimensionality-reduction plots
plotW = 8
plotH = 8

expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]

# wrong number of arguments: stop with a message naming the expected arguments
if(length(args) != length(expected_arguments)){
  expected_names = unlist(lapply(strsplit(expected_arguments, ".arg", fixed = TRUE), "[", 1))
  stop(sprintf("This pipeline requires %s parameters:\n%s",
               as.character(length(expected_arguments)),
               paste(expected_names, collapse = "\n")))
}

# bind the raw assignment strings (<name>.arg) ...
eval(parse(text = arguments.list))
# ... then execute each of them so that the actual variables get defined
for(argument in expected_arguments){
  argument = gsub(pattern=" ", replacement="", x=argument, fixed = TRUE)
  argument.name = unlist(strsplit(argument, "="))[1]
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# create required folders for output and work material
# folder name: 11_plot_dr_<seurat file>_<yymmddHHMMSS>
output_folder = paste("11_plot_dr", seurat.addr, sep = "_")
# format() gives the same stamp the gsub()/substr() chain used to derive, but
# does not depend on how as.character() prints a POSIXct (newer R versions add
# fractional seconds there)
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive: also create ../../output when it does not exist yet
dir.create(output_folder, recursive = TRUE)
# scratch folder for the files exchanged with the python helpers
output_folder_material = file.path(output_folder, "material")
dir.create(output_folder_material)
# from here on seurat.addr is the path of the input file, not just its name
seurat.addr = file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library(RColorBrewer)
library(plyr)
library(dplyr)
library(destiny)
#######################################################################################################
# load data
print("loading data ... ")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")

# raw data restricted to the genes (rows) and cells (columns) of the data slot
raw_data = seurat.obj@raw.data
raw_data = raw_data[rownames(seurat.obj@data), colnames(seurat.obj@data)]

# save gene names
# (this file used to be skipped here although ./dm.py, started below, reads
# genenames.csv from the material folder)
gene_names = rownames(raw_data)
write.csv(data.frame(Genes = gene_names), file.path(output_folder_material, "genenames.csv"))

# save cell names
cell_names = colnames(raw_data)
write.csv(data.frame(Cells = cell_names), file.path(output_folder_material, "cellnames.csv"))

# write cell labels to disk
write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)

# run the diffusion map
if(runDiffusionMap){
  print("Writing .mtx file for diffusion map ...")
  writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx"))
  print("Running the diffusion map ... ")
  command = sprintf("%s ./dm.py %s", python.addr, output_folder_material)
  system(command, wait = T)
  # load dm from disk; the first column holds the row names
  dm = read.csv(file.path(output_folder_material, "dm.csv"), row.names = 1, header = F)
  #dm = DiffusionMap(seurat.obj@dr$pca@cell.embeddings, k = 100, density_norm=F, n_eigs = 20)
  #dm = data.frame(DC1 = dm$DC1, DC2 = dm$DC2, DC2 = dm$DC3)
}
print("Computing FDG limits ...")
fdg.x = seurat.obj@dr$fdg@cell.embeddings[, 1]
fdg.y = seurat.obj@dr$fdg@cell.embeddings[, 2]
fdg.limits = 1.15 * c(quantile(fdg.x, c(.01)), quantile(fdg.x, c(.99)), quantile(fdg.y, c(.01)), quantile(fdg.y, c(.99)))
print("Making the plots ...")
for (index in 1:length(plot.by)){
caty = plot.by[index]
seurat.obj = SetAllIdent(object=seurat.obj, id=caty)
if (!is.na(type.to.colours[index])){
type.to.colour = read.csv(file.path("../../resources", type.to.colours[index]))
filter.key = type.to.colour$CellTypes %in% as.vector(unique(seurat.obj@ident))
cell.labels = as.vector(type.to.colour$CellTypes[filter.key])
cell.colours = as.vector(type.to.colour$Colours[filter.key])
}else{
cell.labels = sort(as.vector(unique(seurat.obj@ident)))
cell.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(cell.labels)))
}
caty = gsub(pattern="\\.", replacement="_", caty)
# file paths for annotated graphs
tsne.file.name = file.path(output_folder, paste("tsne", paste(caty, "pdf", sep = "."), sep = "_"))
umap.file.name = file.path(output_folder, paste("umap", paste(caty, "pdf", sep = "."), sep = "_"))
fdg.file.name = file.path(output_folder, paste("fdg", paste(caty, "pdf", sep = "."), sep = "_"))
legend.file.name = file.path(output_folder, paste("legend", paste(caty, "pdf", sep = "."), sep = "_"))
AGA.file.name = file.path(output_folder, paste("AGA", paste(caty, "pdf", sep = "."), sep = "_"))
# file paths for unannotated plots
tsne.file.name_unlabeled = file.path(output_folder, paste("tsne_unlabeled", paste(caty, "pdf", sep = "."), sep = "_"))
umap.file.name_unlabeled = file.path(output_folder, paste("umap_unlabeled", paste(caty, "pdf", sep = "."), sep = "_"))
fdg.file.name_unlabeled = file.path(output_folder, paste("fdg_unlabeled", paste(caty, "pdf", sep = "."), sep = "_"))
# preparing data frame
df = data.frame(CellType = as.vector(seurat.obj@ident),
tSNEx = seurat.obj@dr$tsne@cell.embeddings[, 1],
tSNEy = seurat.obj@dr$tsne@cell.embeddings[, 2],
UMAPx = seurat.obj@dr$umap@cell.embeddings[, 1],
UMAPy = seurat.obj@dr$umap@cell.embeddings[, 2],
FDGx = seurat.obj@dr$fdg@cell.embeddings[, 1],
FDGy = seurat.obj@dr$fdg@cell.embeddings[, 2])
interactive_plot_df = data.frame(X = seurat.obj@dr$tsne@cell.embeddings[, 1],
Y = seurat.obj@dr$tsne@cell.embeddings[, 2])
interactive_plot_df$Labels = factor(seurat.obj@ident, levels = cell.labels)
interactive_plot_df$Colours = mapvalues(x = interactive_plot_df$Labels, from = cell.labels, to = cell.colours)
# make interartive tsne
interactive_tsne_filename = file.path(output_folder, paste(paste("Interactive_tSNE", caty, sep = "_"), "html", sep = "."))
make_2D_interactive_page(data_frame_2D=interactive_plot_df, tool_addr=tool_addr, python.addr=python.addr, save.to=interactive_tsne_filename)
# make interactive UMAP
interactive_plot_df$X = seurat.obj@dr$umap@cell.embeddings[, 1]
interactive_plot_df$Y = seurat.obj@dr$umap@cell.embeddings[, 2]
interactive_umap_filename = file.path(output_folder, paste(paste("Interactive_UMAP", caty, sep = "_"), "html", sep = "."))
make_2D_interactive_page(data_frame_2D=interactive_plot_df, tool_addr=tool_addr, python.addr=python.addr, save.to=interactive_umap_filename)
# make interactive FDG
interactive_plot_df$X = seurat.obj@dr$fdg@cell.embeddings[, 1]
interactive_plot_df$Y = seurat.obj@dr$fdg@cell.embeddings[, 2]
interactive_fdg_filename = file.path(output_folder, paste(paste("Interactive_FDG", caty, sep = "_"), "html", sep = "."))
make_2D_interactive_page(data_frame_2D=interactive_plot_df, tool_addr=tool_addr, python.addr=python.addr, save.to=interactive_fdg_filename)
n.cols = min(2, length(cell.labels))
n.rows = ceiling(length(cell.labels) / n.cols)
# making the plots
print("making the plots")
# annotated plots
plot.tsne = dr.plot.numerical(point.labels=df$CellType, dr1=df$tSNEx, dr2=df$tSNEy, dr1.name="tSNE-x", dr2.name="tSNE-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, use.labels=cell.labels)
plot.umap = dr.plot.numerical(point.labels=df$CellType, dr1=df$UMAPx, dr2=df$UMAPy, dr1.name="UMAP-x", dr2.name="UMAP-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, use.labels=cell.labels)
plot.fdg = dr.plot.numerical(point.labels=df$CellType, dr1=df$FDGx, dr2=df$FDGy, dr1.name="FDG-x", dr2.name="FDG-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, limits=fdg.limits, use.labels=cell.labels)
#plot.legend = plot.indexed.legend(label.vector=cell.labels, color.vector=cell.colours, ncols=n.cols, left.limit=.2, symbol.size=10, text.size=6, padH=.6, padV=.6)
# unannotated plots
plot.tsne_unlabeled = dr.plot.numerical(point.labels=df$CellType, dr1=df$tSNEx, dr2=df$tSNEy, dr1.name="tSNE-x", dr2.name="tSNE-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, use.labels=cell.labels, annotate.plot = F)
plot.umap_unlabeled = dr.plot.numerical(point.labels=df$CellType, dr1=df$UMAPx, dr2=df$UMAPy, dr1.name="UMAP-x", dr2.name="UMAP-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, use.labels=cell.labels, annotate.plot = F)
plot.fdg_unlabeled = dr.plot.numerical(point.labels=df$CellType, dr1=df$FDGx, dr2=df$FDGy, dr1.name="FDG-x", dr2.name="FDG-y", no.legend=T, plt.lb.sz=7, txt.lb.size=4, use.cols=cell.colours, limits=fdg.limits, use.labels=cell.labels, annotate.plot = F)
# print the annotated plots
pdf(tsne.file.name, width = plotW, height = plotH)
print(plot.tsne)
dev.off()
pdf(umap.file.name, width = plotW, height = plotH)
print(plot.umap)
dev.off()
pdf(fdg.file.name, width = plotW, height = plotH)
print(plot.fdg)
dev.off()
#pdf(legend.file.name, width = 1.5 + .15 * n.cols * max(unlist(lapply(cell.labels, nchar))), height = .5 + n.rows * .35)
#print(plot.legend)
#dev.off()
# print the unannotated plots
pdf(tsne.file.name_unlabeled, width = plotW, height = plotH)
print(plot.tsne_unlabeled)
dev.off()
pdf(umap.file.name_unlabeled, width = plotW, height = plotH)
print(plot.umap_unlabeled)
dev.off()
pdf(fdg.file.name_unlabeled, width = plotW, height = plotH)
print(plot.fdg_unlabeled)
dev.off()
# export the diffusion map (only when runDiffusionMap is set)
# NOTE(review): `dm` is not computed in this block; presumably it holds the diffusion-map coordinates produced earlier in the script - confirm
if(runDiffusionMap){
# the first three columns of dm are used as the 3D coordinates
df = as.data.frame(dm[, 1:3])
# attach each cell's identity (levels ordered as cell.labels) and the colour mapped to that label
df$Labels = factor(seurat.obj@ident, levels = cell.labels)
df$Colours = mapvalues(x = df$Labels, from = cell.labels, to = cell.colours)
# save the table as csv in the material folder
dm.file.name = file.path(output_folder_material, paste(paste("dm_data", caty, sep="_"), "csv", sep = "."))
write.csv(df, dm.file.name, row.names = F)
# dm.file.name is reused: it now points to the interactive html page in the output folder
dm.file.name = file.path(output_folder, paste(paste("DiffusionMap_3D", caty, sep = "_"), "html", sep = "."))
make_3D_interactive_page(data_frame_3D=df, tool_addr=tool_addr, python.addr=python.addr, save.to=dm.file.name)
}
# AGA step (only when runAGA is set): run the external AGA python script, plot its graph as a pdf and build the interactive AGA app
if(runAGA){
# raw_data.mtx is only written here when the diffusion map step was not requested
# NOTE(review): presumably the diffusion map branch earlier in the script already writes this file - confirm
if(runDiffusionMap==F){
print("Writing .mtx file for AGA map ...")
writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx"))
}
print("running AGA ...")
AGA_folder = file.path(output_folder, "AGA_folder")
dir.create(AGA_folder)
# write cell labels to disk
write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)
# running AGA: the command line is "<python.addr> <tool_addr>/AGA/AGA_from_Seurat.py <output_folder> <caty>"
command =file.path(tool_addr, "AGA/AGA_from_Seurat.py")
command = paste(paste(python.addr, command, sep = " "), output_folder, sep = " ")
command = paste(command, caty, sep =" ")
system(command, wait = T)
# read the AGA output (both files are expected inside AGA_folder once the python script has finished)
coordinates = read.csv(file.path(AGA_folder, "coordinates.csv"), row.names = 1)
connectivities = read.csv(file.path(AGA_folder, "connectivities.csv"), row.names = 1)
# plot AGA
# reorder rows (and rows/columns of the connectivity table) to follow cell.labels so they line up with cell.colours
coordinates = coordinates[cell.labels, ]
coordinates$Colours = cell.colours
label.order = match(cell.labels, rownames(connectivities))
connectivities = connectivities[label.order, label.order]
plot.obj = ggplot(data=coordinates, aes(x = X, y = Y))
plot.obj = plot.obj + theme_void() + theme(legend.position="none")
# collect one segment per positive connectivity value; j starts at i, so only the diagonal and upper triangle are visited
xi = c(); xf = c(); yi = c(); yf = c(); vs = c();
for(i in 1:dim(connectivities)[1]){
for(j in i:dim(connectivities)[2]){
v = connectivities[i, j]
if(v > 0){
xi = c(xi, coordinates$X[i])
xf = c(xf, coordinates$X[j])
yi = c(yi, coordinates$Y[i])
yf = c(yf, coordinates$Y[j])
vs = c(vs, v)
}
}
}
lineDF = data.frame(Xi = xi, Yi = yi, Xf = xf, Yf = yf, Vs = vs)
# edge thickness is 3 x connectivity value; node size is 2 x log of the Size column read from coordinates.csv
plot.obj = plot.obj + geom_segment(data = lineDF, aes(x = Xi, y = Yi, xend = Xf, yend = Yf), color = "black", size = 3 * lineDF$Vs)
plot.obj = plot.obj + geom_point(size = 2 * log(coordinates$Size), color = coordinates$Colours)
# nodes are labelled with their row index (1..n in cell.labels order), not with the cell type name
plot.obj = plot.obj + annotate("text", x=coordinates$X, y=coordinates$Y, label = 1:dim(coordinates)[1])
pdf(AGA.file.name, width = 10, height = 10)
print(plot.obj)
dev.off()
######## now make the interactive AGA app
#########################################
print("Making the AGA app ... ")
# save colours
colours.df = data.frame(CellTypes = cell.labels, Colours = cell.colours)
write.csv(colours.df, file.path(AGA_folder, "colours.csv"), row.names = F)
# run python to build the AGA app (make_AGA_app.py is given as a relative path, so it is looked up from the current working directory)
command = sprintf("%s make_AGA_app.py %s %s", python.addr, output_folder, caty)
system(command, wait = T)
}
}
# cleaning garbage folders: the intermediate material folder is always deleted
unlink(output_folder_material, recursive=T, force=T)
# AGA_folder is only assigned inside the runAGA branch, so it is only removed when that branch ran;
# the 'figures' sub-folder of the output folder is removed as well
if (runAGA){
unlink(AGA_folder, recursive=T, force=T)
unlink(file.path(output_folder, 'figures'), recursive=T, force=T)
}
print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N plot_dr
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=200G

# Job wrapper around plot_dr_numerical.R.
# The '#$' lines above are presumably Grid Engine scheduler directives
# (job name, run-time and memory limits) - confirm against the cluster setup.
# Usage: <this script> "<argument string passed on to the R script>"

# exactly one argument is accepted
if [[ $# -ne 1 ]]; then
    echo "Illegal number of parameters"
    exit 1
fi

# $1 is left unquoted, as before, so the shell still word-splits it
# before handing the pieces to Rscript
Rscript plot_dr_numerical.R $1

echo "End on $(date)"

View file

@ -0,0 +1,215 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Interactive linkage plot</title>
<meta name="description" content="An interactive plot for the linkage map">
<meta name="author" content="Dorin-Mirel Popescu">
</head>
<body>
<ul>
<li>Bubble size reflects population size; Edge thickness reflects connectivity scores;</li>
<li>Use the sliders to set plotting parameters;</li>
<li>Click the canvas area to select a cell population and reposition it by dragging;</li>
<li>Plot can be saved by right click on canvas area and choose 'Save as'; For higher image resolution increase canvas area, font size and scales before saving;</li>
</ul>
<!-- slider controls: a header row with the captions, then one row holding the matching range inputs -->
<table>
<tr>
<td>Canvas width</td><td>Canvas height</td><td>Size scale</td><td>Edge scale</td><td>Edge threshold</td><td>Font size</td>
</tr>
<tr>
<td><input type = 'range' min = '100' max = '3000' value = '500' onchange = 'setWidth(this.value)' /></td>
<td><input type = 'range' min = '100' max = '3000' value = '500' onchange = 'setHeight(this.value)' /></td>
<td><input type = 'range' min = '0' max = '300' value = '10' onchange = 'setSizeScale(this.value)' /></td>
<td><input type = 'range' min = '.1' max = '15' step = '.1' value = '5' onchange = 'setEdgeScale(this.value)' /></td>
<td><input type = 'range' min = '0' max = '1' step = '.001' value = '0' onchange = 'setEdgeThreshold(this.value)'/></td>
<td><input type = 'range' min = '5' max = '80' value = '10' step = '1' onchange = 'setFontSize(this.value)' /></td>
</tr>
</table>
<canvas id = 'canvas' width = '500' height = '500'></canvas>
<script type = 'text/javascript'>
// global parameters: state shared by the slider callbacks, draw() and the mouse handlers
var canvas = document.getElementById('canvas'),
canvasW = 500, // same as the initial width attribute of the canvas element
canvasH = 500, // same as the initial height attribute of the canvas element
sizeScale = .1, // initial 'Size scale' slider value (10) divided by 100, see setSizeScale
edgeScale = 5, // multiplier applied to edge values to get the line width
edgeT = 0, // edges with a value not greater than this threshold are not drawn
fontSize = 10, // label font size in px
context = canvas.getContext('2d'),
mouseX = 0, // canvas position of the last mousedown
mouseY = 0,
currentX = 0, // canvas position of the pointer during a drag
currentY = 0,
selectedX = 0, // data coordinates of the selected population when the drag started
selectedY = 0,
selectedPopulation = null; // key into data_coordinates of the bubble being dragged, or null
// data placeholders
// NOTE(review): the '// insert data here' marker below is presumably where the generating script injects the real data (cell_names, used by draw(), is not defined in this template) - confirm
var data_coordinates = [], // for each cell name include x coordinate, y coordinate, and radius values (draw() also reads a fourth entry as the fill colour)
data_edges = [], // for each cell name include and array of edge values
data_composition = []; // for each cell name include 8 numbers (first 4 for male gender, last 4 for female gender); not read by the code visible in this page
// insert data here
/**
 * Slider callback: resize the canvas horizontally and redraw.
 * @param {string} value - slider value, the new canvas width in pixels
 */
function setWidth(value){
    var requestedWidth = parseFloat(value);
    canvasW = requestedWidth;
    canvas.width = requestedWidth;
    // the drawing context is fetched again after the resize, as before
    context = canvas.getContext('2d');
    draw();
}
/**
 * Slider callback: resize the canvas vertically and redraw.
 * @param {string} value - slider value, the new canvas height in pixels
 */
function setHeight(value){
    var requestedHeight = parseFloat(value);
    canvasH = requestedHeight;
    canvas.height = requestedHeight;
    // the drawing context is fetched again after the resize, as before
    context = canvas.getContext('2d');
    draw();
}
/**
 * Slider callback: set the bubble size scale and redraw.
 * @param {string} value - slider value; it is divided by 100 before being stored in sizeScale
 */
function setSizeScale(value){
    var sliderValue = parseFloat(value);
    sizeScale = sliderValue / 100;
    draw();
}
/**
 * Slider callback: set the factor that turns an edge value into a line width, then redraw.
 * @param {string} value - slider value, stored as a number in edgeScale
 */
function setEdgeScale(value){
    var factor = parseFloat(value);
    edgeScale = factor;
    draw();
}
/**
 * Slider callback: set the edge threshold and redraw.
 * Only edges whose value is greater than the threshold are drawn.
 * @param {string} value - slider value, stored as a number in edgeT
 */
function setEdgeThreshold(value){
    var threshold = parseFloat(value);
    edgeT = threshold;
    draw();
}
/**
 * Slider callback: set the font size (px) of the cell name labels and redraw.
 * @param {string} value - slider value, parsed as a base-10 integer
 */
function setFontSize(value){
    // explicit radix so the value is always read as decimal
    fontSize = parseInt(value, 10);
    draw();
}
/**
 * Redraw the whole plot from the global state.
 *
 * Pass 1 strokes the edges, pass 2 paints the bubbles and their labels, so
 * bubbles and text always end up on top of the lines.
 * Reads data_coordinates[name] as [x, y, size, colour], where x and y are
 * multiplied by the canvas size (y is flipped), and data_edges[name] as an
 * array of edge values whose i-th entry refers to cell_names[i].
 */
function draw(){
    // all loop state is local; nothing leaks into the global scope
    var key, bubble, centreX, centreY, radius;

    // clear canvas by drawing a rectangle
    context.fillStyle = '#efefef';
    context.fillRect(0, 0, canvas.width, canvas.height);

    // pass 1: for every population draw its edges that exceed the edge threshold
    context.strokeStyle = '#888888';
    for (key in data_coordinates){
        bubble = data_coordinates[key];
        centreX = canvasW * bubble[0];
        centreY = canvasH * (1 - bubble[1]);
        data_edges[key].forEach(function(edgeVal, i){
            if (edgeVal > edgeT){
                var target = data_coordinates[cell_names[i]],
                    endX = canvasW * target[0],
                    endY = canvasH * (1 - target[1]);
                context.lineWidth = edgeVal * edgeScale;
                context.beginPath();
                context.moveTo(centreX, centreY);
                context.lineTo(endX, endY);
                context.stroke();
            }
        });
    }

    // pass 2: for every population draw the bubble and write the cell name above it
    for (key in data_coordinates){
        bubble = data_coordinates[key];
        centreX = canvasW * bubble[0];
        centreY = canvasH * (1 - bubble[1]);
        // the scaled size is treated as an area, hence the square root for the radius
        radius = Math.sqrt(sizeScale * bubble[2]);
        // bubble
        context.fillStyle = bubble[3];
        context.beginPath();
        context.arc(centreX, centreY, radius, 0, 2 * Math.PI, false);
        context.fill();
        // cell name
        context.fillStyle = 'black';
        context.font = parseInt(fontSize, 10) + 'px arial';
        context.textAlign = 'center';
        // 'alphabetic' is the valid spelling; the previous 'Alphabetical' was
        // ignored by the canvas, which left this same default in effect
        context.textBaseline = 'alphabetic';
        context.fillText(key, centreX, centreY - radius - 2);
    }
}
/**
 * Convert a mouse event into coordinates relative to the canvas.
 * @param {MouseEvent} event - event carrying clientX / clientY
 * @returns {number[]} [x, y] offset of the pointer from the canvas's top-left corner
 */
function getEventCoordinates(event){
    // left/top are available on every bounding rectangle; x/y are missing in older browsers
    var canvasRect = canvas.getBoundingClientRect();
    return [event.clientX - canvasRect.left, event.clientY - canvasRect.top];
}
/**
 * mouseup handler: finish dragging the selected population.
 * The bubble is first drawn at the drop position, then both drag listeners
 * (mousemove -> dragDataPoint, mouseup -> stopDraging) are detached from the canvas.
 */
function stopDraging(event){
    dragDataPoint(event);
    [['mousemove', dragDataPoint], ['mouseup', stopDraging]].forEach(function(binding){
        canvas.removeEventListener(binding[0], binding[1]);
    });
}
/**
 * mousemove handler: move the selected bubble along with the cursor.
 * The pointer displacement since mousedown is converted into a fraction of
 * the canvas size and applied to the coordinates the bubble had when the drag
 * started (y is added because the data y axis is flipped on the canvas).
 */
function dragDataPoint(event){
    var pointer = getEventCoordinates(event),
        selected = data_coordinates[selectedPopulation];
    currentX = pointer[0];
    currentY = pointer[1];
    // reset coordinates of the selected data point
    selected[0] = selectedX - (mouseX - currentX) / canvasW;
    selected[1] = selectedY + (mouseY - currentY) / canvasH;
    // then draw
    draw();
}
// Draw the canvas and register the mousedown handler only once the entire document is loaded.
// On mousedown the handler checks every bubble for a hit; when one is hit, the drag listeners
// (mousemove / mouseup) are attached, otherwise the selection is cleared.
window.onload = function(){
    draw();
    canvas.addEventListener('mousedown', function(event){
        // all loop state is local; nothing leaks into the global scope
        var pointer = getEventCoordinates(event),
            hit = false,
            key, bubble, bubbleX, bubbleY, bubbleR, dx, dy, distance;
        mouseX = pointer[0];
        mouseY = pointer[1];
        // loop through all the data points and check for a hit; the loop is not cut short,
        // so when bubbles overlap the last one in iteration order is selected
        for (key in data_coordinates){
            bubble = data_coordinates[key];
            bubbleX = canvasW * bubble[0];
            bubbleY = canvasH * (1 - bubble[1]);
            bubbleR = Math.sqrt(sizeScale * bubble[2]);
            dx = mouseX - bubbleX;
            dy = mouseY - bubbleY;
            distance = Math.sqrt(Math.pow(dx, 2) + Math.pow(dy, 2));
            if (distance < bubbleR){
                hit = true;
                selectedPopulation = key;
                // remember where the bubble was when the drag started
                selectedX = bubble[0];
                selectedY = bubble[1];
            }
        }
        if (hit){
            canvas.addEventListener('mousemove', dragDataPoint);
            canvas.addEventListener('mouseup', stopDraging);
        }else{
            selectedPopulation = null;
        }
    });
}
</script>
</body>
</html>

View file

@ -0,0 +1,17 @@
precision recall f1-score support
A 0.82 0.43 0.56 21
B 0.78 0.88 0.82 16
C 0.48 1.00 0.65 10
D 1.00 1.00 1.00 12
E 0.53 0.67 0.59 15
F 0.93 1.00 0.96 13
G 0.83 0.91 0.87 11
H 0.60 0.90 0.72 10
I 1.00 1.00 1.00 19
J 0.94 0.94 0.94 16
K 1.00 0.86 0.92 14
L 1.00 0.89 0.94 19
M 0.56 0.26 0.36 19
avg / total 0.82 0.79 0.79 195

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,226 @@
# ---- command-line argument handling ----
# All trailing command-line arguments are glued together and split on ';'.
# Each piece must be an R assignment that creates the variable named in
# arguments.list without the '.arg' suffix (seurat.addr, feature_genes, ...).
# NOTE(review): the pieces are run through eval(parse()), i.e. the command line
# is executed as R code - only start this script with trusted input.
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))
arguments.list = "
seurat.addr.arg = args[1]
feature_genes.arg = args[2]
cell.types.arg = args[3]
save.to.dir.arg = args[4]
ident.set.arg = args[5]
type.to.colours.arg = args[6]
"
expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]
if(length(args) != length(expected_arguments)){
  # report how many parameters are needed and list their names
  # (previously sprintf() was called without a value for %s, so this message was never produced)
  error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments)))
  expected.names = paste(sub("\\.arg\\s*=.*$", "", expected_arguments), collapse = "\n")
  stop(paste(error.msg, expected.names, sep = ":\n"))
}
# bind every '<name>.arg' variable to its piece of the command line
eval(parse(text = arguments.list))
for(n in seq_along(expected_arguments)){
  argument = expected_arguments[n]
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  # strip only the literal '.arg' suffix
  variable.name = sub("\\.arg$", "", argument.name)
  # evaluate the user-supplied assignment, then make sure it created the expected variable
  argument.content = eval(parse(text = argument.name))
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
library(Seurat)
library(plyr)
library(dplyr)
library(reshape)
library(ggplot2)
library(RColorBrewer)
source("../../tools/bunddle_utils.R")
# resolve the input locations relative to the pipeline folder
seurat.addr = file.path("../../data", seurat.addr)
cell.types = file.path('../../resources', cell.types)
feature_genes = file.path('../../resources', feature_genes)
# keep an NA type.to.colours as NA: file.path() would turn it into the string
# "../../resources/NA", and the is.na(type.to.colours) test further down, which
# selects the random-colour fallback, could then never be TRUE
if (!is.na(type.to.colours)){
  type.to.colours = file.path("../../resources", type.to.colours)
}
# both resource files are plain text with one entry per line;
# from here on cell.types and feature_genes hold the entries, not the paths
cell.types = readLines(cell.types)
feature_genes = readLines(feature_genes)
################################################################################################################
# Plot an indexed legend: one coloured point per label, numbered 1..n, with the
# label text written to the right of the point. Entries are laid out column by
# column in `ncols` columns.
#   label.vector  labels, one per legend entry
#   color.vector  colours, same length as label.vector
#   ncols         number of legend columns; must not exceed the number of labels
#   left.limit    accepted for compatibility; not used by the function body
#   symbol.size   size of the coloured points
#   text.size     size of the index and label text
#   padH, padV, padRight  padding added to the x / y axis limits
# Returns the ggplot object. Stops when the label and colour counts differ or
# when more columns than labels are requested.
plot.indexed.legend = function(label.vector, color.vector, ncols = 2, left.limit = 3.4, symbol.size = 8, text.size = 10, padH = 1, padV = 1, padRight = 0){
  if (length(label.vector) != length(color.vector)){
    stop("number of labels is different from number colors\nAdvice: learn to count!")
  }
  # compare the requested column count itself; the previous test used
  # length(ncol), i.e. the length of the base function `ncol`, and never fired
  if (ncols > length(label.vector)){
    stop("You cannot have more columns than labels\nSolution: Learn to count")
  }
  label.no = length(label.vector)
  indices.vector = seq_len(label.no)
  nrows = ceiling(label.no / ncols)
  # grid position of every entry: columns are filled top to bottom, left to right
  legend.frame = data.frame(X = rep(0, label.no), Y = rep(0, label.no), CS = color.vector, Txt = label.vector)
  legend.frame$X = rep(1:ncols, each=nrows)[1:nrow(legend.frame)]
  legend.frame$Y = rep(nrows:1, times = ncols)[1:nrow(legend.frame)]
  Xrange = range(legend.frame$X)
  Yrange = range(legend.frame$Y)
  plot.obj = ggplot(data = legend.frame, aes(x = X, y = Y))
  plot.obj = plot.obj + geom_point(size = symbol.size, colour = color.vector)
  plot.obj = plot.obj + scale_x_continuous(limits = c(Xrange[1] - padRight, Xrange[2] + padH))
  plot.obj = plot.obj + scale_y_continuous(limits = c(Yrange[1] - padV, Yrange[2] + padV))
  plot.obj = plot.obj + theme_void()
  # index number on top of each point, label text slightly to its right
  plot.obj = plot.obj + annotate("text", x=legend.frame$X, y = legend.frame$Y, label = indices.vector, size = text.size)
  plot.obj = plot.obj + annotate("text", x=legend.frame$X+.1, y = legend.frame$Y, label=legend.frame$Txt, hjust = 0, size = text.size)
  return(plot.obj)
}
# Scatter plot of a 2D embedding, coloured by population. Each population can
# additionally be marked at the median position of its cells with a round,
# semi-transparent marker carrying the population's index.
#   point.labels        population label of every cell
#   dr1, dr2            the two embedding coordinates of every cell
#   dr1.name, dr2.name  axis titles
#   no.legend           hide the colour legend when TRUE
#   plt.lb.sz           size of the round index markers
#   txt.lb.size         size of the index text
#   pt.size             size of the cell points
#   random_state        seed used when colours are generated
#   use.cols            colours to use (NULL: shuffled "Paired" palette)
#   use.labels          label order to use (NULL: sorted unique labels)
#   limits              optional c(x0, x1, y0, y1) axis limits
#   annotate.plot       draw the index markers when TRUE
# Returns the ggplot object.
dr.plot = function(point.labels, dr1, dr2, dr1.name, dr2.name, no.legend = F, plt.lb.sz = 5, txt.lb.size = 3, pt.size = .2, random_state = 2, use.cols = NULL, use.labels = NULL, limits = NULL, annotate.plot = T){
  # the "Cell Labels" column is reachable as Cell.Labels after data.frame() fixes the name
  embedding = data.frame("Cell Labels" = point.labels, DR1 = dr1, DR2 = dr2)
  label.levels = if (is.null(use.labels)) sort(unique(as.vector(point.labels))) else use.labels
  embedding$Cell.Labels = factor(embedding$Cell.Labels, levels=label.levels)
  # median position per population, computed before the labels get their index prefix
  label.centres = aggregate(embedding[, 2:3], list(embedding$Cell.Labels), median)
  indexed.labels = paste(1:length(label.levels), label.levels, sep = " ")
  embedding$Cell.Labels = mapvalues(x = embedding$Cell.Labels, from = label.levels, to = indexed.labels)
  if (!is.null(use.cols)){
    point.colours = use.cols
  } else {
    set.seed(random_state)
    point.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(label.levels)))
  }
  label.indices = 1:length(label.levels)
  plot.obj = ggplot(data = embedding, aes(x = DR1, y = DR2, color = Cell.Labels)) +
    geom_point(size = pt.size) +
    scale_color_manual(values=point.colours)
  if (annotate.plot){
    plot.obj = plot.obj +
      geom_point(data=label.centres,aes(x = DR1, y = DR2), colour = "gray", size = plt.lb.sz, fill = point.colours, alpha = .5, pch = 21) +
      annotate("text", x=label.centres$DR1, y = label.centres$DR2, label = label.indices, size = txt.lb.size)
  }
  legend.layer = if (no.legend) theme(legend.position="none") else guides(color = guide_legend(override.aes = list(size=5)))
  plot.obj = plot.obj + legend.layer
  plot.obj = plot.obj + xlab(dr1.name) + ylab(dr2.name)
  if (!is.null(limits)){
    plot.obj = plot.obj +
      scale_x_continuous(limits = c(limits[1], limits[2])) +
      scale_y_continuous(limits = c(limits[3], limits[4]))
  }
  return(plot.obj)
}
################################################################################################################
# load seurat object
print("loading data ...")
seurat.obj = readRDS(seurat.addr)
print("Loaded data")
# set the clustering identity
seurat.obj = SetAllIdent(object=seurat.obj, id = ident.set)
# subset data on cell types
seurat.obj = SubsetData(object=seurat.obj, ident.use=cell.types)
# select on singlets: switch the identity to the "doublets" annotation, keep the cells
# marked "Singlet", then switch back to the requested identity
seurat.obj = SetAllIdent(object=seurat.obj, id = "doublets")
seurat.obj = SubsetData(object=seurat.obj, ident.use=c("Singlet"))
seurat.obj = SetAllIdent(object=seurat.obj, id = ident.set)
# normalize data
print("Normalizing data ...")
seurat.obj = NormalizeData(object = seurat.obj, normalization.method = "LogNormalize", scale.factor = 10000)
# check that all genes are in the dataset
# (missing genes are only printed; the script carries on without them)
print("check that genes are in the dataset")
if(!(all(feature_genes %in% rownames(seurat.obj@data)))){
not.found = feature_genes[!(feature_genes %in% rownames(seurat.obj@data))]
print(not.found)
}
# check for duplicates: report them, then keep each gene once
print("check for duplicates")
if(length(feature_genes) != length(unique(feature_genes))){
duplicates = names(table(feature_genes)[table(feature_genes) > 1])
duplicates = paste(duplicates, collapse = ", ")
print(sprintf("Duplicates found: %s", duplicates))
print("This will not affect the workflow, but be aware the heat map will have a smaller genes than expected.")
feature_genes = unique(feature_genes)
}
# create folder for saving the results
print("creating folders")
dir.create(save.to.dir)
# create folder to save working material (any left-over material folder is deleted first)
material_folder = file.path(save.to.dir, "material")
unlink(material_folder, recursive=T, force=T)
dir.create(material_folder)
# subsetting seurat object so that we do not get a 'problem too large' error
# NOTE(review): the identity is switched to the fixed "cell.labels" annotation here rather than ident.set - confirm this is intended
seurat.obj = SetAllIdent(seurat.obj, id="cell.labels")
seurat.obj = SubsetData(seurat.obj, max.cells.per.ident = 1000)
# write the cluster labels to disk
# colours come from the type.to.colours table (columns CellTypes and Colours, restricted to the
# populations present in the data) or, when type.to.colours is NA, from a shuffled "Paired" palette
if (!is.na(type.to.colours)){
type.to.colour = read.csv(type.to.colours)
filter.key = type.to.colour$CellTypes %in% as.vector(unique(seurat.obj@ident))
cell.labels = as.vector(type.to.colour$CellTypes[filter.key])
cell.colours = as.vector(type.to.colour$Colours[filter.key])
}else{
cell.labels = sort(as.vector(unique(seurat.obj@ident)))
cell.colours = sample(colorRampPalette(brewer.pal(12, "Paired"))(length(cell.labels)))
}
# one row per cell: its label and the colour mapped to that label
labels = data.frame(Labels = as.vector(seurat.obj@ident))
labels$Colours = mapvalues(x=labels$Labels, from=cell.labels, to=cell.colours)
write.csv(labels, file.path(material_folder, "labels.csv"), row.names = F)
# write the feature data to disk
print("writing data.csv")
# Keep only the rows of the requested feature genes, so the classifier is
# trained on exactly those genes. Previously the subset was computed but the
# complete expression matrix was written, so feature_genes had no effect.
# Subsetting before as.matrix() also avoids densifying the whole dataset.
feature.rows = rownames(seurat.obj@data) %in% feature_genes
feature_matrix <- as.matrix(seurat.obj@data[feature.rows, , drop = FALSE])
# cells as rows, genes as columns
x.data <- as.data.frame(t(feature_matrix))
write.csv(x.data, file.path(material_folder, "data.csv"), row.names = T)
# run the random forest classifier and get the confusion matrix
# NOTE(review): python.addr is not assigned anywhere in the visible part of this script; presumably it comes from the sourced bunddle_utils.R - confirm
# random_forest_classifier.py is given as a relative path, so it is looked up from the current working directory
print("running random forest classifier")
command = paste(python.addr, sprintf("random_forest_classifier.py %s", save.to.dir), sep = " ")
system(command, wait = T)
print("plot the confusion matrix")
# the python script writes confusion_matrix.csv with a leading index column, which is dropped here
cnf_matrix = read.csv(file.path(material_folder, "confusion_matrix.csv"), check.names = F)
cnf_matrix = cnf_matrix[, -c(1)]
# long-format frame with one row per cell of the matrix (Actual varies fastest)
confusion = expand.grid(Actual = colnames(cnf_matrix), Predicted = colnames(cnf_matrix))
# divide every column by its column sum; sums of 0 are replaced by 1 to avoid dividing by zero
cnf_matrix_colSums = colSums(cnf_matrix)
cnf_matrix_colSums[cnf_matrix_colSums == 0] = 1.0
cnf_matrix_colSums_matrix = matrix(ncol = length(cnf_matrix_colSums), nrow = length(cnf_matrix_colSums))
cnf_matrix_colSums_matrix[] = cnf_matrix_colSums
cnf_matrix = cnf_matrix / t(cnf_matrix_colSums_matrix)
# flatten the normalised table column by column into the Frequency column
confusion$Frequency = rapply(cnf_matrix, c)
# order both axes by cell.labels (Predicted reversed)
confusion$Actual = factor(as.vector(confusion$Actual), levels = cell.labels)
confusion$Predicted = factor(as.vector(confusion$Predicted), levels = rev(cell.labels))
confusion.plot = ggplot(data = confusion, aes(x = Actual, y = Predicted)) + geom_tile(aes(fill = Frequency)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_gradient(low = "lightblue", high = "darkred")
pdf(file.path(save.to.dir, "confusion_matrix.pdf"), width = 14, height = 14)
print(confusion.plot)
dev.off()
print("Ended beautifully.")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N gene_discriminatory_power_analysis
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=400G

# Job wrapper around gene_discriminatory_power_analysis.R.
# The '#$' lines above are presumably Grid Engine scheduler directives
# (job name, run-time and memory limits) - confirm against the cluster setup.
# Usage: <this script> "<';'-separated argument string for the R script>"

# exactly one argument is accepted
if [[ $# -ne 1 ]]; then
    echo "Illegal number of parameters"
    exit 1
fi

# $1 is left unquoted, as before, so the shell still word-splits it;
# the R script concatenates all the arguments it receives before parsing them
Rscript gene_discriminatory_power_analysis.R $1

echo "End on $(date)"

View file

@ -0,0 +1,28 @@
Kupffer Cell
Mono-Mac
Early Erythroid
pro B cell
Erythroid Macrophage
VCAM1+ Erythroid Macrophage
B cell
pre B cell
Hepatocyte
DC2
NK Progenitor
HSC
Monocyte
Megakaryocyte
Mono-4 like
NK
Endothelial cell
Neut-myeloid progenitor
Monocyte-DC progenitor
DC1
pDC progenitor
Mast cell
MEP
Mid Erythroid
pro B cell early
ILC progenitor
Fibroblast
Late Erythroid

View file

@ -0,0 +1,28 @@
HSC/MPP
Pre pro B cell
pro-B cell
pre-B cell
B cell
ILC progenitor
Early lymphoid/T lymphocyte
NK
Neutrophil-myeloid progenitor
Monocyte-DC precursor
pDC precursor
DC1
DC2
Monocyte
Mono-Mac
Mono-NK
Kupffer Cell
VCAM1+ EI macrophage
EI macrophage
MEMP
Mast cell
Megakaryocyte
Early Erythroid
Mid Erythroid
Late Erythroid
Endothelial cell
Fibroblast
Hepatocyte

View file

@ -0,0 +1 @@
PTPRC IL3RA CD7 FCGR3A GYPA CD4 HLA-DRA MS4A1 VCAM1 CLEC9A NCAM1 CD14 KIT CD34 ESAM CD8A CD1C

View file

@ -0,0 +1,150 @@
VCAM1
CD14
FCGR3A
HMOX1
TIMD4
FOLR2
LGMN
SLC40A1
CETP
MARCO
CD68
FABP3
LIPA
C1QC
C1QB
C1QA
CFP
CD1C
RNASE6
CLEC7A
CTSH
CSTA
MS4A7
GYPA
CLEC10A
MNDA
LYZ
FCN1
NKG7
GZMA
KLRB1
GZMK
STK17A
IFITM1
KLRC1
PRF1
CD3E
CD3D
CTSW
ALOX5AP
IFNG
CST7
GZMM
CD247
SH2D2A
HLA-DRB5
HLA-DPA1
HLA-DPB1
HLA-DRA
HLA-DRB1
HLA-DMA
HLA-DQB1
HLA-DQA1
HLA-DMB
HLA-DQA2
TNFSF14
ESAM
CD34
FGF23
FCN3
CRHBP
DNASE1L3
ACP5
RAMP2
ANGPTL4
CAV1
PRCP
TM4SF1
ECM1
KDR
PRSS57
GATA2
CYTL1
CLEC11A
CNRIP1
MYC
BATF3
CLEC9A
CCDC50
IRF7
IL3RA
SCT
MZB1
SPIB
IRF8
GPR183
IGLL1
TPSAB1
CPA3
PF4
ITGA2B
CMTM5
TIMP3
NRGN
GP9
PPBP
MKI67
MS4A1
CD37
CD52
TCL1A
CD79B
FCRLA
LTB
SP140
VPREB3
BLNK
RAG2
C1QTNF4
VPREB1
EBF1
SPINK2
CD79A
DNTT
RAG1
CD7
CLIC3
XCL2
RORC
MPO
CTNNBL1
GATA1
KLF1
FAM178B
KCNH2
REXO2
FABP1
APOA2
ALB
APOA1
SERPINA1
AHSG
RBP1
AURKB
CENPF
PTTG1
TK1
C1QBP
HBD
HBA2
HBA1
HBG1
HBG2
HBB
HBM
HBE1
HBZ
HLA-B
CSF1R
CEBPE

View file

@ -0,0 +1,48 @@
HLA-DRA
CD34
SPINK2
JCHAIN
IGLL1
CD79B
TCL1A
IGKC
MS4A1
CD19
LTB
KLRB1
PTPRC
CD3E
CD7
IL32
CD8A
KLRD1
NKG7
XCL2
NCAM1
MPO
LYZ
PLAC8
IL3RA
CLEC9A
CD1C
S100A9
CCL4
CD14
FCGR3A
CD4
C1QA
VCAM1
GYPA
SERPINB1
TPSAB1
KIT
PF4
ITGA2B
UBE2C
GATA1
KLF1
ALAS2
HBA1
ESAM
ECM1
APOA1

View file

@ -0,0 +1,68 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 10 11:17:19 2018
@author: doru
"""
import sys
from os.path import join
# Command line: random_forest_classifier.py <working_folder>
arguments = sys.argv
# the classification report is written into this folder
working_folder = arguments[1]
# data.csv and labels.csv are read from this sub-folder and confusion_matrix.csv is written to it
material_folder = join(working_folder, "material")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np
from numpy.core.umath_tests import inner1d
from sklearn.model_selection import GridSearchCV
class randomGuessing:
    """Baseline predictor that guesses labels at random.

    Labels are drawn with probabilities equal to their relative frequency in
    the labels passed to the constructor.
    """

    def __init__(self, Y_Labels):
        """Record the distinct labels in *Y_Labels* and their relative frequencies."""
        unique_labels, counts = np.unique(Y_Labels, return_counts=True)
        self.labels = list(unique_labels)
        self.frequency = counts / counts.sum()

    def predict(self, X):
        """Return one randomly drawn label per row of *X* (only ``X.shape[0]`` is used)."""
        n_samples = X.shape[0]
        guesses = np.random.choice(self.labels, n_samples, p=self.frequency)
        return np.array(guesses)
# Train a grid-searched random forest on the exported expression data and
# write the classification report and the confusion matrix to disk.
print("Loading data ...")
# data.csv: first column is the row index, the remaining columns are the features
X = pd.read_csv(join(material_folder, "data.csv"), sep = ",", index_col = 0).values
labels_and_colours = pd.read_csv(join(material_folder, "labels.csv")).values
# separate labels and colours (Colours is not used further in this script)
Y = labels_and_colours[:, 0].reshape(-1, 1).ravel()
Colours = labels_and_colours[:, 1].reshape(-1, 1).ravel()

print("Splitting into training and test sets...")
(X_train, X_test, y_train, y_test) = train_test_split(X, Y, test_size = .3, random_state = 32)

# the constructor values are starting points only: every parameter listed in
# param_grid is overridden by the grid search
randomForestClassifier = RandomForestClassifier(n_estimators = 500, criterion = "gini", min_samples_split = 5, bootstrap = True, class_weight="balanced")
param_grid = dict(n_estimators=[100, 200, 500],class_weight=["balanced",None],min_samples_split=[2,5,10,15],max_features=["sqrt","log2"],max_depth=[5,10,15,None])
model = GridSearchCV(randomForestClassifier, param_grid=param_grid, cv=3,scoring="f1_weighted")
model.fit(X_train, y_train)
print(model.get_params())
# also show which parameter combination the search selected
print(model.best_params_)

pred = model.predict(X_test)
# Pin the label set to the classes seen in training. Without `labels=` the
# report and the matrix only cover labels occurring in y_test or pred, so a
# class absent from both made target_names and df.columns disagree in length.
class_names = model.classes_
cls_report = classification_report(y_test, pred, labels = class_names, target_names = class_names)
with open(join(working_folder, "classification_report.txt"), "w") as cl_f:
    cl_f.write(cls_report)

print("Saving confusion matrix to disk ...")
cnf_matrix = confusion_matrix(y_test, pred, labels = class_names)
df = pd.DataFrame(cnf_matrix)
df.columns = class_names
df.to_csv(join(material_folder, "confusion_matrix.csv"))
print("Finishing random_forest_classifier.py run")

Binary file not shown.

View file

@ -0,0 +1,62 @@
# Prepare a smaller pseudotime heatmap, using the following genes:
# (one gene per line; blank lines and '#' comments in the file are ignored)
selected.gene.list <- scan("selected.genes.std.txt", what = character(), sep = "\n", blank.lines.skip = T, comment.char = "#") # or character vector c("")
path <- "." # folder containing 'ploting_material.RDS' (file name spelled as it is read below)
library("ggplot2")
###############################################################################
plottingmat <- readRDS(file.path(path, "ploting_material.RDS"))
# str(plottingmat)
# str(plottingmat$beautiful_result_norm)
# View(plottingmat$beautiful_result_norm)
# keep only the rows of the selected genes
subsetplotmat <- plottingmat$beautiful_result_norm[plottingmat$beautiful_result_norm$GeneNames %in% selected.gene.list, ]
subsetplotmat$GeneNames <- droplevels(subsetplotmat$GeneNames)
subsetplotmat$GeneNames <- factor(subsetplotmat$GeneNames, levels = rev(selected.gene.list)) # Orders the heatmap
# The following section is adapted from: https://github.com/haniffalab/Single-cell-RNAseq-data-analysis-bundle/blob/master/pipelines/13_pseudotime/pseudotime.R#L270 commit b86d20dc87d35820daac178a93e46badf99216ab
plot.genes <- ggplot(data = subsetplotmat, aes(x = Pseudotime, y = GeneNames))
# tiles are drawn marginally larger than 1 x 1
plot.genes <- plot.genes + geom_tile(aes(fill = ExpressionValue),
                                     width=1.001, height=1.001)
plot.genes <- plot.genes + scale_fill_gradient2(low = "deepskyblue",
                                                high = "firebrick3",
                                                mid = "darkolivegreen3",
                                                midpoint = 0.5,
                                                name = "Minmax normalized gene expression")
plot.genes <- plot.genes + theme(legend.position = "bottom",
                                 legend.text = element_text(size = 25, angle = 90),
                                 legend.title = element_text(size = 25),
                                 legend.key.width = unit(2, "cm"),
                                 axis.text.x = element_blank(), axis.title.x = element_blank(),
                                 axis.ticks.x = element_blank(),
                                 axis.title.y = element_text(size = 0), axis.text.y = element_text(size = 8))
# ggplot objects are drawn only when printed. A bare `plot.genes` relies on
# top-level auto-printing, which does not happen when this file is source()d,
# leaving empty output files - so print explicitly everywhere.
print(plot.genes)
height = 6; width = 3
pdf("dpt_heatmap.pdf", height = height, width = width)
print(plot.genes)
dev.off()
svg("dpt_heatmap.svg", height = height, width = width)
print(plot.genes)
dev.off()
postscript("dpt_heatmap.ps", height = height, width = width)
print(plot.genes)
dev.off()
# png() sizes are in pixels by default, hence the different numbers
png("dpt_heatmap.png", height = 600, width = 300)
print(plot.genes)
dev.off()
###############################################################################
# Alternative formats for density plots
# Each plot is printed explicitly so it is also drawn when this file is
# source()d; a bare ggplot expression is only drawn by top-level auto-printing.
pdt_exp <- read.csv(file.path(path, "pdt_and_expression.csv"))
#~ str(pdt_exp)
density.base <- ggplot(data = pdt_exp, aes(x = Pseudotime, color = Labels, fill = Labels))
# Standard:
print(density.base + geom_density(alpha = .7)) # alpha for transparency
# Stacked:
print(density.base + geom_density(position = "stack"))
# Relative:
print(density.base + geom_density(adjust = 1.5, position = "fill"))
# Histogram:
print(density.base + geom_histogram(binwidth = 0.01))

View file

@ -0,0 +1,819 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 23 glorious 2018
@author: Dorin-Mirel Popescu
"""
import sys
# Positional command-line arguments; how they are used lies beyond the visible part of this script.
args = sys.argv
save_to = args[1]  # presumably the path the generated page is written to - confirm
load_from = args[2]  # presumably the location of the input data - confirm
template = """
<!doctype html>
<html lang='en'>
<head>
<meta charset='utf-8'>
<title>3D viewer</title>
<meta name='description' content='The HTML5 Herald'>
<meta name='author' content='Dorin-Mirel Popescu'>
</head>
<body>
<table>
<tr>
<td align='left'>
<form>
<fieldset>
<legend><b>Visualisation options</b></legend>
<label for = 'particleSizeBar'>Particle size: </label>
<input type='range' name = 'particleSizeBar' min = 10 max = 300 onchange='setParticleSize(value)' value = 150 /><br />
<label for = 'alphaInput'>Transparency: </label>
<input type='range' name = 'alphaInput' min = 0 max = 1000 onchange='setAlpha(value)' value = 1000 /><br />
<label for = 'canvasSizeInput'>Canvas size: </label>
<input type='range' name = 'canvasSizeInput' min = 200 max = 2000 onchange='setCanvasSize(value)' value = 500 /><br />
<label for = "zoom">Zoom: </label>
<input type='range' name = 'zoom' min = 100 max = 1000 onchange='setZoom(value)' value = 400 /><br />
<label for = 'bgInput'>Dark background: </label>
<input type='radio' name = 'bgInput' onchange='setBackground(value)' value = 'dark' />
<label for = 'bgInput'>White background: </label>
<input type='radio' name = 'bgInput' onchange='setBackground(value)' value = 'white' checked />
<br />
<label for='sliderX'>Slide X: </label>
<input type='range' name='sliderX' min='-100' max='100' onchange='slideOnX(value)' value='0' />
<label for='sliderY'>Slide Y: </label>
<input type='range' name='sliderY' min='-100' max='100' onchange='slideOnY(value)' value='0' />
<br />
</fieldset>
</form>
</td>
<td style='vertical-align: top' rowspan='2'>
<form>
<fieldset>
<legend><b>Colour by:</b></legend>
<label for='colourType'><input type='radio' name=colourType onchange='setColourByType(value)' value='celltype' checked />Cell type</label><br />
<label for='colourType'><input type='radio' name=colourType onchange='setColourByType(value)' value='pseudotime' />Pseudotime</label><br />
<label for='colourType'><input type='radio' name=colourType onchange='setColourByType(value)' value='gene' />Gene</label>
</fieldset>
</form>
<br/>
<form>
<fieldset>
<legend><b>Gene expression options</b></legend>
<label for='geneselector'>Chose gene by ID: </label>
<select id='geneselector' onchange='colourByType()'>
gene_options_here
</select>
<br/>
Gene expression as:<br/>
<label><input type='radio' name='expressionType' value='nsnn' onchange='setExpressionType(value)' checked />Non-smooth non-norm</label><br/>
<label><input type='radio' name='expressionType' value='snn' onchange='setExpressionType(value)' />Smoothed non-norm</label><br/>
<label><input type='radio' name='expressionType' value='sn' onchange='setExpressionType(value)' />Smoothed minmax norm</label><br/>
</fieldset>
</form>
<br />
<div>
<fieldset>
<legend><b>Cell types:</b></legend>
<label for='toggleRadio'><input type='checkbox' name = 'toggleRadio' id='toggleRadio' onchange='toggleShowTypes()' checked />Show all:</label>
<form id = 'ControlPanel'>
radiocommands
</form>
</fieldset>
</div>
</td>
</tr>
<tr>
<td style='vertical-align: text-top' >
<canvas id='canvas' width=600 height=600></canvas>
</td>
</tr>
</table>
<script id='vertex-shader' type='x-shader/x-fragment'>
attribute vec4 a_Position;
attribute vec3 a_Color;
uniform mat4 u_ModelMatrix;
uniform mat4 u_ViewMatrix;
uniform mat4 u_ProjMatrix;
uniform float u_basePointSize;
uniform float u_Alpha;
varying vec4 v_Color;
void main() {
vec4 cubePos = u_ProjMatrix * u_ModelMatrix * u_ViewMatrix * a_Position;
float currentWidth = 0.0;
currentWidth = 3.0 + (u_basePointSize - 3.0) * (1.0 - cubePos.z / cubePos.w) / 2.0;
gl_Position = cubePos;
gl_PointSize = currentWidth;
v_Color = vec4(a_Color, u_Alpha);
}
</script>
<script id ='fragment-shader' type='x-shader/x-fragment'>
precision mediump float;
varying vec4 v_Color;
void main() {
float r = 0.0;
vec2 cxy = 2.0 * gl_PointCoord - 1.0;
r = dot(cxy, cxy);
if (r > 1.0){
discard;
}
vec2 D = vec2(0.0, 0.0), centers = vec2(.65, .35);
float light = 0.0;
light = length(centers - gl_PointCoord);
light = .1 + .9 * (pow(50.0, -light));
gl_FragColor = v_Color * light + (1.0 - light) * vec4(0.0, 0.0, 0.0, 1.0);
}
</script>
<script type = 'text/javascript'>
var Matrix4 = function(opt_src) {
var i, s, d;
if (opt_src && typeof opt_src === 'object' && opt_src.hasOwnProperty('elements')) {
s = opt_src.elements;
d = new Float32Array(16);
for (i = 0; i < 16; ++i) {
d[i] = s[i];
}
this.elements = d;
} else {
this.elements = new Float32Array([1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1]);
}
};
Matrix4.prototype.setTranslate = function(x, y, z) {
var e = this.elements;
e[0] = 1; e[4] = 0; e[8] = 0; e[12] = x;
e[1] = 0; e[5] = 1; e[9] = 0; e[13] = y;
e[2] = 0; e[6] = 0; e[10] = 1; e[14] = z;
e[3] = 0; e[7] = 0; e[11] = 0; e[15] = 1;
return this;
};
Matrix4.prototype.setLookAt = function(eyeX, eyeY, eyeZ, centerX, centerY, centerZ, upX, upY, upZ) {
var e, fx, fy, fz, rlf, sx, sy, sz, rls, ux, uy, uz;
fx = centerX - eyeX;
fy = centerY - eyeY;
fz = centerZ - eyeZ;
// Normalize f.
rlf = 1 / Math.sqrt(fx*fx + fy*fy + fz*fz);
fx *= rlf;
fy *= rlf;
fz *= rlf;
// Calculate cross product of f and up.
sx = fy * upZ - fz * upY;
sy = fz * upX - fx * upZ;
sz = fx * upY - fy * upX;
// Normalize s.
rls = 1 / Math.sqrt(sx*sx + sy*sy + sz*sz);
sx *= rls;
sy *= rls;
sz *= rls;
// Calculate cross product of s and f.
ux = sy * fz - sz * fy;
uy = sz * fx - sx * fz;
uz = sx * fy - sy * fx;
// Set to this.
e = this.elements;
e[0] = sx;
e[1] = ux;
e[2] = -fx;
e[3] = 0;
e[4] = sy;
e[5] = uy;
e[6] = -fy;
e[7] = 0;
e[8] = sz;
e[9] = uz;
e[10] = -fz;
e[11] = 0;
e[12] = 0;
e[13] = 0;
e[14] = 0;
e[15] = 1;
// Translate.
return this.translate(-eyeX, -eyeY, -eyeZ);
};
Matrix4.prototype.translate = function(x, y, z) {
var e = this.elements;
e[12] += e[0] * x + e[4] * y + e[8] * z;
e[13] += e[1] * x + e[5] * y + e[9] * z;
e[14] += e[2] * x + e[6] * y + e[10] * z;
e[15] += e[3] * x + e[7] * y + e[11] * z;
return this;
};
Matrix4.prototype.setPerspective = function(fovy, aspect, near, far) {
var e, rd, s, ct;
if (near === far || aspect === 0) {
throw 'null frustum';
}
if (near <= 0) {
throw 'near <= 0';
}
if (far <= 0) {
throw 'far <= 0';
}
fovy = Math.PI * fovy / 180 / 2;
s = Math.sin(fovy);
if (s === 0) {
throw 'null frustum';
}
rd = 1 / (far - near);
ct = Math.cos(fovy) / s;
e = this.elements;
e[0] = ct / aspect;
e[1] = 0;
e[2] = 0;
e[3] = 0;
e[4] = 0;
e[5] = ct;
e[6] = 0;
e[7] = 0;
e[8] = 0;
e[9] = 0;
e[10] = -(far + near) * rd;
e[11] = -1;
e[12] = 0;
e[13] = 0;
e[14] = -2 * near * far * rd;
e[15] = 0;
return this;
};
</script>
<script type='text/javascript'>
function slideOnX(value){
Xshift = parseInt(value);
modelMatrix.setTranslate(Xshift, Yshift, 0);
gl_context.uniformMatrix4fv(u_ModelMatrix, false, modelMatrix.elements);
gl_context.clear(gl_context.COLOR_BUFFER_BIT)
gl_context.drawArrays(gl_context.POINTS, 0, n)
}
function slideOnY(value){
Yshift = parseInt(value)
modelMatrix.setTranslate(Xshift, Yshift, 0);
gl_context.uniformMatrix4fv(u_ModelMatrix, false, modelMatrix.elements);
gl_context.clear(gl_context.COLOR_BUFFER_BIT)
gl_context.drawArrays(gl_context.POINTS, 0, n)
}
function setColourByType(value){
colourKey = value;
colourByType()
}
function colourByType(){
if(colourKey == 'celltype'){
colourByCellType()
}else if(colourKey == 'pseudotime'){
colourByPseudotime()
}else{
colourByGene()
}
}
function colourByCellType(){
loadBuffer(selectData(), data_buffer)
drawBuffers()
}
function colourByPseudotime(){
current_pseudotime_buffer = new Float32Array(data_buffer.length)
current_pseudotime_buffer.set(data_buffer)
points_n = data_buffer.length / 6
for (i=0;i<points_n;i++){
current_pseudotime_buffer[6 * i + 3] = pseudotime_buffer[3*i]
current_pseudotime_buffer[6 * i + 4] = pseudotime_buffer[3*i + 1]
current_pseudotime_buffer[6 * i + 5] = pseudotime_buffer[3*i + 2]
}
loadBuffer(selectData(), current_pseudotime_buffer)
drawBuffers()
}
function setExpressionType(value){
expressionType = value
colourByType()
}
function colourByGene(){
current_gene = geneselector.value;
if(expressionType == 'nsnn'){
// check if colours have been already computed for this gene
if (gene_raw_colours[current_gene].length == 0){
gene_raw_colours[current_gene] = valuesToColours(gene_raw_expression[current_gene], 0, maxRawExpression)
}
var gene_colors = gene_raw_colours[current_gene]
}else if(expressionType == 'snn'){
if(gene_smooth_colours[current_gene].length == 0){
var vector = adaptiveMovingAverage(gene_raw_expression[current_gene])
gene_smooth_colours[current_gene] = valuesToColours(vector, 0, 6)
}
var gene_colors = gene_smooth_colours[current_gene]
}else{
if(gene_diff_colours[current_gene].length == 0){
var vector = adaptiveMovingAverage(gene_raw_expression[current_gene])
vector = minMaxNormalization(vector)
gene_diff_colours[current_gene] = valuesToColours(vector, 0, 1)
}
var gene_colors = gene_diff_colours[current_gene]
}
genecolor_buffer = new Float32Array(data_buffer.length)
genecolor_buffer.set(data_buffer)
points_n = data_buffer.length / 6
for (i=0;i<points_n;i++){
genecolor_buffer[6 * i + 3] = gene_colors[3*i]
genecolor_buffer[6 * i + 4] = gene_colors[3*i + 1]
genecolor_buffer[6 * i + 5] = gene_colors[3*i + 2]
}
loadBuffer(selectData(), genecolor_buffer)
drawBuffers()
}
function valuesToColours(vector, minimum, maximum){
    // Map each value onto a blue -> green -> red gradient and return the
    // colours as one flat Float32Array [r0, g0, b0, r1, g1, b1, ...].
    //   vector  - array-like of numbers (plain Array or typed array)
    //   minimum - value painted pure blue
    //   maximum - value painted pure red
    // The output is allocated once at its final size: growing an array with
    // concat() for every value copies it each time, which is quadratic in
    // the number of cells. All working variables are local so nothing leaks
    // into the global scope.
    var span = maximum - minimum,
        middle = (maximum + minimum) / 2.0,
        count = vector.length,
        colours = new Float32Array(3 * count);
    for (var k = 0; k < count; k++){
        var val = vector[k];
        // red grows over the upper half of the range
        colours[3 * k] = Math.max(0, 2 * (val - minimum) / span - 1);
        // green is 1 at the middle of the range and 0 at both ends
        colours[3 * k + 1] = 1.0 - 2 * Math.abs(val - middle) / span;
        // blue grows towards the lower end of the range
        colours[3 * k + 2] = Math.max(0, 2 * (maximum - val) / span - 1);
    }
    return colours;
}
function adaptiveMovingAverage(vector){
    // Smooth a vector of expression values with a moving average whose
    // window widens where the signal is locally flat and narrows where it
    // varies: half-width = minim_kernel + round(range_factor / (sd + 0.1)),
    // where sd is the sample standard deviation of a first, fixed-size window.
    // Returns a plain Array of the same length as the input.
    //
    // Indices are 0-based and slice() excludes its end index, so a window of
    // half-width h around position c is slice(c - h, c + h + 1) clamped to
    // the array. Clamping the start at 1 (a 1-based habit) would leave the
    // first element out of every window and yield NaN for one-element input.
    var smoothed = [],
        kernel = 10,
        minim_kernel = 10,
        range_factor = 5,
        total = vector.length;
    function windowAround(centre, halfWidth){
        return vector.slice(Math.max(0, centre - halfWidth),
                            Math.min(total, centre + halfWidth + 1));
    }
    function mean(values){
        return values.reduce(function(sum, val){return sum + val}, 0) / values.length;
    }
    for (var idx = 0; idx < total; idx++){
        var local = windowAround(idx, kernel),
            local_mean = mean(local),
            sq_sum = local.reduce(function(sum, val){
                var diff = val - local_mean;
                return sum + diff * diff;
            }, 0),
            // sample standard deviation (n - 1 denominator); a single point has no spread
            local_sd = local.length > 1 ? Math.sqrt(sq_sum / (local.length - 1)) : 0,
            local_kernel = minim_kernel + Math.round(range_factor / (local_sd + .1));
        smoothed.push(mean(windowAround(idx, local_kernel)));
    }
    return smoothed;
}
function minMaxNormalization(vector){
    // Rescale the values linearly so the smallest becomes 0 and the largest 1.
    // Returns a new Array; the input is not modified.
    // An empty input returns an empty Array, and a constant input returns all
    // zeros: both cases would otherwise throw or divide by zero.
    if (vector.length === 0){
        return [];
    }
    var lowest = vector.reduce(function(a, b){return Math.min(a, b)}),
        highest = vector.reduce(function(a, b){return Math.max(a, b)}),
        span = highest - lowest;
    if (span === 0){
        return vector.map(function(){return 0});
    }
    return vector.map(function(value){return (value - lowest) / span});
}
function selectData(){
controlPanel = document.getElementById('ControlPanel')
controlRadios = controlPanel.elements
values = []
for(i=0;i<controlRadios.length;i++){
if(controlRadios[i].checked){
values = values.concat(index_table[controlRadios[i].id])
}
}
new_indices = []
for (i=0;i<values.length;i++){
v = values[i]
new_indices.push(6*v)
new_indices.push(6*v+1)
new_indices.push(6*v+2)
new_indices.push(6*v+3)
new_indices.push(6*v+4)
new_indices.push(6*v+5)
}
return new_indices
}
function loadBuffer(new_indices, data_buffer_from){
current_data_buffer = []
new_indices.forEach(function(val, i){current_data_buffer.push(data_buffer_from[val])})
current_data_buffer = new Float32Array(current_data_buffer)
gl_context.bufferData(gl_context.ARRAY_BUFFER, current_data_buffer, gl_context.STATIC_DRAW); // load data to buffer
n = current_data_buffer.length / 6
}
function drawBuffers(){
gl_context.clear(gl_context.COLOR_BUFFER_BIT)
gl_context.drawArrays(gl_context.POINTS, 0, n)
}
function toggleShowTypes(){
toggleRadio = document.getElementById('toggleRadio')
controlPanel = document.getElementById('ControlPanel')
controlRadios = controlPanel.elements
for(i=0;i<controlRadios.length;i++){
controlRadios[i].checked = toggleRadio.checked
}
colourByType()
}
function setParticleSize(value){
particleSize = parseInt(value)
gl_context.uniform1f(u_basePointSize, particleSize)
colourByType()
}
function setAlpha(value){
alphaValue = parseInt(value) / 1000
gl_context.uniform1f(u_Alpha, alphaValue)
colourByType()
}
function setCanvasSize(value){
value = parseInt(value)
canvas.width = value
canvas.height = value
gl_context = getContext(canvas)
gl_context = initContext(gl_context)
gl_context.viewport(0, 0, canvas.width, canvas.height)
if(bg_color == "white"){
gl_context.clearColor(1, 1, 1, 1)
}else{
gl_context.clearColor(0, 0, 0, 1)
}
gl_context.clear(gl_context.COLOR_BUFFER_BIT)
gl_context.drawArrays(gl_context.POINTS, 0, n)
}
function setZoom(value){
eyeVN = parseInt(value)
farField = eyeVN + 100;
rotateData(0, 0)
}
function setBackground(value){
if(value == "dark"){
gl_context.clearColor(0, 0, 0, 1)
bg_color = "dark"
}else{
gl_context.clearColor(1, 1, 1, 1)
bg_color = "white"
}
gl_context.clear(gl_context.COLOR_BUFFER_BIT)
gl_context.drawArrays(gl_context.POINTS, 0, n)
}
function shadersFromScriptElement(gl, ID, type){
shaderScript = document.getElementById(ID)
var str = ''
var k = shaderScript.firstChild;
while(k){
if (k.nodeType == 3){
str += k.textContent;
}
k = k.nextSibling
}
var shader = gl.createShader(type)
gl.shaderSource(shader, str)
gl.compileShader(shader)
return shader
}
function getContext(canvasWidget){
var names = ['webgl', 'experimental-webgl', 'webkit-3d', 'moz-webgl'];
for(var i=0; i<names.length; i++){
try{
var gl = canvasWidget.getContext(names[i])
}catch(e){}
if(gl){i=names.length}
}
var vshader = shadersFromScriptElement(gl, 'vertex-shader', gl.VERTEX_SHADER),
fshader = shadersFromScriptElement(gl, 'fragment-shader', gl.FRAGMENT_SHADER)
program = gl.createProgram();
gl.attachShader(program, vshader)
gl.attachShader(program, fshader)
gl.linkProgram(program)
gl.useProgram(program)
gl.program = program
return gl
}
function initContext(gl){
n = current_data_buffer.length / 6
var vertexColourBuffer = gl.createBuffer()
gl.bindBuffer(gl.ARRAY_BUFFER, vertexColourBuffer)
gl.bufferData(gl.ARRAY_BUFFER, current_data_buffer, gl.STATIC_DRAW)
var FSIZE = data_buffer.BYTES_PER_ELEMENT;
var a_Position = gl.getAttribLocation(gl.program, 'a_Position')
gl.vertexAttribPointer(a_Position, 3, gl.FLOAT, false, FSIZE * 6, 0)
gl.enableVertexAttribArray(a_Position)
var a_Color = gl.getAttribLocation(gl.program, 'a_Color')
gl.vertexAttribPointer(a_Color, 3, gl.FLOAT, false, FSIZE * 6, 3 * FSIZE)
gl.enableVertexAttribArray(a_Color)
u_basePointSize = gl.getUniformLocation(gl.program, 'u_basePointSize')
gl.uniform1f(u_basePointSize, particleSize)
u_Alpha = gl.getUniformLocation(gl.program, "u_Alpha")
gl.uniform1f(u_Alpha, alphaValue)
u_ModelMatrix = gl.getUniformLocation(gl.program, 'u_ModelMatrix');
u_ViewMatrix = gl.getUniformLocation(gl.program, 'u_ViewMatrix');
u_ProjMatrix = gl.getUniformLocation(gl.program, 'u_ProjMatrix');
modelMatrix = new Matrix4(); // The model matrix
viewMatrix = new Matrix4(); // The view matrix
projMatrix = new Matrix4(); // The projection matrix
modelMatrix.setTranslate(0, 0, 0); //
viewMatrix.setLookAt(eyeX, eyeY, eyeZ, 0, 0, 0, upX, upY, upZ); // eyeX, eyeY, eyeZ, camX, camY, camZ, upX, upY, upY
projMatrix.setPerspective(30, canvas.width/canvas.height, 100, farField); // fov, ratio, near, far
// Pass the model, view, and projection matrix to the uniform variable respectively
gl.uniformMatrix4fv(u_ModelMatrix, false, modelMatrix.elements);
gl.uniformMatrix4fv(u_ViewMatrix, false, viewMatrix.elements);
gl.uniformMatrix4fv(u_ProjMatrix, false, projMatrix.elements);
gl.clearColor(1, 1, 1, 1); // add ternary conditional
gl.enable(gl.DEPTH_TEST)
gl.enable(gl.BLEND)
gl.blendFunc(gl.SRC_ALPHA, gl.ONE_MINUS_SRC_ALPHA)
//gl.blendFunc(gl.ONE, gl.ONE_MINUS_SRC_ALPHA)
gl.clear(gl.COLOR_BUFFER_BIT);
return gl
}
var canvas = document.getElementById('canvas'),
particleSize = 150,
alphaValue = 1.0,
bg_color = "white",
eyeX = 0.0,
eyeY = 0.0,
eyeZ = 400.0,
upX = 0.0,
upY = 1.0,
upZ = 0.0,
eyeVN = 400.0,
farField = 500.0,
previousX = null,
previousY = null,
currentX = null,
currentY = null,
Xshift = 0,
Yshift = 0,
colourKey = 'celltype',
expressionType = 'nsnn',
geneselector = document.getElementById('geneselector');
data_buffer = new Float32Array([
datahere
])
pseudotime_buffer = new Float32Array([
pseudotime_here
])
pseudotime_buffer = valuesToColours(pseudotime_buffer, 0.0, 1.0)
gene_raw_expression = []
gene_raw_expression_write_here
gene_raw_colours = []
gene_raw_colours_here
gene_smooth_colours = []
gene_smooth_colours_here
gene_diff_colours = []
gene_diff_colours_here
current_gene_here
var maxRawExpression = maxRawExpression_here
index_table = []
indiceshere
current_data_buffer = data_buffer
gl_context = getContext(canvas)
gl_context = initContext(gl_context)
gl_context.drawArrays(gl_context.POINTS, 0, n)
function negCrossProduct(vecA, vecB){
crossproduct = [ - vecA[1] * vecB[2] + vecA[2] * vecB[1],
- vecA[2] * vecB[0] + vecA[0] * vecB[2],
- vecA[0] * vecB[1] + vecA[1] * vecB[0]
]
return(crossproduct)
}
function vectNorm(vector){
return(Math.sqrt((vector[0] * vector[0]) + (vector[1] * vector[1]) + (vector[2] * vector[2])))
}
function rotateData(hAngle, vAngle){
// change vector for very small angles is approximately the cross product of the eye vector and up vector
change = negCrossProduct([eyeX, eyeY, eyeZ], [upX, upY, upZ])
// normalize the change vector
normChange = vectNorm(change)
// scale the change vector by the horizontal angle
change = [hAngle * change[0]/normChange, hAngle * change[1]/normChange, hAngle * change[2]/normChange]
// update the eye vector by adding the change vector
eyeX = eyeX - change[0]
eyeY = eyeY - change[1]
eyeZ = eyeZ - change[2]
// renormalize the eye vector, other it will increase with each change (due to approx error)
normEye = vectNorm([eyeX, eyeY, eyeZ])
eyeX = eyeVN * eyeX / normEye
eyeY = eyeVN * eyeY / normEye
eyeZ = eyeVN * eyeZ / normEye
// get the (eye, up) plane normal
planeInvNormal = negCrossProduct([eyeX, eyeY, eyeZ], [upX, upY, upZ])
// in the case of vertical angle, the up vector is already the change vector
normChange = vectNorm([upX, upY, upZ])
change = [vAngle * upX / normChange, vAngle * upY / normChange, vAngle * upZ / normChange]
// update the eye vector by adding the change vector
eyeX = eyeX + change[0]
eyeY = eyeY + change[1]
eyeZ = eyeZ + change[2]
// renormalize the eye vector, other it will increase with each change (due to approx error)
normEye = Math.sqrt((eyeX * eyeX)+(eyeY * eyeY)+(eyeZ * eyeZ))
eyeX = eyeVN * eyeX / normEye
eyeY = eyeVN * eyeY / normEye
eyeZ = eyeVN * eyeZ / normEye
// but the up vector needs changing as well
newUp = negCrossProduct([eyeX, eyeY, eyeZ], planeInvNormal)
newUpNormal = vectNorm(newUp)
upX = -newUp[0] / newUpNormal
upY = -newUp[1] / newUpNormal
upZ = -newUp[2] / newUpNormal
gl_context.clear(gl_context.COLOR_BUFFER_BIT);
viewMatrix.setLookAt(eyeX, eyeY, eyeZ, 0, 0, 0, upX, upY, upZ);
projMatrix.setPerspective(30, canvas.width/canvas.height, 100, farField);
gl_context.uniformMatrix4fv(u_ViewMatrix, false, viewMatrix.elements);
gl_context.uniformMatrix4fv(u_ProjMatrix, false, projMatrix.elements);
gl_context.drawArrays(gl_context.POINTS, 0, n);
}
function startRotating(ev){
previousX = ev.clientX
previousY = ev.clientY
canvas.addEventListener('mousemove', rotateEvent)
canvas.addEventListener('mouseup', stopRotation)
canvas.addEventListener('mouseout', stopRotation)
}
function stopRotation(ev){
canvas.removeEventListener('mousemove', rotateEvent)
canvas.removeEventListener('mouseup', stopRotation)
canvas.removeEventListener('mouseout', stopRotation)
}
function rotateEvent(ev){
currentX = ev.clientX
currentY = ev.clientY
var dX = currentX - previousX,
dY = currentY - previousY;
rotateData(2.0 * dX, 2.0 * dY)
previousX = currentX;
previousY = currentY;
}
canvas.addEventListener('mousedown', startRotating)
</script>
</body>
</html>
"""
import pandas as pd
import numpy as np
data = pd.read_csv(load_from, index_col = None)
# convert Colours to r, g, b values, then to floats in [0, 1]
def hexdec_to_1floats(hexdec):
    """Return the red, green and blue channels of a '#RRGGBB' string as floats.

    The first character is skipped, the next six are read as three two-digit
    hexadecimal numbers, and each is divided by 255.
    """
    digits = hexdec[1:]
    channels = [int(digits[start:start + 2], 16) for start in range(0, 6, 2)]
    return np.array(channels) / 255.0
# For every cell type build (a) the JavaScript statement listing the row
# positions of its cells and (b) the coloured checkbox that toggles it.
labels = sorted(data.Labels.unique())
index_table = []
radio_commands = []
for label in labels:
    is_member = (data.Labels == label).values
    positions = ','.join(str(position) for position in np.flatnonzero(is_member))
    index_table.append("index_table['{label}'] = {indices}".format(
        label = label, indices = "[{indices}]".format(indices = positions)))
    # the checkbox background is the colour of the first cell carrying this label
    colour = data.Colours.values[is_member][0]
    radio_commands.append("<div style='background-color:{colour}'><input style='float:left' type='checkbox' id='{label}' checked onchange='colourByType()' /><label style='float:left' for='{label}'>{label}: </label><br /></div>".format(colour = colour, label = label))
index_table = ';\n '.join(index_table)
radio_commands = '\n '.join(radio_commands)
# make data string
coordinates = data.values[:, 0:3].astype('float32')
# Fit the embedding into a standard cube centred on (0, 0, 0): for each axis
# take the 1st and 99th percentiles multiplied by 1.2, move the midpoint of
# that interval to the origin, then scale all axes by the same factor so the
# widest interval spans 180 units.
axis_bounds = [np.percentile(coordinates[:, axis], q = [1, 99]) * 1.2 for axis in range(3)]
center = np.array([np.mean(bounds) for bounds in axis_bounds])
coordinates = coordinates - center  # the (3,) midpoint vector broadcasts over all rows
Xrange, Yrange, Zrange = [bounds[1] - bounds[0] for bounds in axis_bounds]
maxRange = max((Xrange, Yrange, Zrange))
ratio = 180.0 / maxRange
coordinates = coordinates * ratio
# next few steps the buffer data is created as string
# Python spells non-finite floats 'inf' / 'nan'; JavaScript only understands
# Infinity / NaN, and a bare `inf` in the generated array literal would abort
# the whole page script with a ReferenceError.
_NON_FINITE_JS = {'inf': 'Infinity', '-inf': '-Infinity', 'nan': 'NaN'}


def _js_number(value):
    """Return *value* formatted as text that is a valid JavaScript number literal."""
    text = str(value)
    return _NON_FINITE_JS.get(text.lower(), text)


colours = data.values[:, 4]
# many cells share a colour, so convert every distinct hex colour only once
colour_strings = {
    hex_colour: [str(channel) for channel in hexdec_to_1floats(hex_colour).astype('float32')]
    for hex_colour in set(colours)
}
# one vertex = x, y, z followed by r, g, b; all vertices are joined into one flat list
buffer_data = ",".join(
    ",".join([str(value) for value in coordinates[index, :]] + colour_strings[colours[index]])
    for index in range(coordinates.shape[0])
)
pseudotime = data.values[:, 5]
pseudotime_buffer = ",".join(_js_number(value) for value in pseudotime)
# Every column from the seventh onwards is one gene. For each gene emit the
# JavaScript statement holding its expression values, three empty colour
# caches (filled lazily in the browser) and an <option> for the gene selector.
raw_expression = data.values[:, 6:]
gene_names = data.columns[6:]
gene_raw_expression = []
gene_raw_colours = []
gene_smooth_colours = []
gene_diff_colours = []
gene_options = []
for column, gene_name in enumerate(gene_names):
    gene_expression = ",".join(str(val) for val in raw_expression[:, column])
    gene_raw_expression.append("gene_raw_expression['{gn}']=[{ge}]".format(gn = gene_name, ge = gene_expression))
    for cache, statement in ((gene_raw_colours, "gene_raw_colours['{gn}'] = []"),
                             (gene_smooth_colours, "gene_smooth_colours['{gn}'] = []"),
                             (gene_diff_colours, "gene_diff_colours['{gn}'] = []")):
        cache.append(statement.format(gn = gene_name))
    gene_options.append("<option value='{gn}'>{gn}</option>".format(gn = gene_name))
gene_raw_expression = ";\n".join(gene_raw_expression)
gene_raw_colours = ";\n".join(gene_raw_colours)
gene_smooth_colours = ";\n".join(gene_smooth_colours)
gene_diff_colours = ";\n".join(gene_diff_colours)
gene_options = "".join(gene_options)
# upper end of the colour scale used for non-smoothed expression
maxRawExpression = raw_expression.max()
# Fill the HTML template: every placeholder token is swapped for the text
# generated above. Substitutions run one after another in this exact order.
substitutions = [
    ('datahere', buffer_data),
    ('indiceshere', index_table),
    ('radiocommands', radio_commands),
    ('pseudotime_here', pseudotime_buffer),
    ('gene_raw_expression_write_here', gene_raw_expression),
    ('maxRawExpression_here', str(maxRawExpression)),
    ('gene_raw_colours_here', gene_raw_colours),
    ('gene_smooth_colours_here', gene_smooth_colours),
    ('gene_diff_colours_here', gene_diff_colours),
    ('gene_options_here', gene_options),
    ('current_gene_here', "var current_gene = '{gn}'".format(gn = str(gene_names[0]))),
]
template_str = template
for placeholder, replacement in substitutions:
    template_str = template_str.replace(placeholder, replacement)
# write the finished page
with open(save_to, 'w') as result:
    result.write(template_str)

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, Neut-myeloid progenitor, Monocyte-DC progenitor, DC2, DC1
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig5b

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, Neut-myeloid progenitor, Monocyte-DC progenitor, DC1
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig5b_1

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, Neut-myeloid progenitor, Monocyte-DC progenitor, DC2
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig5b_2

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, Neut-myeloid progenitor, Monocyte-DC progenitor, DC2, DC1, Monocyte
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig5b_3

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, MEP, Early Erythroid, Mid Erythroid, Late Erythroid
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig3d_e

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, MEP, Mast cell
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig3d_m

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, Neut-myeloid progenitor, Monocyte-DC progenitor, Monocyte, Mono-Mac, Kupffer Cell
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig5c

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, Neut-myeloid progenitor, Monocyte-DC progenitor, Monocyte
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig5c_1

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, Neut-myeloid progenitor, Monocyte-DC progenitor, Mono-Mac, Kupffer Cell
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig5c_2

View file

@ -0,0 +1,6 @@
HSC_LI
HSC_TH
NK Progenitor_LI
T_DN_TH
T_DP_TH
T_mature_TH

View file

@ -0,0 +1,5 @@
../../seurat_data/liver_all.RDS
HSC, MEP, Megakaryocyte
HSC
../../constant_inputs/liver_cell_type_colours.csv
Fig3d_t

View file

@ -0,0 +1,6 @@
HSC
pro B cell early
pro B cell
pre B cell
B cell

View file

@ -0,0 +1,90 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 14 15:01:36 2018
@author: doru

Compute diffusion pseudotime with scanpy.

Usage: pdt_scanpy.py ROOT_CELL_TYPE OUTPUT_FOLDER

Reads raw_data.mtx, genenames.csv, cellnames.csv and cell_labels.csv from
OUTPUT_FOLDER/material and writes pseudotime.csv and dm.csv to the same
folder.
"""
print("starting .py script")
import sys
args = sys.argv
if len(args) < 3:
    sys.exit("Usage: pdt_scanpy.py ROOT_CELL_TYPE OUTPUT_FOLDER")
# The output folder is always the last argument. Everything between the script
# name and it is the root cell type, so a label containing spaces that reaches
# the script as several words is put back together. With exactly two
# arguments this is the same as args[1] and args[2].
root_cell_type = " ".join(args[1:-1])
CWD = args[-1]
print("printing args")
print(args)
import matplotlib; matplotlib.use('Agg');
try:
    import scanpy.api as sc
except ImportError:
    # NOTE(review): scanpy.api is not shipped by newer scanpy releases; the same
    # sc.pp / sc.tl functions are presumably reachable from the top-level
    # package there -- confirm against the installed version.
    import scanpy as sc
import pandas as pd
import numpy as np
print("printing root_cell_type")
print(root_cell_type)
print("printing CWD")
print(CWD)
sc.settings.verbosity = 3
scObj = sc.read("{CWD}/material/raw_data.mtx".format(CWD=CWD), cache = False).T
# load gene names
scObj.var_names = pd.read_csv("{CWD}/material/genenames.csv".format(CWD=CWD)).iloc[:, 1]
# load cell names
scObj.obs_names = pd.read_csv("{CWD}/material/cellnames.csv".format(CWD=CWD)).iloc[:, 1]
# add cell labels
cell_labels = pd.read_csv("{CWD}/material/cell_labels.csv".format(CWD=CWD), index_col = 0)
scObj.obs["cell_labels"] = cell_labels
# filter out genes present in less than 3 cells
sc.pp.filter_genes(scObj, min_cells=3)
# keep a log-transformed copy of all genes in .raw, then normalise counts per cell
scObj.raw = sc.pp.log1p(scObj, copy=True)
sc.pp.normalize_per_cell(scObj, counts_per_cell_after=1e4)
# variable genes
filter_result = sc.pp.filter_genes_dispersion(
    scObj.X, min_mean=0.0125, max_mean=3, min_disp=0.5)
# subset data on variable genes
scObj = scObj[:, filter_result.gene_subset]
# log-transform the normalised counts of the variable genes
# NOTE(review): the original author marked this step "not sure?" -- confirm it is wanted
sc.pp.log1p(scObj)
# scale the data
sc.pp.scale(scObj, max_value=10)
# run pca
sc.tl.pca(scObj)
# compute neighborhood graph
sc.pp.neighbors(scObj, n_neighbors = 15, n_pcs = 20, knn = True, random_state = 10, method = "gauss")
# compute diffusion map
sc.tl.diffmap(scObj, n_comps = 20)
# set root: the first cell carrying the root label
root_candidates = np.flatnonzero(scObj.obs['cell_labels'] == root_cell_type)
if root_candidates.size == 0:
    raise ValueError("Root cell type '{rt}' not found among the cell labels".format(rt=root_cell_type))
scObj.uns['iroot'] = root_candidates[0]
# compute dpt
print("computing sc.tl.dpt")
sc.tl.dpt(scObj, n_dcs = 20)
# pdt is at scObj.obs["dpt_pseudotime"]
print("displaying pdt table stored in scObj")
print(scObj.obs["dpt_pseudotime"])
# save the pseudotime; header=False is explicit because the R pipeline reads
# the file with header = F and the pandas default has changed between versions
scObj.obs["dpt_pseudotime"].to_csv("{CWD}/material/pseudotime.csv".format(CWD=CWD), header=False)
# save the diffusion map coordinates
dm = scObj.obsm["X_diffmap"]
dm = pd.DataFrame(data = dm, index = None, columns = None)
dm.to_csv("{CWD}/material/dm.csv".format(CWD=CWD), columns = None, header = None)

View file

@ -0,0 +1,399 @@
# Parse the command line. All trailing arguments are glued together, split on
# ";" and "@@" is turned back into a space. Each resulting piece is expected
# to be R code that defines the variable it is named after (the name listed
# below without the ".arg" suffix).
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))
args = gsub(pattern = '@@', replacement = ' ', x = args)
arguments.list = "
seurat.addr.arg = args[1]
set.ident.arg = args[2]
cell.types.arg = args[3]
root_cell_type.arg = args[4]
var.genes.arg = args[5]
type.to.colours.arg = args[6]
"
expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]
if(length(args) != length(expected_arguments)){
  # count the parameters before collapsing their names into one string,
  # otherwise the message always reports "1"
  expected_names = paste(unlist(lapply(strsplit(expected_arguments, ".arg", fixed = TRUE), "[", 1)), collapse = "\n")
  stop(sprintf('This pipeline requires %s parameters:\n%s', length(expected_arguments), expected_names))
}
eval(parse(text = arguments.list))
for(n in seq_along(expected_arguments)){
  argument = expected_arguments[n]
  # name of the holder variable, e.g. "seurat.addr.arg"
  argument.name = gsub(pattern=" ", replacement="", x=unlist(strsplit(argument, "="))[1])
  # name of the variable the argument must define, e.g. "seurat.addr"
  variable.name = sub(pattern="\\.arg$", replacement="", x=argument.name)
  argument.content = eval(parse(text = argument.name))
  # run the argument text; it is expected to assign variable.name
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# create required folders for output and work material
# folder name = <working dir name without its numeric prefix>_<seurat.addr>_<time stamp>
output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
output_folder = paste(output_folder, seurat.addr, sep = "_")
# compact yymmddHHMMSS time stamp; an explicit format keeps it at 12 digits
# regardless of how the running R version prints a date-time by default
c.time = format(Sys.time(), "%y%m%d%H%M%S")
output_folder = paste(output_folder, c.time, sep = "_")
output_folder = file.path("../../output", output_folder)
# recursive = TRUE also creates ../../output when it does not exist yet
dir.create(output_folder, recursive = TRUE)
output_folder_material = file.path(output_folder, "material")
dir.create(output_folder_material, recursive = TRUE)
seurat.addr = file.path("../../data", seurat.addr)
source("../../tools/bunddle_utils.R")
library(Seurat)
library(ggplot2)
library(RColorBrewer)
library(plyr)
library(monocle)
library(dplyr)
library(reshape2)
#######################################################################################################
###########
# echo the requested cell types and the root cell type to the log
print("printing cell.types")
print(cell.types)
print("printing root_cell_type")
print(root_cell_type)
# Centred moving average.
#   arr    - numeric vector
#   kernel - half-width of the window; position i is replaced by the mean of
#            arr[(i - kernel):(i + kernel)], clipped to the ends of the vector
# Returns a vector of the same length as arr (empty input gives empty output).
ma = function(arr, kernel = 50){
  res = arr
  # seq_along() yields no iterations for empty input, unlike 1:length(arr)
  for(i in seq_along(arr)){
    start_index = max(1, i - kernel)
    stop_index = min(length(arr), i + kernel)
    res[i] = mean(arr[start_index:stop_index])
  }
  res
}
# Moving average whose window adapts to the local variability of the signal.
#   arr          - numeric vector
#   kernel       - half-width of the window used to estimate the local sd
#   minim_kernel - smallest half-width used for averaging
#   range.factor - controls how much the window widens where the signal is flat
# For every position the averaging half-width is
#   minim_kernel + round(range.factor / (local_sd + .1))
# Returns a vector of the same length as arr (empty input gives empty output).
adaptive.moving_average = function(arr, kernel = 10, minim_kernel = 10, range.factor = 5){
  res = arr
  # seq_along() yields no iterations for empty input, unlike 1:length(arr)
  for(i in seq_along(arr)){
    start_index = max(1, i - kernel)
    stop_index = min(length(arr), i + kernel)
    local_sd = sd(arr[start_index:stop_index])
    # sd() of a single value is NA, which would poison the index arithmetic below
    if (is.na(local_sd)){
      local_sd = 0
    }
    local_kernel = minim_kernel + round(range.factor / (local_sd + .1))
    start_index = max(1, i - local_kernel)
    stop_index = min(length(arr), i + local_kernel)
    res[i] = mean(arr[start_index:stop_index])
  }
  res
}
###########
#######################################################################################################
# Load the Seurat object, switch its identities to the requested metadata
# column and keep only the requested cell types.
print("Loading data ...")
seurat.obj = readRDS(seurat.addr)
seurat.obj = SetAllIdent(object=seurat.obj, id=set.ident)
print("Data loaded.")
print("Subseting data on singlets and required cell populations")
# cell.types may hold several labels, and if() needs a single TRUE/FALSE:
# only the single string "all" means "keep every cell type"
if(length(cell.types) == 1 && cell.types == "all"){
  cell.types = as.vector(unique(seurat.obj@ident))
}
print(table(seurat.obj@ident))
print("Subseting data ...")
to.keep = names(seurat.obj@ident)[as.vector(seurat.obj@ident) %in% cell.types]
seurat.obj = SubsetData(object=seurat.obj, cells.use=to.keep)
# order the identity levels as the cell types were requested
seurat.obj@ident = factor(seurat.obj@ident, levels = cell.types)
print(table(seurat.obj@ident))
# Export the raw counts and their annotations to the material folder, where
# the Python pseudotime script reads them.
print("Writing data to disk ...")
# save raw data to disk
raw_data = seurat.obj@raw.data
# restrict the raw matrix to the genes and cells present in the data slot
raw_data = raw_data[rownames(seurat.obj@data), colnames(seurat.obj@data)]
# drop the genes listed in fca_cellcycle_genes.RDS (read from the working directory);
# comment out the next four lines to keep all genes
to_exclude = readRDS('fca_cellcycle_genes.RDS')
genes_to_keep = rownames(raw_data)
genes_to_keep = genes_to_keep[!(genes_to_keep %in% to_exclude)]
raw_data = raw_data[genes_to_keep, colnames(seurat.obj@data)]
writeMM(raw_data, file.path(output_folder_material, "raw_data.mtx"))
# save gene names
gene_names = rownames(raw_data)
write.csv(data.frame(Genes = gene_names), file.path(output_folder_material, "genenames.csv"))
# save cell names
cell_names = colnames(raw_data)
write.csv(data.frame(Cells = cell_names), file.path(output_folder_material, "cellnames.csv"))
# write cell labels to disk
write.csv(data.frame(Cells = names(seurat.obj@ident), Labels = seurat.obj@ident), file.path(output_folder_material, "cell_labels.csv"), row.names = F)
print("Computing pseudotime using pdt.scanpy.py...")
# compute pseudotime in python scanpy
# both arguments are shell-quoted so that a cell type or folder name containing
# spaces reaches the script as one argument
command = sprintf("%s pdt_scanpy.py %s %s", python.addr, shQuote(root_cell_type), shQuote(output_folder))
exit.status = system(command, wait=T)
# stop here if the script failed; the following steps need the files it writes
if (exit.status != 0){
  stop(sprintf("pdt_scanpy.py failed with exit status %s", exit.status))
}
print("finished running .py")
# Choose the cell type labels and one colour per label: from the colour table
# when one was given, otherwise sorted labels with a shuffled "Paired" palette.
if (is.na(type.to.colours)){
  cell.labels = sort(as.vector(unique(seurat.obj@ident)))
  paired.ramp = colorRampPalette(brewer.pal(12, "Paired"))
  cell.colours = sample(paired.ramp(length(cell.labels)))
}else{
  type.to.colours = file.path("../../resources", type.to.colours)
  type.to.colour = read.csv(type.to.colours)
  print("printing type.to.colour after it is loaded in")
  print(type.to.colour)
  print("printing seurat obj idents which the typetocol arg will be compared against in next lines")
  present.types = as.vector(unique(seurat.obj@ident))
  print(present.types)
  # keep only the table rows whose cell type occurs in the data, in table order
  filter.key = type.to.colour$CellTypes %in% present.types
  cell.labels = as.vector(type.to.colour$CellTypes[filter.key])
  cell.colours = as.vector(type.to.colour$Colours[filter.key])
}
print("printing cell.labels")
print(cell.labels)
print("printing cell.colours")
print(cell.colours)
# load pseudotime
# the file is read without a header row; its first column becomes the row names
print('reading pseudotime values')
pseudotime = read.csv(file.path(output_folder_material, "pseudotime.csv"), row.names = 1, header = F)
print("Are the cells in the same order in both pseudotime and seurat object? ")
# NOTE(review): the answer is only printed; a FALSE does not stop the script -- confirm this is intended
print(all(rownames(pseudotime) == names(seurat.obj@ident)))
pseudotime$CellTypes = seurat.obj@ident
# renames both columns; presumably the file holds a single value column -- TODO confirm
colnames(pseudotime) = c("Pseudotime", "CellType")
# translate each cell type into its colour
pseudotime$Color = mapvalues(x=pseudotime$CellType, from=cell.labels, to=cell.colours)
# fix the factor level order to that of cell.colours and cell.labels
pseudotime$Color = factor(as.vector(pseudotime$Color), levels = cell.colours)
pseudotime$CellType = factor(as.vector(pseudotime$CellType), levels = cell.labels)
# final column names; "Cell Type" contains a space and is referred to with backticks later
colnames(pseudotime) = c("Pseudotime", "Cell Type", "Color")
# making sure that there are no inf values in pdt column
#pseudotime["Pseudotime"][pseudotime["Pseudotime"] == "Inf"] <- 1
# Density of cells along pseudotime, one semi-transparent curve per cell type,
# with the pseudotime axis drawn on top and limited to [0, 1].
plot.density = ggplot(data = pseudotime, aes(x = Pseudotime, color = `Cell Type`, fill = `Cell Type`)) +
  geom_density(alpha = .7) +
  scale_x_continuous(position = "top", limits = c(.0, 1.0), expand = c(0.0, .0)) +
  scale_color_manual(values = cell.colours) +
  scale_fill_manual(values = cell.colours) +
  theme(axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank(),
        axis.title.x = element_text(size = 25),
        legend.position = c(0, 1),
        legend.justification = c(0, 1))
print("printing cell.colours which is used for scale_colour_manual for plot.density plot")
print(cell.colours)
# compute diff genes
# Wrap the expression matrix raw_data (defined earlier in this script) in a monocle CellDataSet
# with a negative binomial expression family, then attach the cluster label and the pseudotime of
# every cell so that differentialGeneTest can model expression as a function of pseudotime.
print("Computing var genes by cell type...")
cds = newCellDataSet(cellData = as.matrix(raw_data), phenoData=NULL, featureData=NULL, expressionFamily = negbinomial.size())
print("printing cds made using newCellDataSet function")
print(cds)
# labels are attached by position: assumes the columns of raw_data are in the same cell order as
# seurat.obj@ident -- TODO confirm where raw_data is created
pData(cds)$Cluster = as.vector(seurat.obj@ident)
print("printing cds after adding cluster to pdata")
print(cds)
print("running estimatesizefactors for cds")
cds = estimateSizeFactors(cds)
# pseudotime is attached by position as well (same ordering assumption as above)
pData(cds)$Pseudotime = pseudotime$Pseudotime
# Select the genes shown in the heatmaps (result: character vector var.genes.total).
# When no gene list file was passed (var.genes is NA) the genes are found with monocle's
# differentialGeneTest: the (up to) 100 genes with the smallest q-value within each cell type,
# plus the (up to) 100 genes with the smallest q-value across all cells; mitochondrial genes
# (names starting with "MT-", case-insensitive) are then removed.
# Otherwise var.genes names a file in ../../resources holding one gene name per line.
if (is.na(var.genes)){
  var.genes.total = c()
  print('Computing variable genes ... ')
  # seq_along() iterates zero times for an empty label vector (1:length(x) would yield c(1, 0))
  for (j in seq_along(cell.labels)){
    print(sprintf("Choice %s out of %s ... ", as.character(j), as.character(length(cell.labels))))
    choices = pseudotime$`Cell Type` == cell.labels[j]
    var.genes = differentialGeneTest(cds[, choices], fullModelFormulaStr = "~sm.ns(Pseudotime)")
    # arrange() drops row names, so keep the gene names in an explicit column first
    var.genes = cbind(var.genes, data.frame(gene_id = rownames(var.genes)))
    var.genes.ch = var.genes %>% arrange(qval)
    # head() returns at most 100 names; indexing with [1:100] padded the result with NA whenever
    # fewer than 100 genes were tested, and the NA later broke the subsetting of seurat.obj@data
    var.genes.ch = head(as.vector(var.genes.ch$gene_id), 100)
    var.genes.total = union(var.genes.total, var.genes.ch)
  }
  print("Computing var genes globally...")
  var.genes = differentialGeneTest(cds, fullModelFormulaStr = "~sm.ns(Pseudotime)")
  var.genes = cbind(var.genes, data.frame(gene_id = rownames(var.genes)))
  var.genes.ch = var.genes %>% arrange(qval)
  var.genes.ch = head(as.vector(var.genes.ch$gene_id), 100)
  var.genes.total = union(var.genes.total, var.genes.ch)
  MT_genes = var.genes.total[grep("^MT-", x=var.genes.total, ignore.case=T)]
  var.genes.total = setdiff(var.genes.total, MT_genes)
}else{
  # readLines() on a path opens and closes the file itself, so no explicit connection is needed
  var.genes.file = file.path('../../resources', var.genes)
  var.genes.total = readLines(var.genes.file)
  var.genes.total = as.vector(unique(var.genes.total))
  var.genes.total = var.genes.total[var.genes.total != '']
}
# Order the selected genes by hierarchical clustering of their smoothed, min-max normalized
# expression profiles along pseudotime (nothing is written to disk in this section).
print("Heavy computing finished. Next saving to output...")
print("calculating var_gene_expression")
# cluster genes based on their min-max normalized values
# rows = selected genes, columns = cells sorted by increasing pseudotime
var_gene_expression = as.matrix(seurat.obj@data[var.genes.total, order(pseudotime$Pseudotime)])
# smooth every gene (row) with adaptive.moving_average, a helper defined elsewhere in this script;
# apply() returns the per-row results as columns, hence the transpose
var_gene_expression = t(apply(var_gene_expression, 1, adaptive.moving_average, kernel = 15, minim_kernel = 1, range.factor=15))
# min-max normalization
# (a matrix minus/divided by a vector of length nrow is recycled down the columns, i.e. row-wise)
var_gene_min = apply(var_gene_expression, 1, min)
var_gene_expression = var_gene_expression - var_gene_min
var_gene_genes_max = apply(var_gene_expression, 1, max)
# NOTE(review): a gene whose smoothed profile is constant has max 0 here and becomes NaN
var_gene_expression = var_gene_expression / var_gene_genes_max
print("clustering genes by level of expression")
# actual clustering of genes
# distance between two genes = 1 - Spearman correlation of their profiles; Ward linkage
d_matrix = as.dist(1.0 - cor(t(as.matrix(var_gene_expression)), method="spearman"))
genes_clust = hclust(d=d_matrix, method="ward.D2")
# gene names in dendrogram leaf order; used for the row order of the heatmaps
genes.in.order = var.genes.total[genes_clust$order]
# plot min-max normalized expression
###################################################################################################
# rows = genes in reverse clustering order, columns = cells sorted by increasing pseudotime
raw_data_genes = as.matrix(seurat.obj@data[rev(genes.in.order), order(pseudotime$Pseudotime)])
# smooth every gene (row) with adaptive.moving_average (defined elsewhere in this script)
raw_data_genes = t(apply(raw_data_genes, 1, adaptive.moving_average, kernel = 15, minim_kernel = 1, range.factor=15))
# min-max normalization (row-wise: the per-gene vectors are recycled down the columns)
raw_data_genes_min = apply(raw_data_genes, 1, min)
raw_data_genes = raw_data_genes - raw_data_genes_min
raw_data_genes_max = apply(raw_data_genes, 1, max)
raw_data_genes = raw_data_genes / raw_data_genes_max
print("group genes by pdt")
# group by pdt: sample every gene at 100 evenly spaced pseudotime values, taking for each value
# the cell whose pseudotime is closest to it
pdt = range(pseudotime$Pseudotime)
pdt = seq(pdt[1], pdt[2], length.out=100)
pdt_sorted = pseudotime$Pseudotime[order(pseudotime$Pseudotime)]
# which.min() yields exactly one cell per grid value. The previous which(x == min(x)) returned
# several indices when cells were equally close, which produced more than 100 columns and made the
# colnames assignment below fail. The nearest cell does not depend on the gene, so the indices are
# computed once and used to pick columns instead of growing a vector gene by gene.
pdt_index = vapply(pdt, function(local_pdt) which.min(abs(pdt_sorted - local_pdt)), integer(1))
pdt_data = raw_data_genes[, pdt_index, drop = F]
print("printing nrow pdt_data")
print(nrow(pdt_data))
print("printing ncol pdt_data")
print(ncol(pdt_data))
rownames(pdt_data) = rownames(raw_data_genes)
colnames(pdt_data) = paste("PDT", 1:ncol(pdt_data), sep = "")
# smooth a bit the pdt_data matrix (ma is a moving-average helper defined elsewhere in this script)
pdt_data = t(apply(pdt_data, 1, ma, kernel = 7))
# renormalize each gene to [0, 1] after the smoothing
pdt_data = pdt_data - apply(pdt_data, 1, min)
pdt_data = pdt_data / apply(pdt_data, 1, max)
# long format for ggplot: one row per (gene, pseudotime bin) pair
beautiful_result_norm = reshape2::melt(data=pdt_data)
colnames(beautiful_result_norm) = c("GeneNames", "Pseudotime", "ExpressionValue")
print("preparing to plot genes by expression level")
plot.genes = ggplot(data = beautiful_result_norm, aes(x = Pseudotime, y = GeneNames))
# tiles are drawn marginally larger than 1 so that no gaps show between neighbouring tiles
plot.genes = plot.genes + geom_tile(aes(fill = ExpressionValue), width=1.001, height=1.001)
plot.genes = plot.genes + scale_fill_gradient2(low = "deepskyblue", high = "firebrick3", mid = "darkolivegreen3", midpoint = 0.5, name = "Minmax normalized gene expression")
plot.genes = plot.genes + theme(legend.position = "bottom", legend.text = element_text(size = 25, angle = 90),
                                legend.title = element_text(size = 25),
                                legend.key.width = unit(2, "cm"),
                                axis.text.x = element_blank(), axis.title.x = element_blank(),
                                axis.ticks.x = element_blank(),
                                axis.title.y = element_text(size = 0), axis.text.y = element_text(size = 8))
pdf(file.path(output_folder, "expression_vs_norm_expression.pdf"), width = 13, height = 35)
# explicit print(): the figure is also drawn when this script is source()d, where top-level
# values are not auto-printed
print(plot_grid(plot.density, plot.genes, nrow = 2, align = "v", rel_heights = c(1/9, 8/9)))
dev.off()
print("plotted expression_vs_norm_expression.pdf in output folder")
# plot non-normalized expression
###################################################################################################
# rows = genes in reverse clustering order, columns = cells sorted by increasing pseudotime
raw_data_genes = as.matrix(seurat.obj@data[rev(genes.in.order), order(pseudotime$Pseudotime)])
print("made raw_data_genes matrix")
# smooth every gene (row) with adaptive.moving_average (defined elsewhere in this script)
raw_data_genes = t(apply(raw_data_genes, 1, adaptive.moving_average, kernel = 15, minim_kernel = 1, range.factor=15))
print("raw_data_genes matrix has applied apply function and t")
# group by pdt: sample every gene at 100 evenly spaced pseudotime values, taking for each value
# the cell whose pseudotime is closest to it
pdt = range(pseudotime$Pseudotime)
pdt = seq(pdt[1], pdt[2], length.out=100)
pdt_sorted = pseudotime$Pseudotime[order(pseudotime$Pseudotime)]
# which.min() yields exactly one cell per grid value. The previous which(x == min(x)) returned
# several indices when cells were equally close, which produced more than 100 columns and made the
# colnames assignment below fail. The nearest cell does not depend on the gene, so the indices are
# computed once and used to pick columns instead of growing a vector gene by gene.
pdt_index = vapply(pdt, function(local_pdt) which.min(abs(pdt_sorted - local_pdt)), integer(1))
pdt_data = raw_data_genes[, pdt_index, drop = F]
rownames(pdt_data) = rownames(raw_data_genes)
colnames(pdt_data) = paste("PDT", 1:ncol(pdt_data), sep = "")
# smooth a bit the pdt_data matrix (ma is a moving-average helper defined elsewhere in this script)
pdt_data = t(apply(pdt_data, 1, ma, kernel = 7))
# long format for ggplot: one row per (gene, pseudotime bin) pair
beautiful_result_nonnorm = reshape2::melt(data=pdt_data)
colnames(beautiful_result_nonnorm) = c("GeneNames", "Pseudotime", "ExpressionValue")
plot.genes = ggplot(data = beautiful_result_nonnorm, aes(x = Pseudotime, y = GeneNames))
# tiles are drawn marginally larger than 1 so that no gaps show between neighbouring tiles
plot.genes = plot.genes + geom_tile(aes(fill = ExpressionValue), width=1.001, height=1.001)
# values are not normalized here, so the colour midpoint is the middle of the observed range
plot.genes = plot.genes + scale_fill_gradient2(low = "deepskyblue", high = "firebrick3", mid = "darkolivegreen3", midpoint = mean(range(pdt_data)), name = "Gene expression")
plot.genes = plot.genes + theme(legend.position = "bottom", legend.text = element_text(size = 25, angle = 90),
                                legend.title = element_text(size = 25),
                                legend.key.width = unit(2, "cm"),
                                axis.text.x = element_blank(), axis.title.x = element_blank(),
                                axis.ticks.x = element_blank(),
                                axis.title.y = element_text(size = 0), axis.text.y = element_text(size = 8))
pdf(file.path(output_folder, "expression_vs_nonnorm_expression.pdf"), width = 13, height = 35)
# explicit print(): the figure is also drawn when this script is source()d, where top-level
# values are not auto-printed
print(plot_grid(plot.density, plot.genes, nrow = 2, align = "v", rel_heights = c(1/9, 8/9)))
dev.off()
print("plotted genes by expression_vs_nonnorm_expression.pdf")
# save diffusion map coordinates and expression data for found genes
# Everything written below is ordered by increasing pseudotime.
by.pdt.order = order(pseudotime$Pseudotime)
# dm.csv has no header and its first column holds the row names (presumably diffusion map
# coordinates written by pdt_scanpy.py -- confirm); only the first three components are kept
dm.df = read.csv(file.path(output_folder_material, "dm.csv"), row.names = 1, header = F)
dm.df = as.data.frame(dm.df[, 1:3])
# labels are attached by position: assumes dm.csv rows follow the cell order of seurat.obj@ident
dm.df$Labels = factor(seurat.obj@ident, levels = cell.labels)
dm.df$Colours = mapvalues(x = dm.df$Labels, from = cell.labels, to = cell.colours)
dm.df = dm.df[by.pdt.order, ]
colnames(dm.df) = c("DM1", "DM2", "DM3", "Labels", "Colours")
print("writing pdt_and_expression.csv")
# cells as rows, selected genes as columns (same gene order as the heatmaps)
expression_data_and_pdt = as.data.frame(t(as.matrix(seurat.obj@data[rev(genes.in.order), by.pdt.order])))
# column 1 of the pseudotime data frame is the pseudotime value
pdt.data = data.frame(Pseudotime = pseudotime[by.pdt.order, c(1)])
# final table: DM1-3, Labels, Colours, Pseudotime, then one column per gene
pdt.data = cbind(dm.df, pdt.data, expression_data_and_pdt)
pdt.data.fp = file.path(output_folder, "pdt_and_expression.csv")
write.csv(pdt.data, pdt.data.fp, row.names = F)
# make interactive diffusion map
# the viewer script receives the output html path and the csv written above, pasted in unquoted
command = sprintf("%s html_3D_viewer_and_plotter.py %s %s", python.addr, file.path(output_folder, "Interactive_Pseudotime.html"), pdt.data.fp)
system(command, wait = T)
# save the plotting material, just in case
plot.data.objects = list(pseudotime = pseudotime, beautiful_result_norm = beautiful_result_norm, beautiful_result_nonnorm = beautiful_result_nonnorm)
saveRDS(plot.data.objects, file.path(output_folder, "ploting_material.RDS"))
# remove the temporary "material" folder (cell_labels.csv, pseudotime.csv, dm.csv, ...)
unlink(output_folder_material, recursive=T, force=T)
print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N pseudotime
#$ -V
#$ -l h_rt=47:59:59
#$ -l h_vmem=400G
# Grid Engine job script: runs pseudotime.R with exactly one argument, the
# ';'-separated list of R assignments that pseudotime.R parses itself.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters" >&2
    exit 1
fi
# Quote the argument so the shell neither word-splits it nor expands glob
# characters in it before it reaches Rscript.
Rscript pseudotime.R "$1"
echo "End on $(date)"

View file

@ -0,0 +1,154 @@
## ForceAtlas2 for Python
A port of Gephi's Force Atlas 2 layout algorithm to Python 2 and Python 3 (with a wrapper for NetworkX and igraph). This is the fastest python implementation available with most of the features complete. It also supports Barnes Hut approximation for maximum speedup.
ForceAtlas2 is a very fast layout algorithm for force-directed graphs. It's used to spatialize a **weighted undirected** graph in 2D (Edge weight defines the strength of the connection). The implementation is based on this [paper](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0098679) and the corresponding [gephi-java-code](https://github.com/gephi/gephi/blob/master/modules/LayoutPlugin/src/main/java/org/gephi/layout/plugin/forceAtlas2/ForceAtlas2.java). It's really quick compared to the Fruchterman-Reingold algorithm (spring layout) of networkx and scales well to a high number of nodes (>10000).
<p align="center" text-align="center">
<b>Spatialize a random Geometric Graph</b>
</p>
<p align="center">
<img width="460" height="300" src="https://raw.githubusercontent.com/bhargavchippada/forceatlas2/master/examples/geometric_graph.png" alt="Geometric Graph">
</p>
## Installation
Install from pip:
pip install fa2
To build and install from source, run:
python setup.py install
**Cython is highly recommended if you are building from source, as it will speed up the layout by a factor of 10-100x depending on the graph**
### Dependencies
- numpy (adjacency matrix as complete matrix)
- scipy (adjacency matrix as sparse matrix)
- tqdm (progressbar)
- Cython (10-100x speedup)
- networkx (To use the NetworkX wrapper function, you obviously need NetworkX)
- python-igraph (To use the igraph wrapper)
<p align="center" text-align="center">
<b>Spatialize a 2D Grid</b>
</p>
<p align="center">
<img width="460" height="300" src="https://raw.githubusercontent.com/bhargavchippada/forceatlas2/master/examples/grid_graph.png" alt="Grid Graph">
</p>
## Usage
from fa2 import ForceAtlas2
Create a ForceAtlas2 object with the appropriate settings. ForceAtlas2 class contains three important methods:
```python
forceatlas2 (G, pos, iterations)
# G is a graph in 2D numpy ndarray format (or) scipy sparse matrix format. You can set the edge weights (> 0) in the matrix
# pos is a numpy array (Nx2) of initial positions of nodes
# iterations is num of iterations to run the algorithm
# returns a list of (x,y) pairs for each node's final position
```
```python
forceatlas2_networkx_layout(G, pos, iterations)
# G is a networkx graph. Edge weights can be set (if required) in the Networkx graph
# pos is a dictionary, as in networkx
# iterations is num of iterations to run the algorithm
# returns a dictionary of node positions (2D X-Y tuples) indexed by the node name
```
```python
forceatlas2_igraph_layout(G, pos, iterations, weight_attr)
# G is an igraph graph
# pos is a numpy array (Nx2) or list of initial positions of nodes (see that the indexing matches igraph node index)
# iterations is num of iterations to run the algorithm
# weight_attr denotes the weight attribute's name in G.es, None by default
# returns an igraph layout
```
Below is an example usage. You can also see the feature settings of ForceAtlas2 class.
```python
import networkx as nx
from fa2 import ForceAtlas2
import matplotlib.pyplot as plt
G = nx.random_geometric_graph(400, 0.2)
forceatlas2 = ForceAtlas2(
# Behavior alternatives
outboundAttractionDistribution=True, # Dissuade hubs
linLogMode=False, # NOT IMPLEMENTED
adjustSizes=False, # Prevent overlap (NOT IMPLEMENTED)
edgeWeightInfluence=1.0,
# Performance
jitterTolerance=1.0, # Tolerance
barnesHutOptimize=True,
barnesHutTheta=1.2,
multiThreaded=False, # NOT IMPLEMENTED
# Tuning
scalingRatio=2.0,
strongGravityMode=False,
gravity=1.0,
# Log
verbose=True)
positions = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=2000)
nx.draw_networkx_nodes(G, positions, node_size=20, with_labels=False, node_color="blue", alpha=0.4)
nx.draw_networkx_edges(G, positions, edge_color="green", alpha=0.05)
plt.axis('off')
plt.show()
# equivalently
import igraph
G = igraph.Graph.TupleList(G.edges(), directed=False)
layout = forceatlas2.forceatlas2_igraph_layout(G, pos=None, iterations=2000)
igraph.plot(G, layout).show()
```
You can also take a look at forceatlas2.py file for understanding the ForceAtlas2 class and its functions better.
## Features Completed
- **barnesHutOptimize**: Barnes Hut optimization, n<sup>2</sup> complexity to n.ln(n)
- **gravity**: Attracts nodes to the center. Prevents islands from drifting away
- **Dissuade Hubs**: Distributes attraction along outbound edges. Hubs attract less and thus are pushed to the borders
- **scalingRatio**: How much repulsion you want. More makes a more sparse graph
- **strongGravityMode**: A stronger gravity view
- **jitterTolerance**: How much swinging you allow. Above 1 discouraged. Lower gives less speed and more precision
- **verbose**: Shows a progressbar of iterations completed. Also, shows time taken for different force computations
- **edgeWeightInfluence**: How much influence you give to the edges weight. 0 is "no influence" and 1 is "normal"
## Documentation
You will find all the documentation in the source code
## Contributors
Contributions are highly welcome. Please submit your pull requests and become a collaborator.
## Copyright
Copyright (C) 2017 Bhargav Chippada bhargavchippada19@gmail.com.
Licensed under the GNU GPLv3.
The files are heavily based on the java files included in Gephi, git revision 2b9a7c8 and Max Shinn's port to python of the algorithm. Here I include the copyright information from those files:
Copyright 2008-2011 Gephi
Authors : Mathieu Jacomy <mathieu.jacomy@gmail.com>
Website : http://www.gephi.org
Copyright 2011 Gephi Consortium. All rights reserved.
Portions Copyrighted 2011 Gephi Consortium.
The contents of this file are subject to the terms of either the
GNU General Public License Version 3 only ("GPL") or the Common
Development and Distribution License("CDDL") (collectively, the
"License"). You may not use this file except in compliance with
the License.
<https://github.com/mwshinn/forceatlas2-python>
Copyright 2016 Max Shinn <mws41@cam.ac.uk>
Available under the GPLv3
Also, thanks to Eugene Bosiakov <https://github.com/bosiakov/fa2l>

View file

@ -0,0 +1,5 @@
Package downloaded from https://github.com/bhargavchippada/forceatlas2
forceatlas2.py has been modified and it is different from the original script.
The modification allows for returning all FDG coordinates for each iteration. This is needed for the creation of animated force directed graph.
It is the understanding of the person (Dorin-Mirel Popescu) who modified the published package that forceatlas2 is subject to the terms of GPL version 3, which allow for modifying the original code and publishing the modified version. The original author of forceatlas2 (Mathieu Jacomy) is acknowledged. Furthermore, the modifications within this version do not pertain to the algorithm but only to functionalities that allow for keeping all transient states for the purpose of tracking the evolution of the force directed graph visualised in a video format.

View file

@ -0,0 +1 @@
from .forceatlas2 import *

View file

@ -0,0 +1,122 @@
# Cython optimizations. Cython allows huge speed boosts by giving
# each variable a type. This file is called a "pxd extension file"
# (see the "Pure Python" section of the Cython manual). In essence,
# it provides types for function definitions and then, if cython is
# available, it uses these types to optimize normal python code. It
# is associated with the fa2util.py file.
#
# IF ANY CHANGES ARE MADE TO fa2util.py, THE CHANGES MUST BE REFLECTED
# HERE!!
#
# Copyright (C) 2017 Bhargav Chippada <bhargavchippada19@gmail.com>
#
# Available under the GPLv3
import cython
# Typed attribute layout for fa2util.Node, the per-node record that stands in
# for the nLayout object of the Java code. One attribute per line; all are
# C doubles that remain readable and writable from Python.
cdef class Node:
    cdef public double mass
    cdef public double old_dx
    cdef public double old_dy
    cdef public double dx
    cdef public double dy
    cdef public double x
    cdef public double y
# Typed attribute layout for fa2util.Edge. The class has no counterpart in the
# original Java code; it exists to make edges easier to handle. node1 and
# node2 are integer node indices, weight is a C double.
cdef class Edge:
    cdef public int node1
    cdef public int node2
    cdef public double weight
# Typed signatures for the module-level force functions of fa2util.py.
# `cdef` functions are callable from compiled code only, `cpdef` functions
# from both compiled code and Python. `=*` marks an argument whose default
# value is given in fa2util.py. Each @cython.locals decorator assigns C types
# to the local variables of the function declared directly below it.

# Repulsion function. `n1` and `n2` should be nodes. This will
# adjust the dx and dy values of both `n1` and `n2`. It does
# not return anything.
@cython.locals(xDist = cython.double,
yDist = cython.double,
distance2 = cython.double,
factor = cython.double)
cdef void linRepulsion(Node n1, Node n2, double coefficient=*)
# Repulsion of node `n` by the mass centre of region `r`; adjusts `n` only.
@cython.locals(xDist = cython.double,
yDist = cython.double,
distance2 = cython.double,
factor = cython.double)
cdef void linRepulsion_region(Node n, Region r, double coefficient=*)
# Gravity: pulls node `n` towards the origin with constant `g`.
@cython.locals(xDist = cython.double,
yDist = cython.double,
distance = cython.double,
factor = cython.double)
cdef void linGravity(Node n, double g)
# Strong gravity variant of the above.
@cython.locals(xDist = cython.double,
yDist = cython.double,
factor = cython.double)
cdef void strongGravity(Node n, double g, double coefficient=*)
# Attraction between the two end nodes of an edge; `e` is the edge weight
# term computed by apply_attraction.
@cython.locals(xDist = cython.double,
yDist = cython.double,
factor = cython.double)
cpdef void linAttraction(Node n1, Node n2, double e, bint distributedAttraction, double coefficient=*)
# Loops over nodes / edges that apply the forces above to the node objects.
@cython.locals(i = cython.int,
j = cython.int,
n1 = Node,
n2 = Node)
cpdef void apply_repulsion(list nodes, double coefficient)
@cython.locals(n = Node)
cpdef void apply_gravity(list nodes, double gravity, bint useStrongGravity=*)
@cython.locals(edge = Edge)
cpdef void apply_attraction(list nodes, list edges, bint distributedAttraction, double coefficient, double edgeWeightInfluence)
# Typed layout for fa2util.Region, the tree node used by the Barnes Hut
# optimization. Attributes are public so they stay reachable from Python.
cdef class Region:
    cdef public double mass
    cdef public double massCenterX
    cdef public double massCenterY
    cdef public double size
    cdef public list nodes
    cdef public list subregions

    # Recomputes mass, mass centre and size from self.nodes.
    @cython.locals(massSumX = cython.double,
                   massSumY = cython.double,
                   n = Node,
                   distance = cython.double)
    cdef void updateMassAndGeometry(self)

    # Splits the region into quadrant subregions, recursively.
    @cython.locals(n = Node,
                   leftNodes = list,
                   rightNodes = list,
                   topleftNodes = list,
                   bottomleftNodes = list,
                   toprightNodes = list,
                   bottomrightNodes = list,
                   subregion = Region)
    cpdef void buildSubRegions(self)

    # Applies the repulsion of this region (or of its subregions) to node n.
    @cython.locals(distance = cython.double,
                   subregion = Region)
    cdef void applyForce(self, Node n, double theta, double coefficient=*)

    # Calls applyForce for every node of the list.
    @cython.locals(n = Node)
    cpdef applyForceOnNodes(self, list nodes, double theta, double coefficient=*)
# Typed signature of fa2util.adjustSpeedAndApplyForces. Every local is listed
# once (totalSwinging and totalEffectiveTraction used to appear twice in this
# decorator). Returns a dict with the keys 'speed' and 'speedEfficiency'.
@cython.locals(n = Node,
               swinging = cython.double,
               totalSwinging = cython.double,
               totalEffectiveTraction = cython.double,
               estimatedOptimalJitterTolerance = cython.double,
               minJT = cython.double,
               maxJT = cython.double,
               jt = cython.double,
               minSpeedEfficiency = cython.double,
               targetSpeed = cython.double,
               maxRise = cython.double,
               factor = cython.double,
               values = dict)
cpdef dict adjustSpeedAndApplyForces(list nodes, double speed, double speedEfficiency, double jitterTolerance)

View file

@ -0,0 +1,326 @@
# This file allows separating the most CPU intensive routines from the
# main code. This allows them to be optimized with Cython. If you
# don't have Cython, this will run normally. However, if you use
# Cython, you'll get speed boosts from 10-100x automatically.
#
# THE ONLY CATCH IS THAT IF YOU MODIFY THIS FILE, YOU MUST ALSO MODIFY
# fa2util.pxd TO REFLECT ANY CHANGES IN FUNCTION DEFINITIONS!
#
# Copyright (C) 2017 Bhargav Chippada <bhargavchippada19@gmail.com>
#
# Available under the GPLv3
from math import sqrt
class Node:
    """Layout state of one graph node (substitute for the Java nLayout object).

    All attributes start at 0.0: ``mass``, the position ``x``/``y``, the
    displacement accumulated for the current step ``dx``/``dy`` and the
    displacement of the previous step ``old_dx``/``old_dy``.
    """

    def __init__(self):
        self.mass = 0.0
        self.old_dx = self.old_dy = 0.0
        self.dx = self.dy = 0.0
        self.x = self.y = 0.0
class Edge:
    """Edge between two nodes, referenced by their indices in the node list.

    Not part of the original Java code; it only makes edges easier to handle.
    A new edge has ``node1 == node2 == -1`` and ``weight == 0.0``.
    """

    def __init__(self):
        self.node1 = self.node2 = -1
        self.weight = 0.0
# Here are some functions from ForceFactory.java
# =============================================
def linRepulsion(n1, n2, coefficient=0):
    """Apply the mutual repulsion between nodes `n1` and `n2`.

    The force is ``coefficient * n1.mass * n2.mass / distance**2`` times the
    offset between the nodes; it is added to `n1.dx`/`n1.dy` and subtracted
    from `n2.dx`/`n2.dy`. Nodes at the same position are left untouched.
    Returns nothing.
    """
    xDist = n1.x - n2.x
    yDist = n1.y - n2.y
    distance2 = xDist * xDist + yDist * yDist  # Distance squared
    if not distance2 > 0:
        return

    factor = coefficient * n1.mass * n2.mass / distance2
    pushX = xDist * factor
    pushY = yDist * factor
    n1.dx += pushX
    n1.dy += pushY
    n2.dx -= pushX
    n2.dy -= pushY
def linRepulsion_region(n, r, coefficient=0):
    """Apply the repulsion of region `r` to node `n`.

    The region acts like a single node of mass `r.mass` located at its mass
    centre. Only `n.dx`/`n.dy` are changed; nothing happens when the node
    sits exactly on the mass centre. Returns nothing.
    """
    xDist = n.x - r.massCenterX
    yDist = n.y - r.massCenterY
    distance2 = xDist * xDist + yDist * yDist
    if not distance2 > 0:
        return

    factor = coefficient * n.mass * r.mass / distance2
    n.dx += xDist * factor
    n.dy += yDist * factor
# In the original gephi java code gravity lived inside the linRepulsion
# function. It is kept separate here because gravity has nothing to do with
# nodes repelling each other and is in fact an attraction.
def linGravity(n, g):
    """Pull node `n` towards the origin.

    The pull has magnitude ``n.mass * g`` regardless of the distance to the
    origin and is subtracted from `n.dx`/`n.dy`. A node at the origin is left
    untouched. Returns nothing.
    """
    xDist = n.x
    yDist = n.y
    distance = sqrt(xDist * xDist + yDist * yDist)
    if not distance > 0:
        return

    factor = n.mass * g / distance
    n.dx -= xDist * factor
    n.dy -= yDist * factor
def strongGravity(n, g, coefficient=0):
    """Pull node `n` towards the origin with a force that grows with distance.

    `n` should be a node and `g` the gravity constant. ``coefficient *
    n.mass * g`` times the node position is subtracted from `n.dx`/`n.dy`.
    Returns nothing.

    Only a node located exactly at the origin is skipped. The test used to be
    ``xDist != 0 and yDist != 0``, which also skipped every node lying on one
    of the two axes.
    """
    xDist = n.x
    yDist = n.y

    if xDist != 0 or yDist != 0:
        factor = coefficient * n.mass * g
        n.dx -= xDist * factor
        n.dy -= yDist * factor
def linAttraction(n1, n2, e, distributedAttraction, coefficient=0):
    """Apply the attraction between nodes `n1` and `n2`.

    `e` is the edge weight term. The attraction is ``coefficient * e`` times
    the offset between the nodes, divided by `n1.mass` when
    `distributedAttraction` is set. `n1` is moved towards `n2` and `n2`
    towards `n1` by adjusting their dx and dy values. Returns nothing.
    """
    xDist = n1.x - n2.x
    yDist = n1.y - n2.y
    factor = -coefficient * e / n1.mass if distributedAttraction else -coefficient * e

    n1.dx += xDist * factor
    n2.dx -= xDist * factor
    n1.dy += yDist * factor
    n2.dy -= yDist * factor
# The functions below walk over the nodes or edges and apply the forces
# directly to the node objects. The loops live in this module rather than in
# the main file because Python is slow with loops.
def apply_repulsion(nodes, coefficient):
    """Apply linRepulsion once to every unordered pair of nodes.

    Each node is paired with all nodes that precede it in `nodes`, in list
    order. Returns nothing.
    """
    for i, n1 in enumerate(nodes):
        for j in range(i):
            n2 = nodes[j]
            linRepulsion(n1, n2, coefficient)
def apply_gravity(nodes, gravity, useStrongGravity=False):
    """Apply gravity of strength `gravity` to every node in `nodes`.

    Uses linGravity by default and strongGravity when `useStrongGravity` is
    true. Returns nothing.

    strongGravity is called with an explicit coefficient of 1.0. Without it
    the function's default coefficient of 0 applied, the force factor became
    0 and strong gravity mode had no effect at all. With 1.0 the strong pull
    is ``n.mass * gravity`` times the node position, mirroring linGravity,
    which also uses `gravity` unscaled.
    """
    if not useStrongGravity:
        for n in nodes:
            linGravity(n, gravity)
    else:
        for n in nodes:
            strongGravity(n, gravity, 1.0)
def apply_attraction(nodes, edges, distributedAttraction, coefficient, edgeWeightInfluence):
    """Apply linAttraction to the two end nodes of every edge.

    The weight term handed to linAttraction is ``edge.weight`` raised to
    `edgeWeightInfluence`. Returns nothing.
    """
    # The influence is usually 0 or 1 and pow is slow, so those two cases
    # get their own loop without a pow call.
    if edgeWeightInfluence == 0:
        for edge in edges:
            linAttraction(nodes[edge.node1], nodes[edge.node2], 1, distributedAttraction, coefficient)
        return

    if edgeWeightInfluence == 1:
        for edge in edges:
            linAttraction(nodes[edge.node1], nodes[edge.node2], edge.weight, distributedAttraction, coefficient)
        return

    for edge in edges:
        linAttraction(nodes[edge.node1], nodes[edge.node2],
                      pow(edge.weight, edgeWeightInfluence),
                      distributedAttraction, coefficient)
# For Barnes Hut Optimization
class Region:
    """A group of nodes treated as a single mass when it is far enough away.

    `nodes` is the list of nodes inside the region. `mass`, `massCenterX`,
    `massCenterY` and `size` are computed from it for regions of two or more
    nodes and stay at 0.0 otherwise. `subregions` is filled by
    buildSubRegions.
    """

    def __init__(self, nodes):
        self.mass = 0.0
        self.massCenterX = 0.0
        self.massCenterY = 0.0
        self.size = 0.0
        self.nodes = nodes
        self.subregions = []
        self.updateMassAndGeometry()

    def updateMassAndGeometry(self):
        """Recompute mass, mass centre and size; no-op for fewer than two nodes."""
        if len(self.nodes) <= 1:
            return

        self.mass = 0
        massSumX = 0
        massSumY = 0
        for n in self.nodes:
            self.mass += n.mass
            massSumX += n.x * n.mass
            massSumY += n.y * n.mass
        self.massCenterX = massSumX / self.mass
        self.massCenterY = massSumY / self.mass

        # size = twice the largest distance of a node from the mass centre
        self.size = 0.0
        for n in self.nodes:
            distance = sqrt((n.x - self.massCenterX) ** 2 + (n.y - self.massCenterY) ** 2)
            if 2 * distance > self.size:
                self.size = 2 * distance

    def buildSubRegions(self):
        """Split the region at its mass centre into up to four subregions, recursively."""
        if len(self.nodes) <= 1:
            return

        leftNodes = []
        rightNodes = []
        for n in self.nodes:
            (leftNodes if n.x < self.massCenterX else rightNodes).append(n)

        topleftNodes = []
        bottomleftNodes = []
        for n in leftNodes:
            (topleftNodes if n.y < self.massCenterY else bottomleftNodes).append(n)

        toprightNodes = []
        bottomrightNodes = []
        for n in rightNodes:
            (toprightNodes if n.y < self.massCenterY else bottomrightNodes).append(n)

        for quadrant in (topleftNodes, bottomleftNodes, toprightNodes, bottomrightNodes):
            if len(quadrant) == 0:
                continue
            if len(quadrant) < len(self.nodes):
                self.subregions.append(Region(quadrant))
            else:
                # The quadrant holds every node of this region, so splitting
                # made no progress: give each node a region of its own.
                for n in quadrant:
                    self.subregions.append(Region([n]))

        for subregion in self.subregions:
            subregion.buildSubRegions()

    def applyForce(self, n, theta, coefficient=0):
        """Apply the repulsion exerted by this region to node `n`.

        A region of fewer than two nodes repels through its single node. A
        larger region acts as one mass when ``distance * theta > size`` and
        otherwise delegates to its subregions.
        """
        if len(self.nodes) < 2:
            linRepulsion(n, self.nodes[0], coefficient)
            return

        distance = sqrt((n.x - self.massCenterX) ** 2 + (n.y - self.massCenterY) ** 2)
        if distance * theta > self.size:
            linRepulsion_region(n, self, coefficient)
            return

        for subregion in self.subregions:
            subregion.applyForce(n, theta, coefficient)

    def applyForceOnNodes(self, nodes, theta, coefficient=0):
        """Call applyForce for every node in `nodes`."""
        for n in nodes:
            self.applyForce(n, theta, coefficient)
# Adjust speed and apply forces step
def adjustSpeedAndApplyForces(nodes, speed, speedEfficiency, jitterTolerance):
    """Adapt the global speed to the current forces and move every node.

    Reads `mass`, `dx`, `dy`, `old_dx` and `old_dy` of each node in `nodes`
    and updates its `x` and `y`. Returns a dict with the new 'speed' and
    'speedEfficiency'.

    An empty `nodes` list returns both values unchanged, and a total traction
    of zero no longer raises ZeroDivisionError.
    """
    values = {}

    # Nothing to move; the computations below would divide by len(nodes).
    if not nodes:
        values['speed'] = speed
        values['speedEfficiency'] = speedEfficiency
        return values

    # Auto adjust speed.
    totalSwinging = 0.0  # How much irregular movement
    totalEffectiveTraction = 0.0  # How much useful movement
    for n in nodes:
        swinging = sqrt((n.old_dx - n.dx) * (n.old_dx - n.dx) + (n.old_dy - n.dy) * (n.old_dy - n.dy))
        totalSwinging += n.mass * swinging
        totalEffectiveTraction += .5 * n.mass * sqrt(
            (n.old_dx + n.dx) * (n.old_dx + n.dx) + (n.old_dy + n.dy) * (n.old_dy + n.dy))

    # Optimize jitter tolerance. The 'right' jitter tolerance for
    # this network. Bigger networks need more tolerance. Denser
    # networks need less tolerance. Totally empiric.
    estimatedOptimalJitterTolerance = .05 * sqrt(len(nodes))
    minJT = sqrt(estimatedOptimalJitterTolerance)
    maxJT = 10
    jt = jitterTolerance * max(minJT,
                               min(maxJT, estimatedOptimalJitterTolerance * totalEffectiveTraction / (
                                   len(nodes) * len(nodes))))

    minSpeedEfficiency = 0.05

    # Protective against erratic behavior.
    # With zero traction the swinging/traction ratio is undefined (no movement
    # at all) or infinite (pure swinging); only the latter counts as erratic.
    # This matches what the floating point division of the Java code yields.
    if totalEffectiveTraction > 0:
        erratic = totalSwinging / totalEffectiveTraction > 2.0
    else:
        erratic = totalSwinging > 0
    if erratic:
        if speedEfficiency > minSpeedEfficiency:
            speedEfficiency *= .5
        jt = max(jt, jitterTolerance)

    if totalSwinging == 0:
        targetSpeed = float('inf')
    else:
        targetSpeed = jt * speedEfficiency * totalEffectiveTraction / totalSwinging

    if totalSwinging > jt * totalEffectiveTraction:
        if speedEfficiency > minSpeedEfficiency:
            speedEfficiency *= .7
    elif speed < 1000:
        speedEfficiency *= 1.3

    # But the speed shouldn't rise too much too quickly, since it would
    # make the convergence drop dramatically.
    maxRise = .5
    speed = speed + min(targetSpeed - speed, maxRise * speed)

    # Apply forces.
    #
    # Need to add a case if adjustSizes ("prevent overlap") is
    # implemented.
    for n in nodes:
        swinging = n.mass * sqrt((n.old_dx - n.dx) * (n.old_dx - n.dx) + (n.old_dy - n.dy) * (n.old_dy - n.dy))
        factor = speed / (1.0 + sqrt(speed * swinging))
        n.x = n.x + (n.dx * factor)
        n.y = n.y + (n.dy * factor)

    values['speed'] = speed
    values['speedEfficiency'] = speedEfficiency
    return values
# Tell the user at import time whether this module runs as compiled Cython
# code. Only a missing cython package is handled; the former bare `except:`
# also swallowed KeyboardInterrupt and any unrelated error.
try:
    import cython
except ImportError:
    print("No cython detected.  Install cython and compile the fa2util module for a 10-100x speed boost.")
else:
    if not cython.compiled:
        print("Warning: uncompiled fa2util module.  Compile with cython for a 10-100x speed boost.")

View file

@ -0,0 +1,250 @@
# This is the fastest python implementation of the ForceAtlas2 plugin from Gephi
# intended to be used with networkx, but is in theory independent of
# it since it only relies on the adjacency matrix. This
# implementation is based directly on the Gephi plugin:
#
# https://github.com/gephi/gephi/blob/master/modules/LayoutPlugin/src/main/java/org/gephi/layout/plugin/forceAtlas2/ForceAtlas2.java
#
# For simplicity and for keeping code in sync with upstream, I have
# reused as many of the variable/function names as possible, even when
# they are in a more java-like style (e.g. camelcase)
#
# I wrote this because I wanted an almost feature complete and fast implementation
# of ForceAtlas2 algorithm in python
#
# NOTES: Currently, this only works for weighted undirected graphs.
#
# Copyright (C) 2017 Bhargav Chippada <bhargavchippada19@gmail.com>
#
# Available under the GPLv3
import random
import time
import numpy as np
import numpy
import scipy
from tqdm import tqdm
from . import fa2util
class Timer:
    """Stopwatch that accumulates wall-clock time over start()/stop() pairs."""

    def __init__(self, name="Timer"):
        self.name = name
        self.start_time = 0.0
        self.total_time = 0.0

    def start(self):
        """Remember the current time as the start of a measured interval."""
        self.start_time = time.time()

    def stop(self):
        """Add the time elapsed since the last start() to the running total."""
        elapsed = time.time() - self.start_time
        self.total_time += elapsed

    def display(self):
        """Print the accumulated time in seconds with two decimals."""
        print(self.name, " took ", "%.2f" % self.total_time, " seconds")
class ForceAtlas2:
    """ForceAtlas2 layout engine (Python port of Gephi's implementation).

    Configure the layout through the constructor, then call
    ``forceatlas2`` (adjacency-matrix input) or
    ``forceatlas2_networkx_layout`` (networkx graph input).

    In addition to returning the final positions, every iteration of the
    main loop appends an ``(n_nodes, 2)`` array with the current coordinates
    to ``self.dataContainer``.  The container is created once in
    ``__init__`` and is never cleared, so running several layouts on the
    same instance accumulates the frames of all runs.
    """

    def __init__(self,
                 # Behavior alternatives
                 outboundAttractionDistribution=False,  # Dissuade hubs
                 linLogMode=False,  # NOT IMPLEMENTED
                 adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                 edgeWeightInfluence=1.0,

                 # Performance
                 jitterTolerance=1.0,  # Tolerance
                 barnesHutOptimize=True,
                 barnesHutTheta=1.2,
                 multiThreaded=False,  # NOT IMPLEMENTED

                 # Tuning
                 scalingRatio=2.0,
                 strongGravityMode=False,
                 gravity=1.0,

                 # Log
                 verbose=True):
        """Store the layout settings.

        Raises AssertionError if any of the not-yet-implemented features
        (``linLogMode``, ``adjustSizes``, ``multiThreaded``) is switched on.
        """
        assert linLogMode == adjustSizes == multiThreaded == False, "You selected a feature that has not been implemented yet..."
        self.outboundAttractionDistribution = outboundAttractionDistribution
        self.linLogMode = linLogMode
        self.adjustSizes = adjustSizes
        self.edgeWeightInfluence = edgeWeightInfluence
        self.jitterTolerance = jitterTolerance
        self.barnesHutOptimize = barnesHutOptimize
        self.barnesHutTheta = barnesHutTheta
        self.scalingRatio = scalingRatio
        self.strongGravityMode = strongGravityMode
        self.gravity = gravity
        self.verbose = verbose
        # One (n_nodes, 2) coordinate array per iteration; see class docstring.
        self.dataContainer = []

    def init(self,
             G,  # a graph in 2D numpy ndarray format (or) scipy sparse matrix format
             pos=None  # Array of initial positions
             ):
        """Convert an adjacency matrix into ``fa2util`` nodes and edges.

        ``G`` must be a square numpy ndarray (which must also be symmetric)
        or a square scipy sparse matrix (symmetry is not checked for sparse
        input).  ``pos`` is either None, in which case every node starts at
        a random position in [0, 1) x [0, 1), or a numpy array whose row
        ``i`` holds the initial (x, y) of node ``i``.

        Returns ``(nodes, edges)``.  Only entries above the diagonal become
        edges, so each undirected edge appears once and self-loops are
        dropped.  Raises AssertionError when an input check fails.
        """
        # A bare `import scipy` does not guarantee that the `scipy.sparse`
        # submodule has been loaded, so import it explicitly before use.
        import scipy.sparse

        isSparse = False
        if isinstance(G, numpy.ndarray):
            # Check our assumptions
            assert G.shape == (G.shape[0], G.shape[0]), "G is not 2D square"
            assert numpy.all(G.T == G), "G is not symmetric.  Currently only undirected graphs are supported"
            assert isinstance(pos, numpy.ndarray) or (pos is None), "Invalid node positions"
        elif scipy.sparse.issparse(G):
            # Check our assumptions
            assert G.shape == (G.shape[0], G.shape[0]), "G is not 2D square"
            assert isinstance(pos, numpy.ndarray) or (pos is None), "Invalid node positions"
            # LIL format exposes the per-row column lists (`G.rows`) used below.
            G = G.tolil()
            isSparse = True
        else:
            assert False, "G is not numpy ndarray or scipy sparse matrix"

        # Put nodes into a data structure we can understand
        nodes = []
        for i in range(0, G.shape[0]):
            n = fa2util.Node()
            # Mass is one plus the number of entries in the node's row.
            if isSparse:
                n.mass = 1 + len(G.rows[i])
            else:
                n.mass = 1 + numpy.count_nonzero(G[i])
            n.old_dx = 0
            n.old_dy = 0
            n.dx = 0
            n.dy = 0
            if pos is None:
                n.x = random.random()
                n.y = random.random()
            else:
                n.x = pos[i][0]
                n.y = pos[i][1]
            nodes.append(n)

        # Put edges into a data structure we can understand
        edges = []
        es = numpy.asarray(G.nonzero()).T
        for e in es:  # Iterate through edges
            if e[1] <= e[0]:
                continue  # Avoid duplicate edges
            edge = fa2util.Edge()
            edge.node1 = e[0]  # The index of the first node in `nodes`
            edge.node2 = e[1]  # The index of the second node in `nodes`
            edge.weight = G[tuple(e)]
            edges.append(edge)

        return nodes, edges

    # Given an adjacency matrix, this function computes the node positions
    # according to the ForceAtlas2 layout algorithm.  It takes the same
    # arguments that one would give to the ForceAtlas2 algorithm in Gephi.
    # Not all of them are implemented.  See below for a description of
    # each parameter and whether or not it has been implemented.
    #
    # This function will return a list of X-Y coordinate tuples, ordered
    # in the same way as the rows/columns in the input matrix.
    #
    # The only reason you would want to run this directly is if you don't
    # use networkx.  In this case, you'll likely need to convert the
    # output to a more usable format.  If you do use networkx, use the
    # "forceatlas2_networkx_layout" function below.
    #
    # Currently, only undirected graphs are supported so the adjacency matrix
    # should be symmetric.
    def forceatlas2(self,
                    G,  # a graph in 2D numpy ndarray format (or) scipy sparse matrix format
                    pos=None,  # Array of initial positions
                    iterations=100  # Number of times to iterate the main loop
                    ):
        """Run the layout and return a list of ``(x, y)`` tuples, one per node.

        As a side effect the coordinates after each iteration are appended
        to ``self.dataContainer``.
        """
        # Initializing, initAlgo()
        # ================================================================

        # speed and speedEfficiency describe a scaling factor of dx and dy
        # before x and y are adjusted.  These are modified as the
        # algorithm runs to help ensure convergence.
        speed = 1.0
        speedEfficiency = 1.0
        nodes, edges = self.init(G, pos)
        outboundAttCompensation = 1.0
        if self.outboundAttractionDistribution:
            outboundAttCompensation = numpy.mean([n.mass for n in nodes])
        # ================================================================

        # Main loop, i.e. goAlgo()
        # ================================================================

        barneshut_timer = Timer(name="BarnesHut Approximation")
        repulsion_timer = Timer(name="Repulsion forces")
        gravity_timer = Timer(name="Gravitational forces")
        attraction_timer = Timer(name="Attraction forces")
        applyforces_timer = Timer(name="AdjustSpeedAndApplyForces step")

        # Each iteration of this loop represents a call to goAlgo().
        niters = range(iterations)
        if self.verbose:
            niters = tqdm(niters)
        for _i in niters:
            # Keep the previous displacement and start this step from zero.
            for n in nodes:
                n.old_dx = n.dx
                n.old_dy = n.dy
                n.dx = 0
                n.dy = 0

            # Barnes Hut optimization
            if self.barnesHutOptimize:
                barneshut_timer.start()
                rootRegion = fa2util.Region(nodes)
                rootRegion.buildSubRegions()
                barneshut_timer.stop()

            # Charge repulsion forces
            repulsion_timer.start()
            # parallelization should be implemented here
            if self.barnesHutOptimize:
                rootRegion.applyForceOnNodes(nodes, self.barnesHutTheta, self.scalingRatio)
            else:
                fa2util.apply_repulsion(nodes, self.scalingRatio)
            repulsion_timer.stop()

            # Gravitational forces
            gravity_timer.start()
            fa2util.apply_gravity(nodes, self.gravity, useStrongGravity=self.strongGravityMode)
            gravity_timer.stop()

            # If other forms of attraction were implemented they would be selected here.
            attraction_timer.start()
            fa2util.apply_attraction(nodes, edges, self.outboundAttractionDistribution, outboundAttCompensation,
                                     self.edgeWeightInfluence)
            attraction_timer.stop()

            # Adjust speeds and apply forces
            applyforces_timer.start()
            values = fa2util.adjustSpeedAndApplyForces(nodes, speed, speedEfficiency, self.jitterTolerance)
            speed = values['speed']
            speedEfficiency = values['speedEfficiency']
            applyforces_timer.stop()

            # Record this iteration's layout as one animation frame.
            self.dataContainer.append(np.array([(n.x, n.y) for n in nodes]))

        if self.verbose:
            if self.barnesHutOptimize:
                barneshut_timer.display()
            repulsion_timer.display()
            gravity_timer.display()
            attraction_timer.display()
            applyforces_timer.display()
        # ================================================================
        return [(n.x, n.y) for n in nodes]

    # A layout for NetworkX.
    #
    # This function returns a NetworkX layout, which is really just a
    # dictionary of node positions (2D X-Y tuples) indexed by the node name.
    def forceatlas2_networkx_layout(self, G, pos=None, iterations=100):
        """Lay out a networkx graph; returns ``{node: (x, y)}``.

        ``pos``, when given, must be a dict mapping every node of ``G`` to
        its initial position.
        """
        import networkx
        assert isinstance(G, networkx.classes.graph.Graph), "Not a networkx graph"
        assert isinstance(pos, dict) or (pos is None), "pos must be specified as a dictionary, as in networkx"
        # networkx 3.0 removed to_scipy_sparse_matrix in favour of
        # to_scipy_sparse_array; use whichever this installation provides.
        to_sparse = getattr(networkx, "to_scipy_sparse_matrix", None)
        if to_sparse is None:
            to_sparse = networkx.to_scipy_sparse_array
        M = to_sparse(G, dtype='f', format='lil')
        if pos is None:
            l = self.forceatlas2(M, pos=None, iterations=iterations)
        else:
            poslist = numpy.asarray([pos[i] for i in G.nodes()])
            l = self.forceatlas2(M, pos=poslist, iterations=iterations)
        return dict(zip(G.nodes(), l))

View file

@ -0,0 +1,75 @@
# Build script for the `fa2` package.
#
# Three build modes are selected below, in order of preference:
#   1. Cython is importable  -> compile fa2/fa2util.py (+ .pxd) with Cython.
#   2. fa2/fa2util.c exists  -> compile the pre-generated C file.
#   3. otherwise             -> install fa2util as a plain Python module.
from codecs import open
from os import path

from setuptools import setup

print("Installing fa2 package (fastest forceatlas2 python implementation)\n")

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file.
# The encoding is given explicitly so the read does not depend on the
# locale's default encoding (codecs.open accepts `encoding` on Python 2 and 3).
with open(path.join(here, 'README.md'), 'r', encoding='utf-8') as f:
    long_description = f.read()

print(">>>> Cython is installed?")
try:
    from Cython.Distutils import Extension
    from Cython.Build import build_ext
    USE_CYTHON = True
    print('Yes\n')
except ImportError:
    from setuptools.extension import Extension
    USE_CYTHON = False
    print('Cython is not installed; using pre-generated C files if available')
    print('Please install Cython first and try again if you face any installation problems\n')
    print(">>>> Are pre-generated C files available?")

if USE_CYTHON:
    ext_modules = [Extension('fa2.fa2util', ['fa2/fa2util.py', 'fa2/fa2util.pxd'], cython_directives={'language_level' : 3})]
    cmdclass = {'build_ext': build_ext}
    opts = {"ext_modules": ext_modules, "cmdclass": cmdclass}
elif path.isfile(path.join(here, 'fa2/fa2util.c')):
    print("Yes\n")
    ext_modules = [Extension('fa2.fa2util', ['fa2/fa2util.c'])]
    cmdclass = {}
    opts = {"ext_modules": ext_modules, "cmdclass": cmdclass}
else:
    print("Pre-generated C files are not available. This library will be slow without Cython optimizations.\n")
    opts = {"py_modules": ["fa2.fa2util"]}

# Uncomment the following line if you want to install without optimizations
# opts = {"py_modules": ["fa2.fa2util"]}

print(">>>> Starting to install!\n")

setup(
    name='fa2',
    version='0.3.5',
    description='The fastest ForceAtlas2 algorithm for Python (and NetworkX)',
    long_description_content_type='text/markdown',
    long_description=long_description,
    author='Bhargav Chippada',
    author_email='bhargavchippada19@gmail.com',
    url='https://github.com/bhargavchippada/forceatlas2',
    download_url='https://github.com/bhargavchippada/forceatlas2/archive/v0.3.5.tar.gz',
    keywords=['forceatlas2', 'networkx', 'force-directed-graph', 'force-layout', 'graph'],
    packages=['fa2'],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Mathematics',
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 3'
    ],
    install_requires=['numpy', 'scipy', 'tqdm'],
    extras_require={
        'networkx': ['networkx'],
        'igraph': ['python-igraph']
    },
    include_package_data=True,
    **opts
)

View file

@ -0,0 +1,108 @@
# Render a ForceAtlas2 layout as a video.
#
# Pipeline:
#   1. run ForceAtlas2 on the SNN graph, starting from the first two PCs;
#   2. dump the node coordinates of every iteration to ./input/buffers/<i>.csv;
#   3. call make_plots.R, which draws one PNG per buffer into ./input/frames;
#   4. stitch the PNGs (with the legend pasted on the right) into fdg.mp4.
import os
import subprocess
from os import mkdir
from os.path import exists
from shutil import rmtree

import cv2
import numpy as np
import pandas as pd
from scipy.io import mmread

from fa2 import ForceAtlas2

# smaller steps by:
#  - decrease barnesHutOptimize
#    NOTE(review): barnesHutOptimize is a boolean; presumably barnesHutTheta
#    is meant here - confirm.
#  - decrease gravity

# number of frames
frames = 2000

# load pca, SNN and label colours data
# the first 2 PCs from PCA are used as initial conditions
# SNN is used for building the force directed graph
pca_data = pd.read_csv("./input/pca.csv", index_col=0)
# `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the single
# remaining column afterwards gives the same Series on old and new pandas.
labels_col = pd.read_csv("./input/label_colours.csv", index_col=0).squeeze("columns")
snn = mmread("./input/SNN.smm")

# set initial position as the first 2 PCs
positions = pca_data.values[:, 0:2]

# initialize force directed graph class instance
forceatlas2 = ForceAtlas2(outboundAttractionDistribution=False, linLogMode=False,
                          adjustSizes=False, edgeWeightInfluence=1.0,
                          jitterTolerance=1.0, barnesHutTheta=.8,
                          barnesHutOptimize=True, multiThreaded=False,
                          scalingRatio=2.0, strongGravityMode=True, gravity=1, verbose=True)

# run force directed graph; each iteration generates the coordinates used in one frame.
# The return value (final layout) is not needed: the per-iteration layouts
# are read from forceatlas2.dataContainer below.
discard = forceatlas2.forceatlas2(G=snn, pos=positions, iterations=frames)

# start from empty buffer/frame folders
if exists("./input/buffers"):
    rmtree("./input/buffers")
if exists("./input/frames"):
    rmtree("./input/frames")
mkdir("./input/buffers")
mkdir("./input/frames")

for index, positions in enumerate(forceatlas2.dataContainer):
    fname = "./input/buffers/{index}.csv".format(index=index)
    np.savetxt(fname, positions, delimiter=",")
    print("Saving buffer: {index}".format(index=index))

# run R
# With an argument list, shell=True on POSIX passes only "Rscript" to the
# shell and drops the script name, so run the command directly.  check_call
# stops here if plotting fails instead of failing later on missing frames.
subprocess.check_call(["Rscript", "make_plots.R"])


# assemble the frames into a video
def sortImages(imgPath):
    """Sort key: the integer frame index encoded in the file name ("12.png" -> 12)."""
    return int(os.path.splitext(imgPath)[0])


# Arguments
dir_path = './input/frames'
ext = "png"
output = "fdg.mp4"

images = sorted((f for f in os.listdir(dir_path) if f.endswith(ext)), key=sortImages)
if not images:
    raise SystemExit("No {} frames found in {}".format(ext, dir_path))

# Load the legend, trim 10 px off the bottom and the left, and shrink it to 80 %.
legend = cv2.imread("./input/legend.png")
lH, lW, chs = legend.shape
legend = legend[0:(lH-10), 10:lW]
legend = cv2.resize(legend, (0, 0), fx=.8, fy=.8)
lH, lW, chs = legend.shape

# Determine the width and height from the first image
image_path = os.path.join(dir_path, images[0])
frame = cv2.imread(image_path)
cv2.imshow('video', frame)
height, width, channels = frame.shape

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Be sure to use lower case
# The writer's frame size must equal the size of the frames written below
# (plot width plus legend width); it was previously hard-coded to width+792.
out = cv2.VideoWriter(output, fourcc, 30.0, (width + lW, height))

for image in images:
    image_path = os.path.join(dir_path, image)
    frame = cv2.imread(image_path)
    frame = cv2.resize(frame, (width, height))
    lh1 = width + lW
    # Extend the frame to the right with a black strip and paste the legend
    # into the top of that strip.
    template = np.zeros((height, lW, 3), dtype=frame.dtype)
    frame = np.hstack((frame, template))
    frame[0:lH, width:lh1, :] = legend
    out.write(frame)  # Write out frame to video
    print(image)

# Release everything if job is finished
out.release()
cv2.destroyAllWindows()

print("The output video is {}".format(output))

View file

@ -0,0 +1,42 @@
# Draw one PNG frame per coordinate buffer written by the Python driver.
# Reads  ./input/buffers/<i>.csv (x,y per cell, no header) and
#        ./input/label_colours.csv (column LabelCols: one colour per cell);
# writes ./input/frames/<i>.png.
# NOTE(review): the working directory is hard-coded to the author's machine;
# adjust it when running elsewhere.
setwd("~/Documents/MyTools/force_abstract_graph_2Danimation/")
buffers.addrs <- list.files("./input/buffers/", full.names=T)
data.colours  <- as.vector(read.csv("./input/label_colours.csv")$LabelCols)

################################################################################################################
################################################################################################################
################################################################################################################

library(RColorBrewer)
library(dplyr)
library(plyr)
library(Seurat)
# ggplot2 functions are used below; attach it explicitly instead of relying
# on another package attaching it.
library(ggplot2)

################################################################################################################
################################################################################################################
################################################################################################################

# seq_along() yields no iterations for an empty folder (1:length() would yield 1 and 0)
for(k in seq_along(buffers.addrs)){
  buffer.addr <- buffers.addrs[k]
  print(sprintf("Plotting frame %d", k))
  buffer.data <- read.csv(buffer.addr, header = F)
  buffer.data <- cbind(buffer.data, data.colours)
  colnames(buffer.data) <- c("FDGX", "FDGY", "Colours")
  # axis limits: 1st-99th percentile of the coordinates plus a fixed margin
  limitX <- quantile(buffer.data$FDGX, c(.01, .99)) + c(-15000, 15000)
  limitY <- 1.1 * quantile(buffer.data$FDGY, c(.01, .99)) + c(-15000, 15000)
  plot.obj <- ggplot(data=buffer.data, aes(x = FDGX, y = FDGY))
  plot.obj <- plot.obj + geom_point(show.legend=F, size = 1.5, color = as.vector(buffer.data$Colours))
  plot.obj <- plot.obj + scale_color_manual(values=as.vector(buffer.data$Colours))
  plot.obj <- plot.obj + theme(plot.background = element_rect(fill = "black"))
  plot.obj <- plot.obj + scale_x_continuous(limits = limitX, expand = c(0, 0))
  plot.obj <- plot.obj + scale_y_continuous(limits = limitY, expand = c(0, 0))
  plot.obj <- plot.obj + theme(axis.title = element_blank(),
                               axis.text = element_blank(),
                               axis.ticks = element_blank())
  # replace only a literal trailing ".csv" (an unescaped "." matches any character)
  fname <- file.path("./input/frames", sub(pattern="\\.csv$", replacement=".png", x=basename(buffer.addr)))
  png(fname, width = 2000, height = 2000)
  print(plot.obj)
  dev.off()
}

View file

@ -0,0 +1,71 @@
# import libraries
library(Seurat)
library(plyr)
seurat.obj.addr <- "../../seurat_data/liver_immune.RDS"
# a plotting function for indexed legend; special modifications for current script
#
# Draws one coloured dot per label with the label text to its right, white on
# a black panel, arranged column by column in `ncols` columns.
#   label.vector, color.vector : labels and their colours (same length)
#   left.limit                 : upper limit of the x axis
#   symbol.size, text.size     : dot and text sizes
# Returns the ggplot object. Stops if the vector lengths differ or if there
# are more columns than labels.
plot.indexed.legend <- function(label.vector, color.vector, ncols = 2, left.limit = 3.4, symbol.size = 8, text.size = 10){
  if (length(label.vector) != length(color.vector)){
    stop("number of labels is different from number colors\nAdvice: learn to count!")
  }
  # compare the requested column count itself; `length(ncol)` measured the
  # base function `ncol` and so the check could never trigger
  if (ncols > length(label.vector)){
    stop("You cannot have more columns than labels\nSolution: Learn to count")
  }
  label.no <- length(label.vector)
  nrows    <- ceiling(label.no / ncols)
  legend.frame <- data.frame(X = rep(0, label.no), Y = rep(0, label.no), CS = color.vector, Txt = label.vector)
  for (i in seq_len(label.no)){
    # labels fill column 1 top to bottom, then column 2, ...; this gives the
    # same columns as before for ncols <= 2 and correct ones for ncols >= 3
    col.index <- floor((i - 1) / nrows) + 1
    # rows are counted down from a fixed top coordinate of 15
    row.index <- 15 - ((i - 1) %% nrows + 1)
    legend.frame[i, 1] <- (col.index - 1) * 2
    legend.frame[i, 2] <- row.index
  }
  plot.obj <- ggplot(data = legend.frame, aes(x = X, y = Y))
  plot.obj <- plot.obj + geom_point(size = symbol.size, colour = color.vector)
  plot.obj <- plot.obj + scale_x_continuous(limits = c(0, left.limit)) + theme_void()
  plot.obj <- plot.obj + annotate("text", x=legend.frame$X+.1, y = legend.frame$Y, label=legend.frame$Txt, hjust = 0, size = text.size, colour = "white")
  plot.obj <- plot.obj + theme(panel.background = element_rect(fill='black'))
  return(plot.obj)
}
# Export everything the animation driver needs into ./input:
# pca.csv, SNN.smm, label_colours.csv and legend.png.

# load the seurat object
print("Loading the data ... ")
seurat.obj <- readRDS(seurat.obj.addr)
# colour key; the columns CellTypes and Colours are read from it below
cell.type.to.colour <- read.csv("./liver_cell_type_colours.csv")
# use the cell.labels meta data column as the identity of each cell
seurat.obj <- SetAllIdent(object=seurat.obj, id="cell.labels")
################################################
print("saving pca data ...")
pca.data <- seurat.obj@dr$pca@cell.embeddings
write.csv(pca.data, "./input/pca.csv")
################################################
# the graph built and saved here is the SNN graph (first 20 PCs)
print("Computing and saving KNN graph ...")
seurat.obj <- BuildSNN(object=seurat.obj, reduction.type="pca", dims.use=1:20, plot.SNN=F, force.recalc=T)
writeMM(obj=seurat.obj@snn, file="./input/SNN.smm")
# per-cell labels, and the subset of the colour key that is actually present
labels <- as.vector(seurat.obj@ident)
labels.unique <- unique(labels)
filter.key <- cell.type.to.colour$CellTypes %in% labels.unique
cell.labels <- cell.type.to.colour$CellTypes[filter.key]
cell.colours <- cell.type.to.colour$Colours[filter.key]
# translate every cell's label into its colour; labels missing from the key
# are left unchanged by mapvalues
labels.cols <- mapvalues(x=labels, from=as.vector(cell.labels), to=as.vector(cell.colours))
write.csv(data.frame(LabelCols = labels.cols), "./input/label_colours.csv")
# draw the legend image that is pasted next to every video frame
png("./input/legend.png", width = 1000, height = 800)
legend.plt <- plot.indexed.legend(label.vector=cell.labels, color.vector=cell.colours, left.limit=3.6, text.size=10, ncols=2, symbol.size = 15)
print(legend.plt)
dev.off()
print("End")

View file

@ -0,0 +1,11 @@
#!/bin/bash
#$ -cwd
#$ -N prepare_input
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=400G

# Grid Engine job wrapper: run prepare_input.R, then log the finish time.
Rscript prepare_input.R

echo "End on $(date)"

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,112 @@
# labels have been updated; the part that overwrites cell labels should be removed
# must create functions that handle the creation of the FDG animation:
# - data writer
# - plotting that takes dimension parameters
library(plyr)
library(RColorBrewer)
library(Seurat)
# Load the Seurat object and the cell-type colour key used for the animation.
seurat.addr <- "../../data/test_yolk_sac_subset.RDS"
seurat.obj <- readRDS(seurat.addr)
# colour key; the columns CellTypes and Colours are read from it further down
cell.type.to.colour <- read.csv("../../resources/test_yolk_sac_fdg_colour_key.csv")
# report the counts found in the `doublets` meta data column
print("Checking for doublets:")
print(table(seurat.obj@meta.data$doublets))
# a plotting function for indexed legend; special modifications for current script
# NOTE: a second definition of plot.indexed.legend follows directly below in
# this script and replaces this one when the script is sourced.
#
# Draws one coloured dot per label with the label text to its right, white on
# a black panel, arranged column by column in `ncols` columns.
#   label.vector, color.vector : labels and their colours (same length)
#   left.limit                 : upper limit of the x axis
#   symbol.size, text.size     : dot and text sizes
# Returns the ggplot object. Stops if the vector lengths differ or if there
# are more columns than labels.
plot.indexed.legend <- function(label.vector, color.vector, ncols = 2, left.limit = 3.4, symbol.size = 8, text.size = 10){
  if (length(label.vector) != length(color.vector)){
    stop("number of labels is different from number colors\nAdvice: learn to count!")
  }
  # compare the requested column count itself; `length(ncol)` measured the
  # base function `ncol` and so the check could never trigger
  if (ncols > length(label.vector)){
    stop("You cannot have more columns than labels\nSolution: Learn to count")
  }
  label.no <- length(label.vector)
  nrows    <- ceiling(label.no / ncols)
  legend.frame <- data.frame(X = rep(0, label.no), Y = rep(0, label.no), CS = color.vector, Txt = label.vector)
  for (i in seq_len(label.no)){
    # labels fill column 1 top to bottom, then column 2, ...; this gives the
    # same columns as before for ncols <= 2 and correct ones for ncols >= 3
    col.index <- floor((i - 1) / nrows) + 1
    # rows are counted down from a fixed top coordinate of 15
    row.index <- 15 - ((i - 1) %% nrows + 1)
    legend.frame[i, 1] <- (col.index - 1) * 2
    legend.frame[i, 2] <- row.index
  }
  plot.obj <- ggplot(data = legend.frame, aes(x = X, y = Y))
  plot.obj <- plot.obj + geom_point(size = symbol.size, colour = color.vector)
  plot.obj <- plot.obj + scale_x_continuous(limits = c(0, left.limit)) + theme_void()
  plot.obj <- plot.obj + annotate("text", x=legend.frame$X+.1, y = legend.frame$Y, label=legend.frame$Txt, hjust = 0, size = text.size, colour = "white")
  plot.obj <- plot.obj + theme(panel.background = element_rect(fill='black'))
  return(plot.obj)
}
# a plotting function for indexed legend
#
# Lays the labels out on an ncols-wide grid (filled column by column, top to
# bottom), draws a coloured dot with the label's index on it and the label
# text in white to its right, on a black panel.
#   label.vector, color.vector : labels and their colours (same length)
#   left.limit                 : accepted for compatibility; not used here
#   symbol.size, text.size     : dot and text sizes
#   padH, padRight             : padding added above / below the x range
#   padV                       : padding added on both sides of the y range
# Returns the ggplot object. Stops if the vector lengths differ or if there
# are more columns than labels.
plot.indexed.legend <- function(label.vector, color.vector, ncols = 2, left.limit = 3.4, symbol.size = 8, text.size = 10, padH = 1, padV = 1, padRight = 0){
  if (length(label.vector) != length(color.vector)){
    stop("number of labels is different from number colors\nAdvice: learn to count!")
  }
  # compare the requested column count itself; `length(ncol)` measured the
  # base function `ncol` and so the check could never trigger
  if (ncols > length(label.vector)){
    stop("You cannot have more columns than labels\nSolution: Learn to count")
  }
  indices.vector <- 1:length(label.vector)
  label.no       <- length(label.vector)
  nrows          <- ceiling(label.no / ncols)
  legend.frame   <- data.frame(X = rep(0, label.no), Y = rep(0, label.no), CS = color.vector, Txt = label.vector)
  # grid coordinates: column number as X, row number (counted from the top) as Y
  legend.frame$X <- rep(1:ncols, each=nrows)[1:nrow(legend.frame)]
  legend.frame$Y <- rep(nrows:1, times = ncols)[1:nrow(legend.frame)]
  Xrange <- range(legend.frame$X)
  Yrange <- range(legend.frame$Y)
  plot.obj <- ggplot(data = legend.frame, aes(x = X, y = Y))
  plot.obj <- plot.obj + geom_point(size = symbol.size, colour = color.vector)
  plot.obj <- plot.obj + scale_x_continuous(limits = c(Xrange[1] - padRight, Xrange[2] + padH))
  plot.obj <- plot.obj + scale_y_continuous(limits = c(Yrange[1] - padV, Yrange[2] + padV))
  plot.obj <- plot.obj + theme_void()
  plot.obj <- plot.obj + annotate("text", x=legend.frame$X, y = legend.frame$Y, label = indices.vector, size = text.size)
  plot.obj <- plot.obj + annotate("text", x=legend.frame$X+.1, y = legend.frame$Y, label=legend.frame$Txt, hjust = 0, size = text.size, colour = "white")
  plot.obj <- plot.obj + theme(panel.background = element_rect(fill='black'))
  return(plot.obj)
}
# Export everything the animation driver needs into ./input:
# pca.csv, SNN.smm, label_colours.csv and legend.png.
pca.data <- seurat.obj@dr$pca@cell.embeddings
write.csv(pca.data, "./input/pca.csv")

# SNN graph on the first 20 PCs
seurat.obj <- BuildSNN(object=seurat.obj, reduction.type="pca", dims.use=1:20, plot.SNN=F,force.recalc=T)
writeMM(obj=seurat.obj@snn, file="./input/SNN.smm")

labels <- as.vector(seurat.obj@meta.data$cell.labels)
labels.unique <- unique(labels)

print("printing cell.type.to.colour")
print(cell.type.to.colour)
print("!is.na(cell.type.to.colour)")
print(!is.na(cell.type.to.colour))

# A colour key is available when cell.type.to.colour is a non-empty data
# frame. `if(!is.na(<data.frame>))` has a condition with more than one
# element, which is an error in R >= 4.2 (and only used the first cell before).
has.colour.key <- is.data.frame(cell.type.to.colour) && nrow(cell.type.to.colour) > 0

if(has.colour.key){
  # keep only the key entries whose cell type occurs in the data
  cell.labels  <- as.vector(cell.type.to.colour$CellTypes)
  cell.colours <- as.vector(cell.type.to.colour$Colours)
  filter.key   <- cell.labels %in% labels.unique
  cell.labels  <- cell.labels[filter.key]
  cell.colours <- cell.colours[filter.key]
}else{
  # no key: assign reproducible (seeded) colours interpolated from the "Paired" palette
  cell.labels  <- labels.unique
  set.seed(100)
  cell.colours <- sample(colorRampPalette(brewer.pal(12, "Paired"))(length(labels.unique)))
}

print("printing cell.labels")
print(cell.labels)
print("printing cell.colours")
print(cell.colours)

# translate every cell's label into its colour; labels missing from the key
# are left unchanged by mapvalues
labels.cols <- mapvalues(x=labels, from=cell.labels, to=cell.colours)
write.csv(data.frame(LabelCols = labels.cols), "./input/label_colours.csv")

# draw the legend image that is pasted next to every video frame
png("./input/legend.png", width = 1000, height = 700)
legend.plt <- plot.indexed.legend(label.vector=cell.labels, color.vector=cell.colours, ncols=2, left.limit=0, symbol.size=17, text.size=10, padH=.9, padV=.6)
print(legend.plt)
dev.off()

print("ended beautifully")

View file

@ -0,0 +1,11 @@
#!/bin/bash
#$ -cwd
#$ -N write_data
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=400G

# Grid Engine job wrapper: run write_data.R, then log the finish time.
Rscript write_data.R

echo "End on $(date)"

View file

@ -0,0 +1,58 @@
# Train an RBF-kernel SVM on PCA-reduced data and save the results.
#
# Usage: python svm.py <material_dir> <output_dir>
#   material_dir  must contain data.csv (samples x features, first column is
#                 the index) and labels.csv (first column holds the labels)
#   output_dir    receives pca.pickle, model.pickle,
#                 classification_report.txt and confusion_matrix.csv
import pickle
import sys
from os.path import join

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

if len(sys.argv) < 3:
    sys.exit("usage: svm.py <material_dir> <output_dir>")
material_dir = sys.argv[1]
output_dir = sys.argv[2]

print("Loading data ...")
X = pd.read_csv(join(material_dir, 'data.csv'), sep=",", index_col=0).values
Y = pd.read_csv(join(material_dir, 'labels.csv')).values[:, 0].reshape(-1, 1).ravel()

# Reduce the features with PCA and save the fitted transform so the same
# projection can be applied at prediction time.
pca = PCA(n_components=.8)
X = pca.fit_transform(X)
with open(join(output_dir, "pca.pickle"), "wb") as model_file:
    print(model_file)
    pickle.dump(pca, model_file)

print("Splitting into training and test sets...")
(X_train, X_test, y_train, y_test) = train_test_split(X, Y, test_size=.3, random_state=42)

# established as the best parameters in some other work; a wider grid
# (C from 1e-6 to 1000, gamma from 1e-6 to 1) was listed here previously
params = {"C": [10], "gamma": [1e-3]}

print("Creating the model and fitting the data ...")
model = GridSearchCV(SVC(probability=False, kernel="rbf"), params, cv=5)
model.fit(X_train, y_train)

print("Testing ...")
pred = model.predict(X_test)
# Pass the full class list explicitly: without `labels`, a class that is
# absent from the test split makes the report/matrix smaller than
# model.classes_ and the naming below fails.
cls_report = classification_report(y_test, pred, labels=model.classes_, target_names=model.classes_)
print(cls_report)
with open(join(output_dir, 'classification_report.txt'), "w") as cl_f:
    cl_f.write(cls_report)

print("Saving model and confusion matrix to disk ...")
cnf_matrix = confusion_matrix(y_test, pred, labels=model.classes_)
df = pd.DataFrame(cnf_matrix)
df.columns = model.classes_
df.to_csv(join(output_dir, 'confusion_matrix.csv'))

with open(join(output_dir, 'model.pickle'), "wb") as model_file:
    pickle.dump(model, model_file)

View file

@ -0,0 +1,107 @@
# Command line handling.
# All trailing arguments are glued together and split on ";". Each piece is
# expected to be an R assignment (for example  seurat.addr='file.RDS' ) that
# is evaluated further down to create the corresponding variable.
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))

# One line per expected argument; this text is evaluated as R code below.
# NOTE(review): only the first entry carries the ".arg" suffix, so for the
# other three the existence check in the loop is satisfied by the
# assignments in this list themselves - confirm whether that is intended.
arguments.list = "
seurat.addr.arg = args[1]
marker.genes.addr = args[2]
save.at = args[3]
classifier = args[4]
"

expected_arguments = unlist(strsplit(arguments.list, "\n"))
expected_arguments = expected_arguments[!(expected_arguments == "")]

if(length(args) != length(expected_arguments)){
  error.msg = sprintf('This pipeline requires %s parameters', as.character(length(expected_arguments)))
  # names on the left-hand side of each expected assignment, without ".arg"
  expected.names = trimws(sub("=.*$", "", expected_arguments))
  expected.names = sub("\\.arg$", "", expected.names)
  # the previous sprintf had a %s but no value, so it raised a sprintf error
  # instead of reporting which parameters are required
  stop(sprintf('%s:\n%s', error.msg, paste(expected.names, collapse = "\n")))
}

eval(parse(text = arguments.list))

for(n in 1:length(expected_arguments)){
  argument = expected_arguments[n]
  argument = gsub(pattern=" ", replacement="", x=argument)
  argument.name = unlist(strsplit(argument, "="))[1]
  variable.name = gsub(pattern=".arg", replacement="", argument.name)
  # the text passed on the command line for this argument ...
  argument.content = eval(parse(text = argument.name))
  # ... is itself evaluated as R code
  eval(parse(text = argument.content))
  if (!exists(variable.name)){
    stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
  }
}
# create required folders for output and work material
# (a randomly named scratch folder in the current directory; removed at the end)
working_dir = paste(sample(LETTERS, 50, replace=T),collapse = '')
material_dir = file.path(working_dir, 'material')
output_dir = file.path(working_dir, 'output')
dir.create(working_dir)
dir.create(material_dir)
dir.create(output_dir)

save.at = file.path('../../resources', save.at)
seurat.addr = file.path("../../data", seurat.addr)
# paste0 so the result is "<name>.py" (sep = '.' produced "<name>..py").
# NOTE(review): this value is not used below; the trainer call is hard-wired
# to svm.py.
classifier = paste0(classifier, '.py')

source("../../tools/bunddle_utils.R")

library(Seurat)
library(RColorBrewer)
library(plyr)
library(dplyr)
library(ggplot2)

# load data
print("loading data ... ")
seurat.obj = readRDS(seurat.addr)
print("Data loaded.")

# create and save label data frame
print("Create and save label data frame ...")
singlets <- as.vector(seurat.obj@meta.data$doublets) == "Singlet"
labels <- data.frame(Labels = as.vector(seurat.obj@meta.data$cell.labels)[singlets])
write.csv(labels, file.path(material_dir, 'labels.csv'), row.names = F)

# save variable genes in the output folder
# (features are the top 20 marker genes per cluster, ranked by avg_logFC)
print("Choose features genes ...")
marker.genes = file.path('../../resources/marker_genes', marker.genes)
marker.genes <- read.csv(marker.genes)
marker.genes <- marker.genes %>% group_by(cluster) %>% top_n(20, avg_logFC)
classifier.features <- unique(as.vector(marker.genes$gene))
saveRDS(classifier.features, file.path(output_dir, 'feature_genes.RDS'))

# save the normalized data to disk
print("saving training data to disk ...")
cell.names <- names(seurat.obj@ident)[singlets]
x.data <- as.data.frame(t(as.matrix(seurat.obj@data[classifier.features, cell.names])))
write.csv(x.data, file.path(material_dir, 'data.csv'), row.names = T)

# python.addr is expected to be defined by the sourced bunddle_utils.R
print("initiating SVM trainer ... ")
system(sprintf('%s svm.py %s %s', python.addr, material_dir, output_dir), wait = T)

# plot confusion matrix
cnf_matrix = read.csv(file.path(output_dir, 'confusion_matrix.csv'))
cnf_matrix <- cnf_matrix[, -c(1)]
confusion <- expand.grid(Actual = colnames(cnf_matrix), Predicted = colnames(cnf_matrix))
# Divide every column by its own sum. `cnf_matrix / colSums(cnf_matrix)`
# recycles the sums down the rows, i.e. it divided row i by the sum of
# column i; sweep() applies the sums along the columns as intended.
cnf_counts <- as.matrix(cnf_matrix)
cnf_freq <- sweep(cnf_counts, 2, colSums(cnf_counts), "/")
# column-major flattening matches the order produced by expand.grid above
confusion$freq <- as.vector(cnf_freq)
pdf(file.path(output_dir, 'confusion_matrix.pdf'), width = 14, height = 14)
ggplot(data = confusion, aes(x = Actual, y = Predicted)) + geom_tile(aes(fill = freq)) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
dev.off()

# replace any previous results at the destination, then move the new ones in
unlink(save.at, recursive=T, force=T)
dir.create(save.at)

file.rename(from=file.path(output_dir, 'feature_genes.RDS'), to=file.path(save.at, "feature_genes.RDS"))
file.rename(from=file.path(output_dir, 'classification_report.txt'), to=file.path(save.at, "classification_report.txt"))
file.rename(from=file.path(output_dir, 'confusion_matrix.csv'), to=file.path(save.at, "confusion_matrix.csv"))
file.rename(from=file.path(output_dir, 'confusion_matrix.pdf'), to=file.path(save.at, "confusion_matrix.pdf"))
file.rename(from=file.path(output_dir, 'model.pickle'), to=file.path(save.at, "model.pickle"))
file.rename(from=file.path(output_dir, 'pca.pickle'), to=file.path(save.at, "pca.pickle"))

unlink(working_dir, recursive=T, force=T)

print("Ended beautifully ... ")

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N train_classifier
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=100G

# Grid Engine job wrapper for train_classifier.R.
# Takes exactly one argument: the ";"-separated parameter string that the
# R script parses.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters" >&2
    exit 1
fi

# Quote the argument so the shell neither splits it on whitespace nor
# expands glob characters inside it.
Rscript train_classifier.R "$1"

echo "End on $(date)"

View file

@ -0,0 +1,80 @@
# Command line handling.
# All trailing arguments are glued together and split on ";". Each piece is
# expected to be an R assignment (for example  seurat.addr='file.RDS' ) that
# is evaluated further down to create the corresponding variable.
args = commandArgs(trailingOnly=T)
args = paste(args, collapse = "")
args = unlist(strsplit(args, ";"))

# One line per expected argument; this text is evaluated as R code below.
arguments.list = "
seurat.addr.arg = args[1]
save.at.arg = args[2]
doublet.svm.arg = args[3]
"

{
  expected_arguments = unlist(strsplit(arguments.list, "\n"))
  expected_arguments = expected_arguments[!(expected_arguments == "")]

  if(length(args) != length(expected_arguments)){
    stop('This pipeline requires these parameters: seurat.addr ; save.at (name of RDS file where processed data are saved), doublet.svm (folder name where singlet/doublet svm classifier for given organ is stored)')
  }

  eval(parse(text = arguments.list))

  for(n in 1:length(expected_arguments)){
    argument = expected_arguments[n]
    argument = gsub(pattern=" ", replacement="", x=argument)
    argument.name = unlist(strsplit(argument, "="))[1]
    variable.name = gsub(pattern=".arg", replacement="", argument.name)
    # the text passed on the command line for this argument ...
    argument.content = eval(parse(text = argument.name))
    # ... is itself evaluated as R code and must define `variable.name`
    eval(parse(text = argument.content))
    if (!exists(variable.name)){
      stop(sprintf("Argument %s not passed. Stopping ... ", variable.name))
    }
  }

  # create required folders for output and work material
  # folder name: <pipeline folder without numeric prefix>_<input file>_<yymmddHHMMSS>
  output_folder = gsub(pattern="^\\d+_", replacement="", x=basename(getwd()))
  output_folder = paste(output_folder, seurat.addr, sep = "_")
  # format() produces the yymmddHHMMSS stamp directly, without depending on
  # how as.character(Sys.time()) happens to be printed
  c.time = format(Sys.time(), "%y%m%d%H%M%S")
  output_folder = paste(output_folder, c.time, sep = "_")
  output_folder = file.path("../../output", output_folder)
  dir.create(output_folder)

  output_folder_material = file.path(output_folder, "material")
  dir.create(output_folder_material)

  seurat.addr = file.path("../../data", seurat.addr)

  source("../../tools/bunddle_utils.R")

  save.at = file.path("../../data", save.at)

  library(Seurat)
  library(RColorBrewer)
  library(dplyr)
  library(plyr)
}
###############################################################################
# Classify every cell as singlet/doublet and save the annotated object.
# Load data
print("Loading data ...")
seurat.obj = readRDS(seurat.addr) # seurat.addr.arg
print("Data loaded.")
# Apply_Classifier_On_Seurat_Object, tool_addr and python.addr are not defined
# in this script; presumably they come from the sourced bunddle_utils.R.
print("Identifying doublets")
seurat.obj@meta.data$doublets <- Apply_Classifier_On_Seurat_Object(
  seurat.obj = seurat.obj,
  classifier.fname = doublet.svm, # doublet.svm.arg
  tool_addr = tool_addr,
  python.addr = python.addr)
# cross-tabulate the `lanes` meta data column against the predicted class
print("Doublets and singlets: ")
print(table(seurat.obj@meta.data$lanes, seurat.obj@meta.data$doublets))
print("Saving data")
saveRDS(seurat.obj, save.at) # save.at.arg
# remove the scratch "material" folder created during setup
unlink(output_folder_material, recursive = T, force = T)

View file

@ -0,0 +1,16 @@
#!/bin/bash
#$ -cwd
#$ -N apply_doublets_SVM
#$ -V
#$ -l h_rt=23:59:59
#$ -l h_vmem=100G

# Grid Engine job wrapper for apply_doublet_classifier.R.
# Takes exactly one argument: the ";"-separated parameter string that the
# R script parses.
if [ "$#" -ne 1 ]; then
    echo "Illegal number of parameters" >&2
    exit 1
fi

# Quote the argument so the shell neither splits it on whitespace nor
# expands glob characters inside it.
Rscript apply_doublet_classifier.R "$1"

echo "End on $(date)"

View file

@ -0,0 +1,54 @@
# Train an RBF-kernel SVM on PCA-reduced data and save the results.
#
# Usage: python <this script> <save_to>
#   save_to  folder that contains data.csv (samples x features, first column
#            is the index) and labels.csv (first column holds the labels);
#            pca.pickle, model.pickle, classification_report.txt and
#            confusion_matrix.csv are written into the same folder
import pickle
import sys
from os.path import join

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

if len(sys.argv) < 2:
    sys.exit("usage: <script> <save_to>")
save_to = sys.argv[1]

print("Loading data ...")
X = pd.read_csv(join(save_to, "./data.csv"), sep=",", index_col=0).values
y = pd.read_csv(join(save_to, "./labels.csv")).values[:, 0].reshape(-1, 1).ravel()

# Reduce the features with PCA and save the fitted transform so the same
# projection can be applied at prediction time.
pca = PCA(n_components=.8)
X = pca.fit_transform(X)
with open(join(save_to, "pca.pickle"), "wb") as model_file:
    print(model_file)
    pickle.dump(pca, model_file)

print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=19)

params = {"C": [1, 10, 100, 300]}

print("Creating the model and fitting the data ...")
model = GridSearchCV(SVC(probability=False, kernel="rbf"), params, cv=3)
model.fit(X_train, y_train)

print("Testing ...")
# Evaluate on the held-out split only. The metrics were previously computed
# on all samples, including the ones the model was fitted on, while the
# test split created above was never used.
pred = model.predict(X_test)
# Pass the full class list explicitly so the report and matrix always match
# model.classes_, even if a class is missing from the test split.
cls_report = classification_report(y_test, pred, labels=model.classes_, target_names=model.classes_)
print(cls_report)
with open(join(save_to, "classification_report.txt"), "w") as cl_f:
    cl_f.write(cls_report)

print("Saving model and confusion matrix to disk ...")
cnf_matrix = confusion_matrix(y_test, pred, labels=model.classes_)
df = pd.DataFrame(cnf_matrix)
df.columns = model.classes_
df.to_csv(join(save_to, "confusion_matrix.csv"))

with open(join(save_to, "model.pickle"), "wb") as model_file:
    pickle.dump(model, model_file)

print(model.best_params_)

Some files were not shown because too many files have changed in this diff Show more