fork_scRNAseq_analysis/pipelines/13_pseudotime/pdt_scanpy.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 14 15:01:36 2018

@author: doru
"""
print("starting .py script")

import sys
args = sys.argv
root_cell_type = args[1]
CWD = args[2]
print("printing args")
print(args)
args
# use the args below if you have a root cell type containing spaces and @@'s
#root_cell_type = args[1] + " " + args[2]
#CWD = args[3]

import matplotlib; matplotlib.use('Agg');
import scanpy.api as sc;
import pandas as pd
import numpy as np

print("printing root_cell_type")
print(root_cell_type)
print("printing CWD")
print(CWD)

sc.settings.verbosity = 3

scObj = sc.read("{CWD}/material/raw_data.mtx".format(CWD=CWD), cache = False).T

# load gene names
scObj.var_names = pd.read_csv("{CWD}/material/genenames.csv".format(CWD=CWD)).iloc[:, 1]

# load cell names
scObj.obs_names = pd.read_csv("{CWD}/material/cellnames.csv".format(CWD=CWD)).iloc[:, 1]

# add cell labels
cell_labels = pd.read_csv("{CWD}/material/cell_labels.csv".format(CWD=CWD), index_col = 0)

scObj.obs["cell_labels"] = cell_labels

# filter out genes present in less than 3 cells
sc.pp.filter_genes(scObj, min_cells=3)

# log-normalize the data
scObj.raw = sc.pp.log1p(scObj, copy=True)
sc.pp.normalize_per_cell(scObj, counts_per_cell_after=1e4)

# variable genes
filter_result = sc.pp.filter_genes_dispersion(
        scObj.X, min_mean=0.0125, max_mean=3, min_disp=0.5)
# subset data on variable genes
scObj = scObj[:, filter_result.gene_subset]
# not sure?
sc.pp.log1p(scObj)

# scale the data
sc.pp.scale(scObj, max_value=10)

# run pca
sc.tl.pca(scObj)

# compunte neighborhood graph
sc.pp.neighbors(scObj, n_neighbors = 15, n_pcs = 20, knn = True, random_state = 10, method = "gauss")

# compute diffusion map
sc.tl.diffmap(scObj, n_comps = 20)

# set root
scObj.uns['iroot'] = np.flatnonzero(scObj.obs['cell_labels'] == root_cell_type)[0]

# compute dpt
print("computing sc.tl.dpt")
sc.tl.dpt(scObj, n_dcs = 20)

# pdt is at scObj.obs["dpt_pseudotime"]
print("displaying pdt table stored in scObj")
print(scObj.obs["dpt_pseudotime"])
pdt = scObj.obs["dpt_pseudotime"].to_csv("{CWD}/material/pseudotime.csv".format(CWD=CWD))

# save the pseudotime
dm  = scObj.obsm["X_diffmap"]
dm = pd.DataFrame(data = dm, index = None, columns = None)
dm.to_csv("{CWD}/material/dm.csv".format(CWD=CWD), columns = None, header = None)