#setwd()
setwd("~/Dropbox (Gladstone)/Bioinformatics/Training_Workshops/Gladstone-internal/Intermediate_RNA-seq_Fall_2019")

library(magrittr)
library(edgeR)
library(org.Mm.eg.db)
library(ggplot2)
library(tidyverse)
library(vioplot)

phenotype_info_file <- "targets.txt"
raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz"

#12 samples and 6 categories
targets <- phenotype_info_file %>%
  read.delim(., stringsAsFactors=FALSE)

#This is equivalent to 
# targets <- read.delim(phenotype_info_file, stringsAsFactors = FALSE)

group <- targets %$%
  paste(CellType, Status, sep = ".") %>%
  factor()

#Length of a gene is the total number of bases in exons and UTRs for that gene.
GenewiseCounts <- raw_counts_file %>%
  read.delim(., row.names="EntrezGeneID")

colnames(GenewiseCounts) %<>% substring(.,1,7)

# colnames(GenewiseCounts)[-1] <- group

#------------------------
#Concept 1: MA plots
#------------------------

two_samples <- GenewiseCounts[, c(2, 3)] %>% #Replicate samples
  add(., 1) %>%
  log2()

plotData <- data.frame(M = two_samples[, 1] - two_samples[, 2],
                       A = (two_samples[, 1] + two_samples[, 2])/2)

ggplot(plotData, aes(x = A, y = M)) +
  geom_point() +
  geom_smooth()

#------------------------
#Create DGElist object and retrieve gene symbols.
#------------------------
y <- DGEList(counts = GenewiseCounts[,-1], 
             group=group,
             genes=GenewiseCounts[,1,drop=FALSE])

y$genes$Symbol <- mapIds(org.Mm.eg.db,
                         keys = rownames(y),
                         keytype="ENTREZID", 
                         column="SYMBOL")

#------------------------
#Independent filtering.
#------------------------

#Filter genes whose symbols are not found.
keep <- y$genes$Symbol %>%
  is.na() %>%
  not()

y <- y[keep, ]


#It is not possible to make reliable inference for genes ...
#... with very low counts.
#Can filter directly by counts but ...
#... better to account for difference in library sizes.

minimum_counts_reqd <- 10
cutoff <- y$samples$lib.size %>% 
  median() %>% 
  divide_by(., 10^6) %>% 
  divide_by(minimum_counts_reqd, .) %>% 
  round(., 1)

keep <- cpm(y) %>%
  is_greater_than(., cutoff) %>%
  rowSums() %>%
  is_weakly_greater_than(., 2)

#Alternatively, use counts to filter.
#Examples:
#keep <- rowSums(y$counts) > 50

y <- y[keep, , keep.lib.sizes=FALSE]

#-------------------------
#Normalizing the counts.
#-------------------------
y <- calcNormFactors(y)

#What's under the hood?

cnts <- y$counts %>% as.matrix()
lib.sizes <- apply(cnts, 2, sum)

cnts_adjst_libsize <- map_dfc(colnames(cnts), 
                              function(x) cnts[, x]/lib.sizes[x]) %>%
  set_colnames(., colnames(cnts))

vioplot(cnts_adjst_libsize)
#Intuitively, we should expect similar adjustments for similar samples.

f <- apply(cnts_adjst_libsize, 2, 
           function(x) quantile(x,p=0.75))

ref <- (f - mean(f)) %>% 
  abs() %>% 
  which.min()

TMM_norm_factors <- 
  map_dfc(colnames(cnts), 
          function(x, logratioTrim=.3, sumTrim=0.05, 
                   doWeighting=TRUE, Acutoff=-1e10) {
            
            #The following steps are excerpted from edgeR's source.
            nO <- lib.sizes[x]
            nR <- lib.sizes[ref]
            
            obs <- cnts[, x] %>% as.numeric()
            ref <- cnts[, ref] %>% as.numeric()
            
            logR <- log2(obs/nO) - log2(ref/nR)         # log ratio of expression, accounting for library size
            absE <- (log2(obs/nO) + log2(ref/nR))/2  # absolute expression
            v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref   # estimated asymptotic variance
            
            #	remove infinite values, cutoff based on A
            fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
            
            logR <- logR[fin]
            absE <- absE[fin]
            v <- v[fin]
            
            if(max(abs(logR)) < 1e-6) return(1)
            
            #	taken from the original mean() function
            n <- length(logR)
            loL <- floor(n * logratioTrim) + 1
            hiL <- n + 1 - loL
            loS <- floor(n * sumTrim) + 1
            hiS <- n + 1 - loS
            
            keep <- (rank(logR)>=loL & rank(logR)<=hiL) & 
              (rank(absE)>=loS & rank(absE)<=hiS)
            
            if(doWeighting)
              f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
            else
              f <- mean(logR[keep], na.rm=TRUE)
            
            #	Results will be missing if the two libraries share no features with positive counts
            #	In this case, return unity
            if(is.na(f)) f <- 0
            2^f
          }) %>%
  data.frame() %>% 
  as.numeric() 

#Rescale norm factors for convenience of interpretation.
rescale <- TMM_norm_factors %>% 
  log() %>%
  mean() %>%
  exp()

TMM_norm_factors %<>% divide_by(., rescale)

#Plot the normalization factors by sample.
ggplot(y$samples %>% 
         cbind(., replicate = factor(1:2)), 
       aes(x = group, y = norm.factors, fill = replicate)) +
  geom_bar(stat= "identity", position = position_dodge())

#"A normalization factor below one indicates that a small number of high count genes 
#...are monopolizing the sequencing, causing the counts for other genes to be lower 
#...than would be usual given the library size." Chen et al., 2016
#Looks like L.lactating samples contain a number of very highly upregulated genes.

#------------------------
#Exploratory visualizations
#------------------------
#MDS plot
pch <- c(0,1,2,15,16,17)
colors <- rep(c("darkgreen", "red", "blue"), 2)
plotMDS(y, col=colors[group], pch=pch[group])
legend("top", legend=levels(group), pch=pch, col=colors, ncol=2, 
       text.width = 0.1)

#PCA plot
cpm <- cpm(y, log = T, prior.count = 0.01)
rv <- apply(cpm,1,var) 

#Select genes with highest variance.
keep <- order(rv, decreasing = TRUE)[1:500]
selected <- cpm[keep, ] %>% t()

#Transpose is needed to ensure that each row is a vector.
pca <- prcomp(selected, scale=T, center = T)

stddev <- pca$sdev
pc1_var <- round(100*stddev[1]^2/sum(stddev^2))
pc2_var <- round(100*stddev[2]^2/sum(stddev^2))
pc3_var <- round(100*stddev[3]^2/sum(stddev^2))
PlotData <- data.frame(cbind(PC1 = pca$x[,1], PC2 = pca$x[,2]))
PlotData <- targets[, c("CellType", "Status")] %>%
  cbind(PlotData, .)

ggplot(PlotData, aes(x=PC1, y=PC2, color=CellType, shape=Status)) + 
  geom_point(size=4.5) +  
  xlab(paste("PC1:", pc1_var, "% variance")) + 
  ylab(paste("PC2:", pc2_var, "% variance"))

#--------------------
#Fitting the model.
#--------------------
design <- model.matrix(~0+group) %>%   
  set_colnames(., levels(group))

y <- estimateDisp(y, design, robust=TRUE)
plotBCV(y)

fit <- glmQLFit(y, design, robust=TRUE) 
head(fit$coefficients)

plotQLDisp(fit) 

#--------------------
#Hypothesis testing
#--------------------

#Example 1:
B.LvsP <- makeContrasts(B.lactating-B.pregnant, levels=design)
res <- glmQLFTest(fit, contrast=B.LvsP)
topTags(res)
is.de <- decideTestsDGE(res)
summary(is.de)
plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
       legend="topright")

#Example 2:
B.LvsP <- makeContrasts(B.lactating-B.pregnant, levels=design)
res <- glmTreat(fit, contrast=B.LvsP, lfc=log2(1.5))
topTags(res)
is.de <- decideTestsDGE(res)
summary(is.de)
plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
       legend="topright")

#Example 3:
con <- makeContrasts(
  (L.lactating-L.pregnant)-(B.lactating-B.pregnant),
  levels=design)
res <- glmQLFTest(fit, contrast=con)
topTags(res)
is.de <- decideTestsDGE(res)
summary(is.de)
plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
       legend="topright")

#Example 4:
con <- makeContrasts(
  L.PvsL = L.pregnant - L.lactating,
  L.VvsL = L.virgin - L.lactating,
  L.VvsP = L.virgin - L.pregnant, levels=design)
res <- glmQLFTest(fit, contrast=con)
topTags(res)
is.de <- decideTestsDGE(res)
summary(is.de)

#----------------------------
#Save results in a table
result <- topTags(res, n = nrow(y$counts)) %>%
  data.frame()
write.table(result, 
            file = "DE_result.txt",
            quote = FALSE,
            sep = "\t",
            row.names = FALSE)