diff --git a/intermediate-r-rna-seq/Analysis.R b/intermediate-r-rna-seq/Analysis.R index 472257c..20806d2 100644 --- a/intermediate-r-rna-seq/Analysis.R +++ b/intermediate-r-rna-seq/Analysis.R @@ -4,9 +4,11 @@ setwd("~/Dropbox (Gladstone)/Bioinformatics/Training_Workshops/Gladstone-interna library(magrittr) library(edgeR) library(org.Mm.eg.db) -library(ggplot2) library(tidyverse) -library(vioplot) + +#---------------------------- +#Load and organize data. +#---------------------------- phenotype_info_file <- "targets.txt" raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz" @@ -15,7 +17,7 @@ raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz" targets <- phenotype_info_file %>% read.delim(., stringsAsFactors=FALSE) -#This is equivalent to +#The above statement is equivalent to # targets <- read.delim(phenotype_info_file, stringsAsFactors = FALSE) group <- targets %$% @@ -28,8 +30,6 @@ GenewiseCounts <- raw_counts_file %>% colnames(GenewiseCounts) %<>% substring(.,1,7) -# colnames(GenewiseCounts)[-1] <- group - #------------------------ #Concept 1: MA plots #------------------------ @@ -81,6 +81,8 @@ cutoff <- y$samples$lib.size %>% divide_by(minimum_counts_reqd, .) %>% round(., 1) +#... or simply set cutoff to 0.5 + keep <- cpm(y) %>% is_greater_than(., cutoff) %>% rowSums() %>% @@ -97,86 +99,15 @@ y <- y[keep, , keep.lib.sizes=FALSE] #------------------------- y <- calcNormFactors(y) -#What's under the hood? +#What has calcNormFactors got under the hood? +#See the slides and TMM_normalization_steps.R -cnts <- y$counts %>% as.matrix() -lib.sizes <- apply(cnts, 2, sum) - -cnts_adjst_libsize <- map_dfc(colnames(cnts), - function(x) cnts[, x]/lib.sizes[x]) %>% - set_colnames(., colnames(cnts)) - -vioplot(cnts_adjst_libsize) #Intuitively, we should expect similar adjustments for similar samples. - -f <- apply(cnts_adjst_libsize, 2, - function(x) quantile(x,p=0.75)) - -ref <- (f - mean(f)) %>% - abs() %>% - which.min() - -TMM_norm_factors <- - map_dfc(colnames(cnts), - function(x, logratioTrim=.3, sumTrim=0.05, - doWeighting=TRUE, Acutoff=-1e10) { - - #The following steps are excerpted from edgeR's source. - nO <- lib.sizes[x] - nR <- lib.sizes[ref] - - obs <- cnts[, x] %>% as.numeric() - ref <- cnts[, ref] %>% as.numeric() - - logR <- log2(obs/nO) - log2(ref/nR) # log ratio of expression, accounting for library size - absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression - v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref # estimated asymptotic variance - - # remove infinite values, cutoff based on A - fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff) - - logR <- logR[fin] - absE <- absE[fin] - v <- v[fin] - - if(max(abs(logR)) < 1e-6) return(1) - - # taken from the original mean() function - n <- length(logR) - loL <- floor(n * logratioTrim) + 1 - hiL <- n + 1 - loL - loS <- floor(n * sumTrim) + 1 - hiS <- n + 1 - loS - - keep <- (rank(logR)>=loL & rank(logR)<=hiL) & - (rank(absE)>=loS & rank(absE)<=hiS) - - if(doWeighting) - f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE) - else - f <- mean(logR[keep], na.rm=TRUE) - - # Results will be missing if the two libraries share no features with positive counts - # In this case, return unity - if(is.na(f)) f <- 0 - 2^f - }) %>% - data.frame() %>% - as.numeric() - -#Rescale norm factors for convenience of interpretation. -rescale <- TMM_norm_factors %>% - log() %>% - mean() %>% - exp() - -TMM_norm_factors %<>% divide_by(., rescale) - #Plot the normalization factors by sample. ggplot(y$samples %>% cbind(., replicate = factor(1:2)), aes(x = group, y = norm.factors, fill = replicate)) + - geom_bar(stat= "identity", position = position_dodge()) + geom_col(position = position_dodge()) #"A normalization factor below one indicates that a small number of high count genes #...are monopolizing the sequencing, causing the counts for other genes to be lower @@ -190,11 +121,11 @@ ggplot(y$samples %>% pch <- c(0,1,2,15,16,17) colors <- rep(c("darkgreen", "red", "blue"), 2) plotMDS(y, col=colors[group], pch=pch[group]) -legend("top", legend=levels(group), pch=pch, col=colors, ncol=2, - text.width = 0.1) +legend("top", legend=levels(group) %>% substr(., 1, 3), + pch=pch, col=colors, ncol=2, cex = 0.5) #PCA plot -cpm <- cpm(y, log = T, prior.count = 0.01) +cpm <- cpm(y, log = TRUE, prior.count = 0.01) rv <- apply(cpm,1,var) #Select genes with highest variance. diff --git a/intermediate-r-rna-seq/Intermediate_RNA-seq.pdf b/intermediate-r-rna-seq/Intermediate_RNA-seq.pdf index d17e937..e6353fd 100644 Binary files a/intermediate-r-rna-seq/Intermediate_RNA-seq.pdf and b/intermediate-r-rna-seq/Intermediate_RNA-seq.pdf differ diff --git a/intermediate-r-rna-seq/Intermediate_RNA-seq.pptx b/intermediate-r-rna-seq/Intermediate_RNA-seq.pptx index 98a5461..6317c96 100644 Binary files a/intermediate-r-rna-seq/Intermediate_RNA-seq.pptx and b/intermediate-r-rna-seq/Intermediate_RNA-seq.pptx differ diff --git a/intermediate-r-rna-seq/Intermediate_RNA-seq.zip b/intermediate-r-rna-seq/Intermediate_RNA-seq.zip index d9304d4..6359202 100644 Binary files a/intermediate-r-rna-seq/Intermediate_RNA-seq.zip and b/intermediate-r-rna-seq/Intermediate_RNA-seq.zip differ diff --git a/intermediate-r-rna-seq/TMM_normalization_steps.R b/intermediate-r-rna-seq/TMM_normalization_steps.R new file mode 100644 index 0000000..cc0da59 --- /dev/null +++ b/intermediate-r-rna-seq/TMM_normalization_steps.R @@ -0,0 +1,152 @@ +#The code for the main steps in the following ... +#... are copied from the source code for edgeR::calcNormFactors. + +cnts <- y$counts %>% as.matrix() +lib.sizes <- apply(cnts, 2, sum) + +cnts_adjst_libsize <- map_dfc(colnames(cnts), + function(x) cnts[, x]/lib.sizes[x]) %>% + set_colnames(., colnames(cnts)) + +boxplot(cnts_adjst_libsize) + +f <- apply(cnts_adjst_libsize, 2, + function(x) quantile(x, p=0.75)) + +boxplot(cnts_adjst_libsize, ylim = c(0, 10^(-4))) +abline(h = mean(f), lty = "dotted") + +ref_sample <- (f - mean(f)) %>% + abs() %>% + which.min() + +#Illustrating normalization of one sample. +illustrate = TRUE +if (illustrate) { +x <- colnames(cnts)[12] + +nO <- lib.sizes[x] +nR <- lib.sizes[ref_sample] + +obs <- cnts[, x] %>% as.numeric() +ref <- cnts[, ref_sample] %>% as.numeric() + +#The M values: +logR <- log2(obs/nO) - log2(ref/nR) # log ratio of expression, accounting for library size + +#The A values: +absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression + +# remove infinite values, cutoff based on A +fin <- is.finite(logR) & is.finite(absE) & (absE > -10^(10)) + +logR <- logR[fin] +absE <- absE[fin] + +print( +ggplot(data.frame(A = absE, + M = logR), + aes(A, M)) + + geom_point() + + geom_smooth() + + coord_cartesian(xlim = c(-25, -5), ylim = c(-10.5, 10.5)) +) + +logratioTrim=.3 +sumTrim=0.05 + +#Remove the genes with the 5% most extreme A values. +#Remove the genes with the 30% most extreme M values +n <- length(logR) +loL <- floor(n * logratioTrim) + 1 +hiL <- n + 1 - loL +loS <- floor(n * sumTrim) + 1 +hiS <- n + 1 - loS + +keep <- (rank(logR)>=loL & rank(logR)<=hiL) & + (rank(absE)>=loS & rank(absE)<=hiS) + +print( + ggplot(data.frame(A = absE[keep], + M = logR[keep]), + aes(A, M)) + + geom_point() + + coord_cartesian(xlim = c(-25, -5), ylim = c(-10.5, 10.5)) + + geom_hline(color = "red", linetype = "dotted", + yintercept = mean(logR[keep], na.rm = T)) +) + +#Normalization factor for the current sample wrt to the reference sample +f <- mean(logR[keep], na.rm=TRUE) +print("Normalization factor (log2 scale)") +print(f) + +f <- 2^f +print("Normalization factor (original scale)") +print(f) + +#Compare with the normalization factor calculated by TMM. +print("Normalization factor (from edgeR::calcNormFactors)") +y$samples$norm.factors[12] +} + +TMM_norm_factors <- + map_dfc(colnames(cnts), + function(x, logratioTrim=.3, sumTrim=0.05, + doWeighting=TRUE, Acutoff=-1e10) { + + #The following steps are excerpted from edgeR's source. + nO <- lib.sizes[x] + nR <- lib.sizes[ref_sample] + + obs <- cnts[, x] %>% as.numeric() + ref <- cnts[, ref_sample] %>% as.numeric() + + logR <- log2(obs/nO) - log2(ref/nR) # log ratio of expression, accounting for library size + absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression + v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref # estimated asymptotic variance + + # remove infinite values, cutoff based on A + fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff) + + logR <- logR[fin] + absE <- absE[fin] + v <- v[fin] + + if(max(abs(logR)) < 1e-6) return(1) + + # taken from the original mean() function + n <- length(logR) + loL <- floor(n * logratioTrim) + 1 + hiL <- n + 1 - loL + loS <- floor(n * sumTrim) + 1 + hiS <- n + 1 - loS + + keep <- (rank(logR)>=loL & rank(logR)<=hiL) & + (rank(absE)>=loS & rank(absE)<=hiS) + + if(doWeighting) + f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE) + else + f <- mean(logR[keep], na.rm=TRUE) + + # Results will be missing if the two libraries share no features with positive counts + # In this case, return unity + if(is.na(f)) f <- 0 + 2^f + }) %>% + data.frame() %>% + as.numeric() + +#Rescale norm factors for convenience of interpretation. +rescale <- TMM_norm_factors %>% + log() %>% + mean() %>% + exp() + +TMM_norm_factors %<>% divide_by(., rescale) +TMM_norm_factors + +#Compare with the output from calcNormFactors +y$samples$norm.factors + diff --git a/intermediate-r-rna-seq/~$Intermediate_RNA-seq.pptx b/intermediate-r-rna-seq/~$Intermediate_RNA-seq.pptx new file mode 100644 index 0000000..138f1a0 Binary files /dev/null and b/intermediate-r-rna-seq/~$Intermediate_RNA-seq.pptx differ