Updated the scripts and slides

2025-11-30 09:45:43 -08:00 · 2020-07-20 04:48:26 -07:00 · 2020-07-20 04:48:26 -07:00 · bb9cfaf359
commit bb9cfaf359
parent d30f39831c
6 changed files with 165 additions and 82 deletions
--- a/intermediate-r-rna-seq/Analysis.R
+++ b/intermediate-r-rna-seq/Analysis.R
@ -4,9 +4,11 @@ setwd("~/Dropbox (Gladstone)/Bioinformatics/Training_Workshops/Gladstone-interna
 library(magrittr)
 library(edgeR)
 library(org.Mm.eg.db)
-library(ggplot2)
 library(tidyverse)
-library(vioplot)
+
+#----------------------------
+#Load and organize data.
+#----------------------------

 phenotype_info_file <- "targets.txt"
 raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz"
@ -15,7 +17,7 @@ raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz"
 targets <- phenotype_info_file %>%
  read.delim(., stringsAsFactors=FALSE)

-#This is equivalent to 
+#The above statement is equivalent to 
 # targets <- read.delim(phenotype_info_file, stringsAsFactors = FALSE)

 group <- targets %$%
@ -28,8 +30,6 @@ GenewiseCounts <- raw_counts_file %>%

 colnames(GenewiseCounts) %<>% substring(.,1,7)

-# colnames(GenewiseCounts)[-1] <- group
-
 #------------------------
 #Concept 1: MA plots
 #------------------------
@ -81,6 +81,8 @@ cutoff <- y$samples$lib.size %>%
  divide_by(minimum_counts_reqd, .) %>% 
  round(., 1)

+#... or simply set cutoff to 0.5
+
 keep <- cpm(y) %>%
  is_greater_than(., cutoff) %>%
  rowSums() %>%
@ -97,86 +99,15 @@ y <- y[keep, , keep.lib.sizes=FALSE]
 #-------------------------
 y <- calcNormFactors(y)

-#What's under the hood?
+#What has calcNormFactors got under the hood?
+#See the slides and TMM_normalization_steps.R

-cnts <- y$counts %>% as.matrix()
-lib.sizes <- apply(cnts, 2, sum)
-
-cnts_adjst_libsize <- map_dfc(colnames(cnts), 
-                              function(x) cnts[, x]/lib.sizes[x]) %>%
-  set_colnames(., colnames(cnts))
-
-vioplot(cnts_adjst_libsize)
 #Intuitively, we should expect similar adjustments for similar samples.
-
-f <- apply(cnts_adjst_libsize, 2, 
-           function(x) quantile(x,p=0.75))
-
-ref <- (f - mean(f)) %>% 
-  abs() %>% 
-  which.min()
-
-TMM_norm_factors <- 
-  map_dfc(colnames(cnts), 
-          function(x, logratioTrim=.3, sumTrim=0.05, 
-                   doWeighting=TRUE, Acutoff=-1e10) {
-            
-            #The following steps are excerpted from edgeR's source.
-            nO <- lib.sizes[x]
-            nR <- lib.sizes[ref]
-            
-            obs <- cnts[, x] %>% as.numeric()
-            ref <- cnts[, ref] %>% as.numeric()
-            
-            logR <- log2(obs/nO) - log2(ref/nR)         # log ratio of expression, accounting for library size
-            absE <- (log2(obs/nO) + log2(ref/nR))/2  # absolute expression
-            v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref   # estimated asymptotic variance
-            
-            #	remove infinite values, cutoff based on A
-            fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
-            
-            logR <- logR[fin]
-            absE <- absE[fin]
-            v <- v[fin]
-            
-            if(max(abs(logR)) < 1e-6) return(1)
-            
-            #	taken from the original mean() function
-            n <- length(logR)
-            loL <- floor(n * logratioTrim) + 1
-            hiL <- n + 1 - loL
-            loS <- floor(n * sumTrim) + 1
-            hiS <- n + 1 - loS
-            
-            keep <- (rank(logR)>=loL & rank(logR)<=hiL) & 
-              (rank(absE)>=loS & rank(absE)<=hiS)
-            
-            if(doWeighting)
-              f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
-            else
-              f <- mean(logR[keep], na.rm=TRUE)
-            
-            #	Results will be missing if the two libraries share no features with positive counts
-            #	In this case, return unity
-            if(is.na(f)) f <- 0
-            2^f
-          }) %>%
-  data.frame() %>% 
-  as.numeric() 
-
-#Rescale norm factors for convenience of interpretation.
-rescale <- TMM_norm_factors %>% 
-  log() %>%
-  mean() %>%
-  exp()
-
-TMM_norm_factors %<>% divide_by(., rescale)
-
 #Plot the normalization factors by sample.
 ggplot(y$samples %>% 
         cbind(., replicate = factor(1:2)), 
       aes(x = group, y = norm.factors, fill = replicate)) +
-  geom_bar(stat= "identity", position = position_dodge())
+  geom_col(position = position_dodge())

 #"A normalization factor below one indicates that a small number of high count genes 
 #...are monopolizing the sequencing, causing the counts for other genes to be lower 
@ -190,11 +121,11 @@ ggplot(y$samples %>%
 pch <- c(0,1,2,15,16,17)
 colors <- rep(c("darkgreen", "red", "blue"), 2)
 plotMDS(y, col=colors[group], pch=pch[group])
-legend("top", legend=levels(group), pch=pch, col=colors, ncol=2, 
-       text.width = 0.1)
+legend("top", legend=levels(group) %>% substr(., 1, 3), 
+       pch=pch, col=colors, ncol=2, cex = 0.5)

 #PCA plot
-cpm <- cpm(y, log = T, prior.count = 0.01)
+cpm <- cpm(y, log = TRUE, prior.count = 0.01)
 rv <- apply(cpm,1,var) 

 #Select genes with highest variance.
--- a/intermediate-r-rna-seq/Intermediate_RNA-seq.pdf
+++ b/intermediate-r-rna-seq/Intermediate_RNA-seq.pdf
--- a/intermediate-r-rna-seq/Intermediate_RNA-seq.pptx
+++ b/intermediate-r-rna-seq/Intermediate_RNA-seq.pptx
--- a/intermediate-r-rna-seq/Intermediate_RNA-seq.zip
+++ b/intermediate-r-rna-seq/Intermediate_RNA-seq.zip
--- a/intermediate-r-rna-seq/TMM_normalization_steps.R
+++ b/intermediate-r-rna-seq/TMM_normalization_steps.R
@ -0,0 +1,152 @@
+#The code for the main steps in the following ...
+#... are copied from the source code for edgeR::calcNormFactors.
+
+cnts <- y$counts %>% as.matrix()
+lib.sizes <- apply(cnts, 2, sum)
+
+cnts_adjst_libsize <- map_dfc(colnames(cnts), 
+                              function(x) cnts[, x]/lib.sizes[x]) %>%
+  set_colnames(., colnames(cnts))
+
+boxplot(cnts_adjst_libsize)
+
+f <- apply(cnts_adjst_libsize, 2, 
+           function(x) quantile(x, p=0.75))
+
+boxplot(cnts_adjst_libsize, ylim = c(0, 10^(-4)))
+abline(h = mean(f), lty = "dotted")
+
+ref_sample <- (f - mean(f)) %>% 
+  abs() %>% 
+  which.min()
+
+#Illustrating normalization of one sample.
+illustrate = TRUE
+if (illustrate) {
+x <- colnames(cnts)[12]
+
+nO <- lib.sizes[x]
+nR <- lib.sizes[ref_sample]
+
+obs <- cnts[, x] %>% as.numeric()
+ref <- cnts[, ref_sample] %>% as.numeric()
+
+#The M values:
+logR <- log2(obs/nO) - log2(ref/nR)         # log ratio of expression, accounting for library size
+
+#The A values:
+absE <- (log2(obs/nO) + log2(ref/nR))/2  # absolute expression
+
+#	remove infinite values, cutoff based on A
+fin <- is.finite(logR) & is.finite(absE) & (absE > -10^(10))
+
+logR <- logR[fin]
+absE <- absE[fin]
+
+print(
+ggplot(data.frame(A = absE, 
+                  M = logR), 
+       aes(A, M)) + 
+  geom_point() + 
+  geom_smooth() + 
+  coord_cartesian(xlim = c(-25, -5), ylim = c(-10.5, 10.5))
+)
+
+logratioTrim=.3
+sumTrim=0.05
+
+#Remove the genes with the 5% most extreme A values.
+#Remove the genes with the 30% most extreme M values
+n <- length(logR)
+loL <- floor(n * logratioTrim) + 1
+hiL <- n + 1 - loL
+loS <- floor(n * sumTrim) + 1
+hiS <- n + 1 - loS
+
+keep <- (rank(logR)>=loL & rank(logR)<=hiL) & 
+  (rank(absE)>=loS & rank(absE)<=hiS)
+
+print(
+  ggplot(data.frame(A = absE[keep], 
+                  M = logR[keep]), 
+       aes(A, M)) + 
+  geom_point() + 
+  coord_cartesian(xlim = c(-25, -5), ylim = c(-10.5, 10.5)) +
+  geom_hline(color = "red", linetype = "dotted", 
+           yintercept = mean(logR[keep], na.rm = T))
+)
+
+#Normalization factor for the current sample wrt to the reference sample
+f <- mean(logR[keep], na.rm=TRUE) 
+print("Normalization factor (log2 scale)")
+print(f)
+
+f <- 2^f
+print("Normalization factor (original scale)")
+print(f)
+
+#Compare with the normalization factor calculated by TMM.
+print("Normalization factor (from edgeR::calcNormFactors)")
+y$samples$norm.factors[12]
+}
+
+TMM_norm_factors <- 
+  map_dfc(colnames(cnts), 
+          function(x, logratioTrim=.3, sumTrim=0.05, 
+                   doWeighting=TRUE, Acutoff=-1e10) {
+            
+            #The following steps are excerpted from edgeR's source.
+            nO <- lib.sizes[x]
+            nR <- lib.sizes[ref_sample]
+            
+            obs <- cnts[, x] %>% as.numeric()
+            ref <- cnts[, ref_sample] %>% as.numeric()
+            
+            logR <- log2(obs/nO) - log2(ref/nR)         # log ratio of expression, accounting for library size
+            absE <- (log2(obs/nO) + log2(ref/nR))/2  # absolute expression
+            v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref   # estimated asymptotic variance
+            
+            #	remove infinite values, cutoff based on A
+            fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
+            
+            logR <- logR[fin]
+            absE <- absE[fin]
+            v <- v[fin]
+            
+            if(max(abs(logR)) < 1e-6) return(1)
+            
+            #	taken from the original mean() function
+            n <- length(logR)
+            loL <- floor(n * logratioTrim) + 1
+            hiL <- n + 1 - loL
+            loS <- floor(n * sumTrim) + 1
+            hiS <- n + 1 - loS
+            
+            keep <- (rank(logR)>=loL & rank(logR)<=hiL) & 
+              (rank(absE)>=loS & rank(absE)<=hiS)
+            
+            if(doWeighting)
+              f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
+            else
+              f <- mean(logR[keep], na.rm=TRUE)
+            
+            #	Results will be missing if the two libraries share no features with positive counts
+            #	In this case, return unity
+            if(is.na(f)) f <- 0
+            2^f
+          }) %>%
+  data.frame() %>% 
+  as.numeric() 
+
+#Rescale norm factors for convenience of interpretation.
+rescale <- TMM_norm_factors %>% 
+  log() %>%
+  mean() %>%
+  exp()
+
+TMM_norm_factors %<>% divide_by(., rescale)
+TMM_norm_factors
+
+#Compare with the output from calcNormFactors
+y$samples$norm.factors
+
--- a/intermediate-r-rna-seq/~$Intermediate_RNA-seq.pptx
+++ b/intermediate-r-rna-seq/~$Intermediate_RNA-seq.pptx