mirror of
https://github.com/gladstone-institutes/Bioinformatics-Workshops.git
synced 2025-11-30 09:45:43 -08:00
Updated the scripts and slides
This commit is contained in:
parent
d30f39831c
commit
bb9cfaf359
6 changed files with 165 additions and 82 deletions
|
|
@ -4,9 +4,11 @@ setwd("~/Dropbox (Gladstone)/Bioinformatics/Training_Workshops/Gladstone-interna
|
|||
library(magrittr)
|
||||
library(edgeR)
|
||||
library(org.Mm.eg.db)
|
||||
library(ggplot2)
|
||||
library(tidyverse)
|
||||
library(vioplot)
|
||||
|
||||
#----------------------------
|
||||
#Load and organize data.
|
||||
#----------------------------
|
||||
|
||||
phenotype_info_file <- "targets.txt"
|
||||
raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz"
|
||||
|
|
@ -15,7 +17,7 @@ raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz"
|
|||
targets <- phenotype_info_file %>%
|
||||
read.delim(., stringsAsFactors=FALSE)
|
||||
|
||||
#This is equivalent to
|
||||
#The above statement is equivalent to
|
||||
# targets <- read.delim(phenotype_info_file, stringsAsFactors = FALSE)
|
||||
|
||||
group <- targets %$%
|
||||
|
|
@ -28,8 +30,6 @@ GenewiseCounts <- raw_counts_file %>%
|
|||
|
||||
colnames(GenewiseCounts) %<>% substring(.,1,7)
|
||||
|
||||
# colnames(GenewiseCounts)[-1] <- group
|
||||
|
||||
#------------------------
|
||||
#Concept 1: MA plots
|
||||
#------------------------
|
||||
|
|
@ -81,6 +81,8 @@ cutoff <- y$samples$lib.size %>%
|
|||
divide_by(minimum_counts_reqd, .) %>%
|
||||
round(., 1)
|
||||
|
||||
#... or simply set cutoff to 0.5
|
||||
|
||||
keep <- cpm(y) %>%
|
||||
is_greater_than(., cutoff) %>%
|
||||
rowSums() %>%
|
||||
|
|
@ -97,86 +99,15 @@ y <- y[keep, , keep.lib.sizes=FALSE]
|
|||
#-------------------------
|
||||
y <- calcNormFactors(y)
|
||||
|
||||
#What's under the hood?
|
||||
#What has calcNormFactors got under the hood?
|
||||
#See the slides and TMM_normalization_steps.R
|
||||
|
||||
cnts <- y$counts %>% as.matrix()
|
||||
lib.sizes <- apply(cnts, 2, sum)
|
||||
|
||||
cnts_adjst_libsize <- map_dfc(colnames(cnts),
|
||||
function(x) cnts[, x]/lib.sizes[x]) %>%
|
||||
set_colnames(., colnames(cnts))
|
||||
|
||||
vioplot(cnts_adjst_libsize)
|
||||
#Intuitively, we should expect similar adjustments for similar samples.
|
||||
|
||||
f <- apply(cnts_adjst_libsize, 2,
|
||||
function(x) quantile(x,p=0.75))
|
||||
|
||||
ref <- (f - mean(f)) %>%
|
||||
abs() %>%
|
||||
which.min()
|
||||
|
||||
TMM_norm_factors <-
|
||||
map_dfc(colnames(cnts),
|
||||
function(x, logratioTrim=.3, sumTrim=0.05,
|
||||
doWeighting=TRUE, Acutoff=-1e10) {
|
||||
|
||||
#The following steps are excerpted from edgeR's source.
|
||||
nO <- lib.sizes[x]
|
||||
nR <- lib.sizes[ref]
|
||||
|
||||
obs <- cnts[, x] %>% as.numeric()
|
||||
ref <- cnts[, ref] %>% as.numeric()
|
||||
|
||||
logR <- log2(obs/nO) - log2(ref/nR) # log ratio of expression, accounting for library size
|
||||
absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression
|
||||
v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref # estimated asymptotic variance
|
||||
|
||||
# remove infinite values, cutoff based on A
|
||||
fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
|
||||
|
||||
logR <- logR[fin]
|
||||
absE <- absE[fin]
|
||||
v <- v[fin]
|
||||
|
||||
if(max(abs(logR)) < 1e-6) return(1)
|
||||
|
||||
# taken from the original mean() function
|
||||
n <- length(logR)
|
||||
loL <- floor(n * logratioTrim) + 1
|
||||
hiL <- n + 1 - loL
|
||||
loS <- floor(n * sumTrim) + 1
|
||||
hiS <- n + 1 - loS
|
||||
|
||||
keep <- (rank(logR)>=loL & rank(logR)<=hiL) &
|
||||
(rank(absE)>=loS & rank(absE)<=hiS)
|
||||
|
||||
if(doWeighting)
|
||||
f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
|
||||
else
|
||||
f <- mean(logR[keep], na.rm=TRUE)
|
||||
|
||||
# Results will be missing if the two libraries share no features with positive counts
|
||||
# In this case, return unity
|
||||
if(is.na(f)) f <- 0
|
||||
2^f
|
||||
}) %>%
|
||||
data.frame() %>%
|
||||
as.numeric()
|
||||
|
||||
#Rescale norm factors for convenience of interpretation.
|
||||
rescale <- TMM_norm_factors %>%
|
||||
log() %>%
|
||||
mean() %>%
|
||||
exp()
|
||||
|
||||
TMM_norm_factors %<>% divide_by(., rescale)
|
||||
|
||||
#Plot the normalization factors by sample.
|
||||
ggplot(y$samples %>%
|
||||
cbind(., replicate = factor(1:2)),
|
||||
aes(x = group, y = norm.factors, fill = replicate)) +
|
||||
geom_bar(stat= "identity", position = position_dodge())
|
||||
geom_col(position = position_dodge())
|
||||
|
||||
#"A normalization factor below one indicates that a small number of high count genes
|
||||
#...are monopolizing the sequencing, causing the counts for other genes to be lower
|
||||
|
|
@ -190,11 +121,11 @@ ggplot(y$samples %>%
|
|||
pch <- c(0,1,2,15,16,17)
|
||||
colors <- rep(c("darkgreen", "red", "blue"), 2)
|
||||
plotMDS(y, col=colors[group], pch=pch[group])
|
||||
legend("top", legend=levels(group), pch=pch, col=colors, ncol=2,
|
||||
text.width = 0.1)
|
||||
legend("top", legend=levels(group) %>% substr(., 1, 3),
|
||||
pch=pch, col=colors, ncol=2, cex = 0.5)
|
||||
|
||||
#PCA plot
|
||||
cpm <- cpm(y, log = T, prior.count = 0.01)
|
||||
cpm <- cpm(y, log = TRUE, prior.count = 0.01)
|
||||
rv <- apply(cpm,1,var)
|
||||
|
||||
#Select genes with highest variance.
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
152
intermediate-r-rna-seq/TMM_normalization_steps.R
Normal file
152
intermediate-r-rna-seq/TMM_normalization_steps.R
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
#The code for the main steps in the following ...
|
||||
#... are copied from the source code for edgeR::calcNormFactors.
|
||||
|
||||
cnts <- y$counts %>% as.matrix()
|
||||
lib.sizes <- apply(cnts, 2, sum)
|
||||
|
||||
cnts_adjst_libsize <- map_dfc(colnames(cnts),
|
||||
function(x) cnts[, x]/lib.sizes[x]) %>%
|
||||
set_colnames(., colnames(cnts))
|
||||
|
||||
boxplot(cnts_adjst_libsize)
|
||||
|
||||
f <- apply(cnts_adjst_libsize, 2,
|
||||
function(x) quantile(x, p=0.75))
|
||||
|
||||
boxplot(cnts_adjst_libsize, ylim = c(0, 10^(-4)))
|
||||
abline(h = mean(f), lty = "dotted")
|
||||
|
||||
ref_sample <- (f - mean(f)) %>%
|
||||
abs() %>%
|
||||
which.min()
|
||||
|
||||
#Illustrating normalization of one sample.
|
||||
illustrate = TRUE
|
||||
if (illustrate) {
|
||||
x <- colnames(cnts)[12]
|
||||
|
||||
nO <- lib.sizes[x]
|
||||
nR <- lib.sizes[ref_sample]
|
||||
|
||||
obs <- cnts[, x] %>% as.numeric()
|
||||
ref <- cnts[, ref_sample] %>% as.numeric()
|
||||
|
||||
#The M values:
|
||||
logR <- log2(obs/nO) - log2(ref/nR) # log ratio of expression, accounting for library size
|
||||
|
||||
#The A values:
|
||||
absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression
|
||||
|
||||
# remove infinite values, cutoff based on A
|
||||
fin <- is.finite(logR) & is.finite(absE) & (absE > -10^(10))
|
||||
|
||||
logR <- logR[fin]
|
||||
absE <- absE[fin]
|
||||
|
||||
print(
|
||||
ggplot(data.frame(A = absE,
|
||||
M = logR),
|
||||
aes(A, M)) +
|
||||
geom_point() +
|
||||
geom_smooth() +
|
||||
coord_cartesian(xlim = c(-25, -5), ylim = c(-10.5, 10.5))
|
||||
)
|
||||
|
||||
logratioTrim=.3
|
||||
sumTrim=0.05
|
||||
|
||||
#Remove the genes with the 5% most extreme A values.
|
||||
#Remove the genes with the 30% most extreme M values
|
||||
n <- length(logR)
|
||||
loL <- floor(n * logratioTrim) + 1
|
||||
hiL <- n + 1 - loL
|
||||
loS <- floor(n * sumTrim) + 1
|
||||
hiS <- n + 1 - loS
|
||||
|
||||
keep <- (rank(logR)>=loL & rank(logR)<=hiL) &
|
||||
(rank(absE)>=loS & rank(absE)<=hiS)
|
||||
|
||||
print(
|
||||
ggplot(data.frame(A = absE[keep],
|
||||
M = logR[keep]),
|
||||
aes(A, M)) +
|
||||
geom_point() +
|
||||
coord_cartesian(xlim = c(-25, -5), ylim = c(-10.5, 10.5)) +
|
||||
geom_hline(color = "red", linetype = "dotted",
|
||||
yintercept = mean(logR[keep], na.rm = T))
|
||||
)
|
||||
|
||||
#Normalization factor for the current sample wrt to the reference sample
|
||||
f <- mean(logR[keep], na.rm=TRUE)
|
||||
print("Normalization factor (log2 scale)")
|
||||
print(f)
|
||||
|
||||
f <- 2^f
|
||||
print("Normalization factor (original scale)")
|
||||
print(f)
|
||||
|
||||
#Compare with the normalization factor calculated by TMM.
|
||||
print("Normalization factor (from edgeR::calcNormFactors)")
|
||||
y$samples$norm.factors[12]
|
||||
}
|
||||
|
||||
TMM_norm_factors <-
|
||||
map_dfc(colnames(cnts),
|
||||
function(x, logratioTrim=.3, sumTrim=0.05,
|
||||
doWeighting=TRUE, Acutoff=-1e10) {
|
||||
|
||||
#The following steps are excerpted from edgeR's source.
|
||||
nO <- lib.sizes[x]
|
||||
nR <- lib.sizes[ref_sample]
|
||||
|
||||
obs <- cnts[, x] %>% as.numeric()
|
||||
ref <- cnts[, ref_sample] %>% as.numeric()
|
||||
|
||||
logR <- log2(obs/nO) - log2(ref/nR) # log ratio of expression, accounting for library size
|
||||
absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression
|
||||
v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref # estimated asymptotic variance
|
||||
|
||||
# remove infinite values, cutoff based on A
|
||||
fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
|
||||
|
||||
logR <- logR[fin]
|
||||
absE <- absE[fin]
|
||||
v <- v[fin]
|
||||
|
||||
if(max(abs(logR)) < 1e-6) return(1)
|
||||
|
||||
# taken from the original mean() function
|
||||
n <- length(logR)
|
||||
loL <- floor(n * logratioTrim) + 1
|
||||
hiL <- n + 1 - loL
|
||||
loS <- floor(n * sumTrim) + 1
|
||||
hiS <- n + 1 - loS
|
||||
|
||||
keep <- (rank(logR)>=loL & rank(logR)<=hiL) &
|
||||
(rank(absE)>=loS & rank(absE)<=hiS)
|
||||
|
||||
if(doWeighting)
|
||||
f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
|
||||
else
|
||||
f <- mean(logR[keep], na.rm=TRUE)
|
||||
|
||||
# Results will be missing if the two libraries share no features with positive counts
|
||||
# In this case, return unity
|
||||
if(is.na(f)) f <- 0
|
||||
2^f
|
||||
}) %>%
|
||||
data.frame() %>%
|
||||
as.numeric()
|
||||
|
||||
#Rescale norm factors for convenience of interpretation.
|
||||
rescale <- TMM_norm_factors %>%
|
||||
log() %>%
|
||||
mean() %>%
|
||||
exp()
|
||||
|
||||
TMM_norm_factors %<>% divide_by(., rescale)
|
||||
TMM_norm_factors
|
||||
|
||||
#Compare with the output from calcNormFactors
|
||||
y$samples$norm.factors
|
||||
|
||||
BIN
intermediate-r-rna-seq/~$Intermediate_RNA-seq.pptx
Normal file
BIN
intermediate-r-rna-seq/~$Intermediate_RNA-seq.pptx
Normal file
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue