Gladstone-Bioinformatics-Wo.../intermediate-r-rna-seq/TMM_normalization_steps.R

#The code for the main steps in the following ...
#... are copied from the source code for edgeR::calcNormFactors.

cnts <- y$counts %>% as.matrix()
lib.sizes <- apply(cnts, 2, sum)

cnts_adjst_libsize <- map_dfc(colnames(cnts),
                              function(x) cnts[, x]/lib.sizes[x]) %>%
  set_colnames(., colnames(cnts))

boxplot(cnts_adjst_libsize)

f <- apply(cnts_adjst_libsize, 2,
           function(x) quantile(x, p=0.75))

boxplot(cnts_adjst_libsize, ylim = c(0, 10^(-4)))
abline(h = mean(f), lty = "dotted")

ref_sample <- (f - mean(f)) %>%
  abs() %>%
  which.min()

#Illustrating normalization of one sample.
illustrate = TRUE
if (illustrate) {
x <- colnames(cnts)[12]

nO <- lib.sizes[x]
nR <- lib.sizes[ref_sample]

obs <- cnts[, x] %>% as.numeric()
ref <- cnts[, ref_sample] %>% as.numeric()

#The M values:
logR <- log2(obs/nO) - log2(ref/nR)         # log ratio of expression, accounting for library size

#The A values:
absE <- (log2(obs/nO) + log2(ref/nR))/2  # absolute expression

#	remove infinite values, cutoff based on A
fin <- is.finite(logR) & is.finite(absE) & (absE > -10^(10))

logR <- logR[fin]
absE <- absE[fin]

print(
ggplot(data.frame(A = absE,
                  M = logR),
       aes(A, M)) +
  geom_point() +
  geom_smooth() +
  coord_cartesian(xlim = c(-25, -5), ylim = c(-10.5, 10.5))
)

logratioTrim=.3
sumTrim=0.05

#Remove the genes with the 5% most extreme A values.
#Remove the genes with the 30% most extreme M values
n <- length(logR)
loL <- floor(n * logratioTrim) + 1
hiL <- n + 1 - loL
loS <- floor(n * sumTrim) + 1
hiS <- n + 1 - loS

keep <- (rank(logR)>=loL & rank(logR)<=hiL) &
  (rank(absE)>=loS & rank(absE)<=hiS)

print(
  ggplot(data.frame(A = absE[keep],
                  M = logR[keep]),
       aes(A, M)) +
  geom_point() +
  coord_cartesian(xlim = c(-25, -5), ylim = c(-10.5, 10.5)) +
  geom_hline(color = "red", linetype = "dotted",
           yintercept = mean(logR[keep], na.rm = T))
)

#Normalization factor for the current sample wrt to the reference sample
f <- mean(logR[keep], na.rm=TRUE)
print("Normalization factor (log2 scale)")
print(f)

f <- 2^f
print("Normalization factor (original scale)")
print(f)

#Compare with the normalization factor calculated by TMM.
print("Normalization factor (from edgeR::calcNormFactors)")
y$samples$norm.factors[12]
}

TMM_norm_factors <-
  map_dfc(colnames(cnts),
          function(x, logratioTrim=.3, sumTrim=0.05,
                   doWeighting=TRUE, Acutoff=-1e10) {

            #The following steps are excerpted from edgeR's source.
            nO <- lib.sizes[x]
            nR <- lib.sizes[ref_sample]

            obs <- cnts[, x] %>% as.numeric()
            ref <- cnts[, ref_sample] %>% as.numeric()

            logR <- log2(obs/nO) - log2(ref/nR)         # log ratio of expression, accounting for library size
            absE <- (log2(obs/nO) + log2(ref/nR))/2  # absolute expression
            v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref   # estimated asymptotic variance

            #	remove infinite values, cutoff based on A
            fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)

            logR <- logR[fin]
            absE <- absE[fin]
            v <- v[fin]

            if(max(abs(logR)) < 1e-6) return(1)

            #	taken from the original mean() function
            n <- length(logR)
            loL <- floor(n * logratioTrim) + 1
            hiL <- n + 1 - loL
            loS <- floor(n * sumTrim) + 1
            hiS <- n + 1 - loS

            keep <- (rank(logR)>=loL & rank(logR)<=hiL) &
              (rank(absE)>=loS & rank(absE)<=hiS)

            if(doWeighting)
              f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
            else
              f <- mean(logR[keep], na.rm=TRUE)

            #	Results will be missing if the two libraries share no features with positive counts
            #	In this case, return unity
            if(is.na(f)) f <- 0
            2^f
          }) %>%
  data.frame() %>%
  as.numeric()

#Rescale norm factors for convenience of interpretation.
rescale <- TMM_norm_factors %>%
  log() %>%
  mean() %>%
  exp()

TMM_norm_factors %<>% divide_by(., rescale)
TMM_norm_factors

#Compare with the output from calcNormFactors
y$samples$norm.factors