Update intermediate-r-rna-seq materials

2025-11-30 09:45:43 -08:00 · 2020-03-25 00:16:22 -07:00 · 2020-03-25 00:16:22 -07:00 · a71e347d5a
commit a71e347d5a
parent 8bcc850ecf
7 changed files with 15874 additions and 0 deletions
--- a/intermediate-r-rna-seq/Analysis.R
+++ b/intermediate-r-rna-seq/Analysis.R
@ -0,0 +1,288 @@
+#setwd()
+setwd("~/Dropbox (Gladstone)/Bioinformatics/Training_Workshops/Gladstone-internal/Intermediate_RNA-seq_Fall_2019")
+
+library(magrittr)
+library(edgeR)
+library(org.Mm.eg.db)
+library(ggplot2)
+library(tidyverse)
+library(vioplot)
+
+phenotype_info_file <- "targets.txt"
+raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz"
+
+#12 samples and 6 categories
+targets <- phenotype_info_file %>%
+  read.delim(., stringsAsFactors=FALSE)
+
+#This is equivalent to 
+# targets <- read.delim(phenotype_info_file, stringsAsFactors = FALSE)
+
+group <- targets %$%
+  paste(CellType, Status, sep = ".") %>%
+  factor()
+
+#Length of a gene is the total number of bases in exons and UTRs for that gene.
+GenewiseCounts <- raw_counts_file %>%
+  read.delim(., row.names="EntrezGeneID")
+
+colnames(GenewiseCounts) %<>% substring(.,1,7)
+
+# colnames(GenewiseCounts)[-1] <- group
+
+#------------------------
+#Concept 1: MA plots
+#------------------------
+
+two_samples <- GenewiseCounts[, c(2, 3)] %>% #Replicate samples
+  add(., 1) %>%
+  log2()
+
+plotData <- data.frame(M = two_samples[, 1] - two_samples[, 2],
+                       A = (two_samples[, 1] + two_samples[, 2])/2)
+
+ggplot(plotData, aes(x = A, y = M)) +
+  geom_point() +
+  geom_smooth()
+
+#------------------------
+#Create DGElist object and retrieve gene symbols.
+#------------------------
+y <- DGEList(counts = GenewiseCounts[,-1], 
+             group=group,
+             genes=GenewiseCounts[,1,drop=FALSE])
+
+y$genes$Symbol <- mapIds(org.Mm.eg.db,
+                         keys = rownames(y),
+                         keytype="ENTREZID", 
+                         column="SYMBOL")
+
+#------------------------
+#Independent filtering.
+#------------------------
+
+#Filter genes whose symbols are not found.
+keep <- y$genes$Symbol %>%
+  is.na() %>%
+  not()
+
+y <- y[keep, ]
+
+
+#It is not possible to make reliable inference for genes ...
+#... with very low counts.
+#Can filter directly by counts but ...
+#... better to account for difference in library sizes.
+
+minimum_counts_reqd <- 10
+cutoff <- y$samples$lib.size %>% 
+  median() %>% 
+  divide_by(., 10^6) %>% 
+  divide_by(minimum_counts_reqd, .) %>% 
+  round(., 1)
+
+keep <- cpm(y) %>%
+  is_greater_than(., cutoff) %>%
+  rowSums() %>%
+  is_weakly_greater_than(., 2)
+
+#Alternatively, use counts to filter.
+#Examples:
+#keep <- rowSums(y$counts) > 50
+
+y <- y[keep, , keep.lib.sizes=FALSE]
+
+#-------------------------
+#Normalizing the counts.
+#-------------------------
+y <- calcNormFactors(y)
+
+#What's under the hood?
+
+cnts <- y$counts %>% as.matrix()
+lib.sizes <- apply(cnts, 2, sum)
+
+cnts_adjst_libsize <- map_dfc(colnames(cnts), 
+                              function(x) cnts[, x]/lib.sizes[x]) %>%
+  set_colnames(., colnames(cnts))
+
+vioplot(cnts_adjst_libsize)
+#Intuitively, we should expect similar adjustments for similar samples.
+
+f <- apply(cnts_adjst_libsize, 2, 
+           function(x) quantile(x,p=0.75))
+
+ref <- (f - mean(f)) %>% 
+  abs() %>% 
+  which.min()
+
+TMM_norm_factors <- 
+  map_dfc(colnames(cnts), 
+          function(x, logratioTrim=.3, sumTrim=0.05, 
+                   doWeighting=TRUE, Acutoff=-1e10) {
+            
+            #The following steps are excerpted from edgeR's source.
+            nO <- lib.sizes[x]
+            nR <- lib.sizes[ref]
+            
+            obs <- cnts[, x] %>% as.numeric()
+            ref <- cnts[, ref] %>% as.numeric()
+            
+            logR <- log2(obs/nO) - log2(ref/nR)         # log ratio of expression, accounting for library size
+            absE <- (log2(obs/nO) + log2(ref/nR))/2  # absolute expression
+            v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref   # estimated asymptotic variance
+            
+            #	remove infinite values, cutoff based on A
+            fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
+            
+            logR <- logR[fin]
+            absE <- absE[fin]
+            v <- v[fin]
+            
+            if(max(abs(logR)) < 1e-6) return(1)
+            
+            #	taken from the original mean() function
+            n <- length(logR)
+            loL <- floor(n * logratioTrim) + 1
+            hiL <- n + 1 - loL
+            loS <- floor(n * sumTrim) + 1
+            hiS <- n + 1 - loS
+            
+            keep <- (rank(logR)>=loL & rank(logR)<=hiL) & 
+              (rank(absE)>=loS & rank(absE)<=hiS)
+            
+            if(doWeighting)
+              f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
+            else
+              f <- mean(logR[keep], na.rm=TRUE)
+            
+            #	Results will be missing if the two libraries share no features with positive counts
+            #	In this case, return unity
+            if(is.na(f)) f <- 0
+            2^f
+          }) %>%
+  data.frame() %>% 
+  as.numeric() 
+
+#Rescale norm factors for convenience of interpretation.
+rescale <- TMM_norm_factors %>% 
+  log() %>%
+  mean() %>%
+  exp()
+
+TMM_norm_factors %<>% divide_by(., rescale)
+
+#Plot the normalization factors by sample.
+ggplot(y$samples %>% 
+         cbind(., replicate = factor(1:2)), 
+       aes(x = group, y = norm.factors, fill = replicate)) +
+  geom_bar(stat= "identity", position = position_dodge())
+
+#"A normalization factor below one indicates that a small number of high count genes 
+#...are monopolizing the sequencing, causing the counts for other genes to be lower 
+#...than would be usual given the library size." Chen et al., 2016
+#Looks like L.lactating samples contain a number of very highly upregulated genes.
+
+#------------------------
+#Exploratory visualizations
+#------------------------
+#MDS plot
+pch <- c(0,1,2,15,16,17)
+colors <- rep(c("darkgreen", "red", "blue"), 2)
+plotMDS(y, col=colors[group], pch=pch[group])
+legend("top", legend=levels(group), pch=pch, col=colors, ncol=2, 
+       text.width = 0.1)
+
+#PCA plot
+cpm <- cpm(y, log = T, prior.count = 0.01)
+rv <- apply(cpm,1,var) 
+
+#Select genes with highest variance.
+keep <- order(rv, decreasing = TRUE)[1:500]
+selected <- cpm[keep, ] %>% t()
+
+#Transpose is needed to ensure that each row is a vector.
+pca <- prcomp(selected, scale=T, center = T)
+
+stddev <- pca$sdev
+pc1_var <- round(100*stddev[1]^2/sum(stddev^2))
+pc2_var <- round(100*stddev[2]^2/sum(stddev^2))
+pc3_var <- round(100*stddev[3]^2/sum(stddev^2))
+PlotData <- data.frame(cbind(PC1 = pca$x[,1], PC2 = pca$x[,2]))
+PlotData <- targets[, c("CellType", "Status")] %>%
+  cbind(PlotData, .)
+
+ggplot(PlotData, aes(x=PC1, y=PC2, color=CellType, shape=Status)) + 
+  geom_point(size=4.5) +  
+  xlab(paste("PC1:", pc1_var, "% variance")) + 
+  ylab(paste("PC2:", pc2_var, "% variance"))
+
+#--------------------
+#Fitting the model.
+#--------------------
+design <- model.matrix(~0+group) %>%   
+  set_colnames(., levels(group))
+
+y <- estimateDisp(y, design, robust=TRUE)
+plotBCV(y)
+
+fit <- glmQLFit(y, design, robust=TRUE) 
+head(fit$coefficients)
+
+plotQLDisp(fit) 
+
+#--------------------
+#Hypothesis testing
+#--------------------
+
+#Example 1:
+B.LvsP <- makeContrasts(B.lactating-B.pregnant, levels=design)
+res <- glmQLFTest(fit, contrast=B.LvsP)
+topTags(res)
+is.de <- decideTestsDGE(res)
+summary(is.de)
+plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
+       legend="topright")
+
+#Example 2:
+B.LvsP <- makeContrasts(B.lactating-B.pregnant, levels=design)
+res <- glmTreat(fit, contrast=B.LvsP, lfc=log2(1.5))
+topTags(res)
+is.de <- decideTestsDGE(res)
+summary(is.de)
+plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
+       legend="topright")
+
+#Example 3:
+con <- makeContrasts(
+  (L.lactating-L.pregnant)-(B.lactating-B.pregnant),
+  levels=design)
+res <- glmQLFTest(fit, contrast=con)
+topTags(res)
+is.de <- decideTestsDGE(res)
+summary(is.de)
+plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
+       legend="topright")
+
+#Example 4:
+con <- makeContrasts(
+  L.PvsL = L.pregnant - L.lactating,
+  L.VvsL = L.virgin - L.lactating,
+  L.VvsP = L.virgin - L.pregnant, levels=design)
+res <- glmQLFTest(fit, contrast=con)
+topTags(res)
+is.de <- decideTestsDGE(res)
+summary(is.de)
+
+#----------------------------
+#Save results in a table
+result <- topTags(res, n = nrow(y$counts)) %>%
+  data.frame()
+write.table(result, 
+            file = "DE_result.txt",
+            quote = FALSE,
+            sep = "\t",
+            row.names = FALSE)
+
+
+
--- a/intermediate-r-rna-seq/DE_result.txt
+++ b/intermediate-r-rna-seq/DE_result.txt
--- a/intermediate-r-rna-seq/DE_result.xlsx
+++ b/intermediate-r-rna-seq/DE_result.xlsx
--- a/intermediate-r-rna-seq/GSE60450_Lactation-GenewiseCounts.txt.gz
+++ b/intermediate-r-rna-seq/GSE60450_Lactation-GenewiseCounts.txt.gz
--- a/intermediate-r-rna-seq/Intermediate_RNA-seq.pdf
+++ b/intermediate-r-rna-seq/Intermediate_RNA-seq.pdf
--- a/intermediate-r-rna-seq/Intermediate_RNA-seq.pptx
+++ b/intermediate-r-rna-seq/Intermediate_RNA-seq.pptx
--- a/intermediate-r-rna-seq/targets.txt
+++ b/intermediate-r-rna-seq/targets.txt
@ -0,0 +1,13 @@
+GEO	SRA	CellType	Status
+MCL1.DG	GSM1480297	SRR1552450	B	virgin
+MCL1.DH	GSM1480298	SRR1552451	B	virgin
+MCL1.DI	GSM1480299	SRR1552452	B	pregnant
+MCL1.DJ	GSM1480300	SRR1552453	B	pregnant
+MCL1.DK	GSM1480301	SRR1552454	B	lactating
+MCL1.DL	GSM1480302	SRR1552455	B	lactating
+MCL1.LA	GSM1480291	SRR1552444	L	virgin
+MCL1.LB	GSM1480292	SRR1552445	L	virgin
+MCL1.LC	GSM1480293	SRR1552446	L	pregnant
+MCL1.LD	GSM1480294	SRR1552447	L	pregnant
+MCL1.LE	GSM1480295	SRR1552448	L	lactating
+MCL1.LF	GSM1480296	SRR1552449	L	lactating