mirror of
https://github.com/gladstone-institutes/Bioinformatics-Workshops.git
synced 2025-11-30 09:45:43 -08:00
Update intermediate-r-rna-seq materials
This commit is contained in:
parent
8bcc850ecf
commit
a71e347d5a
7 changed files with 15874 additions and 0 deletions
288
intermediate-r-rna-seq/Analysis.R
Normal file
288
intermediate-r-rna-seq/Analysis.R
Normal file
|
|
@ -0,0 +1,288 @@
|
|||
#setwd()
|
||||
setwd("~/Dropbox (Gladstone)/Bioinformatics/Training_Workshops/Gladstone-internal/Intermediate_RNA-seq_Fall_2019")
|
||||
|
||||
library(magrittr)
|
||||
library(edgeR)
|
||||
library(org.Mm.eg.db)
|
||||
library(ggplot2)
|
||||
library(tidyverse)
|
||||
library(vioplot)
|
||||
|
||||
phenotype_info_file <- "targets.txt"
|
||||
raw_counts_file <- "GSE60450_Lactation-GenewiseCounts.txt.gz"
|
||||
|
||||
#12 samples and 6 categories
|
||||
targets <- phenotype_info_file %>%
|
||||
read.delim(., stringsAsFactors=FALSE)
|
||||
|
||||
#This is equivalent to
|
||||
# targets <- read.delim(phenotype_info_file, stringsAsFactors = FALSE)
|
||||
|
||||
group <- targets %$%
|
||||
paste(CellType, Status, sep = ".") %>%
|
||||
factor()
|
||||
|
||||
#Length of a gene is the total number of bases in exons and UTRs for that gene.
|
||||
GenewiseCounts <- raw_counts_file %>%
|
||||
read.delim(., row.names="EntrezGeneID")
|
||||
|
||||
colnames(GenewiseCounts) %<>% substring(.,1,7)
|
||||
|
||||
# colnames(GenewiseCounts)[-1] <- group
|
||||
|
||||
#------------------------
|
||||
#Concept 1: MA plots
|
||||
#------------------------
|
||||
|
||||
two_samples <- GenewiseCounts[, c(2, 3)] %>% #Replicate samples
|
||||
add(., 1) %>%
|
||||
log2()
|
||||
|
||||
plotData <- data.frame(M = two_samples[, 1] - two_samples[, 2],
|
||||
A = (two_samples[, 1] + two_samples[, 2])/2)
|
||||
|
||||
ggplot(plotData, aes(x = A, y = M)) +
|
||||
geom_point() +
|
||||
geom_smooth()
|
||||
|
||||
#------------------------
|
||||
#Create DGElist object and retrieve gene symbols.
|
||||
#------------------------
|
||||
y <- DGEList(counts = GenewiseCounts[,-1],
|
||||
group=group,
|
||||
genes=GenewiseCounts[,1,drop=FALSE])
|
||||
|
||||
y$genes$Symbol <- mapIds(org.Mm.eg.db,
|
||||
keys = rownames(y),
|
||||
keytype="ENTREZID",
|
||||
column="SYMBOL")
|
||||
|
||||
#------------------------
|
||||
#Independent filtering.
|
||||
#------------------------
|
||||
|
||||
#Filter genes whose symbols are not found.
|
||||
keep <- y$genes$Symbol %>%
|
||||
is.na() %>%
|
||||
not()
|
||||
|
||||
y <- y[keep, ]
|
||||
|
||||
|
||||
#It is not possible to make reliable inference for genes ...
|
||||
#... with very low counts.
|
||||
#Can filter directly by counts but ...
|
||||
#... better to account for difference in library sizes.
|
||||
|
||||
minimum_counts_reqd <- 10
|
||||
cutoff <- y$samples$lib.size %>%
|
||||
median() %>%
|
||||
divide_by(., 10^6) %>%
|
||||
divide_by(minimum_counts_reqd, .) %>%
|
||||
round(., 1)
|
||||
|
||||
keep <- cpm(y) %>%
|
||||
is_greater_than(., cutoff) %>%
|
||||
rowSums() %>%
|
||||
is_weakly_greater_than(., 2)
|
||||
|
||||
#Alternatively, use counts to filter.
|
||||
#Examples:
|
||||
#keep <- rowSums(y$counts) > 50
|
||||
|
||||
y <- y[keep, , keep.lib.sizes=FALSE]
|
||||
|
||||
#-------------------------
|
||||
#Normalizing the counts.
|
||||
#-------------------------
|
||||
y <- calcNormFactors(y)
|
||||
|
||||
#What's under the hood?
|
||||
|
||||
cnts <- y$counts %>% as.matrix()
|
||||
lib.sizes <- apply(cnts, 2, sum)
|
||||
|
||||
cnts_adjst_libsize <- map_dfc(colnames(cnts),
|
||||
function(x) cnts[, x]/lib.sizes[x]) %>%
|
||||
set_colnames(., colnames(cnts))
|
||||
|
||||
vioplot(cnts_adjst_libsize)
|
||||
#Intuitively, we should expect similar adjustments for similar samples.
|
||||
|
||||
f <- apply(cnts_adjst_libsize, 2,
|
||||
function(x) quantile(x,p=0.75))
|
||||
|
||||
ref <- (f - mean(f)) %>%
|
||||
abs() %>%
|
||||
which.min()
|
||||
|
||||
TMM_norm_factors <-
|
||||
map_dfc(colnames(cnts),
|
||||
function(x, logratioTrim=.3, sumTrim=0.05,
|
||||
doWeighting=TRUE, Acutoff=-1e10) {
|
||||
|
||||
#The following steps are excerpted from edgeR's source.
|
||||
nO <- lib.sizes[x]
|
||||
nR <- lib.sizes[ref]
|
||||
|
||||
obs <- cnts[, x] %>% as.numeric()
|
||||
ref <- cnts[, ref] %>% as.numeric()
|
||||
|
||||
logR <- log2(obs/nO) - log2(ref/nR) # log ratio of expression, accounting for library size
|
||||
absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression
|
||||
v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref # estimated asymptotic variance
|
||||
|
||||
# remove infinite values, cutoff based on A
|
||||
fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
|
||||
|
||||
logR <- logR[fin]
|
||||
absE <- absE[fin]
|
||||
v <- v[fin]
|
||||
|
||||
if(max(abs(logR)) < 1e-6) return(1)
|
||||
|
||||
# taken from the original mean() function
|
||||
n <- length(logR)
|
||||
loL <- floor(n * logratioTrim) + 1
|
||||
hiL <- n + 1 - loL
|
||||
loS <- floor(n * sumTrim) + 1
|
||||
hiS <- n + 1 - loS
|
||||
|
||||
keep <- (rank(logR)>=loL & rank(logR)<=hiL) &
|
||||
(rank(absE)>=loS & rank(absE)<=hiS)
|
||||
|
||||
if(doWeighting)
|
||||
f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
|
||||
else
|
||||
f <- mean(logR[keep], na.rm=TRUE)
|
||||
|
||||
# Results will be missing if the two libraries share no features with positive counts
|
||||
# In this case, return unity
|
||||
if(is.na(f)) f <- 0
|
||||
2^f
|
||||
}) %>%
|
||||
data.frame() %>%
|
||||
as.numeric()
|
||||
|
||||
#Rescale norm factors for convenience of interpretation.
|
||||
rescale <- TMM_norm_factors %>%
|
||||
log() %>%
|
||||
mean() %>%
|
||||
exp()
|
||||
|
||||
TMM_norm_factors %<>% divide_by(., rescale)
|
||||
|
||||
#Plot the normalization factors by sample.
|
||||
ggplot(y$samples %>%
|
||||
cbind(., replicate = factor(1:2)),
|
||||
aes(x = group, y = norm.factors, fill = replicate)) +
|
||||
geom_bar(stat= "identity", position = position_dodge())
|
||||
|
||||
#"A normalization factor below one indicates that a small number of high count genes
|
||||
#...are monopolizing the sequencing, causing the counts for other genes to be lower
|
||||
#...than would be usual given the library size." Chen et al., 2016
|
||||
#Looks like L.lactating samples contain a number of very highly upregulated genes.
|
||||
|
||||
#------------------------
|
||||
#Exploratory visualizations
|
||||
#------------------------
|
||||
#MDS plot
|
||||
pch <- c(0,1,2,15,16,17)
|
||||
colors <- rep(c("darkgreen", "red", "blue"), 2)
|
||||
plotMDS(y, col=colors[group], pch=pch[group])
|
||||
legend("top", legend=levels(group), pch=pch, col=colors, ncol=2,
|
||||
text.width = 0.1)
|
||||
|
||||
#PCA plot
|
||||
cpm <- cpm(y, log = T, prior.count = 0.01)
|
||||
rv <- apply(cpm,1,var)
|
||||
|
||||
#Select genes with highest variance.
|
||||
keep <- order(rv, decreasing = TRUE)[1:500]
|
||||
selected <- cpm[keep, ] %>% t()
|
||||
|
||||
#Transpose is needed to ensure that each row is a vector.
|
||||
pca <- prcomp(selected, scale=T, center = T)
|
||||
|
||||
stddev <- pca$sdev
|
||||
pc1_var <- round(100*stddev[1]^2/sum(stddev^2))
|
||||
pc2_var <- round(100*stddev[2]^2/sum(stddev^2))
|
||||
pc3_var <- round(100*stddev[3]^2/sum(stddev^2))
|
||||
PlotData <- data.frame(cbind(PC1 = pca$x[,1], PC2 = pca$x[,2]))
|
||||
PlotData <- targets[, c("CellType", "Status")] %>%
|
||||
cbind(PlotData, .)
|
||||
|
||||
ggplot(PlotData, aes(x=PC1, y=PC2, color=CellType, shape=Status)) +
|
||||
geom_point(size=4.5) +
|
||||
xlab(paste("PC1:", pc1_var, "% variance")) +
|
||||
ylab(paste("PC2:", pc2_var, "% variance"))
|
||||
|
||||
#--------------------
|
||||
#Fitting the model.
|
||||
#--------------------
|
||||
design <- model.matrix(~0+group) %>%
|
||||
set_colnames(., levels(group))
|
||||
|
||||
y <- estimateDisp(y, design, robust=TRUE)
|
||||
plotBCV(y)
|
||||
|
||||
fit <- glmQLFit(y, design, robust=TRUE)
|
||||
head(fit$coefficients)
|
||||
|
||||
plotQLDisp(fit)
|
||||
|
||||
#--------------------
|
||||
#Hypothesis testing
|
||||
#--------------------
|
||||
|
||||
#Example 1:
|
||||
B.LvsP <- makeContrasts(B.lactating-B.pregnant, levels=design)
|
||||
res <- glmQLFTest(fit, contrast=B.LvsP)
|
||||
topTags(res)
|
||||
is.de <- decideTestsDGE(res)
|
||||
summary(is.de)
|
||||
plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
|
||||
legend="topright")
|
||||
|
||||
#Example 2:
|
||||
B.LvsP <- makeContrasts(B.lactating-B.pregnant, levels=design)
|
||||
res <- glmTreat(fit, contrast=B.LvsP, lfc=log2(1.5))
|
||||
topTags(res)
|
||||
is.de <- decideTestsDGE(res)
|
||||
summary(is.de)
|
||||
plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
|
||||
legend="topright")
|
||||
|
||||
#Example 3:
|
||||
con <- makeContrasts(
|
||||
(L.lactating-L.pregnant)-(B.lactating-B.pregnant),
|
||||
levels=design)
|
||||
res <- glmQLFTest(fit, contrast=con)
|
||||
topTags(res)
|
||||
is.de <- decideTestsDGE(res)
|
||||
summary(is.de)
|
||||
plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
|
||||
legend="topright")
|
||||
|
||||
#Example 4:
|
||||
con <- makeContrasts(
|
||||
L.PvsL = L.pregnant - L.lactating,
|
||||
L.VvsL = L.virgin - L.lactating,
|
||||
L.VvsP = L.virgin - L.pregnant, levels=design)
|
||||
res <- glmQLFTest(fit, contrast=con)
|
||||
topTags(res)
|
||||
is.de <- decideTestsDGE(res)
|
||||
summary(is.de)
|
||||
|
||||
#----------------------------
|
||||
#Save results in a table
|
||||
result <- topTags(res, n = nrow(y$counts)) %>%
|
||||
data.frame()
|
||||
write.table(result,
|
||||
file = "DE_result.txt",
|
||||
quote = FALSE,
|
||||
sep = "\t",
|
||||
row.names = FALSE)
|
||||
|
||||
|
||||
|
||||
15573
intermediate-r-rna-seq/DE_result.txt
Normal file
15573
intermediate-r-rna-seq/DE_result.txt
Normal file
File diff suppressed because it is too large
Load diff
BIN
intermediate-r-rna-seq/DE_result.xlsx
Normal file
BIN
intermediate-r-rna-seq/DE_result.xlsx
Normal file
Binary file not shown.
BIN
intermediate-r-rna-seq/GSE60450_Lactation-GenewiseCounts.txt.gz
Normal file
BIN
intermediate-r-rna-seq/GSE60450_Lactation-GenewiseCounts.txt.gz
Normal file
Binary file not shown.
BIN
intermediate-r-rna-seq/Intermediate_RNA-seq.pdf
Normal file
BIN
intermediate-r-rna-seq/Intermediate_RNA-seq.pdf
Normal file
Binary file not shown.
BIN
intermediate-r-rna-seq/Intermediate_RNA-seq.pptx
Normal file
BIN
intermediate-r-rna-seq/Intermediate_RNA-seq.pptx
Normal file
Binary file not shown.
13
intermediate-r-rna-seq/targets.txt
Normal file
13
intermediate-r-rna-seq/targets.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
GEO SRA CellType Status
|
||||
MCL1.DG GSM1480297 SRR1552450 B virgin
|
||||
MCL1.DH GSM1480298 SRR1552451 B virgin
|
||||
MCL1.DI GSM1480299 SRR1552452 B pregnant
|
||||
MCL1.DJ GSM1480300 SRR1552453 B pregnant
|
||||
MCL1.DK GSM1480301 SRR1552454 B lactating
|
||||
MCL1.DL GSM1480302 SRR1552455 B lactating
|
||||
MCL1.LA GSM1480291 SRR1552444 L virgin
|
||||
MCL1.LB GSM1480292 SRR1552445 L virgin
|
||||
MCL1.LC GSM1480293 SRR1552446 L pregnant
|
||||
MCL1.LD GSM1480294 SRR1552447 L pregnant
|
||||
MCL1.LE GSM1480295 SRR1552448 L lactating
|
||||
MCL1.LF GSM1480296 SRR1552449 L lactating
|
||||
Loading…
Add table
Add a link
Reference in a new issue