mirror of
https://github.com/gladstone-institutes/Bioinformatics-Workshops.git
synced 2025-11-30 09:45:43 -08:00
Add files via upload
This commit is contained in:
parent
9236d4d81d
commit
634274721e
6 changed files with 42938 additions and 0 deletions
15573
intro-r-data-analysis/DE_result.txt
Normal file
15573
intro-r-data-analysis/DE_result.txt
Normal file
File diff suppressed because it is too large
Load diff
27180
intro-r-data-analysis/GSE60450_Lactation-GenewiseCounts.txt
Normal file
27180
intro-r-data-analysis/GSE60450_Lactation-GenewiseCounts.txt
Normal file
File diff suppressed because it is too large
Load diff
Binary file not shown.
Binary file not shown.
172
intro-r-data-analysis/explore_counts_matrix.R
Normal file
172
intro-r-data-analysis/explore_counts_matrix.R
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
#Reading data file.
|
||||
dat <- read.table("GSE60450_Lactation-GenewiseCounts.txt")
|
||||
|
||||
#Let us examine how our data looks.
|
||||
View(dat)
|
||||
|
||||
#Seems like all data points are there. Can we improve appearance?
|
||||
#Examine the details of read.table command.
|
||||
?read.table
|
||||
|
||||
#Looks like we can inform read.table about separator type and presence of header.
|
||||
dat <- read.table("GSE60450_Lactation-GenewiseCounts.txt",
|
||||
header= TRUE, sep = "\t")
|
||||
|
||||
#Let us examine how data looks now.
|
||||
View(dat)
|
||||
|
||||
#What are the observations represented in our data?
|
||||
#colnames gives the names of columns of data.
|
||||
colnames(dat)
|
||||
|
||||
#To check the number of rows and columns in table.
|
||||
dim(dat)
|
||||
|
||||
#To check first few rows of table.
|
||||
head(dat)
|
||||
|
||||
#To check last few rows of table.
|
||||
tail(dat)
|
||||
|
||||
#To check basic stats for each column.
|
||||
summary(dat)
|
||||
|
||||
#Let us extract a column of data.
|
||||
#For example, EntrezGeneID.
|
||||
geneIds <- dat$EntrezGeneID
|
||||
|
||||
#Check what kind of variable geneIds is.
|
||||
class(geneIds)
|
||||
|
||||
#geneIds should be string type.
|
||||
dat$EntrezGeneID <- as.character(dat$EntrezGeneID)
|
||||
|
||||
#Check the class of gene ids again
|
||||
class(dat$EntrezGeneID)
|
||||
|
||||
#Information about samples is in another file.
|
||||
phenotype_info <- read.table("targets.csv",
|
||||
header = TRUE,
|
||||
sep = ",")
|
||||
|
||||
#The column named GEO in the table represents sample id on GEO.
|
||||
#Status and CellType are factor levels for statistical analysis.
|
||||
phenotype_info$CellType <- as.factor(phenotype_info$CellType)
|
||||
|
||||
#Check class of CellType. It is a factor variable now.
|
||||
class(phenotype_info$CellType)
|
||||
|
||||
#Currently Status and CellType has repetition of the same values.
|
||||
#Get unique values.
|
||||
celltypes <- unique(phenotype_info$CellType)
|
||||
|
||||
#Perhaps, no point in keeping spcs as factor in the above object.
|
||||
#Convert celltypes to character variable.
|
||||
celltypes <- as.character(celltypes)
|
||||
|
||||
#Checking if a text is present in a character variable?
|
||||
"cardiomyocyte" %in% celltypes
|
||||
|
||||
#Let us say we want subset of data corresponding to B cells.
|
||||
which_B <- phenotype_info$CellType == "B"
|
||||
phenotype_info_B <- phenotype_info[which_B, ]
|
||||
|
||||
#Class of which_B? Logical
|
||||
class(which_B)
|
||||
|
||||
#Alternative way to subset data.
|
||||
phenotype_info_B <- subset(phenotype_info,
|
||||
CellType == "B")
|
||||
|
||||
#Check mean counts for a sample.
|
||||
mean(dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1)
|
||||
|
||||
#Check mean counts for all genes in the B cell samples.
|
||||
clnames_dat <- colnames(dat)[-2:-1]
|
||||
clnames_dat <- strsplit(clnames_dat, split = "_")
|
||||
clnames_dat <- data.frame(clnames_dat)
|
||||
clnames_dat <- t(clnames_dat)
|
||||
rownames(clnames_dat) <- NULL
|
||||
clnames_dat <- clnames_dat[, 1]
|
||||
which_B <- which(clnames_dat %in% phenotype_info_B$X)
|
||||
cnts_B <- dat[, which_B + 2]
|
||||
|
||||
#Estimate median counts for casein protein.
|
||||
median(unlist(dat[dat$EntrezGeneID == "12992", -2:-1]))
|
||||
median(unlist(dat[dat$EntrezGeneID == "12992", which_B + 2]))
|
||||
|
||||
#Estimate standard deviation.
|
||||
sd(unlist(dat[dat$EntrezGeneID == "12992", -2:-1]))
|
||||
sd(unlist(dat[dat$EntrezGeneID == "12992", which_B + 2]))
|
||||
|
||||
#Check histograms of data.
|
||||
hist(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1))
|
||||
hist(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1))
|
||||
|
||||
#These histograms are not easy to compare.
|
||||
#Perhaps, we can fix the axis limits.
|
||||
hist(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
|
||||
xlim = c(0, 20), ylim = c(0, 15000))
|
||||
hist(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
|
||||
xlim = c(0, 20), ylim = c(0, 15000))
|
||||
|
||||
#Let us look at boxplots.
|
||||
boxplot(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
|
||||
ylim = c(0, 25))
|
||||
boxplot(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
|
||||
ylim = c(0, 25))
|
||||
|
||||
#Scatter plots.
|
||||
plot(x= log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
|
||||
y= log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1))
|
||||
#Are higher counts associated with longer genes?
|
||||
#Can we color data points based on length?
|
||||
|
||||
#install.packages("ggplot2")
|
||||
library(ggplot2)
|
||||
qplot(x = log2(1 + MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
|
||||
y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
|
||||
data = dat[1:1000, ], color = log10(Length))
|
||||
|
||||
#Color scale does not give clear insight.
|
||||
#Can we use discrete color scale of points instead?
|
||||
dat_subset <- dat[1:1000, ]
|
||||
dat_subset$genetype <- "smallGene"
|
||||
dat_subset$genetype[dat_subset$Length > median(dat_subset$Length)] <- "longGene"
|
||||
qplot(x = log2(1 + MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
|
||||
y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
|
||||
data = dat_subset, color = genetype)
|
||||
|
||||
#Check the boxplots for long and small genes simultaneously.
|
||||
qplot(x = genetype,
|
||||
y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
|
||||
data = dat_subset, geom = "boxplot")
|
||||
|
||||
#Additional stuff.
|
||||
#Hypothesis test.
|
||||
#Are counts for long genes significantly different from the small genes?
|
||||
dat_small <- subset(dat_subset, genetype == "smallGene")
|
||||
dat_long <- subset(dat_subset, genetype == "longGene")
|
||||
test <- t.test(dat_small$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1,
|
||||
dat_long$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1)
|
||||
test$p.value
|
||||
|
||||
#Conditional statements take a condition and perform steps depending on validitiy of the statement.
|
||||
if (test$p.value < 0.05) {
|
||||
print("Counts depend on gene length.")
|
||||
} else {
|
||||
print("Counts don't depend on gene length.")
|
||||
}
|
||||
|
||||
#Looping for repeating the same task but on different data.
|
||||
for (item in c("smallGene", "longGene")) {
|
||||
y <- subset(dat_subset, genetype == item)
|
||||
smry <- summary(y)
|
||||
print(item)
|
||||
print(smry)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
13
intro-r-data-analysis/targets.csv
Normal file
13
intro-r-data-analysis/targets.csv
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
,GEO,SRA,CellType,Status
|
||||
MCL1.DG,GSM1480297,SRR1552450,B,virgin
|
||||
MCL1.DH,GSM1480298,SRR1552451,B,virgin
|
||||
MCL1.DI,GSM1480299,SRR1552452,B,pregnant
|
||||
MCL1.DJ,GSM1480300,SRR1552453,B,pregnant
|
||||
MCL1.DK,GSM1480301,SRR1552454,B,lactating
|
||||
MCL1.DL,GSM1480302,SRR1552455,B,lactating
|
||||
MCL1.LA,GSM1480291,SRR1552444,L,virgin
|
||||
MCL1.LB,GSM1480292,SRR1552445,L,virgin
|
||||
MCL1.LC,GSM1480293,SRR1552446,L,pregnant
|
||||
MCL1.LD,GSM1480294,SRR1552447,L,pregnant
|
||||
MCL1.LE,GSM1480295,SRR1552448,L,lactating
|
||||
MCL1.LF,GSM1480296,SRR1552449,L,lactating
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue