Gladstone-Bioinformatics-Wo.../intro-r-data-analysis/explore_counts_matrix.R

#Reading data file.
dat <- read.table("GSE60450_Lactation-GenewiseCounts.txt")

#Let us examine how our data looks.
View(dat)

#Seems like all data points are there. Can we improve appearance?
#Examine the details of read.table command.
?read.table

#Looks like we can inform read.table about separator type and presence of header.
dat <- read.table("GSE60450_Lactation-GenewiseCounts.txt",
                  header= TRUE, sep = "\t")

#Let us examine how data looks now.
View(dat)

#What are the observations represented in our data?
#colnames gives the names of columns of data.
colnames(dat)

#To check the number of rows and columns in table.
dim(dat)

#To check first few rows of table.
head(dat)

#To check last few rows of table.
tail(dat)

#To check basic stats for each column.
summary(dat)

#Let us extract a column of data.
#For example, EntrezGeneID.
geneIds <- dat$EntrezGeneID

#Check what kind of variable geneIds is.
class(geneIds)

#geneIds should be string type.
dat$EntrezGeneID <- as.character(dat$EntrezGeneID)

#Check the class of gene ids again
class(dat$EntrezGeneID)

#Information about samples is in another file.
phenotype_info <- read.table("targets.csv",
                             header = TRUE,
                             sep = ",")

#The column named GEO in the table represents sample id on GEO.
#Status and CellType are factor levels for statistical analysis.
phenotype_info$CellType <- as.factor(phenotype_info$CellType)

#Check class of CellType. It is a factor variable now.
class(phenotype_info$CellType)

#Currently Status and CellType has repetition of the same values.
#Get unique values.
celltypes <- unique(phenotype_info$CellType)

#Perhaps, no point in keeping spcs as factor in the above object.
#Convert celltypes to character variable.
celltypes <- as.character(celltypes)

#Checking if a text is present in a character variable?
"cardiomyocyte" %in% celltypes

#Let us say we want subset of data corresponding to B cells.
which_B <- phenotype_info$CellType == "B"
phenotype_info_B <- phenotype_info[which_B, ]

#Class of which_B? Logical
class(which_B)

#Alternative way to subset data.
phenotype_info_B <- subset(phenotype_info,
                           CellType == "B")

#Check mean counts for a sample.
mean(dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1)

#Check mean counts for all genes in the B cell samples.
clnames_dat <- colnames(dat)[-2:-1]
clnames_dat <- strsplit(clnames_dat, split = "_")
clnames_dat <- data.frame(clnames_dat)
clnames_dat <- t(clnames_dat)
rownames(clnames_dat) <- NULL
clnames_dat <- clnames_dat[, 1]
which_B <- which(clnames_dat %in% phenotype_info_B$X)
cnts_B <- dat[, which_B + 2]

#Estimate median counts for casein protein.
median(unlist(dat[dat$EntrezGeneID == "12992", -2:-1]))
median(unlist(dat[dat$EntrezGeneID == "12992", which_B + 2]))

#Estimate standard deviation.
sd(unlist(dat[dat$EntrezGeneID == "12992", -2:-1]))
sd(unlist(dat[dat$EntrezGeneID == "12992", which_B + 2]))

#Check histograms of data.
hist(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1))
hist(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1))

#These histograms are not easy to compare.
#Perhaps, we can fix the axis limits.
hist(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
     xlim = c(0, 20), ylim = c(0, 15000))
hist(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
     xlim = c(0, 20), ylim = c(0, 15000))

#Let us look at boxplots.
boxplot(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
        ylim = c(0, 25))
boxplot(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
        ylim = c(0, 25))

#Scatter plots.
plot(x= log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
     y= log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1))
#Are higher counts associated with longer genes?
#Can  we color data points based on length?

#install.packages("ggplot2")
library(ggplot2)
qplot(x = log2(1 + MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
      y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
      data = dat[1:1000, ], color = log10(Length))

#Color scale does not give clear insight.
#Can we use discrete color scale of points instead?
dat_subset <- dat[1:1000, ]
dat_subset$genetype <- "smallGene"
dat_subset$genetype[dat_subset$Length > median(dat_subset$Length)] <- "longGene"
qplot(x = log2(1 + MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
      y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
      data = dat_subset, color = genetype)

#Check the boxplots for long and small genes simultaneously.
qplot(x = genetype,
      y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
      data = dat_subset, geom = "boxplot")

#Additional stuff.
#Hypothesis test.
#Are counts for long genes significantly different from the small genes?
dat_small <- subset(dat_subset, genetype == "smallGene")
dat_long <- subset(dat_subset, genetype == "longGene")
test <- t.test(dat_small$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1,
               dat_long$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1)
test$p.value

#Conditional statements take a condition and perform steps depending on validitiy of the statement.
if (test$p.value < 0.05) {
  print("Counts depend on gene length.")
} else {
  print("Counts don't depend on gene length.")
}

#Looping for repeating the same task but on different data.
for (item in c("smallGene", "longGene")) {
  y <- subset(dat_subset, genetype == item)
  smry <- summary(y)
  print(item)
  print(smry)
}