mirror of
https://github.com/gladstone-institutes/Bioinformatics-Workshops.git
synced 2025-11-30 09:45:43 -08:00
138 lines
3.6 KiB
R
138 lines
3.6 KiB
R
#First attempt: load the data file with the defaults of read.table.
dat <- read.table(file = "iris.csv")

#Open the result in the data viewer to see how it looks.
View(dat)

#All data points seem to be there. Can the appearance be improved?
#Consult the documentation of read.table for the available options.
help("read.table")
|
|
|
|
#read.table accepts arguments for the field separator and for whether the
#first line holds the column names.
#stringsAsFactors = TRUE is set explicitly: since R 4.0.0 the default is FALSE,
#which would load the Species column as character instead of factor.
dat <- read.table(
  "iris.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

#Let us examine how data looks now.
View(dat)
|
|
|
|
#What are the observations represented in our data?
#The column names of the table answer that.
colnames(x = dat)

#Number of rows and columns of the table.
dim(dat)

#First few rows of the table (six, the default of head).
head(x = dat, n = 6L)

#Last few rows of the table (six, the default of tail).
tail(x = dat, n = 6L)

#Basic statistics for each column.
summary(object = dat)
|
|
|
|
#Pull a single column out of the table, for example the sepal length.
spl_len <- dat[["Sepal.Length"]]

#Inspect what kind of variable spl_len is.
class(spl_len)

#Which species of Iris are represented? Start from the Species column.
spcs <- dat[["Species"]]

#Inspect the class of spcs: factor if the strings were read as factors,
#character otherwise (the read.table default since R 4.0.0).
class(spcs)

#spcs still holds one entry per row, so each species name is repeated.
#Reduce it to the distinct values.
spcs <- unique(spcs)

#A plain character vector is sufficient from here on.
spcs <- as.character(spcs)

#Test whether a given text is present in a character variable.
"sapiens" %in% spcs
|
|
|
|
|
|
#Goal: the subset of the data that corresponds to Iris setosa.
#Build a row mask first, then use it as the row index.
which_setosa <- dat[["Species"]] == "setosa"
dat_setosa <- dat[which_setosa, ]

#The mask which_setosa is a logical vector.
class(which_setosa)

#Alternative way to obtain the subset, using subset().
dat_setosa <- subset(x = dat, subset = Species == "setosa")
|
|
|
|
#Mean sepal length over all observations.
mean(dat[["Sepal.Length"]])

#Mean sepal length for Iris setosa only.
mean(dat_setosa[["Sepal.Length"]])

#Medians.
median(dat[["Petal.Width"]])
median(dat_setosa[["Sepal.Length"]])

#Standard deviations.
sd(dat[["Petal.Length"]])
sd(dat_setosa[["Sepal.Width"]])
|
|
|
|
#Histograms of sepal length: all observations, then Iris setosa only.
hist(x = dat$Sepal.Length)
hist(x = dat_setosa$Sepal.Length)

#These histograms are not easy to compare.
#Draw both again with the same axis limits.
hist(
  x = dat$Sepal.Length,
  xlim = c(4, 8),
  ylim = c(0, 30)
)
hist(
  x = dat_setosa$Sepal.Length,
  xlim = c(4, 8),
  ylim = c(0, 30)
)
|
|
|
|
#Boxplots of sepal length, both drawn with the same y-axis limits.
boxplot(x = dat$Sepal.Length, ylim = c(3, 9))
boxplot(x = dat_setosa$Sepal.Length, ylim = c(3, 9))

#Scatter plot of petal length against sepal length.
plot(
  x = dat$Sepal.Length,
  y = dat$Petal.Length
)

#A cluster of data points shows up in the lower left corner.
#Do these points belong to one particular species?
#Could the points be colored by species?
|
|
|
|
#install.packages("ggplot2")
|
|
library(ggplot2)
|
|
#Scatter plot of petal length against sepal length, one color per species.
#ggplot() + geom_point() is used instead of qplot(), which is deprecated
#since ggplot2 3.4.0.
ggplot(dat, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point()

#But the journals charge extra for color figures.
#Can we use shapes to distinguish species?
ggplot(dat, aes(x = Sepal.Length, y = Petal.Length, shape = Species)) +
  geom_point()

#Check the boxplots for all species simultaneously.
ggplot(dat, aes(x = Species, y = Sepal.Length)) +
  geom_boxplot()
|
|
|
|
#Additional material: a hypothesis test.
#Question: is the sepal length of Iris setosa significantly different from
#that of the other two species?
dat_setosa <- subset(x = dat, subset = Species == "setosa")
dat_other <- subset(x = dat, subset = Species != "setosa")

#Two-sample t-test on the sepal lengths of the two groups.
test <- t.test(x = dat_setosa$Sepal.Length, y = dat_other$Sepal.Length)

#p-value of the test.
test[["p.value"]]
|
|
|
|
#A conditional statement takes a condition and runs one of two branches
#depending on the validity of that condition.
if (test$p.value >= 0.05) {
  print("Iris setosa may not be identified based on sepal length.")
} else {
  print("Iris setosa can be classified based on sepal length.")
}
|
|
|
|
#A loop repeats the same task on different data: here, once per entry of spcs.
for (item in spcs) {
  #Print the current species name, then the summary of its rows.
  print(item)
  y <- subset(x = dat, subset = Species == item)
  smry <- summary(object = y)
  print(smry)
}
|
|
|
|
|
|
|
|
|
|
|