mirror of
https://github.com/gladstone-institutes/Bioinformatics-Workshops.git
synced 2025-11-30 09:45:43 -08:00
Updated slides.
This commit is contained in:
parent
7c0d5a4da9
commit
1feceb9dfb
7 changed files with 217 additions and 0 deletions
34
intro-r-data-analysis/Basic_script.R
Normal file
34
intro-r-data-analysis/Basic_script.R
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
#This script performs some basic calculations in R.
|
||||
#To run this script you may select all and hit the Run button on top right of this pane ...
|
||||
#... or go Cmd+A followed by Cmd+Enter on Mac. If you use Windows, ...
|
||||
#... you can also go Ctrl+A followed by Ctrl+Enter.
|
||||
|
||||
#The following command will add 2 to 3 and store the value in variable named 'a'.
|
||||
a <- sum(2,3)
|
||||
|
||||
#The following command will get the product of 2 and 3 and store it in 'b'.
|
||||
b <- prod(2,3)
|
||||
|
||||
#Next, we check if a and b have equal values.
|
||||
a == b
|
||||
|
||||
#Next, we check if a and b are not equal.
|
||||
a != b
|
||||
#Conclusion: Summing numbers is not the same as multiplying them!
|
||||
#Time to write a paper? I think we can go to Nature or Science with this discovery.
|
||||
|
||||
|
||||
#Check if two conditions are simultaneously true.
|
||||
(a == b) & (sqrt(3) == 5)
|
||||
|
||||
#Check if at least one of the two given conditions is true.
|
||||
#Vertical line is how we say 'or' in R.
|
||||
(a == b) | (sqrt(3) == 5)
|
||||
|
||||
#Variables can also store characters.
|
||||
name <- "Homo sapiens"
|
||||
|
||||
#Is Homo sapiens equal to human being?
|
||||
name == "Human being"
|
||||
|
||||
#Apparently not?
|
||||
50
intro-r-data-analysis/Data_structures_in_R.R
Normal file
50
intro-r-data-analysis/Data_structures_in_R.R
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
#Vectors.
|
||||
#This is a numeric vector.
|
||||
a <- c(3, 5, 2.5, 0, 9)
|
||||
mean(a)
|
||||
|
||||
#Vectors (as well as other data objects) may contain NA.
|
||||
#NA stands for Not Available.
|
||||
a <- c(3, 5, 6.8, NA)
|
||||
mean(a)
|
||||
|
||||
#To ignore NA while taking mean.
|
||||
mean(a, na.rm = TRUE)
|
||||
|
||||
#Vectors are ordered sequences.
|
||||
#This is a character vector.
|
||||
b <- c("Homo sapiens", "Martians", "Blue.Whales", "Homo sapiens")
|
||||
table(b)
|
||||
|
||||
#----------
|
||||
#Data frame
|
||||
#Tabular data possibly with mix of numeric, character, factor or logical type entries.
|
||||
#Example: Iris setosa data that we worked with.
|
||||
df <- data.frame(a = 1:4,
|
||||
name = b)
|
||||
|
||||
#---------
|
||||
#Matrix
|
||||
#Matrices are tabular like data frames but store only one type of entries.
|
||||
df_matrix <- as.matrix(df)
|
||||
|
||||
#May convert between data types. Under as.numeric, non-numeric text (the name column entries) becomes NA with a warning.
|
||||
df_numeric <- as.numeric(df_matrix)
|
||||
df_character <- as.character(df_numeric)
|
||||
|
||||
#----------
|
||||
#List
|
||||
#Lists are flexible data structures.
|
||||
#Lists have named fields which can contain arbitrary data—vectors, other lists, strings, functions, and anything else.
|
||||
#We may wish to keep related data together in one place.
|
||||
#Say you search for a sequence in genome using an R library.
|
||||
#Your ideal output might have mixed entries. For example,
|
||||
#if sequence is found on a chromosome, you want table containing start and end loci on the
|
||||
#chromosome and matched length.
|
||||
#If not found, you want a string that says "Not found."
|
||||
#Example search result: chr1 holds a table of matches, chr2 holds a message.
#Length is the inclusive span of each match, i.e. End - Start + 1.
result <- list(chr1 = data.frame(Start = c(5, 100, 200),
                                 End = c(70, 150, 230),
                                 Length = c(66, 51, 31)
                                 ),
               chr2 = "Not found."
               )
|
||||
BIN
intro-r-data-analysis/Intro to R.pdf
Normal file
BIN
intro-r-data-analysis/Intro to R.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
intro-r-data-analysis/Intro_to_R_materials.zip
Normal file
BIN
intro-r-data-analysis/Intro_to_R_materials.zip
Normal file
Binary file not shown.
133
intro-r-data-analysis/Iris_analysis.R
Normal file
133
intro-r-data-analysis/Iris_analysis.R
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
#Reading data file.
|
||||
dat <- read.table("iris.csv")
|
||||
|
||||
#Let us examine how our data looks.
|
||||
View(dat)
|
||||
|
||||
#Seems like all data points are there. Can we improve appearance?
|
||||
#Examine the details of read.table command.
|
||||
?read.table
|
||||
|
||||
#Looks like we can inform read.table about separator type and presence of header.
|
||||
#stringsAsFactors = TRUE keeps text columns such as Species as factors;
#since R 4.0 the default is FALSE, which would read them as character.
dat <- read.table("iris.csv", header = TRUE, sep = ",",
                  stringsAsFactors = TRUE)
|
||||
|
||||
#Let us examine how data looks now.
|
||||
View(dat)
|
||||
|
||||
#What are the observations represented in our data?
|
||||
#colnames gives the names of columns of data.
|
||||
colnames(dat)
|
||||
|
||||
#To check the number of rows and columns in table.
|
||||
dim(dat)
|
||||
|
||||
#To check first few rows of table.
|
||||
head(dat)
|
||||
|
||||
#To check last few rows of table.
|
||||
tail(dat)
|
||||
|
||||
#To check basic stats for each column.
|
||||
summary(dat)
|
||||
|
||||
#Let us extract a column of data.
|
||||
#For example, sepal length.
|
||||
spl_len <- dat$Sepal.Length
|
||||
|
||||
#Check what kind of variable spl_len is.
|
||||
class(spl_len)
|
||||
|
||||
#Which species of Iris are represented in the data?
|
||||
spcs <- dat$Species
|
||||
|
||||
#Check class of spcs. It is a factor if text columns were read as factors (the default before R 4.0); otherwise it is character.
|
||||
class(spcs)
|
||||
|
||||
#Current value of spcs has repetition of each spcs type.
|
||||
#Get unique values.
|
||||
spcs <- unique(spcs)
|
||||
|
||||
#Perhaps, no point in keeping spcs as factor now.
|
||||
#Convert spcs to character variable.
|
||||
spcs <- as.character(spcs)
|
||||
|
||||
#Checking if a text is present in a character variable?
|
||||
"sapiens" %in% spcs
|
||||
|
||||
|
||||
#Let us say we want subset of data corresponding to Iris setosa.
|
||||
which_setosa <- dat$Species == "setosa"
|
||||
dat_setosa <- dat[which_setosa, ]
|
||||
|
||||
#Class of which_setosa? Logical
|
||||
class(which_setosa)
|
||||
|
||||
#Alternative way to subset data.
|
||||
dat_setosa <- subset(dat, Species == "setosa")
|
||||
|
||||
#Check mean Sepal length for all observations.
|
||||
mean(dat$Sepal.Length)
|
||||
|
||||
#Check mean Sepal length for Iris setosa only.
|
||||
mean(dat_setosa$Sepal.Length)
|
||||
|
||||
#Estimate median.
|
||||
median(dat$Petal.Width)
|
||||
median(dat_setosa$Sepal.Length)
|
||||
|
||||
#Estimate standard deviation.
|
||||
sd(dat$Petal.Length)
|
||||
sd(dat_setosa$Sepal.Width)
|
||||
|
||||
#Check histograms of data.
|
||||
hist(dat$Sepal.Length)
|
||||
hist(dat_setosa$Sepal.Length)
|
||||
|
||||
#These histograms are not easy to compare.
|
||||
#Perhaps, we can fix the axis limits.
|
||||
hist(dat$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30))
|
||||
hist(dat_setosa$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30))
|
||||
|
||||
#Let us look at boxplots.
|
||||
boxplot(dat$Sepal.Length, ylim = c(3, 9))
|
||||
boxplot(dat_setosa$Sepal.Length, ylim = c(3, 9))
|
||||
|
||||
#Scatter plots.
|
||||
plot(x= dat$Sepal.Length, y = dat$Petal.Length)
|
||||
#There is a cluster of data points in the lower left corner.
|
||||
#Are these data points from one particular species?
|
||||
#Can we color data points based on species?
|
||||
|
||||
#install.packages("ggplot2")
|
||||
library(ggplot2)
|
||||
#Scatter plot colored by species.
#qplot() is deprecated since ggplot2 3.4.0, so build the plot with ggplot().
ggplot(dat, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point()
|
||||
|
||||
#But the journals charge extra for color figures.
|
||||
#Can we use shapes to distinguish species?
|
||||
#Scatter plot with a different point shape per species.
#qplot() is deprecated since ggplot2 3.4.0, so build the plot with ggplot().
ggplot(dat, aes(x = Sepal.Length, y = Petal.Length, shape = Species)) +
  geom_point()
|
||||
|
||||
#Check the boxplots for all species simultaneously.
|
||||
#One boxplot of sepal length per species.
#qplot() is deprecated since ggplot2 3.4.0, so build the plot with ggplot().
ggplot(dat, aes(x = Species, y = Sepal.Length)) +
  geom_boxplot()
|
||||
|
||||
#Additional stuff.
|
||||
#Hypothesis test.
|
||||
#Is sepal length for Iris setosa significantly different from the other two species?
|
||||
dat_setosa <- subset(dat, Species == "setosa")
|
||||
dat_other <- subset(dat, Species != "setosa")
|
||||
test <- t.test(dat_setosa$Sepal.Length, dat_other$Sepal.Length)
|
||||
test$p.value
|
||||
|
||||
#Conditional statements take a condition and perform steps depending on the validity of the statement.
|
||||
#Print one of two messages depending on whether the p-value is below 0.05.
print(
  if (test$p.value < 0.05) {
    "Iris setosa can be classified based on sepal length."
  } else {
    "Iris setosa may not be identified based on sepal length."
  }
)
|
||||
|
||||
#Looping for repeating the same task but on different data.
|
||||
#For each species name in spcs, print the name and a summary of its rows.
for (item in spcs) {
  #which() drops NA comparisons, so this selects the same rows as subset().
  y <- dat[which(dat$Species == item), ]
  smry <- summary(y)
  print(item)
  print(smry)
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue