Updated slides and data.

2025-11-30 09:45:43 -08:00 · 2020-02-28 12:47:28 -08:00 · 2020-02-28 12:47:28 -08:00 · 215aae5400
commit 215aae5400
parent 7c0d5a4da9
7 changed files with 217 additions and 0 deletions
--- a/intro-r-data-analysis/Basic_script.R
+++ b/intro-r-data-analysis/Basic_script.R
@ -0,0 +1,34 @@
+#This script performs some basic calculations in R.
+#To run this script, select all and hit the Run button at the top right of this pane ...
+#... or press Cmd+A followed by Cmd+Enter on a Mac. If you use Windows, ...
+#... you can press Ctrl+A followed by Ctrl+Enter instead.
+
+#The following command adds 2 and 3 and stores the value in a variable named 'a'.
+a <- sum(2,3)
+
+#The following command gets the product of 2 and 3 and stores it in 'b'.
+b <- prod(2,3)
+
+#Next, we check if a and b have equal values.
+a == b
+
+#Next, we check if a and b are not equal.
+a != b
+#Conclusion: Summing numbers is not the same as multiplying them!
+#Time to write a paper? I think we can go to Nature or Science with this discovery.
+
+
+#Check if two conditions are simultaneously true. Ampersand is how we say 'and' in R.
+(a == b) & (sqrt(3) == 5)
+
+#Check if at least one of the two given conditions is true.
+#The vertical bar is how we say 'or' in R.
+(a == b) | (sqrt(3) == 5)
+
+#Variables can also store character strings.
+name <- "Homo sapiens"
+
+#Is the text "Homo sapiens" equal to the text "Human being"?
+name == "Human being"
+
+#Apparently not: == compares the text of the two strings, not their meaning.
--- a/intro-r-data-analysis/Data_structures_in_R.R
+++ b/intro-r-data-analysis/Data_structures_in_R.R
@ -0,0 +1,50 @@
+#Vectors.
+#This is a numeric vector.
+a <- c(3, 5, 2.5, 0, 9)
+mean(a)
+
+#Vectors (as well as other data objects) may contain NA.
+#NA stands for Not Available; the mean below comes out as NA because of the missing entry.
+a <- c(3, 5, 6.8, NA)
+mean(a)
+
+#To ignore NA values while taking the mean.
+mean(a, na.rm = TRUE)
+
+#Vectors are ordered sequences.
+#This is a character vector; table() counts how often each distinct value occurs.
+b <- c("Homo sapiens", "Martians", "Blue.Whales", "Homo sapiens")
+table(b)
+
+#----------
+#Data frame
+#Tabular data, possibly with a mix of numeric, character, factor or logical columns.
+#Example: the Iris data that we worked with.
+df <- data.frame(a = 1:4,
+                 name = b)
+
+#---------
+#Matrix
+#Matrices are tabular like data frames but store only one type of entry.
+df_matrix <- as.matrix(df) #The mixed columns are all stored as character here.
+
+#We may convert between data structures.
+df_numeric <- as.numeric(df_matrix) #Names cannot be read as numbers, so they become NA (with a warning).
+df_character <- as.character(df_numeric)
+
+#----------
+#List
+#Lists are flexible data structures.
+#Lists have named fields which can contain arbitrary data—vectors, other lists, strings, functions, and anything else.
+#We may wish to keep related data together in one place.
+#Say you search for a sequence in a genome using an R library.
+#Your ideal output might have mixed entries. For example,
+#if the sequence is found on a chromosome, you want a table containing the start and end loci on the
+#chromosome and the matched length (End - Start + 1, counting both end positions).
+#If not found, you want a string that says "Not found."
+result <- list(chr1 = data.frame(Start = c(5, 100, 200),
+                                 End = c(70, 150, 230),
+                                 Length = c(66, 51, 31) #Consistent with End - Start + 1 for every row.
+                                 ),
+               chr2 = "Not found."
+               )
--- a/intro-r-data-analysis/Intro
+++ b/intro-r-data-analysis/Intro
--- a/intro-r-data-analysis/Intro
+++ b/intro-r-data-analysis/Intro
--- a/intro-r-data-analysis/Intro
+++ b/intro-r-data-analysis/Intro
--- a/intro-r-data-analysis/Intro_to_R_materials.zip
+++ b/intro-r-data-analysis/Intro_to_R_materials.zip
--- a/intro-r-data-analysis/Iris_analysis.R
+++ b/intro-r-data-analysis/Iris_analysis.R
@ -0,0 +1,133 @@
+#Reading the data file with default settings.
+dat <- read.table("iris.csv")
+
+#Let us examine how our data looks.
+View(dat)
+
+#Seems like all data points are there. Can we improve the appearance?
+#Examine the details of the read.table command.
+?read.table
+
+#Looks like we can tell read.table about the separator type and the presence of a header.
+dat <- read.table("iris.csv", header = TRUE, sep = ",", stringsAsFactors = TRUE) #Text columns become factors on every R version (R >= 4.0 no longer does this by default).
+
+#Let us examine how the data looks now.
+View(dat)
+
+#What are the observations represented in our data?
+#colnames gives the names of the columns of the data.
+colnames(dat)
+
+#To check the number of rows and columns in the table.
+dim(dat)
+
+#To check the first few rows of the table.
+head(dat)
+
+#To check the last few rows of the table.
+tail(dat)
+
+#To check basic stats for each column.
+summary(dat)
+
+#Let us extract a column of data.
+#For example, sepal length.
+spl_len <- dat$Sepal.Length
+
+#Check what kind of variable spl_len is.
+class(spl_len)
+
+#Which species of Iris are represented in the data?
+spcs <- dat$Species
+
+#Check class of spcs. It is a factor if text was read as factors (the default before R 4.0), character otherwise.
+class(spcs)
+
+#The current value of spcs repeats each species once per row.
+#Get unique values.
+spcs <- unique(spcs)
+
+#Perhaps, no point in keeping spcs as a factor now.
+#Convert spcs to a character variable.
+spcs <- as.character(spcs)
+
+#Check if a given text is one of the entries of a character variable.
+"sapiens" %in% spcs
+
+
+#Let us say we want the subset of the data corresponding to Iris setosa.
+which_setosa <- dat$Species == "setosa"
+dat_setosa <- dat[which_setosa, ]
+
+#Class of which_setosa? Logical: one TRUE/FALSE per row of dat.
+class(which_setosa)
+
+#Alternative way to subset the data.
+dat_setosa <- subset(dat, Species == "setosa")
+
+#Check mean sepal length for all observations.
+mean(dat$Sepal.Length)
+
+#Check mean sepal length for Iris setosa only.
+mean(dat_setosa$Sepal.Length)
+
+#Estimate the median.
+median(dat$Petal.Width)
+median(dat_setosa$Sepal.Length)
+
+#Estimate the standard deviation.
+sd(dat$Petal.Length)
+sd(dat_setosa$Sepal.Width)
+
+#Check histograms of the data.
+hist(dat$Sepal.Length)
+hist(dat_setosa$Sepal.Length)
+
+#These histograms are not easy to compare because their axes differ.
+#Perhaps we can fix the axis limits so both use the same scale.
+hist(dat$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30))
+hist(dat_setosa$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30))
+
+#Let us look at boxplots, again with a shared axis range.
+boxplot(dat$Sepal.Length, ylim = c(3, 9))
+boxplot(dat_setosa$Sepal.Length, ylim = c(3, 9))
+
+#Scatter plots.
+plot(x= dat$Sepal.Length, y = dat$Petal.Length)
+#There is a cluster of data points in the lower left corner.
+#Are these data points from one particular species?
+#Can we color data points based on species?
+
+#install.packages("ggplot2")
+library(ggplot2) #qplot() is deprecated since ggplot2 3.4.0, so the plots below use ggplot() + a geom.
+ggplot(dat, aes(x = Sepal.Length, y = Petal.Length, color = Species)) + geom_point()
+
+#But the journals charge extra for color figures.
+#Can we use shapes to distinguish species?
+ggplot(dat, aes(x = Sepal.Length, y = Petal.Length, shape = Species)) + geom_point()
+
+#Check the boxplots for all species simultaneously.
+ggplot(dat, aes(x = Species, y = Sepal.Length)) + geom_boxplot()
+
+#Additional stuff.
+#Hypothesis test (t.test with two samples runs Welch's t-test by default).
+#Is sepal length for Iris setosa significantly different from the other two species?
+dat_setosa <- subset(dat, Species == "setosa")
+dat_other <- subset(dat, Species != "setosa")
+test <- t.test(dat_setosa$Sepal.Length, dat_other$Sepal.Length)
+test$p.value
+
+#Conditional statements take a condition and perform steps depending on the validity of the condition.
+if (test$p.value < 0.05) {
+  print("Iris setosa can be classified based on sepal length.")
+} else {
+  print("Iris setosa may not be identified based on sepal length.")
+}
+
+#Looping repeats the same task on different data: here, a summary for each species name in spcs.
+for (item in spcs) {
+  y <- subset(dat, Species == item)
+  smry <- summary(y)
+  print(item)
+  print(smry)
+}