diff --git a/intro-r-data-analysis/Basic_script.R b/intro-r-data-analysis/Basic_script.R
new file mode 100644
index 0000000..51d52e0
--- /dev/null
+++ b/intro-r-data-analysis/Basic_script.R
@@ -0,0 +1,34 @@
+#This script performs a few basic calculations in R.
+#To run it, select everything and press the Run button at the top right of this pane ...
+#... or press Cmd+A and then Cmd+Enter on a Mac. On Windows, ...
+#... press Ctrl+A and then Ctrl+Enter instead.
+
+#Add 2 and 3 and keep the total in a variable called 'a'.
+a <- sum(2, 3)
+
+#Multiply 2 by 3 and keep the product in 'b'.
+b <- prod(2, 3)
+
+#Do a and b hold the same value?
+a == b
+
+#Do a and b hold different values?
+a != b
+#Conclusion: adding numbers is not the same as multiplying them!
+#Time to write a paper? This discovery surely belongs in Nature or Science.
+
+
+#Test whether two conditions are both true at once.
+(a == b) & (sqrt(3) == 5)
+
+#Test whether at least one of two conditions is true.
+#The vertical bar is how we write 'or' in R.
+(a == b) | (sqrt(3) == 5)
+
+#A variable can hold text as well.
+name <- "Homo sapiens"
+
+#Is "Homo sapiens" the same text as "Human being"?
+name == "Human being"
+
+#Apparently not?
diff --git a/intro-r-data-analysis/Data_structures_in_R.R b/intro-r-data-analysis/Data_structures_in_R.R
new file mode 100644
index 0000000..d590445
--- /dev/null
+++ b/intro-r-data-analysis/Data_structures_in_R.R
@@ -0,0 +1,50 @@
+#Vectors.
+#A numeric vector.
+a <- c(3, 5, 2.5, 0, 9)
+mean(a)
+
+#A vector (like any other data object) can contain NA,
+#which stands for Not Available.
+a <- c(3, 5, 6.8, NA)
+mean(a)
+
+#Skip the NA entries when taking the mean.
+mean(a, na.rm = TRUE)
+
+#A vector is an ordered sequence of values.
+#Next, a character vector.
+b <- c("Homo sapiens", "Martians", "Blue.Whales", "Homo sapiens")
+table(b)
+
+#----------
+#Data frame
+#Tabular data, possibly with a mix of numeric, character, factor or logical type entries.
+#Example: Iris setosa data that we worked with.
+df <- data.frame(a = 1:4,
+                 name = b)
+
+#---------
+#Matrix
+#Matrices are tabular like data frames but store only one type of entry (here: all text).
+df_matrix <- as.matrix(df)
+
+#Types can be converted too. Only column 'a' holds numbers; converting names would give NA.
+df_numeric <- as.numeric(df_matrix[, "a"])
+df_character <- as.character(df_numeric)
+
+#----------
+#List
+#Lists are flexible data structures.
+#Lists have named fields which can contain arbitrary data—vectors, other lists, strings, functions, and anything else.
+#We may wish to keep related data together in one place.
+#Say you search for a sequence in a genome using an R library.
+#Your ideal output might have mixed entries. For example,
+#if the sequence is found on a chromosome, you want a table containing start and end loci on the
+#chromosome and the matched length (End - Start + 1, counting both end points).
+#If not found, you want a string that says "Not found."
+result <- list(chr1 = data.frame(Start = c(5, 100, 200),
+                                 End = c(70, 150, 230),
+                                 Length = c(66, 51, 31)
+                                 ),
+               chr2 = "Not found."
+               )
\ No newline at end of file
diff --git a/intro-r-data-analysis/Intro to R.pdf b/intro-r-data-analysis/Intro to R.pdf
new file mode 100644
index 0000000..59caf2b
Binary files /dev/null and b/intro-r-data-analysis/Intro to R.pdf differ
diff --git a/intro-r-data-analysis/Intro to R_2019-09-06.pptx b/intro-r-data-analysis/Intro to R.pptx
similarity index 99%
rename from intro-r-data-analysis/Intro to R_2019-09-06.pptx
rename to intro-r-data-analysis/Intro to R.pptx
index d8602a3..0fb2307 100644
Binary files a/intro-r-data-analysis/Intro to R_2019-09-06.pptx and b/intro-r-data-analysis/Intro to R.pptx differ
diff --git a/intro-r-data-analysis/Intro to R_2019-09-06.pdf b/intro-r-data-analysis/Intro to R_2019-09-06.pdf
deleted file mode 100644
index c712d97..0000000
Binary files a/intro-r-data-analysis/Intro to R_2019-09-06.pdf and /dev/null differ
diff --git a/intro-r-data-analysis/Intro_to_R_materials.zip b/intro-r-data-analysis/Intro_to_R_materials.zip
new file mode 100644
index 0000000..dbb7a05
Binary files /dev/null and b/intro-r-data-analysis/Intro_to_R_materials.zip differ
diff --git a/intro-r-data-analysis/Iris_analysis.R b/intro-r-data-analysis/Iris_analysis.R
new file mode 100644
index 0000000..efe8056
--- /dev/null
+++ b/intro-r-data-analysis/Iris_analysis.R
@@ -0,0 +1,133 @@
+#Reading data file.
+dat <- read.table("iris.csv")
+
+#Let us examine how our data looks.
+View(dat)
+
+#Seems like all data points are there. Can we improve appearance?
+#Examine the details of read.table command.
+?read.table
+
+#Tell read.table about the header row and separator; keep text columns as factors (not the default since R 4.0).
+dat <- read.table("iris.csv", header = TRUE, sep = ",", stringsAsFactors = TRUE)
+
+#Let us examine how data looks now.
+View(dat)
+
+#What are the observations represented in our data?
+#colnames gives the names of columns of data.
+colnames(dat)
+
+#To check the number of rows and columns in table.
+dim(dat)
+
+#To check first few rows of table.
+head(dat)
+
+#To check last few rows of table.
+tail(dat)
+
+#Basic statistics for every column.
+summary(dat)
+
+#Pull a single column out of the table,
+#for example the sepal length.
+spl_len <- dat$Sepal.Length
+
+#What kind of variable is spl_len?
+class(spl_len)
+
+#Which Iris species appear in the data?
+spcs <- dat$Species
+
+#Check class of spcs: factor if text columns were read as factors (default before R 4.0), else character.
+class(spcs)
+
+#Right now spcs repeats each species once per observation.
+#Keep only the distinct values.
+spcs <- unique(spcs)
+
+#Plain text is all we need from here on.
+#Convert spcs to a character variable.
+spcs <- as.character(spcs)
+
+#Is a given piece of text present in a character variable?
+"sapiens" %in% spcs
+
+
+#Suppose we want only the rows that belong to Iris setosa.
+which_setosa <- dat$Species == "setosa"
+dat_setosa <- dat[which_setosa, ]
+
+#Class of which_setosa? Logical.
+class(which_setosa)
+
+#Another way to take the same subset.
+dat_setosa <- subset(dat, Species == "setosa")
+
+#Mean sepal length over all observations.
+mean(dat$Sepal.Length)
+
+#Mean sepal length for Iris setosa only.
+mean(dat_setosa$Sepal.Length)
+
+#Medians.
+median(dat$Petal.Width)
+median(dat_setosa$Sepal.Length)
+
+#Standard deviations.
+sd(dat$Petal.Length)
+sd(dat_setosa$Sepal.Width)
+
+#Histograms of the data.
+hist(dat$Sepal.Length)
+hist(dat_setosa$Sepal.Length)
+
+#Those two histograms are hard to compare.
+#Giving both the same axis limits helps.
+hist(dat$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30))
+hist(dat_setosa$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30))
+
+#Boxplots, again on a shared axis.
+boxplot(dat$Sepal.Length, ylim = c(3, 9))
+boxplot(dat_setosa$Sepal.Length, ylim = c(3, 9))
+
+#Scatter plot of petal length against sepal length.
+plot(x = dat$Sepal.Length, y = dat$Petal.Length)
+#There is a cluster of points in the lower left corner.
+#Do those points all come from one species?
+#Can we color the points by species?
+
+#install.packages("ggplot2")
+library(ggplot2)
+ggplot(dat, aes(x = Sepal.Length, y = Petal.Length, color = Species)) + geom_point()
+
+#But the journals charge extra for color figures.
+#Can we use shapes to distinguish species?
+ggplot(dat, aes(x = Sepal.Length, y = Petal.Length, shape = Species)) + geom_point()
+
+#Check the boxplots for all species simultaneously.
+ggplot(dat, aes(x = Species, y = Sepal.Length)) + geom_boxplot()
+
+#Additional stuff.
+#Hypothesis test.
+#Is sepal length for Iris setosa significantly different from the other two species?
+dat_setosa <- subset(dat, Species == "setosa")
+dat_other <- subset(dat, Species != "setosa")
+test <- t.test(dat_setosa$Sepal.Length, dat_other$Sepal.Length)
+test$p.value
+
+#Conditional statements take a condition and perform steps depending on validity of the statement.
+if (test$p.value < 0.05) {
+  print("Iris setosa can be classified based on sepal length.")
+} else {
+  print("Iris setosa may not be identified based on sepal length.")
+}
+
+#Looping repeats the same task on different data: here, a summary table per species.
+for (item in spcs) {
+  y <- subset(dat, Species == item)
+  smry <- summary(y)
+  print(item)
+  print(smry)
+}