From 8d617621dc4a556b453b40265e8a794adf74da11 Mon Sep 17 00:00:00 2001 From: dataMaster-Kris Date: Mon, 1 Mar 2021 03:03:03 -0800 Subject: [PATCH] Delete Iris_analysis.R --- intro-r-data-analysis/Iris_analysis.R | 138 -------------------------- 1 file changed, 138 deletions(-) delete mode 100644 intro-r-data-analysis/Iris_analysis.R diff --git a/intro-r-data-analysis/Iris_analysis.R b/intro-r-data-analysis/Iris_analysis.R deleted file mode 100644 index 5f3436f..0000000 --- a/intro-r-data-analysis/Iris_analysis.R +++ /dev/null @@ -1,138 +0,0 @@ -#Reading data file. -dat <- read.table("iris.csv") - -#Let us examine how our data looks. -View(dat) - -#Seems like all data points are there. Can we improve appearance? -#Examine the details of read.table command. -?read.table - -#Looks like we can inform read.table about separator type and presence of header. -dat <- read.table("iris.csv", header= TRUE, sep = ",") - -#Let us examine how data looks now. -View(dat) - -#What are the observations represented in our data? -#colnames gives the names of columns of data. -colnames(dat) - -#To check the number of rows and columns in table. -dim(dat) - -#To check first few rows of table. -head(dat) - -#To check last few rows of table. -tail(dat) - -#To check basic stats for each column. -summary(dat) - -#Let us extract a column of data. -#For example, sepal length. -spl_len <- dat$Sepal.Length - -#Check what kind of variable spl_len is. -class(spl_len) - -#Which species of Iris are represented in the data? -spcs <- dat$Species - -#Check class of spcs. It is a factor variable. -class(spcs) - -#Current value of spcs has repetition of each spcs type. -#Get unique values. -spcs <- unique(spcs) - -#Perhaps, no point in keeping spcs as factor now. -#Convert spcs to character variable. -spcs <- as.character(spcs) - -#Checking if a text is present in a character variable? -"sapiens" %in% spcs - - -#Let us say we want subset of data corresponding to Iris setosa. -which_setosa <- dat$Species == "setosa" -dat_setosa <- dat[which_setosa, ] - -#Class of which_setosa? Logical -class(which_setosa) - -#Alternative way to subset data. -dat_setosa <- subset(dat, Species == "setosa") - -#Check mean Sepal length for all observations. -mean(dat$Sepal.Length) - -#Check mean Sepal length for Iris setosa only. -mean(dat_setosa$Sepal.Length) - -#Estimate median. -median(dat$Petal.Width) -median(dat_setosa$Sepal.Length) - -#Estimate standard deviation. -sd(dat$Petal.Length) -sd(dat_setosa$Sepal.Width) - -#Check histograms of data. -hist(dat$Sepal.Length) -hist(dat_setosa$Sepal.Length) - -#These histograms are not easy to compare. -#Perhaps, we can fix the axis limits. -hist(dat$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30)) -hist(dat_setosa$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30)) - -#Let us look at boxplots. -boxplot(dat$Sepal.Length, ylim = c(3, 9)) -boxplot(dat_setosa$Sepal.Length, ylim = c(3, 9)) - -#Scatter plots. -plot(x= dat$Sepal.Length, y = dat$Petal.Length) -#There is a cluster of data points in the lower left corner. -#Are these data points from one particular species? -#Can we color data points based on species? - -#install.packages("ggplot2") -library(ggplot2) -qplot(x = Sepal.Length, y = Petal.Length, data = dat, color = Species) - -#But the journals charge extra for color figures. -#Can we use shapes to distinguish species? -qplot(x = Sepal.Length, y = Petal.Length, data = dat, shape = Species) - -#Check the boxplots for all species simultaneously. -qplot(x = Species, y = Sepal.Length, data = dat, geom = "boxplot") - -#Additional stuff. -#Hypothesis test. -#Is sepal length for Iris setosa significantly different from the other two species? -dat_setosa <- subset(dat, Species == "setosa") -dat_other <- subset(dat, Species != "setosa") -test <- t.test(dat_setosa$Sepal.Length, dat_other$Sepal.Length) -test$p.value - -#Conditional statements take a condition and perform steps depending on validitiy of the statement. -if (test$p.value < 0.05) { - print("Iris setosa can be classified based on sepal length.") -} else { - print("Iris setosa may not be identified based on sepal length.") -} - -#Looping for repeating the same task but on different data. -for (item in spcs) { - y <- subset(dat, Species == item) - smry <- summary(y) - print(item) - print(smry) -} - - - - -