Delete Iris_analysis.R

2025-11-30 09:45:43 -08:00 · 2021-03-01 03:03:03 -08:00 · 2021-03-01 03:03:03 -08:00 · 8d617621dc
commit 8d617621dc
parent 9ae455725d
1 changed files with 0 additions and 138 deletions
--- a/intro-r-data-analysis/Iris_analysis.R
+++ b/intro-r-data-analysis/Iris_analysis.R
@ -1,138 +0,0 @@
-#Reading data file.
-dat <- read.table("iris.csv")
-
-#Let us examine how our data looks.
-View(dat)
-
-#Seems like all data points are there. Can we improve appearance?
-#Examine the details of read.table command.
-?read.table
-
-#Looks like we can inform read.table about separator type and presence of header.
-dat <- read.table("iris.csv", header= TRUE, sep = ",")
-
-#Let us examine how data looks now.
-View(dat)
-
-#What are the observations represented in our data?
-#colnames gives the names of columns of data.
-colnames(dat)
-
-#To check the number of rows and columns in table.
-dim(dat)
-
-#To check first few rows of table.
-head(dat)
-
-#To check last few rows of table.
-tail(dat)
-
-#To check basic stats for each column.
-summary(dat)
-
-#Let us extract a column of data.
-#For example, sepal length.
-spl_len <- dat$Sepal.Length
-
-#Check what kind of variable spl_len is.
-class(spl_len)
-
-#Which species of Iris are represented in the data? 
-spcs <- dat$Species
-
-#Check class of spcs. It is a factor variable.
-class(spcs)
-
-#Current value of spcs has repetition of each spcs type.
-#Get unique values.
-spcs <- unique(spcs)
-
-#Perhaps, no point in keeping spcs as factor now.
-#Convert spcs to character variable.
-spcs <- as.character(spcs)
-
-#Checking if a text is present in a character variable?
-"sapiens" %in% spcs
-
-
-#Let us say we want subset of data corresponding to Iris setosa.
-which_setosa <- dat$Species == "setosa"
-dat_setosa <- dat[which_setosa, ]
-
-#Class of which_setosa? Logical
-class(which_setosa)
-
-#Alternative way to subset data.
-dat_setosa <- subset(dat, Species == "setosa")
-
-#Check mean Sepal length for all observations.
-mean(dat$Sepal.Length)
-
-#Check mean Sepal length for Iris setosa only.
-mean(dat_setosa$Sepal.Length)
-
-#Estimate median.
-median(dat$Petal.Width)
-median(dat_setosa$Sepal.Length)
-
-#Estimate standard deviation.
-sd(dat$Petal.Length)
-sd(dat_setosa$Sepal.Width)
-
-#Check histograms of data.
-hist(dat$Sepal.Length)
-hist(dat_setosa$Sepal.Length)
-
-#These histograms are not easy to compare. 
-#Perhaps, we can fix the axis limits.
-hist(dat$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30))
-hist(dat_setosa$Sepal.Length, xlim = c(4, 8), ylim = c(0, 30))
-
-#Let us look at boxplots.
-boxplot(dat$Sepal.Length, ylim = c(3, 9))
-boxplot(dat_setosa$Sepal.Length, ylim = c(3, 9))
-
-#Scatter plots.
-plot(x= dat$Sepal.Length, y = dat$Petal.Length)
-#There is a cluster of data points in the lower left corner. 
-#Are these data points from one particular species?
-#Can  we color data points based on species?
-
-#install.packages("ggplot2")
-library(ggplot2)
-qplot(x = Sepal.Length, y = Petal.Length, data = dat, color = Species)
-
-#But the journals charge extra for color figures.
-#Can we use shapes to distinguish species?
-qplot(x = Sepal.Length, y = Petal.Length, data = dat, shape = Species)
-
-#Check the boxplots for all species simultaneously.
-qplot(x = Species, y = Sepal.Length, data = dat, geom = "boxplot")
-
-#Additional stuff.
-#Hypothesis test.
-#Is sepal length for Iris setosa significantly different from the other two species?
-dat_setosa <- subset(dat, Species == "setosa")
-dat_other <- subset(dat, Species != "setosa")
-test <- t.test(dat_setosa$Sepal.Length, dat_other$Sepal.Length)
-test$p.value
-
-#Conditional statements take a condition and perform steps depending on validitiy of the statement.
-if (test$p.value < 0.05) {
-  print("Iris setosa can be classified based on sepal length.")
-} else {
-  print("Iris setosa may not be identified based on sepal length.")
-}
-
-#Looping for repeating the same task but on different data.
-for (item in spcs) {
-  y <- subset(dat, Species == item)
-  smry <- summary(y)
-  print(item)
-  print(smry)
-}
-
-
-
-
-