# ---- KNN on iris: setup, exploration, shuffle, and train/test split ----

# Personal workshop folder. The directory change is guarded so the script
# still runs on machines where this path does not exist; nothing below reads
# or writes files relative to it.
workshop_dir <- "/Users/mingyoungshin/Dropbox (Gladstone)/ML_workshop/2023_spring_machine_learning"
if (dir.exists(workshop_dir)) {
  setwd(workshop_dir)
}

library(class)
library(caret)
library(ggplot2)

# Fix the RNG state so the shuffle below is reproducible.
set.seed(123)

# load iris data
data(iris)
# View() needs a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(iris)
}
# Keep the first rows of the unshuffled data for comparison after shuffling.
before_suffling <- head(iris)
table(iris$Species)

# Scatter plots of the sepal and petal measurements, coloured by species.
species_colors <- c("#00AFBB", "#E7B800", "#FC4E07")
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(color = Species, shape = Species)) +
  scale_color_manual(values = species_colors)
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(aes(color = Species, shape = Species)) +
  scale_color_manual(values = species_colors)

# shuffle iris data (sample(n) returns a random permutation of 1..n)
iris <- iris[sample(nrow(iris)), ]
head(iris)
before_suffling

# assign training data and test data
# The first 70% of the shuffled rows train the model; the rest are held out.
nrow(iris)
train_index <- seq_len(round(nrow(iris) * 0.7))
train_data <- iris[train_index, 1:4]
nrow(train_data)
train_label <- as.character(iris[train_index, 5])
test_data <- iris[-train_index, 1:4]
nrow(test_data)
test_label <- as.character(iris[-train_index, 5])
# find the best k using cross validation
# Install caret only when it is missing, and do so before attaching it.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(class)
library(caret)

# 10-fold cross-validation over caret's default grid of k values.
# The model is fitted once; repeating the call would only advance the RNG.
caret_fit <- train(
  train_data,
  train_label,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10)
)
caret_fit
caret_fit$bestTune

# plot accuracy and k
plot(caret_fit)

# predict labels
prediction <- predict(caret_fit, newdata = test_data)
prediction

# check the prediction result
summary(prediction)

# tabularize prediction vs observed value
# The observed labels are turned into a factor with the same levels as the
# predictions so the table is always square with rows and columns in the same
# class order, even if a class is missing from the test set.
tb <- table(
  prediction,
  test_label = factor(test_label, levels = levels(prediction))
)
tb
# calculate accuracy
# Percentage of correctly classified observations in a two-way confusion
# table `x` (predicted vs observed counts).
#
# When both dimensions carry labels, correct predictions are the cells whose
# row label equals their column label, so the result stays right even if the
# table is not square or its rows and columns are ordered differently.
# Without labels the main diagonal is used.
#
# Returns a single number between 0 and 100 (NaN for a table with no counts).
accuracy <- function(x) {
  counts <- unclass(x)
  dim_labels <- dimnames(counts)
  has_labels <- length(dim_labels) == 2L &&
    !is.null(dim_labels[[1L]]) &&
    !is.null(dim_labels[[2L]])
  if (has_labels) {
    shared <- intersect(dim_labels[[1L]], dim_labels[[2L]])
    # One (row, column) index pair per class present in both dimensions.
    hits <- cbind(
      match(shared, dim_labels[[1L]]),
      match(shared, dim_labels[[2L]])
    )
    correct <- sum(counts[hits])
  } else {
    correct <- sum(diag(counts))
  }
  correct / sum(counts) * 100
}
# Accuracy (in percent) of the KNN predictions tabulated in `tb`.
accuracy(tb)
?predict

# ---- k-means clustering on iris ----
# Install factoextra only when it is missing, and do so before attaching it.
if (!requireNamespace("factoextra", quietly = TRUE)) {
  install.packages("factoextra")
}
library(cluster)
library(ggplot2)
library(factoextra)

# load iris data (reloading replaces the shuffled copy made earlier)
data(iris)
head(iris)

# retrieve variables: the four numeric measurement columns, without Species
data <- as.data.frame(iris[, 1:4])

# choose k from the within-cluster sum of squares ("wss") plot
fviz.p <- fviz_nbclust(x = data, FUNcluster = kmeans, method = "wss")
fviz.p # save ggplot output for methods below
?fviz_nbclust

# train kmeans with 3 clusters, keeping the best of 50 random starts
fit <- kmeans(
  data,
  centers = 3,
  nstart = 50,
  algorithm = "Lloyd",
  iter.max = 100
)
fit$cluster
?kmeans

# visualize clusters
fviz_cluster(
  fit,
  data = data,
  palette = c("#00AFBB", "#2E9FDF", "#E7B800", "#FC4E07"),
  ggtheme = theme_minimal()
)
# ---- Decision tree on the titanic data ----
# Install the plotting/conversion packages only when they are missing, and do
# so before attaching them.
for (pkg in c("partykit", "rpart.plot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rpart)
library(partykit)
library(rpart.plot)

# load titanic data
data(ptitanic)
# View() needs a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(ptitanic)
}

# shuffle titanic data
titanic <- ptitanic[sample(nrow(ptitanic)), ]

# split training data and test data
# The split must index the shuffled copy; indexing `ptitanic` directly would
# ignore the shuffle and split the rows in their stored order.
train_index <- seq_len(round(nrow(titanic) * 0.7))
train_data <- titanic[train_index, ]
test_data <- titanic[-train_index, ]

# train a decision tree predicting `survived` from all other columns
survived <- rpart(survived ~ ., data = train_data)
?rpart

# visualize the decision tree
new_tree <- as.party(survived)
plot(new_tree)

# check the splitting rules
rpart.rules(survived, cover = TRUE)

# print Complexity parameter of the current tree
printcp(survived) # result of rpart
table(train_data$survived)

# prune the decision tree based on the best cp
# The best cp is the row of the cp table with the lowest "xerror" value.
best_row <- which.min(survived$cptable[, "xerror"])
survived_pruned <- prune(survived, cp = survived$cptable[best_row, "CP"])

# visualize the decision tree
new_tree <- as.party(survived_pruned)
plot(new_tree)

# prediction
prediction_pruned <- predict(survived_pruned, test_data, type = "class")

# tabularize prediction vs observed labels
table_mat_pruned <- table(test_data$survived, prediction_pruned)
table_mat_pruned
# calculate accuracy
# Percentage of correctly classified observations in a two-way confusion
# table `x` (predicted vs observed counts).
#
# When both dimensions carry labels, correct predictions are the cells whose
# row label equals their column label, so the result stays right even if the
# table is not square or its rows and columns are ordered differently.
# Without labels the main diagonal is used.
#
# Returns a single number between 0 and 100 (NaN for a table with no counts).
accuracy <- function(x) {
  counts <- unclass(x)
  dim_labels <- dimnames(counts)
  has_labels <- length(dim_labels) == 2L &&
    !is.null(dim_labels[[1L]]) &&
    !is.null(dim_labels[[2L]])
  if (has_labels) {
    shared <- intersect(dim_labels[[1L]], dim_labels[[2L]])
    # One (row, column) index pair per class present in both dimensions.
    hits <- cbind(
      match(shared, dim_labels[[1L]]),
      match(shared, dim_labels[[2L]])
    )
    correct <- sum(counts[hits])
  } else {
    correct <- sum(diag(counts))
  }
  correct / sum(counts) * 100
}
# Accuracy (in percent) of the pruned tree's predictions.
accuracy(table_mat_pruned)
?rpart
getwd()

# ---- KNN on iris (clean rerun) ----
library(class)
library(caret)
library(ggplot2)

# Fix the RNG state so the shuffle and the CV folds are reproducible.
set.seed(123)

# load iris data
data(iris)
# Keep the first rows of the unshuffled data for comparison after shuffling.
before_suffling <- head(iris)
# View() needs a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(iris)
}
table(iris$Species)

# Scatter plots of the sepal and petal measurements, coloured by species.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(color = Species, shape = Species)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800", "#FC4E07"))
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(aes(color = Species, shape = Species)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800", "#FC4E07"))

# shuffle iris data (sample(n) returns a random permutation of 1..n)
iris <- iris[sample(nrow(iris)), ]
head(iris)
before_suffling

# The first 70% of the shuffled rows train the model; the rest are held out.
train_index <- seq_len(round(nrow(iris) * 0.7))
train_data <- iris[train_index, 1:4]
nrow(train_data)
train_label <- as.character(iris[train_index, 5])
test_data <- iris[-train_index, 1:4]
nrow(test_data)
test_label <- as.character(iris[-train_index, 5])

# find the best k using cross validation (10 folds)
caret_fit <- train(
  train_data,
  train_label,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10)
)

# plot accuracy and k
plot(caret_fit)
caret_fit$bestTune

# predict labels
prediction <- predict(caret_fit, newdata = test_data)
prediction

# check the prediction result
summary(prediction)

# tabularize prediction vs observed value
# The observed labels are turned into a factor with the same levels as the
# predictions so the table is always square with rows and columns in the same
# class order, even if a class is missing from the test set.
tb <- table(
  prediction,
  test_label = factor(test_label, levels = levels(prediction))
)
tb
# calculate accuracy
# Percentage of correctly classified observations in a two-way confusion
# table `x` (predicted vs observed counts).
#
# When both dimensions carry labels, correct predictions are the cells whose
# row label equals their column label, so the result stays right even if the
# table is not square or its rows and columns are ordered differently.
# Without labels the main diagonal is used.
#
# Returns a single number between 0 and 100 (NaN for a table with no counts).
accuracy <- function(x) {
  counts <- unclass(x)
  dim_labels <- dimnames(counts)
  has_labels <- length(dim_labels) == 2L &&
    !is.null(dim_labels[[1L]]) &&
    !is.null(dim_labels[[2L]])
  if (has_labels) {
    shared <- intersect(dim_labels[[1L]], dim_labels[[2L]])
    # One (row, column) index pair per class present in both dimensions.
    hits <- cbind(
      match(shared, dim_labels[[1L]]),
      match(shared, dim_labels[[2L]])
    )
    correct <- sum(counts[hits])
  } else {
    correct <- sum(diag(counts))
  }
  correct / sum(counts) * 100
}
# Accuracy (in percent) of the KNN predictions tabulated in `tb`.
accuracy(tb)

# ---- k-means clustering on iris (clean rerun) ----
library(cluster)
library(ggplot2)
library(factoextra)

# Reload iris, replacing the shuffled copy made for the KNN section.
data(iris)
head(iris)

# Keep only the four measurement columns; Species is left out.
data <- as.data.frame(iris[, 1:4])

# Within-cluster sum of squares plot used to choose the number of clusters.
fviz.p <- fviz_nbclust(
  x = data,
  FUNcluster = kmeans,
  method = "wss"
)
fviz.p # save ggplot output for methods below

# Fit k-means with three centers, keeping the best of 50 random starts.
fit <- kmeans(
  data,
  centers = 3,
  nstart = 50,
  algorithm = "Lloyd",
  iter.max = 100
)
fit$cluster

# Plot the observations coloured by their assigned cluster.
fviz_cluster(
  fit,
  data = data,
  palette = c("#00AFBB", "#2E9FDF", "#E7B800", "#FC4E07"),
  ggtheme = theme_minimal()
)
# ---- Decision tree on the titanic data (clean rerun) ----
library(rpart)
library(partykit)
library(rpart.plot)

# load titanic data
data(ptitanic)
# View() needs a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(ptitanic)
}

# shuffle titanic data
titanic <- ptitanic[sample(nrow(ptitanic)), ]

# split training data and test data
# The split must index the shuffled copy; indexing `ptitanic` directly would
# ignore the shuffle and split the rows in their stored order.
train_index <- seq_len(round(nrow(titanic) * 0.7))
train_data <- titanic[train_index, ]
test_data <- titanic[-train_index, ]

# train a decision tree predicting `survived` from all other columns
survived <- rpart(survived ~ ., data = train_data)

# visualize the decision tree
new_tree <- as.party(survived)
plot(new_tree)

# check the splitting rules
rpart.rules(survived, cover = TRUE)

# print Complexity parameter of the current tree
printcp(survived) # result of rpart

# prune the decision tree based on the best cp
# The best cp is the row of the cp table with the lowest "xerror" value.
best_row <- which.min(survived$cptable[, "xerror"])
survived_pruned <- prune(survived, cp = survived$cptable[best_row, "CP"])

# visualize the decision tree
new_tree <- as.party(survived_pruned)
plot(new_tree)

# prediction
prediction_pruned <- predict(survived_pruned, test_data, type = "class")

# tabularize prediction vs observed labels
table_mat_pruned <- table(test_data$survived, prediction_pruned)
table_mat_pruned
# calculate accuracy
# Percentage of correctly classified observations in a two-way confusion
# table `x` (predicted vs observed counts).
#
# When both dimensions carry labels, correct predictions are the cells whose
# row label equals their column label, so the result stays right even if the
# table is not square or its rows and columns are ordered differently.
# Without labels the main diagonal is used.
#
# Returns a single number between 0 and 100 (NaN for a table with no counts).
accuracy <- function(x) {
  counts <- unclass(x)
  dim_labels <- dimnames(counts)
  has_labels <- length(dim_labels) == 2L &&
    !is.null(dim_labels[[1L]]) &&
    !is.null(dim_labels[[2L]])
  if (has_labels) {
    shared <- intersect(dim_labels[[1L]], dim_labels[[2L]])
    # One (row, column) index pair per class present in both dimensions.
    hits <- cbind(
      match(shared, dim_labels[[1L]]),
      match(shared, dim_labels[[2L]])
    )
    correct <- sum(counts[hits])
  } else {
    correct <- sum(diag(counts))
  }
  correct / sum(counts) * 100
}
# Report the pruned tree's accuracy (in percent) on the held-out test rows,
# using the observed-vs-predicted table built above.
accuracy(table_mat_pruned)
