K-Nearest Neighbors
The code is adapted from Machine Learning with R by Brett Lantz. Unlike the book, the train-test split here is done with the rsample package.
The data is from https://github.com/stedy/Machine-Learning-with-R-datasets/blob/master/wisc_bc_data.csv
# Read the raw Wisconsin breast cancer data and inspect its structure.
wbcd <- read.csv("C:\\Users\\sos523\\Downloads\\wisc_bc_data.csv")
str(wbcd)

# The first column only holds record ids, so drop it before modelling.
wbcd <- wbcd[, -1]
table(wbcd[["diagnosis"]])

# Recode the response from the character codes "B"/"M" into a labelled factor.
wbcd$diagnosis <- factor(wbcd$diagnosis,
                         levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))
str(wbcd[["diagnosis"]])
prop.table(table(wbcd[["diagnosis"]]))

# Summarise three of the predictors (columns 2 to 4).
summary(wbcd[, 2:4])
# The predictors are on very different ranges, so they need rescaling.
min max normalization
# Rescale a numeric vector to the [0, 1] range (min-max normalization).
#
# Args:
#   x:     Numeric vector to rescale.
#   na.rm: If TRUE (default), NAs are ignored when finding the minimum and
#          maximum and stay NA in the result. If FALSE, any NA in `x` makes
#          the whole result NA.
#
# Returns:
#   A numeric vector of the same length as `x`. A vector whose values are all
#   identical maps to 0 instead of NaN. An empty or all-NA input yields an
#   NA vector of the same length.
normalize <- function(x, na.rm = TRUE) {
  # Nothing to scale: avoids the min()/max() warnings on empty input.
  if (!any(!is.na(x))) {
    return(rep(NA_real_, length(x)))
  }

  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]

  # The range itself is undefined (e.g. NA present with na.rm = FALSE).
  if (is.na(span)) {
    return(rep(NA_real_, length(x)))
  }

  # Constant vector: the numerator is already 0 everywhere, so divide by 1
  # instead of 0 to return 0 rather than NaN.
  if (span == 0) {
    span <- 1
  }

  (x - rng[1]) / span
}
# Work on a copy so the raw data stays available, then rescale every column
# except the first one (the diagnosis label) with normalize().
wbcd_n <- wbcd
wbcd_n[-1] <- lapply(wbcd[-1], normalize)

# Spot-check one predictor: after rescaling its minimum should be 0 and its
# maximum 1.
summary(wbcd_n[["texture_mean"]])
train-test split
# initial_split(), training() and testing() come from rsample; attach it here
# the same way the other packages in this script are attached before use.
library(rsample)

# The split is random: fix the seed so the train/test partition (and every
# result derived from it) is reproducible between runs.
set.seed(123)

# 80/20 split, stratified on the response so both parts keep roughly the same
# Benign/Malignant proportions.
wbcd_n_split <- initial_split(wbcd_n, prop = 0.8, strata = "diagnosis")
wbcd_n_train <- training(wbcd_n_split)
wbcd_n_test <- testing(wbcd_n_split)
knn() only accepts numeric predictors, so the class labels are separated from the feature columns.
# Column 1 (diagnosis) is the class label; every remaining column is a
# numeric feature. `[[1]]` always yields a plain vector, whether the split
# returned a data.frame or a tibble.
wbcd_n_train_features <- wbcd_n_train[-1]
wbcd_n_train_labels <- wbcd_n_train[[1]]
wbcd_n_test_features <- wbcd_n_test[-1]
wbcd_n_test_labels <- wbcd_n_test[[1]]

# Rule of thumb: k is roughly the square root of the number of training cases.
# It is derived from the actual training set instead of a hard-coded row
# count, so it stays correct if the split size changes.
sqrt(nrow(wbcd_n_train))
k <- round(sqrt(nrow(wbcd_n_train)))
# Prefer an odd k to avoid tied votes between the two classes.
if (k %% 2 == 0) {
  k <- k + 1
}

# knn() is a lazy learner: there is no separate training step, the test
# cases are classified directly against the training cases.
library(class)
wbcd_test_pred <- knn(
  train = wbcd_n_train_features,
  test = wbcd_n_test_features,
  cl = wbcd_n_train_labels,
  k = k
)
evaluating the model
# Cross-tabulate the true test labels (rows) against the k-NN predictions
# (columns) to see how many cases were classified correctly.
library(gmodels)
# NOTE(review): `chisq = FALSE` only disables the chi-square test, which is
# already off by default. If the intent was to hide the per-cell chi-square
# contributions, that argument is `prop.chisq = FALSE` -- confirm.
CrossTable(
  x = wbcd_n_test_labels,
  y = wbcd_test_pred,
  chisq = FALSE
)