教材就是传说中的机器学习和R语言--中文版,大家可以去图书馆借来看看~~~,例子都是来自书上的
首先介绍一下KNN(k近邻)算法。KNN还好吧,说白了就是先用一个距离公式(例如欧氏距离)算出待分类的点到各个已知类别的点的距离,然后以统计(投票)的方式得出结果。以二维平面为例:平面内已知n个区域(类别),第i个区域里面有m(i)个点,现在要判断一个不在这些区域内的新点与哪一个区域最近。为了“恰当”,既要避免大量较远的点的影响盖过较近点的影响,也要省去没有意义的重复计算,所以只取距离最近的k个点参与投票,新点归入这k个点中占多数的那个类别,这就是这个方法的原理了,简单粗暴~~还有一个问题是在数据很大的时候怎么选取k值,书上说的经验做法是取训练样本数的平方根,暂且略。
数据集请搜索wisc_bc_data.csv,github上面有这本书的全部数据集,以后有空的话我考虑一下放在百度云盘上面。
(1)KNN算法诊断乳腺癌:
读取数据集(stringsAsFactors = FALSE 表示不把字符串列自动转化为因子;因子是R里用来表示分类变量的数据类型,我自己也还不太熟-.-求指教)
# Load the breast-cancer data set. Strings are kept as plain character
# vectors so that the diagnosis column can be converted to a factor
# explicitly below.
wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)

# Drop the first column (presumably a record identifier -- confirm against
# the CSV). The diagnosis and the predictor columns remain.
wbcd <- wbcd[-1]

# Recode the diagnosis as a factor and give the two levels readable labels.
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)

# Share of each diagnosis, in percent, rounded to one decimal place.
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
# Output recorded in the original console session:
#    Benign Malignant
#      62.7      37.3
书中说的min-max标准化,简单易懂,目的就是让各列大大小小不同的数据都变成同一个区间的保持比例的数据
# Min-max normalisation: rescale a numeric vector to the interval [0, 1]
# while preserving the relative spacing of its values.
#
# x: a numeric vector.
# Returns a numeric vector of the same length as x. A vector whose values
# are all identical is mapped to zeros instead of NaN (0 / 0), because NaN
# entries would propagate into every later distance computation.
normalize <- function(x) {
  rng <- range(x)
  span <- rng[2] - rng[1]
  # Exact comparison is intended: span is exactly 0 only when max == min.
  # isTRUE() keeps NA input on the original code path (all-NA result).
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / span
}
lapply函数,对所有的列进行同样的操作,欢迎比较sapply
# Apply normalize() to every predictor column (columns 2 to 31 of wbcd).
# lapply() returns a list, so it is converted back into a data frame.
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))

# Training set: the first 500 rows of the normalised predictors.
wb_t <- wbcd_n[1:500, ]
# Test set: the remaining rows, 501 to 569.
wb_e <- wbcd_n[501:569, ]

# Diagnosis labels (column 1 of wbcd) for the same training and test rows.
wb_t_labels <- wbcd[1:500, 1]
wb_e_labels <- wbcd[501:569, 1]

# knn() is provided by the class package, which must be loaded first.
library(class)

# Classify every test row from its k = 21 nearest training rows.
#   train: training predictors, test: rows to classify,
#   cl: class labels of the training rows, k: neighbours that vote.
# The result holds one predicted label per row of wb_e.
wpre <- knn(train = wb_t, test = wb_e, cl = wb_t_labels, k = 21)
使用gmodels包的CrossTable()函数来查看预测的详细结果,例如:CrossTable(x = wb_e_labels, y = wpre, prop.chisq = FALSE)
(2)朴素贝叶斯算法
原理我不说啦,反正我只知道一个大概,概率论这种东西下学期才学,所以我不急~~~有需要的自行XX
发现代码不见了。。。先贴一个别人的链接: http://www.cnblogs.com/tychyg/p/5345221.html
直接上结果
以后记得保存代码。。。Rstudio真可怕,R占内存也很可怕
嗯。。。代码找回来了,Rstudio还不熟。。。
先粘贴上,晚点再来注释了
# Naive Bayes spam filter for SMS messages ----

library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(e1071)
library(gmodels)

# Read the raw messages; the text column must stay character.
sms <- read.csv("F:/R语言/R ML/data/sms_spam1.csv", stringsAsFactors = FALSE)
# The message type is the class label, so it is stored as a factor.
sms$type <- factor(sms$type)

# Build a corpus with one document per message.
sms_corpus <- Corpus(VectorSource(sms$text))

# Clean the text. tolower() is a plain base function, so it is wrapped in
# content_transformer() to keep the result a corpus of text documents. That
# makes a separate PlainTextDocument conversion pass unnecessary.
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)

# Document-term matrix over the whole cleaned corpus.
sms_tmq <- DocumentTermMatrix(corpus_clean, control = list())

# Split the raw data, the document-term matrix and the cleaned corpus at
# the same row positions: 1-4500 for training, 4501-5571 for testing.
sms_train <- sms[1:4500, ]
sms_test <- sms[4501:5571, ]
sms_train_d <- sms_tmq[1:4500, ]
sms_test_d <- sms_tmq[4501:5571, ]
sms_cor_train <- corpus_clean[1:4500]
sms_cor_test <- corpus_clean[4501:5571]

# Check that both subsets have a similar proportion of each message type.
prop.table(table(sms_train$type))
prop.table(table(sms_test$type))

# Word cloud of the cleaned training corpus.
wordcloud(sms_cor_train, min.freq = 40, scale = c(4, .5), colors = "orange", random.order = FALSE)

# Separate word clouds for the raw text of each message type.
spam <- subset(sms_train, type == "spam")
ham <- subset(sms_train, type == "ham")
wordcloud(spam$text, min.freq = 40, scale = c(4, .5), colors = "orange", random.order = FALSE)
wordcloud(ham$text, min.freq = 40, scale = c(4, .5), colors = "orange", random.order = FALSE)

# Vocabulary: terms that occur at least 5 times in the training matrix.
sms_dict <- findFreqTerms(sms_train_d, 5)

# Rebuild the matrices restricted to that vocabulary. The control entry has
# to be named `dictionary`; an unnamed list element is not picked up, which
# would leave the vocabulary unrestricted.
sms_T <- DocumentTermMatrix(sms_cor_train, list(dictionary = sms_dict))
sms_Te <- DocumentTermMatrix(sms_cor_test, list(dictionary = sms_dict))

# Turn term counts into a presence indicator: a count of 0 becomes "No",
# any positive count becomes "Yes".
conver_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "Yes"))
}

# Apply the conversion column by column (MARGIN = 2: one column per term).
sms_T <- apply(sms_T, MARGIN = 2, conver_counts)
sms_Te <- apply(sms_Te, MARGIN = 2, conver_counts)

# Train on the training indicators, predict the test messages, and compare
# the predictions with the true types.
sms_classfier <- naiveBayes(sms_T, sms_train$type)
sms_prep <- predict(sms_classfier, sms_Te)
CrossTable(sms_prep, sms_test$type, prop.chisq = FALSE, prop.t = FALSE, dnn = c("predicted", "actual"))
再加一段,然后复习马克思了。。。。
# C5.0 decision trees on the credit data ----

library(C50)
library(gmodels)
library(RWeka)

# Ask for factors explicitly so that categorical columns do not depend on
# the R version's read.csv() default. This matches the mushroom and
# insurance read.csv() calls later in this file.
credit <- read.csv("F:/R语言/R ML/data/credit.csv", stringsAsFactors = TRUE)

# Quick look at the structure and a few columns.
str(credit)
table(credit$credit_history)
table(credit$amount)
summary(credit$savings_balance)
table(credit$default)

# Shuffle the rows reproducibly: order the rows by 1000 uniform random
# numbers drawn after fixing the seed.
set.seed(2354)
credit_Num <- credit[order(runif(1000)), ]

# Create the training set (900 rows) and the test set (100 rows).
credit_train <- credit_Num[1:900, ]
credit_test <- credit_Num[901:1000, ]

# Proportion of each outcome in both subsets.
prop.table(table(credit_train$default))
prop.table(table(credit_test$default))

# Fit a single tree. Column 17 is excluded from the predictors -- assumed
# to be the `default` outcome column; confirm against the CSV layout.
credit_model <- C5.0(credit_train[-17], as.factor(credit_train$default))

# Print the model, then predict the test set. The original had these two
# statements fused on one line, which does not parse.
credit_model
p <- predict(credit_model, credit_test)
CrossTable(credit_test$default, p, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual default', 'predict default'))

# Boosted model with 10 trials, evaluated the same way.
credit_boost <- C5.0(credit_train[-17], as.factor(credit_train$default), trials = 10)
pb <- predict(credit_boost, credit_test)
CrossTable(credit_test$default, pb, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual default', 'predict default'))
summary(pb)
summary(credit_boost)
# Mushroom test: rule learners from RWeka ----
mushroom <- read.csv("F:/R语言/R ML/data/mushrooms.csv", stringsAsFactors = TRUE)

# Inspect veil_type, then remove the column from the data frame.
mushroom$veil_type
mushroom$veil_type <- NULL

# OneR model predicting type from all remaining columns.
mush_pre <- OneR(type ~ ., mushroom)
summary(mush_pre)
mush_pre

# JRip model on the same formula. Printing the model and summarising it are
# two separate statements; the original fused them on one line, which does
# not parse.
mush_jip <- JRip(type ~ ., mushroom)
mush_jip
summary(mush_jip)
# Import the insurance data ----
library(psych)
insurance <- read.csv("F:/R语言/R ML/data/insurance.csv", stringsAsFactors = TRUE)
str(insurance)
table(insurance$children)

# Correlation matrix and scatterplot matrix of the numeric columns. Both
# use the original columns: age2 and bmi2 do not exist yet at this point,
# so selecting them here would fail with "undefined columns selected".
cor(insurance[c("age", "bmi", "children", "charges")])
pairs.panels(insurance[c("age", "bmi", "children", "charges")])

# Baseline model: charges regressed on every other column.
lm_model <- lm(charges ~ ., data = insurance)
summary(lm_model)

# Add squared terms for age and bmi, then refit on all columns.
insurance$age2 <- insurance$age ^ 2
insurance$bmi2 <- insurance$bmi ^ 2
lm_model2 <- lm(charges ~ ., data = insurance)
summary(lm_model2)

# Indicator: 1 when bmi is at least 30, otherwise 0.
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)

# Final model. The interaction uses the bmi30 indicator created above. The
# original wrote bmi*smoker, which left bmi30 unused.
ins_model <- lm(charges ~ age + age2 + children + bmi + sex + bmi30*smoker + region, data = insurance)
summary(ins_model)
# White wine quality: regression tree and M5P model tree ----
library(rpart.plot)
library(rpart)
library(RWeka)

wine <- read.csv(file = "F:/R语言/R ML/data/whitewines.csv")
str(wine)

# Distributions of the outcome and of three predictors.
hist(wine$quality)
hist(wine$pH)
hist(wine$citric.acid)
hist(wine$fixed.acidity)

# Rows 4001-4898 are held out for testing; rows 1-4000 are used to train.
wineTest <- wine[4001:4898, ]
wineTrain <- wine[seq_len(4000), ]

# Count the training wines whose quality equals 5 versus all others.
table(wineTrain$quality == 5)

# Regression tree of quality on all other columns.
lmt_model <- rpart(formula = quality ~ ., data = wineTrain)

# Predict the held-out rows and print the predictions, then the tree.
y <- predict(lmt_model, newdata = wineTest)
y
lmt_model

# The same prediction again under a second name, plotted and summarised.
pans <- predict(lmt_model, newdata = wineTest)
rpart.plot(
  lmt_model,
  digits = 3,
  fallen.leaves = TRUE,
  type = 3,
  extra = 101
)
summary(pans)

# M5P model tree fitted on the same training data.
wine_m <- M5P(formula = quality ~ ., data = wineTrain)