红酒质量预测模型——基于R的应用

2015-01-06 16:47阅读：

http://blog.sina.cn/dpool/blog/u/1931051810

数据来源：网络公开数据，采集自葡萄牙北部的红葡萄酒检测数据，网址：http://archive.ics.uci.edu/ml/datasets/Wine+Quality

结论：我们关注的是测试集的均方误差和相关系数，对于这个数据集，随机森林的均方误差最小，相关系数最大；其次是支持向量机；经典最小二乘回归和神经网络位于中间水平；决策树拟合效果最差。

1、数据导入

redwine<-read.csv(file='winequality-red.csv',header=T)
head(redwine)
## 混合酸挥发性酸柠檬酸残糖氯化物二氧化硫总二氧化硫量密度 PH
## 1 7.4 0.70 0.00 1.9 0.076

11 34 0.9978 3.51
## 2 7.8 0.88 0.00 2.6 0.098 25 67 0.9968 3.20
## 3 7.8 0.76 0.04 2.3 0.092 15 54 0.9970 3.26
## 4 11.2 0.28 0.56 1.9 0.075 17 60 0.9980 3.16
## 5 7.4 0.70 0.00 1.9 0.076 11 34 0.9978 3.51
## 6 7.4 0.66 0.00 1.8 0.075 13 40 0.9978 3.51
## 硫酸盐酒精质量
## 1 0.56 9.4 5
## 2 0.68 9.8 5
## 3 0.65 9.8 5
## 4 0.58 9.8 6
## 5 0.56 9.4 5
## 6 0.56 9.4 5

str(redwine)
## 'data.frame': 1599 obs. of 12 variables:
## $ 混合酸 : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ 挥发性酸 : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ 柠檬酸 : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ 残糖 : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ 氯化物 : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ 二氧化硫 : num 11 25 15 17 11 13 15 15 9 17 ...
## $ 总二氧化硫量: num 34 67 54 60 34 40 59 21 18 102 ...
## $ 密度 : num 0.998 0.997 0.997 0.998 0.998 ...
## $ PH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ 硫酸盐 : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ 酒精 : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ 质量 : int 5 5 5 6 5 5 5 7 7 5 ...

summary(redwine)
## 混合酸挥发性酸柠檬酸残糖
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## 氯化物二氧化硫总二氧化硫量密度
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## PH 硫酸盐酒精质量
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000

require(VIM)
require(mice)

sum(is.na(redwine))
## [1] 0

2、运用线性回归初步建模

require(car)
require(gvlma)

fit.lm<-lm(质量~.,data=redwine)
summary(fit.lm)
##
## Call:
## lm(formula = 质量 ~ ., data = redwine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.68911 -0.36652 -0.04699 0.45202 2.02498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.197e+01 2.119e+01 1.036 0.3002
## 混合酸 2.499e-02 2.595e-02 0.963 0.3357
## 挥发性酸 -1.084e+00 1.211e-01 -8.948 < 2e-16 ***
## 柠檬酸 -1.826e-01 1.472e-01 -1.240 0.2150
## 残糖 1.633e-02 1.500e-02 1.089 0.2765
## 氯化物 -1.874e+00 4.193e-01 -4.470 8.37e-06 ***
## 二氧化硫 4.361e-03 2.171e-03 2.009 0.0447 *
## 总二氧化硫量 -3.265e-03 7.287e-04 -4.480 8.00e-06 ***
## 密度 -1.788e+01 2.163e+01 -0.827 0.4086
## PH -4.137e-01 1.916e-01 -2.159 0.0310 *
## 硫酸盐 9.163e-01 1.143e-01 8.014 2.13e-15 ***
## 酒精 2.762e-01 2.648e-02 10.429 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.648 on 1587 degrees of freedom
## Multiple R-squared: 0.3606, Adjusted R-squared: 0.3561
## F-statistic: 81.35 on 11 and 1587 DF, p-value: < 2.2e-16

gvlma(fit.lm)
## Value p-value Decision
## Global Stat 43.13055 9.722e-09 Assumptions NOT satisfied!
## Skewness 7.54217 6.027e-03 Assumptions NOT satisfied!
## Kurtosis 33.42319 7.413e-09 Assumptions NOT satisfied!
## Link Function 2.09097 1.482e-01 Assumptions acceptable.
## Heteroscedasticity 0.07422 7.853e-01 Assumptions acceptable.

#### 正态性检验
qqPlot(fit.lm,id.method='identify',simulate=T,main='Q-Q Plot')

#### 方差齐性检验
ncvTest(fit.lm)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 22.55524 Df = 1 p = 2.041867e-06
spreadLevelPlot(fit.lm)
##
## Suggested power transformation: -1.335885

#### 多重共线性检验
sqrt(vif(fit.lm))>2 # 混合酸,密度存在共线性
## 混合酸挥发性酸柠檬酸残糖氯化物
## TRUE FALSE FALSE FALSE FALSE
## 二氧化硫总二氧化硫量密度 PH 硫酸盐
## FALSE FALSE TRUE FALSE FALSE
## 酒精
## FALSE

#### 异常点检验
outlierTest(fit.lm)
## rstudent unadjusted p-value Bonferonni p
## 833 -4.194088 2.8912e-05 0.046231
influencePlot(fit.lm,id.method='identify',main='Influence Plot',sub='Circle size is proportional to Cook's distance')

redwine2<-redwine[-c(496,1550,1062,1121,833,639,441,1404,829,391,282,1080,1082,14,87,152,93,724,1300,653,46,1375,691,80,162,1430,1485,354,1277,900,1240,1430,834,1308,1506,259,92,107,755,1320,170,107,1236,1479,460,1203,456,1091,877,470,518,82,1052,292,18,1166,452,69,84,1371),] # 删除主要异常点

3、运用全子集回归筛选变量

require(leaps)
fit.leaps<-regsubsets(质量~.,data=redwine2,nbest=4)
plot(fit.leaps,scale='adjr2') # 挥发性酸,残糖,氯化物,二氧化硫,总二氧化硫量,PH,硫酸盐,酒精

require(dplyr)
redwine3<-select(redwine2,挥发性酸,残糖,氯化物,二氧化硫,总二氧化硫量,PH,硫酸盐,酒精,质量)

4、线性回归二次建模

fit.lm2<-lm(质量~.,data=redwine3)
summary(fit.lm2)
##
## Call:
## lm(formula = 质量 ~ ., data = redwine3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98818 -0.37002 -0.06038 0.42923 1.88715
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.931610 0.379523 10.359 < 2e-16 ***
## 挥发性酸 -0.858573 0.096161 -8.929 < 2e-16 ***
## 残糖 0.018038 0.011328 1.592 0.111534
## 氯化物 -1.543442 0.508930 -3.033 0.002464 **
## 二氧化硫 0.005270 0.002015 2.616 0.008987 **
## 总二氧化硫量 -0.003963 0.000673 -5.889 4.77e-09 ***
## PH -0.385400 0.110383 -3.491 0.000494 ***
## 硫酸盐 1.159007 0.115567 10.029 < 2e-16 ***
## 酒精 0.276418 0.016004 17.272 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5879 on 1532 degrees of freedom
## Multiple R-squared: 0.3936, Adjusted R-squared: 0.3904
## F-statistic: 124.3 on 8 and 1532 DF, p-value: < 2.2e-16

gvlma(fit.lm2)
## Value p-value Decision
## Global Stat 11.06578 0.0258346 Assumptions NOT satisfied!
## Skewness 0.03971 0.8420522 Assumptions acceptable.
## Kurtosis 0.01369 0.9068656 Assumptions acceptable.
## Link Function 10.84008 0.0009933 Assumptions NOT satisfied!
## Heteroscedasticity 0.17230 0.6780765 Assumptions acceptable.

redwine4<-select(redwine3,-残糖) # 剔除残糖
fit.lm3<-lm(质量~.,data=redwine4)
summary(fit.lm3)
##
## Call:
## lm(formula = 质量 ~ ., data = redwine4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.99606 -0.36929 -0.05929 0.43749 1.89270
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9890498 0.3779935 10.553 < 2e-16 ***
## 挥发性酸 -0.8538857 0.0961640 -8.879 < 2e-16 ***
## 氯化物 -1.4451714 0.5054265 -2.859 0.004303 **
## 二氧化硫 0.0056561 0.0020010 2.827 0.004764 **
## 总二氧化硫量 -0.0039044 0.0006723 -5.808 7.68e-09 ***
## PH -0.4016225 0.1099664 -3.652 0.000269 ***
## 硫酸盐 1.1590215 0.1156251 10.024 < 2e-16 ***
## 酒精 0.2785644 0.0159547 17.460 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5882 on 1533 degrees of freedom
## Multiple R-squared: 0.3926, Adjusted R-squared: 0.3898
## F-statistic: 141.5 on 7 and 1533 DF, p-value: < 2.2e-16

gvlma(fit.lm3)
## Value p-value Decision
## Global Stat 10.484193 0.033015 Assumptions NOT satisfied!
## Skewness 0.148788 0.699696 Assumptions acceptable.
## Kurtosis 0.003346 0.953874 Assumptions acceptable.
## Link Function 10.208782 0.001398 Assumptions NOT satisfied!
## Heteroscedasticity 0.123277 0.725507 Assumptions acceptable.

redwine5<-select(redwine4,-二氧化硫) # 剔除二氧化硫
fit.lm4<-lm(质量~.,data=redwine5)
summary(fit.lm4)
##
## Call:
## lm(formula = 质量 ~ ., data = redwine5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.99787 -0.37365 -0.06407 0.43956 1.87249
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8351426 0.3749028 10.230 < 2e-16 ***
## 挥发性酸 -0.8812013 0.0958949 -9.189 < 2e-16 ***
## 氯化物 -1.4454704 0.5065768 -2.853 0.00438 **
## 总二氧化硫量 -0.0025992 0.0004897 -5.307 1.27e-07 ***
## PH -0.3516895 0.1087854 -3.233 0.00125 **
## 硫酸盐 1.1704491 0.1158173 10.106 < 2e-16 ***
## 酒精 0.2809630 0.0159683 17.595 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5895 on 1534 degrees of freedom
## Multiple R-squared: 0.3894, Adjusted R-squared: 0.387
## F-statistic: 163 on 6 and 1534 DF, p-value: < 2.2e-16

gvlma(fit.lm4) # 模型整体通过检验
## Value p-value Decision
## Global Stat 7.125266 0.12941 Assumptions acceptable.
## Skewness 0.069559 0.79198 Assumptions acceptable.
## Kurtosis 0.001136 0.97312 Assumptions acceptable.
## Link Function 6.908247 0.00858 Assumptions NOT satisfied!
## Heteroscedasticity 0.146324 0.70207 Assumptions acceptable.

5、五折交叉验证

NMSE.training<-rep(0,time=5)
NMSE.testing<-NMSE.training
cor.training<-rep(0,time=5)
cor.testing<-cor.training
index.1<-1:length(redwine5[,1])
index.2<-rep(1:5,time=ceiling(length(redwine5[,1])/5))[1:length(redwine5[,1])]
set.seed(111)
index.3<-sample(index.2,size=length(redwine5[,1]))
for(i in 1:5){
t<-index.1[index.3==i]
training<-redwine5[-t,] # 组建训练集
testing<-redwine5[t,] # 组建测试集
fit.lm5<-lm(质量~.,data=training) # 模型训练
pred.training<-predict(fit.lm5,newdata=training) # 对训练集预测
pred.testing<-predict(fit.lm5,newdata=testing) # 对测试集预测
NMSE.training[i]<-mean((training$质量-pred.training)^2)/mean((training$质量-mean(training$质量))^2) # 训练集均方误差
NMSE.testing[i]<-mean((testing$质量-pred.testing)^2)/mean((testing$质量-mean(testing$质量))^2) # 测试集均方误差
cor.training[i]<-cor(training$质量,pred.training) # 训练集相关系数
cor.testing[i]<-cor(testing$质量,pred.testing) # 测试集相关系数
}
mean(NMSE.training) # 0.6098425
## [1] 0.6098425
mean(NMSE.testing) # 0.6192012
## [1] 0.6192012
mean(cor.training) # 0.6245795
## [1] 0.6245795
mean(cor.testing) # 0.6195533
## [1] 0.6195533

6、决策树模型验证

require(rpart.plot)
for(i in 1:5){
t<-index.1[index.3==i]
training<-redwine5[-t,]
testing<-redwine5[t,]
fit.rpart<-rpart(质量~.,data=training)
pred.training<-predict(fit.rpart,newdata=training)
pred.testing<-predict(fit.rpart,newdata=testing)
NMSE.training[i]<-mean((training$质量-pred.training)^2)/mean((training$质量-mean(training$质量))^2)
NMSE.testing[i]<-mean((testing$质量-pred.testing)^2)/mean((testing$质量-mean(testing$质量))^2)
cor.training[i]<-cor(training$质量,pred.training)
cor.testing[i]<-cor(testing$质量,pred.testing)
}
mean(NMSE.training) # 0.6004183
## [1] 0.6004183
mean(NMSE.testing) # 0.7019723
## [1] 0.7019723
mean(cor.training) # 0.6320459
## [1] 0.6320459
mean(cor.testing) # 0.5526312
## [1] 0.5526312

7、随机森林模型验证

require(randomForest)
for(i in 1:5){
t<-index.1[index.3==i]
training<-redwine5[-t,]
testing<-redwine5[t,]
fit.randomForest<-randomForest(质量~.,data=training,importance=T,proximity=T,ntree=30)
pred.training<-predict(fit.randomForest,newdata=training)
pred.testing<-predict(fit.randomForest,newdata=testing)
NMSE.training[i]<-mean((training$质量-pred.training)^2)/mean((training$质量-mean(training$质量))^2)
NMSE.testing[i]<-mean((testing$质量-pred.testing)^2)/mean((testing$质量-mean(testing$质量))^2)
cor.training[i]<-cor(training$质量,pred.training)
cor.testing[i]<-cor(testing$质量,pred.testing)
}
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
mean(NMSE.training) # 0.1210717
## [1] 0.1210717
mean(NMSE.testing) # 0.5092464
## [1] 0.5092464
mean(cor.training) # 0.9528049
## [1] 0.9528049
mean(cor.testing) # 0.6930467
## [1] 0.6930467

8、神经网络模型验证

require(nnet)
for(i in 1:5){
t<-index.1[index.3==i]
training<-redwine5[-t,]
testing<-redwine5[t,]
fit.nnet<-nnet(质量~.,data=training,size=10,decay=0.01,maxit=1000,linout=T,trace=F)
pred.training<-predict(fit.nnet,newdata=training)
pred.testing<-predict(fit.nnet,newdata=testing)
NMSE.training[i]<-mean((training$质量-pred.training)^2)/mean((training$质量-mean(training$质量))^2)
NMSE.testing[i]<-mean((testing$质量-pred.testing)^2)/mean((testing$质量-mean(testing$质量))^2)
cor.training[i]<-cor(training$质量,pred.training)
cor.testing[i]<-cor(testing$质量,pred.testing)
}
mean(NMSE.training) # 0.5072141
## [1] 0.5072141
mean(NMSE.testing) # 0.6369096
## [1] 0.6369096
mean(cor.training) # 0.702109
## [1] 0.702109
mean(cor.testing) # 0.6254076
## [1] 0.6254076

9、支持向量机模型验证

require(e1071)
for(i in 1:5){
t<-index.1[index.3==i]
training<-redwine5[-t,]
testing<-redwine5[t,]
fit.svm<-svm(质量~.,data=training)
pred.training<-predict(fit.svm,newdata=training)
pred.testing<-predict(fit.svm,newdata=testing)
NMSE.training[i]<-mean((training$质量-pred.training)^2)/mean((training$质量-mean(training$质量))^2)
NMSE.testing[i]<-mean((testing$质量-pred.testing)^2)/mean((testing$质量-mean(testing$质量))^2)
cor.training[i]<-cor(training$质量,pred.training)
cor.testing[i]<-cor(testing$质量,pred.testing)
}
mean(NMSE.training) # 0.500947
## [1] 0.500947
mean(NMSE.testing) # 0.5883868
## [1] 0.5883868
mean(cor.training) # 0.7085196
## [1] 0.7085196
mean(cor.testing) # 0.6472574
## [1] 0.6472574

10、汇总对比

compare<-data.frame(
model=c('lm','rpart','randomForest','nnet','svm'),
NMSE.training=c(0.6098425,0.6004183,0.1210717,0.5072141,0.500947),
NMSE.testing=c(0.6192012,0.7019723,0.5092464,0.6369096,0.5883868),
cor.training=c(0.6245795,0.6320459,0.9528049,0.702109,0.7085196),
cor.testing=c(0.6195533,0.5526312,0.6930467,0.6254076,0.6472574)
)
require(ggplot2)
ggplot(compare,aes(x=NMSE.testing,y=cor.testing))+geom_point()+geom_text(aes(label=model))+geom_line(aes(x=mean(NMSE.testing)))+geom_line(aes(y=mean(cor.testing)))

总结：我们关注的是测试集的均方误差和相关系数，对于这个数据集，随机森林的均方误差最小，相关系数最大；其次是支持向量机；经典最小二乘回归和神经网络位于中间水平；决策树拟合效果最差。

举报/Report

我的更多文章

下载客户端阅读体验更佳

APP专享

新浪博客