R做评分卡模型(woe转化为评分)
评分卡模型流程
参考书:《信用风险评分卡研究》
数据读取和EDA
Kaggle数据cs-training,共15万条
https://www.kaggle.com/c/GiveMeSomeCredit/data
rawdata<-read.csv(file.choose(),header = T)#读取原始数据
data<-rawdata[,-1]#去除序列号列
#剔除缺失值超过20%或单一值超过80%的变量
l=c()
for(j in 2:ncol(data))
{
if(length(which(is.na(data[,j])))/nrow(data)>=0.2|max(table(data[,j]))>=0.8*nrow(data))
{l=c(l,j)}
}
#剔除有缺失值的行(个体)
data=data[,-l]
l=c()
for(i in 1:nrow(data))
{
if(length(which(is.na(data[i,])))>0)
{l=c(l,i)}
}
data=data[-l,]
注:数据EDA分析和处理要根据具体数据和业务去做,这里主要介绍评分卡模型,数据处理简化处理
数据准备(计算woe、分箱)
连续变量的数据准备
定义qua函数:返回变量的K等分位点
#fwd[1]-1是为了方便统一后续的区间编写(前开后闭),输出的结果会改写回来
qua<-function(var,K)
{
fwd=round(unique(quantile(var,0:K/K)),4)
fwd[1]=fwd[1]-1
return(fwd)
}
这里数据较少,大家可以观察每个变量的分布,自行分箱。
定义函数binning:对连续变量分箱(K等分),返回变量指标var_index(list格式):WOE(每个个体在该变量上的woe值),index(每个箱的最小值,最大值,坏的数量,坏的比率,好的数量,好的比率,总数量,woe值,iv值),IV(该变量的iv值),quantitl(分位点)
#usqua=T:使用默认K等分分箱,否则自行定义分位点fwd(注:包括前后端点)
#若某一箱全为好或全为坏,则woe为0,iv为0
binning<-function(var,K,fwd,usequa=T)
{
if(usequa)
{
fwd=qua(var,K)
}
numallgood=length(which(data[,1]==0))
numallbad=length(which(data[,1]==1))
WOE=data.frame(matrix(0,nrow(data),1));var_index=list()
num_bad=0;num_good=0;rate_bad=0;rate_good=0;woe=0;iv=0;IV=0
index=data.frame(matrix(0,length(fwd)-1,9))
colnames(index)=c("min","max","num_bad","rate_bad","num_good","rate_good","count","woe","iv")
for(k in 1:(length(fwd)-1))
{
num_bad=length(which(data[,1][which(var>fwd[k]&var<=fwd[k+1])]==1))
num_good=length(which(data[,1][which(var>fwd[k]&var<=fwd[k+1])]==0))
rate_bad=num_bad/numallbad
rate_good=num_good/numallgood
if(num_bad!=0&num_good!=0)
{
woe=log(rate_bad/rate_good)
iv=(rate_bad-rate_good)*woe
}
if(num_bad==0|num_good==0)
{
woe=0;iv=0
}
WOE[which(var>fwd[k]&var<=fwd[k+1]),1]=woe
index$min[k]=fwd[k];index$max[k]=fwd[k+1]
index$num_bad[k]=num_bad;index$rate_bad[k]=rate_bad
index$num_good[k]=num_good;index$rate_good[k]=rate_good
index$count[k]=num_bad+num_good
index$woe[k]=woe;index$iv[k]=iv
IV=IV+iv
}
index$min[1]=index$min[1]+1
var_index[[1]]=WOE
var_index[[2]]=index
var_index[[3]]=IV
var_index[[4]]=fwd
names(var_index)=c("WOE","index","IV","quantitl")
return(var_index)
}
运行代码
age_index=binning(data$age,K=10,usequa=T)
age_index
得到结果:
定义合箱函数closing:根据woe相邻最近的两箱进行合并
var 是某一变量,var_index是binning函数对var返回的结果
closing<-function(var,var_index)
{
fwd=var_index$quantitl
woe=var_index$index$woe
diff_woe=abs(diff(woe))
num_min=which.min(diff_woe)
fwd=fwd[-(num_min+1)]
var_index_byclosing=binning(var,fwd=fwd,usequa = F)
return(var_index_byclosing)
}
运行代码
age_index_byclosing=closing(data$age,age_index)
age_index_byclosing
得到结果:由10箱合并为9箱
定义按单调性函数clobymono:根据变量各箱woe值的单调性合箱,非单调且箱数大于K_min时循环合箱
(注:有些变量U型,或倒U型,这里暂不考虑)
clobymono<-function(var,K,K_min)
{
var_index=binning(var,K,usequa = T)
woe=var_index$index$woe
while(length(which(woe!=sort(woe)))>0&
length(which(woe!=sort(woe,decreasing = T)))>0&
length(var_index$quantitl)>=K_min+1)
{
var_index=closing(var,var_index)
woe=var_index$index$woe
if(length(var_index$quantitl)==(K_min+1))
{break}
}
return(var_index)
}
运行代码
age_index=clobymono(data$age,10,5)
age_index
得到结果
这里结果与binning(data$age,10,ueage=T)相同,是因为woe单调,无需合箱。我们带入另一个变量再观察:
运行代码
MonthlyIncome_index=clobymono(data$MonthlyIncome,10,5)
MonthlyIncome_index
得到结果
变量MonthlyIncome一开始等分为10箱,直到合为6箱woe才单调
定义连续变量指标函数convarindex
#all continuous variable\'s index
convarindex<-function(data,K,K_min)
{
convar_index=list()
WOEmatrix=as.data.frame(data[,1],norw=nrow(data),ncol=1)
colnames(WOEmatrix)[1]=colnames(data)[1]
for(j in 2:ncol(data))
{
var_index=clobymono(data[,j],K,K_min)
WOEmatrix=cbind(WOEmatrix,var_index$WOE)
colnames(WOEmatrix)[j]=colnames(data)[j]
convar_index[[j-1]]=var_index[-1]
names(convar_index)[j-1]=colnames(data)[j]
}
convar_index[[length(convar_index)+1]]=WOEmatrix
names(convar_index)[length(convar_index)]="WOEmatrix"
return(convar_index)
}
这里取4个连续变量演示,data的第一列为目标值
运行代码:
convar_index=convarindex(data[,1:5],10,5)
convar_index
得到结果:所有连续变量指标和连续变量woe矩阵
离散变量的数据准备
定义分类函数classifing:返回离散变量指标
classifing<-function(var)
{
classify<-unique(var)
numallgood=length(which(data[,1]==0))
numallbad=length(which(data[,1]==1))
WOE=data.frame(matrix(0,nrow(data),1));var_index=list()
num_bad=0;num_good=0;rate_bad=0;rate_good=0;woe=0;iv=0; IV=0
index=data.frame(matrix(0,length(classify),8))
colnames(index)=c("classify","num_bad","rate_bad","num_good","rate_good","count","woe","iv")
for(k in 1:length(classify))
{
num_bad=length(which(data[,1][which(var==classify[k])]==1))
num_good=length(which(data[,1][which(var==classify[k])]==0))
rate_bad=num_bad/numallbad
rate_good=num_good/numallgood
if(num_bad!=0&num_good!=0)
{
woe=log(rate_bad/rate_good)
iv=(rate_bad-rate_good)*woe
}
if(num_bad==0|num_good==0)
{
woe=0;iv=0
}
WOE[which(var==classify[k]),1]=woe
index$classify[k]=classify[k]
index$num_bad[k]=num_bad;index$rate_bad[k]=rate_bad
index$num_good[k]=num_good;index$rate_good[k]=rate_good
index$count[k]=num_bad+num_good
index$woe[k]=woe;index$iv[k]=iv
IV=IV+iv
}
var_index[[1]]=WOE
var_index[[2]]=index
var_index[[3]]=IV
var_index[[4]]=classify
names(var_index)=c("WOE","index","IV","classify")
return(var_index)
}
运行代码:
numofdep_index=classifing(data$NumberOfDependents)
numofdep_index
得到结果:
共13类,与连续变量相同,若某一类全为好或全为坏,则woe为0,iv为0(NumberOfDependents其实也是离散变量,这里只是作为演示)
定义所有离散变量指标函数clavarindex,与连续变量一致,返回所有变量指标和woe矩阵(合为list)
clavarindex<-function(data)
{
cladata_index=list()
WOEmatrix=data.frame(data[,1])
for(j in 2:ncol(data))
{
classifyvar_index=classifing(data[,j])
cladata_index[[j-1]]=classifyvar_index[-1]
WOEmatrix=cbind(WOEmatrix,classifyvar_index[[1]])
colnames(WOEmatrix)[j]=colnames(data)[j]
}
cladata_index[[length(cladata_index)+1]]=WOEmatrix
names(cladata_index)=c(colnames(data)[-1],"WOEmatrix")
return(cladata_index)
}
运行代码:
cladata_index=clavarindex(data[,c(1,6,7)])
cladata_index
这里,假设第6和第7列为离散变量,第一列为目标值,结果与连续变量一致,这里不再给出图形结果。
以上出去数据处理以外,从qua函数开始的所有函数都是为了下面allvarindex函数调取所用:
定义所有变量指标函数allvarindex:返回所有变量指标和woe矩阵
(注:concol为连续变量的列,不包括第一列目标值列)
#combining continuous and classify indexes
allvarindex<-function(data,K,K_min,concol)
{
conindex=convarindex(data[,c(1,concol)],K,K_min)
claindex=clavarindex(data[,-concol])
all_index=append(conindex[-length(conindex)],claindex[-length(claindex)])
WOEmatrix=cbind(conindex$WOEmatrix,claindex$WOEmatrix[,-1])
colnames(WOEmatrix)=c(colnames(conindex$WOEmatrix),colnames(claindex$WOEmatrix)[-1])
all_index[[length(all_index)+1]]=WOEmatrix
names(all_index)[length(all_index)]="WOEmatrix"
return(all_index)
}
运行代码:
all_index=allvarindex(data,10,5,c(2:7))
all_index
给出all_index的内容:
得出所有变量的指标和WOEmatrix(woe矩阵)
可以看出变量NumberRealEstateLoansOrLines的值为0、1、2的较多
变量选择
下面对woe矩阵建立逐步logistic回归模型:
运行代码:
library(caret)
set.seed(1234)
woeMatrix<-all_index$WOEmatrix
splitIndex<-createDataPartition(woeMatrix[,1],time=1,p=0.5,list=FALSE)
train<-woeMatrix[splitIndex,];traindata=data[splitIndex,]
test<-woeMatrix[-splitIndex,];testdata=data[splitIndex,]
glm_fit=glm(woeMatrix[,1]~.,family = binomial,woeMatrix[,-1])
step_glm_fit=step(glm_fit)
pre=predict(step_glm_fit,test,type="response")
注:train、test分别为woe矩阵的训练集和测试集;traindata、testdata分别为数据data的训练集和测试集。
运行代码
summary(step_glm_fit)
结果如下:
逐步回归后不通过的变量不再出现在模型中,在转化为评分卡时,只调用留下来的变量的woe。
模型评估
ROC曲线
计算ROC画出图形:
library(pROC)
modelroc <- roc(test[,1],pre)
plot(modelroc, print.auc=T, auc.polygon=T, grid=c(0.2, 0.1),
grid.col=c("red", "yellow"), max.auc.polygon=T,
auc.polygon.col="seagreen2", print.thres=T)
结果如下:
KS曲线
计算KS
定义KS函数ksindex:这里pre是对test的预测,所以y应该是test[,1]
ksindex<-function(y,pre,K=10)
{
per_bad=c();per_good=c()
y2=y[order(rank(pre),decreasing = T)]
numallgood=length(which(y2==0))
numallbad=length(which(y2==1))
for(i in 1:K)
{
fwd=floor(quantile(0:length(y2),0:K/K))
per_bad[i]=length(which(y2[(fwd[i]+1):fwd[i+1]]==1))*100/numallbad
per_good[i]=length(which(y2[(fwd[i]+1):fwd[i+1]]==0))*100/numallgood
}
acc_per_bad=c();acc_per_good=c()
for(i in 1:K)
{
acc_per_bad[i]=sum(per_bad[1:i])
acc_per_good[i]=sum(per_good[1:i])
}
per_diff=acc_per_bad-acc_per_good
ksindex=data.frame(acc_per_bad,acc_per_good,per_diff)
colnames(ksindex)=c("acc_per_bad","acc_per_good","per_diff")
return(ksindex)
}
运行代码:
ks_index=ksindex(test[,1],pre)
ks_index
有如下结果:
定义ks作图函数ksplot:
ksplot<-function(ksindex,goodcol="blue",badcol="darkgreen",kscol="red")
{
plot(1:nrow(ksindex),ksindex$acc_per_good,type="l",col=goodcol,lwd=2,ylab ="acc_per %",
xlim=c(1,10),ylim=c(0,100),xlab="Decili",main="KS curve")
points(1:nrow(ksindex),ksindex$acc_per_good,cex=0.3)
text(floor(nrow(ksindex)*0.2)+1,ksindex$acc_per_good[floor(nrow(ksindex)*0.2)]-10,"goodrate",col = goodcol)
lines(1:nrow(ksindex),ksindex$acc_per_bad,"l",col=badcol,lwd=2)
points(1:nrow(ksindex),ksindex$acc_per_bad,cex=0.3)
text(floor(nrow(ksindex)*0.8)-1,ksindex$acc_per_bad[floor(nrow(ksindex)*0.8)]-10,"badrate",col=badcol)
lines(1:nrow(ksindex),ksindex$per_diff,col=kscol,lwd=2)
points(1:nrow(ksindex),ksindex$per_diff,cex=0.3)
points(which.max(ksindex$per_diff),ksindex$per_diff[which.max(ksindex$per_diff)],cex=1,pch=16)
text(which.max(ksindex$per_diff),ksindex$per_diff[which.max(ksindex$per_diff)]+5,
paste("KS=",round(ksindex$per_diff[which.max(ksindex$per_diff)],2),"%",sep = ""),col=kscol)
}
运行代码:
ksplot(ks_index)
输出图形
评分卡刻度
定义计算变量woe函数(个体某一变量的值对应的woe值,若离散变量的类不在训练集中,则输出woe为0)get_varwoe:
get_varwoe<-function(colvar,var_value)
{
if(names(colvar)[3]=="quantitl")
{
k=min(which(var_value<=colvar$quantitl))
if(k==1)
{k=2}
if(length(k)==0)
{k=length(colvar$quantitl)}
woe_var=colvar$index$woe[k-1]
}
if(names(colvar)[3]=="classify")
{
if(var_value%in%colvar$classify)
{
woe_var=colvar$index$woe[var_value==colvar$classify]
}
else
{woe_var=0;print("new classify")}
}
return(woe_var)
}
例如:得出第一个测试样本的age值所对应的woe值;
运行代码:
get_varwoe(all_index$age,test$age[1])
得出结果:
定义计算变量得分函数var_score:colvar_index为allvarindex函数返回值的某一元素变量的index元素
var_score<-function(colvar_index)
{
df<-data.frame()
if(colnames(colvar_index)[1]=="min")
{
df[1,1]=paste("[",as.character(round(colvar_index$min[1],4)),",",
as.character(round(colvar_index$max[1],4)),"]",sep = "")
df[2:nrow(colvar_index),1]=paste("(",as.character(round(colvar_index$min[2:nrow(colvar_index)],4)),
",",
as.character(round(colvar_index$max[2:nrow(colvar_index)],4)),"]",
sep = "")
}
if(colnames(colvar_index)[1]=="classify")
{
df=colvar_index$classify
}
df=cbind(df,colvar_index$woe)
rownames(df)=as.character(c(1:nrow(df)))
return(df)
}
运行代码:
var_score(all_index$age$index)
输出如下:
定义评分卡函数score_card:
注:若theta取1,表明theta为默认值:样本中的坏好比
model_fit为模型拟合对象,allvar_index为函数allvarindex返回的结果,若list取真,则输出list格式,若list为假输出data.frame格式。
scorecard<-function(y,theta=1,PDO,p0,model_fit,allvar_index,list=T)
{
if (theta==1)
{
theta=length(which(y==1))/length(which(y==0))
}
B=PDO/log(2);A=p0+B*log(theta)
beta<-model_fit$coefficients
namesofmodelcoef=names(beta)
allvar_index=allvar_index[names(allvar_index)%in%namesofmodelcoef]
score_card=list()
varscore=data.frame()
for(i in 1:length(allvar_index))
{
varscore=var_score(allvar_index[[i]]$index)
varscore[,2]=floor(-varscore[,2]*B*beta[i+1]+(A-B*beta[1])/length(allvar_index))
varscore=cbind(as.data.frame(rep(names(allvar_index)[i],nrow(varscore))),varscore)
colnames(varscore)=c("varname","interval","score")
score_card[[i]]=varscore
names(score_card)[i]=names(allvar_index)[i]
}
if(!list)
{
sc=score_card[[1]]
for(i in 2:length(score_card))
{
sc=rbind(sc,score_card[[i]])
}
score_card=sc
rownames(score_card)=as.character(1:nrow(score_card))
}
return(score_card)
}
运行代码:
score_card=scorecard(data[,1],1,50,580,step_glm_fit,all_index,T)
score_card
输出结果:
运行代码:
score_card=scorecard(data[,1],1,50,580,step_glm_fit,all_index,F)
score_card
输出结果:
代码流程总结
数据预处理完成后,按以下代码即可得到评分卡:
#计算woe等相关指标
all_index=allvarindex(data,K=10,K_min=5,concol=c(2:7))
#建立logistic回归模型
library(caret)
set.seed(1234)
woeMatrix<-all_index$WOEmatrix
splitIndex<-createDataPartition(woeMatrix[,1],time=1,p=0.5,list=FALSE)
train<-woeMatrix[splitIndex,];traindata=data[splitIndex,]
test<-woeMatrix[-splitIndex,];testdata=data[splitIndex,]
glm_fit=glm(woeMatrix[,1]~.,family = binomial,woeMatrix[,-1])
step_glm_fit=step(glm_fit)
pre=predict(step_glm_fit,test,type="response")
#观察ROC曲线和KS曲线
library(pROC)
modelroc <- roc(test[,1],pre)
plot(modelroc, print.auc=T, auc.polygon=T, grid=c(0.2, 0.1),
grid.col=c("red", "yellow"), max.auc.polygon=T,
auc.polygon.col="seagreen2", print.thres=T)
ks_index=ksindex(test[,1],pre)
ksplot(ks_index)
#woe转化为评分卡
score_card=scorecard(data[,1],1,50,580,step_glm_fit,all_index,F)
score_card
当然,这只是个不精细的通用代码,在数据预处理、变量分析与筛选的过程中往往要精细化,比如观察每个变量的分布,对异常值的分箱考虑等。Kaggle的cs-training.csv的变量都是连续变量,但家属数量“类别”较少,这里就当作离散变量演示。