R做评分卡模型(woe转化为评分) - zjwkk

时间:2024-03-18 08:01:13

R做评分卡模型(woe转化为评分)

评分卡模型流程

参考书:《信用风险评分卡研究》

数据读取和EDA

Kaggle数据cs-training,共15万条
https://www.kaggle.com/c/GiveMeSomeCredit/data

rawdata<-read.csv(file.choose(),header = T)#读取原始数据
data<-rawdata[,-1]#去除序列号列
#剔除缺失值超过20%或单一值超过80%的变量
l=c()
for(j in 2:ncol(data))
{
if(length(which(is.na(data[,j])))/nrow(data)>=0.2|max(table(data[,j]))>=0.8*nrow(data))
    {l=c(l,j)}
}
#剔除有缺失值的行(个体)
data=data[,-l]
l=c()
for(i in 1:nrow(data))
{
  if(length(which(is.na(data[i,])))>0)
    {l=c(l,i)}
}
data=data[-l,]

注:数据EDA分析和处理要根据具体数据和业务去做,这里主要介绍评分卡模型,数据处理简化处理

数据准备(计算woe、分箱)

连续变量的数据准备

定义qua函数:返回变量的K等分位点

#fwd[1]-1是为了方便统一后续的区间编写(前开后闭),输出的结果会改写回来
qua<-function(var,K)
{
  fwd=round(unique(quantile(var,0:K/K)),4)
  fwd[1]=fwd[1]-1
  return(fwd)
}

这里数据较少,大家可以观察每个变量的分布,自行分箱。

定义函数binning:对连续变量分箱(K等分),返回变量指标var_index(list格式):WOE(每个个体在该变量上的woe值),index(每个箱的最小值,最大值,坏的数量,坏的比率,好的数量,好的比率,总数量,woe值,iv值),IV(该变量的iv值),quantitl(分位点)

#usqua=T:使用默认K等分分箱,否则自行定义分位点fwd(注:包括前后端点)
#若某一箱全为好或全为坏,则woe为0,iv为0
binning<-function(var,K,fwd,usequa=T)
{
  if(usequa)
  {
    fwd=qua(var,K)
  }
  numallgood=length(which(data[,1]==0))
  numallbad=length(which(data[,1]==1))
  WOE=data.frame(matrix(0,nrow(data),1));var_index=list()
  num_bad=0;num_good=0;rate_bad=0;rate_good=0;woe=0;iv=0;IV=0
  index=data.frame(matrix(0,length(fwd)-1,9))
  colnames(index)=c("min","max","num_bad","rate_bad","num_good","rate_good","count","woe","iv")
  for(k in 1:(length(fwd)-1))
  {
    num_bad=length(which(data[,1][which(var>fwd[k]&var<=fwd[k+1])]==1))
    num_good=length(which(data[,1][which(var>fwd[k]&var<=fwd[k+1])]==0))
    rate_bad=num_bad/numallbad
    rate_good=num_good/numallgood
    if(num_bad!=0&num_good!=0)
    {
      woe=log(rate_bad/rate_good)
      iv=(rate_bad-rate_good)*woe
    }
    if(num_bad==0|num_good==0)
    {
      woe=0;iv=0
    }
    WOE[which(var>fwd[k]&var<=fwd[k+1]),1]=woe
    index$min[k]=fwd[k];index$max[k]=fwd[k+1]
    index$num_bad[k]=num_bad;index$rate_bad[k]=rate_bad
index$num_good[k]=num_good;index$rate_good[k]=rate_good
    index$count[k]=num_bad+num_good
    index$woe[k]=woe;index$iv[k]=iv
    IV=IV+iv
  }
  index$min[1]=index$min[1]+1
  var_index[[1]]=WOE
  var_index[[2]]=index
  var_index[[3]]=IV
  var_index[[4]]=fwd
  names(var_index)=c("WOE","index","IV","quantitl")
  return(var_index)
}

运行代码

age_index=binning(data$age,K=10,usequa=T)
age_index

得到结果:
变量指标

定义合箱函数closing:根据woe相邻最近的两箱进行合并
var 是某一变量,var_index是binning函数对var返回的结果

closing<-function(var,var_index)
{
  fwd=var_index$quantitl
  woe=var_index$index$woe
  diff_woe=abs(diff(woe))
  num_min=which.min(diff_woe)
  fwd=fwd[-(num_min+1)]
  var_index_byclosing=binning(var,fwd=fwd,usequa = F)
  return(var_index_byclosing)
}

运行代码

age_index_byclosing=closing(data$age,age_index)
age_index_byclosing

得到结果:由10箱合并为9箱
合箱结果

定义按单调性函数clobymono:根据变量各箱woe值的单调性合箱,非单调且箱数大于K_min时循环合箱
(注:有些变量U型,或倒U型,这里暂不考虑)

clobymono<-function(var,K,K_min)
{
  var_index=binning(var,K,usequa = T)
  woe=var_index$index$woe
  while(length(which(woe!=sort(woe)))>0&
        length(which(woe!=sort(woe,decreasing = T)))>0&
        length(var_index$quantitl)>=K_min+1)
  {
    var_index=closing(var,var_index)
    woe=var_index$index$woe
    if(length(var_index$quantitl)==(K_min+1))
    {break}
  }
  return(var_index)
}

运行代码

age_index=clobymono(data$age,10,5)
age_index

得到结果
age_index
这里结果与binning(data$age,10,ueage=T)相同,是因为woe单调,无需合箱。我们带入另一个变量再观察:
运行代码

MonthlyIncome_index=clobymono(data$MonthlyIncome,10,5)
MonthlyIncome_index

得到结果
MonthlyIncome_index
变量MonthlyIncome一开始等分为10箱,直到合为6箱woe才单调

定义连续变量指标函数convarindex

#all continuous variable\'s index 
convarindex<-function(data,K,K_min)
{
  convar_index=list()
  WOEmatrix=as.data.frame(data[,1],norw=nrow(data),ncol=1)
  colnames(WOEmatrix)[1]=colnames(data)[1]
  for(j in 2:ncol(data))
  {
    var_index=clobymono(data[,j],K,K_min)
    WOEmatrix=cbind(WOEmatrix,var_index$WOE)
    colnames(WOEmatrix)[j]=colnames(data)[j]
    convar_index[[j-1]]=var_index[-1]
    names(convar_index)[j-1]=colnames(data)[j]
  }
  convar_index[[length(convar_index)+1]]=WOEmatrix
  names(convar_index)[length(convar_index)]="WOEmatrix"
  return(convar_index)
}

这里取4个连续变量演示,data的第一列为目标值
运行代码:

convar_index=convarindex(data[,1:5],10,5)
convar_index

得到结果:所有连续变量指标和连续变量woe矩阵
convar_index

离散变量的数据准备

定义分类函数classifing:返回离散变量指标

classifing<-function(var)
{
  classify<-unique(var)
  numallgood=length(which(data[,1]==0))
  numallbad=length(which(data[,1]==1))
  WOE=data.frame(matrix(0,nrow(data),1));var_index=list()
  num_bad=0;num_good=0;rate_bad=0;rate_good=0;woe=0;iv=0;  IV=0
  index=data.frame(matrix(0,length(classify),8))
  colnames(index)=c("classify","num_bad","rate_bad","num_good","rate_good","count","woe","iv")
  for(k in 1:length(classify))
  {
    num_bad=length(which(data[,1][which(var==classify[k])]==1))
    num_good=length(which(data[,1][which(var==classify[k])]==0))
    rate_bad=num_bad/numallbad
    rate_good=num_good/numallgood
    if(num_bad!=0&num_good!=0)
    {
      woe=log(rate_bad/rate_good)
      iv=(rate_bad-rate_good)*woe
    }
    if(num_bad==0|num_good==0)
    {
      woe=0;iv=0
    }
    WOE[which(var==classify[k]),1]=woe
    index$classify[k]=classify[k]
    index$num_bad[k]=num_bad;index$rate_bad[k]=rate_bad
    index$num_good[k]=num_good;index$rate_good[k]=rate_good
    index$count[k]=num_bad+num_good
    index$woe[k]=woe;index$iv[k]=iv
    IV=IV+iv
  }
  var_index[[1]]=WOE
  var_index[[2]]=index
  var_index[[3]]=IV
  var_index[[4]]=classify
  names(var_index)=c("WOE","index","IV","classify")
  return(var_index)
}

运行代码:

numofdep_index=classifing(data$NumberOfDependents)
numofdep_index

得到结果:
numoddep_index
共13类,与连续变量相同,若某一类全为好或全为坏,则woe为0,iv为0(NumberOfDependents其实也是离散变量,这里只是作为演示)

定义所有离散变量指标函数clavarindex,与连续变量一致,返回所有变量指标和woe矩阵(合为list)

clavarindex<-function(data)
{
  cladata_index=list()
  WOEmatrix=data.frame(data[,1])
  for(j in 2:ncol(data))
  {
    classifyvar_index=classifing(data[,j])
    cladata_index[[j-1]]=classifyvar_index[-1]
    WOEmatrix=cbind(WOEmatrix,classifyvar_index[[1]])
    colnames(WOEmatrix)[j]=colnames(data)[j]
  }
  cladata_index[[length(cladata_index)+1]]=WOEmatrix
  names(cladata_index)=c(colnames(data)[-1],"WOEmatrix")
  return(cladata_index)
}

运行代码:

cladata_index=clavarindex(data[,c(1,6,7)])
cladata_index

这里,假设第6和第7列为离散变量,第一列为目标值,结果与连续变量一致,这里不再给出图形结果。

以上出去数据处理以外,从qua函数开始的所有函数都是为了下面allvarindex函数调取所用:
定义所有变量指标函数allvarindex:返回所有变量指标和woe矩阵
(注:concol为连续变量的列,不包括第一列目标值列)

#combining continuous and classify indexes
allvarindex<-function(data,K,K_min,concol)
{
  conindex=convarindex(data[,c(1,concol)],K,K_min)
  claindex=clavarindex(data[,-concol])
  all_index=append(conindex[-length(conindex)],claindex[-length(claindex)])
  WOEmatrix=cbind(conindex$WOEmatrix,claindex$WOEmatrix[,-1])
  colnames(WOEmatrix)=c(colnames(conindex$WOEmatrix),colnames(claindex$WOEmatrix)[-1])
  all_index[[length(all_index)+1]]=WOEmatrix
  names(all_index)[length(all_index)]="WOEmatrix"
  return(all_index)
}

运行代码:

all_index=allvarindex(data,10,5,c(2:7))
all_index

给出all_index的内容:
all_index
这里写图片描述

得出所有变量的指标和WOEmatrix(woe矩阵)
可以看出变量NumberRealEstateLoansOrLines的值为0、1、2的较多

变量选择

下面对woe矩阵建立逐步logistic回归模型:
运行代码:

library(caret)
set.seed(1234)
woeMatrix<-all_index$WOEmatrix
splitIndex<-createDataPartition(woeMatrix[,1],time=1,p=0.5,list=FALSE) 
train<-woeMatrix[splitIndex,];traindata=data[splitIndex,] 
test<-woeMatrix[-splitIndex,];testdata=data[splitIndex,]
glm_fit=glm(woeMatrix[,1]~.,family = binomial,woeMatrix[,-1])
step_glm_fit=step(glm_fit)
pre=predict(step_glm_fit,test,type="response")

注:train、test分别为woe矩阵的训练集和测试集;traindata、testdata分别为数据data的训练集和测试集。

运行代码

summary(step_glm_fit)

结果如下:
logistic回归
逐步回归后不通过的变量不再出现在模型中,在转化为评分卡时,只调用留下来的变量的woe。

模型评估

ROC曲线

计算ROC画出图形:

library(pROC)
modelroc <- roc(test[,1],pre)
plot(modelroc, print.auc=T, auc.polygon=T, grid=c(0.2, 0.1),
     grid.col=c("red", "yellow"), max.auc.polygon=T,
     auc.polygon.col="seagreen2", print.thres=T)

结果如下:
这里写图片描述

KS曲线

计算KS
定义KS函数ksindex:这里pre是对test的预测,所以y应该是test[,1]

ksindex<-function(y,pre,K=10)
{
  per_bad=c();per_good=c()
  y2=y[order(rank(pre),decreasing = T)]
  numallgood=length(which(y2==0))
  numallbad=length(which(y2==1))
  for(i in 1:K)
  {
    fwd=floor(quantile(0:length(y2),0:K/K))
    per_bad[i]=length(which(y2[(fwd[i]+1):fwd[i+1]]==1))*100/numallbad
    per_good[i]=length(which(y2[(fwd[i]+1):fwd[i+1]]==0))*100/numallgood
  }
  acc_per_bad=c();acc_per_good=c()
  for(i in 1:K)
  {
    acc_per_bad[i]=sum(per_bad[1:i])
    acc_per_good[i]=sum(per_good[1:i])
  }
  per_diff=acc_per_bad-acc_per_good
  ksindex=data.frame(acc_per_bad,acc_per_good,per_diff)
  colnames(ksindex)=c("acc_per_bad","acc_per_good","per_diff")
  return(ksindex)
}

运行代码:

ks_index=ksindex(test[,1],pre)
ks_index

有如下结果:
ks_index

定义ks作图函数ksplot:

ksplot<-function(ksindex,goodcol="blue",badcol="darkgreen",kscol="red")
{
  plot(1:nrow(ksindex),ksindex$acc_per_good,type="l",col=goodcol,lwd=2,ylab ="acc_per  %",
       xlim=c(1,10),ylim=c(0,100),xlab="Decili",main="KS curve")
  points(1:nrow(ksindex),ksindex$acc_per_good,cex=0.3)
  text(floor(nrow(ksindex)*0.2)+1,ksindex$acc_per_good[floor(nrow(ksindex)*0.2)]-10,"goodrate",col = goodcol)
  lines(1:nrow(ksindex),ksindex$acc_per_bad,"l",col=badcol,lwd=2)
  points(1:nrow(ksindex),ksindex$acc_per_bad,cex=0.3)
  text(floor(nrow(ksindex)*0.8)-1,ksindex$acc_per_bad[floor(nrow(ksindex)*0.8)]-10,"badrate",col=badcol)
  lines(1:nrow(ksindex),ksindex$per_diff,col=kscol,lwd=2)
  points(1:nrow(ksindex),ksindex$per_diff,cex=0.3)
  points(which.max(ksindex$per_diff),ksindex$per_diff[which.max(ksindex$per_diff)],cex=1,pch=16)
  text(which.max(ksindex$per_diff),ksindex$per_diff[which.max(ksindex$per_diff)]+5,
       paste("KS=",round(ksindex$per_diff[which.max(ksindex$per_diff)],2),"%",sep = ""),col=kscol)
}

运行代码:

ksplot(ks_index)

输出图形
ks曲线

评分卡刻度

定义计算变量woe函数(个体某一变量的值对应的woe值,若离散变量的类不在训练集中,则输出woe为0)get_varwoe:

get_varwoe<-function(colvar,var_value)
{
  if(names(colvar)[3]=="quantitl")
  {
    k=min(which(var_value<=colvar$quantitl))
    if(k==1)
    {k=2}
    if(length(k)==0)
    {k=length(colvar$quantitl)}
    woe_var=colvar$index$woe[k-1]
  }
  if(names(colvar)[3]=="classify")
  {
    if(var_value%in%colvar$classify)
    {
      woe_var=colvar$index$woe[var_value==colvar$classify]
    }
  else
    {woe_var=0;print("new classify")}
  }
  return(woe_var)
}

例如:得出第一个测试样本的age值所对应的woe值;
运行代码:

get_varwoe(all_index$age,test$age[1])

得出结果:
varwoe

定义计算变量得分函数var_score:colvar_index为allvarindex函数返回值的某一元素变量的index元素

var_score<-function(colvar_index)
{
  df<-data.frame()
  if(colnames(colvar_index)[1]=="min")
  {
    df[1,1]=paste("[",as.character(round(colvar_index$min[1],4)),",",
                  as.character(round(colvar_index$max[1],4)),"]",sep = "")
    df[2:nrow(colvar_index),1]=paste("(",as.character(round(colvar_index$min[2:nrow(colvar_index)],4)),
                                     ",",
                                     as.character(round(colvar_index$max[2:nrow(colvar_index)],4)),"]",
                                     sep = "")
  }
  if(colnames(colvar_index)[1]=="classify")
  {
    df=colvar_index$classify
  }
  df=cbind(df,colvar_index$woe)
  rownames(df)=as.character(c(1:nrow(df)))
  return(df)
}

运行代码:

var_score(all_index$age$index)

输出如下:

定义评分卡函数score_card:
注:若theta取1,表明theta为默认值:样本中的坏好比
model_fit为模型拟合对象,allvar_index为函数allvarindex返回的结果,若list取真,则输出list格式,若list为假输出data.frame格式。

scorecard<-function(y,theta=1,PDO,p0,model_fit,allvar_index,list=T)
{
  if (theta==1)
  {
    theta=length(which(y==1))/length(which(y==0))
  }
  B=PDO/log(2);A=p0+B*log(theta)
  beta<-model_fit$coefficients
  namesofmodelcoef=names(beta)
  allvar_index=allvar_index[names(allvar_index)%in%namesofmodelcoef]
  score_card=list()
  varscore=data.frame()
  for(i in 1:length(allvar_index))
  {
    varscore=var_score(allvar_index[[i]]$index)
    varscore[,2]=floor(-varscore[,2]*B*beta[i+1]+(A-B*beta[1])/length(allvar_index))
    varscore=cbind(as.data.frame(rep(names(allvar_index)[i],nrow(varscore))),varscore)
    colnames(varscore)=c("varname","interval","score")
    score_card[[i]]=varscore
    names(score_card)[i]=names(allvar_index)[i]
  }
  if(!list)
  {
    sc=score_card[[1]]
    for(i in 2:length(score_card))
    {
      sc=rbind(sc,score_card[[i]])
    }
    score_card=sc
    rownames(score_card)=as.character(1:nrow(score_card))
  }
  return(score_card)
}

运行代码:

score_card=scorecard(data[,1],1,50,580,step_glm_fit,all_index,T)
score_card

输出结果:
score_card

运行代码:

score_card=scorecard(data[,1],1,50,580,step_glm_fit,all_index,F)
score_card

输出结果:
score_card

代码流程总结

数据预处理完成后,按以下代码即可得到评分卡:

#计算woe等相关指标
all_index=allvarindex(data,K=10,K_min=5,concol=c(2:7))

#建立logistic回归模型
library(caret)
set.seed(1234)
woeMatrix<-all_index$WOEmatrix
splitIndex<-createDataPartition(woeMatrix[,1],time=1,p=0.5,list=FALSE) 
train<-woeMatrix[splitIndex,];traindata=data[splitIndex,] 
test<-woeMatrix[-splitIndex,];testdata=data[splitIndex,]
glm_fit=glm(woeMatrix[,1]~.,family = binomial,woeMatrix[,-1])
step_glm_fit=step(glm_fit)
pre=predict(step_glm_fit,test,type="response")

#观察ROC曲线和KS曲线
library(pROC)
modelroc <- roc(test[,1],pre)
plot(modelroc, print.auc=T, auc.polygon=T, grid=c(0.2, 0.1),
     grid.col=c("red", "yellow"), max.auc.polygon=T,
     auc.polygon.col="seagreen2", print.thres=T)

ks_index=ksindex(test[,1],pre)
ksplot(ks_index)

#woe转化为评分卡
score_card=scorecard(data[,1],1,50,580,step_glm_fit,all_index,F)
score_card

当然,这只是个不精细的通用代码,在数据预处理、变量分析与筛选的过程中往往要精细化,比如观察每个变量的分布,对异常值的分箱考虑等。Kaggle的cs-training.csv的变量都是连续变量,但家属数量“类别”较少,这里就当作离散变量演示。