信用评分卡-逻辑回归建模

时间:2022-11-04 23:52:39

    数据来源于 Kaggle 上的 Give Me Some Credit 竞赛,目的是预测借款人在未来两年内经历财务危机的可能性,帮助借款人做出最好的选择。本文结合信用评分卡的构建原理,利用 Python 语言完成数据清洗和处理工作,并用逻辑回归模型建立信用评分卡的基础模型。

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
plt.style.use("fivethirtyeight")
%matplotlib inline

# Relative paths to the test and training CSV files.
input1='../data/cs-test.csv'
input2='../data/cs-training.csv'
# Load both files into DataFrames; note input1 is the test set and input2 the training set.
test=pd.read_csv(input1)
train=pd.read_csv(input2)
train.sample(10)         # draw 10 random training records for a quick visual check

test.sample(10)

信用评分卡-逻辑回归建模

train.info()# basic information (columns, dtypes, non-null counts) for the training data

test.info()

train.describe()

test.describe()


plt.figure(figsize=(10,8))  # visualise the distribution of the target label

# Pass the column through the ``x`` keyword. Newer seaborn releases accept only
# ``data`` positionally, so a bare positional column name collides with the
# ``data=`` keyword and raises a TypeError.
sns.countplot(x="SeriousDlqin2yrs", data=train)

信用评分卡-逻辑回归建模

train[train['age'] < 18]  # inspect records of borrowers younger than 18

# Outlier handling: an age of 0 is invalid, so replace it with the median age.
train.loc[train['age'] == 0, 'age'] = train['age'].median()

# Split the data by age into working-age (18-59) and senior (60+) borrowers.
age_working = train.loc[(train['age'] >= 18) & (train['age'] < 60)]

age_senior = train.loc[train['age'] >= 60]


# Fill missing incomes with the mean income of the matching age group.
# The means are taken before any filling, so only observed incomes contribute.
age_working_income = age_working['MonthlyIncome'].mean()

age_senior_income = age_senior['MonthlyIncome'].mean()

# Impute through an explicit missing-value mask instead of a 99999 sentinel:
# a sentinel would also overwrite borrowers whose real income happens to be
# 99999, and would leave the fake value behind on any row matching neither
# age mask. Every non-senior row with a missing income (including any
# remaining under-18 record) receives the working-age mean.
income_missing = train["MonthlyIncome"].isna()
senior_rows = train['age'] >= 60
train.loc[income_missing & ~senior_rows, "MonthlyIncome"] = age_working_income
train.loc[income_missing & senior_rows, "MonthlyIncome"] = age_senior_income

# The integer cast drops the fractional part of the imputed means.
train["MonthlyIncome"] = train["MonthlyIncome"].astype('int64')


# Missing dependents are treated as "no dependents".
train["NumberOfDependents"] = train["NumberOfDependents"].fillna(0)
train["NumberOfDependents"] = train["NumberOfDependents"].astype('int64')

train["NumberOfDependents"].value_counts()  # frequency of each dependents count


# Correlation analysis of the cleaned raw attributes, drawn as an annotated heatmap.
plt.figure(figsize=(14, 12))
corr = train.corr()

sns.heatmap(corr, fmt=".2g", annot=True)

信用评分卡-逻辑回归建模

从图中可以看出,这些原始属性两两之间的相关性很低,所以需要进行属性组合和重构。

# Combined delinquency flag: add up the three past-due counters and cap the
# result at 1, so any past-due event yields 1.
train["CombinedDefaulted"] = (
    train['NumberOfTime30-59DaysPastDueNotWorse']
    .add(train['NumberOfTime60-89DaysPastDueNotWorse'])
    .add(train['NumberOfTimes90DaysLate'])
    .clip(upper=1)
)

# Combined credit flag: open credit lines/loans plus real-estate loans,
# binarised to 0 (at most 5) or 1 (more than 5).
train["CombinedCreditLoans"] = train["NumberOfOpenCreditLinesAndLoans"].add(
    train["NumberRealEstateLoansOrLines"]
)
train["CombinedCreditLoans"] = train["CombinedCreditLoans"].mask(
    train["CombinedCreditLoans"] <= 5, 0
)
train["CombinedCreditLoans"] = train["CombinedCreditLoans"].mask(
    train["CombinedCreditLoans"] > 5, 1
)

train["CombinedCreditLoans"].value_counts()


# Dependents flag: the dependents count capped at 1.
train["WithDependents"] = train["NumberOfDependents"].clip(upper=1)

train["WithDependents"].value_counts()


# Monthly debt payments: absolute value of debt ratio times monthly income,
# stored as integers.
train["MonthlyDebtPayments"] = (
    (train["DebtRatio"] * train["MonthlyIncome"]).abs().astype('int64')
)


# The two expressions below are display-only: their results are not assigned,
# so the DataFrame itself is left unchanged.
train['age'].astype('int64')

train["MonthlyDebtPayments"].astype('int64')


# Categorical age feature: "senior" for age 60 and above, "working" otherwise.
# The earlier version coded ages 18-59 as 1 and 60+ as 0, then renamed 0 to
# "working" and 1 to "senior", which attached each label to the opposite group.
# Building the labels directly from the age threshold removes that inversion.
train["age_map"] = np.where(train["age"] >= 60, "senior", "working")

# Turn the categorical feature into dummy columns (is_senior / is_working)
# and append them to the frame.
train = pd.concat([train, pd.get_dummies(train["age_map"], prefix='is')], axis=1)

# Use the correlation matrix to decide which attributes to keep.
# The frame now contains the string column "age_map". Correlating it directly
# raises on pandas releases that no longer drop non-numeric columns silently,
# so restrict the input to numeric and boolean columns first.
corr = train.select_dtypes(include=["number", "bool"]).corr()
plt.figure(figsize=(14,12))

sns.heatmap(corr, annot=True, fmt=".2g")

信用评分卡-逻辑回归建模

据此可排除一些属性:

# Drop the index artefact column, the raw attributes that were folded into the
# combined features, and the helper features that are not kept for modelling.
# The frame is named ``train``; ``train_data`` is not defined in this notebook.
train.drop(["Unnamed: 0","NumberOfOpenCreditLinesAndLoans",\
            "NumberOfTimes90DaysLate","NumberRealEstateLoansOrLines","NumberOfTime60-89DaysPastDueNotWorse",\
            "WithDependents","age_map","is_senior","is_working", "MonthlyDebtPayments"], axis=1, inplace=True)



接下来对测试数据 test 进行处理,步骤与 train 相同,因此直接封装成函数。

# Test-data processing; the steps mirror those applied to the training data.
def cleaned_dataset(dataset):
    """Clean and feature-engineer a credit DataFrame in place.

    Steps:
      1. Replace ages below 18 with the median age.
      2. Take absolute values of ``MonthlyIncome`` and fill missing incomes
         with the mean income of the row's age group (18-59 or 60+), then
         cast the column to int64.
      3. Take absolute values of ``NumberOfDependents``, fill missing values
         with 0 and cast to int64.
      4. Add the binary ``CombinedDefaulted`` and ``CombinedCreditLoans``
         columns.
      5. Drop ``Unnamed: 0`` and the raw columns folded into the new features.

    Args:
        dataset: DataFrame holding the columns referenced above. It is
            modified in place.

    Returns:
        None.
    """
    # Only ages strictly below 18 are invalid: 18-year-olds belong to the
    # working-age group used below and must not be overwritten.
    dataset.loc[dataset["age"] < 18, "age"] = dataset["age"].median()

    working_rows = (dataset["age"] >= 18) & (dataset["age"] < 60)
    senior_rows = dataset["age"] >= 60

    # Group means come from the observed incomes only (mean() skips NaN).
    age_working_impute = dataset.loc[working_rows, "MonthlyIncome"].mean()
    age_senior_impute = dataset.loc[senior_rows, "MonthlyIncome"].mean()

    dataset["MonthlyIncome"] = np.absolute(dataset["MonthlyIncome"])

    # Impute through an explicit NaN mask rather than a 99999 sentinel, so a
    # genuine income of 99999 is never overwritten. Both masks are built from
    # ``dataset`` itself: using another frame's ages would select rows by
    # index alignment against unrelated records.
    income_missing = dataset["MonthlyIncome"].isna()
    dataset.loc[working_rows & income_missing, "MonthlyIncome"] = age_working_impute
    dataset.loc[senior_rows & income_missing, "MonthlyIncome"] = age_senior_impute

    # Cast after imputation so the float means are not written into an
    # integer column; the cast drops the fractional part of the means.
    dataset["MonthlyIncome"] = dataset["MonthlyIncome"].astype('int64')

    dataset["NumberOfDependents"] = np.absolute(dataset["NumberOfDependents"])
    dataset["NumberOfDependents"] = dataset["NumberOfDependents"].fillna(0)
    dataset["NumberOfDependents"] = dataset["NumberOfDependents"].astype('int64')

    # 1 when the borrower has any past-due event in any of the three buckets.
    dataset["CombinedDefaulted"] = (
        dataset["NumberOfTimes90DaysLate"]
        + dataset["NumberOfTime60-89DaysPastDueNotWorse"]
        + dataset["NumberOfTime30-59DaysPastDueNotWorse"]
    )
    dataset.loc[(dataset["CombinedDefaulted"] >= 1), "CombinedDefaulted"] = 1

    # 1 when the borrower holds more than 5 credit lines/loans in total.
    dataset["CombinedCreditLoans"] = (
        dataset["NumberOfOpenCreditLinesAndLoans"]
        + dataset["NumberRealEstateLoansOrLines"]
    )
    dataset.loc[(dataset["CombinedCreditLoans"] <= 5), "CombinedCreditLoans"] = 0
    dataset.loc[(dataset["CombinedCreditLoans"] > 5), "CombinedCreditLoans"] = 1

    dataset.drop(
        [
            "Unnamed: 0",
            "NumberOfOpenCreditLinesAndLoans",
            "NumberOfTimes90DaysLate",
            "NumberRealEstateLoansOrLines",
            "NumberOfTime60-89DaysPastDueNotWorse",
        ],
        axis=1,
        inplace=True,
    )

# Run the cleaning routine on the test set; it modifies ``test`` in place.
cleaned_dataset(test)

# Separate the label column from the feature columns.
# X / X_test are independent copies of the features; y / y_test are the labels.
X = train.drop(columns="SeriousDlqin2yrs").copy()
y = train["SeriousDlqin2yrs"]

# NOTE(review): if the test CSV ships without labels, y_test is all NaN - confirm before use.
X_test = test.drop(columns="SeriousDlqin2yrs").copy()
y_test = test["SeriousDlqin2yrs"]

#下面开始构建LogisticRegression模型

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

# Split the training data into a fitting part and a validation part; the seed
# is fixed and the split size is left at the library default.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

# L1-penalised logistic regression (saga solver) with balanced class weights.
logit = LogisticRegression(
    penalty="l1",
    solver="saga",
    C=1.0,
    class_weight="balanced",
    max_iter=500,
    random_state=42,
)

# Fit the scaler on the fitting split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

logit.fit(X_train_scaled, y_train)

# Class probabilities on the fitting split; column 1 is used as the score.
logit_scores_proba = logit.predict_proba(X_train_scaled)
logit_scores = logit_scores_proba[:, 1]

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 12x10 figure.

    Plots ``tpr`` against ``fpr`` together with a dashed black diagonal from
    (0, 0) to (1, 1), and fixes both axes to the range [0, 1].

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        label: Optional label attached to the curve.
    """
    _, ax = plt.subplots(figsize=(12, 10))
    ax.plot(fpr, tpr, linewidth=2, label=label)
    ax.plot([0, 1], [0, 1], "k--")
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive rate")

fpr_logit, tpr_logit, thresh_logit = roc_curve(y_train, logit_scores)# ROC points computed from the training labels and the training-split scores

plot_roc_curve(fpr_logit,tpr_logit)

信用评分卡-逻辑回归建模

# Evaluate the model with AUC. The first figure is measured on the same rows
# the model was fitted on, so it tends to be optimistic. The held-out
# validation split is scored as well to give an estimate on unseen data.
print("AUC Score {}".format(roc_auc_score(y_train,logit_scores)))
logit_val_scores = logit.predict_proba(X_val_scaled)[:, 1]
print("Validation AUC Score {}".format(roc_auc_score(y_val, logit_val_scores)))