使用Python开发信用卡申请评分模型

绪论

信用评分卡是金融行业常见的风险控制方式。它使用信用卡申请者提交的个人信息与数据,预测他们今后违约、拖欠信用卡借款的概率。银行能够根据它决定是否向申请人发放信用卡。信用评分能够客观地量化风险的大小。
 
一般而言,信用评分卡以历史数据为基础。一旦遭遇较大的经济波动。以往的模型可能失去原有的预测效力。Logistic模型是信用评分的常用方法。因为Logistic适用于二分类任务,并且能够计算出每个特征的系数。为了便于理解和操作,评分卡会将Logistic回归系数乘以一定的数值(如100)并取整。
 
而目前,随着机器学习算法的发展。Boosting、随机森林、支持向量机等预测力更强的方法被引入信用卡评分中。然而,这些方法往往不具有很好的透明度。可能难以向客户和监管者提供一个拒绝或接受的理由。

In [1]:
from pylab import *
warnings.filterwarnings("ignore") 
In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
In [3]:
import pandas as pd    
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv("E:\\R_codes\\others\\application_record.csv", encoding = 'utf-8') 
record = pd.read_csv("E:\\R_codes\\others\\credit_record.csv", encoding = 'utf-8')  
In [4]:
sns.set_style('white') 

特征工程

因变量

In [5]:
# 提取所有用户的开户月份
begin_month=pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))
begin_month=begin_month.rename(columns={'MONTHS_BALANCE':'begin_month'}) #变量重命名
new_data=pd.merge(data,begin_month,how="left",on="ID") #整合到记录表中

2007年的巴塞尔协议规定违约是指贷款出现90天逾期或贷款机构有理由认为贷款不会被偿还(消费信用模型 定价 利润与组合)。一般而言风险用户的比例应该控制在3%以内,因此选择逾期60天以上的用户作为目标风险用户。

In [6]:
record['dep_value'] = None
record['dep_value'][record['STATUS'] =='2']='Yes' 
record['dep_value'][record['STATUS'] =='3']='Yes' 
record['dep_value'][record['STATUS'] =='4']='Yes' 
record['dep_value'][record['STATUS'] =='5']='Yes' 
In [7]:
cpunt=record.groupby('ID').count()
cpunt['dep_value'][cpunt['dep_value'] > 0]='Yes' 
cpunt['dep_value'][cpunt['dep_value'] == 0]='No' 
cpunt = cpunt[['dep_value']]
new_data=pd.merge(new_data,cpunt,how='inner',on='ID')
new_data['target']=new_data['dep_value']
new_data.loc[new_data['target']=='Yes','target']=1
new_data.loc[new_data['target']=='No','target']=0
In [8]:
print(cpunt['dep_value'].value_counts())
cpunt['dep_value'].value_counts(normalize=True)
No     45318
Yes      667
Name: dep_value, dtype: int64
Out[8]:
No     0.985495
Yes    0.014505
Name: dep_value, dtype: float64

自变量

  • 重命名
In [9]:
new_data.rename(columns={'CODE_GENDER':'Gender','FLAG_OWN_CAR':'Car','FLAG_OWN_REALTY':'Reality',
                         'CNT_CHILDREN':'ChldNo','AMT_INCOME_TOTAL':'inc',
                         'NAME_EDUCATION_TYPE':'edutp','NAME_FAMILY_STATUS':'famtp',
                        'NAME_HOUSING_TYPE':'houtp','FLAG_EMAIL':'email',
                         'NAME_INCOME_TYPE':'inctp','FLAG_WORK_PHONE':'wkphone',
                         'FLAG_PHONE':'phone','CNT_FAM_MEMBERS':'famsize',
                        'OCCUPATION_TYPE':'occyp'
                        },inplace=True)
In [10]:
new_data.dropna()
new_data = new_data.mask(new_data == 'NULL').dropna()
In [11]:
ivtable=pd.DataFrame(new_data.columns,columns=['变量名'])
ivtable['IV值']=None
namelist = ['FLAG_MOBIL','begin_month','dep_value','target','ID']

for i in namelist:
    ivtable.drop(ivtable[ivtable['变量名'] == i].index, inplace=True)

自编函数

In [12]:
# Calculate information value
def calc_iv(df, feature, target, pr=False):
    lst = []

    df[feature] = df[feature].fillna("NULL")

    for i in range(df[feature].nunique()):
        val = list(df[feature].unique())[i]
        lst.append([feature,                                                        # Variable
                    val,                                                            # Value
                    df[df[feature] == val].count()[feature],                        # All
                    df[(df[feature] == val) & (df[target] == 0)].count()[feature],  # Good (think: Fraud == 0)
                    df[(df[feature] == val) & (df[target] == 1)].count()[feature]]) # Bad (think: Fraud == 1)

    data = pd.DataFrame(lst, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])
    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])
    
    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data.index = range(len(data.index))

    if pr:
        print(data)
        print('IV = ', data['IV'].sum())

    iv = data['IV'].sum()
    print('变量的iv值为:',iv)
    return iv, data
In [13]:
def convert_dummy(df, feature,rank=0):
    pos = pd.get_dummies(df[feature], prefix=feature)
    mode = df[feature].value_counts().index[rank]
    biggest = feature + '_' + str(mode)
    pos.drop([biggest],axis=1,inplace=True)
    df.drop([feature],axis=1,inplace=True)
    df=df.join(pos)
    return df
In [14]:
def get_category(df, col, binsnum, labels, qcut = False):
    if qcut:
        localdf = pd.qcut(df[col], q = binsnum, labels = labels)
        localdf = pd.DataFrame(localdf)
        name = 'gp' + '_' + col
        localdf[name] = localdf[col]
        df = df.join(localdf[name])
        df[name] = df[name].astype(object)
        return df
    else:
        localdf = pd.cut(df[col], bins = binsnum, labels = labels)
        localdf = pd.DataFrame(localdf)
        name = 'gp' + '_' + col
        localdf[name] = localdf[col]
        df = df.join(localdf[name])
        df[name] = df[name].astype(object)
    return df
In [15]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

二分类变量

性别

In [16]:
new_data['Gender']=new_data['Gender'].replace(['F','M'],[0,1])
print(new_data['Gender'].value_counts())
iv, data=calc_iv(new_data,'Gender','target')
ivtable.loc[ivtable['变量名']=='Gender','IV值']=iv
data.head()
0    15630
1     9504
Name: Gender, dtype: int64
变量的iv值为: 0.02520350452745081
Out[16]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 Gender 0 15630 15400 230 0.621867 0.014715 0.623179 0.545024 0.134005 0.010473
1 Gender 1 9504 9312 192 0.378133 0.020202 0.376821 0.454976 -0.188475 0.014730

是否有车

In [17]:
new_data['Car']=new_data['Car'].replace(['N','Y'],[0,1])
print(new_data['Car'].value_counts())
iv, data=calc_iv(new_data,'Car','target')
ivtable.loc[ivtable['变量名']=='Car','IV值']=iv
data.head()
0    14618
1    10516
Name: Car, dtype: int64
变量的iv值为: 4.54248124999671e-06
Out[17]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 Car 0 14618 14373 245 0.581603 0.016760 0.58162 0.580569 0.00181 0.000002
1 Car 1 10516 10339 177 0.418397 0.016831 0.41838 0.419431 -0.00251 0.000003

是否有房产

In [18]:
new_data['Reality']=new_data['Reality'].replace(['N','Y'],[0,1])
print(new_data['Reality'].value_counts())
iv, data=calc_iv(new_data,'Reality','target')
ivtable.loc[ivtable['变量名']=='Reality','IV值']=iv
data.head()
1    16461
0     8673
Name: Reality, dtype: int64
变量的iv值为: 0.02744070350168343
Out[18]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 Reality 0 8673 8494 179 0.34507 0.020639 0.34372 0.424171 -0.210309 0.016920
1 Reality 1 16461 16218 243 0.65493 0.014762 0.65628 0.575829 0.130777 0.010521

是否有自己的电话

In [19]:
new_data['phone']=new_data['phone'].astype(str)
print(new_data['phone'].value_counts(normalize=True,sort=False))
new_data.drop(new_data[new_data['phone'] == 'nan' ].index, inplace=True)
iv, data=calc_iv(new_data,'phone','target')
ivtable.loc[ivtable['变量名']=='phone','IV值']=iv
data.head()
1    0.292791
0    0.707209
Name: phone, dtype: float64
变量的iv值为: 0.0005480495762639297
Out[19]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 phone 0 17775 17481 294 0.707209 0.016540 0.707389 0.696682 0.015251 0.000163
1 phone 1 7359 7231 128 0.292791 0.017394 0.292611 0.303318 -0.035937 0.000385

是否有email

In [20]:
print(new_data['email'].value_counts(normalize=True,sort=False))
new_data['email']=new_data['email'].astype(str)
iv, data=calc_iv(new_data,'email','target')
ivtable.loc[ivtable['变量名']=='email','IV值']=iv
data.head()
0    0.89934
1    0.10066
Name: email, dtype: float64
变量的iv值为: 1.7343581493999816e-05
Out[20]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 email 0 22604 22225 379 0.89934 0.016767 0.899361 0.898104 0.001398 0.000002
1 email 1 2530 2487 43 0.10066 0.016996 0.100639 0.101896 -0.012407 0.000016

是否有工作电话

In [21]:
new_data['wkphone']=new_data['wkphone'].astype(str)
iv, data=calc_iv(new_data,'wkphone','target')
new_data.drop(new_data[new_data['wkphone'] == 'nan' ].index, inplace=True)
ivtable.loc[ivtable['变量名']=='wkphone','IV值']=iv
data.head()
变量的iv值为: 0.002042429795148461
Out[21]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 wkphone 0 18252 17954 298 0.726188 0.016327 0.72653 0.706161 0.028436 0.000579
1 wkphone 1 6882 6758 124 0.273812 0.018018 0.27347 0.293839 -0.071838 0.001463

连续变量分箱

孩子数

In [22]:
new_data.loc[new_data['ChldNo'] >= 2,'ChldNo']='2More'
print(new_data['ChldNo'].value_counts(sort=False))
0        15908
1         6118
2More     3108
Name: ChldNo, dtype: int64
In [23]:
iv, data=calc_iv(new_data,'ChldNo','target')
ivtable.loc[ivtable['变量名']=='ChldNo','IV值']=iv
data.head()
变量的iv值为: 0.0011214542503301935
Out[23]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 ChldNo 0 15908 15635 273 0.632928 0.017161 0.632689 0.646919 -0.022243 0.000317
1 ChldNo 1 6118 6021 97 0.243415 0.015855 0.243647 0.229858 0.058259 0.000803
2 ChldNo 2More 3108 3056 52 0.123657 0.016731 0.123665 0.123223 0.003580 0.000002
In [24]:
new_data = convert_dummy(new_data,'ChldNo')

年收入

对年收入变量进行等频分箱

In [25]:
new_data['inc']=new_data['inc'].astype(object)
new_data['inc'] = new_data['inc']/10000 #以万为单位
print(new_data['inc'].value_counts(bins=10,sort=False))
new_data['inc'].plot(kind='hist',bins=50,density=True)
(2.544, 18.18]      14663
(18.18, 33.66]       8464
(33.66, 49.14]       1637
(49.14, 64.62]        175
(64.62, 80.1]         124
(80.1, 95.58]          50
(95.58, 111.06]         4
(111.06, 126.54]        3
(126.54, 142.02]        6
(142.02, 157.5]         8
Name: inc, dtype: int64
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x21500273748>
In [26]:
new_data = get_category(new_data,'inc', 3, ["low","medium", "high"], qcut = True)
iv, data = calc_iv(new_data,'gp_inc','target')
ivtable.loc[ivtable['变量名']=='inc','IV值']=iv
data.head()
变量的iv值为: 0.0024219962221596752
Out[26]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 gp_inc high 8244 8096 148 0.328002 0.017952 0.327614 0.350711 -0.068126 0.001573
1 gp_inc low 8996 8849 147 0.357922 0.016341 0.358085 0.348341 0.027588 0.000269
2 gp_inc medium 7894 7767 127 0.314077 0.016088 0.314301 0.300948 0.043413 0.000580
In [27]:
new_data = convert_dummy(new_data,'gp_inc')

年龄

对年龄进行等频分组

In [28]:
new_data['Age']=-(new_data['DAYS_BIRTH'])//365	
print(new_data['Age'].value_counts(bins=10,normalize=True,sort=False))
new_data['Age'].plot(kind='hist',bins=20,density=True)
(19.951999999999998, 24.7]    0.025066
(24.7, 29.4]                  0.134280
(29.4, 34.1]                  0.169770
(34.1, 38.8]                  0.140805
(38.8, 43.5]                  0.173072
(43.5, 48.2]                  0.141880
(48.2, 52.9]                  0.099069
(52.9, 57.6]                  0.076550
(57.6, 62.3]                  0.032585
(62.3, 67.0]                  0.006923
Name: Age, dtype: float64
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x2150078e278>
In [29]:
new_data = get_category(new_data,'Age',5, ["lowest","low","medium","high","highest"])
iv, data=calc_iv(new_data,'gp_Age','target')
ivtable.loc[ivtable['变量名']=='DAYS_BIRTH','IV值'] = iv
data.head()
变量的iv值为: 0.06593513858884348
Out[29]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 gp_Age high 4414 4323 91 0.175619 0.020616 0.174935 0.215640 -0.209194 0.008515
1 gp_Age highest 993 989 4 0.039508 0.004028 0.040021 0.009479 1.440361 0.043992
2 gp_Age low 7806 7686 120 0.310575 0.015373 0.311023 0.284360 0.089625 0.002390
3 gp_Age lowest 4005 3921 84 0.159346 0.020974 0.158668 0.199052 -0.226754 0.009157
4 gp_Age medium 7916 7793 123 0.314952 0.015538 0.315353 0.291469 0.078758 0.001881
In [30]:
new_data = convert_dummy(new_data,'gp_Age')

工龄

  • 等距分箱
In [31]:
new_data['worktm']=-(new_data['DAYS_EMPLOYED'])//365	
new_data[new_data['worktm']<0] = np.nan #填补为缺失值
new_data['DAYS_EMPLOYED']
new_data['worktm'].fillna(new_data['worktm'].mean(),inplace=True) #均值填补
new_data['worktm'].plot(kind='hist',bins=20,density=True)
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x215005f33c8>
In [32]:
new_data = get_category(new_data,'worktm',5, ["lowest","low","medium","high","highest"])
iv, data=calc_iv(new_data,'gp_worktm','target')
ivtable.loc[ivtable['变量名']=='DAYS_EMPLOYED','IV值']=iv
data.head()
变量的iv值为: 0.04022152230816303
Out[32]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 gp_worktm high 425 423 2 0.016909 0.004706 0.017117 0.004739 1.284186 0.015895
1 gp_worktm highest 90 90 0 0.003581 0.000000 0.003642 0.000000 0.000000 0.000000
2 gp_worktm low 4987 4921 66 0.198416 0.013234 0.199134 0.156398 0.241573 0.010324
3 gp_worktm lowest 18254 17916 338 0.726267 0.018516 0.724992 0.800948 -0.099635 0.007568
4 gp_worktm medium 1378 1362 16 0.054826 0.011611 0.055115 0.037915 0.374082 0.006434
In [33]:
new_data = convert_dummy(new_data,'gp_worktm')

家庭规模

In [34]:
new_data['famsize'].value_counts(sort=False)
Out[34]:
2.0     12697
1.0      4263
3.0      5216
4.0      2576
6.0        51
9.0         2
5.0       307
7.0        18
20.0        1
15.0        3
Name: famsize, dtype: int64
In [35]:
new_data['famsize']=new_data['famsize'].astype(int)
new_data['famsizegp']=new_data['famsize']
new_data['famsizegp']=new_data['famsizegp'].astype(object)
new_data.loc[new_data['famsizegp']>=3,'famsizegp']='3more'
iv, data=calc_iv(new_data,'famsizegp','target')
ivtable.loc[ivtable['变量名']=='famsize','IV值']=iv
data.head()
变量的iv值为: 0.006156138510778323
Out[35]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 famsizegp 1 4263 4179 84 0.169611 0.019704 0.169108 0.199052 -0.163028 0.004882
1 famsizegp 2 12697 12489 208 0.505172 0.016382 0.505382 0.492891 0.025027 0.000313
2 famsizegp 3more 8174 8044 130 0.325217 0.015904 0.325510 0.308057 0.055108 0.000962
In [36]:
new_data = convert_dummy(new_data,'famsizegp')

多分类变量

收入类别

In [37]:
print(new_data['inctp'].value_counts(sort=False))
print(new_data['inctp'].value_counts(normalize=True,sort=False))
new_data.loc[new_data['inctp']=='Pensioner','inctp']='State servant'
new_data.loc[new_data['inctp']=='Student','inctp']='State servant'
iv, data=calc_iv(new_data,'inctp','target')
ivtable.loc[ivtable['变量名']=='inctp','IV值']=iv
data.head()
Pensioner                  13
Working                 15622
State servant            2437
Commercial associate     7052
Student                    10
Name: inctp, dtype: int64
Pensioner               0.000517
Working                 0.621549
State servant           0.096960
Commercial associate    0.280576
Student                 0.000398
Name: inctp, dtype: float64
变量的iv值为: 5.159303327851404e-05
Out[37]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 inctp Commercial associate 7052 6933 119 0.280576 0.016875 0.280552 0.281991 -0.005115 0.000007
1 inctp State servant 2460 2418 42 0.097875 0.017073 0.097847 0.099526 -0.017013 0.000029
2 inctp Working 15622 15361 261 0.621549 0.016707 0.621601 0.618483 0.005028 0.000016
In [38]:
new_data = convert_dummy(new_data,'inctp')

职业类型

In [39]:
new_data.loc[(new_data['occyp']=='Cleaning staff') | (new_data['occyp']=='Cooking staff') | (new_data['occyp']=='Drivers') | (new_data['occyp']=='Laborers') | (new_data['occyp']=='Low-skill Laborers') | (new_data['occyp']=='Security staff') | (new_data['occyp']=='Waiters/barmen staff'),'occyp']='Laborwk'
new_data.loc[(new_data['occyp']=='Accountants') | (new_data['occyp']=='Core staff') | (new_data['occyp']=='HR staff') | (new_data['occyp']=='Medicine staff') | (new_data['occyp']=='Private service staff') | (new_data['occyp']=='Realty agents') | (new_data['occyp']=='Sales staff') | (new_data['occyp']=='Secretaries'),'occyp']='officewk'
new_data.loc[(new_data['occyp']=='Managers') | (new_data['occyp']=='High skill tech staff') | (new_data['occyp']=='IT staff'),'occyp']='hightecwk'
print(new_data['occyp'].value_counts())
iv, data=calc_iv(new_data,'occyp','target')
ivtable.loc[ivtable['变量名']=='occyp','IV值']=iv
data.head()         
Laborwk      10496
officewk     10183
hightecwk     4455
Name: occyp, dtype: int64
变量的iv值为: 0.004820472062853304
Out[39]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 occyp Laborwk 10496 10311 185 0.417602 0.017626 0.417247 0.438389 -0.049428 0.001045
1 occyp hightecwk 4455 4375 80 0.177250 0.017957 0.177039 0.189573 -0.068404 0.000857
2 occyp officewk 10183 10026 157 0.405148 0.015418 0.405714 0.372038 0.086652 0.002918
In [40]:
new_data = convert_dummy(new_data,'occyp')

住户类型

In [41]:
iv, data=calc_iv(new_data,'houtp','target')
ivtable.loc[ivtable['变量名']=='houtp','IV值']=iv
data.head()
变量的iv值为: 0.0073275026880227365
Out[41]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 houtp Co-op apartment 152 149 3 0.006048 0.019737 0.006029 0.007109 -0.164705 0.000178
1 houtp House / apartment 22102 21738 364 0.879367 0.016469 0.879654 0.862559 0.019624 0.000335
2 houtp Municipal apartment 812 793 19 0.032307 0.023399 0.032090 0.045024 -0.338655 0.004380
3 houtp Office apartment 199 194 5 0.007918 0.025126 0.007850 0.011848 -0.411619 0.001646
4 houtp Rented apartment 439 433 6 0.017466 0.013667 0.017522 0.014218 0.208939 0.000690
In [42]:
new_data = convert_dummy(new_data,'houtp')

教育水平

In [43]:
new_data.loc[new_data['edutp']=='Academic degree','edutp']='Higher education'
iv, data=calc_iv(new_data,'edutp','target')
ivtable.loc[ivtable['变量名']=='edutp','IV值']=iv
data.head()
变量的iv值为: 0.010361794017679489
Out[43]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 edutp Higher education 7146 7018 128 0.284316 0.017912 0.283992 0.303318 -0.065836 0.001272
1 edutp Incomplete higher 993 972 21 0.039508 0.021148 0.039333 0.049763 -0.235206 0.002453
2 edutp Lower secondary 187 181 6 0.007440 0.032086 0.007324 0.014218 -0.663301 0.004573
3 edutp Secondary / secondary special 16808 16541 267 0.668736 0.015885 0.669351 0.632701 0.056310 0.002064
In [44]:
new_data = convert_dummy(new_data,'edutp')

婚姻\家庭状况

In [45]:
new_data['famtp'].value_counts(normalize=True,sort=False)
Out[45]:
Married                 0.696626
Widow                   0.023076
Separated               0.058367
Civil marriage          0.084865
Single / not married    0.137065
Name: famtp, dtype: float64
In [46]:
iv, data=calc_iv(new_data,'famtp','target')
ivtable.loc[ivtable['变量名']=='famtp','IV值']=iv
data.head()
变量的iv值为: 0.043137115423127946
Out[46]:
Variable Value All Good Bad Share Bad Rate Distribution Good Distribution Bad WoE IV
0 famtp Civil marriage 2133 2101 32 0.084865 0.015002 0.085019 0.075829 0.114394 0.001051
1 famtp Married 17509 17232 277 0.696626 0.015820 0.697313 0.656398 0.060467 0.002474
2 famtp Separated 1467 1452 15 0.058367 0.010225 0.058757 0.035545 0.502608 0.011666
3 famtp Single / not married 3445 3362 83 0.137065 0.024093 0.136047 0.196682 -0.368588 0.022349
4 famtp Widow 580 565 15 0.023076 0.025862 0.022863 0.035545 -0.441263 0.005596
In [47]:
new_data = convert_dummy(new_data,'famtp')

IV、WOE:概念及分析

Weight of Evidence(WoE):

$$wo{e_i} = \ln {{{P_{yi}}} \over {{P_{ni}}}} = \ln {{{y_i}/{y_s}} \over {{n_i}/{n_s}}}$$

其中,$wo{e_i}$表示的是第i类的WOE值。${{P_{yi}}}$是这个类中正样本(${{y_i}}$)占所有正样本(${{y_s}}$)的比例。${{P_{ni}}}$是这个类中负样本(${{n_i}}$)占所有负样本(${{n_s}}$)的比例。

Information Value (IV):
$$I{V_i} = ({P_{yi}} - {P_{ni}}) \times wo{e_i}$$
而各类的IV值则是条件正类率和条件负类率的差乘以变量的WOE值。变量的总IV值则可以理解为条件正类率和条件负类率差的加权和:
$$IV = \sum\limits_i^n {I{V_i}} $$

IV值衡量了变量预测能力的大小。IV值总是正值。

IV值和预测能力的关系

IV 预测能力
<0.02 几乎没有预测力
0.02~0.1 预测力较弱
0.1~0.3 预测力适中
0.3~0.5 预测力较强
>0.5 预测力过强,需要检查变量
In [48]:
ivtable=ivtable.sort_values(by='IV值',ascending=False)
ivtable.loc[ivtable['变量名']=='DAYS_BIRTH','变量名']='agegp'
ivtable.loc[ivtable['变量名']=='DAYS_EMPLOYED','变量名']='worktmgp'
ivtable.loc[ivtable['变量名']=='inc','变量名']='incgp'
ivtable
Out[48]:
变量名 IV值
10 agegp 0.0659351
8 famtp 0.0431371
11 worktmgp 0.0402215
3 Reality 0.0274407
1 Gender 0.0252035
7 edutp 0.0103618
9 houtp 0.0073275
17 famsize 0.00615614
16 occyp 0.00482047
5 incgp 0.002422
13 wkphone 0.00204243
4 ChldNo 0.00112145
14 phone 0.00054805
6 inctp 5.1593e-05
15 email 1.73436e-05
2 Car 4.54248e-06

算法

  • 至此,我们完成了特征工程和数据清洗工作。

划分数据集

In [49]:
new_data.columns
Out[49]:
Index(['ID', 'Gender', 'Car', 'Reality', 'inc', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'wkphone', 'phone', 'email', 'famsize', 'begin_month',
       'dep_value', 'target', 'ChldNo_1', 'ChldNo_2More', 'gp_inc_high',
       'gp_inc_medium', 'Age', 'gp_Age_high', 'gp_Age_highest', 'gp_Age_low',
       'gp_Age_lowest', 'worktm', 'gp_worktm_high', 'gp_worktm_highest',
       'gp_worktm_low', 'gp_worktm_medium', 'famsizegp_1', 'famsizegp_3more',
       'inctp_Commercial associate', 'inctp_State servant', 'occyp_hightecwk',
       'occyp_officewk', 'houtp_Co-op apartment', 'houtp_Municipal apartment',
       'houtp_Office apartment', 'houtp_Rented apartment',
       'houtp_With parents', 'edutp_Higher education',
       'edutp_Incomplete higher', 'edutp_Lower secondary',
       'famtp_Civil marriage', 'famtp_Separated', 'famtp_Single / not married',
       'famtp_Widow'],
      dtype='object')
In [50]:
Y = new_data['target']
X = new_data[['Gender','Reality','ChldNo_1', 'ChldNo_2More','wkphone', 'gp_Age_high', 'gp_Age_highest', 'gp_Age_low',
       'gp_Age_lowest','gp_worktm_high', 'gp_worktm_highest',
       'gp_worktm_low', 'gp_worktm_medium','occyp_hightecwk', 'occyp_officewk','famsizegp_1', 'famsizegp_3more',
       'houtp_Co-op apartment', 'houtp_Municipal apartment',
       'houtp_Office apartment', 'houtp_Rented apartment',
       'houtp_With parents','edutp_Higher education',
       'edutp_Incomplete higher', 'edutp_Lower secondary','famtp_Civil marriage',
       'famtp_Separated','famtp_Single / not married','famtp_Widow']]
  • 由于样本中的不平衡问题(违约者仅仅占总样本非常小的一部分),我们需要从正样本中进行抽样。当然,也可以人为增加负样本的数量,这种做法被称为过采样。接下来使用人工少数类过采样算法(SMOTE)解决不平衡样本的问题。
In [51]:
from imblearn.over_sampling import SMOTE
X_balance,Y_balance = SMOTE().fit_sample(X,Y)
X_balance = pd.DataFrame(X_balance,columns=X.columns)
Using TensorFlow backend.
  • 采样后,因变量类别0和1的样本数量几乎相等。在混淆矩阵部分可以看出。

Logistic Regression

$$\log ({p \over {1 - p}}) = {\beta _0} + {\beta _1}{x_1} + \cdot \cdot \cdot + {\beta _q}{x_q}$$
In [52]:
"""
import statsmodels.api as sm
X_balance = sm.add_constant(X_balance)
logit_model=sm.GLM(Y_balance,X_balance,family=sm.families.Binomial())
logit_results = logit_model.fit()
#logit_results.summary()
"""
Out[52]:
'\nimport statsmodels.api as sm\nX_balance = sm.add_constant(X_balance)\nlogit_model=sm.GLM(Y_balance,X_balance,family=sm.families.Binomial())\nlogit_results = logit_model.fit()\n#logit_results.summary()\n'

年龄与逾期的关系呈现非线性。年龄较高者逾期率更低。
收入越高逾期率越高。
和已婚者相比,未婚者与丧偶者更容易拖欠还款。

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_balance,Y_balance, 
                                                    stratify=Y_balance, test_size=0.3,
                                                    random_state = 10086)
In [55]:
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(random_state=0, solver='lbfgs')
clf = logit.fit(X_train, y_train)
y_predict = clf.predict(X_test)
In [56]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(y_test,y_predict))
Out[56]:
0 1
0 4007 3407
1 2415 4999
In [57]:
sns.set_style("white") 
from sklearn.utils.multiclass import unique_labels
class_names = ['0','1']
plot_confusion_matrix(confusion_matrix(y_test,y_predict),classes=class_names, normalize=True, 
                      title='Normalized Confusion Matrix: LogisticRegression')
Normalized confusion matrix
[[0.54046399 0.45953601]
 [0.3257351  0.6742649 ]]

KNN

In [58]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
In [59]:
param_set = {
 'n_neighbors':range(1,3),
}
In [60]:
neigh = KNeighborsClassifier()
gridcv = GridSearchCV(neigh, param_grid = param_set, cv=10)
gridcv.fit(X_train, y_train)
Out[60]:
GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None, param_grid={'n_neighbors': range(1, 3)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [61]:
gridcv.best_params_, gridcv.best_score_ 
Out[61]:
({'n_neighbors': 1}, 0.923574979766447)
In [62]:
bestcv = gridcv.best_estimator_
bestfit = bestcv.fit(X_test, y_test)
y_predict = bestfit.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, y_predict))
Out[62]:
0 1
0 7020 394
1 439 6975
In [63]:
from sklearn.metrics import accuracy_score 
accuracy_score(y_test, y_predict)
Out[63]:
0.9438224979768006
In [64]:
bestcv
Out[64]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
In [65]:
plot_confusion_matrix(confusion_matrix(y_test,y_predict),classes=class_names, normalize=True, 
                      title='Normalized Confusion Matrix: KNN')
Normalized confusion matrix
[[0.9468573 0.0531427]
 [0.0592123 0.9407877]]
In [66]:
#cv_results = cross_validate(neigh,X_train, y_train, cv = 10)
#cv_results['test_score'].mean()
In [67]:
from sklearn.model_selection import cross_validate
lst = []
for i in range(1,11):
    neigh = KNeighborsClassifier(n_neighbors = i)
    cv_results = cross_validate(neigh, X_train, y_train, cv = 5)
    new = cv_results['test_score'].mean()
    lst.append(new)
    #print(cv_results['test_score'].mean())

lst
Out[67]:
[0.9213496583429004,
 0.9040064788680613,
 0.9207715651721988,
 0.9075041568698227,
 0.9121865636598503,
 0.9037751396624705,
 0.9036593321936408,
 0.8951035081474649,
 0.8918950480191574,
 0.8864319685431488]
In [68]:
s = pd.Series(lst, index=np.arange(1,11,1))
s.plot().set(xlabel='K',title='Learning Curve',
ylabel='test_score')
Out[68]:
[Text(0, 0.5, 'test_score'),
 Text(0.5, 0, 'K'),
 Text(0.5, 1.0, 'Learning Curve')]

Decision Tree

In [69]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dtfit = dt.fit(X_train, y_train)
y_predict = dtfit.predict(X_test)
pd.DataFrame(confusion_matrix(y_test,y_predict))
Out[69]:
0 1
0 6960 454
1 325 7089
In [70]:
accuracy_score(y_test, y_predict)
Out[70]:
0.9474642568114379
In [71]:
plot_confusion_matrix(confusion_matrix(y_test,y_predict),classes=class_names, normalize = True, 
                      title='Normalized Confusion Matrix: CART')
Normalized confusion matrix
[[0.9387645  0.0612355 ]
 [0.04383599 0.95616401]]
In [ ]:
 
In [ ]:
 

Random Forest


Random Forest
In [ ]:
 
In [72]:
rf = RandomForestClassifier(n_estimators=50)
rffit = rf.fit(X_train, y_train)
y_predict = rffit.predict(X_test)

混淆矩阵

In [73]:
pd.DataFrame(confusion_matrix(y_test,y_predict))
Out[73]:
0 1
0 6910 504
1 250 7164
In [74]:
#sns.set_style("white") 
#from sklearn.metrics import confusion_matrix
#from sklearn.utils.multiclass import unique_labels
#class_names = ['0','1']
#plot_confusion_matrix(confusion_matrix(y_test,y_predict),classes=class_names, normalize=True, 
#                     title='Normalized Confusion Matrix: LogisticRegression')

准确率

In [75]:
accuracy_score(y_test, y_predict)
Out[75]:
0.949150256271918
In [76]:
#from sklearn.metrics import precision_score
#precision_score(y_test, y_predict)

召回率

In [77]:
#from sklearn.metrics import recall_score
#recall_score(y_test,y_predict,average='weighted')  
  • 使用随机搜索法调参,在对于有连续变量的参数时,RandomizedSearchCV会将其当作一个分布进行采样。
In [78]:
"""
clf = RandomForestClassifier(n_estimators=50)
# 给定参数搜索范围:list or distribution
param_dist = {"max_depth": [3, None],                     #给定list
              "max_features": sp_randint(1, 11),          #给定distribution
              "min_samples_split": sp_randint(2, 11),     #给定distribution
              "bootstrap": [True, False],                 #给定list
              "criterion": ["gini", "entropy"]}           #给定list

n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=11, cv=10, iid=False)
random_search.fit(X_train,y_train)
print('最优训练器的精度:',random_search.best_score_)
print('最优训练器参数:',random_search.best_estimator_) 
"""
Out[78]:
'\nclf = RandomForestClassifier(n_estimators=50)\n# 给定参数搜索范围:list or distribution\nparam_dist = {"max_depth": [3, None],                     #给定list\n              "max_features": sp_randint(1, 11),          #给定distribution\n              "min_samples_split": sp_randint(2, 11),     #给定distribution\n              "bootstrap": [True, False],                 #给定list\n              "criterion": ["gini", "entropy"]}           #给定list\n\nn_iter_search = 20\nrandom_search = RandomizedSearchCV(clf, param_distributions=param_dist,\n                                   n_iter=11, cv=10, iid=False)\nrandom_search.fit(X_train,y_train)\nprint(\'最优训练器的精度:\',random_search.best_score_)\nprint(\'最优训练器参数:\',random_search.best_estimator_) \n'
In [79]:
"""
best = random_search.best_estimator_
bestfit = best.fit(X_train, y_train)
"""
Out[79]:
'\nbest = random_search.best_estimator_\nbestfit = best.fit(X_train, y_train)\n'
  • 混淆矩阵
In [80]:
"""
y_predict1=bestfit.predict(X_test)
confusiontable2 = pd.DataFrame(confusion_matrix(y_test, y_predict1))
confusiontable2
"""
Out[80]:
'\ny_predict1=bestfit.predict(X_test)\nconfusiontable2 = pd.DataFrame(confusion_matrix(y_test, y_predict1))\nconfusiontable2\n'
In [81]:
#plot_confusion_matrix(confusion_matrix(y_test,y_predict1),classes=class_names, normalize=True, 
#                      title='Normalized Confusion Matrix: RandomForestClassifier')
In [82]:
#accuracy_score(y_test,y_predict1)

SVM


Support Vector Machine
In [83]:
from sklearn import svm
model = svm.SVC(kernel='linear', C = 1)
svmc = model.fit(X_train, y_train)
y_predict = svmc.predict(X_test)
pd.DataFrame(confusion_matrix(y_test,y_predict))
Out[83]:
0 1
0 3084 4330
1 1842 5572
In [84]:
plot_confusion_matrix(confusion_matrix(y_test,y_predict),classes=class_names, normalize = True, 
                      title='Normalized Confusion Matrix: SVM')
Normalized confusion matrix
[[0.41596979 0.58403021]
 [0.24844888 0.75155112]]
In [85]:
accuracy_score(y_test, y_predict)
Out[85]:
0.5837604531966549
In [ ]:
 

模型评估

In [86]:
"""
y_score = rffit.predict_proba(X_train) 
y_score2 = bestfit.predict_proba(X_test) 
fpr, tpr, thresholds = roc_curve(y_train, y_score[:,1]);
fpr2, tpr2, thresholds2 = roc_curve(y_test, y_score2[:,1]);
roc_auc = auc(fpr, tpr)
plt.subplots(figsize=(7,5.5));
plt.plot(fpr, tpr, color='red');
plt.plot(fpr2, tpr2, color='green');
plt.plot([0, 1], [0, 1], color='navy', linestyle='-');
#plt.fill_between(fpr,tpr, alpha=0.2)
plt.xlim([0.0, 1.0]);
plt.ylim([0.0, 1.05]);
plt.xlabel('False Positive Rate');
plt.ylabel('True Positive Rate');
plt.title('ROC Curve');
plt.legend(loc="lower right");
plt.show()
"""
Out[86]:
'\ny_score = rffit.predict_proba(X_train) \ny_score2 = bestfit.predict_proba(X_test) \nfpr, tpr, thresholds = roc_curve(y_train, y_score[:,1]);\nfpr2, tpr2, thresholds2 = roc_curve(y_test, y_score2[:,1]);\nroc_auc = auc(fpr, tpr)\nplt.subplots(figsize=(7,5.5));\nplt.plot(fpr, tpr, color=\'red\');\nplt.plot(fpr2, tpr2, color=\'green\');\nplt.plot([0, 1], [0, 1], color=\'navy\', linestyle=\'-\');\n#plt.fill_between(fpr,tpr, alpha=0.2)\nplt.xlim([0.0, 1.0]);\nplt.ylim([0.0, 1.05]);\nplt.xlabel(\'False Positive Rate\');\nplt.ylabel(\'True Positive Rate\');\nplt.title(\'ROC Curve\');\nplt.legend(loc="lower right");\nplt.show()\n'
In [87]:
'''
ks = max(tpr-fpr)
ks
'''
Out[87]:
'\nks = max(tpr-fpr)\nks\n'
In [ ]: