[学习交流] 【上海校区】随机森林挑选特征和IV值

任务
分别用IV值和随机森林挑选特征，再构建模型，进行模型评估

IV值选择特征
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# @Time : 2018/11/27 21:08
# @Author  : 刘
# @Site :
# @File : RandomForest.py
# @Software: PyCharm
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import  accuracy_score,roc_auc_score

# Load the raw data set and drop exact duplicate rows.
path = "E:/mypython/moxingxuexi/"
data = pd.read_csv(path + 'data/data.csv', encoding='gbk')
data.drop_duplicates(inplace=True)

# Load the pre-computed feature matrix.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load feature.pkl when it comes from a trusted source.
# NOTE(review): feature.pkl is opened relative to the current working
# directory, not relative to `path` -- confirm this is intended.
with open('feature.pkl', 'rb') as f:
    X = pickle.load(f)

# Extract the label column.
y = data.status

# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2000)

# iv值进行特征选择
import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target

def woe(X, y, event=1):
    """Compute the information value (IV) of every column in ``X``.

    Columns that ``type_of_target`` classifies as ``'continuous'`` are first
    bucketed with ``discrete``; each column is then scored with
    ``woe_single_x``.

    Args:
        X: DataFrame whose columns are the candidate features.
        y: Labels, forwarded unchanged to ``woe_single_x``.
        event: Label value treated as the event class.

    Returns:
        dict mapping each column name to its IV.
    """
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # 1) Discretise continuous features so they can be scored per bin.
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        # 2) Score the feature; only the IV is kept, the per-bin WOE mapping
        #    is not part of this function's result.
        _, iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv
    return iv_dict

def discrete(x, n_bins=5):
    """Bucket a numeric array into percentile-based bins labelled 1..n_bins.

    The cut points are the ``100 * i / n_bins`` percentiles of ``x``. Each bin
    is closed on both sides, and bins are assigned in increasing order, so a
    value that sits exactly on a cut point ends up in the higher bin.

    Args:
        x: 1-D array-like of numeric values.
        n_bins: Number of bins (default 5, i.e. quintiles).

    Returns:
        Float array of the same shape as ``x`` holding the bin label of each
        element.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")
    x = np.asarray(x)
    res = np.zeros(x.shape)
    if x.size == 0:
        # Percentiles of an empty array are undefined; nothing to label.
        return res
    # Compute every cut point once instead of twice per bin.
    edges = [stats.scoreatpercentile(x, 100.0 * i / n_bins)
             for i in range(n_bins + 1)]
    for i in range(n_bins):
        # Direct boolean mask; later bins deliberately overwrite shared edges.
        res[(x >= edges[i]) & (x <= edges[i + 1])] = i + 1
    return res

def woe_single_x(x, y, feature, event=1):
    """Compute the per-bin weight of evidence (WOE) and the IV of one feature.

    For every distinct value ``v`` of ``x``:
        rate_event     = (# events with x == v)     / (# events)
        rate_non_event = (# non-events with x == v) / (# non-events)
        woe[v]         = ln(rate_event / rate_non_event)
        iv            += (rate_event - rate_non_event) * woe[v]

    A rate of exactly 0 is replaced by 0.0001 so the logarithm stays finite.

    Args:
        x: 1-D array-like of discrete feature values.
        y: 1-D array-like of labels, aligned with ``x`` by position.
        feature: Feature name; accepted for interface compatibility, unused.
        event: Label value treated as the event class.

    Returns:
        Tuple ``(woe_dict, iv)``: mapping of feature value to WOE, and the
        feature's information value.

    Raises:
        ZeroDivisionError: If ``y`` contains no events or no non-events.
    """
    x = np.asarray(x)
    # Work on positional arrays: a pandas index that no longer runs 0..n-1
    # (after de-duplication or a split) must not affect the row matching.
    labels = np.asarray(y)
    is_event = labels == event
    event_total = int(is_event.sum())
    non_event_total = labels.shape[-1] - event_total

    floor = 0.0001  # substitute for an empty rate, keeps log() finite
    iv = 0
    woe_dict = {}
    for x1 in set(x):  # iterate over the distinct bins
        in_bin = x == x1
        event_count = int(is_event[in_bin].sum())
        non_event_count = int(in_bin.sum()) - event_count
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total
        # Smooth each side independently, then always score the bin.
        if rate_event == 0:
            rate_event = floor
        if rate_non_event == 0:
            rate_non_event = floor
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv

随机森林挑选特征
import warnings
# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Baseline: fit a random forest with default hyper-parameters (out-of-bag
# scoring enabled, fixed seed), print its out-of-bag score, then report
# train/test accuracy and AUC.
rf0 = RandomForestClassifier(oob_score=True, random_state=2333)
rf0.fit(X_train, y_train)
print('袋外分数：', rf0.oob_score_)
# NOTE(review): model_metrics is defined further down in this listing; it
# must already be defined when this line executes.
model_metrics(rf0, X_train, X_test, y_train, y_test)
建模调参
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from mlxtend.classifier import StackingClassifier

# Meta-learner: L1-regularised logistic regression. The solver is pinned to
# 'liblinear' because the 'lbfgs' default of newer scikit-learn releases
# does not support the L1 penalty.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

# Base learners: four SVM kernels (probability=True so predict_proba is
# available), a decision tree, XGBoost and LightGBM.
svm_linear = svm.SVC(C=0.01, kernel='linear', probability=True)
svm_poly = svm.SVC(C=0.01, kernel='poly', probability=True)
svm_rbf = svm.SVC(gamma=0.01, C=0.01, probability=True)
svm_sigmoid = svm.SVC(C=0.01, kernel='sigmoid', probability=True)
dt = DecisionTreeClassifier(max_depth=5, min_samples_split=50, min_samples_leaf=60,
                            max_features=9, random_state=2333)
xgb = XGBClassifier(learning_rate=0.1, n_estimators=80, max_depth=3, min_child_weight=5,
                    gamma=0.2, subsample=0.8, colsample_bytree=0.8, reg_alpha=1e-5,
                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
# NOTE(review): `gamma` looks copied from the XGBoost settings; verify that
# LightGBM recognises it (its split-gain threshold is `min_split_gain`).
lgb = LGBMClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, min_child_weight=11,
                     gamma=0.1, subsample=0.5, colsample_bytree=0.9, reg_alpha=1e-5,
                     nthread=4, scale_pos_weight=1, seed=27)

# Stack the base learners; the meta-learner is trained on their predicted
# class probabilities (use_probas=True, not averaged).
sclf = StackingClassifier(classifiers=[svm_linear, svm_poly, svm_rbf, svm_sigmoid, dt, xgb, lgb],
                          meta_classifier=lr, use_probas=True, average_probas=False)
性能评估
from sklearn.metrics import accuracy_score,roc_auc_score
def model_metrics(clf, X_train, X_test, y_train, y_test):
    """Print accuracy and ROC AUC of a fitted classifier on both splits.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Two lines are written to stdout: accuracy, then AUC.
    """
    # Hard class predictions for both splits.
    train_labels = clf.predict(X_train)
    test_labels = clf.predict(X_test)

    # Score of the second class column, used for the AUC.
    train_scores = clf.predict_proba(X_train)[:, 1]
    test_scores = clf.predict_proba(X_test)[:, 1]

    # One report row per metric: heading, scorer, train output, test output.
    report = (
        ('[准确率]', accuracy_score, train_labels, test_labels),
        ('[auc值]', roc_auc_score, train_scores, test_scores),
    )
    for heading, scorer, train_out, test_out in report:
        print(heading, end=' ')
        print('训练集：', '%.4f' % scorer(y_train, train_out), end=' ')
        print('测试集：', '%.4f' % scorer(y_test, test_out))

# Fit the stacking ensemble on the training split (labels passed as a plain
# array via .values), then print its train/test accuracy and AUC.
sclf.fit(X_train, y_train.values)
model_metrics(sclf, X_train, X_test, y_train, y_test)
评价结果
[准确率] 训练集： 0.8560 测试集： 0.7912
[auc值] 训练集： 0.9042 测试集： 0.7952
---------------------
作者：lgy54321
来源：CSDN
原文：https://blog.csdn.net/lgy54321/article/details/84594152
版权声明：本文为博主原创文章，转载请附上博文链接！

不二晨 · 不二晨

梦缠绕的时候 · 梦缠绕的时候

帐号		自动登录	找回密码
密码			加入黑马

[学习交流] 【上海校区】随机森林挑选特征和IV值

2 个回复

浏览过的版块