任务
分别用IV值和随机森林挑选特征,再构建模型,进行模型评估
IV值选择特征
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# @Time : 2018/11/27 21:08
# @Author : 刘
# @Site :
# @File : RandomForest.py
# @Software: PyCharm
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,roc_auc_score
# Project root; the raw CSV is read from its data/ sub-folder.
path = "E:/mypython/moxingxuexi/"
data = pd.read_csv(path + 'data/data.csv', encoding='gbk').drop_duplicates()

# Load the pre-computed feature matrix.
# NOTE(review): feature.pkl is opened relative to the working directory, not
# relative to `path` -- confirm this is intended.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
with open('feature.pkl', 'rb') as f:
    X = pickle.load(f)

# Target labels.
# NOTE(review): assumes the rows of X line up with `data` after de-duplication.
y = data['status']

# Hold out 30% of the samples as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2000
)
# iv值进行特征选择
import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target
def woe(X, y, event=1):
    """Compute the information value (IV) of every column of ``X``.

    Columns that ``type_of_target`` classifies as ``'continuous'`` are first
    binned with ``discrete``; each column is then scored by ``woe_single_x``.

    Args:
        X: DataFrame whose columns are the candidate features.
        y: Target labels, passed through to ``woe_single_x``.
        event: Label value treated as the "event" class.

    Returns:
        dict mapping each column name to its IV. The per-bin WOE tables
        produced along the way are not returned.
    """
    iv_by_feature = {}
    for name in X.columns:
        values = X[name].values
        # Continuous features must be bucketed before WOE can be computed.
        if type_of_target(values) == 'continuous':
            values = discrete(values)
        _, iv_by_feature[name] = woe_single_x(values, y, name, event)
    return iv_by_feature
def discrete(x, n_bins=5):
    """Bucket a numeric feature into equal-frequency bins labelled 1..n_bins.

    Bin edges are the 0th, (100/n_bins)th, ..., 100th percentiles of ``x``.
    Both edges of a bin are inclusive and bins are assigned in ascending
    order, so a value lying exactly on an interior edge ends up in the
    higher of the two adjacent bins.

    Args:
        x: 1-D array-like of numeric values.
        n_bins: Number of bins; the default of 5 gives quintiles.

    Returns:
        Float ndarray with the same shape as ``x`` holding the bin labels.
        Entries that match no bin (e.g. NaN) stay 0.

    Raises:
        ValueError: if ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")
    x = np.asarray(x)
    res = np.zeros(x.shape)
    # Each interior edge is shared by two neighbouring bins: compute it once.
    edges = [
        stats.scoreatpercentile(x, i * 100 / n_bins) for i in range(n_bins + 1)
    ]
    for i in range(n_bins):
        # The range test is the bin membership itself; no np.in1d round-trip.
        res[(x >= edges[i]) & (x <= edges[i + 1])] = i + 1
    return res
def woe_single_x(x, y, feature, event=1):
    """Compute the per-bin weight of evidence (WOE) and the IV of one feature.

    For every distinct value (bin) of ``x``::

        woe = ln(rate_event / rate_non_event)
        iv += (rate_event - rate_non_event) * woe

    where ``rate_event`` is the share of all event samples that fall in the
    bin and ``rate_non_event`` the share of all non-event samples. A rate of
    exactly 0 is replaced by 0.0001 so the logarithm stays finite.

    Args:
        x: 1-D array-like of discrete bin labels, one per sample.
        y: 1-D array-like of target labels (e.g. a pandas Series), aligned
            with ``x`` by position.
        feature: Name of the feature. Not used in the computation; kept so
            existing callers keep working.
        event: Label value counted as the "event" class.

    Returns:
        Tuple ``(woe_dict, iv)`` where ``woe_dict`` maps each bin label to its
        WOE and ``iv`` is the feature's information value.

    Raises:
        ValueError: if ``y`` contains no event samples or no non-event
            samples, in which case the rates are undefined.
    """
    x = np.asarray(x)
    # Work on positions, not on the pandas index: after de-duplication or a
    # train/test split the index labels no longer match row positions.
    y = np.asarray(y)

    is_event = y == event
    event_total = int(is_event.sum())
    non_event_total = y.shape[-1] - event_total
    if event_total == 0 or non_event_total == 0:
        raise ValueError(
            "y must contain both event and non-event samples to compute WOE"
        )

    rate_floor = 0.0001
    iv = 0
    woe_dict = {}
    for x1 in np.unique(x):  # one iteration per bin
        in_bin = x == x1
        event_count = int((in_bin & is_event).sum())
        non_event_count = int(in_bin.sum()) - event_count
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total
        # Floor each rate independently so log() never sees 0 or a division by 0.
        if rate_event == 0:
            rate_event = rate_floor
        if rate_non_event == 0:
            rate_non_event = rate_floor
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv
随机森林挑选特征
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Baseline: see how a random forest performs with its default hyper-parameters.
# oob_score=True is what makes the fitted model expose oob_score_ below.
rf0 = RandomForestClassifier(oob_score=True, random_state=2333)
rf0.fit(X_train, y_train)
# Print the out-of-bag score of the fitted forest.
print('袋外分数:', rf0.oob_score_)
# NOTE(review): model_metrics is defined further down in this file; that
# definition has to be executed before this call can run.
model_metrics(rf0, X_train, X_test, y_train, y_test)
建模调参
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from mlxtend.classifier import StackingClassifier
# Meta-learner of the stack: L1-regularised logistic regression.
# The solver is pinned because the L1 penalty is not supported by the default
# 'lbfgs' solver of current scikit-learn releases; 'liblinear' supports it.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

# Base learners: four SVM kernels. probability=True is required so they
# expose predict_proba, which the stacker consumes (use_probas=True).
svm_linear = svm.SVC(C=0.01, kernel='linear', probability=True)
svm_poly = svm.SVC(C=0.01, kernel='poly', probability=True)
svm_rbf = svm.SVC(gamma=0.01, C=0.01, probability=True)
svm_sigmoid = svm.SVC(C=0.01, kernel='sigmoid', probability=True)

# Base learner: a shallow decision tree.
dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=60,
    max_features=9,
    random_state=2333,
)

# Base learner: XGBoost.
xgb = XGBClassifier(
    learning_rate=0.1,
    n_estimators=80,
    max_depth=3,
    min_child_weight=5,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1e-5,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# Base learner: LightGBM.
# NOTE(review): `gamma` does not appear to be a documented LGBMClassifier
# parameter (LightGBM's counterpart is `min_split_gain`) -- confirm whether it
# has any effect here.
lgb = LGBMClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_child_weight=11,
    gamma=0.1,
    subsample=0.5,
    colsample_bytree=0.9,
    reg_alpha=1e-5,
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# Stack the base learners; their class probabilities (not averaged) are the
# inputs of the logistic-regression meta-learner.
sclf = StackingClassifier(
    classifiers=[svm_linear, svm_poly, svm_rbf, svm_sigmoid, dt, xgb, lgb],
    meta_classifier=lr,
    use_probas=True,
    average_probas=False,
)
性能评估
from sklearn.metrics import accuracy_score,roc_auc_score
def model_metrics(clf, X_train, X_test, y_train, y_test):
    """Print accuracy and ROC-AUC of a fitted classifier on train and test data.

    Accuracy is computed from ``clf.predict``; AUC from column 1 of
    ``clf.predict_proba`` (presumably the positive class). Two lines are
    printed, one per metric, each showing the training value followed by the
    test value with four decimals.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None; the results are only printed.
    """
    split_names = ('训练集:', '测试集:')
    targets = (y_train, y_test)
    # The training figure is followed by a space, the test figure ends the line.
    line_ends = (' ', '\n')

    # Run all predictions up front, train before test.
    hard_labels = [clf.predict(X_train), clf.predict(X_test)]
    class1_scores = [
        clf.predict_proba(X_train)[:, 1],
        clf.predict_proba(X_test)[:, 1],
    ]

    reports = (
        ('[准确率]', accuracy_score, hard_labels),
        ('[auc值]', roc_auc_score, class1_scores),
    )
    for title, metric, outputs in reports:
        print(title, end=' ')
        for name, truth, output, end in zip(
            split_names, targets, outputs, line_ends
        ):
            print(name, '%.4f' % metric(truth, output), end=end)
# Fit the stacking ensemble; the labels are handed over as a plain array (.values).
sclf.fit(X_train, y_train.values)
# Print accuracy and AUC of the fitted ensemble on the training and test sets.
model_metrics(sclf, X_train, X_test, y_train, y_test)
评价结果
[准确率] 训练集: 0.8560 测试集: 0.7912
[auc值] 训练集: 0.9042 测试集: 0.7952
---------------------
作者:lgy54321
来源:CSDN
原文:https://blog.csdn.net/lgy54321/article/details/84594152
版权声明:本文为博主原创文章,转载请附上博文链接!
|
|