A股上市公司传智教育(股票代码 003032)旗下技术交流社区北京昌平校区

 找回密码
 加入黑马

QQ登录

只需一步,快速开始

任务
分别用IV值和随机森林挑选特征,再构建模型,进行模型评估

IV值选择特征
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# @Time    : 2018/11/27 21:08
# @Author  : 刘
# @Site    :
# @File    : RandomForest.py
# @Software: PyCharm
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import  accuracy_score,roc_auc_score

# Read the raw dataset (GBK-encoded CSV) and drop exact duplicate rows.
path = "E:/mypython/moxingxuexi/"
data = pd.read_csv(path + 'data/data.csv', encoding='gbk').drop_duplicates()

# Load the pre-computed feature matrix.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('feature.pkl', 'rb') as f:
    X = pickle.load(f)

# Extract the label column.
# NOTE(review): X comes from the pickle while y comes from the de-duplicated
# CSV -- presumably both have the same rows in the same order; confirm.
y = data.status

# Hold out 30% of the samples as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2000
)

# iv值进行特征选择
import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target


def woe(X, y, event=1):
    """Return the information value (IV) of every column of ``X``.

    Each column is taken as a NumPy array; when ``type_of_target`` reports it
    as ``'continuous'`` it is first bucketed with ``discrete``.  The column is
    then handed to ``woe_single_x``; only the IV it returns is kept, the
    per-bin WOE mapping is discarded.

    Args:
        X: DataFrame of candidate features (one column per feature).
        y: Labels, passed through unchanged to ``woe_single_x``.
        event: Label value treated as the event class.

    Returns:
        dict mapping column name -> IV.
    """
    iv_by_feature = {}
    for name in X.columns:
        column = X[name].values
        # Continuous features must be discretised before WOE can be computed.
        if type_of_target(column) == 'continuous':
            column = discrete(column)
        _, iv_by_feature[name] = woe_single_x(column, y, name, event)
    return iv_by_feature


def discrete(x, n_bins=5):
    """Discretise a numeric array into equal-frequency (percentile) bins.

    The cut points are the ``0, 100/n_bins, ..., 100`` percentiles of ``x``.
    Bin ``i`` (1-based) receives every value lying in the closed interval
    between cut point ``i-1`` and cut point ``i``.  Bins are filled in
    ascending order, so a value equal to a shared boundary ends up in the
    higher bin.  Values that match no interval (e.g. NaN) keep the label 0.

    Args:
        x: Array-like of numeric values.
        n_bins: Number of bins; defaults to 5 (quintiles).

    Returns:
        Float ndarray with the same shape as ``x`` holding labels ``1..n_bins``.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer")

    values = np.asarray(x)
    res = np.zeros(values.shape)
    if values.size == 0:
        # Percentiles of an empty array are undefined; nothing to label.
        return res

    # Compute each cut point once instead of twice per bin.
    cuts = [
        stats.scoreatpercentile(values, i * 100.0 / n_bins)
        for i in range(n_bins + 1)
    ]
    for i in range(n_bins):
        # The range test already is the membership mask; no need to round-trip
        # through the deprecated np.in1d.
        in_bin = (values >= cuts[i]) & (values <= cuts[i + 1])
        res[in_bin] = i + 1
    return res


def woe_single_x(x, y, feature, event=1):
    """Compute per-bin weight of evidence (WOE) and the information value (IV).

    For every distinct value ``b`` found in ``x``::

        rate_event     = #(y == event, x == b) / #(y == event)
        rate_non_event = #(y != event, x == b) / #(y != event)
        woe[b]         = ln(rate_event / rate_non_event)
        iv            += (rate_event - rate_non_event) * woe[b]

    A rate of exactly zero is replaced by 0.0001 so the logarithm stays
    finite.  ``x`` and ``y`` are matched by position, so a pandas Series with
    a non-contiguous index (e.g. after dropping rows) is handled correctly.

    Args:
        x: 1-D array-like of discrete bin labels, one per sample.
        y: 1-D array-like (or Series) of labels, same length as ``x``.
        feature: Feature name; not used in the computation, kept so existing
            callers keep working.
        event: Label value treated as the event class.

    Returns:
        Tuple ``(woe_dict, iv)`` where ``woe_dict`` maps bin label -> WOE.

    Raises:
        ValueError: If ``x`` and ``y`` differ in shape, or if ``y`` lacks
            either event or non-event samples (the rates would be undefined).
    """
    bins = np.asarray(x)
    is_event = np.asarray(y) == event
    if bins.shape != is_event.shape:
        raise ValueError("x and y must have the same length")

    event_total = int(np.count_nonzero(is_event))
    non_event_total = int(is_event.size) - event_total
    if event_total == 0 or non_event_total == 0:
        raise ValueError(
            "y must contain both event and non-event samples to compute WOE"
        )

    floor = 0.0001  # stand-in for a zero rate, avoids log(0) / division by zero
    iv = 0
    woe_dict = {}
    for bin_label in set(bins):  # iterate over the distinct bins
        in_bin = bins == bin_label
        event_count = int(np.count_nonzero(is_event & in_bin))
        non_event_count = int(np.count_nonzero(in_bin)) - event_count

        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total
        # Floor each rate independently, then always compute WOE and IV.
        if rate_event == 0:
            rate_event = floor
        if rate_non_event == 0:
            rate_non_event = floor

        woei = math.log(rate_event / rate_non_event)
        woe_dict[bin_label] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv


随机森林挑选特征
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Baseline: check how a random forest with default hyper-parameters performs.
# oob_score=True makes the out-of-bag score available after fitting.
rf0 = RandomForestClassifier(oob_score=True, random_state=2333).fit(X_train, y_train)
print('袋外分数:', rf0.oob_score_)
# NOTE(review): model_metrics is defined further down in this file; it has to
# be defined before this statement is executed.
model_metrics(rf0, X_train, X_test, y_train, y_test)
建模调参
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from mlxtend.classifier import StackingClassifier

# Meta-learner: L1-regularised logistic regression.
# The solver is pinned to 'liblinear' because the L1 penalty is not supported
# by the 'lbfgs' solver that newer scikit-learn releases use by default.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

# Base learners: four SVM kernels (probability=True so predict_proba exists).
svm_linear = svm.SVC(C=0.01, kernel='linear', probability=True)
svm_poly = svm.SVC(C=0.01, kernel='poly', probability=True)
svm_rbf = svm.SVC(gamma=0.01, C=0.01, probability=True)
svm_sigmoid = svm.SVC(C=0.01, kernel='sigmoid', probability=True)

# Base learner: shallow decision tree.
dt = DecisionTreeClassifier(max_depth=5, min_samples_split=50, min_samples_leaf=60,
                            max_features=9, random_state=2333)

# Base learners: gradient-boosted trees.
xgb = XGBClassifier(learning_rate=0.1, n_estimators=80, max_depth=3, min_child_weight=5,
                    gamma=0.2, subsample=0.8, colsample_bytree=0.8, reg_alpha=1e-5,
                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
# NOTE(review): `gamma` looks like an XGBoost parameter name; confirm that
# LightGBM honours it (its own name for this setting may differ).
lgb = LGBMClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, min_child_weight=11,
                     gamma=0.1, subsample=0.5, colsample_bytree=0.9, reg_alpha=1e-5,
                     nthread=4, scale_pos_weight=1, seed=27)

# Stack the base learners; the meta-learner is fed their class probabilities
# (use_probas=True) kept side by side rather than averaged (average_probas=False).
sclf = StackingClassifier(classifiers=[svm_linear, svm_poly, svm_rbf, svm_sigmoid, dt, xgb, lgb],
                          meta_classifier=lr, use_probas=True, average_probas=False)
性能评估
from sklearn.metrics import accuracy_score,roc_auc_score
def model_metrics(clf,X_train,X_test,y_train,y_test):
    """Print accuracy and ROC AUC of a fitted classifier on train and test data.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None; the two metric lines are written to stdout.
    """
    # Hard class predictions feed the accuracy score.
    train_labels = clf.predict(X_train)
    test_labels = clf.predict(X_test)

    # Column 1 of predict_proba (second class) feeds the AUC score.
    train_scores = clf.predict_proba(X_train)[:, 1]
    test_scores = clf.predict_proba(X_test)[:, 1]

    # One output line per metric: tag, train value, test value.
    report = (
        ('[准确率]', accuracy_score, train_labels, test_labels),
        ('[auc值]', roc_auc_score, train_scores, test_scores),
    )
    for tag, scorer, train_out, test_out in report:
        print(tag, end=' ')
        print('训练集:', '%.4f' % scorer(y_train, train_out), end=' ')
        print('测试集:', '%.4f' % scorer(y_test, test_out))


# Fit the stacking ensemble; the labels are handed over as a plain array
# (y_train.values) rather than as a pandas Series.
sclf.fit(X_train, y_train.values)
# Print train/test accuracy and AUC for the fitted ensemble.
model_metrics(sclf, X_train, X_test, y_train, y_test)
评价结果
[准确率] 训练集: 0.8560 测试集: 0.7912
[auc值] 训练集: 0.9042 测试集: 0.7952
---------------------
作者:lgy54321
来源:CSDN
原文:https://blog.csdn.net/lgy54321/article/details/84594152
版权声明:本文为博主原创文章,转载请附上博文链接!

2 个回复

正序浏览
回复 使用道具 举报
回复 使用道具 举报
您需要登录后才可以回帖 登录 | 加入黑马