#2
hkb9112019-09-09 20:34
|
请问:保存模型时,joblib.dump 应该传入 rf 还是 gc?请各位大神不吝指教!
程序代码:
"""
随机森林模型
"""
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import os
import joblib
import time
from sklearn.preprocessing import MinMaxScaler
def decision():
    """
    Train, evaluate and persist a random-forest classifier for phone numbers.

    Reads ``e:/index/t.ns300.csv``, holds out 25% of the rows as a test
    split, vectorizes the feature columns with ``DictVectorizer``, and tunes
    ``n_estimators`` / ``max_depth`` of a ``RandomForestClassifier`` with a
    5-fold ``GridSearchCV``.  The test features, true labels and predictions
    are written as CSV files under ``e:/index/result`` and the fitted search
    object is dumped to ``e:/index/testModel/rf_ns300.pkl``.

    :return: None
    """
    feature_columns = [
        'CALLTIMES', 'CALLEDNUMBERS', 'CALLEDLSD', 'PJTALKLEN',
        'TOTALCALLLEN', 'BZCTALKLEN', 'NOTALKTIMES', 'JTL',
        'CALLING_PROVINCE_MODE', 'CALLING_CITY_MODE',
        'CALLED_PROVINCE_MODE', 'CALLED_CITY_MODE', 'CALLEN_PJ',
        'YIDI_PROVINCE_PER', 'YIDI_CITY_PER',
    ]

    # Load the data and separate feature columns from the target column.
    all_data = pd.read_csv('e:/index/t.ns300.csv')
    x = all_data[feature_columns]
    y = all_data['TOPIC_CLASS']
    print(x)

    # NOTE: no missing-value imputation is performed before the split.

    # Split into training and test sets (25% held out for testing).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: fit the vectorizer on the training rows only and
    # reuse the same mapping for the test rows.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
    # ``feature_names_`` is the fitted attribute; ``get_feature_names()`` was
    # removed from recent scikit-learn releases.
    print(vectorizer.feature_names_)
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))

    # Random forest with hyper-parameter tuning.
    rf = RandomForestClassifier()
    param = {
        "n_estimators": [50, 100, 120, 200, 300, 500, 800, 1200, 1500],
        "max_depth": [5, 8, 10, 15, 20, 25, 30, 35],
    }

    # Grid search with 5-fold cross-validation.
    gc = GridSearchCV(rf, param_grid=param, cv=5)
    gc.fit(x_train, y_train)
    print("准确率:", gc.score(x_test, y_test))

    # np.savetxt overwrites existing files, so no explicit removal is needed;
    # the output directory, however, must exist before writing.
    os.makedirs("e:/index/result", exist_ok=True)

    # NOTE(review): '%3f' is a minimum field width of 3 with default
    # precision; '%.3f' may have been intended -- confirm before changing.
    np.savetxt("e:/index/result/test_ns300_data.csv", x_test, fmt='%3f', delimiter=',')
    np.savetxt("e:/index/result/realty_ns300_data.csv", y_test, header='realty')

    print("查看选择的参数模型:", gc.best_params_)

    # Predict once and reuse the result for both the console and the file.
    predictions = gc.predict(x_test)
    print("预测结果:", predictions)
    np.savetxt("e:/index/result/predict_ns300_data.csv", predictions, header='predict')

    # Create the model directory if it is missing.
    dirs = 'e:/index/testModel'
    os.makedirs(dirs, exist_ok=True)

    # Persist the fitted search object.  GridSearchCV fits clones of ``rf``,
    # so ``rf`` itself is still unfitted here; ``gc`` (refit on the best
    # parameters by default) is the object that can predict after loading.
    # NOTE(review): ``vectorizer`` is not saved; new data must be transformed
    # with the same fitted mapping before calling the loaded model.
    joblib.dump(gc, dirs + '/rf_ns300.pkl')
if __name__ == "__main__":
    # Time the whole training run with the high-resolution performance counter.
    started_at = time.perf_counter()
    decision()
    elapsed = time.perf_counter() - started_at
    print('模块运行时间: %s Seconds' % elapsed)