python 模型保存问题
请问各位大神:小弟在用 sklearn 做模型训练,然后做网格搜索和交叉验证,最后要保存模型,但不知道该如何保存。请问保存模型的时候,传给 joblib.dump 的应该是 rf 还是 gc?请各位大神不吝指教!
程序代码:
""" 随机森林模型 """ from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.feature_extraction import DictVectorizer from sklearn.ensemble import RandomForestClassifier import pandas as pd import numpy as np import os import joblib import time from sklearn.preprocessing import MinMaxScaler def decision(): """ 随机森林对电话号码做分类 :return: None """ # 获取数据 all_data = pd.read_csv('e:/index/t.ns300.csv') # 处理数据,找出特征值和目标值 x = all_data[[ 'CALLTIMES', 'CALLEDNUMBERS', 'CALLEDLSD', 'PJTALKLEN', 'TOTALCALLLEN', 'BZCTALKLEN', 'NOTALKTIMES', 'JTL', 'CALLING_PROVINCE_MODE', 'CALLING_CITY_MODE', 'CALLED_PROVINCE_MODE', 'CALLED_CITY_MODE', 'CALLEN_PJ', 'YIDI_PROVINCE_PER', 'YIDI_CITY_PER']] y = all_data['TOPIC_CLASS'] print(x) # 异常值处理 # x['JTL'].fillna(x['JTL'].mean(), inplace=True) # 分割数据集到训练集合测试集 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25) # 进行处理(特征工程) dict = DictVectorizer(sparse=False) x_train = dict.fit_transform(x_train.to_dict(orient="records")) print(dict.get_feature_names()) x_test = dict.transform(x_test.to_dict(orient="records")) # 随机森林进行预测 (超参数调优) rf = RandomForestClassifier() param = {"n_estimators": [50,100,120, 200, 300, 500, 800, 1200,1500], "max_depth": [5, 8,10, 15,20, 25, 30,35]} # 网格搜索与交叉验证 gc = GridSearchCV(rf, param_grid=param, cv=5) gc.fit(x_train, y_train) print("准确率:", gc.score(x_test, y_test)) if os.path.exists("e:/index/result/test_ns300_data.csv"): os.remove("e:/index/result/test_ns300_data.csv") np.savetxt("e:/index/result/test_ns300_data.csv",x_test,fmt='%3f',delimiter=',') if os.path.exists("e:/index/result/realty_ns300_data.csv"): os.remove("e:/index/result/realty_ns300_data.csv") np.savetxt("e:/index/result/realty_ns300_data.csv",y_test,header='realty') print("查看选择的参数模型:", gc.best_params_) print("预测结果:",gc.predict(x_test)) if os.path.exists("e:/index/result/predict_ns300_data.csv"): os.remove("e:/index/result/predict_ns300_data.csv") 
np.savetxt("e:/index/result/predict_ns300_data.csv",gc.predict(x_test),header='predict') # 创建文件目录 dirs = 'e:/index/testModel' if not os.path.exists(dirs): os.makedirs(dirs) # 保存模型 joblib.dump(gc, dirs + '/rf_ns300.pkl') return None if __name__ == "__main__": start=time.perf_counter() decision() end=time.perf_counter() print('模块运行时间: %s Seconds'%(end-start))