Build a baseline first, then optimize and improve on top of it later.
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')  # suppress warnings via the warnings filter
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
import time
%matplotlib inline
from sklearn import linear_model  # linear regression models
from sklearn import preprocessing  # data preprocessing
from sklearn.svm import SVR  # Support Vector Regression, as opposed to SVM classification
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor  # random forest and gradient boosting regressors
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA
# Models
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
testA_data = pd.read_csv('used_car_testA_20200313.csv', sep=' ')
pd.concat([train_data.head(), train_data.tail()])  # DataFrame.append() is deprecated in recent pandas
pd.concat([testA_data.head(), testA_data.tail()])
# Inspect the data: info() gives a quick overview of column names, dtypes, and NaN/missing counts
train_data.info()
# List the training-set column names
train_data.columns
# Summary statistics of the numeric features
train_data.describe()
# Check for missing and abnormal values
train_data.isnull().sum()
testA_data.isnull().sum()
numerical_cols = train_data.select_dtypes(exclude = 'object').columns
print(numerical_cols)
categorical_cols = train_data.select_dtypes(include = 'object').columns
print(categorical_cols)
# Select features
feature_cols = [col for col in numerical_cols if col not in ['SaleID', 'name', 'regDate', 'model', 'brand', 'price', 'regionCode', 'seller', 'creatDate']]
feature_cols = [col for col in feature_cols if 'Type' not in col]
# Extract the feature columns and the label column to build the training and test samples
X_data = train_data[feature_cols]
Y_data = train_data['price']
X_test = testA_data[feature_cols]
print('X train shape:',X_data.shape)
print('X test shape:',X_test.shape)
# Define a helper that prints summary statistics, for reuse below
def Sta_inf(data):
    print('_min', np.min(data))
    print('_max', np.max(data))
    print('_mean', np.mean(data))
    print('_ptp', np.ptp(data))
    print('_std', np.std(data))
    print('_var', np.var(data))
print('Sta of label:')
Sta_inf(Y_data)
# Plot a histogram of the label to inspect its distribution
plt.hist(Y_data)
plt.show()
plt.close()
# Fill missing values with -1 so the tree models can treat "missing" as its own value
X_data = X_data.fillna(-1)
X_test = X_test.fillna(-1)
# XGBoost model
xgr = xgb.XGBRegressor(n_estimators=120, learning_rate=0.1, gamma=0, subsample=0.8,
                       colsample_bytree=0.9, max_depth=7)  # objective='reg:squarederror'
scores_train = []
scores = []
## 5-fold cross-validation
## KFold is used here because the target (price) is continuous;
## StratifiedKFold only supports classification targets and would raise an error.
sk = KFold(n_splits=5, shuffle=True, random_state=0)
for train_ind, val_ind in sk.split(X_data, Y_data):
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]
    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]
    xgr.fit(train_x, train_y)
    pred_train_xgb = xgr.predict(train_x)
    pred_xgb = xgr.predict(val_x)
    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    scores.append(score)
print('Train mae:', np.mean(scores_train))
print('Val mae:', np.mean(scores))
def build_model_xgb(x_train, y_train):
    model = xgb.XGBRegressor(n_estimators=150, learning_rate=0.1, gamma=0, subsample=0.8,
                             colsample_bytree=0.9, max_depth=7)  # objective='reg:squarederror'
    model.fit(x_train, y_train)
    return model

def build_model_lgb(x_train, y_train):
    estimator = lgb.LGBMRegressor(num_leaves=127, n_estimators=150)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
    }
    gbm = GridSearchCV(estimator, param_grid)
    gbm.fit(x_train, y_train)
    return gbm
## Split off a validation set
x_train,x_val,y_train,y_val = train_test_split(X_data,Y_data,test_size=0.3)
print('Train LGB...')
model_lgb = build_model_lgb(x_train,y_train)
val_lgb = model_lgb.predict(x_val)
MAE_lgb = mean_absolute_error(y_val,val_lgb)
print('MAE of val with lgb:',MAE_lgb)
print('Predict LGB...')
model_lgb_pre = build_model_lgb(X_data,Y_data)
subA_lgb = model_lgb_pre.predict(X_test)
print('Sta of Predict lgb:')
Sta_inf(subA_lgb)
print('Train XGB...')
model_xgb = build_model_xgb(x_train,y_train)
val_xgb = model_xgb.predict(x_val)
MAE_xgb = mean_absolute_error(y_val,val_xgb)
print('MAE of val with xgb:',MAE_xgb)
print('Predict XGB...')
model_xgb_pre = build_model_xgb(X_data,Y_data)
subA_xgb = model_xgb_pre.predict(X_test)
print('Sta of Predict XGB:')
Sta_inf(subA_xgb)
# Simple weighted ensemble: each model's weight is inversely proportional to its validation MAE,
# i.e. w_lgb = MAE_xgb / (MAE_xgb + MAE_lgb) and w_xgb = MAE_lgb / (MAE_xgb + MAE_lgb)
val_Weighted = (1 - MAE_lgb / (MAE_xgb + MAE_lgb)) * val_lgb + (1 - MAE_xgb / (MAE_xgb + MAE_lgb)) * val_xgb
val_Weighted[val_Weighted < 0] = 10  # the minimum prediction can be negative, but a real price never is, so post-correct those values
print('MAE of val with Weighted ensemble:',mean_absolute_error(y_val,val_Weighted))
sub_Weighted = (1-MAE_lgb/(MAE_xgb+MAE_lgb))*subA_lgb+(1-MAE_xgb/(MAE_xgb+MAE_lgb))*subA_xgb
## Check the statistics of the predictions
plt.hist(sub_Weighted)
plt.show()
plt.close()
sub = pd.DataFrame()
sub['SaleID'] = testA_data.SaleID
sub['price'] = sub_Weighted
sub.to_csv('./sub_Weighted.csv',index=False)
sub.head()
Business understanding ==> data structure ==> features with real-world meaning ==> abnormal/outlier data ==> inspect the data, then try a reasonable model.
Raw data plots; time-series plots; statistical plots (mean plots, box plots, violin plots, histograms, etc.); compare differences and similarities.
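As a minimal sketch of such statistical plots, assuming the train_data frame loaded above with the brand and price columns of this dataset:

import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of price per brand: shows spread and per-group outliers
sns.boxplot(x='brand', y='price', data=train_data)
plt.show()

# Violin plot: adds the density shape on top of the box-plot summary
sns.violinplot(x='brand', y='price', data=train_data)
plt.show()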
Check skewness, kurtosis, variance and other abnormal patterns; check the value range of each feature.
Variables with no linear correlation may still be nonlinearly associated; rank-based and information-based measures, as sketched below, are comparatively more broadly applicable.
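One way to probe for nonlinear association is a rank correlation or a mutual-information score; a small sketch, assuming the X_data/Y_data frames from the baseline above and the power column of this dataset:

from scipy.stats import spearmanr
from sklearn.feature_selection import mutual_info_regression

# Spearman rank correlation: catches any monotonic, not just linear, relationship
rho, p = spearmanr(X_data['power'], Y_data)
print('Spearman rho, power vs price:', rho)

# Mutual information: catches arbitrary (including non-monotonic) dependence
mi = mutual_info_regression(X_data, Y_data)
print(dict(zip(X_data.columns, mi.round(3))))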
Different models place different requirements on the data distribution and come with basic assumptions, e.g. that samples are identically distributed, that variables are mutually independent, or that the independent and dependent variables follow the same distribution. For a long-tailed distribution, a log transform compresses the tail and weakens its influence.
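A quick sketch of the log transform on the long-tailed price label (np.log1p keeps zero prices finite; np.expm1 inverts it):

import numpy as np
import matplotlib.pyplot as plt

# price is heavily right-skewed; log1p pulls the tail in
log_price = np.log1p(Y_data)

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].hist(Y_data, bins=50)
axes[0].set_title('price (raw)')
axes[1].hist(log_price, bins=50)
axes[1].set_title('log1p(price)')
plt.show()

# A model trained on log_price maps its predictions back with np.expm1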
Do the feature engineering first, then try it with a model; the feature-engineering workflow is much the same from model to model. Linear models generally need normalization (see the sketch below); XGBoost, LightGBM, and random forests do not require features on a common scale; deep models can skip much of the manual feature-engineering step.
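A minimal sketch of the normalization a linear model typically needs, reusing the x_train/x_val split from the baseline (fit the scaler on the training split only; the tree models above consume the features unscaled):

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Fit the scaler on the training features only, then apply it to both splits
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

ridge = Ridge(alpha=1.0)
ridge.fit(x_train_scaled, y_train)
print('Ridge val MAE:', mean_absolute_error(y_val, ridge.predict(x_val_scaled)))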
Skewness (left or right) indicates that the data may need special treatment; kurtosis measures how peaked the distribution is.
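Both can be read off directly with pandas, e.g. on the price label and the feature_cols selected above:

# pandas exposes skewness and excess kurtosis directly
print('Skewness:', train_data['price'].skew())
print('Kurtosis:', train_data['price'].kurt())

# Per-column view over all selected numeric features
print(train_data[feature_cols].skew().sort_values(ascending=False).head())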
That is precisely the purpose of EDA.
Relatively speaking, this usually comes down to experience, or to comparing alternatives against each other.
A mapping or a hash table; beyond that, unknown.
Covered in detail in Task 4 (modeling).
Task 4 and Task 5.
Feature engineering.
Work on it solo first, then team up; GitHub works for managing the iterations.
Train a model in advance on a much larger dataset, then transfer it for further training, similar to pretraining.
Start with simple rules as the prediction (e.g. linear regression, or a median times a weight); later, feed those rules in as features, fusing them with the engineered features inside the model, as sketched below.
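A hedged sketch of turning such a rule into a feature; the brand_median_price column name is made up for illustration, and brand comes from this dataset:

# Rule: the per-brand median price is itself a crude predictor;
# instead of using it directly, inject it as an extra feature.
brand_median_price = train_data.groupby('brand')['price'].median()

X_data_rule = X_data.copy()
X_data_rule['brand_median_price'] = train_data['brand'].map(brand_median_price)

X_test_rule = X_test.copy()
X_test_rule['brand_median_price'] = testA_data['brand'].map(brand_median_price)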
When samples do not fit the training distribution, first analyze why they are anomalous, then handle them manually.
PCA dimensionality reduction loses some of the feature information (look up the details).
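How much is lost can be quantified with the explained-variance ratio; a small sketch using the PCA import from the top of the script (n_components=10 is an arbitrary choice for illustration):

from sklearn.decomposition import PCA

# Keep 10 components and check how much variance they retain
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_data)

print('Variance kept per component:', pca.explained_variance_ratio_.round(3))
print('Total variance retained:', pca.explained_variance_ratio_.sum().round(3))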