私はXGBoostを使い始めたばかりで、次の2つのことをしようとしています。
1. PythonでXGBoostを使用して、複数の入力変数に基づいて出力変数を予測する
2. どの入力変数が出力変数とより強い相関(良好な関係)を持っているかを調べる
1と2のどちらでも正しい結果を得ることができません。XGBoostは初心者なので、どうか助けてください。前もって感謝します。
参考:(Jason Brownleeのブログ、Kaggle)
CODE:難しいの
import pandas as pd
from sklearn import preprocessing
import numpy as np
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
def main():
    """Normalise the imputed dataset, train an XGBoost regressor and report results.

    The pipeline, in order:

    1. (Optional, disabled by default) ``xls_to_csv`` converts the raw Excel
       export to a CSV with missing values replaced by column means.
    2. The imputed CSV is loaded, string columns are integer-encoded, every
       column is z-score normalised and the result is written to a new CSV.
    3. The normalised CSV is split into features / target and then into
       train / test partitions.
    4. An XGBoost booster is trained, its per-feature split counts are saved
       to a CSV, and predictions for the test partition are printed next to
       the true target values.

    Returns:
        None. All results are written to disk or printed.
    """
    # Removing the blank fields and filling with mean values
    def xls_to_csv():
        """Convert the raw Excel export into a mean-imputed CSV file."""
        df = pd.read_excel(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_S2-2017-02-14-103304-836.xlsx")
        # Discard identifier / bookkeeping columns before imputing.
        df.drop(['aggregation', 'lot', '____________________wafer', 'wafer', 'lot wafer'], axis=1, inplace=True)
        # Replace each missing value with the mean of its own column.
        df_1 = df.apply(lambda x: x.fillna(x.mean()), axis=0)
        df_1.to_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_after_impute.csv", index=False)

    # One-off preprocessing step: uncomment to regenerate the imputed CSV.
    #xls_to_csv()

    # Applying normalization
    df1 = pd.read_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_after_impute.csv")
    for feature in df1.columns:  # Loop through all columns in the dataframe
        if df1[feature].dtype == 'object':  # Only apply for columns with categorical strings
            df1[feature] = pd.Categorical(df1[feature]).codes
    # Z-score every column (this includes the target column as well).
    df2 = (df1 - df1.mean()) / df1.std()
    # Columns that became entirely NaN (e.g. zero standard deviation) are
    # dropped. `thresh` is intentionally not passed: newer pandas versions
    # reject `how` and `thresh` being supplied together.
    df2 = df2.dropna(axis=1, how='all')
    df2.to_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_after_impute_after_nml.csv", index=False)

    def get_data():
        """Load the normalised CSV and separate the target from the features.

        Returns:
            A ``(features, x_train, y_train)`` tuple: the feature column
            index, the feature frame and the target series.
        """
        train = pd.read_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_after_impute_after_nml.csv")
        # `pop` removes the target column from `train` and returns it.
        y_train = train.pop('7;IDDQ_IPD;tested_pct;sbin')
        features = train.columns
        x_train = train[features]
        return features, x_train, y_train

    features, x_train, y_train = get_data()

    # Split features and target in ONE call so both receive the same row
    # shuffle. Splitting them in two separate calls shuffles each
    # independently, which pairs every feature row with an unrelated label.
    final_train, final_test, final_y_train, final_y_test = train_test_split(
        x_train, y_train, test_size=0.2)

    # XGboost modelling starts here
    xgdmat = xgb.DMatrix(final_train, label=final_y_train)  # Create our DMatrix to make XGBoost more efficient
    # Grid Search CV optimized settings
    # NOTE(review): newer XGBoost releases call this objective
    # 'reg:squarederror' -- confirm against the installed version.
    our_params = {'eta': 0.1, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8,
                  'objective': 'reg:linear', 'max_depth': 3, 'min_child_weight': 1}
    final_gb = xgb.train(our_params, xgdmat, num_boost_round=1000)

    # Persist the per-feature importance scores, sorted ascending.
    importances = final_gb.get_fscore()
    importance_frame = pd.DataFrame({'Importance': list(importances.values()), 'Feature': list(importances.keys())})
    # `DataFrame.sort` no longer exists in pandas; `sort_values` replaces it.
    importance_frame.sort_values('Importance', inplace=True)
    importance_frame.to_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_scores.csv", index=False)

    # Analysing the test results
    testdmat = xgb.DMatrix(final_test)
    y_pred = final_gb.predict(testdmat)
    # Single-argument print call so the output is the same on Python 2 and 3.
    print("{} \n{}".format(y_pred, final_y_test))
if __name__ == '__main__':
    # Run the pipeline only when executed as a script, not on import.
    main()