私は機械学習の初心者で、KDD Cup 1999のデータセットに対してKNNアルゴリズムを実行しようとしています。分類器を作成し、およそ92%の精度でデータセットを予測することができました。(タイトル:Python sklearnのKNNでN分割交差検証を行う方法)
しかし、テスト用とトレーニング用のデータセットが固定されており、データの分け方によって結果が変わる可能性があるため、この精度は必ずしも信頼できないことに気づきました。
どのようにすればN分割交差検証を行うことができますか?
以下は、これまでの私のコードです:
import pandas
from time import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
# TRAINING
# Column names assigned to the header-less KDD Cup 1999 CSV files, in file
# order; the final column is the class label.
col_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label",
]
kdd_data_10percent = pandas.read_csv("data/kdd_10pc", header=None, names=col_names)

# Feature columns are every column except the three left out of the float
# cast below and the target. Deriving the list from col_names (same order as
# before) keeps the two from drifting apart.
non_feature_cols = ("protocol_type", "service", "flag", "label")
num_features = [name for name in col_names if name not in non_feature_cols]
features = kdd_data_10percent[num_features].astype(float)

# Binary target: every label other than "normal." is relabelled "attack.".
# Work on a copy so the loaded frame keeps its original labels.
labels = kdd_data_10percent['label'].copy()
labels.loc[labels != 'normal.'] = 'attack.'
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(labels.value_counts())

# TODO: Normalising of data
# TODO: Principal Component Analysis - Data reduction
clf = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', leaf_size=500)

# Time the fit on the full training frame.
fit_started = time()
clf.fit(features, labels)
fit_seconds = time() - fit_started
print("Classifier trained in {} seconds".format(round(fit_seconds, 3)))
# TESTING
# Load the evaluation file with the same column layout as the training data.
kdd_data_test = pandas.read_csv("data/corrected", header=None, names=col_names)

# Collapse every non-"normal." label into "attack.". A single .loc assignment
# writes into the frame itself; the previous chained indexing
# (frame['label'][mask] = ...) can assign to a temporary copy instead.
kdd_data_test.loc[kdd_data_test['label'] != 'normal.', 'label'] = 'attack.'
kdd_data_test[num_features] = kdd_data_test[num_features].astype(float)

# Only the 10% "test" side of this split is scored; the remaining 90% of the
# evaluation file is discarded. The fixed random_state keeps the scored
# subset reproducible between runs.
_, features_test, _, labels_test = train_test_split(
    kdd_data_test[num_features],
    kdd_data_test['label'],
    test_size=0.1,
    random_state=42)

# Time prediction on the held-out slice.
t0 = time()
pred = clf.predict(features_test)
tt = time() - t0
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Predicted in {} seconds".format(round(tt, 3)))

# accuracy_score takes (y_true, y_pred). The value is a classification
# accuracy, so the message says so rather than "R squared".
acc = accuracy_score(labels_test, pred)
print("Accuracy is {}.".format(round(acc, 4)))
ご指導いただければ幸いです。どうもありがとうございました!scikit-learnのページ(http://scikit-learn.org/stable/modules/cross_validation.html)から