2016-06-26 4 views
2

Sparkのランダムフォレストで交差検証(クロスバリデーション)を実行しようとしています。(タイトル:Sparkランダムフォレストの交差検証エラー)

from pyspark.ml import Pipeline 
from pyspark.ml.classification import RandomForestClassifier 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 

# Reproduction script: builds a 16-row labeled dataset, converts it to a
# DataFrame, and grid-searches a RandomForestClassifier with 2-fold
# cross-validation.
# NOTE(review): `nds` and `LabeledPoint` are not imported or defined in the
# visible snippet -- presumably they come from the surrounding session; confirm.
data = nds.sc.parallelize([ 
LabeledPoint(0.0, [0,402,6,0]), 
LabeledPoint(0.0, [3,500,3,0]), 
LabeledPoint(1.0, [1,590,1,1]), 
LabeledPoint(1.0, [3,328,5,0]), 
LabeledPoint(1.0, [4,351,4,0]), 
LabeledPoint(0.0, [2,372,2,0]), 
LabeledPoint(0.0, [4,302,5,0]), 
LabeledPoint(1.0, [1,387,2,0]), 
LabeledPoint(1.0, [1,419,3,0]), 
LabeledPoint(0.0, [1,370,5,0]), 
LabeledPoint(0.0, [1,410,4,0]), 
LabeledPoint(0.0, [2,509,7,1]), 
LabeledPoint(0.0, [1,307,5,0]), 
LabeledPoint(0.0, [0,424,4,1]), 
LabeledPoint(0.0, [1,509,2,1]), 
LabeledPoint(1.0, [3,361,4,0]), 
]) 


# NOTE(review): the accepted answer warns that passing explicit column names
# here does not preserve the correct label/features order.
train=data.toDF(['label','features']) 

# Number of cross-validation folds.
numfolds =2 

rf = RandomForestClassifier(labelCol="label", featuresCol="features") 
evaluator = MulticlassClassificationEvaluator() 


# Grid of 3 maxDepth x 2 impurity x 3 featureSubsetStrategy candidates.
# NOTE(review): featureSubsetStrategy is given integers here; the traceback
# reports "java.lang.Integer cannot be cast to java.lang.String" raised from
# RandomForestParams validation, and the answer states this param must be a
# strategy string.
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth,  
[4,8,10]).addGrid(rf.impurity, ['entropy','gini']).addGrid(rf.featureSubsetStrategy, [6,8,10]).build() 

# Single-stage pipeline wrapping the classifier.
pipeline = Pipeline(stages=[rf]) 

crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=evaluator, 
    numFolds= numfolds) 

# This is the call that raises the Py4JJavaError shown in the traceback.
model = crossval.fit(train) 

paramGridが私の入力をリストとして読み込んでいないように見え、次のエラー

Py4JJavaError        Traceback (most recent call last) 
<ipython-input-87-7ea70f89086a> in <module>() 
66  numFolds=num) 
67 
---> 68 model = crossval.fit(train) 

/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params) 
67     return self.copy(params)._fit(dataset) 
68    else: 
---> 69     return self._fit(dataset) 
70   else: 
71    raise ValueError("Params must be either a param map or a list/tuple of param maps, " 

/opt/spark/current/python/pyspark/ml/tuning.py in _fit(self, dataset) 
237    train = df.filter(~condition) 
238    for j in range(numModels): 
--> 239     model = est.fit(train, epm[j]) 
240     # TODO: duplicate evaluator to take extra params from input 
241     metric = eva.evaluate(model.transform(validation, epm[j])) 

/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params) 
65   elif isinstance(params, dict): 
66    if params: 
---> 67     return self.copy(params)._fit(dataset) 
68    else: 
69     return self._fit(dataset) 

/opt/spark/current/python/pyspark/ml/pipeline.py in _fit(self, dataset) 
211      dataset = stage.transform(dataset) 
212     else: # must be an Estimator 
--> 213      model = stage.fit(dataset) 
214      transformers.append(model) 
215      if i < indexOfLastEstimator: 

/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params) 
67     return self.copy(params)._fit(dataset) 
68    else: 
---> 69     return self._fit(dataset) 
70   else: 
71    raise ValueError("Params must be either a param map or a list/tuple of param maps, " 

/opt/spark/current/python/pyspark/ml/wrapper.py in _fit(self, dataset) 
130 
131  def _fit(self, dataset): 
--> 132   java_model = self._fit_java(dataset) 
133   return self._create_model(java_model) 
134 

/opt/spark/current/python/pyspark/ml/wrapper.py in _fit_java(self, dataset) 
126   :return: fitted Java model 
127   """ 
--> 128   self._transfer_params_to_java() 
129   return self._java_obj.fit(dataset._jdf) 
130 

/opt/spark/current/python/pyspark/ml/wrapper.py in _transfer_params_to_java(self) 
80   for param in self.params: 
81    if param in paramMap: 
---> 82     pair = self._make_java_param_pair(param, paramMap[param]) 
83     self._java_obj.set(pair) 
84 

/opt/spark/current/python/pyspark/ml/wrapper.py in _make_java_param_pair(self, param, value) 
71   java_param = self._java_obj.getParam(param.name) 
72   java_value = _py2java(sc, value) 
---> 73   return java_param.w(java_value) 
74 
75  def _transfer_params_to_java(self): 

    /opt/spark/current/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args) 
811   answer = self.gateway_client.send_command(command) 
812   return_value = get_return_value(
    --> 813    answer, self.gateway_client, self.target_id, self.name) 
814 
815   for temp_arg in temp_args: 

/opt/spark/current/python/pyspark/sql/utils.py in deco(*a, **kw) 
43  def deco(*a, **kw): 
44   try: 
    ---> 45    return f(*a, **kw) 
46   except py4j.protocol.Py4JJavaError as e: 
47    s = e.java_exception.toString() 

/opt/spark/current/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 
306     raise Py4JJavaError(
307      "An error occurred while calling {0}{1}{2}.\n". 
--> 308      format(target_id, ".", name), value) 
309    else: 
310     raise Py4JError(

Py4JJavaError: An error occurred while calling o1434.w. 
: java.lang.ClassCastException: java.lang.Integer cannot be cast to  java.lang.String 
at org.apache.spark.ml.tree.RandomForestParams$$anonfun$5.apply(treeParams.scala:340) 
at org.apache.spark.ml.param.Param.validate(params.scala:71) 
at org.apache.spark.ml.param.ParamPair.<init>(params.scala:509) 
at org.apache.spark.ml.param.Param.$minus$greater(params.scala:85) 
at org.apache.spark.ml.param.Param.w(params.scala:82) 
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 
at java.lang.reflect.Method.invoke(Method.java:497) 
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) 
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381) 
at py4j.Gateway.invoke(Gateway.java:259) 
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) 
at py4j.commands.CallCommand.execute(CallCommand.java:79) 
at py4j.GatewayConnection.run(GatewayConnection.java:209) 
at java.lang.Thread.run(Thread.java:745) 

を取得します。別の形式や回避策はありますか?どんな助けでも感謝します。

回答

2

rf.featureSubsetStrategyに正しくない値を渡しています。これは戦略を表す文字列でなければならず、auto、all、onethird、sqrt、log2 という値がサポートされています。参照:RandomForestClassifier.featureSubsetStrategy.doc

また、data.toDF(['label','features'])も使用しないでください。列の正しい順序が保持されません。代わりに次を使用してください:

# Call toDF() without explicit column names (suggested in place of
# data.toDF(['label','features'])).
data.toDF() 

または、列名を変更したい場合:

from operator import attrgetter 

# attrgetter("label", "features") returns a (label, features) tuple for each
# record, so the two names passed to toDF() line up with those values in order.
data.map(attrgetter("label", "features")).toDF(["some_name", "some_other_name"]) 

最後に、ラベル列はインデックス化するか、必要なメタデータを付与する必要があります。参照:How can I declare a Column as a categorical feature in a DataFrame for use in ml

+0

rf.featureSubsetStrategy, ['auto', 'onethird'] に置き換えましたが、同じエラーが発生しました。その後、ParamGridBuilderから rf.featureSubsetStrategy, ['auto', 'onethird'] を削除しましたが、再び同じエラーが発生しました。 – mikeL

+0

@mikeL あなたのコードには 'ParamGrid' とは関係のない他の問題もありますが、この特定の問題を修正すれば同じエラーは発生しなくなります。 – zero323

+0

はい、別のエラーでしたし、確かに他の問題もあります。データフレームの特徴量とラベルが間違った位置にあるようです。 – mikeL