
Spark/Scala Error - Printing the Confusion Matrix for Logistic Regression

I am running logistic regression on a dataset, and everything appears to work until I try to print the confusion matrix, at which point I get the error below. How can I deal with it?

import org.apache.spark.mllib.evaluation.MulticlassMetrics 
val predictionAndLabels = results.select($"prediction", $"label").as[(Double, Double)].rdd 

val metrics = new MulticlassMetrics(predictionAndLabels) 

println("Confusion matrix:") 
println(metrics.confusionMatrix) 

Log output:

root 
|-- ORDER_QUANTITY: integer (nullable = true) 
|-- IS_BOUGHT: integer (nullable = true) 
|-- CUSTOMER_ID: long (nullable = true) 
|-- SNAPSHOT_DAY: string (nullable = true) 
|-- WEEK_DAY: string (nullable = true) 
|-- DEVICE_TYPE: string (nullable = true) 
|-- HIT_TIME: string (nullable = true) 
|-- MARKETPLACE: string (nullable = true) 
|-- ASIN: string (nullable = true) 
|-- VEL: string (nullable = true) 
|-- GL_PRODUCT_GROUP: string (nullable = true) 
|-- IS_FT: string (nullable = true) 
|-- INSTOCK_STATUS: string (nullable = true) 
|-- PDD_WD: string (nullable = true) 
|-- FT_DAYS: string (nullable = true) 
|-- IN_STOCK_QUANTITY: string (nullable = true) 
|-- ASIN_PRICE: double (nullable = true) 
|-- PRICE_GAP: double (nullable = true) 
|-- IS_DISCOUNT: string (nullable = true) 

logRegDataAll: org.apache.spark.sql.DataFrame = [label: int, DEVICE_TYPE: string ... 10 more fields] 
logRegData: org.apache.spark.sql.DataFrame = [label: int, DEVICE_TYPE: string ... 10 more fields] 
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder} 
import org.apache.spark.ml.linalg.Vectors 
DeviceTypeIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_703e2f28bf96 
MarketplaceIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_4bd47e3e31c5 
VelocityIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_744315e59c01 
GLIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_30a9705e2305 
FTIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_fb2e7ec8b38c 
InStockIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_15ceee49c6a9 
PDDIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_a3987fcecd10 
InStockQtyIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_9c0bc369a617 
IsDicountIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_cf2902b30b63 
DeviceTypeEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_5560566be7cb 
MarketplaceEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_d2c6ca94f073 
VelocityEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_0f1f237e9700 
GLEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_70baf14c780a 
FTEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_bb3312ac9c1e 
InStockEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_f273d6b316b6 
PDDEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_a663d8560283 
InStockQtyEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_8300bb250ef0 
IsDiscountEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_f5eed05b0391 
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_474029a89693 
training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, DEVICE_TYPE: string ... 10 more fields] 
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, DEVICE_TYPE: string ... 10 more fields] 
import org.apache.spark.ml.Pipeline 
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_cf2a6574a539 
pipeline: org.apache.spark.ml.Pipeline = pipeline_2be53eb735dc 
model: org.apache.spark.ml.PipelineModel = pipeline_2be53eb735dc     
results: org.apache.spark.sql.DataFrame = [label: int, DEVICE_TYPE: string ... 32 more fields] 
import org.apache.spark.mllib.evaluation.MulticlassMetrics 
predictionAndLabels: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[1413] at rdd at <console>:292 
metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = [email protected] 

The error:

Confusion matrix: 
16/12/28 15:15:43 ERROR Executor: Exception in task 2.0 in stage 771.0 (TID 1651) 
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double) 
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) 
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192) 
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47) 
    at org.apache.spark.scheduler.Task.run(Task.scala:86) 
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) 
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 
    at java.lang.Thread.run(Thread.java:745) 
Caused by: org.apache.spark.SparkException: Unseen label: 4500. 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170) 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166) 
    ... 15 more 
16/12/28 15:15:43 ERROR TaskSetManager: Task 2 in stage 771.0 failed 1 times; aborting job 
16/12/28 15:15:43 ERROR Executor: Exception in task 0.0 in stage 771.0 (TID 1649) 
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double) 
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) 
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192) 
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47) 
    at org.apache.spark.scheduler.Task.run(Task.scala:86) 
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) 
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 
    at java.lang.Thread.run(Thread.java:745) 
Caused by: org.apache.spark.SparkException: Unseen label: 3583. 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170) 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166) 
    ... 15 more 
16/12/28 15:15:43 ERROR Executor: Exception in task 1.0 in stage 771.0 (TID 1650) 
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double) 
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) 
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192) 
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47) 
    at org.apache.spark.scheduler.Task.run(Task.scala:86) 
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) 
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 
    at java.lang.Thread.run(Thread.java:745) 
Caused by: org.apache.spark.SparkException: Unseen label: 8710. 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170) 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166) 
    ... 15 more 
org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 771.0 failed 1 times, most recent failure: Lost task 2.0 in stage 771.0 (TID 1651, localhost): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double) 
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) 
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192) 
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47) 
    at org.apache.spark.scheduler.Task.run(Task.scala:86) 
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) 
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 
    at java.lang.Thread.run(Thread.java:745) 
Caused by: org.apache.spark.SparkException: Unseen label: 4500. 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170) 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166) 
    ... 15 more 

Driver stacktrace: 
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454) 
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442) 
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441) 
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) 
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) 
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441) 
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811) 
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811) 
    at scala.Option.foreach(Option.scala:257) 
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811) 
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667) 
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622) 
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611) 
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) 
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632) 
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873) 
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886) 
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1899) 
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1913) 
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912) 
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) 
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) 
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:358) 
    at org.apache.spark.rdd.RDD.collect(RDD.scala:911) 
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:745) 
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:744) 
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) 
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) 
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:358) 
    at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:744) 
    at org.apache.spark.mllib.evaluation.MulticlassMetrics.tpByClass$lzycompute(MulticlassMetrics.scala:48) 
    at org.apache.spark.mllib.evaluation.MulticlassMetrics.tpByClass(MulticlassMetrics.scala:44) 
    at org.apache.spark.mllib.evaluation.MulticlassMetrics.labels$lzycompute(MulticlassMetrics.scala:223) 
    at org.apache.spark.mllib.evaluation.MulticlassMetrics.labels(MulticlassMetrics.scala:223) 
    at org.apache.spark.mllib.evaluation.MulticlassMetrics.confusionMatrix(MulticlassMetrics.scala:68) 
    ... 159 elided 
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double) 
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) 
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192) 
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) 
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47) 
    at org.apache.spark.scheduler.Task.run(Task.scala:86) 
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) 
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 
    at java.lang.Thread.run(Thread.java:745) 
Caused by: org.apache.spark.SparkException: Unseen label: 4500. 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170) 
    at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166) 
    ... 15 more 

EDIT: If I do not split my data into training and test sets, I don't get the error. How can I avoid the "unseen label" error when fitting the pipeline?


You have an unseen label: "Unseen label: 4500". The reason you only see it when you try to print is that everything up to that point is evaluated lazily; printing the confusion matrix is the first action that forces the computation. –
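To illustrate the lazy-evaluation point: nothing in the pipeline actually runs until an action is triggered, so forcing one on the predictions surfaces the failure before any metrics code is involved. A minimal sketch, reusing the predictionAndLabels RDD from the question:

// count() is an action: it forces the pipeline's StringIndexer UDFs to
// execute and will raise the same "Unseen label" exception immediately,
// without involving MulticlassMetrics at all.
predictionAndLabels.count()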


@evan058 Which label is unseen? What is causing the error? I found this (https://issues.apache.org/jira/browse/SPARK-8764), but I don't understand what it means. – Lior


Based on the error message, the unseen label is '4500'. I can't tell why this happens without digging into your data. –
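One way to dig into the data is to compare the distinct values of each indexed string column across the two splits. The stack trace does not say which StringIndexer failed, so the column below is only a candidate; a minimal sketch using the training and test datasets from the log output:

// Values present in the test split but absent from training are exactly the
// ones StringIndexer will report as "Unseen label". IN_STOCK_QUANTITY is
// used only as an example; repeat for each indexed string column.
val unseenValues = test.select("IN_STOCK_QUANTITY").distinct()
  .except(training.select("IN_STOCK_QUANTITY").distinct())
unseenValues.show()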

Answer


I was able to avoid the error by telling the StringIndexer how to handle invalid (unseen) values:

val DeviceTypeIndexer = new StringIndexer()
  .setInputCol("DEVICE_TYPE")
  .setOutputCol("DeviceTypeIndex")
  .setHandleInvalid("skip")  // drop rows whose value was not seen during fitting
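Note that setHandleInvalid("skip") silently drops the offending rows, so the confusion matrix is computed on a smaller test set. Given the EDIT above (the error disappears when the data is not split), an alternative is to fit each StringIndexer on the full dataset before splitting, so every value is seen at fit time. A minimal sketch, assuming the logRegData DataFrame from the log output and an illustrative 70/30 split:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.StringIndexer

// Fit the indexer on ALL the data so no value is unseen at transform time.
val DeviceTypeIndexerModel = new StringIndexer()
  .setInputCol("DEVICE_TYPE")
  .setOutputCol("DeviceTypeIndex")
  .fit(logRegData)

// Split afterwards; the pre-fitted model is a Transformer and can be used
// as a pipeline stage directly.
val Array(training, test) = logRegData.randomSplit(Array(0.7, 0.3), seed = 12345)
val pipeline = new Pipeline()
  .setStages(Array(DeviceTypeIndexerModel /* , the other indexers, encoders, assembler, lr */))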