Я новичок в сверкающей воде. Я пытался разработать проект для него в intellij, но couldnt.I не смог найти много ресурсов для того же в Интернете. Так что любой может рассказать, как разработать простой проект с использованием h20 и искра в scala с IntelliJ.Как установить искру H2O, используя scala в Intellij?
Я попробовал этот код:
import org.apache.spark.h2o.H2OContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.{h2o, SparkConf, SparkContext}
import water.H2OClientApp
import water.fvec._
import org.apache.spark.h2o._
object test {
def main(args: Array[String]) {
val conf = new SparkConf().setMaster("local[*]").setAppName("testing")
val sc = new SparkContext(conf)
val source = getClass.getResource("data.txt")
val distF = sc.textFile(source.getFile)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val table1 = distF.map(_.split(",")).map(p => Person(p(0), p(1),p(2),p(3),p(4),p(5),p(6))).toDF()
import org.apache.spark.h2o._
val h2oContext = new H2OContext(sc).start()
import h2oContext._
import org.apache.spark.rdd.RDD
val mydf2:h2o.RDD[Person] = h2oContext.createH2ORDD(table1)
println("Count of mydf2================>>>>>>>>"+mydf2.count())
}
}
case class Person(Country: String, ASN: String,Time_Stamp: String,Metric_A: String,Co_Server: String,Bytes: String,Send_Time:String);
А для этого я получил ошибку. Ошибка часть журнала генерируемый:
15/12/24 03:45:53 WARN TaskSetManager: Lost task 1.0 in stage 5.0 (TID 17, localhost): java.lang.IllegalArgumentException: argument type mismatch
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at org.apache.spark.rdd.H2ORDD$$anon$1.next(H2ORDD.scala:106)
at org.apache.spark.rdd.H2ORDD$$anon$1.next(H2ORDD.scala:64)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1555)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1121)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1121)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/12/24 03:45:53 ERROR TaskSetManager: Task 1 in stage 5.0 failed 1 times; aborting job
15/12/24 03:45:53 INFO TaskSchedulerImpl: Removed TaskSet 5.0, whose tasks have all completed, from pool
15/12/24 03:45:53 INFO TaskSetManager: Lost task 0.0 in stage 5.0 (TID 16) on executor localhost: java.lang.IllegalArgumentException (argument type mismatch) [duplicate 1]
15/12/24 03:45:53 INFO TaskSchedulerImpl: Removed TaskSet 5.0, whose tasks have all completed, from pool
15/12/24 03:45:53 INFO TaskSchedulerImpl: Cancelling stage 5
15/12/24 03:45:53 INFO DAGScheduler: ResultStage 5 (count at test.scala:32) failed in 0.038 s
15/12/24 03:45:53 INFO DAGScheduler: Job 5 failed: count at test.scala:32, took 0.050463 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 5.0 failed 1 times, most recent failure: Lost task 1.0 in stage 5.0 (TID 17, localhost): java.lang.IllegalArgumentException: argument type mismatch
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at org.apache.spark.rdd.H2ORDD$$anon$1.next(H2ORDD.scala:106)
at org.apache.spark.rdd.H2ORDD$$anon$1.next(H2ORDD.scala:64)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1555)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1121)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1121)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1822)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1835)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1848)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1919)
at org.apache.spark.rdd.RDD.count(RDD.scala:1121)
at test$.main(test.scala:32)
at test.main(test.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
Caused by: java.lang.IllegalArgumentException: argument type mismatch
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at org.apache.spark.rdd.H2ORDD$$anon$1.next(H2ORDD.scala:106)
at org.apache.spark.rdd.H2ORDD$$anon$1.next(H2ORDD.scala:64)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1555)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1121)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1121)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Пожалуйста, дайте мне знать, где я Wnt неправильно и какие изменения я должен делать и почему.
Чтобы добавить к второму пункту Михала, убедитесь, что у вас установлен плагин scala для intellij. Затем сделайте следующее: 'git clone https: // github.com/h2oai/sparkling-water.git'; 'cd sparkling-water'; './ gradlew idea'; 'open sparkling-water.ipr' –
Спасибо Michal.Can u скажите мне, возникла ли эта ошибка из-за преобразования RDD в h2oRDD? или это только из-за конфигурации зависимости .... –
Можете ли вы предоставить небольшую информацию? Какую версию игристой воды и искры вы используете? Из моего пользовательского опыта лучше использовать DataFrame вместо использования сильно типизированного RDD. – Michal