SW-321

Composite function call fails with an inner cbind()

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: None
    • Fix Version/s: 1.6.9, 2.0.6, 2.1.1
    • Component/s: None
    • Labels: None

      Description

      Calling as_spark_frame(df1.cbind(df2)) inline fails, while assigning the cbind() result to a variable first and then converting works.

      repro:

      import numpy as np
      from pyspark.sql.types import Row

      # 2M random ints in [1, 1000000]; np.random.random_integers is
      # deprecated in newer NumPy (np.random.randint(1, 1000001, 2000000)
      # is the modern equivalent)
      ri1 = np.random.random_integers(1, 1000000, 2000000)
      df1 = sc.parallelize(ri1).repartition(5).map(lambda x: Row(int(x))).cache()
      
      ri2 = np.random.random_integers(1, 1000000, 2000000)
      df2 = sc.parallelize(ri2).repartition(5).map(lambda x: Row(int(x))).cache()
      
      h2o_df1 = context.as_h2o_frame(df1)
      h2o_df2 = context.as_h2o_frame(df2)
      
      # Two-step conversion works:
      combined = h2o_df1.cbind(h2o_df2)
      copy1 = context.as_spark_frame(combined)
      copy1.count()
      # 2000000
      
      # But the inline (composite) call fails:
      copy2 = context.as_spark_frame(h2o_df1.cbind(h2o_df2))
      copy2.count()
      
      ---------------------------------------------------------------------------
      Py4JJavaError                             Traceback (most recent call last)
      <ipython-input-40-d55cbbb3ecbc> in <module>()
            1 copy2 = context.as_spark_frame(h2o_df1.cbind(h2o_df2))
      ----> 2 copy2.count()
      
      /opt/spark/2.0.2/python/pyspark/sql/dataframe.py in count(self)
          297         2
          298         """
      --> 299         return int(self._jdf.count())
          300 
          301     @ignore_unicode_prefix
      
      /opt/spark/2.0.2/python/lib/py4j-src.zip/py4j/java_gateway.py in __call__(self, *args)
         1131         answer = self.gateway_client.send_command(command)
         1132         return_value = get_return_value(
      -> 1133             answer, self.gateway_client, self.target_id, self.name)
         1134 
         1135         for temp_arg in temp_args:
      
      /opt/spark/2.0.2/python/pyspark/sql/utils.py in deco(*a, **kw)
           61     def deco(*a, **kw):
           62         try:
      ---> 63             return f(*a, **kw)
           64         except py4j.protocol.Py4JJavaError as e:
           65             s = e.java_exception.toString()
      
      /opt/spark/2.0.2/python/lib/py4j-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
          317                 raise Py4JJavaError(
          318                     "An error occurred while calling {0}{1}{2}.\n".
      --> 319                     format(target_id, ".", name), value)
          320             else:
          321                 raise Py4JError(
      
      Py4JJavaError: An error occurred while calling o511.count.
      : java.lang.RuntimeException: Rollups not possible, because Vec was deleted: $04ff09000000ffffffffff7196961d66889eac470028e14b8eaa$
      	at water.fvec.RollupStats.get(RollupStats.java:319)
      	at water.fvec.RollupStats.get(RollupStats.java:346)
      	at water.fvec.Vec.rollupStats(Vec.java:806)
      	at water.fvec.Vec.isInt(Vec.java:773)
      	at org.apache.spark.h2o.utils.ReflectionUtils$.detectSupportedNumericType(ReflectionUtils.scala:158)
      	at org.apache.spark.h2o.utils.ReflectionUtils$.supportedType(ReflectionUtils.scala:148)
      	at org.apache.spark.h2o.utils.ReflectionUtils$.dataTypeFor(ReflectionUtils.scala:141)
      	at org.apache.spark.h2o.converters.H2ODataFrame$anonfun$1.apply(H2ODataFrame.scala:51)
      	at org.apache.spark.h2o.converters.H2ODataFrame$anonfun$1.apply(H2ODataFrame.scala:51)
      	at scala.collection.TraversableLike$anonfun$map$1.apply(TraversableLike.scala:234)
      	at scala.collection.TraversableLike$anonfun$map$1.apply(TraversableLike.scala:234)
      	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
      	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
      	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
      	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
      	at org.apache.spark.h2o.converters.H2ODataFrame.<init>(H2ODataFrame.scala:51)
      	at org.apache.spark.sql.H2OFrameRelation.buildScan(H2OSQLContextUtils.scala:59)
      	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$anonfun$9.apply(DataSourceStrategy.scala:267)
      	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$anonfun$9.apply(DataSourceStrategy.scala:267)
      	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$anonfun$pruneFilterProject$1.apply(DataSourceStrategy.scala:303)
      	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$anonfun$pruneFilterProject$1.apply(DataSourceStrategy.scala:302)
      	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProjectRaw(DataSourceStrategy.scala:379)
      	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProject(DataSourceStrategy.scala:298)
      	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:263)
      	at org.apache.spark.sql.catalyst.planning.QueryPlanner$anonfun$1.apply(QueryPlanner.scala:60)
      	at org.apache.spark.sql.catalyst.planning.QueryPlanner$anonfun$1.apply(QueryPlanner.scala:60)
      	at scala.collection.Iterator$anon$12.nextCur(Iterator.scala:434)
      	at scala.collection.Iterator$anon$12.hasNext(Iterator.scala:440)
      	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:61)
      	at org.apache.spark.sql.execution.SparkPlanner.plan(SparkPlanner.scala:47)
      	at org.apache.spark.sql.execution.SparkPlanner$anonfun$plan$1$anonfun$apply$1.applyOrElse(SparkPlanner.scala:51)
      	at org.apache.spark.sql.execution.SparkPlanner$anonfun$plan$1$anonfun$apply$1.applyOrElse(SparkPlanner.scala:48)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$transformUp$1.apply(TreeNode.scala:308)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$transformUp$1.apply(TreeNode.scala:308)
      	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:307)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$4.apply(TreeNode.scala:305)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$4.apply(TreeNode.scala:305)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$5.apply(TreeNode.scala:328)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:186)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:326)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:305)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$4.apply(TreeNode.scala:305)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$4.apply(TreeNode.scala:305)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$5.apply(TreeNode.scala:328)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:186)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:326)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:305)
      	at org.apache.spark.sql.execution.SparkPlanner$anonfun$plan$1.apply(SparkPlanner.scala:48)
      	at org.apache.spark.sql.execution.SparkPlanner$anonfun$plan$1.apply(SparkPlanner.scala:48)
      	at scala.collection.Iterator$anon$11.next(Iterator.scala:409)
      	at org.apache.spark.sql.execution.SparkPlanner$anonfun$plan$1$anonfun$apply$1.applyOrElse(SparkPlanner.scala:51)
      	at org.apache.spark.sql.execution.SparkPlanner$anonfun$plan$1$anonfun$apply$1.applyOrElse(SparkPlanner.scala:48)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$transformUp$1.apply(TreeNode.scala:308)
      	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$transformUp$1.apply(TreeNode.scala:308)
      	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:307)
      	at org.apache.spark.sql.execution.SparkPlanner$anonfun$plan$1.apply(SparkPlanner.scala:48)
      	at org.apache.spark.sql.execution.SparkPlanner$anonfun$plan$1.apply(SparkPlanner.scala:48)
      	at scala.collection.Iterator$anon$11.next(Iterator.scala:409)
      	at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:78)
      	at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:76)
      	at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:83)
      	at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:83)
      	at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2555)
      	at org.apache.spark.sql.Dataset.count(Dataset.scala:2226)
      	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
      	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
      	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
      	at java.lang.reflect.Method.invoke(Method.java:606)
      	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
      	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
      	at py4j.Gateway.invoke(Gateway.java:280)
      	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
      	at py4j.commands.CallCommand.execute(CallCommand.java:79)
      	at py4j.GatewayConnection.run(GatewayConnection.java:214)
      	at java.lang.Thread.run(Thread.java:745)
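      
      The trace shows the count failing because a backing Vec of the combined
      frame was already deleted when Sparkling Water computed its rollup stats.
      That is consistent with the inline cbind() result being a temporary
      H2OFrame that nothing on the Python side keeps alive. As a hedged
      workaround sketch (not from this ticket; h2o.assign exists in h2o-py,
      but the key name "combined_key" is illustrative):
      
      import h2o
      
      # Keep a Python reference so the temporary frame is not garbage-collected
      combined = h2o_df1.cbind(h2o_df2)
      
      # Optionally pin the frame under a stable key in the H2O cluster
      pinned = h2o.assign(combined, "combined_key")
      
      copy = context.as_spark_frame(pinned)
      copy.count()  # expected: 2000000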
       


            People

            • Assignee: Jakub Hava
            • Reporter: Nick Karpov
            • Votes: 0
            • Watchers: 3
