Skip to content

Commit 6110909

Browse files
committed
[DF] Change signature of optimize_npartitions in backends
This method does not really need an extra positional parameter whose only purpose is to be returned when the backend cannot "optimize" the partition count. Instead, switch to returning the default `MIN_NPARTITIONS` data member when nothing better can be done.
1 parent a9f8bac commit 6110909

File tree

3 files changed

+8
-8
lines changed

3 files changed

+8
-8
lines changed

bindings/experimental/distrdf/python/DistRDF/Backends/Base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -305,12 +305,12 @@ def distribute_unique_paths(self, paths):
305305
"""
306306
pass
307307

308-
def optimize_npartitions(self, npartitions):
308+
def optimize_npartitions(self):
309309
"""
310310
Distributed backends may optimize the number of partitions of the
311311
current dataset or leave it as it is.
312312
"""
313-
return npartitions
313+
return self.MIN_NPARTITIONS
314314

315315
def distribute_files(self, files_paths):
316316
"""

bindings/experimental/distrdf/python/DistRDF/Backends/Spark/Backend.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,16 @@ def __init__(self, sparkcontext=None):
6464
else:
6565
self.sc = pyspark.SparkContext.getOrCreate()
6666

67-
def optimize_npartitions(self, npartitions):
67+
def optimize_npartitions(self):
6868
numex = self.sc.getConf().get("spark.executor.instances")
6969
numcoresperex = self.sc.getConf().get("spark.executor.cores")
7070

71-
if numex:
72-
if numcoresperex:
71+
if numex is not None:
72+
if numcoresperex is not None:
7373
return int(numex) * int(numcoresperex)
7474
return int(numex)
7575
else:
76-
return npartitions
76+
return self.MIN_NPARTITIONS
7777

7878
def ProcessAndMerge(self, ranges, mapper, reducer):
7979
"""
@@ -158,6 +158,6 @@ def make_dataframe(self, *args, **kwargs):
158158
# 2. An educated guess according to the backend, using the backend's
159159
# `optimize_npartitions` function
160160
# 3. Set `npartitions` to 2
161-
npartitions = kwargs.pop("npartitions", self.optimize_npartitions(Base.BaseBackend.MIN_NPARTITIONS))
161+
npartitions = kwargs.pop("npartitions", self.optimize_npartitions())
162162
headnode = HeadNode.get_headnode(npartitions, *args)
163163
return DataFrame.RDataFrame(headnode, self)

bindings/experimental/distrdf/test/backend/test_spark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def test_optimize_npartitions_with_num_executors(self):
7070
sc = pyspark.SparkContext(conf=sconf)
7171
backend = Backend.SparkBackend(sparkcontext=sc)
7272

73-
self.assertEqual(backend.optimize_npartitions(1), 10)
73+
self.assertEqual(backend.optimize_npartitions(), 10)
7474

7575

7676
class OperationSupportTest(unittest.TestCase):

0 commit comments

Comments (0)