[DF] Move from namedtuples to classes in Ranges.py

vepadulano · vepadulano · commit edc1d780e58c · 2021-07-02T12:28:55.000+02:00
diff --git a/bindings/experimental/distrdf/python/DistRDF/Ranges.py b/bindings/experimental/distrdf/python/DistRDF/Ranges.py
@@ -5,8 +5,90 @@
 
 logger = logging.getLogger(__name__)
 
-EmptySourceRange = collections.namedtuple("EmptySourceRange", ["start", "end"])
-TreeRange = collections.namedtuple("TreeRange", ["start", "end", "filelist", "friend_info"])
+class EmptySourceRange(object):
+    """
+    Range of entries of an empty source, described only by the two entries
+    that delimit it.
+
+    Attributes:
+    start (int): Starting entry of this range.
+
+    end (int): Ending entry of this range.
+    """
+
+    def __init__(self, start, end):
+        """Store the boundaries of the range."""
+        self.start, self.end = start, end
+
+class TreeRange(object):
+    """
+    Range of entries of a TTree, bundled with the list of input files and the
+    information about friend trees. The entries are global in the chain made
+    from the input files of the dataset.
+
+    Attributes:
+    start (int): Starting entry of this range.
+
+    end (int): Ending entry of this range.
+
+    filelist (list[str]): Input files of the dataset.
+
+    friend_info (DistRDF.HeadNode.FriendInfo): Information about friend trees.
+    """
+
+    def __init__(self, start, end, filelist, friend_info):
+        """Store the range boundaries, the input files and the friend info."""
+        self.start, self.end = start, end
+        self.filelist = filelist
+        self.friend_info = friend_info
+
+
+class FileAndIndex(object):
+    """
+    Pair (filename: str, fileindex: int): a file and its index in the list of
+    input files. Hashable and compared by value so a `set` deduplicates them.
+    """
+
+    def __init__(self, filename, fileindex):
+        self.filename, self.fileindex = filename, fileindex
+
+    def __eq__(self, other):
+        return (isinstance(other, FileAndIndex) and
+                vars(self) == vars(other))  # same filename and fileindex
+
+    def __hash__(self):
+        return hash((self.filename, self.fileindex))
+
+class ChainCluster(object):
+    """
+    Describes one cluster of entries in a TChain. Entry numbers are global
+    rather than local.
+
+    Attributes:
+    start (int): The starting global entry of this cluster in the chain.
+
+    end (int): The ending global entry of this cluster in the chain.
+
+    offset (int): The offset of this cluster in the chain. That is, the starting
+        entry of the file this cluster belongs to in the chain.
+
+    filetuple (FileAndIndex): A pair with the name of the file this cluster
+        belongs to and the index of that file in the chain.
+    """
+
+    def __init__(self, start, end, offset, filetuple):
+        """Store the cluster boundaries, its offset and its file pair."""
+        self.start, self.end = start, end
+        self.offset, self.filetuple = offset, filetuple
+
+    def __lt__(self, other):
+        """
+        A cluster is less than another one only when both its start and its
+        end come strictly before the other's. This ordering is what lets
+        `get_clustered_ranges` call `min` and `max` on a list of clusters.
+        """
+        starts_before = self.start < other.start
+        return starts_before and self.end < other.end
 
 
 def _n_even_chunks(iterable, n_chunks):
@@ -79,10 +161,7 @@ def get_clusters(treename, filelist):
     """
 
     clusters = []
-    cluster = collections.namedtuple(
-        "cluster", ["start", "end", "offset", "filetuple"])
-    fileandindex = collections.namedtuple("fileandindex",
-                                          ["filename", "index"])
+
     offset = 0
     fileindex = 0
 
@@ -97,8 +176,8 @@ def get_clusters(treename, filelist):
 
         while start < entries:
             end = it()
-            clusters.append(cluster(start + offset, end + offset, offset,
-                                    fileandindex(filename, fileindex)))
+            clusters.append(ChainCluster(start + offset, end + offset, offset,
+                                    FileAndIndex(filename, fileindex)))
             start = end
 
         fileindex += 1
@@ -249,13 +328,13 @@ def get_clustered_ranges(clustersinfiles, npartitions, treename, friend_info):
     """
     clustered_ranges = [
         TreeRange(
-            min(clusters)[0] - clusters[0].offset,  # type: int
-            max(clusters)[1] - clusters[0].offset,  # type: int
+            min(clusters).start - clusters[0].offset,  # type: int
+            max(clusters).end - clusters[0].offset,  # type: int
             [
                 filetuple.filename
                 for filetuple in sorted(set([
                     cluster.filetuple for cluster in clusters
-                ]), key=lambda curtuple: curtuple[1])
+                ]), key=lambda curtuple: curtuple.fileindex)
             ],  # type: list[str]
             friend_info  # type: DistRDF.HeadNode.FriendInfo
         )  # type: collections.namedtuple