55
66logger = logging .getLogger (__name__ )
77
8- EmptySourceRange = collections .namedtuple ("EmptySourceRange" , ["start" , "end" ])
9- TreeRange = collections .namedtuple ("TreeRange" , ["start" , "end" , "filelist" , "friend_info" ])
class EmptySourceRange(object):
    """
    Empty source range of entries.

    Attributes:
        start (int): Starting entry of this range.

        end (int): Ending entry of this range.
    """

    def __init__(self, start, end):
        """
        Store the boundaries of the range.

        Args:
            start (int): Starting entry of this range.

            end (int): Ending entry of this range.
        """
        self.start = start
        self.end = end

    def __repr__(self):
        """
        Return a readable representation listing both boundaries, so that
        ranges remain legible when they are logged or printed.
        """
        return "{}(start={!r}, end={!r})".format(
            type(self).__name__, self.start, self.end)
22+
class TreeRange(object):
    """
    TTree range of entries. The entries are global in the chain made from the
    input files of the dataset.

    Attributes:
        start (int): Starting entry of this range.

        end (int): Ending entry of this range.

        filelist (list[str]): Input files of the dataset.

        friend_info (DistRDF.HeadNode.FriendInfo): Information about friend
            trees.
    """

    def __init__(self, start, end, filelist, friend_info):
        """
        Store the boundaries of the range together with the files and the
        friend information it refers to.

        Args:
            start (int): Starting entry of this range.

            end (int): Ending entry of this range.

            filelist (list[str]): Input files of the dataset.

            friend_info (DistRDF.HeadNode.FriendInfo): Information about
                friend trees.
        """
        self.start = start
        self.end = end
        self.filelist = filelist
        self.friend_info = friend_info

    def __repr__(self):
        """
        Return a readable representation listing every attribute, so that
        ranges remain legible when they are logged or printed.
        """
        return "{}(start={!r}, end={!r}, filelist={!r}, friend_info={!r})".format(
            type(self).__name__, self.start, self.end, self.filelist,
            self.friend_info)
44+
45+
class FileAndIndex(object):
    """
    This is a pair (filename, fileindex) that represents the index of the
    current filename in the list of input files of the dataset.

    Two instances holding the same filename and the same index compare equal
    and hash identically, so that a `set` of them keeps a single entry per
    input file. Since the hash is computed from the attributes, instances
    should not be modified while they are stored in a set or used as
    dictionary keys.

    Attributes:
        filename (str): The name of the file.

        fileindex (int): The index of the file in the list of input files.
    """

    def __init__(self, filename, fileindex):
        """
        Store the name of the file and its index.

        Args:
            filename (str): The name of the file.

            fileindex (int): The index of the file in the list of input
                files.
        """
        self.filename = filename
        self.fileindex = fileindex

    def __eq__(self, other):
        """
        Value-based equality: both the filename and the index must match.
        Comparison with an unrelated type is delegated to the other operand.
        """
        if not isinstance(other, FileAndIndex):
            return NotImplemented
        return (self.filename, self.fileindex) == (other.filename, other.fileindex)

    def __ne__(self, other):
        """
        Negation of `__eq__`. Defined explicitly because Python 2 does not
        derive it automatically.
        """
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return equal
        return not equal

    def __hash__(self):
        """
        Hash consistent with `__eq__`, needed to deduplicate instances in a
        set.
        """
        return hash((self.filename, self.fileindex))

    def __repr__(self):
        """Return a readable representation listing both attributes."""
        return "{}(filename={!r}, fileindex={!r})".format(
            type(self).__name__, self.filename, self.fileindex)
61+
class ChainCluster(object):
    """
    Descriptor of a cluster of entries in a TChain. Uses global entries rather
    than local.

    Attributes:
        start (int): The starting global entry of this cluster in the chain.

        end (int): The ending global entry of this cluster in the chain.

        offset (int): The offset of this cluster in the chain. That is, the
            starting entry of the file this cluster belongs to in the chain.

        filetuple (FileAndIndex): A pair with the name of the file this
            cluster belongs to and the index of that file in the chain.
    """

    def __init__(self, start, end, offset, filetuple):
        """
        Store the boundaries of the cluster and the file it belongs to.

        Args:
            start (int): The starting global entry of this cluster.

            end (int): The ending global entry of this cluster.

            offset (int): The starting entry in the chain of the file this
                cluster belongs to.

            filetuple (FileAndIndex): Name and index of the file this cluster
                belongs to.
        """
        self.start = start
        self.end = end
        self.offset = offset
        self.filetuple = filetuple

    def __lt__(self, other):
        """
        In `get_clustered_ranges` we need to retrieve the minimum and maximum
        entries in a certain list of clusters.

        Clusters are ordered by `start` first and by `end` second. Comparing
        the pair lexicographically gives a consistent ordering even when two
        clusters share a boundary or one contains the other, so the result of
        `min` and `max` does not depend on the order of the list.
        """
        return (self.start, self.end) < (other.start, other.end)

    def __repr__(self):
        """Return a readable representation listing every attribute."""
        return "{}(start={!r}, end={!r}, offset={!r}, filetuple={!r})".format(
            type(self).__name__, self.start, self.end, self.offset,
            self.filetuple)
1092
1193
1294def _n_even_chunks (iterable , n_chunks ):
@@ -79,10 +161,7 @@ def get_clusters(treename, filelist):
79161 """
80162
81163 clusters = []
82- cluster = collections .namedtuple (
83- "cluster" , ["start" , "end" , "offset" , "filetuple" ])
84- fileandindex = collections .namedtuple ("fileandindex" ,
85- ["filename" , "index" ])
164+
86165 offset = 0
87166 fileindex = 0
88167
@@ -97,8 +176,8 @@ def get_clusters(treename, filelist):
97176
98177 while start < entries :
99178 end = it ()
100- clusters .append (cluster (start + offset , end + offset , offset ,
101- fileandindex (filename , fileindex )))
179+ clusters .append (ChainCluster (start + offset , end + offset , offset ,
180+ FileAndIndex (filename , fileindex )))
102181 start = end
103182
104183 fileindex += 1
@@ -249,13 +328,13 @@ def get_clustered_ranges(clustersinfiles, npartitions, treename, friend_info):
249328 """
250329 clustered_ranges = [
251330 TreeRange (
252- min (clusters )[ 0 ] - clusters [0 ].offset , # type: int
253- max (clusters )[ 1 ] - clusters [0 ].offset , # type: int
331+ min (clusters ). start - clusters [0 ].offset , # type: int
332+ max (clusters ). end - clusters [0 ].offset , # type: int
254333 [
255334 filetuple .filename
256335 for filetuple in sorted (set ([
257336 cluster .filetuple for cluster in clusters
258- ]), key = lambda curtuple : curtuple [ 1 ] )
337+ ]), key = lambda curtuple : curtuple . fileindex )
259338 ], # type: list[str]
260339 friend_info # type: DistRDF.HeadNode.FriendInfo
261340 ) # type: collections.namedtuple
0 commit comments