-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Filter while recursing #5425
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Filter while recursing #5425
Changes from all commits
4761b89
e7951a4
b0ad49c
ce4a17f
97100c5
e07715e
fc9b714
44d21c4
2a5fabd
6eb2196
483d42b
028182e
de3ddf9
5db80b6
154a4e6
c793669
d9d0226
e600b8d
1f6a183
86e226a
eb77fd6
73426af
189bed0
46ffa92
26ddbed
132d574
f447790
ddea4f2
db0c1b8
fd3238a
62257ac
4436e51
0e39b92
e7a7ab6
28de390
517f2f1
3853d29
992f1bb
f557b41
90780d8
7ee6bb3
a899f0b
7a126d9
0b94184
c14a273
57f5cfd
d2613c6
3ca7c31
6a9bea8
c3f4d5c
28f7025
ed64dde
5899105
3129a9a
dc7047a
01f1cdd
b82ba41
46d52a0
a61f362
9ade39a
9127d20
b8f8b42
8f2df87
c6306ce
67727ee
f549ad0
8fa65cd
d33bcc6
c4fd548
17b3bd8
a959914
570b2c1
1a1f205
8a02ca9
63a3870
02c9422
8772d6d
be110fa
9fbc519
560bbe8
123c938
62cbd3f
268b957
2fa2b17
b7322a0
d9b5159
716276b
aa192fb
c23ecc2
99cd22c
6aa17da
976a703
67ee14a
96a28ba
16dc426
b5e8e7f
1121561
f4023c2
f2f5659
8eab6f9
fa0fd01
bf9bb5b
dce9cd1
bee1a05
c3a4fce
ee47c94
2603115
f0bcc74
0652f82
fbbf1df
7245c0e
813edc6
4a614b8
723d5c9
f4cc633
a7b88c4
db36d70
8c83c4d
6c8b86c
86f2b0e
1b03d0f
31b358a
1f834c6
a74c818
c2c63ce
fac92af
5153b46
37195fc
9a64de0
8b1b9a8
facf49e
e3ffa5d
77d1fc1
dbc674a
b6a5be4
48bcb8c
b3ffd6a
b1021d2
74cd099
35d79bf
e73bb8e
6e3b330
74aa2e3
d8fc647
56d4b38
b11fe1d
cd46cdb
88bc1e0
ca0b934
46025a0
8ba3de8
52aa3ba
eddb4f7
55464cd
a3e1a2a
9ee8794
56cd3d2
bc0660a
306089d
dea638d
4e357e3
b1a4946
0ffb009
d0f133c
2c43cbd
c90a3a0
9cdc7c3
fa3f8a1
34813ff
3219450
f41fc02
4a58b87
07c51e4
bb43c56
5976c68
b1e8878
b6e9764
3919400
89db5ea
52e2ca2
ba4accc
6c16b14
0a4baef
1000956
5160f01
17a3fbb
490e128
9c5ae02
efadea9
77de650
74770b7
e5bdcf1
ab613f2
aeed389
76ff8d9
ee8c7e8
188c58f
7beb63d
b434169
1e8a329
2fd3823
becfc0c
5c30d53
b6bb5ab
0f85352
dae78a1
f01a908
2e99111
b466f3d
ba0618a
57d3055
cb8d9be
47ee70c
af85c6e
be2a693
19559b1
4b7d616
f15cf5f
6f23701
819f2a4
6382ebf
adf2380
af3efaf
8cfffd3
83130db
53d7b36
0a6a563
423249f
edad847
d3572db
0b6949c
882f8a6
eeb12d8
2644a6f
5a43f8c
a32597d
34a823e
2105789
0432968
0b6a3e5
b1263fe
884ad4d
1e06fda
375efe1
b82c7b6
bddaff9
81a2b75
26f85e6
91723b6
0dccece
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| from dateutil.tz import tzlocal | ||
| from botocore.exceptions import ClientError | ||
|
|
||
| from awscli.customizations.s3.fileinfo import FileInfo | ||
| from awscli.customizations.s3.utils import find_bucket_key, get_file_stat | ||
| from awscli.customizations.s3.utils import BucketLister, create_warning, \ | ||
| find_dest_path_comp_key, EPOCH_TIME | ||
|
|
@@ -116,7 +117,8 @@ class FileGenerator(object): | |
| ``FileInfo`` objects to send to a ``Comparator`` or ``S3Handler``. | ||
| """ | ||
| def __init__(self, client, operation_name, follow_symlinks=True, | ||
| page_size=None, result_queue=None, request_parameters=None): | ||
| page_size=None, result_queue=None, request_parameters=None, | ||
| file_filter=None): | ||
| self._client = client | ||
| self.operation_name = operation_name | ||
| self.follow_symlinks = follow_symlinks | ||
|
|
@@ -127,6 +129,7 @@ def __init__(self, client, operation_name, follow_symlinks=True, | |
| self.request_parameters = {} | ||
| if request_parameters is not None: | ||
| self.request_parameters = request_parameters | ||
| self.file_filter = file_filter | ||
|
|
||
| def call(self, files): | ||
| """ | ||
|
|
@@ -168,8 +171,8 @@ def list_files(self, path, dir_op): | |
| outputs. It yields the file's source path, size, and last | ||
| update | ||
| """ | ||
| join, isdir, isfile = os.path.join, os.path.isdir, os.path.isfile | ||
| error, listdir = os.error, os.listdir | ||
| join, isdir = os.path.join, os.path.isdir | ||
| listdir = os.listdir | ||
| if not self.should_ignore_file(path): | ||
| if not dir_op: | ||
| stats = self._safely_get_file_stats(path) | ||
|
|
@@ -265,12 +268,18 @@ def should_ignore_file(self, path): | |
| file generation process. This includes symlinks that are not to be | ||
| followed and files that generate warnings. | ||
| """ | ||
| if os.path.isdir(path) and path.endswith(os.sep): | ||
| # Trailing slash must be removed to check if it is a symlink. | ||
| # This also creates a normalized form for use in autogenerated | ||
| # parent-directory include filters. | ||
| path = path[:-1] | ||
| if not self.follow_symlinks: | ||
| if os.path.isdir(path) and path.endswith(os.sep): | ||
| # Trailing slash must be removed to check if it is a symlink. | ||
| path = path[:-1] | ||
| if os.path.islink(path): | ||
| return True | ||
| if self.file_filter is not None: | ||
| # We received a file_filter. Let's see if the path matches: | ||
| if not list(self.file_filter.call([FileInfo(path)])): | ||
| return True | ||
| warning_triggered = self.triggers_warning(path) | ||
| if warning_triggered: | ||
| return True | ||
|
|
@@ -303,7 +312,21 @@ def triggers_warning(self, path): | |
| return True | ||
| return False | ||
|
|
||
| def list_objects(self, s3_path, dir_op): | ||
| def list_objects(self, *args, **kwargs): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be great if we could consolidate the calls to the def list_objects(self, s3_path, dir_op):
for path, data in self._list_objects_raw(s3_path, dir_op):
if self._should_ignore_from_file_filter(path):
continue
yield path, data |
||
| """ | ||
| Take unfiltered objects yielded by list_objects_raw() and return | ||
| only objects that pass through any filtering. | ||
| """ | ||
| if self.file_filter is None: | ||
| # With no filter, just pass data through. | ||
| yield from self.list_objects_raw(*args, **kwargs) | ||
| else: | ||
| # With a filter, check each path. | ||
| for path, data in self.list_objects_raw(*args, **kwargs): | ||
| if list(self.file_filter.call([FileInfo(path)])): | ||
| yield path, data | ||
|
|
||
| def list_objects_raw(self, s3_path, dir_op): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this method let's add an underscore to it to make it |
||
| """ | ||
| This function yields the appropriate object or objects under a | ||
| common prefix depending if the operation is on objects under a | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -101,6 +101,20 @@ def _full_path_patterns(self, original_patterns, rootdir): | |
| for pattern in original_patterns: | ||
| full_patterns.append( | ||
| (pattern[0], os.path.join(rootdir, pattern[1]))) | ||
| # When including paths deep in the directory tree, auto-include the | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm still not sure I understand the case this logic is optimizing for after reading this comment: #5425 (comment) and the code. Would it be possible to illustrate the use/edge case with a sample directory structure and CLI commands? I think that would be really helpful. In general, I'd prefer we do not change anything in the
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. I have to focus on another project for the time being, but I will circle back to this later for the technical changes requested.
The use case is fairly simple: If we make the FileGenerator no longer recurse into excluded directories, then ...but that seems onerous and would break current behavior. In my PR, this is now verified by the commit with description The best solution I thought of was to auto-insert the necessary filter, since we know the existing filtering logic does what we need. I wanted to keep the special-case code to a minimum.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the detailed examples that was really helpful in understanding it! In general with this PR, we definitely need to make sure we do not break current behavior (i.e. given any set of filters + file set, the CLI will continue to include/exclude the same set of files after this change with no needing to adjust the filters provided at the command line). While it looks like the coded that got added does handle the missing edge case that you illustrated, I am concerned that we are going to be missing other edge cases after seeing the edge case you raised. For example, suppose we had this directory layout: If we only wanted to upload the This would result in all of the However, with the approach we are currently taking, we would still not traversing the In general, I'm not optimistic the original approach of just applying the filters to the directory before traversing it is going to work. In terms of options to pursue, I think we either:
Thoughts? I want to think about this a bit more to see if I have any better suggestions, but I at least wanted to put this realization down now to start the discussion.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, that is rough. Your first idea seems best, but I don't know how to do that either. A third idea would be to give the user control, either by a global option |
||
| # parent directories. Otherwise, we may skip traversing into those | ||
| # parents due to an exclude. | ||
| if pattern[0] == 'include': | ||
| last_path = pattern[1] | ||
| while True: | ||
| parent = os.path.dirname(last_path) | ||
| if parent == last_path: | ||
| break | ||
| full_patterns.append( | ||
| ('include', os.path.join(rootdir, parent))) | ||
|
|
||
| last_path = parent | ||
|
|
||
| return full_patterns | ||
|
|
||
| def call(self, file_infos): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be great if we can move the file filter check into its own helper method that we could reuse across the local and s3 listing methods. So something like this:
This will make it a bit more reusable and improve readability as we are reducing the level of nesting/complexity in the if statements.