Skip to content

Commit 8035eeb

Browse files
committed
Revert "Use urllib handler for s3:// and gs://, improve url_exists through HEAD requests (#34324)"
This reverts commit db8f115.
1 parent 57383a2 commit 8035eeb

File tree

4 files changed

+121
-77
lines changed

4 files changed

+121
-77
lines changed

lib/spack/spack/gcs_handler.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,9 @@
33
#
44
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
55
import urllib.response
6-
from urllib.error import URLError
7-
from urllib.request import BaseHandler
86

97
import spack.util.url as url_util
8+
import spack.util.web as web_util
109

1110

1211
def gcs_open(req, *args, **kwargs):
@@ -17,13 +16,8 @@ def gcs_open(req, *args, **kwargs):
1716
gcsblob = gcs_util.GCSBlob(url)
1817

1918
if not gcsblob.exists():
20-
raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
19+
raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
2120
stream = gcsblob.get_blob_byte_stream()
2221
headers = gcsblob.get_blob_headers()
2322

2423
return urllib.response.addinfourl(stream, headers, url)
25-
26-
27-
class GCSHandler(BaseHandler):
28-
def gs_open(self, req):
29-
return gcs_open(req)

lib/spack/spack/s3_handler.py

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import urllib.error
77
import urllib.request
88
import urllib.response
9-
from io import BufferedReader, BytesIO, IOBase
9+
from io import BufferedReader, IOBase
1010

1111
import spack.util.s3 as s3_util
1212
import spack.util.url as url_util
@@ -42,7 +42,7 @@ def __getattr__(self, key):
4242
return getattr(self.raw, key)
4343

4444

45-
def _s3_open(url, method="GET"):
45+
def _s3_open(url):
4646
parsed = url_util.parse(url)
4747
s3 = s3_util.get_s3_session(url, method="fetch")
4848

@@ -52,29 +52,27 @@ def _s3_open(url, method="GET"):
5252
if key.startswith("/"):
5353
key = key[1:]
5454

55-
if method not in ("GET", "HEAD"):
56-
raise urllib.error.URLError(
57-
"Only GET and HEAD verbs are currently supported for the s3:// scheme"
58-
)
59-
60-
try:
61-
if method == "GET":
62-
obj = s3.get_object(Bucket=bucket, Key=key)
63-
# NOTE(opadron): Apply workaround here (see above)
64-
stream = WrapStream(obj["Body"])
65-
elif method == "HEAD":
66-
obj = s3.head_object(Bucket=bucket, Key=key)
67-
stream = BytesIO()
68-
except s3.ClientError as e:
69-
raise urllib.error.URLError(e) from e
55+
obj = s3.get_object(Bucket=bucket, Key=key)
7056

57+
# NOTE(opadron): Apply workaround here (see above)
58+
stream = WrapStream(obj["Body"])
7159
headers = obj["ResponseMetadata"]["HTTPHeaders"]
7260

7361
return url, headers, stream
7462

7563

76-
class UrllibS3Handler(urllib.request.BaseHandler):
64+
class UrllibS3Handler(urllib.request.HTTPSHandler):
7765
def s3_open(self, req):
7866
orig_url = req.get_full_url()
79-
url, headers, stream = _s3_open(orig_url, method=req.get_method())
80-
return urllib.response.addinfourl(stream, headers, url)
67+
from botocore.exceptions import ClientError # type: ignore[import]
68+
69+
try:
70+
url, headers, stream = _s3_open(orig_url)
71+
return urllib.response.addinfourl(stream, headers, url)
72+
except ClientError as err:
73+
raise urllib.error.URLError(err) from err
74+
75+
76+
S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
77+
78+
open = S3OpenerDirector.open

lib/spack/spack/test/web.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -223,10 +223,7 @@ def paginate(self, *args, **kwargs):
223223

224224
class MockClientError(Exception):
225225
def __init__(self):
226-
self.response = {
227-
"Error": {"Code": "NoSuchKey"},
228-
"ResponseMetadata": {"HTTPStatusCode": 404},
229-
}
226+
self.response = {"Error": {"Code": "NoSuchKey"}}
230227

231228

232229
class MockS3Client(object):
@@ -245,13 +242,7 @@ def delete_object(self, *args, **kwargs):
245242
def get_object(self, Bucket=None, Key=None):
246243
self.ClientError = MockClientError
247244
if Bucket == "my-bucket" and Key == "subdirectory/my-file":
248-
return {"ResponseMetadata": {"HTTPHeaders": {}}}
249-
raise self.ClientError
250-
251-
def head_object(self, Bucket=None, Key=None):
252-
self.ClientError = MockClientError
253-
if Bucket == "my-bucket" and Key == "subdirectory/my-file":
254-
return {"ResponseMetadata": {"HTTPHeaders": {}}}
245+
return True
255246
raise self.ClientError
256247

257248

lib/spack/spack/util/web.py

Lines changed: 99 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import traceback
1818
from html.parser import HTMLParser
1919
from urllib.error import URLError
20-
from urllib.request import HTTPSHandler, Request, build_opener
20+
from urllib.request import Request, urlopen
2121

2222
import llnl.util.lang
2323
import llnl.util.tty as tty
@@ -26,8 +26,6 @@
2626
import spack
2727
import spack.config
2828
import spack.error
29-
import spack.gcs_handler
30-
import spack.s3_handler
3129
import spack.url
3230
import spack.util.crypto
3331
import spack.util.gcs as gcs_util
@@ -37,28 +35,6 @@
3735
from spack.util.executable import CommandNotFoundError, which
3836
from spack.util.path import convert_to_posix_path
3937

40-
41-
def _urlopen():
42-
s3 = spack.s3_handler.UrllibS3Handler()
43-
gcs = spack.gcs_handler.GCSHandler()
44-
45-
# One opener with HTTPS ssl enabled
46-
with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
47-
48-
# One opener with HTTPS ssl disabled
49-
without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
50-
51-
# And dynamically dispatch based on the config:verify_ssl.
52-
def dispatch_open(*args, **kwargs):
53-
opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
54-
return opener.open(*args, **kwargs)
55-
56-
return dispatch_open
57-
58-
59-
#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
60-
urlopen = llnl.util.lang.Singleton(_urlopen)
61-
6238
#: User-Agent used in Request objects
6339
SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
6440

@@ -83,12 +59,43 @@ def handle_starttag(self, tag, attrs):
8359
self.links.append(val)
8460

8561

62+
def uses_ssl(parsed_url):
63+
if parsed_url.scheme == "https":
64+
return True
65+
66+
if parsed_url.scheme == "s3":
67+
endpoint_url = os.environ.get("S3_ENDPOINT_URL")
68+
if not endpoint_url:
69+
return True
70+
71+
if url_util.parse(endpoint_url, scheme="https").scheme == "https":
72+
return True
73+
74+
elif parsed_url.scheme == "gs":
75+
tty.debug("(uses_ssl) GCS Blob is https")
76+
return True
77+
78+
return False
79+
80+
8681
def read_from_url(url, accept_content_type=None):
8782
url = url_util.parse(url)
83+
context = None
8884

8985
# Timeout in seconds for web requests
9086
timeout = spack.config.get("config:connect_timeout", 10)
9187

88+
# Don't even bother with a context unless the URL scheme is one that uses
89+
# SSL certs.
90+
if uses_ssl(url):
91+
if spack.config.get("config:verify_ssl"):
92+
# User wants SSL verification, and it *can* be provided.
93+
context = ssl.create_default_context()
94+
else:
95+
# User has explicitly indicated that they do not want SSL
96+
# verification.
97+
context = ssl._create_unverified_context()
98+
9299
url_scheme = url.scheme
93100
url = url_util.format(url)
94101
if sys.platform == "win32" and url_scheme == "file":
@@ -104,15 +111,15 @@ def read_from_url(url, accept_content_type=None):
104111
# one round-trip. However, most servers seem to ignore the header
105112
# if you ask for a tarball with Accept: text/html.
106113
req.get_method = lambda: "HEAD"
107-
resp = urlopen(req, timeout=timeout)
114+
resp = _urlopen(req, timeout=timeout, context=context)
108115

109116
content_type = get_header(resp.headers, "Content-type")
110117

111118
# Do the real GET request when we know it's just HTML.
112119
req.get_method = lambda: "GET"
113120

114121
try:
115-
response = urlopen(req, timeout=timeout)
122+
response = _urlopen(req, timeout=timeout, context=context)
116123
except URLError as err:
117124
raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
118125

@@ -344,6 +351,12 @@ def url_exists(url, curl=None):
344351
Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
345352
method defined by `config:url_fetch_method` is used.
346353
354+
If the method is `curl`, it also uses the following configuration option:
355+
356+
* config:verify_ssl (str): Perform SSL verification
357+
358+
Otherwise, `urllib` will be used.
359+
347360
Arguments:
348361
url (str): URL whose existence is being checked
349362
curl (spack.util.executable.Executable or None): (optional) curl
@@ -354,11 +367,31 @@ def url_exists(url, curl=None):
354367
tty.debug("Checking existence of {0}".format(url))
355368
url_result = url_util.parse(url)
356369

357-
# Use curl if configured to do so
358-
use_curl = spack.config.get(
359-
"config:url_fetch_method", "urllib"
360-
) == "curl" and url_result.scheme not in ("gs", "s3")
361-
if use_curl:
370+
# Check if a local file
371+
local_path = url_util.local_file_path(url_result)
372+
if local_path:
373+
return os.path.exists(local_path)
374+
375+
# Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
376+
if url_result.scheme == "s3":
377+
# Check for URL-specific connection information
378+
s3 = s3_util.get_s3_session(url_result, method="fetch")
379+
380+
try:
381+
s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
382+
return True
383+
except s3.ClientError as err:
384+
if err.response["Error"]["Code"] == "NoSuchKey":
385+
return False
386+
raise err
387+
388+
# Check if Google Storage .. urllib-based fetch
389+
if url_result.scheme == "gs":
390+
gcs = gcs_util.GCSBlob(url_result)
391+
return gcs.exists()
392+
393+
# Otherwise, use the configured fetch method
394+
if spack.config.get("config:url_fetch_method") == "curl":
362395
curl_exe = _curl(curl)
363396
if not curl_exe:
364397
return False
@@ -371,14 +404,13 @@ def url_exists(url, curl=None):
371404
_ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
372405
return curl_exe.returncode == 0
373406

374-
# Otherwise use urllib.
407+
# If we get here, then the only other fetch method option is urllib.
408+
# So try to "read" from the URL and assume that *any* non-throwing
409+
# response contains the resource represented by the URL.
375410
try:
376-
urlopen(
377-
Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
378-
timeout=spack.config.get("config:connect_timeout", 10),
379-
)
411+
read_from_url(url)
380412
return True
381-
except URLError as e:
413+
except (SpackWebError, URLError) as e:
382414
tty.debug("Failure reading URL: " + str(e))
383415
return False
384416

@@ -661,6 +693,35 @@ def _spider(url, collect_nested):
661693
return pages, links
662694

663695

696+
def _urlopen(req, *args, **kwargs):
697+
"""Wrapper for compatibility with old versions of Python."""
698+
url = req
699+
try:
700+
url = url.get_full_url()
701+
except AttributeError:
702+
pass
703+
704+
del kwargs["context"]
705+
706+
opener = urlopen
707+
if url_util.parse(url).scheme == "s3":
708+
import spack.s3_handler
709+
710+
opener = spack.s3_handler.open
711+
elif url_util.parse(url).scheme == "gs":
712+
import spack.gcs_handler
713+
714+
opener = spack.gcs_handler.gcs_open
715+
716+
try:
717+
return opener(req, *args, **kwargs)
718+
except TypeError as err:
719+
# If the above fails because of 'context', call without 'context'.
720+
if "context" in kwargs and "context" in str(err):
721+
del kwargs["context"]
722+
return opener(req, *args, **kwargs)
723+
724+
664725
def find_versions_of_archive(
665726
archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
666727
):

0 commit comments

Comments
 (0)