|
7 | 7 | Utility functions for parsing, formatting, and manipulating URLs.
|
8 | 8 | """
|
9 | 9 |
|
10 |
| -import itertools |
11 |
| -import posixpath |
12 | 10 | import re
|
13 | 11 | import sys
|
14 | 12 |
|
|
23 | 21 | is_windows = sys.platform == 'win32'
|
24 | 22 |
|
25 | 23 |
|
26 |
| -def _split_all(path): |
27 |
| - """Split path into its atomic components. |
28 |
| -
|
29 |
| - Returns the shortest list, L, of strings such that posixpath.join(*L) == |
30 |
| - path and posixpath.split(element) == ('', element) for every element in L |
31 |
| - except possibly the first. This first element may possibly have the value |
32 |
| - of '/'. |
33 |
| - """ |
34 |
| - result = [] |
35 |
| - a = path |
36 |
| - old_a = None |
37 |
| - while a != old_a: |
38 |
| - (old_a, (a, b)) = a, posixpath.split(a) |
39 |
| - |
40 |
| - if a or b: |
41 |
| - result.insert(0, b or '/') |
42 |
| - |
43 |
| - return result |
44 |
| - |
45 |
| - |
46 | 24 | def local_file_path(url):
|
47 | 25 | """Get a local file path from a url.
|
48 | 26 |
|
@@ -124,168 +102,31 @@ def format(parsed_url):
|
124 | 102 | return parsed_url.geturl()
|
125 | 103 |
|
126 | 104 |
|
127 |
| -def join(base_url, path, *extra, **kwargs): |
128 |
| - """Joins a base URL with one or more local URL path components |
129 |
| -
|
130 |
| - If resolve_href is True, treat the base URL as though it where the locator |
131 |
| - of a web page, and the remaining URL path components as though they formed |
132 |
| - a relative URL to be resolved against it (i.e.: as in posixpath.join(...)). |
133 |
| - The result is an absolute URL to the resource to which a user's browser |
134 |
| - would navigate if they clicked on a link with an "href" attribute equal to |
135 |
| - the relative URL. |
136 |
| -
|
137 |
| - If resolve_href is False (default), then the URL path components are joined |
138 |
| - as in posixpath.join(). |
139 |
| -
|
140 |
| - Note: file:// URL path components are not canonicalized as part of this |
141 |
| - operation. To canonicalize, pass the joined url to format(). |
142 |
| -
|
143 |
| - Examples: |
144 |
| - base_url = 's3://bucket/index.html' |
145 |
| - body = fetch_body(prefix) |
146 |
| - link = get_href(body) # link == '../other-bucket/document.txt' |
147 |
| -
|
148 |
| - # wrong - link is a local URL that needs to be resolved against base_url |
149 |
| - spack.util.url.join(base_url, link) |
150 |
| - 's3://bucket/other_bucket/document.txt' |
151 |
| -
|
152 |
| - # correct - resolve local URL against base_url |
153 |
| - spack.util.url.join(base_url, link, resolve_href=True) |
154 |
| - 's3://other_bucket/document.txt' |
155 |
| -
|
156 |
| - prefix = 'https://mirror.spack.io/build_cache' |
157 |
| -
|
158 |
| - # wrong - prefix is just a URL prefix |
159 |
| - spack.util.url.join(prefix, 'my-package', resolve_href=True) |
160 |
| - 'https://mirror.spack.io/my-package' |
161 |
| -
|
162 |
| - # correct - simply append additional URL path components |
163 |
| - spack.util.url.join(prefix, 'my-package', resolve_href=False) # default |
164 |
| - 'https://mirror.spack.io/build_cache/my-package' |
165 |
| -
|
166 |
| - # For canonicalizing file:// URLs, take care to explicitly differentiate |
167 |
| - # between absolute and relative join components. |
168 |
| -
|
169 |
| - # '$spack' is not an absolute path component |
170 |
| - join_result = spack.util.url.join('/a/b/c', '$spack') ; join_result |
171 |
| - 'file:///a/b/c/$spack' |
172 |
| - spack.util.url.format(join_result) |
173 |
| - 'file:///a/b/c/opt/spack' |
174 |
| -
|
175 |
| - # '/$spack' *is* an absolute path component |
176 |
| - join_result = spack.util.url.join('/a/b/c', '/$spack') ; join_result |
177 |
| - 'file:///$spack' |
178 |
| - spack.util.url.format(join_result) |
179 |
| - 'file:///opt/spack' |
180 |
| - """ |
181 |
| - paths = [ |
182 |
| - (x) if isinstance(x, str) |
183 |
| - else x.geturl() |
184 |
| - for x in itertools.chain((base_url, path), extra)] |
185 |
| - |
186 |
| - paths = [convert_to_posix_path(x) for x in paths] |
187 |
| - n = len(paths) |
188 |
| - last_abs_component = None |
189 |
| - scheme = '' |
190 |
| - for i in range(n - 1, -1, -1): |
191 |
| - obj = urllib.parse.urlparse( |
192 |
| - paths[i], scheme='', allow_fragments=False) |
193 |
| - |
194 |
| - scheme = obj.scheme |
195 |
| - |
196 |
| - # in either case the component is absolute |
197 |
| - if scheme or obj.path.startswith('/'): |
198 |
| - if not scheme: |
199 |
| - # Without a scheme, we have to go back looking for the |
200 |
| - # next-last component that specifies a scheme. |
201 |
| - for j in range(i - 1, -1, -1): |
202 |
| - obj = urllib.parse.urlparse( |
203 |
| - paths[j], scheme='', allow_fragments=False) |
204 |
| - |
205 |
| - if obj.scheme: |
206 |
| - paths[i] = '{SM}://{NL}{PATH}'.format( |
207 |
| - SM=obj.scheme, |
208 |
| - NL=( |
209 |
| - (obj.netloc + '/') |
210 |
| - if obj.scheme != 's3' else ''), |
211 |
| - PATH=paths[i][1:]) |
212 |
| - break |
213 |
| - |
214 |
| - last_abs_component = i |
215 |
| - break |
216 |
| - |
217 |
| - if last_abs_component is not None: |
218 |
| - paths = paths[last_abs_component:] |
219 |
| - if len(paths) == 1: |
220 |
| - result = urllib.parse.urlparse( |
221 |
| - paths[0], scheme='file', allow_fragments=False) |
222 |
| - |
223 |
| - # another subtlety: If the last argument to join() is an absolute |
224 |
| - # file:// URL component with a relative path, the relative path |
225 |
| - # needs to be resolved. |
226 |
| - if result.scheme == 'file' and result.netloc: |
227 |
| - result = urllib.parse.ParseResult( |
228 |
| - scheme=result.scheme, |
229 |
| - netloc='', |
230 |
| - path=posixpath.abspath(result.netloc + result.path), |
231 |
| - params=result.params, |
232 |
| - query=result.query, |
233 |
| - fragment=None) |
234 |
| - |
235 |
| - return result.geturl() |
236 |
| - |
237 |
| - return _join(*paths, **kwargs) |
238 |
| - |
239 |
| - |
240 |
| -def _join(base_url, path, *extra, **kwargs): |
241 |
| - base_url = parse(base_url) |
242 |
| - resolve_href = kwargs.get('resolve_href', False) |
243 |
| - |
244 |
| - (scheme, netloc, base_path, params, query, _) = base_url |
245 |
| - scheme = scheme.lower() |
246 |
| - |
247 |
| - path_tokens = [ |
248 |
| - part for part in itertools.chain( |
249 |
| - _split_all(path), |
250 |
| - itertools.chain.from_iterable( |
251 |
| - _split_all(extra_path) for extra_path in extra)) |
252 |
| - if part and part != '/'] |
253 |
| - |
254 |
| - base_path_args = ['/fake-root'] |
255 |
| - if scheme == 's3': |
256 |
| - if netloc: |
257 |
| - base_path_args.append(netloc) |
258 |
| - |
259 |
| - if base_path.startswith('/'): |
260 |
| - base_path = base_path[1:] |
261 |
| - |
262 |
| - base_path_args.append(base_path) |
263 |
| - |
264 |
| - if resolve_href: |
265 |
| - new_base_path, _ = posixpath.split(posixpath.join(*base_path_args)) |
266 |
| - base_path_args = [new_base_path] |
267 |
| - |
268 |
| - base_path_args.extend(path_tokens) |
269 |
| - base_path = posixpath.relpath(posixpath.join(*base_path_args), '/fake-root') |
270 |
| - |
271 |
| - if scheme == 's3': |
272 |
| - path_tokens = [ |
273 |
| - part for part in _split_all(base_path) |
274 |
| - if part and part != '/'] |
275 |
| - |
276 |
| - if path_tokens: |
277 |
| - netloc = path_tokens.pop(0) |
278 |
| - base_path = posixpath.join('', *path_tokens) |
279 |
| - |
280 |
| - if sys.platform == "win32": |
281 |
| - base_path = convert_to_posix_path(base_path) |
282 |
| - |
283 |
| - return format(urllib.parse.ParseResult(scheme=scheme, |
284 |
| - netloc=netloc, |
285 |
| - path=base_path, |
286 |
| - params=params, |
287 |
| - query=query, |
288 |
| - fragment=None)) |
def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str:
    """Join a base URL with one or more path components.

    Convenience wrapper around ``urllib.parse.urljoin``, with a few differences:

    1. By default ``resolve_href=False``, which makes the function behave like
       ``os.path.join``: https://example.com/a/b + c/d = https://example.com/a/b/c/d.
       If ``resolve_href=True``, the result is what a browser would resolve the
       reference to: https://example.com/a/c/d.
    2. s3://, gs://, oci:// URLs are joined like http:// URLs.
    3. It accepts multiple components for convenience. Note that ``components[1:]``
       are treated as literal path components and appended to ``components[0]``
       separated by slashes.

    Args:
        base: the URL to join onto.
        *components: path components; they are concatenated with "/" and passed to
            ``urljoin`` as a single reference. With no components the (possibly
            slash-terminated, see below) base is returned.
        resolve_href: if False (default), a trailing slash is appended to the path of
            ``base`` when it is missing, so the last path segment of ``base`` is kept.
            If True, ``base`` is passed to ``urljoin`` untouched.
        **kwargs: forwarded verbatim to ``urllib.parse.urljoin``.

    Returns:
        The joined URL as a string.
    """
    # Ensure a trailing slash in the path component of the base URL to get os.path.join-like
    # behavior instead of web browser behavior.
    if not resolve_href:
        parsed = urllib.parse.urlparse(base)
        if not parsed.path.endswith("/"):
            base = parsed._replace(path=f"{parsed.path}/").geturl()

    # Schemes that should be joined like http; defined once so both patched lists agree.
    extra_schemes = ("s3", "gs", "oci")

    # Keep references to the original list objects so exactly those are put back afterwards.
    uses_netloc = urllib.parse.uses_netloc
    uses_relative = urllib.parse.uses_relative
    try:
        # NOTE: we temporarily modify urllib internals so the s3, gs and oci schemes are
        # treated like http. This is non-portable, and may be forward incompatible with
        # future cpython versions. The module attributes are replaced for the duration of
        # the urljoin call, so the change is visible process-wide until ``finally`` runs.
        urllib.parse.uses_netloc = [*uses_netloc, *extra_schemes]
        urllib.parse.uses_relative = [*uses_relative, *extra_schemes]
        return urllib.parse.urljoin(base, "/".join(components), **kwargs)
    finally:
        # Restore urllib's own lists even if urljoin raised.
        urllib.parse.uses_netloc = uses_netloc
        urllib.parse.uses_relative = uses_relative
289 | 130 |
|
290 | 131 |
|
291 | 132 | git_re = (
|
|
0 commit comments