diff --git a/Lib/http/server.py b/Lib/http/server.py index 7d0da5052d2d4d..4433b77b5f241c 100644 --- a/Lib/http/server.py +++ b/Lib/http/server.py @@ -686,6 +686,33 @@ def do_HEAD(self): if f: f.close() + def _request_path_split(self, path): + """Parse a path that can include an optional query and fragment. + """ + # We only handle the 'abs_path' case for the Request-URI part of the + # request line (the second word). We don't handle the case of a URL + # containing a scheme or netloc. + path, _, query = path.partition('?') + path, _, fragment = path.partition('#') + return urllib.parse.SplitResult('', '', path, query, fragment) + + def _get_redirect_url_for_dir(self): + """Returns URL with trailing slash on path, if required. If not + required, returns None. + """ + # Previous versions of this class used urllib.parse.urlsplit() here. + # However, the 'path' is being treated as a local filesystem path and + # it can't have a scheme or netloc. We need to avoid parsing it + # incorrectly. For example, as reported in gh-87389, a path starting + # with a double slash should not be treated as a relative URI. Also, a + # path with a colon in the first component could also be parsed + # wrongly. + parts = self._request_path_split(self.path) + if parts.path.endswith('/'): + return None # already has slash, no redirect needed + return urllib.parse.urlunsplit(('', '', parts.path + '/', parts.query, + parts.fragment)) + def send_head(self): """Common code for GET and HEAD commands. @@ -700,13 +727,10 @@ def send_head(self): path = self.translate_path(self.path) f = None if os.path.isdir(path): - parts = urllib.parse.urlsplit(self.path) - if not parts.path.endswith('/'): + new_url = self._get_redirect_url_for_dir() + if new_url: # redirect browser - doing basically what apache does self.send_response(HTTPStatus.MOVED_PERMANENTLY) - new_parts = (parts[0], parts[1], parts[2] + '/', - parts[3], parts[4]) - new_url = urllib.parse.urlunsplit(new_parts) self.send_header("Location", new_url) self.send_header("Content-Length", "0") self.end_headers() @@ -839,9 +863,8 @@ def translate_path(self, path): probably be diagnosed.) """ - # abandon query parameters - path = path.split('?',1)[0] - path = path.split('#',1)[0] + # extract only path, abandon query parameters and fragment + path = self._request_path_split(path).path # Don't forget explicit trailing slash when normalizing. Issue17324 trailing_slash = path.rstrip().endswith('/') try: diff --git a/Lib/test/test_httpservers.py b/Lib/test/test_httpservers.py index 1c370dcafa9fea..634514c5062ae1 100644 --- a/Lib/test/test_httpservers.py +++ b/Lib/test/test_httpservers.py @@ -484,18 +484,19 @@ def test_get_dir_redirect_location_domain_injection_bug(self): self.assertEqual(response.getheader('Location'), expected_location) # If the second word in the http request (Request-URI for the http - # method) is a full URI, we don't worry about it, as that'll be parsed - # and reassembled as a full URI within BaseHTTPRequestHandler.send_head - # so no errant scheme-less //netloc//evil.co/ domain mixup can happen. + # method) has a scheme and netloc, it still gets treated as an + # absolute path by the server. In that case, the redirect is + # constructed so it is parsed as a path. The './' part of the path + # is added by urlunsplit() so that the 'https:' part of what is being + # treated as a path is not treated as a scheme in the redirect + # location. http.server is not a proxy and doesn't handle Request-URI + # being an absolute URI with a scheme and or netloc. attack_scheme_netloc_2slash_url = f'https://pypi.org/{url}' - expected_scheme_netloc_location = f'{attack_scheme_netloc_2slash_url}/' + expected_location = f'./{attack_scheme_netloc_2slash_url}/' response = self.request(attack_scheme_netloc_2slash_url) self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY) location = response.getheader('Location') - # We're just ensuring that the scheme and domain make it through, if - # there are or aren't multiple slashes at the start of the path that - # follows that isn't important in this Location: header. - self.assertTrue(location.startswith('https://pypi.org/'), msg=location) + self.assertEqual(location, expected_location) def test_get(self): #constructs the path relative to the root directory of the HTTPServer diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 4faad733245df9..0b7fa76dc4e6f9 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1354,6 +1354,25 @@ def test_urlsplit_normalization(self): with self.assertRaises(ValueError): urllib.parse.urlsplit(url) + def test_urlunsplit_relative(self): + cases = [ + # expected result is a relative URL without netloc and scheme + (('', 'a', '', '', ''), '//a'), + # extra leading slashes need to be stripped to avoid confusion + # with a relative URL + (('', '', '//a', '', ''), '/a'), + (('', '', '///a', '', ''), '/a'), + # not relative so extra leading slashes don't need stripping since + # they don't cause confusion + (('http', 'x.y', '//a', '', ''), 'http://x.y//a'), + # avoid confusion with path containing colon + (('', '', 'a:b', '', ''), './a:b'), + ] + for parts, result in cases: + self.assertEqual(urllib.parse.urlunsplit(parts), result, + msg=f'{parts=}') + + class Utility_Tests(unittest.TestCase): """Testcase to test the various utility functions in the urllib.""" # In Python 2 this test class was in test_urllib. diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 3932bb99c7e7d1..28f84f29c690ca 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -517,14 +517,32 @@ def urlunparse(components): url = "%s;%s" % (url, params) return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) +# Returns true if path can confused with a scheme. I.e. a relative path +# without leading dot that includes a colon in the first component. +_is_scheme_like = re.compile(r'[^/.][^/]*:').match + def urlunsplit(components): """Combine the elements of a tuple as returned by urlsplit() into a complete URL as a string. The data argument can be any five-item iterable. This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had unnecessary delimiters (for example, a ? with an empty query; the RFC states that these are equivalent).""" - scheme, netloc, url, query, fragment, _coerce_result = ( + scheme, netloc, path, query, fragment, _coerce_result = ( _coerce_args(*components)) + if not scheme and not netloc: + # Building a relative URI. Need to be careful that path is not + # confused with scheme or netloc. + if path.startswith('//'): + # gh-87389: don't treat first component of path as netloc + url = '/' + path.lstrip('/') + elif _is_scheme_like(path): + # first component has colon, ensure it will not be parsed as the + # scheme + url = './' + path + else: + url = path + else: + url = path if netloc or (scheme and scheme in uses_netloc) or url[:2] == '//': if url and url[:1] != '/': url = '/' + url url = '//' + (netloc or '') + url diff --git a/Misc/NEWS.d/next/Security/2022-06-16-12-13-55.gh-issue-87389.MS9wAR.rst b/Misc/NEWS.d/next/Security/2022-06-16-12-13-55.gh-issue-87389.MS9wAR.rst new file mode 100644 index 00000000000000..9dbf777bf23534 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2022-06-16-12-13-55.gh-issue-87389.MS9wAR.rst @@ -0,0 +1,5 @@ +Change :func:`urllib.parse.urlunsplit` to sanitize ``path`` argument in order +to avoid confusing the first component of the path as a net location or +scheme. + +Co-authored-by: Gregory P. Smith [Google]