diff --git a/news/5833.trivial b/news/5833.trivial
new file mode 100644
index 00000000000..f01ca00231c
--- /dev/null
+++ b/news/5833.trivial
@@ -0,0 +1 @@
+Move static and class methods out of HTMLPage to prepare for refactoring.
diff --git a/src/pip/_internal/index.py b/src/pip/_internal/index.py
index 9eb8e120751..8c2f24f1b7b 100644
--- a/src/pip/_internal/index.py
+++ b/src/pip/_internal/index.py
@@ -59,6 +59,122 @@
 logger = logging.getLogger(__name__)
 
 
+def _get_content_type(url, session):
+    """Get the Content-Type of the given url, using a HEAD request"""
+    scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
+    if scheme not in {'http', 'https'}:
+        # FIXME: some warning or something?
+        #        assertion error?
+        return ''
+
+    resp = session.head(url, allow_redirects=True)
+    resp.raise_for_status()
+
+    return resp.headers.get("Content-Type", "")
+
+
+def _handle_get_page_fail(link, reason, url, meth=None):
+    if meth is None:
+        meth = logger.debug
+    meth("Could not fetch URL %s: %s - skipping", link, reason)
+
+
+def _get_html_page(link, session=None):
+    if session is None:
+        raise TypeError(
+            "_get_html_page() missing 1 required keyword argument: 'session'"
+        )
+
+    url = link.url
+    url = url.split('#', 1)[0]
+
+    # Check for VCS schemes that do not support lookup as web pages.
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            logger.debug('Cannot look at %s URL %s', scheme, link)
+            return None
+
+    try:
+        filename = link.filename
+        for bad_ext in ARCHIVE_EXTENSIONS:
+            if filename.endswith(bad_ext):
+                content_type = _get_content_type(url, session=session)
+                if content_type.lower().startswith('text/html'):
+                    break
+                else:
+                    logger.debug(
+                        'Skipping page %s because of Content-Type: %s',
+                        link,
+                        content_type,
+                    )
+                    return
+
+        logger.debug('Getting page %s', url)
+
+        # Tack index.html onto file:// URLs that point to directories
+        (scheme, netloc, path, params, query, fragment) = \
+            urllib_parse.urlparse(url)
+        if (scheme == 'file' and
+                os.path.isdir(urllib_request.url2pathname(path))):
+            # add trailing slash if not present so urljoin doesn't trim
+            # final segment
+            if not url.endswith('/'):
+                url += '/'
+            url = urllib_parse.urljoin(url, 'index.html')
+            logger.debug(' file: URL is directory, getting %s', url)
+
+        resp = session.get(
+            url,
+            headers={
+                "Accept": "text/html",
+                # We don't want to blindly return cached data for
+                # /simple/, because authors are generally expecting that
+                # twine upload && pip install will function, but if
+                # they've done a pip install in the last ~10 minutes
+                # it won't. Thus by setting this to zero we will not
+                # blindly use any cached data, however the benefit of
+                # using max-age=0 instead of no-cache, is that we will
+                # still support conditional requests, so we will still
+                # minimize traffic sent in cases where the page hasn't
+                # changed at all, we will just always incur the round
+                # trip for the conditional GET now instead of only
+                # once per 10 minutes.
+                # For more information, please see pypa/pip#5670.
+                "Cache-Control": "max-age=0",
+            },
+        )
+        resp.raise_for_status()
+
+        # The check for archives above only works if the url ends with
+        # something that looks like an archive. However that is not a
+        # requirement of a URL. Unless we issue a HEAD request on every
+        # url we cannot know ahead of time for sure if something is HTML
+        # or not. However we can check after we've downloaded it.
+        content_type = resp.headers.get('Content-Type', 'unknown')
+        if not content_type.lower().startswith("text/html"):
+            logger.debug(
+                'Skipping page %s because of Content-Type: %s',
+                link,
+                content_type,
+            )
+            return
+
+        inst = HTMLPage(resp.content, resp.url, resp.headers)
+    except requests.HTTPError as exc:
+        _handle_get_page_fail(link, exc, url)
+    except SSLError as exc:
+        reason = "There was a problem confirming the ssl certificate: "
+        reason += str(exc)
+        _handle_get_page_fail(link, reason, url, meth=logger.info)
+    except requests.ConnectionError as exc:
+        _handle_get_page_fail(link, "connection error: %s" % exc, url)
+    except requests.Timeout:
+        _handle_get_page_fail(link, "timed out", url)
+    else:
+        return inst
+
+
 class PackageFinder(object):
     """This finds packages.
 
@@ -674,7 +790,7 @@ def _link_package_versions(self, link, search):
         return InstallationCandidate(search.supplied, version, link)
 
     def _get_page(self, link):
-        return HTMLPage.get_page(link, session=self.session)
+        return _get_html_page(link, session=self.session)
 
 
 def egg_info_matches(
@@ -756,125 +872,6 @@ def __init__(self, content, url, headers=None):
     def __str__(self):
         return self.url
 
-    @classmethod
-    def get_page(cls, link, session=None):
-        if session is None:
-            raise TypeError(
-                "get_page() missing 1 required keyword argument: 'session'"
-            )
-
-        url = link.url
-        url = url.split('#', 1)[0]
-
-        # Check for VCS schemes that do not support lookup as web pages.
-        from pip._internal.vcs import VcsSupport
-        for scheme in VcsSupport.schemes:
-            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-                logger.debug('Cannot look at %s URL %s', scheme, link)
-                return None
-
-        try:
-            filename = link.filename
-            for bad_ext in ARCHIVE_EXTENSIONS:
-                if filename.endswith(bad_ext):
-                    content_type = cls._get_content_type(
-                        url, session=session,
-                    )
-                    if content_type.lower().startswith('text/html'):
-                        break
-                    else:
-                        logger.debug(
-                            'Skipping page %s because of Content-Type: %s',
-                            link,
-                            content_type,
-                        )
-                        return
-
-            logger.debug('Getting page %s', url)
-
-            # Tack index.html onto file:// URLs that point to directories
-            (scheme, netloc, path, params, query, fragment) = \
-                urllib_parse.urlparse(url)
-            if (scheme == 'file' and
-                    os.path.isdir(urllib_request.url2pathname(path))):
-                # add trailing slash if not present so urljoin doesn't trim
-                # final segment
-                if not url.endswith('/'):
-                    url += '/'
-                url = urllib_parse.urljoin(url, 'index.html')
-                logger.debug(' file: URL is directory, getting %s', url)
-
-            resp = session.get(
-                url,
-                headers={
-                    "Accept": "text/html",
-                    # We don't want to blindly returned cached data for
-                    # /simple/, because authors generally expecting that
-                    # twine upload && pip install will function, but if
-                    # they've done a pip install in the last ~10 minutes
-                    # it won't. Thus by setting this to zero we will not
-                    # blindly use any cached data, however the benefit of
-                    # using max-age=0 instead of no-cache, is that we will
-                    # still support conditional requests, so we will still
-                    # minimize traffic sent in cases where the page hasn't
-                    # changed at all, we will just always incur the round
-                    # trip for the conditional GET now instead of only
-                    # once per 10 minutes.
-                    # For more information, please see pypa/pip#5670.
-                    "Cache-Control": "max-age=0",
-                },
-            )
-            resp.raise_for_status()
-
-            # The check for archives above only works if the url ends with
-            # something that looks like an archive. However that is not a
-            # requirement of an url. Unless we issue a HEAD request on every
-            # url we cannot know ahead of time for sure if something is HTML
-            # or not. However we can check after we've downloaded it.
-            content_type = resp.headers.get('Content-Type', 'unknown')
-            if not content_type.lower().startswith("text/html"):
-                logger.debug(
-                    'Skipping page %s because of Content-Type: %s',
-                    link,
-                    content_type,
-                )
-                return
-
-            inst = cls(resp.content, resp.url, resp.headers)
-        except requests.HTTPError as exc:
-            cls._handle_fail(link, exc, url)
-        except SSLError as exc:
-            reason = "There was a problem confirming the ssl certificate: "
-            reason += str(exc)
-            cls._handle_fail(link, reason, url, meth=logger.info)
-        except requests.ConnectionError as exc:
-            cls._handle_fail(link, "connection error: %s" % exc, url)
-        except requests.Timeout:
-            cls._handle_fail(link, "timed out", url)
-        else:
-            return inst
-
-    @staticmethod
-    def _handle_fail(link, reason, url, meth=None):
-        if meth is None:
-            meth = logger.debug
-
-        meth("Could not fetch URL %s: %s - skipping", link, reason)
-
-    @staticmethod
-    def _get_content_type(url, session):
-        """Get the Content-Type of the given url, using a HEAD request"""
-        scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
-        if scheme not in {'http', 'https'}:
-            # FIXME: some warning or something?
-            #        assertion error?
-            return ''
-
-        resp = session.head(url, allow_redirects=True)
-        resp.raise_for_status()
-
-        return resp.headers.get("Content-Type", "")
-
     def iter_links(self):
         """Yields all links in the page"""
         document = html5lib.parse(
@@ -889,7 +886,7 @@ def iter_links(self):
                 url = _clean_link(urllib_parse.urljoin(base_url, href))
                 pyrequire = anchor.get('data-requires-python')
                 pyrequire = unescape(pyrequire) if pyrequire else None
-                yield Link(url, self, requires_python=pyrequire)
+                yield Link(url, self.url, requires_python=pyrequire)
 
 
 Search = namedtuple('Search', 'supplied canonical formats')
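
A quick sketch of how the relocated helper would be driven end to end, for reviewers tracing the move. This is not part of the patch: the PipSession import path matches the pip tree around this change but shifts between releases, and the index URL is purely illustrative.

# Illustrative only -- not part of this patch.
from pip._internal.download import PipSession  # path varies across pip versions
from pip._internal.index import Link, _get_html_page

session = PipSession()
link = Link("https://pypi.org/simple/pip/")  # example index URL

# _get_html_page returns an HTMLPage, or None for VCS URLs and for
# fetch failures (HTTP errors, SSL problems, and timeouts are logged
# via _handle_get_page_fail and swallowed).
page = _get_html_page(link, session=session)
if page is not None:
    for candidate in page.iter_links():
        print(candidate.url, candidate.requires_python)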