
Commit b6bbabe

Merge pull request #5833 from uranusjr/htmlpage-extract-trimming
Move static methods out of HTMLPage
2 parents: 4a07894 + 8f432f5

2 files changed (+119, -121 lines)


news/5833.trivial

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Move static and class methods out of HTMLPage to prepare for refactoring.

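The refactoring itself is mechanical: behaviour that previously lived on HTMLPage as class and static methods becomes private module-level functions, so later work can slim the page class down without touching the fetching logic. A minimal sketch of the pattern, with toy names rather than pip's actual classes:

# Before: fetching logic bound to the page class as a classmethod.
class Page(object):
    def __init__(self, content, url):
        self.content = content
        self.url = url

    @classmethod
    def get_page(cls, url, session=None):
        resp = session.get(url)
        return cls(resp.content, resp.url)


# After: the same behaviour as a private module-level function; the class
# only models a fetched page, and callers switch to _get_page(url, session=...).
def _get_page(url, session=None):
    if session is None:
        raise TypeError("_get_page() missing 1 required keyword argument: 'session'")
    resp = session.get(url)
    return Page(resp.content, resp.url)
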
src/pip/_internal/index.py

Lines changed: 118 additions & 121 deletions
@@ -59,6 +59,122 @@
 logger = logging.getLogger(__name__)


+def _get_content_type(url, session):
+    """Get the Content-Type of the given url, using a HEAD request"""
+    scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
+    if scheme not in {'http', 'https'}:
+        # FIXME: some warning or something?
+        # assertion error?
+        return ''
+
+    resp = session.head(url, allow_redirects=True)
+    resp.raise_for_status()
+
+    return resp.headers.get("Content-Type", "")
+
+
+def _handle_get_page_fail(link, reason, url, meth=None):
+    if meth is None:
+        meth = logger.debug
+    meth("Could not fetch URL %s: %s - skipping", link, reason)
+
+
+def _get_html_page(link, session=None):
+    if session is None:
+        raise TypeError(
+            "_get_html_page() missing 1 required keyword argument: 'session'"
+        )
+
+    url = link.url
+    url = url.split('#', 1)[0]
+
+    # Check for VCS schemes that do not support lookup as web pages.
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            logger.debug('Cannot look at %s URL %s', scheme, link)
+            return None
+
+    try:
+        filename = link.filename
+        for bad_ext in ARCHIVE_EXTENSIONS:
+            if filename.endswith(bad_ext):
+                content_type = _get_content_type(url, session=session)
+                if content_type.lower().startswith('text/html'):
+                    break
+                else:
+                    logger.debug(
+                        'Skipping page %s because of Content-Type: %s',
+                        link,
+                        content_type,
+                    )
+                    return
+
+        logger.debug('Getting page %s', url)
+
+        # Tack index.html onto file:// URLs that point to directories
+        (scheme, netloc, path, params, query, fragment) = \
+            urllib_parse.urlparse(url)
+        if (scheme == 'file' and
+                os.path.isdir(urllib_request.url2pathname(path))):
+            # add trailing slash if not present so urljoin doesn't trim
+            # final segment
+            if not url.endswith('/'):
+                url += '/'
+            url = urllib_parse.urljoin(url, 'index.html')
+            logger.debug(' file: URL is directory, getting %s', url)
+
+        resp = session.get(
+            url,
+            headers={
+                "Accept": "text/html",
+                # We don't want to blindly returned cached data for
+                # /simple/, because authors generally expecting that
+                # twine upload && pip install will function, but if
+                # they've done a pip install in the last ~10 minutes
+                # it won't. Thus by setting this to zero we will not
+                # blindly use any cached data, however the benefit of
+                # using max-age=0 instead of no-cache, is that we will
+                # still support conditional requests, so we will still
+                # minimize traffic sent in cases where the page hasn't
+                # changed at all, we will just always incur the round
+                # trip for the conditional GET now instead of only
+                # once per 10 minutes.
+                # For more information, please see pypa/pip#5670.
+                "Cache-Control": "max-age=0",
+            },
+        )
+        resp.raise_for_status()
+
+        # The check for archives above only works if the url ends with
+        # something that looks like an archive. However that is not a
+        # requirement of an url. Unless we issue a HEAD request on every
+        # url we cannot know ahead of time for sure if something is HTML
+        # or not. However we can check after we've downloaded it.
+        content_type = resp.headers.get('Content-Type', 'unknown')
+        if not content_type.lower().startswith("text/html"):
+            logger.debug(
+                'Skipping page %s because of Content-Type: %s',
+                link,
+                content_type,
+            )
+            return
+
+        inst = HTMLPage(resp.content, resp.url, resp.headers)
+    except requests.HTTPError as exc:
+        _handle_get_page_fail(link, exc, url)
+    except SSLError as exc:
+        reason = "There was a problem confirming the ssl certificate: "
+        reason += str(exc)
+        _handle_get_page_fail(link, reason, url, meth=logger.info)
+    except requests.ConnectionError as exc:
+        _handle_get_page_fail(link, "connection error: %s" % exc, url)
+    except requests.Timeout:
+        _handle_get_page_fail(link, "timed out", url)
+    else:
+        return inst
+
+
 class PackageFinder(object):
     """This finds packages.

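The moved _get_content_type helper is a HEAD-based sniff of the server's reported Content-Type, used to skip pages that only look like archives by filename. A standalone sketch of the same idea using plain requests (content_type_of is an illustrative name, not a pip function):

import requests
from urllib.parse import urlsplit


def content_type_of(url, session):
    """Return the Content-Type the server reports for `url` via a HEAD
    request, or '' for non-HTTP(S) schemes."""
    if urlsplit(url).scheme not in {"http", "https"}:
        return ""
    resp = session.head(url, allow_redirects=True)
    resp.raise_for_status()
    return resp.headers.get("Content-Type", "")


if __name__ == "__main__":
    with requests.Session() as session:
        # A simple-index page would typically report text/html;
        # an sdist or wheel URL would not.
        print(content_type_of("https://pypi.org/simple/pip/", session))
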
@@ -674,7 +790,7 @@ def _link_package_versions(self, link, search):
         return InstallationCandidate(search.supplied, version, link)

     def _get_page(self, link):
-        return HTMLPage.get_page(link, session=self.session)
+        return _get_html_page(link, session=self.session)


 def egg_info_matches(
@@ -756,125 +872,6 @@ def __init__(self, content, url, headers=None):
     def __str__(self):
         return self.url

-    @classmethod
-    def get_page(cls, link, session=None):
-        if session is None:
-            raise TypeError(
-                "get_page() missing 1 required keyword argument: 'session'"
-            )
-
-        url = link.url
-        url = url.split('#', 1)[0]
-
-        # Check for VCS schemes that do not support lookup as web pages.
-        from pip._internal.vcs import VcsSupport
-        for scheme in VcsSupport.schemes:
-            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-                logger.debug('Cannot look at %s URL %s', scheme, link)
-                return None
-
-        try:
-            filename = link.filename
-            for bad_ext in ARCHIVE_EXTENSIONS:
-                if filename.endswith(bad_ext):
-                    content_type = cls._get_content_type(
-                        url, session=session,
-                    )
-                    if content_type.lower().startswith('text/html'):
-                        break
-                    else:
-                        logger.debug(
-                            'Skipping page %s because of Content-Type: %s',
-                            link,
-                            content_type,
-                        )
-                        return
-
-            logger.debug('Getting page %s', url)
-
-            # Tack index.html onto file:// URLs that point to directories
-            (scheme, netloc, path, params, query, fragment) = \
-                urllib_parse.urlparse(url)
-            if (scheme == 'file' and
-                    os.path.isdir(urllib_request.url2pathname(path))):
-                # add trailing slash if not present so urljoin doesn't trim
-                # final segment
-                if not url.endswith('/'):
-                    url += '/'
-                url = urllib_parse.urljoin(url, 'index.html')
-                logger.debug(' file: URL is directory, getting %s', url)
-
-            resp = session.get(
-                url,
-                headers={
-                    "Accept": "text/html",
-                    # We don't want to blindly returned cached data for
-                    # /simple/, because authors generally expecting that
-                    # twine upload && pip install will function, but if
-                    # they've done a pip install in the last ~10 minutes
-                    # it won't. Thus by setting this to zero we will not
-                    # blindly use any cached data, however the benefit of
-                    # using max-age=0 instead of no-cache, is that we will
-                    # still support conditional requests, so we will still
-                    # minimize traffic sent in cases where the page hasn't
-                    # changed at all, we will just always incur the round
-                    # trip for the conditional GET now instead of only
-                    # once per 10 minutes.
-                    # For more information, please see pypa/pip#5670.
-                    "Cache-Control": "max-age=0",
-                },
-            )
-            resp.raise_for_status()
-
-            # The check for archives above only works if the url ends with
-            # something that looks like an archive. However that is not a
-            # requirement of an url. Unless we issue a HEAD request on every
-            # url we cannot know ahead of time for sure if something is HTML
-            # or not. However we can check after we've downloaded it.
-            content_type = resp.headers.get('Content-Type', 'unknown')
-            if not content_type.lower().startswith("text/html"):
-                logger.debug(
-                    'Skipping page %s because of Content-Type: %s',
-                    link,
-                    content_type,
-                )
-                return
-
-            inst = cls(resp.content, resp.url, resp.headers)
-        except requests.HTTPError as exc:
-            cls._handle_fail(link, exc, url)
-        except SSLError as exc:
-            reason = "There was a problem confirming the ssl certificate: "
-            reason += str(exc)
-            cls._handle_fail(link, reason, url, meth=logger.info)
-        except requests.ConnectionError as exc:
-            cls._handle_fail(link, "connection error: %s" % exc, url)
-        except requests.Timeout:
-            cls._handle_fail(link, "timed out", url)
-        else:
-            return inst
-
-    @staticmethod
-    def _handle_fail(link, reason, url, meth=None):
-        if meth is None:
-            meth = logger.debug
-
-        meth("Could not fetch URL %s: %s - skipping", link, reason)
-
-    @staticmethod
-    def _get_content_type(url, session):
-        """Get the Content-Type of the given url, using a HEAD request"""
-        scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
-        if scheme not in {'http', 'https'}:
-            # FIXME: some warning or something?
-            # assertion error?
-            return ''
-
-        resp = session.head(url, allow_redirects=True)
-        resp.raise_for_status()
-
-        return resp.headers.get("Content-Type", "")
-
     def iter_links(self):
         """Yields all links in the page"""
         document = html5lib.parse(
@@ -889,7 +886,7 @@ def iter_links(self):
                 url = _clean_link(urllib_parse.urljoin(base_url, href))
                 pyrequire = anchor.get('data-requires-python')
                 pyrequire = unescape(pyrequire) if pyrequire else None
-                yield Link(url, self, requires_python=pyrequire)
+                yield Link(url, self.url, requires_python=pyrequire)


 Search = namedtuple('Search', 'supplied canonical formats')
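
The final hunk passes the page's URL string, rather than the HTMLPage instance itself, as the second argument to Link, so extracted links no longer hold a reference to the page object. For orientation, the surrounding iter_links logic is ordinary html5lib anchor walking; a simplified standalone sketch (iter_hrefs is illustrative and omits pip's _clean_link and base-URL handling):

import html5lib
from urllib.parse import urljoin


def iter_hrefs(content, page_url):
    """Yield (absolute_url, origin_url) pairs for every <a href=...> in `content`."""
    document = html5lib.parse(content, namespaceHTMLElements=False)
    for anchor in document.findall(".//a"):
        href = anchor.get("href")
        if href:
            # The origin travels along as a plain URL string, mirroring
            # the switch from `self` to `self.url` in the diff above.
            yield urljoin(page_url, href), page_url


html = b'<html><body><a href="pkg-1.0.tar.gz">pkg</a></body></html>'
for url, origin in iter_hrefs(html, "https://example.org/simple/pkg/"):
    print(url, "found on", origin)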
