Skip to content

Move static methods out of HTMLPage #5833

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/5833.trivial
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Move static and class methods out of HTMLPage for prepare for refactoring.
239 changes: 118 additions & 121 deletions src/pip/_internal/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,122 @@
logger = logging.getLogger(__name__)


def _get_content_type(url, session):
"""Get the Content-Type of the given url, using a HEAD request"""
scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
if scheme not in {'http', 'https'}:
# FIXME: some warning or something?
# assertion error?
return ''

resp = session.head(url, allow_redirects=True)
resp.raise_for_status()

return resp.headers.get("Content-Type", "")


def _handle_get_page_fail(link, reason, url, meth=None):
if meth is None:
meth = logger.debug
meth("Could not fetch URL %s: %s - skipping", link, reason)


def _get_html_page(link, session=None):
if session is None:
raise TypeError(
"_get_html_page() missing 1 required keyword argument: 'session'"
)

url = link.url
url = url.split('#', 1)[0]

# Check for VCS schemes that do not support lookup as web pages.
from pip._internal.vcs import VcsSupport
for scheme in VcsSupport.schemes:
if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
logger.debug('Cannot look at %s URL %s', scheme, link)
return None

try:
filename = link.filename
for bad_ext in ARCHIVE_EXTENSIONS:
if filename.endswith(bad_ext):
content_type = _get_content_type(url, session=session)
if content_type.lower().startswith('text/html'):
break
else:
logger.debug(
'Skipping page %s because of Content-Type: %s',
link,
content_type,
)
return

logger.debug('Getting page %s', url)

# Tack index.html onto file:// URLs that point to directories
(scheme, netloc, path, params, query, fragment) = \
urllib_parse.urlparse(url)
if (scheme == 'file' and
os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)

resp = session.get(
url,
headers={
"Accept": "text/html",
# We don't want to blindly returned cached data for
# /simple/, because authors generally expecting that
# twine upload && pip install will function, but if
# they've done a pip install in the last ~10 minutes
# it won't. Thus by setting this to zero we will not
# blindly use any cached data, however the benefit of
# using max-age=0 instead of no-cache, is that we will
# still support conditional requests, so we will still
# minimize traffic sent in cases where the page hasn't
# changed at all, we will just always incur the round
# trip for the conditional GET now instead of only
# once per 10 minutes.
# For more information, please see pypa/pip#5670.
"Cache-Control": "max-age=0",
},
)
resp.raise_for_status()

# The check for archives above only works if the url ends with
# something that looks like an archive. However that is not a
# requirement of an url. Unless we issue a HEAD request on every
# url we cannot know ahead of time for sure if something is HTML
# or not. However we can check after we've downloaded it.
content_type = resp.headers.get('Content-Type', 'unknown')
if not content_type.lower().startswith("text/html"):
logger.debug(
'Skipping page %s because of Content-Type: %s',
link,
content_type,
)
return

inst = HTMLPage(resp.content, resp.url, resp.headers)
except requests.HTTPError as exc:
_handle_get_page_fail(link, exc, url)
except SSLError as exc:
reason = "There was a problem confirming the ssl certificate: "
reason += str(exc)
_handle_get_page_fail(link, reason, url, meth=logger.info)
except requests.ConnectionError as exc:
_handle_get_page_fail(link, "connection error: %s" % exc, url)
except requests.Timeout:
_handle_get_page_fail(link, "timed out", url)
else:
return inst


class PackageFinder(object):
"""This finds packages.

Expand Down Expand Up @@ -674,7 +790,7 @@ def _link_package_versions(self, link, search):
return InstallationCandidate(search.supplied, version, link)

def _get_page(self, link):
return HTMLPage.get_page(link, session=self.session)
return _get_html_page(link, session=self.session)


def egg_info_matches(
Expand Down Expand Up @@ -756,125 +872,6 @@ def __init__(self, content, url, headers=None):
def __str__(self):
return self.url

@classmethod
def get_page(cls, link, session=None):
if session is None:
raise TypeError(
"get_page() missing 1 required keyword argument: 'session'"
)

url = link.url
url = url.split('#', 1)[0]

# Check for VCS schemes that do not support lookup as web pages.
from pip._internal.vcs import VcsSupport
for scheme in VcsSupport.schemes:
if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
logger.debug('Cannot look at %s URL %s', scheme, link)
return None

try:
filename = link.filename
for bad_ext in ARCHIVE_EXTENSIONS:
if filename.endswith(bad_ext):
content_type = cls._get_content_type(
url, session=session,
)
if content_type.lower().startswith('text/html'):
break
else:
logger.debug(
'Skipping page %s because of Content-Type: %s',
link,
content_type,
)
return

logger.debug('Getting page %s', url)

# Tack index.html onto file:// URLs that point to directories
(scheme, netloc, path, params, query, fragment) = \
urllib_parse.urlparse(url)
if (scheme == 'file' and
os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)

resp = session.get(
url,
headers={
"Accept": "text/html",
# We don't want to blindly returned cached data for
# /simple/, because authors generally expecting that
# twine upload && pip install will function, but if
# they've done a pip install in the last ~10 minutes
# it won't. Thus by setting this to zero we will not
# blindly use any cached data, however the benefit of
# using max-age=0 instead of no-cache, is that we will
# still support conditional requests, so we will still
# minimize traffic sent in cases where the page hasn't
# changed at all, we will just always incur the round
# trip for the conditional GET now instead of only
# once per 10 minutes.
# For more information, please see pypa/pip#5670.
"Cache-Control": "max-age=0",
},
)
resp.raise_for_status()

# The check for archives above only works if the url ends with
# something that looks like an archive. However that is not a
# requirement of an url. Unless we issue a HEAD request on every
# url we cannot know ahead of time for sure if something is HTML
# or not. However we can check after we've downloaded it.
content_type = resp.headers.get('Content-Type', 'unknown')
if not content_type.lower().startswith("text/html"):
logger.debug(
'Skipping page %s because of Content-Type: %s',
link,
content_type,
)
return

inst = cls(resp.content, resp.url, resp.headers)
except requests.HTTPError as exc:
cls._handle_fail(link, exc, url)
except SSLError as exc:
reason = "There was a problem confirming the ssl certificate: "
reason += str(exc)
cls._handle_fail(link, reason, url, meth=logger.info)
except requests.ConnectionError as exc:
cls._handle_fail(link, "connection error: %s" % exc, url)
except requests.Timeout:
cls._handle_fail(link, "timed out", url)
else:
return inst

@staticmethod
def _handle_fail(link, reason, url, meth=None):
if meth is None:
meth = logger.debug

meth("Could not fetch URL %s: %s - skipping", link, reason)

@staticmethod
def _get_content_type(url, session):
"""Get the Content-Type of the given url, using a HEAD request"""
scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
if scheme not in {'http', 'https'}:
# FIXME: some warning or something?
# assertion error?
return ''

resp = session.head(url, allow_redirects=True)
resp.raise_for_status()

return resp.headers.get("Content-Type", "")

def iter_links(self):
"""Yields all links in the page"""
document = html5lib.parse(
Expand All @@ -889,7 +886,7 @@ def iter_links(self):
url = _clean_link(urllib_parse.urljoin(base_url, href))
pyrequire = anchor.get('data-requires-python')
pyrequire = unescape(pyrequire) if pyrequire else None
yield Link(url, self, requires_python=pyrequire)
yield Link(url, self.url, requires_python=pyrequire)


Search = namedtuple('Search', 'supplied canonical formats')
Expand Down