
Commit b6bbabe

Merge pull request #5833 from uranusjr/htmlpage-extract-trimming
Move static methods out of HTMLPage
2 parents: 4a07894 + 8f432f5

2 files changed (+119, -121 lines)


news/5833.trivial

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Move static and class methods out of HTMLPage to prepare for refactoring.

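The refactoring itself is mechanical: behaviour that previously lived on HTMLPage as class and static methods becomes private module-level functions, so later work can slim the page class down without touching the fetching logic. A minimal sketch of the pattern, with toy names rather than pip's actual classes:

# Before: fetching logic bound to the page class as a classmethod.
class Page(object):
    def __init__(self, content, url):
        self.content = content
        self.url = url

    @classmethod
    def get_page(cls, url, session=None):
        resp = session.get(url)
        return cls(resp.content, resp.url)


# After: the same behaviour as a private module-level function; the class
# only models a fetched page, and callers switch to _get_page(url, session=...).
def _get_page(url, session=None):
    if session is None:
        raise TypeError("_get_page() missing 1 required keyword argument: 'session'")
    resp = session.get(url)
    return Page(resp.content, resp.url)
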
src/pip/_internal/index.py

Lines changed: 118 additions & 121 deletions
@@ -59,6 +59,122 @@
 logger = logging.getLogger(__name__)


+def _get_content_type(url, session):
+    """Get the Content-Type of the given url, using a HEAD request"""
+    scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
+    if scheme not in {'http', 'https'}:
+        # FIXME: some warning or something?
+        # assertion error?
+        return ''
+
+    resp = session.head(url, allow_redirects=True)
+    resp.raise_for_status()
+
+    return resp.headers.get("Content-Type", "")
+
+
+def _handle_get_page_fail(link, reason, url, meth=None):
+    if meth is None:
+        meth = logger.debug
+    meth("Could not fetch URL %s: %s - skipping", link, reason)
+
+
+def _get_html_page(link, session=None):
+    if session is None:
+        raise TypeError(
+            "_get_html_page() missing 1 required keyword argument: 'session'"
+        )
+
+    url = link.url
+    url = url.split('#', 1)[0]
+
+    # Check for VCS schemes that do not support lookup as web pages.
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            logger.debug('Cannot look at %s URL %s', scheme, link)
+            return None
+
+    try:
+        filename = link.filename
+        for bad_ext in ARCHIVE_EXTENSIONS:
+            if filename.endswith(bad_ext):
+                content_type = _get_content_type(url, session=session)
+                if content_type.lower().startswith('text/html'):
+                    break
+                else:
+                    logger.debug(
+                        'Skipping page %s because of Content-Type: %s',
+                        link,
+                        content_type,
+                    )
+                    return
+
+        logger.debug('Getting page %s', url)
+
+        # Tack index.html onto file:// URLs that point to directories
+        (scheme, netloc, path, params, query, fragment) = \
+            urllib_parse.urlparse(url)
+        if (scheme == 'file' and
+                os.path.isdir(urllib_request.url2pathname(path))):
+            # add trailing slash if not present so urljoin doesn't trim
+            # final segment
+            if not url.endswith('/'):
+                url += '/'
+            url = urllib_parse.urljoin(url, 'index.html')
+            logger.debug(' file: URL is directory, getting %s', url)
+
+        resp = session.get(
+            url,
+            headers={
+                "Accept": "text/html",
+                # We don't want to blindly returned cached data for
+                # /simple/, because authors generally expecting that
+                # twine upload && pip install will function, but if
+                # they've done a pip install in the last ~10 minutes
+                # it won't. Thus by setting this to zero we will not
+                # blindly use any cached data, however the benefit of
+                # using max-age=0 instead of no-cache, is that we will
+                # still support conditional requests, so we will still
+                # minimize traffic sent in cases where the page hasn't
+                # changed at all, we will just always incur the round
+                # trip for the conditional GET now instead of only
+                # once per 10 minutes.
+                # For more information, please see pypa/pip#5670.
+                "Cache-Control": "max-age=0",
+            },
+        )
+        resp.raise_for_status()
+
+        # The check for archives above only works if the url ends with
+        # something that looks like an archive. However that is not a
+        # requirement of an url. Unless we issue a HEAD request on every
+        # url we cannot know ahead of time for sure if something is HTML
+        # or not. However we can check after we've downloaded it.
+        content_type = resp.headers.get('Content-Type', 'unknown')
+        if not content_type.lower().startswith("text/html"):
+            logger.debug(
+                'Skipping page %s because of Content-Type: %s',
+                link,
+                content_type,
+            )
+            return
+
+        inst = HTMLPage(resp.content, resp.url, resp.headers)
+    except requests.HTTPError as exc:
+        _handle_get_page_fail(link, exc, url)
+    except SSLError as exc:
+        reason = "There was a problem confirming the ssl certificate: "
+        reason += str(exc)
+        _handle_get_page_fail(link, reason, url, meth=logger.info)
+    except requests.ConnectionError as exc:
+        _handle_get_page_fail(link, "connection error: %s" % exc, url)
+    except requests.Timeout:
+        _handle_get_page_fail(link, "timed out", url)
+    else:
+        return inst
+
+
 class PackageFinder(object):
     """This finds packages.

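The moved _get_content_type helper is a HEAD-based sniff of the server's reported Content-Type, used to skip pages that only look like archives by filename. A standalone sketch of the same idea using plain requests (content_type_of is an illustrative name, not a pip function):

import requests
from urllib.parse import urlsplit


def content_type_of(url, session):
    """Return the Content-Type the server reports for `url` via a HEAD
    request, or '' for non-HTTP(S) schemes."""
    if urlsplit(url).scheme not in {"http", "https"}:
        return ""
    resp = session.head(url, allow_redirects=True)
    resp.raise_for_status()
    return resp.headers.get("Content-Type", "")


if __name__ == "__main__":
    with requests.Session() as session:
        # A simple-index page would typically report text/html;
        # an sdist or wheel URL would not.
        print(content_type_of("https://pypi.org/simple/pip/", session))
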
@@ -674,7 +790,7 @@ def _link_package_versions(self, link, search):
         return InstallationCandidate(search.supplied, version, link)

     def _get_page(self, link):
-        return HTMLPage.get_page(link, session=self.session)
+        return _get_html_page(link, session=self.session)


 def egg_info_matches(
@@ -756,125 +872,6 @@ def __init__(self, content, url, headers=None):
     def __str__(self):
         return self.url

-    @classmethod
-    def get_page(cls, link, session=None):
-        if session is None:
-            raise TypeError(
-                "get_page() missing 1 required keyword argument: 'session'"
-            )
-
-        url = link.url
-        url = url.split('#', 1)[0]
-
-        # Check for VCS schemes that do not support lookup as web pages.
-        from pip._internal.vcs import VcsSupport
-        for scheme in VcsSupport.schemes:
-            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-                logger.debug('Cannot look at %s URL %s', scheme, link)
-                return None
-
-        try:
-            filename = link.filename
-            for bad_ext in ARCHIVE_EXTENSIONS:
-                if filename.endswith(bad_ext):
-                    content_type = cls._get_content_type(
-                        url, session=session,
-                    )
-                    if content_type.lower().startswith('text/html'):
-                        break
-                    else:
-                        logger.debug(
-                            'Skipping page %s because of Content-Type: %s',
-                            link,
-                            content_type,
-                        )
-                        return
-
-            logger.debug('Getting page %s', url)
-
-            # Tack index.html onto file:// URLs that point to directories
-            (scheme, netloc, path, params, query, fragment) = \
-                urllib_parse.urlparse(url)
-            if (scheme == 'file' and
-                    os.path.isdir(urllib_request.url2pathname(path))):
-                # add trailing slash if not present so urljoin doesn't trim
-                # final segment
-                if not url.endswith('/'):
-                    url += '/'
-                url = urllib_parse.urljoin(url, 'index.html')
-                logger.debug(' file: URL is directory, getting %s', url)
-
-            resp = session.get(
-                url,
-                headers={
-                    "Accept": "text/html",
-                    # We don't want to blindly returned cached data for
-                    # /simple/, because authors generally expecting that
-                    # twine upload && pip install will function, but if
-                    # they've done a pip install in the last ~10 minutes
-                    # it won't. Thus by setting this to zero we will not
-                    # blindly use any cached data, however the benefit of
-                    # using max-age=0 instead of no-cache, is that we will
-                    # still support conditional requests, so we will still
-                    # minimize traffic sent in cases where the page hasn't
-                    # changed at all, we will just always incur the round
-                    # trip for the conditional GET now instead of only
-                    # once per 10 minutes.
-                    # For more information, please see pypa/pip#5670.
-                    "Cache-Control": "max-age=0",
-                },
-            )
-            resp.raise_for_status()
-
-            # The check for archives above only works if the url ends with
-            # something that looks like an archive. However that is not a
-            # requirement of an url. Unless we issue a HEAD request on every
-            # url we cannot know ahead of time for sure if something is HTML
-            # or not. However we can check after we've downloaded it.
-            content_type = resp.headers.get('Content-Type', 'unknown')
-            if not content_type.lower().startswith("text/html"):
-                logger.debug(
-                    'Skipping page %s because of Content-Type: %s',
-                    link,
-                    content_type,
-                )
-                return
-
-            inst = cls(resp.content, resp.url, resp.headers)
-        except requests.HTTPError as exc:
-            cls._handle_fail(link, exc, url)
-        except SSLError as exc:
-            reason = "There was a problem confirming the ssl certificate: "
-            reason += str(exc)
-            cls._handle_fail(link, reason, url, meth=logger.info)
-        except requests.ConnectionError as exc:
-            cls._handle_fail(link, "connection error: %s" % exc, url)
-        except requests.Timeout:
-            cls._handle_fail(link, "timed out", url)
-        else:
-            return inst
-
-    @staticmethod
-    def _handle_fail(link, reason, url, meth=None):
-        if meth is None:
-            meth = logger.debug
-
-        meth("Could not fetch URL %s: %s - skipping", link, reason)
-
-    @staticmethod
-    def _get_content_type(url, session):
-        """Get the Content-Type of the given url, using a HEAD request"""
-        scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
-        if scheme not in {'http', 'https'}:
-            # FIXME: some warning or something?
-            # assertion error?
-            return ''
-
-        resp = session.head(url, allow_redirects=True)
-        resp.raise_for_status()
-
-        return resp.headers.get("Content-Type", "")
-
     def iter_links(self):
         """Yields all links in the page"""
         document = html5lib.parse(
@@ -889,7 +886,7 @@ def iter_links(self):
                 url = _clean_link(urllib_parse.urljoin(base_url, href))
                 pyrequire = anchor.get('data-requires-python')
                 pyrequire = unescape(pyrequire) if pyrequire else None
-                yield Link(url, self, requires_python=pyrequire)
+                yield Link(url, self.url, requires_python=pyrequire)


 Search = namedtuple('Search', 'supplied canonical formats')
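
The final hunk passes the page's URL string, rather than the HTMLPage instance itself, as the second argument to Link, so extracted links no longer hold a reference to the page object. For orientation, the surrounding iter_links logic is ordinary html5lib anchor walking; a simplified standalone sketch (iter_hrefs is illustrative and omits pip's _clean_link and base-URL handling):

import html5lib
from urllib.parse import urljoin


def iter_hrefs(content, page_url):
    """Yield (absolute_url, origin_url) pairs for every <a href=...> in `content`."""
    document = html5lib.parse(content, namespaceHTMLElements=False)
    for anchor in document.findall(".//a"):
        href = anchor.get("href")
        if href:
            # The origin travels along as a plain URL string, mirroring
            # the switch from `self` to `self.url` in the diff above.
            yield urljoin(page_url, href), page_url


html = b'<html><body><a href="pkg-1.0.tar.gz">pkg</a></body></html>'
for url, origin in iter_hrefs(html, "https://example.org/simple/pkg/"):
    print(url, "found on", origin)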
