logger = logging.getLogger(__name__)


+def _get_content_type(url, session):
+    """Get the Content-Type of the given url, using a HEAD request"""
+    scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
+    if scheme not in {'http', 'https'}:
+        # FIXME: some warning or something?
+        # assertion error?
+        return ''
+
+    resp = session.head(url, allow_redirects=True)
+    resp.raise_for_status()
+
+    return resp.headers.get("Content-Type", "")
+
+
+def _handle_get_page_fail(link, reason, url, meth=None):
+    if meth is None:
+        meth = logger.debug
+    meth("Could not fetch URL %s: %s - skipping", link, reason)
+
+
+def _get_html_page(link, session=None):
+    if session is None:
+        raise TypeError(
+            "_get_html_page() missing 1 required keyword argument: 'session'"
+        )
+
+    url = link.url
+    url = url.split('#', 1)[0]
+
+    # Check for VCS schemes that do not support lookup as web pages.
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            logger.debug('Cannot look at %s URL %s', scheme, link)
+            return None
+
+    try:
+        filename = link.filename
+        for bad_ext in ARCHIVE_EXTENSIONS:
+            if filename.endswith(bad_ext):
+                content_type = _get_content_type(url, session=session)
+                if content_type.lower().startswith('text/html'):
+                    break
+                else:
+                    logger.debug(
+                        'Skipping page %s because of Content-Type: %s',
+                        link,
+                        content_type,
+                    )
+                    return
+
+        logger.debug('Getting page %s', url)
+
+        # Tack index.html onto file:// URLs that point to directories
+        (scheme, netloc, path, params, query, fragment) = \
+            urllib_parse.urlparse(url)
+        if (scheme == 'file' and
+                os.path.isdir(urllib_request.url2pathname(path))):
+            # add trailing slash if not present so urljoin doesn't trim
+            # final segment
+            if not url.endswith('/'):
+                url += '/'
+            url = urllib_parse.urljoin(url, 'index.html')
+            logger.debug(' file: URL is directory, getting %s', url)
+
+        resp = session.get(
+            url,
+            headers={
+                "Accept": "text/html",
+                # We don't want to blindly returned cached data for
+                # /simple/, because authors generally expecting that
+                # twine upload && pip install will function, but if
+                # they've done a pip install in the last ~10 minutes
+                # it won't. Thus by setting this to zero we will not
+                # blindly use any cached data, however the benefit of
+                # using max-age=0 instead of no-cache, is that we will
+                # still support conditional requests, so we will still
+                # minimize traffic sent in cases where the page hasn't
+                # changed at all, we will just always incur the round
+                # trip for the conditional GET now instead of only
+                # once per 10 minutes.
+                # For more information, please see pypa/pip#5670.
+                "Cache-Control": "max-age=0",
+            },
+        )
+        resp.raise_for_status()
+
+        # The check for archives above only works if the url ends with
+        # something that looks like an archive. However that is not a
+        # requirement of an url. Unless we issue a HEAD request on every
+        # url we cannot know ahead of time for sure if something is HTML
+        # or not. However we can check after we've downloaded it.
+        content_type = resp.headers.get('Content-Type', 'unknown')
+        if not content_type.lower().startswith("text/html"):
+            logger.debug(
+                'Skipping page %s because of Content-Type: %s',
+                link,
+                content_type,
+            )
+            return
+
+        inst = HTMLPage(resp.content, resp.url, resp.headers)
+    except requests.HTTPError as exc:
+        _handle_get_page_fail(link, exc, url)
+    except SSLError as exc:
+        reason = "There was a problem confirming the ssl certificate: "
+        reason += str(exc)
+        _handle_get_page_fail(link, reason, url, meth=logger.info)
+    except requests.ConnectionError as exc:
+        _handle_get_page_fail(link, "connection error: %s" % exc, url)
+    except requests.Timeout:
+        _handle_get_page_fail(link, "timed out", url)
+    else:
+        return inst
+
+
class PackageFinder(object):
    """This finds packages.
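The three helpers added above are the old HTMLPage.get_page logic moved to module level. The detail worth keeping in mind is that _get_html_page first probes archive-looking links with a HEAD request (via _get_content_type) and only downloads them when the server reports text/html. A minimal standalone sketch of that probe pattern, using plain requests; looks_like_html and ARCHIVE_SUFFIXES are hypothetical names for illustration, not pip's API:

import requests

# Illustrative subset of the extensions pip treats as archives.
ARCHIVE_SUFFIXES = ('.tar.gz', '.tar.bz2', '.zip', '.whl')


def looks_like_html(url, session):
    """Probe archive-looking URLs with HEAD before committing to a GET."""
    filename = url.rsplit('/', 1)[-1]
    if filename.endswith(ARCHIVE_SUFFIXES):
        resp = session.head(url, allow_redirects=True)
        resp.raise_for_status()
        content_type = resp.headers.get("Content-Type", "")
        return content_type.lower().startswith("text/html")
    # URLs that don't look like archives are fetched optimistically.
    return True


with requests.Session() as session:
    print(looks_like_html("https://pypi.org/simple/pip/", session))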
@@ -674,7 +790,7 @@ def _link_package_versions(self, link, search):
        return InstallationCandidate(search.supplied, version, link)

    def _get_page(self, link):
-        return HTMLPage.get_page(link, session=self.session)
+        return _get_html_page(link, session=self.session)


    def egg_info_matches(
@@ -756,125 +872,6 @@ def __init__(self, content, url, headers=None):
    def __str__(self):
        return self.url

-    @classmethod
-    def get_page(cls, link, session=None):
-        if session is None:
-            raise TypeError(
-                "get_page() missing 1 required keyword argument: 'session'"
-            )
-
-        url = link.url
-        url = url.split('#', 1)[0]
-
-        # Check for VCS schemes that do not support lookup as web pages.
-        from pip._internal.vcs import VcsSupport
-        for scheme in VcsSupport.schemes:
-            if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-                logger.debug('Cannot look at %s URL %s', scheme, link)
-                return None
-
-        try:
-            filename = link.filename
-            for bad_ext in ARCHIVE_EXTENSIONS:
-                if filename.endswith(bad_ext):
-                    content_type = cls._get_content_type(
-                        url, session=session,
-                    )
-                    if content_type.lower().startswith('text/html'):
-                        break
-                    else:
-                        logger.debug(
-                            'Skipping page %s because of Content-Type: %s',
-                            link,
-                            content_type,
-                        )
-                        return
-
-            logger.debug('Getting page %s', url)
-
-            # Tack index.html onto file:// URLs that point to directories
-            (scheme, netloc, path, params, query, fragment) = \
-                urllib_parse.urlparse(url)
-            if (scheme == 'file' and
-                    os.path.isdir(urllib_request.url2pathname(path))):
-                # add trailing slash if not present so urljoin doesn't trim
-                # final segment
-                if not url.endswith('/'):
-                    url += '/'
-                url = urllib_parse.urljoin(url, 'index.html')
-                logger.debug(' file: URL is directory, getting %s', url)
-
-            resp = session.get(
-                url,
-                headers={
-                    "Accept": "text/html",
-                    # We don't want to blindly returned cached data for
-                    # /simple/, because authors generally expecting that
-                    # twine upload && pip install will function, but if
-                    # they've done a pip install in the last ~10 minutes
-                    # it won't. Thus by setting this to zero we will not
-                    # blindly use any cached data, however the benefit of
-                    # using max-age=0 instead of no-cache, is that we will
-                    # still support conditional requests, so we will still
-                    # minimize traffic sent in cases where the page hasn't
-                    # changed at all, we will just always incur the round
-                    # trip for the conditional GET now instead of only
-                    # once per 10 minutes.
-                    # For more information, please see pypa/pip#5670.
-                    "Cache-Control": "max-age=0",
-                },
-            )
-            resp.raise_for_status()
-
-            # The check for archives above only works if the url ends with
-            # something that looks like an archive. However that is not a
-            # requirement of an url. Unless we issue a HEAD request on every
-            # url we cannot know ahead of time for sure if something is HTML
-            # or not. However we can check after we've downloaded it.
-            content_type = resp.headers.get('Content-Type', 'unknown')
-            if not content_type.lower().startswith("text/html"):
-                logger.debug(
-                    'Skipping page %s because of Content-Type: %s',
-                    link,
-                    content_type,
-                )
-                return
-
-            inst = cls(resp.content, resp.url, resp.headers)
-        except requests.HTTPError as exc:
-            cls._handle_fail(link, exc, url)
-        except SSLError as exc:
-            reason = "There was a problem confirming the ssl certificate: "
-            reason += str(exc)
-            cls._handle_fail(link, reason, url, meth=logger.info)
-        except requests.ConnectionError as exc:
-            cls._handle_fail(link, "connection error: %s" % exc, url)
-        except requests.Timeout:
-            cls._handle_fail(link, "timed out", url)
-        else:
-            return inst
-
-    @staticmethod
-    def _handle_fail(link, reason, url, meth=None):
-        if meth is None:
-            meth = logger.debug
-
-        meth("Could not fetch URL %s: %s - skipping", link, reason)
-
-    @staticmethod
-    def _get_content_type(url, session):
-        """Get the Content-Type of the given url, using a HEAD request"""
-        scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
-        if scheme not in {'http', 'https'}:
-            # FIXME: some warning or something?
-            # assertion error?
-            return ''
-
-        resp = session.head(url, allow_redirects=True)
-        resp.raise_for_status()
-
-        return resp.headers.get("Content-Type", "")
-
    def iter_links(self):
        """Yields all links in the page"""
        document = html5lib.parse(
@@ -889,7 +886,7 @@ def iter_links(self):
                url = _clean_link(urllib_parse.urljoin(base_url, href))
                pyrequire = anchor.get('data-requires-python')
                pyrequire = unescape(pyrequire) if pyrequire else None
-                yield Link(url, self, requires_python=pyrequire)
+                yield Link(url, self.url, requires_python=pyrequire)


Search = namedtuple('Search', 'supplied canonical formats')
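One more detail from the moved code that is easy to miss in the diff: when a file:// URL points at a directory, the page fetcher appends index.html, first adding a trailing slash so urljoin does not drop the final path segment. Roughly the same logic as a standalone sketch, written against the Python 3 stdlib rather than the vendored urllib_parse / urllib_request wrappers used in the diff; normalize_file_url is a hypothetical helper name:

import os
from urllib.parse import urljoin, urlparse
from urllib.request import url2pathname


def normalize_file_url(url):
    """Point directory-style file:// URLs at their index.html."""
    scheme, netloc, path, params, query, fragment = urlparse(url)
    if scheme == 'file' and os.path.isdir(url2pathname(path)):
        # Without the trailing slash, urljoin would drop the last segment.
        if not url.endswith('/'):
            url += '/'
        url = urljoin(url, 'index.html')
    return url


# e.g. 'file:///tmp' -> 'file:///tmp/index.html' on systems where /tmp exists
print(normalize_file_url('file:///tmp'))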