110
110
import re
111
111
import shlex
112
112
from collections import namedtuple
113
- try :
114
- from html .parser import HTMLParser
115
- except ImportError :
116
- from HTMLParser import HTMLParser
117
- try :
118
- from xml .etree import cElementTree as ET
119
- except ImportError :
120
- from xml .etree import ElementTree as ET
121
-
122
- try :
123
- from html .entities import name2codepoint
124
- except ImportError :
125
- from htmlentitydefs import name2codepoint
126
-
127
- # "void elements" (no closing tag) from the HTML Standard section 12.1.2
128
- VOID_ELEMENTS = {'area' , 'base' , 'br' , 'col' , 'embed' , 'hr' , 'img' , 'input' , 'keygen' ,
129
- 'link' , 'menuitem' , 'meta' , 'param' , 'source' , 'track' , 'wbr' }
130
-
131
- # Python 2 -> 3 compatibility
132
- try :
133
- unichr
134
- except NameError :
135
- unichr = chr
136
-
113
+ from parsel import Selector
137
114
138
115
# Release channel name, read from the environment when the module is loaded.
# normalize_xpath() substitutes it for the "{{channel}}" placeholder in XPath
# expressions. The subscript lookup raises KeyError if the variable is unset.
channel = os .environ ["DOC_RUST_LANG_ORG_CHANNEL" ]
139
-
140
- class CustomHTMLParser (HTMLParser ):
141
- """simplified HTML parser.
142
-
143
- this is possible because we are dealing with very regular HTML from
144
- rustdoc; we only have to deal with i) void elements and ii) empty
145
- attributes."""
146
- def __init__ (self , target = None ):
147
- HTMLParser .__init__ (self )
148
- self .__builder = target or ET .TreeBuilder ()
149
-
150
- def handle_starttag (self , tag , attrs ):
151
- attrs = {k : v or '' for k , v in attrs }
152
- self .__builder .start (tag , attrs )
153
- if tag in VOID_ELEMENTS :
154
- self .__builder .end (tag )
155
-
156
- def handle_endtag (self , tag ):
157
- self .__builder .end (tag )
158
-
159
- def handle_startendtag (self , tag , attrs ):
160
- attrs = {k : v or '' for k , v in attrs }
161
- self .__builder .start (tag , attrs )
162
- self .__builder .end (tag )
163
-
164
- def handle_data (self , data ):
165
- self .__builder .data (data )
166
-
167
- def handle_entityref (self , name ):
168
- self .__builder .data (unichr (name2codepoint [name ]))
169
-
170
- def handle_charref (self , name ):
171
- code = int (name [1 :], 16 ) if name .startswith (('x' , 'X' )) else int (name , 10 )
172
- self .__builder .data (unichr (code ))
173
-
174
- def close (self ):
175
- HTMLParser .close (self )
176
- return self .__builder .close ()
177
-
178
-
179
116
# Record for one command yielded by get_commands(): whether it is negated,
# the command itself, its arguments, a line number, and the originating
# line kept as `context`.
Command = namedtuple ('Command' , 'negated cmd args lineno context' )
180
117
181
118
@@ -256,29 +193,11 @@ def get_commands(template):
256
193
yield Command (negated = negated , cmd = cmd , args = args , lineno = lineno + 1 , context = line )
257
194
258
195
259
- def _flatten (node , acc ):
260
- if node .text :
261
- acc .append (node .text )
262
- for e in node :
263
- _flatten (e , acc )
264
- if e .tail :
265
- acc .append (e .tail )
266
-
267
-
268
- def flatten (node ):
269
- acc = []
270
- _flatten (node , acc )
271
- return '' .join (acc )
272
-
273
-
274
196
def normalize_xpath(path):
    """Expand the channel placeholder in *path* and validate that it is absolute.

    Every occurrence of ``{{channel}}`` is replaced with the module-level
    ``channel`` value. The expanded expression is returned unchanged when it
    starts with ``//``.

    Raises:
        InvalidCheck: if the expanded expression does not start with ``//``.
    """
    expanded = path.replace("{{channel}}", channel)
    if expanded.startswith('//'):
        return expanded
    raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
282
201
283
202
284
203
class CachedFiles (object ):
@@ -323,7 +242,7 @@ def get_tree(self, path):
323
242
324
243
with io .open (abspath , encoding = 'utf-8' ) as f :
325
244
try :
326
- tree = ET . fromstringlist ( f . readlines (), CustomHTMLParser ())
245
+ tree = Selector ( text = f . read ())
327
246
except Exception as e :
328
247
raise RuntimeError ('Cannot parse an HTML file {!r}: {}' .format (path , e ))
329
248
self .trees [path ] = tree
@@ -351,7 +270,7 @@ def check_string(data, pat, regexp):
351
270
def check_tree_attr (tree , path , attr , pat , regexp ):
352
271
path = normalize_xpath (path )
353
272
ret = False
354
- for e in tree .findall (path ):
273
+ for e in tree .xpath (path ):
355
274
if attr in e .attrib :
356
275
value = e .attrib [attr ]
357
276
else :
@@ -363,19 +282,19 @@ def check_tree_attr(tree, path, attr, pat, regexp):
363
282
return ret
364
283
365
284
285
def flatten(elem):
    """Return the text selected by ``::text`` on *elem*, joined with no separator.

    *elem* is expected to be a selector object exposing ``css()``; the
    strings produced by ``getall()`` are concatenated in the order returned.
    """
    text_nodes = elem.css('::text')
    return ''.join(text_nodes.getall())
287
+
288
+
366
289
def check_tree_text (tree , path , pat , regexp ):
367
290
path = normalize_xpath (path )
368
291
ret = False
369
292
try :
370
- for e in tree .findall (path ):
371
- try :
372
- value = flatten (e )
373
- except KeyError :
374
- continue
375
- else :
376
- ret = check_string (value , pat , regexp )
377
- if ret :
378
- break
293
+ for e in tree .xpath (path ):
294
+ value = flatten (e )
295
+ ret = check_string (value , pat , regexp )
296
+ if ret :
297
+ break
379
298
except Exception :
380
299
print ('Failed to get path "{}"' .format (path ))
381
300
raise
@@ -384,7 +303,7 @@ def check_tree_text(tree, path, pat, regexp):
384
303
385
304
def get_tree_count(tree, path):
    """Return the number of nodes in *tree* matched by the XPath *path*.

    *path* is first passed through ``normalize_xpath``, so the channel
    placeholder is expanded and non-absolute expressions are rejected there.
    """
    matches = tree.xpath(normalize_xpath(path))
    return len(matches)
388
307
389
308
390
309
def stderr (* args ):
0 commit comments