Skip to content

Commit fc496cf

Browse files
Switch tester to parsel
1 parent 40ebd07 commit fc496cf

File tree

1 file changed

+15
-96
lines changed

1 file changed

+15
-96
lines changed

src/etc/htmldocck.py

Lines changed: 15 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -110,72 +110,9 @@
110110
import re
111111
import shlex
112112
from collections import namedtuple
113-
try:
114-
from html.parser import HTMLParser
115-
except ImportError:
116-
from HTMLParser import HTMLParser
117-
try:
118-
from xml.etree import cElementTree as ET
119-
except ImportError:
120-
from xml.etree import ElementTree as ET
121-
122-
try:
123-
from html.entities import name2codepoint
124-
except ImportError:
125-
from htmlentitydefs import name2codepoint
126-
127-
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
128-
VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
129-
'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}
130-
131-
# Python 2 -> 3 compatibility
132-
try:
133-
unichr
134-
except NameError:
135-
unichr = chr
136-
113+
from parsel import Selector
137114

138115
channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]
139-
140-
class CustomHTMLParser(HTMLParser):
141-
"""simplified HTML parser.
142-
143-
this is possible because we are dealing with very regular HTML from
144-
rustdoc; we only have to deal with i) void elements and ii) empty
145-
attributes."""
146-
def __init__(self, target=None):
147-
HTMLParser.__init__(self)
148-
self.__builder = target or ET.TreeBuilder()
149-
150-
def handle_starttag(self, tag, attrs):
151-
attrs = {k: v or '' for k, v in attrs}
152-
self.__builder.start(tag, attrs)
153-
if tag in VOID_ELEMENTS:
154-
self.__builder.end(tag)
155-
156-
def handle_endtag(self, tag):
157-
self.__builder.end(tag)
158-
159-
def handle_startendtag(self, tag, attrs):
160-
attrs = {k: v or '' for k, v in attrs}
161-
self.__builder.start(tag, attrs)
162-
self.__builder.end(tag)
163-
164-
def handle_data(self, data):
165-
self.__builder.data(data)
166-
167-
def handle_entityref(self, name):
168-
self.__builder.data(unichr(name2codepoint[name]))
169-
170-
def handle_charref(self, name):
171-
code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
172-
self.__builder.data(unichr(code))
173-
174-
def close(self):
175-
HTMLParser.close(self)
176-
return self.__builder.close()
177-
178-
179116
Command = namedtuple('Command', 'negated cmd args lineno context')
180117

181118

@@ -256,29 +193,11 @@ def get_commands(template):
256193
yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
257194

258195

259-
def _flatten(node, acc):
260-
if node.text:
261-
acc.append(node.text)
262-
for e in node:
263-
_flatten(e, acc)
264-
if e.tail:
265-
acc.append(e.tail)
266-
267-
268-
def flatten(node):
269-
acc = []
270-
_flatten(node, acc)
271-
return ''.join(acc)
272-
273-
274196
def normalize_xpath(path):
275197
path = path.replace("{{channel}}", channel)
276-
if path.startswith('//'):
277-
return '.' + path # avoid warnings
278-
elif path.startswith('.//'):
279-
return path
280-
else:
198+
if not path.startswith('//'):
281199
raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
200+
return path
282201

283202

284203
class CachedFiles(object):
@@ -323,7 +242,7 @@ def get_tree(self, path):
323242

324243
with io.open(abspath, encoding='utf-8') as f:
325244
try:
326-
tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
245+
tree = Selector(text=f.read())
327246
except Exception as e:
328247
raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
329248
self.trees[path] = tree
@@ -351,7 +270,7 @@ def check_string(data, pat, regexp):
351270
def check_tree_attr(tree, path, attr, pat, regexp):
352271
path = normalize_xpath(path)
353272
ret = False
354-
for e in tree.findall(path):
273+
for e in tree.xpath(path):
355274
if attr in e.attrib:
356275
value = e.attrib[attr]
357276
else:
@@ -363,19 +282,19 @@ def check_tree_attr(tree, path, attr, pat, regexp):
363282
return ret
364283

365284

285+
def flatten(elem):
286+
return ''.join(elem.css('::text').getall())
287+
288+
366289
def check_tree_text(tree, path, pat, regexp):
367290
path = normalize_xpath(path)
368291
ret = False
369292
try:
370-
for e in tree.findall(path):
371-
try:
372-
value = flatten(e)
373-
except KeyError:
374-
continue
375-
else:
376-
ret = check_string(value, pat, regexp)
377-
if ret:
378-
break
293+
for e in tree.xpath(path):
294+
value = flatten(e)
295+
ret = check_string(value, pat, regexp)
296+
if ret:
297+
break
379298
except Exception:
380299
print('Failed to get path "{}"'.format(path))
381300
raise
@@ -384,7 +303,7 @@ def check_tree_text(tree, path, pat, regexp):
384303

385304
def get_tree_count(tree, path):
386305
path = normalize_xpath(path)
387-
return len(tree.findall(path))
306+
return len(tree.xpath(path))
388307

389308

390309
def stderr(*args):

0 commit comments

Comments
 (0)