Skip to content

Commit 62ff77d

Browse files
committed
Fixed #401: accommodate extra large favicons
Parse XML files incrementally (element by element) in order to prevent memory exhaustion (actually `InvalidXmlSyntax: Memory allocation failed : Huge input lookup`) caused by a very large `favicon` attribute (base64-encoded illustration).
1 parent 77384d0 commit 62ff77d

File tree

1 file changed

+31
-16
lines changed

1 file changed

+31
-16
lines changed

Diff for: zim/library-mgmt/library-maint.py

+31-16
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,19 @@ def is_latest(fpath: pathlib.Path) -> bool:
339339
return True
340340

341341

342+
def parseable_xml_file(fpath: pathlib.Path) -> bool:
    """Whether Catalog XML file can be parsed as XML.

    The file is read incrementally with ``etree.iterparse`` and every
    ``book`` element encountered must carry a non-empty ``id`` attribute.

    Args:
        fpath: path of the XML file to check.

    Returns:
        True when the whole file was parsed and every ``book`` element has a
        non-empty ``id``; False otherwise (the failure is logged once, with
        its traceback).
    """
    try:
        for _, elem in etree.iterparse(str(fpath)):
            if elem.tag == "book" and not elem.attrib.get("id"):
                # explicit raise rather than `assert`: assertions are stripped
                # when Python runs with -O, which would silently skip the check
                raise ValueError(f"<book> element without an `id` in {fpath}")
            # drop the content of elements already checked so that memory use
            # does not grow with the size of the file
            elem.clear()
    except Exception:
        # deliberate catch-all: any failure means "not parseable" for callers.
        # logger.exception already records message and traceback, so a separate
        # logger.error call would only duplicate the entry.
        logger.exception(f"Unable to parse XML file {fpath}")
        return False
    return True
353+
354+
342355
def convert_pub_library_to_internal(
343356
pub_library_dest: pathlib.Path,
344357
internal_library_dest: pathlib.Path,
@@ -348,20 +361,22 @@ def convert_pub_library_to_internal(
348361
logger.info(f"[LIBS] Preparing internal Library for {internal_library_dest}")
349362

350363
int_library_tmp = get_tmp(internal_library_dest)
351-
tree = etree.parse(str(pub_library_dest)) # noqa: S320
352-
364+
tree = None
365+
encoding = "UTF-8"
353366
with open_chmod(int_library_tmp, "wb", chmod=0o644) as fh:
354-
fh.write(
355-
(
356-
f'<?xml version="{tree.docinfo.xml_version}" '
357-
f'encoding="{tree.docinfo.encoding}" ?>\n'
358-
f"<{tree.docinfo.root_name} "
359-
f'version="{tree.getroot().attrib.get("version")}">\n'
360-
).encode(tree.docinfo.encoding)
361-
)
362-
for elem in tree.iter():
363-
if elem.tag == tree.docinfo.root_name:
364-
continue
367+
368+
for _, elem in etree.iterparse(str(pub_library_dest), tag="book"):
369+
if not tree:
370+
tree = elem.getroottree()
371+
encoding = tree.docinfo.encoding or encoding
372+
fh.write(
373+
(
374+
f'<?xml version="{tree.docinfo.xml_version}" '
375+
f'encoding="{encoding}" ?>\n'
376+
f"<{tree.docinfo.root_name} "
377+
f'version="{tree.getroot().attrib.get("version")}">\n'
378+
).encode(encoding)
379+
)
365380

366381
# internal library path is constructed from relative download path
367382
# in URL and prefix with internal_zim_root
@@ -374,10 +389,10 @@ def convert_pub_library_to_internal(
374389
),
375390
)
376391
fh.write(etree.tostring(elem, encoding=tree.docinfo.encoding))
377-
fh.write(f"</{tree.docinfo.root_name}>\n".encode(tree.docinfo.encoding))
392+
fh.write(f"</{tree.docinfo.root_name}>\n".encode(encoding))
378393

379394
logger.info("[LIBS] Internal Library successfuly generated. Verifying XML…")
380-
etree.parse(str(int_library_tmp)) # noqa: S320
395+
assert parseable_xml_file(int_library_tmp)
381396
logger.info("[LIBS] XML is well formed. Swaping files…")
382397
swap(int_library_tmp, internal_library_dest)
383398
logger.info("[LIBS] > done.")
@@ -741,7 +756,7 @@ def write_public_library(self):
741756
fh.write(b"</library>\n")
742757

743758
logger.info("[LIBS] Public Library successfuly generated. Verifying XML…")
744-
etree.parse(str(pub_library_tmp)) # noqa: S320
759+
assert parseable_xml_file(pub_library_tmp)
745760
logger.info("[LIBS] Public XML is well formed. Swaping files…")
746761
swap(pub_library_tmp, self.pub_library_dest)
747762
logger.info("[LIBS] > done.")

0 commit comments

Comments
 (0)