@@ -339,6 +339,19 @@ def is_latest(fpath: pathlib.Path) -> bool:
339
339
return True
340
340
341
341
342
def parseable_xml_file(fpath: pathlib.Path) -> bool:
    """Whether the Catalog XML file at `fpath` can be parsed as XML.

    The file is streamed with `etree.iterparse` and every `<book>` element
    is required to carry a non-empty `id` attribute.

    Parameters:
        fpath: path to the XML file to verify.

    Returns:
        True if the whole file was parsed and every `<book>` has an `id`;
        False otherwise (the failure is logged, never raised).
    """
    try:
        for _, elem in etree.iterparse(str(fpath)):
            if elem.tag != "book":
                continue
            # explicit check instead of `assert`: assert statements are
            # removed when Python runs with -O, which would silently
            # disable this validation
            if not elem.attrib.get("id"):
                raise ValueError("<book> element without an `id` attribute")
            # release the element's content once checked so that memory
            # usage stays low while streaming through large files
            elem.clear()
    except Exception as exc:
        # logger.exception() already logs at ERROR level (with traceback);
        # a preceding logger.error() would only duplicate the entry
        logger.exception(f"[LIBS] Unable to parse {fpath} as XML: {exc}")
        return False
    return True
353
+
354
+
342
355
def convert_pub_library_to_internal (
343
356
pub_library_dest : pathlib .Path ,
344
357
internal_library_dest : pathlib .Path ,
@@ -348,20 +361,22 @@ def convert_pub_library_to_internal(
348
361
logger .info (f"[LIBS] Preparing internal Library for { internal_library_dest } " )
349
362
350
363
int_library_tmp = get_tmp (internal_library_dest )
351
- tree = etree . parse ( str ( pub_library_dest )) # noqa: S320
352
-
364
+ tree = None
365
+ encoding = "UTF-8"
353
366
with open_chmod (int_library_tmp , "wb" , chmod = 0o644 ) as fh :
354
- fh .write (
355
- (
356
- f'<?xml version="{ tree .docinfo .xml_version } " '
357
- f'encoding="{ tree .docinfo .encoding } " ?>\n '
358
- f"<{ tree .docinfo .root_name } "
359
- f'version="{ tree .getroot ().attrib .get ("version" )} ">\n '
360
- ).encode (tree .docinfo .encoding )
361
- )
362
- for elem in tree .iter ():
363
- if elem .tag == tree .docinfo .root_name :
364
- continue
367
+
368
+ for _ , elem in etree .iterparse (str (pub_library_dest ), tag = "book" ):
369
+ if not tree :
370
+ tree = elem .getroottree ()
371
+ encoding = tree .docinfo .encoding or encoding
372
+ fh .write (
373
+ (
374
+ f'<?xml version="{ tree .docinfo .xml_version } " '
375
+ f'encoding="{ encoding } " ?>\n '
376
+ f"<{ tree .docinfo .root_name } "
377
+ f'version="{ tree .getroot ().attrib .get ("version" )} ">\n '
378
+ ).encode (encoding )
379
+ )
365
380
366
381
# internal library path is constructed from relative download path
367
382
# in URL and prefix with internal_zim_root
@@ -374,10 +389,10 @@ def convert_pub_library_to_internal(
374
389
),
375
390
)
376
391
fh .write (etree .tostring (elem , encoding = tree .docinfo .encoding ))
377
- fh .write (f"</{ tree .docinfo .root_name } >\n " .encode (tree . docinfo . encoding ))
392
+ fh .write (f"</{ tree .docinfo .root_name } >\n " .encode (encoding ))
378
393
379
394
logger .info ("[LIBS] Internal Library successfuly generated. Verifying XML…" )
380
- etree . parse ( str ( int_library_tmp )) # noqa: S320
395
+ assert parseable_xml_file ( int_library_tmp )
381
396
logger .info ("[LIBS] XML is well formed. Swaping files…" )
382
397
swap (int_library_tmp , internal_library_dest )
383
398
logger .info ("[LIBS] > done." )
@@ -741,7 +756,7 @@ def write_public_library(self):
741
756
fh .write (b"</library>\n " )
742
757
743
758
logger .info ("[LIBS] Public Library successfuly generated. Verifying XML…" )
744
- etree . parse ( str ( pub_library_tmp )) # noqa: S320
759
+ assert parseable_xml_file ( pub_library_tmp )
745
760
logger .info ("[LIBS] Public XML is well formed. Swaping files…" )
746
761
swap (pub_library_tmp , self .pub_library_dest )
747
762
logger .info ("[LIBS] > done." )
0 commit comments