Skip to content

Commit 9983c7d

Browse files
pythongh-133890: Handle UnicodeEncodeError in tarfile (pythonGH-134147)
UnicodeEncodeError is now handled the same way as OSError during TarFile member extraction.
1 parent 5cbc8c6 commit 9983c7d

File tree

3 files changed

+49
-6
lines changed

3 files changed

+49
-6
lines changed

Lib/tarfile.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2439,7 +2439,7 @@ def _get_extract_tarinfo(self, member, filter_function, path):
24392439
unfiltered = tarinfo
24402440
try:
24412441
tarinfo = filter_function(tarinfo, path)
2442-
except (OSError, FilterError) as e:
2442+
except (OSError, UnicodeEncodeError, FilterError) as e:
24432443
self._handle_fatal_error(e)
24442444
except ExtractError as e:
24452445
self._handle_nonfatal_error(e)
@@ -2460,7 +2460,7 @@ def _extract_one(self, tarinfo, path, set_attrs, numeric_owner):
24602460
self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
24612461
set_attrs=set_attrs,
24622462
numeric_owner=numeric_owner)
2463-
except OSError as e:
2463+
except (OSError, UnicodeEncodeError) as e:
24642464
self._handle_fatal_error(e)
24652465
except ExtractError as e:
24662466
self._handle_nonfatal_error(e)

Lib/test/test_tarfile.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3490,11 +3490,12 @@ class ArchiveMaker:
34903490
with t.open() as tar:
34913491
... # `tar` is now a TarFile with 'filename' in it!
34923492
"""
3493-
def __init__(self):
3493+
def __init__(self, **kwargs):
34943494
self.bio = io.BytesIO()
3495+
self.tar_kwargs = dict(kwargs)
34953496

34963497
def __enter__(self):
3497-
self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio)
3498+
self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs)
34983499
return self
34993500

35003501
def __exit__(self, *exc):
@@ -4073,7 +4074,10 @@ def test_tar_filter(self):
40734074
# that in the test archive.)
40744075
with tarfile.TarFile.open(tarname) as tar:
40754076
for tarinfo in tar.getmembers():
4076-
filtered = tarfile.tar_filter(tarinfo, '')
4077+
try:
4078+
filtered = tarfile.tar_filter(tarinfo, '')
4079+
except UnicodeEncodeError:
4080+
continue
40774081
self.assertIs(filtered.name, tarinfo.name)
40784082
self.assertIs(filtered.type, tarinfo.type)
40794083

@@ -4084,11 +4088,48 @@ def test_data_filter(self):
40844088
for tarinfo in tar.getmembers():
40854089
try:
40864090
filtered = tarfile.data_filter(tarinfo, '')
4087-
except tarfile.FilterError:
4091+
except (tarfile.FilterError, UnicodeEncodeError):
40884092
continue
40894093
self.assertIs(filtered.name, tarinfo.name)
40904094
self.assertIs(filtered.type, tarinfo.type)
40914095

4096+
@unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
4097+
def test_filter_unencodable(self):
4098+
# Sanity check using a valid path.
4099+
tarinfo = tarfile.TarInfo(os_helper.TESTFN)
4100+
filtered = tarfile.tar_filter(tarinfo, '')
4101+
self.assertIs(filtered.name, tarinfo.name)
4102+
filtered = tarfile.data_filter(tarinfo, '')
4103+
self.assertIs(filtered.name, tarinfo.name)
4104+
4105+
tarinfo = tarfile.TarInfo('test\x00')
4106+
self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '')
4107+
self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '')
4108+
tarinfo = tarfile.TarInfo('\ud800')
4109+
self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '')
4110+
self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '')
4111+
4112+
@unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
4113+
def test_extract_unencodable(self):
4114+
# Create a member with name \xed\xa0\x80 which is UTF-8 encoded
4115+
# lone surrogate \ud800.
4116+
with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc:
4117+
arc.add('\udced\udca0\udc80')
4118+
with os_helper.temp_cwd() as tmp:
4119+
tar = arc.open(encoding='utf-8', errors='surrogatepass',
4120+
errorlevel=1)
4121+
self.assertEqual(tar.getnames(), ['\ud800'])
4122+
with self.assertRaises(UnicodeEncodeError):
4123+
tar.extractall()
4124+
self.assertEqual(os.listdir(), [])
4125+
4126+
tar = arc.open(encoding='utf-8', errors='surrogatepass',
4127+
errorlevel=0, debug=1)
4128+
with support.captured_stderr() as stderr:
4129+
tar.extractall()
4130+
self.assertEqual(os.listdir(), [])
4131+
self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue())
4132+
40924133
def test_change_default_filter_on_instance(self):
40934134
tar = tarfile.TarFile(tarname, 'r')
40944135
def strict_filter(tarinfo, path):
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
The :mod:`tarfile` module now handles :exc:`UnicodeEncodeError` in the same
2+
way as :exc:`OSError` when it cannot extract a member.

0 commit comments

Comments
 (0)