Skip to content

Commit 93ef66f

Browse files
committed
[SP-2655] Only produce fh2 hash if we detect line endings and is not bin file
1 parent a32f404 commit 93ef66f

File tree

1 file changed

+21
-17
lines changed

1 file changed

+21
-17
lines changed

src/scanoss/winnowing.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -355,10 +355,10 @@ def __strip_snippets(self, file: str, wfp: str) -> str:
355355

356356
def __detect_line_endings(self, contents: bytes) -> Tuple[bool, bool, bool, bool]:
357357
"""Detect the types of line endings present in file contents.
358-
358+
359359
Args:
360360
contents: File contents as bytes.
361-
361+
362362
Returns:
363363
Tuple of (has_crlf, has_lf_only, has_cr_only, has_mixed) indicating which line ending types are present.
364364
"""
@@ -368,51 +368,54 @@ def __detect_line_endings(self, contents: bytes) -> Tuple[bool, bool, bool, bool
368368
has_standalone_lf = b'\n' in content_without_crlf
369369
# For CR detection, we need to find CR that's not part of CRLF
370370
has_standalone_cr = b'\r' in content_without_crlf
371-
371+
372372
# Check if we have mixed line endings
373373
line_ending_count = sum([has_crlf, has_standalone_lf, has_standalone_cr])
374374
has_mixed = line_ending_count > 1
375-
375+
376376
return has_crlf, has_standalone_lf, has_standalone_cr, has_mixed
377377

378-
def __calculate_opposite_line_ending_hash(self, contents: bytes) -> str:
378+
def __calculate_opposite_line_ending_hash(self, contents: bytes):
    """Compute the MD5 of the contents rewritten with the opposite line endings.

    Contents that use only CRLF are hashed in their LF form. Anything else
    (standalone LF, standalone CR, or a mixture) is hashed in its CRLF form.

    Args:
        contents: File contents as bytes.

    Returns:
        Hex digest of the converted contents, or None if no line endings were detected.
    """
    # The fourth element (has_mixed) is not needed to pick the conversion.
    crlf_found, lf_found, cr_found, _ = self.__detect_line_endings(contents)

    # Nothing to convert when the contents hold no line ending at all.
    if not (crlf_found or lf_found or cr_found):
        return None

    # Collapse every line ending to LF; CRLF must be replaced before lone CR.
    unix_form = contents.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

    # Pure CRLF input -> hash the LF form; everything else -> hash the CRLF form.
    windows_only = crlf_found and not (lf_found or cr_found)
    flipped = unix_form if windows_only else unix_form.replace(b'\n', b'\r\n')

    return hashlib.md5(flipped).hexdigest()
404407

405408
def __should_generate_opposite_hash(self, contents: bytes) -> bool:
    """Determine if an opposite line ending hash (fh2) should be generated.

    Args:
        contents: File contents as bytes.

    Returns:
        True if the contents contain at least one line ending (CRLF, LF, or CR),
        False otherwise.
    """
    # __detect_line_endings returns a 4-tuple (has_crlf, has_lf_only, has_cr_only,
    # has_mixed). All four values must be unpacked; binding only three names raises
    # ValueError. The has_mixed flag is not needed for this decision.
    has_crlf, has_standalone_lf, has_standalone_cr, _ = self.__detect_line_endings(contents)

    # Generate fh2 hash when file has any line endings (CRLF, LF, or CR)
    # This allows us to always produce the opposite hash
    return has_crlf or has_standalone_lf or has_standalone_cr
@@ -448,9 +451,10 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
448451
wfp = 'file={0},{1},{2}\n'.format(file_md5, content_length, wfp_filename)
449452

450453
# Add opposite line ending hash based on line ending analysis
451-
if self.__should_generate_opposite_hash(contents):
454+
if not bin_file and self.__should_generate_opposite_hash(contents):
452455
opposite_hash = self.__calculate_opposite_line_ending_hash(contents)
453-
wfp += f'fh2={opposite_hash}\n'
456+
if opposite_hash is not None:
457+
wfp += f'fh2={opposite_hash}\n'
454458

455459
# We don't process snippets for binaries, or other uninteresting files, or if we're requested to skip
456460
if bin_file or self.skip_snippets or self.__skip_snippets(file, contents.decode('utf-8', 'ignore')):

0 commit comments

Comments
 (0)