[SP-2655] Only produce fh2 hash if we detect line endings and is not bin file

matiasdaloia · matiasdaloia · commit 93ef66f8085a · 2025-06-09T16:39:16.000+02:00
diff --git a/src/scanoss/winnowing.py b/src/scanoss/winnowing.py
@@ -355,10 +355,10 @@ def __strip_snippets(self, file: str, wfp: str) -> str:
 
     def __detect_line_endings(self, contents: bytes) -> Tuple[bool, bool, bool, bool]:
         """Detect the types of line endings present in file contents.
-        
+
         Args:
             contents: File contents as bytes.
-            
+
         Returns:
             Tuple of (has_crlf, has_lf_only, has_cr_only, has_mixed) indicating which line ending types are present.
         """
@@ -368,51 +368,54 @@ def __detect_line_endings(self, contents: bytes) -> Tuple[bool, bool, bool, bool
         has_standalone_lf = b'\n' in content_without_crlf
         # For CR detection, we need to find CR that's not part of CRLF
         has_standalone_cr = b'\r' in content_without_crlf
-        
+
         # Check if we have mixed line endings
         line_ending_count = sum([has_crlf, has_standalone_lf, has_standalone_cr])
         has_mixed = line_ending_count > 1
-        
+
         return has_crlf, has_standalone_lf, has_standalone_cr, has_mixed
 
-    def __calculate_opposite_line_ending_hash(self, contents: bytes) -> str:
+    def __calculate_opposite_line_ending_hash(self, contents: bytes):
         """Calculate hash for contents with opposite line endings.
-        
+
         If the file is primarily Unix (LF), calculates Windows (CRLF) hash.
         If the file is primarily Windows (CRLF), calculates Unix (LF) hash.
 
         Args:
             contents: File contents as bytes.
 
         Returns:
-            Hash with opposite line endings as hex string.
+            Hash with opposite line endings as hex string, or None if no line endings detected.
         """
         has_crlf, has_standalone_lf, has_standalone_cr, has_mixed = self.__detect_line_endings(contents)
-        
+
+        if not has_crlf and not has_standalone_lf and not has_standalone_cr:
+            return None
+
         # Normalize all line endings to LF first
         normalized = contents.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
-        
+
         # Determine the dominant line ending type
         if has_crlf and not has_standalone_lf and not has_standalone_cr:
             # File is Windows (CRLF) - produce Unix (LF) hash
             opposite_contents = normalized
         else:
-            # File is Unix (LF/CR) or mixed - produce Windows (CRLF) hash  
+            # File is Unix (LF/CR) or mixed - produce Windows (CRLF) hash
             opposite_contents = normalized.replace(b'\n', b'\r\n')
-            
+
         return hashlib.md5(opposite_contents).hexdigest()
 
     def __should_generate_opposite_hash(self, contents: bytes) -> bool:
         """Determine if an opposite line ending hash (fh2) should be generated.
-        
+
         Args:
             contents: File contents as bytes.
-            
+
         Returns:
             True if fh2 hash should be generated, False otherwise.
         """
-        has_crlf, has_standalone_lf, has_standalone_cr, has_mixed = self.__detect_line_endings(contents)
-        
+        has_crlf, has_standalone_lf, has_standalone_cr = self.__detect_line_endings(contents)
+
         # Generate fh2 hash when file has any line endings (CRLF, LF, or CR)
         # This allows us to always produce the opposite hash
         return has_crlf or has_standalone_lf or has_standalone_cr
@@ -448,9 +451,10 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
         wfp = 'file={0},{1},{2}\n'.format(file_md5, content_length, wfp_filename)
 
         # Add opposite line ending hash based on line ending analysis
-        if self.__should_generate_opposite_hash(contents):
+        if not bin_file and self.__should_generate_opposite_hash(contents):
             opposite_hash = self.__calculate_opposite_line_ending_hash(contents)
-            wfp += f'fh2={opposite_hash}\n'
+            if opposite_hash is not None:
+                wfp += f'fh2={opposite_hash}\n'
 
         # We don't process snippets for binaries, or other uninteresting files, or if we're requested to skip
         if bin_file or self.skip_snippets or self.__skip_snippets(file, contents.decode('utf-8', 'ignore')):