Fix #239

cowtowncoder · cowtowncoder · commit f64a88645d84 · 2021-01-30T20:27:22.000-08:00
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java
@@ -1232,7 +1232,7 @@ public String nextFieldName() throws IOException
                     if (name != null) {
                         _inputPtr += lenMarker;
                     } else {
-                        name = _decodeShortName(lenMarker);
+                        name = _decodeContiguousName(lenMarker);
                         name = _addDecodedToSymbols(lenMarker, name);
                     }
                 }
@@ -2122,17 +2122,21 @@ protected void _finishToken() throws IOException
             }
             return;
         }
-        if (len > (_inputEnd - _inputPtr)) {
-            // or if not, could we read?
-            if (len >= _inputBuffer.length) {
-                // If not enough space, need handling similar to chunked
-                _finishLongText(len);
+        // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
+        //    the longest individual unit is 4 bytes (surrogate pair) so we
+        //    actually need len+3 bytes to avoid bounds checks
+        final int needed = len + 3;
+        final int available = _inputEnd - _inputPtr;
+
+        if ((available >= needed)
+                // if not, could we read? NOTE: we do not require it, just attempt to read
+                    || ((_inputBuffer.length >= needed)
+                            && _tryToLoadToHaveAtLeast(needed))) {
+                _finishShortText(len);
                 return;
-            }
-            _loadToHaveAtLeast(len);
         }
-        // offline for better optimization
-        _finishShortText(len);
+        // If not enough space, need handling similar to chunked
+        _finishLongText(len);
     }
 
     /**
@@ -2184,7 +2188,7 @@ private final String _finishShortText(int len) throws IOException
         if (outBuf.length < len) { // one minor complication
             outBuf = _textBuffer.expandCurrentSegment(len);
         }
-        
+
         int outPtr = 0;
         int inPtr = _inputPtr;
         _inputPtr += len;
@@ -2200,33 +2204,47 @@ private final String _finishShortText(int len) throws IOException
                 return _textBuffer.setCurrentAndReturn(outPtr);
             }
         }
-
         final int[] codes = UTF8_UNIT_CODES;
         do {
             i = inputBuf[inPtr++] & 0xFF;
             switch (codes[i]) {
             case 0:
                 break;
             case 1:
-                i = ((i & 0x1F) << 6) | (inputBuf[inPtr++] & 0x3F);
+                {
+                    final int c2 = inputBuf[inPtr++];
+                    if ((c2 & 0xC0) != 0x080) {
+                        _reportInvalidOther(c2 & 0xFF, inPtr);
+                    }
+                    i = ((i & 0x1F) << 6) | (c2 & 0x3F);
+                }
                 break;
             case 2:
-                i = ((i & 0x0F) << 12)
-                   | ((inputBuf[inPtr++] & 0x3F) << 6)
-                   | (inputBuf[inPtr++] & 0x3F);
+                {
+                    final int c2 = inputBuf[inPtr++];
+                    if ((c2 & 0xC0) != 0x080) {
+                        _reportInvalidOther(c2 & 0xFF, inPtr);
+                    }
+                    final int c3 = inputBuf[inPtr++];
+                    if ((c3 & 0xC0) != 0x080) {
+                        _reportInvalidOther(c3 & 0xFF, inPtr);
+                    }
+                    i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
+                }
                 break;
             case 3:
+                // 30-Jan-2021, tatu: TODO - validate these too?
                 i = ((i & 0x07) << 18)
-                 | ((inputBuf[inPtr++] & 0x3F) << 12)
-                 | ((inputBuf[inPtr++] & 0x3F) << 6)
-                 | (inputBuf[inPtr++] & 0x3F);
+                    | ((inputBuf[inPtr++] & 0x3F) << 12)
+                    | ((inputBuf[inPtr++] & 0x3F) << 6)
+                    | (inputBuf[inPtr++] & 0x3F);
                 // note: this is the codepoint value; need to split, too
                 i -= 0x10000;
                 outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
                 i = 0xDC00 | (i & 0x3FF);
                 break;
             default: // invalid
-                _reportError("Invalid byte "+Integer.toHexString(i)+" in Unicode text block");
+                _reportInvalidInitial(i);
             }
             outBuf[outPtr++] = (char) i;
         } while (inPtr < end);
@@ -2594,7 +2612,7 @@ protected final JsonToken _decodePropertyName() throws IOException
                 if (name != null) {
                     _inputPtr += lenMarker;
                 } else {
-                    name = _decodeShortName(lenMarker);
+                    name = _decodeContiguousName(lenMarker);
                     name = _addDecodedToSymbols(lenMarker, name);
                 }
             }
@@ -2610,7 +2628,7 @@ protected final JsonToken _decodePropertyName() throws IOException
         return JsonToken.FIELD_NAME;
     }
     
-    private final String _decodeShortName(int len) throws IOException
+    private final String _decodeContiguousName(int len) throws IOException
     {
         // note: caller ensures we have enough bytes available
         int outPtr = 0;
@@ -2623,7 +2641,7 @@ private final String _decodeShortName(int len) throws IOException
         final int[] codes = UTF8_UNIT_CODES;
         final byte[] inBuf = _inputBuffer;
 
-        // First a tight loop for Ascii
+        // First a tight loop for ASCII
         final int end = inPtr + len;
         while (true) {
             int i = inBuf[inPtr] & 0xFF;
@@ -2645,25 +2663,40 @@ private final String _decodeShortName(int len) throws IOException
                 // trickiest one, need surrogate handling
                 switch (code) {
                 case 1:
-                    i = ((i & 0x1F) << 6) | (inBuf[inPtr++] & 0x3F);
+                    {
+                        final int c2 = inBuf[inPtr++];
+                        if ((c2 & 0xC0) != 0x080) {
+                            _reportInvalidOther(c2 & 0xFF, inPtr);
+                        }
+                        i = ((i & 0x1F) << 6) | (c2 & 0x3F);
+                    }
                     break;
                 case 2:
-                    i = ((i & 0x0F) << 12)
-                    | ((inBuf[inPtr++] & 0x3F) << 6)
-                    | (inBuf[inPtr++] & 0x3F);
+                    {
+                        final int c2 = inBuf[inPtr++];
+                        if ((c2 & 0xC0) != 0x080) {
+                            _reportInvalidOther(c2 & 0xFF, inPtr);
+                        }
+                        final int c3 = inBuf[inPtr++];
+                        if ((c3 & 0xC0) != 0x080) {
+                            _reportInvalidOther(c3 & 0xFF, inPtr);
+                        }
+                        i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
+                    }
                     break;
                 case 3:
+                    // 30-Jan-2021, tatu: TODO - validate surrogate case too?
                     i = ((i & 0x07) << 18)
-                    | ((inBuf[inPtr++] & 0x3F) << 12)
-                    | ((inBuf[inPtr++] & 0x3F) << 6)
-                    | (inBuf[inPtr++] & 0x3F);
+                        | ((inBuf[inPtr++] & 0x3F) << 12)
+                        | ((inBuf[inPtr++] & 0x3F) << 6)
+                        | (inBuf[inPtr++] & 0x3F);
                     // note: this is the codepoint value; need to split, too
                     i -= 0x10000;
                     outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
                     i = 0xDC00 | (i & 0x3FF);
                     break;
                 default: // invalid
-                    _reportError("Invalid byte "+Integer.toHexString(i)+" in Object name");
+                    _reportError("Invalid UTF-8 byte 0x"+Integer.toHexString(i)+" in Object property name");
                 }
             }
             outBuf[outPtr++] = (char) i;
@@ -2688,7 +2721,7 @@ private final String _decodeLongerName(int len) throws IOException
             _inputPtr += len;
             return name;
         }
-        name = _decodeShortName(len);
+        name = _decodeContiguousName(len);
         return _addDecodedToSymbols(len, name);
     }
     
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java
@@ -54,4 +54,74 @@ public void testShortString236TruncatedString() throws Exception
             }
         }
     }
+
+    public void testShortString237InvalidTextValue() throws Exception
+    {
+        // String with length of 2 bytes, but a few null bytes as fillers to
+        // avoid buffer boundary
+        // (2nd byte implies 2-byte sequence but 3rd byte does not have high-bit set)
+        byte[] input2 = {0x62, (byte) 0xCF, 0x2d,
+                0, 0, 0, 0, 0, 0};
+        try (CBORParser p = cborParser(input2)) {
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            try {
+                String str = p.getText();
+                fail("Should have failed, did not, String = '"+str+"'");
+            } catch (StreamReadException e) {
+                verifyException(e, "Invalid UTF-8 middle byte 0x2d");
+            }
+        }
+
+        // but let's also validate 3-byte variant as well
+        byte[] input3 = {0x63, (byte) 0xEF, (byte) 0x8e, 0x2d,
+                0, 0, 0, 0, 0, 0};
+        try (CBORParser p = cborParser(input3)) {
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            try {
+                String str = p.getText();
+                fail("Should have failed, did not, String = '"+str+"'");
+            } catch (StreamReadException e) {
+                verifyException(e, "Invalid UTF-8 middle byte 0x2d");
+            }
+        }
+    }
+
+    public void testShortString237InvalidName() throws Exception
+    {
+        // Object with 2-byte invalid name
+        byte[] input2 = { (byte) 0xBF, // Object, indefinite length
+                0x62, (byte) 0xCF, 0x2e, // 2-byte name but invalid second byte
+                0x21, // int value of 33
+                (byte) 0xFF, // Object END marker
+                0, 0, 0, 0 // padding
+        };
+        try (CBORParser p = cborParser(input2)) {
+            assertToken(JsonToken.START_OBJECT, p.nextToken());
+            try {
+                p.nextToken();
+                String str = p.getText();
+                fail("Should have failed, did not, String = '"+str+"'");
+            } catch (StreamReadException e) {
+                verifyException(e, "Invalid UTF-8 middle byte 0x2e");
+            }
+        }
+
+        // but let's also validate 3-byte variant as well
+        byte[] input3 = { (byte) 0xBF, // Object, indefinite length
+                0x62, (byte) 0xEF, (byte) 0x8e, 0x2f, // 3-byte name but invalid third byte
+                0x22, // int value of 34
+                (byte) 0xFF, // Object END marker
+                0, 0, 0, 0 // padding
+        };
+        try (CBORParser p = cborParser(input3)) {
+            assertToken(JsonToken.START_OBJECT, p.nextToken());
+            try {
+                p.nextToken();
+                String str = p.getText();
+                fail("Should have failed, did not, String = '"+str+"'");
+            } catch (StreamReadException e) {
+                verifyException(e, "Invalid UTF-8 middle byte 0x2f");
+            }
+        }
+    }
 }
diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x
@@ -11,7 +11,7 @@ Project: jackson-datatypes-binaryModules:
 
 2.13.0 (not yet released)
 
-No changes since 2.12
+#239: Should validate UTF-8 multi-byte validity for short decode path too
 
 2.12.2 (not yet released)