Skip to content

Commit f64a886

Browse files
committed
Fix #239
1 parent 6bf8509 commit f64a886

File tree

3 files changed

+136
-33
lines changed

3 files changed

+136
-33
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java

Lines changed: 65 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,7 +1232,7 @@ public String nextFieldName() throws IOException
12321232
if (name != null) {
12331233
_inputPtr += lenMarker;
12341234
} else {
1235-
name = _decodeShortName(lenMarker);
1235+
name = _decodeContiguousName(lenMarker);
12361236
name = _addDecodedToSymbols(lenMarker, name);
12371237
}
12381238
}
@@ -2122,17 +2122,21 @@ protected void _finishToken() throws IOException
21222122
}
21232123
return;
21242124
}
2125-
if (len > (_inputEnd - _inputPtr)) {
2126-
// or if not, could we read?
2127-
if (len >= _inputBuffer.length) {
2128-
// If not enough space, need handling similar to chunked
2129-
_finishLongText(len);
2125+
// 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2126+
// the longest individual unit is 4 bytes (surrogate pair) so we
2127+
// actually need len+3 bytes to avoid bounds checks
2128+
final int needed = len + 3;
2129+
final int available = _inputEnd - _inputPtr;
2130+
2131+
if ((available >= needed)
2132+
// if not, could we read? NOTE: we do not require it, just attempt to read
2133+
|| ((_inputBuffer.length >= needed)
2134+
&& _tryToLoadToHaveAtLeast(needed))) {
2135+
_finishShortText(len);
21302136
return;
2131-
}
2132-
_loadToHaveAtLeast(len);
21332137
}
2134-
// offline for better optimization
2135-
_finishShortText(len);
2138+
// If not enough space, need handling similar to chunked
2139+
_finishLongText(len);
21362140
}
21372141

21382142
/**
@@ -2184,7 +2188,7 @@ private final String _finishShortText(int len) throws IOException
21842188
if (outBuf.length < len) { // one minor complication
21852189
outBuf = _textBuffer.expandCurrentSegment(len);
21862190
}
2187-
2191+
21882192
int outPtr = 0;
21892193
int inPtr = _inputPtr;
21902194
_inputPtr += len;
@@ -2200,33 +2204,47 @@ private final String _finishShortText(int len) throws IOException
22002204
return _textBuffer.setCurrentAndReturn(outPtr);
22012205
}
22022206
}
2203-
22042207
final int[] codes = UTF8_UNIT_CODES;
22052208
do {
22062209
i = inputBuf[inPtr++] & 0xFF;
22072210
switch (codes[i]) {
22082211
case 0:
22092212
break;
22102213
case 1:
2211-
i = ((i & 0x1F) << 6) | (inputBuf[inPtr++] & 0x3F);
2214+
{
2215+
final int c2 = inputBuf[inPtr++];
2216+
if ((c2 & 0xC0) != 0x080) {
2217+
_reportInvalidOther(c2 & 0xFF, inPtr);
2218+
}
2219+
i = ((i & 0x1F) << 6) | (c2 & 0x3F);
2220+
}
22122221
break;
22132222
case 2:
2214-
i = ((i & 0x0F) << 12)
2215-
| ((inputBuf[inPtr++] & 0x3F) << 6)
2216-
| (inputBuf[inPtr++] & 0x3F);
2223+
{
2224+
final int c2 = inputBuf[inPtr++];
2225+
if ((c2 & 0xC0) != 0x080) {
2226+
_reportInvalidOther(c2 & 0xFF, inPtr);
2227+
}
2228+
final int c3 = inputBuf[inPtr++];
2229+
if ((c3 & 0xC0) != 0x080) {
2230+
_reportInvalidOther(c3 & 0xFF, inPtr);
2231+
}
2232+
i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
2233+
}
22172234
break;
22182235
case 3:
2236+
// 30-Jan-2021, tatu: TODO - validate these too?
22192237
i = ((i & 0x07) << 18)
2220-
| ((inputBuf[inPtr++] & 0x3F) << 12)
2221-
| ((inputBuf[inPtr++] & 0x3F) << 6)
2222-
| (inputBuf[inPtr++] & 0x3F);
2238+
| ((inputBuf[inPtr++] & 0x3F) << 12)
2239+
| ((inputBuf[inPtr++] & 0x3F) << 6)
2240+
| (inputBuf[inPtr++] & 0x3F);
22232241
// note: this is the codepoint value; need to split, too
22242242
i -= 0x10000;
22252243
outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
22262244
i = 0xDC00 | (i & 0x3FF);
22272245
break;
22282246
default: // invalid
2229-
_reportError("Invalid byte "+Integer.toHexString(i)+" in Unicode text block");
2247+
_reportInvalidInitial(i);
22302248
}
22312249
outBuf[outPtr++] = (char) i;
22322250
} while (inPtr < end);
@@ -2594,7 +2612,7 @@ protected final JsonToken _decodePropertyName() throws IOException
25942612
if (name != null) {
25952613
_inputPtr += lenMarker;
25962614
} else {
2597-
name = _decodeShortName(lenMarker);
2615+
name = _decodeContiguousName(lenMarker);
25982616
name = _addDecodedToSymbols(lenMarker, name);
25992617
}
26002618
}
@@ -2610,7 +2628,7 @@ protected final JsonToken _decodePropertyName() throws IOException
26102628
return JsonToken.FIELD_NAME;
26112629
}
26122630

2613-
private final String _decodeShortName(int len) throws IOException
2631+
private final String _decodeContiguousName(int len) throws IOException
26142632
{
26152633
// note: caller ensures we have enough bytes available
26162634
int outPtr = 0;
@@ -2623,7 +2641,7 @@ private final String _decodeShortName(int len) throws IOException
26232641
final int[] codes = UTF8_UNIT_CODES;
26242642
final byte[] inBuf = _inputBuffer;
26252643

2626-
// First a tight loop for Ascii
2644+
// First a tight loop for ASCII
26272645
final int end = inPtr + len;
26282646
while (true) {
26292647
int i = inBuf[inPtr] & 0xFF;
@@ -2645,25 +2663,40 @@ private final String _decodeShortName(int len) throws IOException
26452663
// trickiest one, need surrogate handling
26462664
switch (code) {
26472665
case 1:
2648-
i = ((i & 0x1F) << 6) | (inBuf[inPtr++] & 0x3F);
2666+
{
2667+
final int c2 = inBuf[inPtr++];
2668+
if ((c2 & 0xC0) != 0x080) {
2669+
_reportInvalidOther(c2 & 0xFF, inPtr);
2670+
}
2671+
i = ((i & 0x1F) << 6) | (c2 & 0x3F);
2672+
}
26492673
break;
26502674
case 2:
2651-
i = ((i & 0x0F) << 12)
2652-
| ((inBuf[inPtr++] & 0x3F) << 6)
2653-
| (inBuf[inPtr++] & 0x3F);
2675+
{
2676+
final int c2 = inBuf[inPtr++];
2677+
if ((c2 & 0xC0) != 0x080) {
2678+
_reportInvalidOther(c2 & 0xFF, inPtr);
2679+
}
2680+
final int c3 = inBuf[inPtr++];
2681+
if ((c3 & 0xC0) != 0x080) {
2682+
_reportInvalidOther(c3 & 0xFF, inPtr);
2683+
}
2684+
i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
2685+
}
26542686
break;
26552687
case 3:
2688+
// 30-Jan-2021, tatu: TODO - validate surrogate case too?
26562689
i = ((i & 0x07) << 18)
2657-
| ((inBuf[inPtr++] & 0x3F) << 12)
2658-
| ((inBuf[inPtr++] & 0x3F) << 6)
2659-
| (inBuf[inPtr++] & 0x3F);
2690+
| ((inBuf[inPtr++] & 0x3F) << 12)
2691+
| ((inBuf[inPtr++] & 0x3F) << 6)
2692+
| (inBuf[inPtr++] & 0x3F);
26602693
// note: this is the codepoint value; need to split, too
26612694
i -= 0x10000;
26622695
outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
26632696
i = 0xDC00 | (i & 0x3FF);
26642697
break;
26652698
default: // invalid
2666-
_reportError("Invalid byte "+Integer.toHexString(i)+" in Object name");
2699+
_reportError("Invalid UTF-8 byte 0x"+Integer.toHexString(i)+" in Object property name");
26672700
}
26682701
}
26692702
outBuf[outPtr++] = (char) i;
@@ -2688,7 +2721,7 @@ private final String _decodeLongerName(int len) throws IOException
26882721
_inputPtr += len;
26892722
return name;
26902723
}
2691-
name = _decodeShortName(len);
2724+
name = _decodeContiguousName(len);
26922725
return _addDecodedToSymbols(len, name);
26932726
}
26942727

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,74 @@ public void testShortString236TruncatedString() throws Exception
5454
}
5555
}
5656
}
57+
58+
public void testShortString237InvalidTextValue() throws Exception
59+
{
60+
// String with length of 2 bytes, but a few null bytes as fillers to
61+
// avoid buffer boundary
62+
// (2nd byte implies 2-byte sequence but 3rd byte does not have high-bit set)
63+
byte[] input2 = {0x62, (byte) 0xCF, 0x2d,
64+
0, 0, 0, 0, 0, 0};
65+
try (CBORParser p = cborParser(input2)) {
66+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
67+
try {
68+
String str = p.getText();
69+
fail("Should have failed, did not, String = '"+str+"'");
70+
} catch (StreamReadException e) {
71+
verifyException(e, "Invalid UTF-8 middle byte 0x2d");
72+
}
73+
}
74+
75+
// but let's also validate 3-byte variant as well
76+
byte[] input3 = {0x63, (byte) 0xEF, (byte) 0x8e, 0x2d,
77+
0, 0, 0, 0, 0, 0};
78+
try (CBORParser p = cborParser(input3)) {
79+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
80+
try {
81+
String str = p.getText();
82+
fail("Should have failed, did not, String = '"+str+"'");
83+
} catch (StreamReadException e) {
84+
verifyException(e, "Invalid UTF-8 middle byte 0x2d");
85+
}
86+
}
87+
}
88+
89+
public void testShortString237InvalidName() throws Exception
90+
{
91+
// Object with 2-byte invalid name
92+
byte[] input2 = { (byte) 0xBF, // Object, indefinite length
93+
0x62, (byte) 0xCF, 0x2e, // 2-byte name but invalid second byte
94+
0x21, // int value of 33
95+
(byte) 0xFF, // Object END marker
96+
0, 0, 0, 0 // padding
97+
};
98+
try (CBORParser p = cborParser(input2)) {
99+
assertToken(JsonToken.START_OBJECT, p.nextToken());
100+
try {
101+
p.nextToken();
102+
String str = p.getText();
103+
fail("Should have failed, did not, String = '"+str+"'");
104+
} catch (StreamReadException e) {
105+
verifyException(e, "Invalid UTF-8 middle byte 0x2e");
106+
}
107+
}
108+
109+
// but let's also validate 3-byte variant as well
110+
byte[] input3 = { (byte) 0xBF, // Object, indefinite length
111+
0x62, (byte) 0xEF, (byte) 0x8e, 0x2f, // 3-byte name but invalid third byte
112+
0x22, // int value of 34
113+
(byte) 0xFF, // Object END marker
114+
0, 0, 0, 0 // padding
115+
};
116+
try (CBORParser p = cborParser(input3)) {
117+
assertToken(JsonToken.START_OBJECT, p.nextToken());
118+
try {
119+
p.nextToken();
120+
String str = p.getText();
121+
fail("Should have failed, did not, String = '"+str+"'");
122+
} catch (StreamReadException e) {
123+
verifyException(e, "Invalid UTF-8 middle byte 0x2f");
124+
}
125+
}
126+
}
57127
}

release-notes/VERSION-2.x

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Project: jackson-datatypes-binaryModules:
1111

1212
2.13.0 (not yet released)
1313

14-
No changes since 2.12
14+
#239: Should validate UTF-8 multi-byte validity for short decode path too
1515

1616
2.12.2 (not yet released)
1717

0 commit comments

Comments
 (0)