@@ -1232,7 +1232,7 @@ public String nextFieldName() throws IOException
1232
1232
if (name != null ) {
1233
1233
_inputPtr += lenMarker ;
1234
1234
} else {
1235
- name = _decodeShortName (lenMarker );
1235
+ name = _decodeContiguousName (lenMarker );
1236
1236
name = _addDecodedToSymbols (lenMarker , name );
1237
1237
}
1238
1238
}
@@ -2122,17 +2122,21 @@ protected void _finishToken() throws IOException
2122
2122
}
2123
2123
return ;
2124
2124
}
2125
- if (len > (_inputEnd - _inputPtr )) {
2126
- // or if not, could we read?
2127
- if (len >= _inputBuffer .length ) {
2128
- // If not enough space, need handling similar to chunked
2129
- _finishLongText (len );
2125
+ // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2126
+ // the longest individual unit is 4 bytes (surrogate pair) so we
2127
+ // actually need len+3 bytes to avoid bounds checks
2128
+ final int needed = len + 3 ;
2129
+ final int available = _inputEnd - _inputPtr ;
2130
+
2131
+ if ((available >= needed )
2132
+ // if not, could we read? NOTE: we do not require it, just attempt to read
2133
+ || ((_inputBuffer .length >= needed )
2134
+ && _tryToLoadToHaveAtLeast (needed ))) {
2135
+ _finishShortText (len );
2130
2136
return ;
2131
- }
2132
- _loadToHaveAtLeast (len );
2133
2137
}
2134
- // offline for better optimization
2135
- _finishShortText (len );
2138
+ // If not enough space, need handling similar to chunked
2139
+ _finishLongText (len );
2136
2140
}
2137
2141
2138
2142
/**
@@ -2184,7 +2188,7 @@ private final String _finishShortText(int len) throws IOException
2184
2188
if (outBuf .length < len ) { // one minor complication
2185
2189
outBuf = _textBuffer .expandCurrentSegment (len );
2186
2190
}
2187
-
2191
+
2188
2192
int outPtr = 0 ;
2189
2193
int inPtr = _inputPtr ;
2190
2194
_inputPtr += len ;
@@ -2200,33 +2204,47 @@ private final String _finishShortText(int len) throws IOException
2200
2204
return _textBuffer .setCurrentAndReturn (outPtr );
2201
2205
}
2202
2206
}
2203
-
2204
2207
final int [] codes = UTF8_UNIT_CODES ;
2205
2208
do {
2206
2209
i = inputBuf [inPtr ++] & 0xFF ;
2207
2210
switch (codes [i ]) {
2208
2211
case 0 :
2209
2212
break ;
2210
2213
case 1 :
2211
- i = ((i & 0x1F ) << 6 ) | (inputBuf [inPtr ++] & 0x3F );
2214
+ {
2215
+ final int c2 = inputBuf [inPtr ++];
2216
+ if ((c2 & 0xC0 ) != 0x080 ) {
2217
+ _reportInvalidOther (c2 & 0xFF , inPtr );
2218
+ }
2219
+ i = ((i & 0x1F ) << 6 ) | (c2 & 0x3F );
2220
+ }
2212
2221
break ;
2213
2222
case 2 :
2214
- i = ((i & 0x0F ) << 12 )
2215
- | ((inputBuf [inPtr ++] & 0x3F ) << 6 )
2216
- | (inputBuf [inPtr ++] & 0x3F );
2223
+ {
2224
+ final int c2 = inputBuf [inPtr ++];
2225
+ if ((c2 & 0xC0 ) != 0x080 ) {
2226
+ _reportInvalidOther (c2 & 0xFF , inPtr );
2227
+ }
2228
+ final int c3 = inputBuf [inPtr ++];
2229
+ if ((c3 & 0xC0 ) != 0x080 ) {
2230
+ _reportInvalidOther (c3 & 0xFF , inPtr );
2231
+ }
2232
+ i = ((i & 0x0F ) << 12 ) | ((c2 & 0x3F ) << 6 ) | (c3 & 0x3F );
2233
+ }
2217
2234
break ;
2218
2235
case 3 :
2236
+ // 30-Jan-2021, tatu: TODO - validate these too?
2219
2237
i = ((i & 0x07 ) << 18 )
2220
- | ((inputBuf [inPtr ++] & 0x3F ) << 12 )
2221
- | ((inputBuf [inPtr ++] & 0x3F ) << 6 )
2222
- | (inputBuf [inPtr ++] & 0x3F );
2238
+ | ((inputBuf [inPtr ++] & 0x3F ) << 12 )
2239
+ | ((inputBuf [inPtr ++] & 0x3F ) << 6 )
2240
+ | (inputBuf [inPtr ++] & 0x3F );
2223
2241
// note: this is the codepoint value; need to split, too
2224
2242
i -= 0x10000 ;
2225
2243
outBuf [outPtr ++] = (char ) (0xD800 | (i >> 10 ));
2226
2244
i = 0xDC00 | (i & 0x3FF );
2227
2245
break ;
2228
2246
default : // invalid
2229
- _reportError ( "Invalid byte " + Integer . toHexString ( i )+ " in Unicode text block" );
2247
+ _reportInvalidInitial ( i );
2230
2248
}
2231
2249
outBuf [outPtr ++] = (char ) i ;
2232
2250
} while (inPtr < end );
@@ -2594,7 +2612,7 @@ protected final JsonToken _decodePropertyName() throws IOException
2594
2612
if (name != null ) {
2595
2613
_inputPtr += lenMarker ;
2596
2614
} else {
2597
- name = _decodeShortName (lenMarker );
2615
+ name = _decodeContiguousName (lenMarker );
2598
2616
name = _addDecodedToSymbols (lenMarker , name );
2599
2617
}
2600
2618
}
@@ -2610,7 +2628,7 @@ protected final JsonToken _decodePropertyName() throws IOException
2610
2628
return JsonToken .FIELD_NAME ;
2611
2629
}
2612
2630
2613
- private final String _decodeShortName (int len ) throws IOException
2631
+ private final String _decodeContiguousName (int len ) throws IOException
2614
2632
{
2615
2633
// note: caller ensures we have enough bytes available
2616
2634
int outPtr = 0 ;
@@ -2623,7 +2641,7 @@ private final String _decodeShortName(int len) throws IOException
2623
2641
final int [] codes = UTF8_UNIT_CODES ;
2624
2642
final byte [] inBuf = _inputBuffer ;
2625
2643
2626
- // First a tight loop for Ascii
2644
+ // First a tight loop for ASCII
2627
2645
final int end = inPtr + len ;
2628
2646
while (true ) {
2629
2647
int i = inBuf [inPtr ] & 0xFF ;
@@ -2645,25 +2663,40 @@ private final String _decodeShortName(int len) throws IOException
2645
2663
// trickiest one, need surrogate handling
2646
2664
switch (code ) {
2647
2665
case 1 :
2648
- i = ((i & 0x1F ) << 6 ) | (inBuf [inPtr ++] & 0x3F );
2666
+ {
2667
+ final int c2 = inBuf [inPtr ++];
2668
+ if ((c2 & 0xC0 ) != 0x080 ) {
2669
+ _reportInvalidOther (c2 & 0xFF , inPtr );
2670
+ }
2671
+ i = ((i & 0x1F ) << 6 ) | (c2 & 0x3F );
2672
+ }
2649
2673
break ;
2650
2674
case 2 :
2651
- i = ((i & 0x0F ) << 12 )
2652
- | ((inBuf [inPtr ++] & 0x3F ) << 6 )
2653
- | (inBuf [inPtr ++] & 0x3F );
2675
+ {
2676
+ final int c2 = inBuf [inPtr ++];
2677
+ if ((c2 & 0xC0 ) != 0x080 ) {
2678
+ _reportInvalidOther (c2 & 0xFF , inPtr );
2679
+ }
2680
+ final int c3 = inBuf [inPtr ++];
2681
+ if ((c3 & 0xC0 ) != 0x080 ) {
2682
+ _reportInvalidOther (c3 & 0xFF , inPtr );
2683
+ }
2684
+ i = ((i & 0x0F ) << 12 ) | ((c2 & 0x3F ) << 6 ) | (c3 & 0x3F );
2685
+ }
2654
2686
break ;
2655
2687
case 3 :
2688
+ // 30-Jan-2021, tatu: TODO - validate surrogate case too?
2656
2689
i = ((i & 0x07 ) << 18 )
2657
- | ((inBuf [inPtr ++] & 0x3F ) << 12 )
2658
- | ((inBuf [inPtr ++] & 0x3F ) << 6 )
2659
- | (inBuf [inPtr ++] & 0x3F );
2690
+ | ((inBuf [inPtr ++] & 0x3F ) << 12 )
2691
+ | ((inBuf [inPtr ++] & 0x3F ) << 6 )
2692
+ | (inBuf [inPtr ++] & 0x3F );
2660
2693
// note: this is the codepoint value; need to split, too
2661
2694
i -= 0x10000 ;
2662
2695
outBuf [outPtr ++] = (char ) (0xD800 | (i >> 10 ));
2663
2696
i = 0xDC00 | (i & 0x3FF );
2664
2697
break ;
2665
2698
default : // invalid
2666
- _reportError ("Invalid byte " +Integer .toHexString (i )+" in Object name" );
2699
+ _reportError ("Invalid UTF-8 byte 0x " +Integer .toHexString (i )+" in Object property name" );
2667
2700
}
2668
2701
}
2669
2702
outBuf [outPtr ++] = (char ) i ;
@@ -2688,7 +2721,7 @@ private final String _decodeLongerName(int len) throws IOException
2688
2721
_inputPtr += len ;
2689
2722
return name ;
2690
2723
}
2691
- name = _decodeShortName (len );
2724
+ name = _decodeContiguousName (len );
2692
2725
return _addDecodedToSymbols (len , name );
2693
2726
}
2694
2727
0 commit comments