@@ -813,69 +813,19 @@ pub fn is_utf8(v: &[u8]) -> bool {
813
813
814
814
#[ inline( always) ]
815
815
fn first_non_utf8_index ( v : & [ u8 ] ) -> Option < uint > {
816
- let mut i = 0 u;
817
- let total = v. len ( ) ;
818
- fn unsafe_get ( xs : & [ u8 ] , i : uint ) -> u8 {
819
- unsafe { * xs. unsafe_ref ( i) }
820
- }
821
- while i < total {
822
- let v_i = unsafe_get ( v, i) ;
823
- if v_i < 128u8 {
824
- i += 1 u;
825
- } else {
826
- let w = utf8_char_width ( v_i) ;
827
- if w == 0 u { return Some ( i) ; }
828
-
829
- let nexti = i + w;
830
- if nexti > total { return Some ( i) ; }
816
+ let mut it = v. iter ( ) ;
831
817
832
- // 2-byte encoding is for codepoints \u0080 to \u07ff
833
- // first C2 80 last DF BF
834
- // 3-byte encoding is for codepoints \u0800 to \uffff
835
- // first E0 A0 80 last EF BF BF
836
- // excluding surrogates codepoints \ud800 to \udfff
837
- // ED A0 80 to ED BF BF
838
- // 4-byte encoding is for codepoints \u10000 to \u10ffff
839
- // first F0 90 80 80 last F4 8F BF BF
840
- //
841
- // Use the UTF-8 syntax from the RFC
842
- //
843
- // https://tools.ietf.org/html/rfc3629
844
- // UTF8-1 = %x00-7F
845
- // UTF8-2 = %xC2-DF UTF8-tail
846
- // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
847
- // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
848
- // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
849
- // %xF4 %x80-8F 2( UTF8-tail )
850
- // UTF8-tail = %x80-BF
851
- match w {
852
- 2 => if unsafe_get ( v, i + 1 ) & 192u8 != TAG_CONT_U8 {
853
- return Some ( i)
854
- } ,
855
- 3 => match ( v_i,
856
- unsafe_get ( v, i + 1 ) ,
857
- unsafe_get ( v, i + 2 ) & 192u8 ) {
858
- ( 0xE0 , 0xA0 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
859
- ( 0xE1 .. 0xEC , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
860
- ( 0xED , 0x80 .. 0x9F , TAG_CONT_U8 ) => ( ) ,
861
- ( 0xEE .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
862
- _ => return Some ( i) ,
863
- } ,
864
- _ => match ( v_i,
865
- unsafe_get ( v, i + 1 ) ,
866
- unsafe_get ( v, i + 2 ) & 192u8 ,
867
- unsafe_get ( v, i + 3 ) & 192u8 ) {
868
- ( 0xF0 , 0x90 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
869
- ( 0xF1 .. 0xF3 , 0x80 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
870
- ( 0xF4 , 0x80 .. 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
871
- _ => return Some ( i)
872
- } ,
873
- }
874
-
875
- i = nexti;
876
- }
818
+ let ok = run_utf8_validation_iterator ( & mut it) ;
819
+ if ok {
820
+ None
821
+ } else {
822
+ // work out how many valid bytes we've consumed
823
+ // (run_utf8_validation_iterator resets the iterator to just
824
+ // after the last good byte), which we can do because the
825
+ // vector iterator size_hint is exact.
826
+ let ( remaining, _) = it. size_hint ( ) ;
827
+ Some ( v. len ( ) - remaining)
877
828
}
878
- None
879
829
}
880
830
881
831
/// Determines if a vector of `u16` contains valid UTF-16
0 commit comments