Skip to content

Commit 67d5037

Browse files
stasoidtordex
authored and committed
add test for utf_8_decoder
1 parent 75356cd commit 67d5037

File tree

1 file changed

+74
-0
lines changed

1 file changed

+74
-0
lines changed

encodings_test.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#include <gtest/gtest.h>
2+
#include <litehtml.h>
3+
4+
using namespace litehtml;
5+
using namespace std;
6+
7+
// Builds a std::string from a string literal, keeping embedded NUL bytes.
// The length is taken from sizeof (minus the terminating NUL) instead of strlen,
// so S("\x00") is a one-byte string rather than an empty one.
// `s` must be a string literal (or char array); a pointer would give the wrong size.
// The type is spelled std::string so the macro does not depend on a using-directive
// being in effect wherever it is expanded.
#define S(s) std::string(s, sizeof(s) - 1)
8+
9+
// A single decoder test case: the bytes fed to decode() and the bytes it is
// expected to write to its output argument.
struct test
{
    std::string input;   // raw bytes passed to decode()
    std::string output;  // expected content of decode()'s output string
};
14+
15+
// U+FFFD REPLACEMENT CHARACTER encoded as UTF-8. Kept as a string-literal macro
// (not a constant) so it can be joined with neighbouring literals by adjacent
// string-literal concatenation, e.g. S(U_FFFD U_FFFD "\x41").
#define U_FFFD "\xEF\xBF\xBD"

// Input / expected-output pairs for decoding with encoding::utf_8.
//
// Legend used in the row comments:
//   A  - ASCII byte
//   Ln - lead byte of an n-byte sequence
//   C  - continuation byte
test utf8_tests[] =
{
    // VALID INPUTS

    // input                            output
    { S("\x00"),                        S("\x00") },                // NUL
    { S("A"),                           S("A") },                   // printable ASCII

    { S("\xC2\xA3"),                    S("\xC2\xA3") },            // L2 C
    { S("\xE2\x82\xAC"),                S("\xE2\x82\xAC") },        // L3 C C
    { S("\xF0\x90\x8D\x88"),            S("\xF0\x90\x8D\x88") },    // L4 C C C

    { S("\xEF\xBB\xBF\xC2\xA3"),        S("\xC2\xA3") },            // utf-8 bom is removed

    // KINDA INVALID INPUTS

    // utf-16le bom is removed, the rest is interpreted as utf-16le
    { S("\xFF\xFE\xC2\xA3"),            S("\xEA\x8F\x82") },
    // Because UTF-32 is not a valid HTML encoding, UTF-32LE BOM is not recognized, it is interpreted as
    // UTF-16LE BOM. UTF-16LE BOM is removed, the rest is interpreted as UTF-16LE.
    { S("\xFF\xFE\x00\x00\xC2\xA3"),    S("\x00\xEA\x8F\x82") },

    // INVALID INPUTS

    { S("\xC0\x80"),                    S(U_FFFD U_FFFD) },                 // overlong sequence for NUL -> 2 U+FFFD
    { S("\xF0\x82\x82\xAC"),            S(U_FFFD U_FFFD U_FFFD U_FFFD) },   // overlong sequence for € -> 4 U+FFFD

    { S("\xED\xB0\x80"),                S(U_FFFD U_FFFD U_FFFD) },          // low surrogate U+DC00 -> 3 U+FFFD
    // low surrogate U+DC00 + high surrogate U+D800 -> 6 U+FFFD
    { S("\xED\xB0\x80\xED\xA0\x80"),    S(U_FFFD U_FFFD U_FFFD U_FFFD U_FFFD U_FFFD) },

    { S("\xF4\x90\x80\x80"),            S(U_FFFD U_FFFD U_FFFD U_FFFD) },   // U+110000 invalid codepoint (must be <= 0x10FFFF)

    { S("\x80"),                        S(U_FFFD) },                        // C
    { S("\xC2\x41"),                    S(U_FFFD "\x41") },                 // L2 A
    { S("\xC2"),                        S(U_FFFD) },                        // L2
    { S("\xE0\x80\x41"),                S(U_FFFD U_FFFD "\x41") },          // L3 C A
    { S("\xE0\x80"),                    S(U_FFFD U_FFFD) },                 // L3 C
    { S("\xF0\x80\x80"),                S(U_FFFD U_FFFD U_FFFD) },          // L4 C C
    { S("\xC2\xE0"),                    S(U_FFFD U_FFFD) },                 // L2 L3
    { S("\xE0\xC2\x80"),                S(U_FFFD "\xC2\x80") },             // L3 L2 C

    { S("\xF5"),                        S(U_FFFD) },                        // L4
    { S("\xF8"),                        S(U_FFFD) },                        // L5
    { S("\xFC"),                        S(U_FFFD) },                        // L6
    { S("\xFE"),                        S(U_FFFD) },                        // lone 0xFE -> 1 U+FFFD
    { S("\xFF"),                        S(U_FFFD) },                        // lone 0xFF -> 1 U+FFFD
};
64+
65+
66+
// Feeds every row of utf8_tests to decode() with encoding::utf_8 and checks that
// the produced string equals the row's expected output.
TEST(encodings, utf8)
{
    // Rows are only read, so bind by const reference instead of copying two strings
    // per iteration. The variable is deliberately not called `test`, which would
    // hide the struct of the same name inside the loop.
    for (const auto& tc : utf8_tests)
    {
        // EXPECT_EQ only reports the two compared values; attach the (escaped)
        // input bytes so a failure identifies which table row produced it.
        SCOPED_TRACE("input: " + ::testing::PrintToString(tc.input));

        string output;
        decode(tc.input, encoding::utf_8, output);
        EXPECT_EQ(output, tc.output);
    }
}

0 commit comments

Comments
 (0)