|
| 1 | +#include <gtest/gtest.h> |
| 2 | +#include <litehtml.h> |
| 3 | + |
| 4 | +using namespace litehtml; |
| 5 | +using namespace std; |
| 6 | + |
// Builds a std::string from a string literal without stopping at embedded NUL bytes.
// sizeof(s) includes the literal's terminating NUL, hence the "- 1".
// Pass string literals / char arrays only: for a pointer, sizeof(s) would be the
// pointer size rather than the text length.
// std::string is spelled out so the expansion does not rely on a using-directive
// being active wherever the macro is used.
#define S(s) std::string{ s, sizeof(s) - 1 }
| 8 | + |
// One row of a test table: a pair of byte strings.
struct test
{
	std::string input;   // bytes handed to the code under test
	std::string output;  // bytes the code under test is expected to produce
};
| 14 | + |
| 15 | +test utf8_tests[] = |
| 16 | +{ |
| 17 | + // VALID INPUTS |
| 18 | + |
| 19 | + // input output |
| 20 | + { S("\x00"), S("\x00") }, // NUL |
| 21 | + { S("A"), S("A") }, // printable ASCII |
| 22 | + |
| 23 | + // A - ASCII, L2 - lead byte of a 2-byte sequence, C - continuation byte |
| 24 | + { S("\xC2\xA3"), S("\xC2\xA3") }, // L2 C |
| 25 | + { S("\xE2\x82\xAC"), S("\xE2\x82\xAC") }, // L3 C C |
| 26 | + { S("\xF0\x90\x8D\x88"), S("\xF0\x90\x8D\x88") }, // L4 C C C |
| 27 | + |
| 28 | + { S("\xEF\xBB\xBF\xC2\xA3"), S("\xC2\xA3") }, // utf-8 bom is removed |
| 29 | + |
| 30 | + // KINDA INVALID INPUTS |
| 31 | + |
| 32 | + { S("\xFF\xFE\xC2\xA3"), S("\xEA\x8F\x82") }, // utf-16le bom is removed, the rest is interpreted as utf-16le |
| 33 | + // Because UTF-32 is not a valid HTML encoding, UTF-32LE BOM is not recognized, it is interpreted as |
| 34 | + // UTF-16LE BOM. UTF-16LE BOM is removed, the rest is interpreted as UTF-16LE. |
| 35 | + { S("\xFF\xFE\x00\x00\xC2\xA3"), S("\x00\xEA\x8F\x82") }, |
| 36 | + |
| 37 | + // INVALID INPUTS |
| 38 | + |
| 39 | + { S("\xC0\x80"), S("\xEF\xBF\xBD\xEF\xBF\xBD") }, // overlong sequence for NUL -> 2 U+FFFD |
| 40 | + { S("\xF0\x82\x82\xAC"), S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") }, // overlong sequence for € -> 4 U+FFFD |
| 41 | + |
| 42 | + { S("\xED\xB0\x80"), S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") }, // low surrogate U+DC00 -> 3 U+FFFD |
| 43 | + // low surrogate U+DC00 + high surrogate U+D800 -> 6 U+FFFD |
| 44 | + { S("\xED\xB0\x80\xED\xA0\x80"), S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") }, |
| 45 | + |
| 46 | + { S("\xF4\x90\x80\x80"), S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") }, // U+110000 invalid codepoint (must be <= 0x10FFFF) |
| 47 | + |
| 48 | + { S("\x80"), S("\xEF\xBF\xBD") }, // C |
| 49 | + { S("\xC2\x41"), S("\xEF\xBF\xBD\x41") }, // L2 A |
| 50 | + { S("\xC2"), S("\xEF\xBF\xBD") }, // L2 |
| 51 | + { S("\xE0\x80\x41"), S("\xEF\xBF\xBD\xEF\xBF\xBD\x41") }, // L3 C A |
| 52 | + { S("\xE0\x80"), S("\xEF\xBF\xBD\xEF\xBF\xBD") }, // L3 C |
| 53 | + { S("\xF0\x80\x80"), S("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD") }, // L4 C C |
| 54 | + { S("\xC2\xE0"), S("\xEF\xBF\xBD\xEF\xBF\xBD") }, // L2 L3 |
| 55 | + { S("\xE0\xC2\x80"), S("\xEF\xBF\xBD\xC2\x80") }, // L3 L2 C |
| 56 | + |
| 57 | + { S("\xF5"), S("\xEF\xBF\xBD") }, // L4 |
| 58 | + { S("\xF8"), S("\xEF\xBF\xBD") }, // L5 |
| 59 | + { S("\xFC"), S("\xEF\xBF\xBD") }, // L6 |
| 60 | + { S("\xFE"), S("\xEF\xBF\xBD") }, |
| 61 | + { S("\xFF"), S("\xEF\xBF\xBD") }, |
| 62 | + |
| 63 | +}; |
| 64 | + |
| 65 | + |
| 66 | +TEST(encodings, utf8) |
| 67 | +{ |
| 68 | + for (auto test : utf8_tests) |
| 69 | + { |
| 70 | + string output; |
| 71 | + decode(test.input, encoding::utf_8, output); |
| 72 | + EXPECT_EQ(output, test.output); |
| 73 | + } |
| 74 | +} |
0 commit comments