14 #include <parserutils/charset/utf8.h> 27 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
28 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
29 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
30 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
36 static const uint8_t
u_fffd[3] = {
'\xEF',
'\xBF',
'\xBD' };
43 static const uint8_t
lf =
'\n';
288 parserutils_error perror;
291 if (input == NULL || tokeniser == NULL)
298 perror = parserutils_buffer_create(&tok->
buffer);
299 if (perror != PARSERUTILS_OK) {
304 perror = parserutils_buffer_create(&tok->
insert_buf);
305 if (perror != PARSERUTILS_OK) {
306 parserutils_buffer_destroy(tok->
buffer);
342 if (tokeniser == NULL)
349 parserutils_buffer_destroy(tokeniser->
insert_buf);
351 parserutils_buffer_destroy(tokeniser->
buffer);
372 if (tokeniser == NULL || params == NULL)
394 if (tokeniser->
paused ==
true) {
395 tokeniser->
paused =
false;
416 const uint8_t *data,
size_t len)
418 parserutils_error perror;
420 if (tokeniser == NULL || data == NULL)
423 perror = parserutils_buffer_append(tokeniser->
insert_buf, data, len);
424 if (perror != PARSERUTILS_OK)
440 if (tokeniser == NULL)
443 if (tokeniser->
paused ==
true)
456 switch (tokeniser->
state) {
627 #define START_BUF(str, cptr, length) \ 629 parserutils_error perror; \ 630 perror = parserutils_buffer_append(tokeniser->buffer, \ 631 (uint8_t *) (cptr), (length)); \ 632 if (perror != PARSERUTILS_OK) \ 633 return hubbub_error_from_parserutils_error(perror); \ 634 (str).len = (length); \ 637 #define COLLECT(str, cptr, length) \ 639 parserutils_error perror; \ 640 assert(str.len != 0); \ 641 perror = parserutils_buffer_append(tokeniser->buffer, \ 642 (uint8_t *) (cptr), (length)); \ 643 if (perror != PARSERUTILS_OK) \ 644 return hubbub_error_from_parserutils_error(perror); \ 645 (str).len += (length); \ 648 #define COLLECT_MS(str, cptr, length) \ 650 parserutils_error perror; \ 651 perror = parserutils_buffer_append(tokeniser->buffer, \ 652 (uint8_t *) (cptr), (length)); \ 653 if (perror != PARSERUTILS_OK) \ 654 return hubbub_error_from_parserutils_error(perror); \ 655 (str).len += (length); \ 662 parserutils_error error;
667 while ((error = parserutils_inputstream_peek(tokeniser->
input,
670 const uint8_t c = *cptr;
681 }
else if (c ==
'-' &&
689 error = parserutils_inputstream_peek(
695 assert(error == PARSERUTILS_OK);
697 if (strncmp((
char *)cptr,
698 "<!--",
SLEN(
"<!--")) == 0) {
719 }
else if (c ==
'>' && tokeniser->
escape_flag ==
true &&
728 error = parserutils_inputstream_peek(
734 assert(error == PARSERUTILS_OK);
736 if (strncmp((
char *) cptr,
"-->",
SLEN(
"-->")) == 0) {
741 }
else if (c ==
'\0') {
751 parserutils_inputstream_advance(tokeniser->
input, 1);
752 }
else if (c ==
'\r') {
753 error = parserutils_inputstream_peek(
759 if (error != PARSERUTILS_OK &&
760 error != PARSERUTILS_EOF) {
769 if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
775 parserutils_inputstream_advance(tokeniser->
input, 1);
789 if (error == PARSERUTILS_EOF) {
794 if (error == PARSERUTILS_EOF) {
814 uint8_t *utf8ptr = utf8;
815 size_t len =
sizeof(utf8);
820 parserutils_charset_utf8_from_ucs4(
830 parserutils_inputstream_advance(tokeniser->
input,
834 parserutils_error error;
835 const uint8_t *cptr = NULL;
837 error = parserutils_inputstream_peek(
842 if (error != PARSERUTILS_OK) {
851 parserutils_inputstream_advance(tokeniser->
input, len);
871 parserutils_error error;
877 error = parserutils_inputstream_peek(tokeniser->
input,
880 if (error != PARSERUTILS_OK) {
881 if (error == PARSERUTILS_EOF) {
906 parserutils_inputstream_advance(tokeniser->
input,
911 }
else if (
'A' <= c && c <=
'Z') {
912 uint8_t lc = (c + 0x20);
922 }
else if (
'a' <= c && c <=
'z') {
931 }
else if (c ==
'>') {
936 }
else if (c ==
'?') {
940 parserutils_inputstream_advance(
962 parserutils_error error;
974 uint8_t *start_tag_name =
976 size_t start_tag_len =
979 while ((error = parserutils_inputstream_peek(tokeniser->
input,
983 &len)) == PARSERUTILS_OK) {
999 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1004 error = parserutils_inputstream_peek(
1011 if (error != PARSERUTILS_OK &&
1012 error != PARSERUTILS_EOF) {
1015 }
else if (error != PARSERUTILS_EOF) {
1018 if (c !=
'\t' && c !=
'\n' && c !=
'\f' &&
1019 c !=
' ' && c !=
'>' &&
1035 error = parserutils_inputstream_peek(tokeniser->
input,
1038 if (error == PARSERUTILS_EOF) {
1044 }
else if (error != PARSERUTILS_OK) {
1050 if (
'A' <= c && c <=
'Z') {
1051 uint8_t lc = (c + 0x20);
1062 }
else if (
'a' <= c && c <=
'z') {
1073 }
else if (c ==
'>') {
1078 parserutils_inputstream_advance(tokeniser->
input,
1088 parserutils_inputstream_advance(tokeniser->
input,
1106 const uint8_t *cptr;
1107 parserutils_error error;
1115 error = parserutils_inputstream_peek(tokeniser->
input,
1118 if (error != PARSERUTILS_OK) {
1119 if (error == PARSERUTILS_EOF) {
1129 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
1132 }
else if (c ==
'>') {
1136 }
else if (c ==
'\0') {
1139 }
else if (c ==
'/') {
1142 }
else if (
'A' <= c && c <=
'Z') {
1143 uint8_t lc = (c + 0x20);
1160 const uint8_t *cptr;
1161 parserutils_error error;
1164 error = parserutils_inputstream_peek(tokeniser->
input,
1167 if (error != PARSERUTILS_OK) {
1168 if (error == PARSERUTILS_EOF) {
1178 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
1181 }
else if (c ==
'>') {
1185 }
else if (c ==
'/') {
1191 if (c ==
'"' || c ==
'\'' || c ==
'=') {
1203 if (
'A' <= c && c <=
'Z') {
1204 uint8_t lc = (c + 0x20);
1206 }
else if (c ==
'\0') {
1231 const uint8_t *cptr;
1232 parserutils_error error;
1237 error = parserutils_inputstream_peek(tokeniser->
input,
1240 if (error != PARSERUTILS_OK) {
1241 if (error == PARSERUTILS_EOF) {
1251 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
1254 }
else if (c ==
'=') {
1257 }
else if (c ==
'>') {
1261 }
else if (c ==
'/') {
1264 }
else if (c ==
'\0') {
1268 }
else if (
'A' <= c && c <=
'Z') {
1269 uint8_t lc = (c + 0x20);
1288 const uint8_t *cptr;
1289 parserutils_error error;
1292 error = parserutils_inputstream_peek(tokeniser->
input,
1295 if (error != PARSERUTILS_OK) {
1296 if (error == PARSERUTILS_EOF) {
1306 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
1308 }
else if (c ==
'=') {
1311 }
else if (c ==
'>') {
1316 }
else if (c ==
'/') {
1322 if (c ==
'"' || c ==
'\'') {
1334 if (
'A' <= c && c <=
'Z') {
1335 uint8_t lc = (c + 0x20);
1337 }
else if (c ==
'\0') {
1364 const uint8_t *cptr;
1365 parserutils_error error;
1368 error = parserutils_inputstream_peek(tokeniser->
input,
1371 if (error != PARSERUTILS_OK) {
1372 if (error == PARSERUTILS_EOF) {
1383 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
1385 }
else if (c ==
'"') {
1388 }
else if (c ==
'&') {
1391 }
else if (c ==
'\'') {
1394 }
else if (c ==
'>') {
1400 }
else if (c ==
'\0') {
1426 const uint8_t *cptr;
1427 parserutils_error error;
1430 error = parserutils_inputstream_peek(tokeniser->
input,
1433 if (error != PARSERUTILS_OK) {
1434 if (error == PARSERUTILS_EOF) {
1447 }
else if (c ==
'&') {
1452 }
else if (c ==
'\0') {
1456 }
else if (c ==
'\r') {
1457 error = parserutils_inputstream_peek(
1463 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1465 }
else if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
1488 const uint8_t *cptr;
1489 parserutils_error error;
1492 error = parserutils_inputstream_peek(tokeniser->
input,
1495 if (error != PARSERUTILS_OK) {
1496 if (error == PARSERUTILS_EOF) {
1509 }
else if (c ==
'&') {
1514 }
else if (c ==
'\0') {
1518 }
else if (c ==
'\r') {
1519 error = parserutils_inputstream_peek(
1525 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1527 }
else if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
1551 const uint8_t *cptr;
1552 parserutils_error error;
1554 error = parserutils_inputstream_peek(tokeniser->
input,
1557 if (error != PARSERUTILS_OK) {
1558 if (error == PARSERUTILS_EOF) {
1571 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
1574 }
else if (c ==
'&') {
1578 }
else if (c ==
'>') {
1582 }
else if (c ==
'\0') {
1587 if (c ==
'"' || c ==
'\'' || c ==
'=') {
1611 uint8_t *utf8ptr = utf8;
1612 size_t len =
sizeof(utf8);
1615 parserutils_charset_utf8_from_ucs4(
1627 const uint8_t *cptr = NULL;
1628 parserutils_error error;
1630 error = parserutils_inputstream_peek(
1635 if (error != PARSERUTILS_OK) {
1660 const uint8_t *cptr;
1661 parserutils_error error;
1664 error = parserutils_inputstream_peek(tokeniser->
input,
1667 if (error != PARSERUTILS_OK) {
1668 if (error == PARSERUTILS_EOF) {
1678 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
1681 }
else if (c ==
'>') {
1686 }
else if (c ==
'/') {
1702 const uint8_t *cptr;
1703 parserutils_error error;
1706 error = parserutils_inputstream_peek(tokeniser->
input,
1709 if (error != PARSERUTILS_OK) {
1710 if (error == PARSERUTILS_EOF) {
1738 const uint8_t *cptr;
1739 parserutils_error error;
1742 error = parserutils_inputstream_peek(tokeniser->
input,
1745 if (error != PARSERUTILS_OK) {
1746 if (error == PARSERUTILS_EOF) {
1760 }
else if (c ==
'\0') {
1761 error = parserutils_buffer_append(tokeniser->
buffer,
1763 if (error != PARSERUTILS_OK)
1767 }
else if (c ==
'\r') {
1768 error = parserutils_inputstream_peek(
1774 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1776 }
else if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
1777 error = parserutils_buffer_append(tokeniser->
buffer,
1779 if (error != PARSERUTILS_OK) {
1786 error = parserutils_buffer_append(tokeniser->
buffer,
1787 (uint8_t *) cptr, len);
1788 if (error != PARSERUTILS_OK)
1802 const uint8_t *cptr;
1803 parserutils_error error;
1808 error = parserutils_inputstream_peek(tokeniser->
input, 0, &cptr, &len);
1810 if (error != PARSERUTILS_OK) {
1811 if (error == PARSERUTILS_EOF) {
1824 }
else if ((c & ~0x20) ==
'D') {
1843 const uint8_t *cptr;
1844 parserutils_error error;
1846 error = parserutils_inputstream_peek(tokeniser->
input,
1849 if (error != PARSERUTILS_OK) {
1850 if (error == PARSERUTILS_EOF) {
1863 parserutils_inputstream_advance(tokeniser->
input,
SLEN(
"--"));
1876 const uint8_t *cptr;
1877 parserutils_error error;
1880 error = parserutils_inputstream_peek(tokeniser->
input,
1883 if (error != PARSERUTILS_OK) {
1884 if (error == PARSERUTILS_EOF) {
1902 }
else if (c ==
'-') {
1912 error = parserutils_buffer_append(tokeniser->
buffer,
1913 (uint8_t *)
"-",
SLEN(
"-"));
1914 if (error != PARSERUTILS_OK) {
1924 error = parserutils_buffer_append(tokeniser->
buffer,
1925 (uint8_t *)
"-",
SLEN(
"-"));
1926 if (error != PARSERUTILS_OK) {
1931 error = parserutils_buffer_append(tokeniser->
buffer,
1932 (uint8_t *)
"--",
SLEN(
"--"));
1933 if (error != PARSERUTILS_OK) {
1940 error = parserutils_buffer_append(tokeniser->
buffer,
1942 if (error != PARSERUTILS_OK) {
1946 }
else if (c ==
'\r') {
1948 error = parserutils_inputstream_peek(
1953 if (error != PARSERUTILS_OK &&
1954 error != PARSERUTILS_EOF) {
1957 }
else if (error != PARSERUTILS_EOF && *cptr !=
'\n') {
1958 error = parserutils_buffer_append(
1961 if (error != PARSERUTILS_OK) {
1967 error = parserutils_buffer_append(tokeniser->
buffer,
1969 if (error != PARSERUTILS_OK) {
1985 #define DOCTYPE "DOCTYPE" 1986 #define DOCTYPE_LEN (SLEN(DOCTYPE) - 1) 1991 const uint8_t *cptr;
1992 parserutils_error error;
1995 error = parserutils_inputstream_peek(tokeniser->
input,
1998 if (error != PARSERUTILS_OK) {
1999 if (error == PARSERUTILS_EOF) {
2024 parserutils_inputstream_advance(tokeniser->
input,
2047 const uint8_t *cptr;
2048 parserutils_error error;
2051 error = parserutils_inputstream_peek(tokeniser->
input,
2054 if (error != PARSERUTILS_OK) {
2055 if (error == PARSERUTILS_EOF) {
2065 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
2079 const uint8_t *cptr;
2080 parserutils_error error;
2083 error = parserutils_inputstream_peek(tokeniser->
input,
2086 if (error != PARSERUTILS_OK) {
2087 if (error == PARSERUTILS_EOF) {
2099 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
2102 }
else if (c ==
'>') {
2110 }
else if (
'A' <= c && c <=
'Z') {
2111 uint8_t lc = c + 0x20;
2129 const uint8_t *cptr;
2130 parserutils_error error;
2133 error = parserutils_inputstream_peek(tokeniser->
input,
2136 if (error != PARSERUTILS_OK) {
2137 if (error == PARSERUTILS_EOF) {
2147 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
2150 }
else if (c ==
'>') {
2154 }
else if (c ==
'\0') {
2157 }
else if (
'A' <= c && c <=
'Z') {
2158 uint8_t lc = c + 0x20;
2173 const uint8_t *cptr;
2174 parserutils_error error;
2177 error = parserutils_inputstream_peek(tokeniser->
input,
2180 if (error != PARSERUTILS_OK) {
2181 if (error == PARSERUTILS_EOF) {
2192 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
2194 }
else if (c ==
'>') {
2197 }
else if ((c & ~0x20) ==
'P') {
2200 }
else if ((c & ~0x20) ==
'S') {
2211 #define PUBLIC "PUBLIC" 2212 #define PUBLIC_LEN (SLEN(PUBLIC) - 1) 2217 const uint8_t *cptr;
2218 parserutils_error error;
2221 error = parserutils_inputstream_peek(tokeniser->
input,
2224 if (error != PARSERUTILS_OK) {
2225 if (error == PARSERUTILS_EOF) {
2263 const uint8_t *cptr;
2264 parserutils_error error;
2267 error = parserutils_inputstream_peek(tokeniser->
input,
2270 if (error != PARSERUTILS_OK) {
2271 if (error == PARSERUTILS_EOF) {
2282 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
2284 }
else if (c ==
'"') {
2288 }
else if (c ==
'\'') {
2292 }
else if (c ==
'>') {
2308 const uint8_t *cptr;
2309 parserutils_error error;
2312 error = parserutils_inputstream_peek(tokeniser->
input,
2315 if (error != PARSERUTILS_OK) {
2316 if (error == PARSERUTILS_EOF) {
2329 }
else if (c ==
'>') {
2333 }
else if (c ==
'\0') {
2336 }
else if (c ==
'\r') {
2337 error = parserutils_inputstream_peek(
2343 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2345 }
else if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
2365 const uint8_t *cptr;
2366 parserutils_error error;
2369 error = parserutils_inputstream_peek(tokeniser->
input,
2372 if (error != PARSERUTILS_OK) {
2373 if (error == PARSERUTILS_EOF) {
2386 }
else if (c ==
'>') {
2390 }
else if (c ==
'\0') {
2393 }
else if (c ==
'\r') {
2394 error = parserutils_inputstream_peek(
2400 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2402 }
else if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
2422 const uint8_t *cptr;
2423 parserutils_error error;
2426 error = parserutils_inputstream_peek(tokeniser->
input,
2429 if (error != PARSERUTILS_OK) {
2430 if (error == PARSERUTILS_EOF) {
2441 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
2443 }
else if (c ==
'"') {
2448 }
else if (c ==
'\'') {
2453 }
else if (c ==
'>') {
2466 #define SYSTEM "SYSTEM" 2467 #define SYSTEM_LEN (SLEN(SYSTEM) - 1) 2472 const uint8_t *cptr;
2473 parserutils_error error;
2476 error = parserutils_inputstream_peek(tokeniser->
input,
2479 if (error != PARSERUTILS_OK){
2480 if (error == PARSERUTILS_EOF) {
2518 const uint8_t *cptr;
2519 parserutils_error error;
2522 error = parserutils_inputstream_peek(tokeniser->
input,
2525 if (error != PARSERUTILS_OK) {
2526 if (error == PARSERUTILS_EOF) {
2537 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
2539 }
else if (c ==
'"') {
2544 }
else if (c ==
'\'') {
2549 }
else if (c ==
'>') {
2565 const uint8_t *cptr;
2566 parserutils_error error;
2569 error = parserutils_inputstream_peek(tokeniser->
input,
2572 if (error != PARSERUTILS_OK) {
2573 if (error == PARSERUTILS_EOF) {
2586 }
else if (c ==
'>') {
2590 }
else if (c ==
'\0') {
2593 }
else if (c ==
'\r') {
2594 error = parserutils_inputstream_peek(
2600 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2602 }
else if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
2621 const uint8_t *cptr;
2622 parserutils_error error;
2625 error = parserutils_inputstream_peek(tokeniser->
input,
2628 if (error != PARSERUTILS_OK) {
2629 if (error == PARSERUTILS_EOF) {
2642 }
else if (c ==
'>') {
2646 }
else if (c ==
'\0') {
2649 }
else if (c ==
'\r') {
2650 error = parserutils_inputstream_peek(
2656 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2658 }
else if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
2676 const uint8_t *cptr;
2677 parserutils_error error;
2680 error = parserutils_inputstream_peek(tokeniser->
input,
2683 if (error != PARSERUTILS_OK) {
2684 if (error == PARSERUTILS_EOF) {
2695 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' || c ==
'\r') {
2697 }
else if (c ==
'>') {
2711 const uint8_t *cptr;
2712 parserutils_error error;
2715 error = parserutils_inputstream_peek(tokeniser->
input,
2718 if (error != PARSERUTILS_OK) {
2719 if (error == PARSERUTILS_EOF) {
2740 #define CDATA "[CDATA[" 2741 #define CDATA_LEN (SLEN(CDATA) - 1) 2746 const uint8_t *cptr;
2747 parserutils_error error;
2750 error = parserutils_inputstream_peek(tokeniser->
input,
2753 if (error != PARSERUTILS_OK) {
2754 if (error == PARSERUTILS_EOF) {
2779 parserutils_inputstream_advance(tokeniser->
input,
2798 const uint8_t *cptr;
2799 parserutils_error error;
2802 error = parserutils_inputstream_peek(tokeniser->
input,
2805 if (error != PARSERUTILS_OK) {
2806 if (error == PARSERUTILS_EOF) {
2828 parserutils_inputstream_advance(tokeniser->
input,
SLEN(
"]]>"));
2831 }
else if (c ==
'\0') {
2840 parserutils_inputstream_advance(tokeniser->
input, len);
2842 }
else if (c ==
'\r') {
2843 error = parserutils_inputstream_peek(
2849 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2858 if (error == PARSERUTILS_EOF || *cptr !=
'\n') {
2864 parserutils_inputstream_advance(tokeniser->
input, 1);
2881 const uint8_t *cptr;
2882 parserutils_error error;
2886 error = parserutils_inputstream_peek(tokeniser->
input, pos,
2890 assert(error == PARSERUTILS_OK);
2891 assert(len == 1 && *cptr ==
'&');
2896 error = parserutils_inputstream_peek(tokeniser->
input, off,
2899 if (error != PARSERUTILS_OK) {
2900 if (error == PARSERUTILS_EOF) {
2927 if (c ==
'\t' || c ==
'\n' || c ==
'\f' || c ==
' ' ||
2928 c ==
'<' || c ==
'&' ||
2929 (allowed_char && c == allowed_char)) {
2932 }
else if (c ==
'#') {
2949 const uint8_t *cptr;
2950 parserutils_error error;
2952 error = parserutils_inputstream_peek(tokeniser->
input,
2956 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2962 if ((c & ~0x20) ==
'X') {
2970 while ((error = parserutils_inputstream_peek(tokeniser->
input,
2972 &cptr, &len)) == PARSERUTILS_OK) {
2976 (
'0' <= c && c <=
'9')) {
2983 ((
'0' <= c && c <=
'9') ||
2984 (
'A' <= (c & ~0x20) &&
2985 (c & ~0x20) <=
'F'))) {
2989 if (
'0' <= c && c <=
'9') {
2993 ((c & ~0x20) -
'A' + 10);
3006 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3011 if (error != PARSERUTILS_EOF && *cptr ==
';') {
3019 if (0x80 <= cp && cp <= 0x9F) {
3021 }
else if (cp == 0x0D) {
3024 cp <= 0x0008 || cp == 0x000B ||
3025 (0x000E <= cp && cp <= 0x001F) ||
3026 (0x007F <= cp && cp <= 0x009F) ||
3027 (0xD800 <= cp && cp <= 0xDFFF) ||
3028 (0xFDD0 <= cp && cp <= 0xFDEF) ||
3029 (cp & 0xFFFE) == 0xFFFE) {
3052 const uint8_t *cptr;
3053 parserutils_error error;
3055 while ((error = parserutils_inputstream_peek(tokeniser->
input,
3058 &cptr, &len)) == PARSERUTILS_OK) {
3088 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3094 error = parserutils_inputstream_peek(tokeniser->
input,
3101 assert(error == PARSERUTILS_OK);
3108 error = parserutils_inputstream_peek(tokeniser->
input,
3118 assert(error == PARSERUTILS_OK ||
3119 error == PARSERUTILS_EOF);
3121 if (error == PARSERUTILS_EOF) {
3126 if ((0x0030 <= c && c <= 0x0039) ||
3127 (0x0041 <= c && c <= 0x005A) ||
3128 (0x0061 <= c && c <= 0x007A)) {
3175 const uint8_t *cptr = NULL;
3176 parserutils_error error;
3181 error = parserutils_inputstream_peek(tokeniser->
input, 0, &cptr, &len);
3182 if (error != PARSERUTILS_OK)
3202 uint32_t n_attributes;
3217 ptr = tokeniser->
buffer->data;
3221 for (i = 0; i < n_attributes; i++) {
3230 for (i = 0; i < n_attributes; i++) {
3231 for (j = 0; j < n_attributes; j++) {
3235 attrs[i].
name.len !=
3237 strncmp((
char *) attrs[i].
name.ptr,
3247 move = (n_attributes - 1 - j) *
3251 memmove(&attrs[j],&attrs[j+1], move);
3323 if (force_quirks ==
true)
3354 assert(tokeniser != NULL);
3355 assert(token != NULL);
3360 switch (token->
type) {
3406 if (tokeniser->
buffer->length) {
3407 parserutils_buffer_discard(tokeniser->
buffer, 0,
3408 tokeniser->
buffer->length);
3413 parserutils_inputstream_advance(tokeniser->
input,
3419 parserutils_inputstream_insert(tokeniser->
input,
3422 parserutils_buffer_discard(tokeniser->
insert_buf, 0,
3428 tokeniser->
paused =
true;
hubbub_doctype current_doctype
Current doctype.
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
hubbub_token_handler handler
hubbub_token_type type
The token type.
struct hubbub_tokeniser_optparams::@11 content_model
Current content model.
uint32_t line
Current line of input.
hubbub_ns ns
Tag namespace.
hubbub_tokeniser_state prev_state
Previous state.
hubbub_content_model model
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(hubbub_tokeniser *tokeniser)
struct hubbub_tokeniser_context::@7 match_entity
Entity matching state.
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser)
parserutils_inputstream * input
Input stream.
uint32_t poss_length
Optimistic length when matching named character references.
hubbub_string name
Tag name.
const uint8_t * ptr
Pointer to data.
struct hubbub_tokeniser_optparams::@10 error_handler
Error handling callback.
hubbub_tokeniser_state return_state
State we were called from.
void * error_pw
Error handler data.
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(hubbub_tokeniser *tokeniser)
hubbub_string value
Attribute value.
uint32_t allowed_char
Used for quote matching.
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(hubbub_tokeniser *tokeniser)
bool had_data
Whether we read anything after &#(x)?
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, hubbub_tokeniser_opttype type, hubbub_tokeniser_optparams *params)
Configure a hubbub tokeniser.
static hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
static const uint8_t lf
String for when we want to emit newlines.
bool escape_flag
Escape flag.
hubbub_string public_id
Doctype public identifier.
size_t last_start_tag_len
Length of last start tag.
uint32_t count
Index into "DOCTYPE".
hubbub_string name
Attribute name.
Tokeniser data structure.
static hubbub_error hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser, size_t off)
bool system_missing
Whether the system id is missing.
static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
uint32_t length
Length of entity.
uint32_t col
Current character in line.
hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, int32_t *context)
Step-wise search for an entity in the dictionary.
static hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
Hubbub tokeniser option parameters.
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(hubbub_tokeniser *tokeniser)
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
Destroy a hubbub tokeniser.
hubbub_error(* hubbub_token_handler)(const hubbub_token *token, void *pw)
Type of token handling function.
static hubbub_error hubbub_tokeniser_handle_character_reference_data(hubbub_tokeniser *tokeniser)
hubbub_error hubbub_tokeniser_insert_chunk(hubbub_tokeniser *tokeniser, const uint8_t *data, size_t len)
Insert a chunk of data into the input stream.
parserutils_buffer * insert_buf
Stream insertion buffer.
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(hubbub_tokeniser *tokeniser)
struct hubbub_tokeniser_optparams::@9 token_handler
Token handling callback.
hubbub_token_handler token_handler
Token handling callback.
size_t len
Byte length of string.
static const hubbub_string lf_str
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(hubbub_tokeniser *tokeniser)
static hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
Emit the current tag token being stored in the tokeniser context.
static hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
bool self_closing
Whether the tag can have children.
static hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
struct hubbub_tokeniser_context::@6 match_cdata
State for matching cdata.
parserutils_buffer * buffer
Input buffer.
static hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
bool pause_parse
Pause parsing.
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(hubbub_tokeniser *tokeniser)
bool process_cdata
Whether to process CDATA sections.
hubbub_token_type
Type of an emitted token.
uint8_t base
Base for numeric entities.
static hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
int32_t context
Context for named entity search.
bool complete
True if match complete.
hubbub_attribute * attributes
Array of attribute data.
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(hubbub_tokeniser *tokeniser)
#define START_BUF(str, cptr, length)
Various macros for manipulating buffers.
static hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser, bool force_quirks)
Emit the current doctype token being stored in the tokeniser context.
hubbub_string system_id
Doctype system identifier.
static hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
Emit the current pending characters being stored in the tokeniser context.
size_t prev_len
Previous byte length of str.
static hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
hubbub_tokeniser_state state
Current tokeniser state.
hubbub_string name
Doctype name.
struct hubbub_tokeniser_context hubbub_tokeniser_context
Context for tokeniser.
void(* hubbub_error_handler)(uint32_t line, uint32_t col, const char *message, void *pw)
Type of parse error handling function.
static hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
bool overflow
Whether this entity has overflowed the maximum numeric entity value.
hubbub_error_handler error_handler
Error handling callback.
uint8_t last_start_tag_name[10]
Name of the last start tag emitted.
hubbub_string current_comment
Current comment text.
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input, hubbub_tokeniser **tokeniser)
Create a hubbub tokeniser.
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, hubbub_token *token)
Emit a token, performing sanity checks if necessary.
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(hubbub_tokeniser *tokeniser)
hubbub_content_model
Content model flag.
uint32_t codepoint
UCS4 codepoint.
bool paused
Flag indicating whether parsing is currently paused.
#define COLLECT_MS(str, cptr, length)
hubbub_tokeniser_state
Tokeniser states.
struct hubbub_tokeniser_context::@5 match_doctype
State for matching doctype.
#define COLLECT(str, cptr, length)
void * token_pw
Token handler data.
static const uint32_t cp1252Table[32]
Table of mappings between Windows-1252 codepoints 128-159 and UCS4.
size_t offset
Offset in buffer.
union hubbub_token::@3 data
Type-specific data.
static const hubbub_string u_fffd_str
hubbub_ns ns
Attribute namespace.
size_t pending
Count of pending chars.
static hubbub_error hubbub_error_from_parserutils_error(parserutils_error error)
Convert a ParserUtils error into a Hubbub error.
static hubbub_error emit_character_token(hubbub_tokeniser *tokeniser, const hubbub_string *chars)
Emit a character token.
static hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
Emit the current comment token being stored in the tokeniser context.
hubbub_tokeniser_opttype
Hubbub tokeniser option types.
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(hubbub_tokeniser *tokeniser)
bool process_cdata_section
Whether to process CDATA sections.
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(hubbub_tokeniser *tokeniser)
hubbub_tag current_tag
Current tag.
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(hubbub_tokeniser *tokeniser)
uint32_t end
Index into "]]>".
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(hubbub_tokeniser *tokeniser)
struct hubbub_tokeniser_context::@8 position
Position in source data.
hubbub_content_model content_model
Current content model flag.
static hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
struct hubbub_tokeniser_context::@4 close_tag_match
State for matching close tags.
hubbub_tokeniser_context context
Tokeniser context.
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(hubbub_tokeniser *tokeniser)
hubbub_token_type current_tag_type
Type of current_tag.
static hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
static const uint8_t u_fffd[3]
UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
Process remaining data in the input stream.
bool force_quirks
Doctype force-quirks flag.
static hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
static hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
uint32_t n_attributes
Count of attributes.
bool public_missing
Whether the public id is missing.