summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Thomas Hintz <t@thintz.com> 2014-10-17 06:51:13 -0700
committer Thomas Hintz <t@thintz.com> 2014-10-17 06:51:13 -0700
commit baf570ab655dd0ae47b13f139817060cb9592302 (patch)
tree f454c07f0c1eecdf4251732ff150765458df8195
parent daef7b3ea4e2d4c598ced62fd0b0043fb28b2799 (diff)
download websockets-baf570ab655dd0ae47b13f139817060cb9592302.tar.gz
Add another utf8 decoder.
-rw-r--r-- utf8decode.c 56
-rw-r--r-- utf8decode.h 25
2 files changed, 81 insertions, 0 deletions
diff --git a/utf8decode.c b/utf8decode.c
new file mode 100644
index 0000000..4e1ff41
--- /dev/null
+++ b/utf8decode.c
@@ -0,0 +1,56 @@
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+#include "utf8decode.h"
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 1
+
+static const uint8_t utf8d[] = { // [0..255]: byte -> class; [256 + state*16 + class]: next state
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+ 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+ 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0 (one 16-entry row per state; s0 = UTF8_ACCEPT)
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+ 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+ 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+ 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+// One DFA step: folds `byte` (expected 0..255) into *codep, stores and returns the new *state.
+static inline uint32_t
+decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+  const uint32_t cls = utf8d[byte];  // class of this byte, from the first 256 table entries
+  if (*state == UTF8_ACCEPT) {
+    *codep = byte & (0xff >> cls);  // not inside a sequence: mask the byte by its class
+  } else {
+    *codep = (*codep << 6) | (byte & 0x3fu);  // inside a sequence: append the low 6 bits
+  }
+  return (*state = utf8d[256 + *state*16 + cls]);  // transition rows start at 256, 16 per state
+}
+
+// Runs the UTF-8 DFA over the len bytes at s and reports whether they form
+// complete, well-formed UTF-8. Despite the name, no count is returned.
+// Returns 0 when the input is valid (this includes len == 0), 1 otherwise:
+// either a byte was rejected or the input stops in the middle of a sequence.
+int
+countCodePoints(uint8_t* s, size_t len) {
+  uint32_t codepoint = 0;  // scratch for decode(); only the final state is used
+  uint32_t state = UTF8_ACCEPT;
+
+  // size_t index: an int counter would be truncated for len > INT_MAX.
+  for (size_t i = 0; i < len; ++i) {
+    // Decode s[i] itself, so every byte in [0, len) is examined and nothing
+    // past the end of the buffer is read.
+    if (decode(&state, &codepoint, s[i]) == UTF8_REJECT) {
+      break;  // the reject state is absorbing, so later bytes cannot matter
+    }
+  }
+  return state != UTF8_ACCEPT;
+}
diff --git a/utf8decode.h b/utf8decode.h
new file mode 100644
index 0000000..d4f7f40
--- /dev/null
+++ b/utf8decode.h
@@ -0,0 +1,25 @@
+#ifndef UTF8_DECODE_H
+#define UTF8_DECODE_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+//inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte);
+
+extern int countCodePoints(uint8_t* s, size_t count); // count = byte length; returns 0 if the decoder ends in the accept state, nonzero otherwise (NOT a code point count)
+
+/* typedef struct {
+ size_t current_index;
+ size_t total_index;
+ int state;
+ int is_valid;
+ int ends_on_codepoint;
+} utf8_validator_t;
+
+extern void utf8vld_reset (utf8_validator_t* validator);
+
+extern void utf8vld_validate (utf8_validator_t* validator, const uint8_t* data, size_t offset, size_t length);
+
+extern int utf8_valid(const uint8_t* data, size_t len); */
+
+#endif // UTF8_DECODE_H