summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Hintz <t@thintz.com>2014-10-17 06:52:56 -0700
committerThomas Hintz <t@thintz.com>2014-10-17 06:52:56 -0700
commitd194289740ddc54242af58db9370f77b73ba438a (patch)
treee6d9e35991c3f853f99a95f29cdcffd98467fa53
parentbaf570ab655dd0ae47b13f139817060cb9592302 (diff)
downloadwebsockets-d194289740ddc54242af58db9370f77b73ba438a.tar.gz
Switch to comparse for UTF8 validation.
-rw-r--r--utf8-grammar.scm76
-rw-r--r--websockets.scm54
2 files changed, 80 insertions, 50 deletions
diff --git a/utf8-grammar.scm b/utf8-grammar.scm
new file mode 100644
index 0000000..ec266ae
--- /dev/null
+++ b/utf8-grammar.scm
@@ -0,0 +1,76 @@
+(import chicken scheme)
+(use srfi-4 srfi-13 srfi-14 comparse)
+
+
+(define (ucs-range->char-set/inclusive lower upper)
+ (ucs-range->char-set lower (add1 upper)))
+
+(define utf8-tail
+ (in (ucs-range->char-set/inclusive #x80 #xBF)))
+
+(define utf8-1
+ (in (ucs-range->char-set/inclusive #x00 #x7F)))
+
+(define utf8-2
+ (sequence
+ (in (ucs-range->char-set/inclusive #xC2 #xDF))
+ utf8-tail))
+
+(define utf8-3
+ (any-of
+ (sequence
+ (is #\xE0)
+ (in (ucs-range->char-set/inclusive #xA0 #xBF))
+ utf8-tail)
+ (sequence
+ (in (ucs-range->char-set/inclusive #xE1 #xEC))
+ (repeated utf8-tail 2))
+ (sequence
+ (is #\xED)
+ (in (ucs-range->char-set/inclusive #x80 #x9F))
+ utf8-tail)
+ (sequence
+ (in (ucs-range->char-set/inclusive #xEE #xEF))
+ (repeated utf8-tail 2))))
+
+(define utf8-4
+ (any-of
+ (sequence
+ (is #\xF0)
+ (in (ucs-range->char-set/inclusive #x90 #xBF))
+ (repeated utf8-tail 2))
+ (sequence
+ (in (ucs-range->char-set/inclusive #xF1 #xF3))
+ (repeated utf8-tail 3))
+ (sequence
+ (is #\xF4)
+ (in (ucs-range->char-set/inclusive #x80 #x8F))
+ (repeated utf8-tail 2))))
+
+(define utf8-char
+ (any-of
+ utf8-1
+ utf8-2
+ utf8-3
+ utf8-4))
+
+(define utf8-string
+ (followed-by (zero-or-more utf8-char) end-of-input))
+
+;; (parse utf8-string (->parser-input "Hello-µ@ßöäüàá-UTF-8!!"))
+;; (parse utf8-char (->parser-input #\a))
+
+;; (define (valid-utf8? s)
+;; (let ((len (string-length s)))
+;; (let loop ((i 0))
+;; (if (= i len)
+;; #t
+;; (let ((r (parse utf8-char (->parser-input (->string (string-ref s i))))))
+;; (if r
+;; (loop (+ i (length r)))
+;; (string-ref s i)))))))
+;; (valid-utf8? "Hello-µ@ßöäüàá-UTF-8!!")
+;; (valid-utf8? "Hello")
+;; (parse utf8-char (->parser-input (->string #\H)))
+
+;; #\xC0
diff --git a/websockets.scm b/websockets.scm
index 655fa48..ceae0b9 100644
--- a/websockets.scm
+++ b/websockets.scm
@@ -25,9 +25,7 @@
(import chicken scheme data-structures extras ports posix foreign)
(use srfi-1 srfi-4 spiffy intarweb uri-common base64 simple-sha1 srfi-18
- srfi-13 mailbox)
-
-(foreign-declare "#include \"utf8validator.c\"")
+ srfi-13 mailbox srfi-14 comparse)
(define-inline (neq? obj1 obj2) (not (eq? obj1 obj2)))
@@ -186,7 +184,7 @@
(dynamic-wind
(lambda () (mutex-lock! (websocket-send-mutex ws)))
(lambda () (send-frame ws optype data #t))
- (lambda () (mutex-unlock! (websocket-send-mutex ws))))
+ (lambda () (mutex-unlock! (websocket-send-mutex ws)))))
(define (websocket-unmask-frame-payload payload len frame-masking-key)
(define tmaskkey (make-u8vector 4 #f #t #t))
@@ -339,53 +337,9 @@
(make-property-condition 'unhandled-optype
'optype frame-optype)))))))))))
-(define (valid-utf8-2? s)
- (define-external str c-string s)
- (define-external len int (string-length s))
- (zero?
- ((foreign-lambda* int ()
-"
-static const uint8_t utf8d[] = {
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
- 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
- 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
- 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
- 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
- 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
- 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
-};
-
- uint32_t si;
- uint32_t *state;
- si = 0;
- state = &si;
- uint32_t type;
-
- for (int i = 0; i < len; i++) {
- // type = utf8d[(uint8_t)str[i]];
- type = utf8d[*((uint8_t*)str)];
- *state = utf8d[256 + (*state) * 16 + type];
-
- if (*state != 0) // reject
- break;
- }
-
- C_return(*state);
-"
-))
- ))
-
+(include "utf8-grammar.scm")
(define (valid-utf8? s)
- (let ((len (string-length s)))
- ((foreign-lambda int "utf8_valid" scheme-pointer int)
- s len)))
+ (parse utf8-string (->parser-input s) memoize: #t))
(define (close-code->integer s)
(if (string-null? s)