summaryrefslogtreecommitdiffstats
path: root/utf8-grammar.scm
blob: 7d8b27730ef5709656d8ec6e41064a6b1abc9a14 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
(define (ucs-range->char-set/inclusive lower upper)
  (ucs-range->char-set lower (add1 upper)))

(define utf8-tail
  (in (ucs-range->char-set/inclusive #x80 #xBF)))

(define utf8-1
  (satisfies (lambda (c) (or (< (char->integer c) 128)
                        (and (> (char->integer c) 128)
                             (< (char->integer c) 191))))))

(define utf8-2
  (sequence
    (in (ucs-range->char-set/inclusive #xC2 #xDF))
    utf8-tail))

(define utf8-3
  (any-of
   (sequence
     (is #\xE0)
     (in (ucs-range->char-set/inclusive #xA0 #xBF))
     utf8-tail)
   (sequence
     (in (ucs-range->char-set/inclusive #xE1 #xEC))
     (repeated utf8-tail 2))
   (sequence
     (is #\xED)
     (in (ucs-range->char-set/inclusive #x80 #x9F))
     utf8-tail)
   (sequence
     (in (ucs-range->char-set/inclusive  #xEE #xEF))
     (repeated utf8-tail 2))))

(define utf8-4
  (any-of
   (sequence
     (is #\xF0)
     (in (ucs-range->char-set/inclusive #x90 #xBF))
     (repeated utf8-tail 2))
   (sequence
     (in (ucs-range->char-set/inclusive #xF1 #xF3))
     (repeated utf8-tail 3))
   (sequence
     (is #\xF4)
     (in (ucs-range->char-set/inclusive #x80 #x8F))
     (repeated utf8-tail 2))))

(define utf8-char
  (any-of
   utf8-1
   utf8-2
   utf8-3
   utf8-4))

(define utf8-string
  (followed-by (zero-or-more utf8-char) end-of-input))

;; (parse utf8-string (->parser-input "Hello-µ@ßöäüàá-UTF-8!!"))
;; (parse utf8-char (->parser-input #\a))

;; (define (valid-utf8? s)
;;   (let ((len (string-length s)))
;;     (let loop ((i 0))
;;       (if (= i len)
;;           #t
;;           (let ((r (parse utf8-char (->parser-input (->string (string-ref s i))))))
;;             (if r
;;                 (loop (+ i (length r)))
;;                 (string-ref s i)))))))
;; (valid-utf8? "Hello-µ@ßöäüàá-UTF-8!!")
;; (valid-utf8? "Hello")
;; (parse utf8-char (->parser-input (->string #\H)))

;; #\xC0