aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode.ml
diff options
context:
space:
mode:
authorPierre Courtieu2016-04-15 16:45:14 +0200
committerPierre Courtieu2016-04-15 16:45:14 +0200
commitcaa1f67de10614984fa7e1c68aa8adf0ff90196a (patch)
tree3c265ac5e16851c5dc1ca917ddb03725e09ea0ff /lib/unicode.ml
parentbe824224cc76f729872e9d803fc64831b95aee94 (diff)
parent3b3d98acd58e91c960a2e11cd47ac19b2b34f86b (diff)
Merge remote-tracking branch 'OFFICIAL/trunk' into morefresh
Diffstat (limited to 'lib/unicode.ml')
-rw-r--r--lib/unicode.ml14
1 files changed, 13 insertions, 1 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 1765e93dcd..938e8f1a99 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -18,7 +18,7 @@ exception Unsupported
to simplify the masking process. (This choice seems to be a good
trade-off between speed and space after some benchmarks.) *)
-(* A 256ko table, initially filled with zeros. *)
+(* A 256 KiB table, initially filled with zeros. *)
let table = Array.make (1 lsl 17) 0
(* Associate a 2-bit pattern to each status at position [i].
@@ -147,6 +147,11 @@ let utf8_of_unicode n =
s
end
+(* If [s] is some UTF-8 encoded string
+ and [i] is a position of some UTF-8 character within [s]
+ then [next_utf8 s i] returns [(j,n)] where:
+ - [j] indicates the position of the next UTF-8 character
+ - [n] represents the UTF-8 character at index [i] *)
let next_utf8 s i =
let err () = invalid_arg "utf8" in
let l = String.length s - i in
@@ -168,6 +173,13 @@ let next_utf8 s i =
(c land 0x3F) lsl 6 + (d land 0x3F)
else err ()
+let is_utf8 s =
+ let rec check i =
+ let (off, _) = next_utf8 s i in
+ check (i + off)
+ in
+ try check 0 with End_of_input -> true | Invalid_argument _ -> false
+
(* Check the well-formedness of an identifier *)
let initial_refutation j n s =