aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode.ml
diff options
context:
space:
mode:
Diffstat (limited to 'lib/unicode.ml')
-rw-r--r--lib/unicode.ml85
1 files changed, 79 insertions, 6 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 0dc4238ee4..ced5e258c2 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -8,9 +8,7 @@
(** Unicode utilities *)
-type status = Letter | IdentPart | Symbol
-
-exception Unsupported
+type status = Letter | IdentPart | Symbol | Unknown
(* The following table stores classes of Unicode characters that
are used by the lexer. There are 3 different classes so 2 bits are
@@ -18,7 +16,7 @@ exception Unsupported
to simplify the masking process. (This choice seems to be a good
trade-off between speed and space after some benchmarks.) *)
-(* A 256ko table, initially filled with zeros. *)
+(* A 256 KiB table, initially filled with zeros. *)
let table = Array.make (1 lsl 17) 0
(* Associate a 2-bit pattern to each status at position [i].
@@ -29,6 +27,7 @@ let mask i = function
| Letter -> 1 lsl ((i land 7) lsl 1) (* 01 *)
| IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
| Symbol -> 3 lsl ((i land 7) lsl 1) (* 11 *)
+ | Unknown -> 0 lsl ((i land 7) lsl 1) (* 00 *)
(* Helper to reset 2 bits in a word. *)
let reset_mask i =
@@ -55,7 +54,7 @@ let lookup x =
if v = 1 then Letter
else if v = 2 then IdentPart
else if v = 3 then Symbol
- else raise Unsupported
+ else Unknown
(* [classify] discriminates between 3 different kinds of
symbols based on the standard unicode classification (extracted from
@@ -147,6 +146,11 @@ let utf8_of_unicode n =
s
end
+(* If [s] is some UTF-8 encoded string
+ and [i] is a position of some UTF-8 character within [s]
+ then [next_utf8 s i] returns [(j,n)] where:
+ - [j] indicates the position of the next UTF-8 character
+ - [n] represents the UTF-8 character at index [i] *)
let next_utf8 s i =
let err () = invalid_arg "utf8" in
let l = String.length s - i in
@@ -210,7 +214,6 @@ let ident_refutation s =
|x -> x
with
| End_of_input -> Some (true,"The empty string is not an identifier.")
- | Unsupported -> Some (true,s^": unsupported character in utf8 sequence.")
| Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
let lowercase_unicode =
@@ -256,3 +259,73 @@ let ascii_of_ident s =
(Buffer.add_char out s.[!i]; incr i)
done;
Buffer.contents out
+
+(* Compute length of an UTF-8 encoded string
+ Rem 1 : utf8_length <= String.length (equal if pure ascii)
+ Rem 2 : if used for an iso8859_1 encoded string, the result is
+ wrong in very rare cases. Such a wrong case corresponds to any
+ sequence of a character in range 192..253 immediately followed by a
+ character in range 128..191 (typical case in french is "déçu" which
+ is counted 3 instead of 4); then no real harm to use always
+ utf8_length even if using an iso8859_1 encoding *)
+
+(** FIXME: duplicate code with Pp *)
+
+let utf8_length s =
+ let len = String.length s
+ and cnt = ref 0
+ and nc = ref 0
+ and p = ref 0 in
+ while !p < len do
+ begin
+ match s.[!p] with
+ | '\000'..'\127' -> nc := 0 (* ascii char *)
+ | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+ | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+ | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+ | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+ | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
+ | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
+ | '\254'..'\255' -> nc := 0 (* invalid byte *)
+ end ;
+ incr p ;
+ while !p < len && !nc > 0 do
+ match s.[!p] with
+ | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+ | _ (* not a continuation byte *) -> nc := 0
+ done ;
+ incr cnt
+ done ;
+ !cnt
+
+(* Variant of String.sub for UTF8 character positions *)
+let utf8_sub s start_u len_u =
+ let len_b = String.length s
+ and end_u = start_u + len_u
+ and cnt = ref 0
+ and nc = ref 0
+ and p = ref 0 in
+ let start_b = ref len_b in
+ while !p < len_b && !cnt < end_u do
+ if !cnt <= start_u then start_b := !p ;
+ begin
+ match s.[!p] with
+ | '\000'..'\127' -> nc := 0 (* ascii char *)
+ | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+ | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+ | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+ | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+ | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
+ | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
+ | '\254'..'\255' -> nc := 0 (* invalid byte *)
+ end ;
+ incr p ;
+ while !p < len_b && !nc > 0 do
+ match s.[!p] with
+ | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+ | _ (* not a continuation byte *) -> nc := 0
+ done ;
+ incr cnt
+ done ;
+ let end_b = !p in
+ String.sub s !start_b (end_b - !start_b)