aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/lib.mllib2
-rw-r--r--lib/segmenttree.ml131
-rw-r--r--lib/segmenttree.mli20
-rw-r--r--lib/unicodetable.ml416
-rw-r--r--lib/util.ml121
5 files changed, 576 insertions, 114 deletions
diff --git a/lib/lib.mllib b/lib/lib.mllib
index 73e3411dcf..1743ce26ca 100644
--- a/lib/lib.mllib
+++ b/lib/lib.mllib
@@ -2,6 +2,7 @@ Pp_control
Pp
Compat
Flags
+Segmenttree
Unicodetable
Util
Bigint
@@ -25,3 +26,4 @@ Rtree
Heap
Option
Dnet
+
diff --git a/lib/segmenttree.ml b/lib/segmenttree.ml
new file mode 100644
index 0000000000..2a7f9df0ed
--- /dev/null
+++ b/lib/segmenttree.ml
@@ -0,0 +1,131 @@
+(** This module is a very simple implementation of "segment trees".
+
+ A segment tree of type ['a t] represents a mapping from a union of
+ disjoint segments to some values of type 'a.
+*)
+
+(** Misc. functions. *)
+let list_iteri f l =
+ let rec loop i = function
+ | [] -> ()
+ | x :: xs -> f i x; loop (i + 1) xs
+ in
+ loop 0 l
+
+let log2 x = log x /. log 2.
+
+let log2n x = int_of_float (ceil (log2 (float_of_int x)))
+
+(** We focus on integers but this module can be generalized. *)
+type elt = int
+
+(** A value of type [domain] is interpreted differently given its position
+ in the tree. On internal nodes, a domain represents the set of
+ integers which are _not_ in the set of keys handled by the tree. On
+ leaves, a domain represents the st of integers which are in the set of
+ keys. *)
+type domain =
+ (** On internal nodes, a domain [Interval (a, b)] represents
+ the interval [a + 1; b - 1]. On leaves, it represents [a; b].
+ We always have [a] <= [b]. *)
+ | Interval of elt * elt
+ (** On internal node or root, a domain [Universe] represents all
+ the integers. When the tree is not a trivial root,
+ [Universe] has no interpretation on leaves. (The lookup
+ function should never reach the leaves.) *)
+ | Universe
+
+(** We use an array to store the almost complete tree. This array
+ contains at least one element. *)
+type 'a t = (domain * 'a option) array
+
+(** The root is the first item of the array. *)
+let is_root i = (i = 0)
+
+(** Standard layout for left child. *)
+let left_child i = 2 * i + 1
+
+(** Standard layout for right child. *)
+let right_child i = 2 * i + 2
+
+(** Extract the annotation of a node, be it internal or a leaf. *)
+let value_of i t = match t.(i) with (_, Some x) -> x | _ -> raise Not_found
+
+(** Initialize the array to store [n] leaves. *)
+let create n init =
+ Array.make (1 lsl (log2n n + 1) - 1) init
+
+(** Make a complete interval tree from a list of disjoint segments.
+ Precondition : the segments must be sorted. *)
+let make segments =
+ let nsegments = List.length segments in
+ let tree = create nsegments (Universe, None) in
+ let leaves_offset = (1 lsl (log2n nsegments)) - 1 in
+
+ (** The algorithm proceeds in two steps using an intermediate tree
+ to store minimum and maximum of each subtree as annotation of
+ the node. *)
+
+ (** We start from leaves: the last level of the tree is initialized
+ with the given segments... *)
+ list_iteri
+ (fun i ((start, stop), value) ->
+ let k = leaves_offset + i in
+ let i = Interval (start, stop) in
+ tree.(k) <- (i, Some i))
+ segments;
+ (** ... the remaining leaves are initialized with neutral information. *)
+ for k = leaves_offset + nsegments to Array.length tree -1 do
+ tree.(k) <- (Universe, Some Universe)
+ done;
+
+ (** We traverse the tree bottom-up and compute the interval and
+ annotation associated to each node from the annotations of its
+ children. *)
+ for k = leaves_offset - 1 downto 0 do
+ let node, annotation =
+ match value_of (left_child k) tree, value_of (right_child k) tree with
+ | Interval (left_min, left_max), Interval (right_min, right_max) ->
+ (Interval (left_max, right_min), Interval (left_min, right_max))
+ | Interval (min, max), Universe ->
+ (Interval (max, max), Interval (min, max))
+ | Universe, Universe -> Universe, Universe
+ | Universe, _ -> assert false
+ in
+ tree.(k) <- (node, Some annotation)
+ done;
+
+ (** Finally, annotation are replaced with the image related to each leaf. *)
+ let final_tree =
+ Array.mapi (fun i (segment, value) -> (segment, None)) tree
+ in
+ list_iteri
+ (fun i ((start, stop), value) ->
+ final_tree.(leaves_offset + i)
+ <- (Interval (start, stop), Some value))
+ segments;
+ final_tree
+
+(** [lookup k t] looks for an image for key [k] in the interval tree [t].
+ Raise [Not_found] if it fails. *)
+let lookup k t =
+ let i = ref 0 in
+ while (snd t.(!i) = None) do
+ match fst t.(!i) with
+ | Interval (start, stop) ->
+ if k <= start then i := left_child !i
+ else if k >= stop then i:= right_child !i
+ else raise Not_found
+ | Universe -> raise Not_found
+ done;
+ match fst t.(!i) with
+ | Interval (start, stop) ->
+ if k >= start && k <= stop then
+ match snd t.(!i) with
+ | Some v -> v
+ | None -> assert false
+ else
+ raise Not_found
+ | Universe -> assert false
+
+
diff --git a/lib/segmenttree.mli b/lib/segmenttree.mli
new file mode 100644
index 0000000000..4aea13e9ad
--- /dev/null
+++ b/lib/segmenttree.mli
@@ -0,0 +1,20 @@
+(** This module is a very simple implementation of "segment trees".
+
+ A segment tree of type ['a t] represents a mapping from a union of
+ disjoint segments to some values of type 'a.
+*)
+
+(** A mapping from a union of disjoint segments to some values of type ['a]. *)
+type 'a t
+
+(** [make [(i1, j1), v1; (i2, j2), v2; ...] creates a mapping that
+ associates to every integer [x] the value [v1] if [i1 <= x <= j1],
+ [v2] if [i2 <= x <= j2], and so one.
+ Precondition: the segments must be sorted. *)
+val make : ((int * int) * 'a) list -> 'a t
+
+(** [lookup k t] looks for an image for key [k] in the interval tree [t].
+ Raise [Not_found] if it fails. *)
+val lookup : int -> 'a t -> 'a
+
+
diff --git a/lib/unicodetable.ml b/lib/unicodetable.ml
index af8c351d57..f4e978d695 100644
--- a/lib/unicodetable.ml
+++ b/lib/unicodetable.ml
@@ -2201,3 +2201,419 @@ let so = [
(0x1D18C,0x1D1A9);
(0x1D1AE,0x1D1DD)
]
+
+(* Conversion to lower case. *)
+let to_lower = [
+ (0x00041,0x0005A), `Delta (32);
+ (0x000C0,0x000D6), `Delta (32);
+ (0x000D8,0x000DE), `Delta (32);
+ (0x00100,0x00100), `Abs (0x00101);
+ (0x00102,0x00102), `Abs (0x00103);
+ (0x00104,0x00104), `Abs (0x00105);
+ (0x00106,0x00106), `Abs (0x00107);
+ (0x00108,0x00108), `Abs (0x00109);
+ (0x0010A,0x0010A), `Abs (0x0010B);
+ (0x0010C,0x0010C), `Abs (0x0010D);
+ (0x0010E,0x0010E), `Abs (0x0010F);
+ (0x00110,0x00110), `Abs (0x00111);
+ (0x00112,0x00112), `Abs (0x00113);
+ (0x00114,0x00114), `Abs (0x00115);
+ (0x00116,0x00116), `Abs (0x00117);
+ (0x00118,0x00118), `Abs (0x00119);
+ (0x0011A,0x0011A), `Abs (0x0011B);
+ (0x0011C,0x0011C), `Abs (0x0011D);
+ (0x0011E,0x0011E), `Abs (0x0011F);
+ (0x00120,0x00120), `Abs (0x00121);
+ (0x00122,0x00122), `Abs (0x00123);
+ (0x00124,0x00124), `Abs (0x00125);
+ (0x00126,0x00126), `Abs (0x00127);
+ (0x00128,0x00128), `Abs (0x00129);
+ (0x0012A,0x0012A), `Abs (0x0012B);
+ (0x0012C,0x0012C), `Abs (0x0012D);
+ (0x0012E,0x0012E), `Abs (0x0012F);
+ (0x00130,0x00130), `Abs (0x00069);
+ (0x00132,0x00132), `Abs (0x00133);
+ (0x00134,0x00134), `Abs (0x00135);
+ (0x00136,0x00136), `Abs (0x00137);
+ (0x00139,0x00139), `Abs (0x0013A);
+ (0x0013B,0x0013B), `Abs (0x0013C);
+ (0x0013D,0x0013D), `Abs (0x0013E);
+ (0x0013F,0x0013F), `Abs (0x00140);
+ (0x00141,0x00141), `Abs (0x00142);
+ (0x00143,0x00143), `Abs (0x00144);
+ (0x00145,0x00145), `Abs (0x00146);
+ (0x00147,0x00147), `Abs (0x00148);
+ (0x0014A,0x0014A), `Abs (0x0014B);
+ (0x0014C,0x0014C), `Abs (0x0014D);
+ (0x0014E,0x0014E), `Abs (0x0014F);
+ (0x00150,0x00150), `Abs (0x00151);
+ (0x00152,0x00152), `Abs (0x00153);
+ (0x00154,0x00154), `Abs (0x00155);
+ (0x00156,0x00156), `Abs (0x00157);
+ (0x00158,0x00158), `Abs (0x00159);
+ (0x0015A,0x0015A), `Abs (0x0015B);
+ (0x0015C,0x0015C), `Abs (0x0015D);
+ (0x0015E,0x0015E), `Abs (0x0015F);
+ (0x00160,0x00160), `Abs (0x00161);
+ (0x00162,0x00162), `Abs (0x00163);
+ (0x00164,0x00164), `Abs (0x00165);
+ (0x00166,0x00166), `Abs (0x00167);
+ (0x00168,0x00168), `Abs (0x00169);
+ (0x0016A,0x0016A), `Abs (0x0016B);
+ (0x0016C,0x0016C), `Abs (0x0016D);
+ (0x0016E,0x0016E), `Abs (0x0016F);
+ (0x00170,0x00170), `Abs (0x00171);
+ (0x00172,0x00172), `Abs (0x00173);
+ (0x00174,0x00174), `Abs (0x00175);
+ (0x00176,0x00176), `Abs (0x00177);
+ (0x00178,0x00178), `Abs (0x000FF);
+ (0x00179,0x00179), `Abs (0x0017A);
+ (0x0017B,0x0017B), `Abs (0x0017C);
+ (0x0017D,0x0017D), `Abs (0x0017E);
+ (0x00181,0x00181), `Abs (0x00253);
+ (0x00182,0x00182), `Abs (0x00183);
+ (0x00184,0x00184), `Abs (0x00185);
+ (0x00186,0x00186), `Abs (0x00254);
+ (0x00187,0x00187), `Abs (0x00188);
+ (0x00189,0x0018A), `Delta (205);
+ (0x0018B,0x0018B), `Abs (0x0018C);
+ (0x0018E,0x0018E), `Abs (0x001DD);
+ (0x0018F,0x0018F), `Abs (0x00259);
+ (0x00190,0x00190), `Abs (0x0025B);
+ (0x00191,0x00191), `Abs (0x00192);
+ (0x00193,0x00193), `Abs (0x00260);
+ (0x00194,0x00194), `Abs (0x00263);
+ (0x00196,0x00196), `Abs (0x00269);
+ (0x00197,0x00197), `Abs (0x00268);
+ (0x00198,0x00198), `Abs (0x00199);
+ (0x0019C,0x0019C), `Abs (0x0026F);
+ (0x0019D,0x0019D), `Abs (0x00272);
+ (0x0019F,0x0019F), `Abs (0x00275);
+ (0x001A0,0x001A0), `Abs (0x001A1);
+ (0x001A2,0x001A2), `Abs (0x001A3);
+ (0x001A4,0x001A4), `Abs (0x001A5);
+ (0x001A6,0x001A6), `Abs (0x00280);
+ (0x001A7,0x001A7), `Abs (0x001A8);
+ (0x001A9,0x001A9), `Abs (0x00283);
+ (0x001AC,0x001AC), `Abs (0x001AD);
+ (0x001AE,0x001AE), `Abs (0x00288);
+ (0x001AF,0x001AF), `Abs (0x001B0);
+ (0x001B1,0x001B2), `Delta (217);
+ (0x001B3,0x001B3), `Abs (0x001B4);
+ (0x001B5,0x001B5), `Abs (0x001B6);
+ (0x001B7,0x001B7), `Abs (0x00292);
+ (0x001B8,0x001B8), `Abs (0x001B9);
+ (0x001BC,0x001BC), `Abs (0x001BD);
+ (0x001C4,0x001C4), `Abs (0x001C6);
+ (0x001C7,0x001C7), `Abs (0x001C9);
+ (0x001CA,0x001CA), `Abs (0x001CC);
+ (0x001CD,0x001CD), `Abs (0x001CE);
+ (0x001CF,0x001CF), `Abs (0x001D0);
+ (0x001D1,0x001D1), `Abs (0x001D2);
+ (0x001D3,0x001D3), `Abs (0x001D4);
+ (0x001D5,0x001D5), `Abs (0x001D6);
+ (0x001D7,0x001D7), `Abs (0x001D8);
+ (0x001D9,0x001D9), `Abs (0x001DA);
+ (0x001DB,0x001DB), `Abs (0x001DC);
+ (0x001DE,0x001DE), `Abs (0x001DF);
+ (0x001E0,0x001E0), `Abs (0x001E1);
+ (0x001E2,0x001E2), `Abs (0x001E3);
+ (0x001E4,0x001E4), `Abs (0x001E5);
+ (0x001E6,0x001E6), `Abs (0x001E7);
+ (0x001E8,0x001E8), `Abs (0x001E9);
+ (0x001EA,0x001EA), `Abs (0x001EB);
+ (0x001EC,0x001EC), `Abs (0x001ED);
+ (0x001EE,0x001EE), `Abs (0x001EF);
+ (0x001F1,0x001F1), `Abs (0x001F3);
+ (0x001F4,0x001F4), `Abs (0x001F5);
+ (0x001F6,0x001F6), `Abs (0x00195);
+ (0x001F7,0x001F7), `Abs (0x001BF);
+ (0x001F8,0x001F8), `Abs (0x001F9);
+ (0x001FA,0x001FA), `Abs (0x001FB);
+ (0x001FC,0x001FC), `Abs (0x001FD);
+ (0x001FE,0x001FE), `Abs (0x001FF);
+ (0x00200,0x00200), `Abs (0x00201);
+ (0x00202,0x00202), `Abs (0x00203);
+ (0x00204,0x00204), `Abs (0x00205);
+ (0x00206,0x00206), `Abs (0x00207);
+ (0x00208,0x00208), `Abs (0x00209);
+ (0x0020A,0x0020A), `Abs (0x0020B);
+ (0x0020C,0x0020C), `Abs (0x0020D);
+ (0x0020E,0x0020E), `Abs (0x0020F);
+ (0x00210,0x00210), `Abs (0x00211);
+ (0x00212,0x00212), `Abs (0x00213);
+ (0x00214,0x00214), `Abs (0x00215);
+ (0x00216,0x00216), `Abs (0x00217);
+ (0x00218,0x00218), `Abs (0x00219);
+ (0x0021A,0x0021A), `Abs (0x0021B);
+ (0x0021C,0x0021C), `Abs (0x0021D);
+ (0x0021E,0x0021E), `Abs (0x0021F);
+ (0x00220,0x00220), `Abs (0x0019E);
+ (0x00222,0x00222), `Abs (0x00223);
+ (0x00224,0x00224), `Abs (0x00225);
+ (0x00226,0x00226), `Abs (0x00227);
+ (0x00228,0x00228), `Abs (0x00229);
+ (0x0022A,0x0022A), `Abs (0x0022B);
+ (0x0022C,0x0022C), `Abs (0x0022D);
+ (0x0022E,0x0022E), `Abs (0x0022F);
+ (0x00230,0x00230), `Abs (0x00231);
+ (0x00232,0x00232), `Abs (0x00233);
+ (0x00386,0x00386), `Abs (0x003AC);
+ (0x00388,0x0038A), `Delta (37);
+ (0x0038C,0x0038C), `Abs (0x003CC);
+ (0x0038E,0x0038F), `Delta (63);
+ (0x00391,0x003A1), `Delta (32);
+ (0x003A3,0x003AB), `Delta (32);
+ (0x003D8,0x003D8), `Abs (0x003D9);
+ (0x003DA,0x003DA), `Abs (0x003DB);
+ (0x003DC,0x003DC), `Abs (0x003DD);
+ (0x003DE,0x003DE), `Abs (0x003DF);
+ (0x003E0,0x003E0), `Abs (0x003E1);
+ (0x003E2,0x003E2), `Abs (0x003E3);
+ (0x003E4,0x003E4), `Abs (0x003E5);
+ (0x003E6,0x003E6), `Abs (0x003E7);
+ (0x003E8,0x003E8), `Abs (0x003E9);
+ (0x003EA,0x003EA), `Abs (0x003EB);
+ (0x003EC,0x003EC), `Abs (0x003ED);
+ (0x003EE,0x003EE), `Abs (0x003EF);
+ (0x003F4,0x003F4), `Abs (0x003B8);
+ (0x00400,0x0040F), `Delta (80);
+ (0x00410,0x0042F), `Delta (32);
+ (0x00460,0x00460), `Abs (0x00461);
+ (0x00462,0x00462), `Abs (0x00463);
+ (0x00464,0x00464), `Abs (0x00465);
+ (0x00466,0x00466), `Abs (0x00467);
+ (0x00468,0x00468), `Abs (0x00469);
+ (0x0046A,0x0046A), `Abs (0x0046B);
+ (0x0046C,0x0046C), `Abs (0x0046D);
+ (0x0046E,0x0046E), `Abs (0x0046F);
+ (0x00470,0x00470), `Abs (0x00471);
+ (0x00472,0x00472), `Abs (0x00473);
+ (0x00474,0x00474), `Abs (0x00475);
+ (0x00476,0x00476), `Abs (0x00477);
+ (0x00478,0x00478), `Abs (0x00479);
+ (0x0047A,0x0047A), `Abs (0x0047B);
+ (0x0047C,0x0047C), `Abs (0x0047D);
+ (0x0047E,0x0047E), `Abs (0x0047F);
+ (0x00480,0x00480), `Abs (0x00481);
+ (0x0048A,0x0048A), `Abs (0x0048B);
+ (0x0048C,0x0048C), `Abs (0x0048D);
+ (0x0048E,0x0048E), `Abs (0x0048F);
+ (0x00490,0x00490), `Abs (0x00491);
+ (0x00492,0x00492), `Abs (0x00493);
+ (0x00494,0x00494), `Abs (0x00495);
+ (0x00496,0x00496), `Abs (0x00497);
+ (0x00498,0x00498), `Abs (0x00499);
+ (0x0049A,0x0049A), `Abs (0x0049B);
+ (0x0049C,0x0049C), `Abs (0x0049D);
+ (0x0049E,0x0049E), `Abs (0x0049F);
+ (0x004A0,0x004A0), `Abs (0x004A1);
+ (0x004A2,0x004A2), `Abs (0x004A3);
+ (0x004A4,0x004A4), `Abs (0x004A5);
+ (0x004A6,0x004A6), `Abs (0x004A7);
+ (0x004A8,0x004A8), `Abs (0x004A9);
+ (0x004AA,0x004AA), `Abs (0x004AB);
+ (0x004AC,0x004AC), `Abs (0x004AD);
+ (0x004AE,0x004AE), `Abs (0x004AF);
+ (0x004B0,0x004B0), `Abs (0x004B1);
+ (0x004B2,0x004B2), `Abs (0x004B3);
+ (0x004B4,0x004B4), `Abs (0x004B5);
+ (0x004B6,0x004B6), `Abs (0x004B7);
+ (0x004B8,0x004B8), `Abs (0x004B9);
+ (0x004BA,0x004BA), `Abs (0x004BB);
+ (0x004BC,0x004BC), `Abs (0x004BD);
+ (0x004BE,0x004BE), `Abs (0x004BF);
+ (0x004C1,0x004C1), `Abs (0x004C2);
+ (0x004C3,0x004C3), `Abs (0x004C4);
+ (0x004C5,0x004C5), `Abs (0x004C6);
+ (0x004C7,0x004C7), `Abs (0x004C8);
+ (0x004C9,0x004C9), `Abs (0x004CA);
+ (0x004CB,0x004CB), `Abs (0x004CC);
+ (0x004CD,0x004CD), `Abs (0x004CE);
+ (0x004D0,0x004D0), `Abs (0x004D1);
+ (0x004D2,0x004D2), `Abs (0x004D3);
+ (0x004D4,0x004D4), `Abs (0x004D5);
+ (0x004D6,0x004D6), `Abs (0x004D7);
+ (0x004D8,0x004D8), `Abs (0x004D9);
+ (0x004DA,0x004DA), `Abs (0x004DB);
+ (0x004DC,0x004DC), `Abs (0x004DD);
+ (0x004DE,0x004DE), `Abs (0x004DF);
+ (0x004E0,0x004E0), `Abs (0x004E1);
+ (0x004E2,0x004E2), `Abs (0x004E3);
+ (0x004E4,0x004E4), `Abs (0x004E5);
+ (0x004E6,0x004E6), `Abs (0x004E7);
+ (0x004E8,0x004E8), `Abs (0x004E9);
+ (0x004EA,0x004EA), `Abs (0x004EB);
+ (0x004EC,0x004EC), `Abs (0x004ED);
+ (0x004EE,0x004EE), `Abs (0x004EF);
+ (0x004F0,0x004F0), `Abs (0x004F1);
+ (0x004F2,0x004F2), `Abs (0x004F3);
+ (0x004F4,0x004F4), `Abs (0x004F5);
+ (0x004F8,0x004F8), `Abs (0x004F9);
+ (0x00500,0x00500), `Abs (0x00501);
+ (0x00502,0x00502), `Abs (0x00503);
+ (0x00504,0x00504), `Abs (0x00505);
+ (0x00506,0x00506), `Abs (0x00507);
+ (0x00508,0x00508), `Abs (0x00509);
+ (0x0050A,0x0050A), `Abs (0x0050B);
+ (0x0050C,0x0050C), `Abs (0x0050D);
+ (0x0050E,0x0050E), `Abs (0x0050F);
+ (0x00531,0x00556), `Delta (48);
+ (0x01E00,0x01E00), `Abs (0x01E01);
+ (0x01E02,0x01E02), `Abs (0x01E03);
+ (0x01E04,0x01E04), `Abs (0x01E05);
+ (0x01E06,0x01E06), `Abs (0x01E07);
+ (0x01E08,0x01E08), `Abs (0x01E09);
+ (0x01E0A,0x01E0A), `Abs (0x01E0B);
+ (0x01E0C,0x01E0C), `Abs (0x01E0D);
+ (0x01E0E,0x01E0E), `Abs (0x01E0F);
+ (0x01E10,0x01E10), `Abs (0x01E11);
+ (0x01E12,0x01E12), `Abs (0x01E13);
+ (0x01E14,0x01E14), `Abs (0x01E15);
+ (0x01E16,0x01E16), `Abs (0x01E17);
+ (0x01E18,0x01E18), `Abs (0x01E19);
+ (0x01E1A,0x01E1A), `Abs (0x01E1B);
+ (0x01E1C,0x01E1C), `Abs (0x01E1D);
+ (0x01E1E,0x01E1E), `Abs (0x01E1F);
+ (0x01E20,0x01E20), `Abs (0x01E21);
+ (0x01E22,0x01E22), `Abs (0x01E23);
+ (0x01E24,0x01E24), `Abs (0x01E25);
+ (0x01E26,0x01E26), `Abs (0x01E27);
+ (0x01E28,0x01E28), `Abs (0x01E29);
+ (0x01E2A,0x01E2A), `Abs (0x01E2B);
+ (0x01E2C,0x01E2C), `Abs (0x01E2D);
+ (0x01E2E,0x01E2E), `Abs (0x01E2F);
+ (0x01E30,0x01E30), `Abs (0x01E31);
+ (0x01E32,0x01E32), `Abs (0x01E33);
+ (0x01E34,0x01E34), `Abs (0x01E35);
+ (0x01E36,0x01E36), `Abs (0x01E37);
+ (0x01E38,0x01E38), `Abs (0x01E39);
+ (0x01E3A,0x01E3A), `Abs (0x01E3B);
+ (0x01E3C,0x01E3C), `Abs (0x01E3D);
+ (0x01E3E,0x01E3E), `Abs (0x01E3F);
+ (0x01E40,0x01E40), `Abs (0x01E41);
+ (0x01E42,0x01E42), `Abs (0x01E43);
+ (0x01E44,0x01E44), `Abs (0x01E45);
+ (0x01E46,0x01E46), `Abs (0x01E47);
+ (0x01E48,0x01E48), `Abs (0x01E49);
+ (0x01E4A,0x01E4A), `Abs (0x01E4B);
+ (0x01E4C,0x01E4C), `Abs (0x01E4D);
+ (0x01E4E,0x01E4E), `Abs (0x01E4F);
+ (0x01E50,0x01E50), `Abs (0x01E51);
+ (0x01E52,0x01E52), `Abs (0x01E53);
+ (0x01E54,0x01E54), `Abs (0x01E55);
+ (0x01E56,0x01E56), `Abs (0x01E57);
+ (0x01E58,0x01E58), `Abs (0x01E59);
+ (0x01E5A,0x01E5A), `Abs (0x01E5B);
+ (0x01E5C,0x01E5C), `Abs (0x01E5D);
+ (0x01E5E,0x01E5E), `Abs (0x01E5F);
+ (0x01E60,0x01E60), `Abs (0x01E61);
+ (0x01E62,0x01E62), `Abs (0x01E63);
+ (0x01E64,0x01E64), `Abs (0x01E65);
+ (0x01E66,0x01E66), `Abs (0x01E67);
+ (0x01E68,0x01E68), `Abs (0x01E69);
+ (0x01E6A,0x01E6A), `Abs (0x01E6B);
+ (0x01E6C,0x01E6C), `Abs (0x01E6D);
+ (0x01E6E,0x01E6E), `Abs (0x01E6F);
+ (0x01E70,0x01E70), `Abs (0x01E71);
+ (0x01E72,0x01E72), `Abs (0x01E73);
+ (0x01E74,0x01E74), `Abs (0x01E75);
+ (0x01E76,0x01E76), `Abs (0x01E77);
+ (0x01E78,0x01E78), `Abs (0x01E79);
+ (0x01E7A,0x01E7A), `Abs (0x01E7B);
+ (0x01E7C,0x01E7C), `Abs (0x01E7D);
+ (0x01E7E,0x01E7E), `Abs (0x01E7F);
+ (0x01E80,0x01E80), `Abs (0x01E81);
+ (0x01E82,0x01E82), `Abs (0x01E83);
+ (0x01E84,0x01E84), `Abs (0x01E85);
+ (0x01E86,0x01E86), `Abs (0x01E87);
+ (0x01E88,0x01E88), `Abs (0x01E89);
+ (0x01E8A,0x01E8A), `Abs (0x01E8B);
+ (0x01E8C,0x01E8C), `Abs (0x01E8D);
+ (0x01E8E,0x01E8E), `Abs (0x01E8F);
+ (0x01E90,0x01E90), `Abs (0x01E91);
+ (0x01E92,0x01E92), `Abs (0x01E93);
+ (0x01E94,0x01E94), `Abs (0x01E95);
+ (0x01EA0,0x01EA0), `Abs (0x01EA1);
+ (0x01EA2,0x01EA2), `Abs (0x01EA3);
+ (0x01EA4,0x01EA4), `Abs (0x01EA5);
+ (0x01EA6,0x01EA6), `Abs (0x01EA7);
+ (0x01EA8,0x01EA8), `Abs (0x01EA9);
+ (0x01EAA,0x01EAA), `Abs (0x01EAB);
+ (0x01EAC,0x01EAC), `Abs (0x01EAD);
+ (0x01EAE,0x01EAE), `Abs (0x01EAF);
+ (0x01EB0,0x01EB0), `Abs (0x01EB1);
+ (0x01EB2,0x01EB2), `Abs (0x01EB3);
+ (0x01EB4,0x01EB4), `Abs (0x01EB5);
+ (0x01EB6,0x01EB6), `Abs (0x01EB7);
+ (0x01EB8,0x01EB8), `Abs (0x01EB9);
+ (0x01EBA,0x01EBA), `Abs (0x01EBB);
+ (0x01EBC,0x01EBC), `Abs (0x01EBD);
+ (0x01EBE,0x01EBE), `Abs (0x01EBF);
+ (0x01EC0,0x01EC0), `Abs (0x01EC1);
+ (0x01EC2,0x01EC2), `Abs (0x01EC3);
+ (0x01EC4,0x01EC4), `Abs (0x01EC5);
+ (0x01EC6,0x01EC6), `Abs (0x01EC7);
+ (0x01EC8,0x01EC8), `Abs (0x01EC9);
+ (0x01ECA,0x01ECA), `Abs (0x01ECB);
+ (0x01ECC,0x01ECC), `Abs (0x01ECD);
+ (0x01ECE,0x01ECE), `Abs (0x01ECF);
+ (0x01ED0,0x01ED0), `Abs (0x01ED1);
+ (0x01ED2,0x01ED2), `Abs (0x01ED3);
+ (0x01ED4,0x01ED4), `Abs (0x01ED5);
+ (0x01ED6,0x01ED6), `Abs (0x01ED7);
+ (0x01ED8,0x01ED8), `Abs (0x01ED9);
+ (0x01EDA,0x01EDA), `Abs (0x01EDB);
+ (0x01EDC,0x01EDC), `Abs (0x01EDD);
+ (0x01EDE,0x01EDE), `Abs (0x01EDF);
+ (0x01EE0,0x01EE0), `Abs (0x01EE1);
+ (0x01EE2,0x01EE2), `Abs (0x01EE3);
+ (0x01EE4,0x01EE4), `Abs (0x01EE5);
+ (0x01EE6,0x01EE6), `Abs (0x01EE7);
+ (0x01EE8,0x01EE8), `Abs (0x01EE9);
+ (0x01EEA,0x01EEA), `Abs (0x01EEB);
+ (0x01EEC,0x01EEC), `Abs (0x01EED);
+ (0x01EEE,0x01EEE), `Abs (0x01EEF);
+ (0x01EF0,0x01EF0), `Abs (0x01EF1);
+ (0x01EF2,0x01EF2), `Abs (0x01EF3);
+ (0x01EF4,0x01EF4), `Abs (0x01EF5);
+ (0x01EF6,0x01EF6), `Abs (0x01EF7);
+ (0x01EF8,0x01EF8), `Abs (0x01EF9);
+ (0x01F08,0x01F0F), `Delta (-8);
+ (0x01F18,0x01F1D), `Delta (-8);
+ (0x01F28,0x01F2F), `Delta (-8);
+ (0x01F38,0x01F3F), `Delta (-8);
+ (0x01F48,0x01F4D), `Delta (-8);
+ (0x01F59,0x01F59), `Abs (0x01F51);
+ (0x01F5B,0x01F5B), `Abs (0x01F53);
+ (0x01F5D,0x01F5D), `Abs (0x01F55);
+ (0x01F5F,0x01F5F), `Abs (0x01F57);
+ (0x01F68,0x01F6F), `Delta (-8);
+ (0x01FB8,0x01FB9), `Delta (-8);
+ (0x01FBA,0x01FBB), `Delta (-74);
+ (0x01FC8,0x01FCB), `Delta (-86);
+ (0x01FD8,0x01FD9), `Delta (-8);
+ (0x01FDA,0x01FDB), `Delta (-100);
+ (0x01FE8,0x01FE9), `Delta (-8);
+ (0x01FEA,0x01FEB), `Delta (-112);
+ (0x01FEC,0x01FEC), `Abs (0x01FE5);
+ (0x01FF8,0x01FF9), `Delta (-128);
+ (0x01FFA,0x01FFB), `Delta (-126);
+ (0x02126,0x02126), `Abs (0x003C9);
+ (0x0212A,0x0212A), `Abs (0x0006B);
+ (0x0212B,0x0212B), `Abs (0x000E5);
+ (0x0FF21,0x0FF3A), `Delta (32);
+ (0x10400,0x10425), `Delta (40);
+ (0x001C5,0x001C5), `Abs (0x001C6);
+ (0x001C8,0x001C8), `Abs (0x001C9);
+ (0x001CB,0x001CB), `Abs (0x001CC);
+ (0x001F2,0x001F2), `Abs (0x001F3);
+ (0x01F88,0x01F8F), `Delta (-8);
+ (0x01F98,0x01F9F), `Delta (-8);
+ (0x01FA8,0x01FAF), `Delta (-8);
+ (0x01FBC,0x01FBC), `Abs (0x01FB3);
+ (0x01FCC,0x01FCC), `Abs (0x01FC3);
+ (0x01FFC,0x01FFC), `Abs (0x01FF3);
+ (0x02160,0x0216F), `Delta (16)
+]
+
diff --git a/lib/util.ml b/lib/util.ml
index 69a1e9f8bb..b8ceea3ea0 100644
--- a/lib/util.ml
+++ b/lib/util.ml
@@ -347,120 +347,13 @@ let check_ident_gen handle s =
let check_ident_soft = check_ident_gen warning
let check_ident = check_ident_gen error
-let lowercase_unicode s unicode =
- match unicode land 0x1F000 with
- | 0x0 ->
- begin match unicode with
- (* utf-8 Basic Latin underscore *)
- | x when x = 0x005F -> x
- (* utf-8 Basic Latin letters *)
- | x when 0x0041 <= x & x <= 0x005A -> x + 32
- | x when 0x0061 <= x & x <= 0x007A -> x
- (* utf-8 Latin-1 non breaking space U00A0 *)
- | 0x00A0 as x -> x
- (* utf-8 Latin-1 letters U00C0-00D6 *)
- | x when 0x00C0 <= x & x <= 0x00D6 -> x + 32
- (* utf-8 Latin-1 letters U00D8-00F6 *)
- | x when 0x00D8 <= x & x <= 0x00DE -> x + 32
- | x when 0x00E0 <= x & x <= 0x00F6 -> x
- (* utf-8 Latin-1 letters U00F8-00FF *)
- | x when 0x00F8 <= x & x <= 0x00FF -> x
- (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *)
- | x when 0x0100 <= x & x <= 0x017F ->
- if x mod 2 = 1 then x else x + 1
- | x when 0x0180 <= x & x <= 0x0241 ->
- warning ("Unable to decide which lowercase letter to map to "^s); x
- (* utf-8 Phonetic letters U0250-02AF *)
- | x when 0x0250 <= x & x <= 0x02AF -> x
- (* utf-8 what do to with diacritics U0300-U036F ? *)
- (* utf-8 Greek letters U0380-03FF *)
- | x when 0x0380 <= x & x <= 0x0385 -> x
- | 0x0386 -> 0x03AC
- | x when 0x0388 <= x & x <= 0x038A -> x + 37
- | 0x038C -> 0x03CC
- | x when 0x038E <= x & x <= 0x038F -> x + 63
- | x when 0x0390 <= x & x <= 0x03AB & x <> 0x03A2 -> x + 32
- (* utf-8 Greek lowercase letters U03B0-03CE *)
- | x when 0x03AC <= x & x <= 0x03CE -> x
- | x when 0x03CF <= x & x <= 0x03FF ->
- warning ("Unable to decide which lowercase letter to map to "^s); x
- (* utf-8 Cyrillic letters U0400-0481 *)
- | x when 0x0400 <= x & x <= 0x040F -> x + 80
- | x when 0x0410 <= x & x <= 0x042F -> x + 32
- | x when 0x0430 <= x & x <= 0x045F -> x
- | x when 0x0460 <= x & x <= 0x0481 ->
- if x mod 2 = 1 then x else x + 1
- (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *)
- | x when 0x048A <= x & x <= 0x04F9 & x <> 0x04CF ->
- if x mod 2 = 1 then x else x + 1
- (* utf-8 Cyrillic supplement letters U0500-U050F *)
- | x when 0x0500 <= x & x <= 0x050F ->
- if x mod 2 = 1 then x else x + 1
- (* utf-8 Hebrew letters U05D0-05EA *)
- | x when 0x05D0 <= x & x <= 0x05EA -> x
- (* utf-8 Arabic letters U0621-064A *)
- | x when 0x0621 <= x & x <= 0x064A -> x
- (* utf-8 Arabic supplement letters U0750-076D *)
- | x when 0x0750 <= x & x <= 0x076D -> x
- | _ -> raise UnsupportedUtf8
- end
- | 0x1000 ->
- begin match unicode with
- (* utf-8 Georgian U10A0-10FF (has holes) *)
- | x when 0x10A0 <= x & x <= 0x10FF -> x
- (* utf-8 Hangul Jamo U1100-11FF (has holes) *)
- | x when 0x1100 <= x & x <= 0x11FF -> x
- (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *)
- | x when 0x1E00 <= x & x <= 0x1E95 ->
- if x mod 2 = 1 then x else x + 1
- | x when 0x1E96 <= x & x <= 0x1E9B -> x
- | x when 0x1EA0 <= x & x <= 0x1EF9 ->
- if x mod 2 = 1 then x else x + 1
- | _ -> raise UnsupportedUtf8
- end
- | 0x2000 ->
- begin match unicode with
- (* utf-8 general punctuation U2080-2089 *)
- (* Hyphens *)
- | x when 0x2010 <= x & x <= 0x2011 -> x
- (* utf-8 letter-like U2100-214F *)
- | 0x2102 (* double-struck C *) -> Char.code 'x'
- | 0x2115 (* double-struck N *) -> Char.code 'n'
- | 0x2119 (* double-struck P *) -> Char.code 'x'
- | 0x211A (* double-struck Q *) -> Char.code 'x'
- | 0x211D (* double-struck R *) -> Char.code 'r'
- | 0x2124 (* double-struck Z *) -> Char.code 'x'
- | x when 0x2100 <= x & x <= 0x214F ->
- warning ("Unable to decide which lowercase letter to map to "^s); x
- | _ -> raise UnsupportedUtf8
- end
- | _ ->
- begin match unicode with
- (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *)
- | x when 0x3040 <= x & x <= 0x30FF -> x
- (* utf-8 Unified CJK Ideographs U4E00-9FA5 *)
- | x when 0x4E00 <= x & x <= 0x9FA5 -> x
- (* utf-8 Hangul syllables UAC00-D7AF *)
- | x when 0xAC00 <= x & x <= 0xD7AF -> x
- (* utf-8 Gothic U10330-1034A *)
- | x when 0x10330 <= x & x <= 0x1034A -> x
- (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *)
- | x when 0x1D6A8 <= x & x <= 0x1D7C9 ->
- let a = (x - 0x1D6A8) mod 58 in
- if a <= 16 or (18 <= a & a <= 24)
- then x + 26 (* all but nabla and theta symbol *)
- else x
- | x when 0x1D538 <= x & x <= 0x1D56B ->
- (* Use ordinary lowercase in both small and capital double-struck *)
- (x - 0x1D538) mod 26 + Char.code 'a'
- | x when 0x1D468 <= x & x <= 0x1D6A3 -> (* General case *)
- if (x - 0x1D400 / 26) mod 2 = 0 then x + 26 else x
- | x when 0x1D400 <= x & x <= 0x1D7CB -> (* fallback *)
- x
- (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *)
- | x when 0x1D7CE <= x & x <= 0x1D7FF -> x
- | _ -> raise UnsupportedUtf8
- end
+let lowercase_unicode s unicode =
+ let tree = Segmenttree.make Unicodetable.to_lower in
+ try
+ match Segmenttree.lookup unicode tree with
+ | `Abs c -> c
+ | `Delta d -> unicode + d
+ with Not_found -> unicode
let lowercase_first_char_utf8 s =
assert (s <> "");