diff options
| author | regisgia | 2010-01-08 17:03:54 +0000 |
|---|---|---|
| committer | regisgia | 2010-01-08 17:03:54 +0000 |
| commit | ff01cafe8104f7620aacbfdde5dba738dbadc326 (patch) | |
| tree | e6ea9e4d236e68ac3def5b029e5eb3aca70bedd3 | |
| parent | 5db31bb0333810ccdd0a79e9855ae9d2fcdbf2d3 (diff) | |
* Segmenttree: New. A very simple implementation of segment trees.
* Unicodetable: Update with the standard table for lower case conversion.
* Util: Rewrite "lowercase_unicode" to take the entire unicode character set
into account.
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@12645 85f007b7-540e-0410-9357-904b9bb8a0f7
| -rw-r--r-- | Makefile.common | 5 | ||||
| -rw-r--r-- | checker/check.mllib | 1 | ||||
| -rw-r--r-- | dev/printers.mllib | 1 | ||||
| -rw-r--r-- | lib/lib.mllib | 2 | ||||
| -rw-r--r-- | lib/segmenttree.ml | 131 | ||||
| -rw-r--r-- | lib/segmenttree.mli | 20 | ||||
| -rw-r--r-- | lib/unicodetable.ml | 416 | ||||
| -rw-r--r-- | lib/util.ml | 121 | ||||
| -rw-r--r-- | parsing/grammar.mllib | 1 |
9 files changed, 582 insertions, 116 deletions
diff --git a/Makefile.common b/Makefile.common index fcd6c81269..a1bfd08856 100644 --- a/Makefile.common +++ b/Makefile.common @@ -156,7 +156,7 @@ BYTERUN:=$(addprefix kernel/byterun/, \ CORECMA:=lib/lib.cma kernel/kernel.cma library/library.cma \ pretyping/pretyping.cma interp/interp.cma proofs/proofs.cma \ parsing/parsing.cma tactics/tactics.cma toplevel/toplevel.cma \ - parsing/highparsing.cma tactics/hightactics.cma + parsing/highparsing.cma tactics/hightactics.cma OMEGACMA:=plugins/omega/omega_plugin.cma ROMEGACMA:=plugins/romega/romega_plugin.cma @@ -223,7 +223,8 @@ IDEMOD:=$(shell cat ide/ide.mllib) COQENVCMO:=$(CONFIG) \ lib/pp_control.cmo lib/pp.cmo lib/compat.cmo lib/flags.cmo \ - lib/unicodetable.cmo lib/util.cmo lib/system.cmo lib/envars.cmo + lib/segmenttree.cmo lib/unicodetable.cmo lib/util.cmo lib/system.cmo \ + lib/envars.cmo COQMKTOPCMO:=$(COQENVCMO) scripts/tolink.cmo scripts/coqmktop.cmo COQMKTOPCMX:=$(COQMKTOPCMO:.cmo=.cmx) diff --git a/checker/check.mllib b/checker/check.mllib index 098e2f1ed2..08dd78bcb7 100644 --- a/checker/check.mllib +++ b/checker/check.mllib @@ -3,6 +3,7 @@ Pp_control Pp Compat Flags +Segmenttree Unicodetable Util Option diff --git a/dev/printers.mllib b/dev/printers.mllib index 00836c5bee..a6129fbf9b 100644 --- a/dev/printers.mllib +++ b/dev/printers.mllib @@ -4,6 +4,7 @@ Pp_control Pp Compat Flags +Segmenttree Unicodetable Util Bigint diff --git a/lib/lib.mllib b/lib/lib.mllib index 73e3411dcf..1743ce26ca 100644 --- a/lib/lib.mllib +++ b/lib/lib.mllib @@ -2,6 +2,7 @@ Pp_control Pp Compat Flags +Segmenttree Unicodetable Util Bigint @@ -25,3 +26,4 @@ Rtree Heap Option Dnet + diff --git a/lib/segmenttree.ml b/lib/segmenttree.ml new file mode 100644 index 0000000000..2a7f9df0ed --- /dev/null +++ b/lib/segmenttree.ml @@ -0,0 +1,131 @@ +(** This module is a very simple implementation of "segment trees". + + A segment tree of type ['a t] represents a mapping from a union of + disjoint segments to some values of type 'a. +*) + +(** Misc. functions. *) +let list_iteri f l = + let rec loop i = function + | [] -> () + | x :: xs -> f i x; loop (i + 1) xs + in + loop 0 l + +let log2 x = log x /. log 2. + +let log2n x = int_of_float (ceil (log2 (float_of_int x))) + +(** We focus on integers but this module can be generalized. *) +type elt = int + +(** A value of type [domain] is interpreted differently given its position + in the tree. On internal nodes, a domain represents the set of + integers which are _not_ in the set of keys handled by the tree. On + leaves, a domain represents the st of integers which are in the set of + keys. *) +type domain = + (** On internal nodes, a domain [Interval (a, b)] represents + the interval [a + 1; b - 1]. On leaves, it represents [a; b]. + We always have [a] <= [b]. *) + | Interval of elt * elt + (** On internal node or root, a domain [Universe] represents all + the integers. When the tree is not a trivial root, + [Universe] has no interpretation on leaves. (The lookup + function should never reach the leaves.) *) + | Universe + +(** We use an array to store the almost complete tree. This array + contains at least one element. *) +type 'a t = (domain * 'a option) array + +(** The root is the first item of the array. *) +let is_root i = (i = 0) + +(** Standard layout for left child. *) +let left_child i = 2 * i + 1 + +(** Standard layout for right child. *) +let right_child i = 2 * i + 2 + +(** Extract the annotation of a node, be it internal or a leaf. *) +let value_of i t = match t.(i) with (_, Some x) -> x | _ -> raise Not_found + +(** Initialize the array to store [n] leaves. *) +let create n init = + Array.make (1 lsl (log2n n + 1) - 1) init + +(** Make a complete interval tree from a list of disjoint segments. + Precondition : the segments must be sorted. *) +let make segments = + let nsegments = List.length segments in + let tree = create nsegments (Universe, None) in + let leaves_offset = (1 lsl (log2n nsegments)) - 1 in + + (** The algorithm proceeds in two steps using an intermediate tree + to store minimum and maximum of each subtree as annotation of + the node. *) + + (** We start from leaves: the last level of the tree is initialized + with the given segments... *) + list_iteri + (fun i ((start, stop), value) -> + let k = leaves_offset + i in + let i = Interval (start, stop) in + tree.(k) <- (i, Some i)) + segments; + (** ... the remaining leaves are initialized with neutral information. *) + for k = leaves_offset + nsegments to Array.length tree -1 do + tree.(k) <- (Universe, Some Universe) + done; + + (** We traverse the tree bottom-up and compute the interval and + annotation associated to each node from the annotations of its + children. *) + for k = leaves_offset - 1 downto 0 do + let node, annotation = + match value_of (left_child k) tree, value_of (right_child k) tree with + | Interval (left_min, left_max), Interval (right_min, right_max) -> + (Interval (left_max, right_min), Interval (left_min, right_max)) + | Interval (min, max), Universe -> + (Interval (max, max), Interval (min, max)) + | Universe, Universe -> Universe, Universe + | Universe, _ -> assert false + in + tree.(k) <- (node, Some annotation) + done; + + (** Finally, annotation are replaced with the image related to each leaf. *) + let final_tree = + Array.mapi (fun i (segment, value) -> (segment, None)) tree + in + list_iteri + (fun i ((start, stop), value) -> + final_tree.(leaves_offset + i) + <- (Interval (start, stop), Some value)) + segments; + final_tree + +(** [lookup k t] looks for an image for key [k] in the interval tree [t]. + Raise [Not_found] if it fails. *) +let lookup k t = + let i = ref 0 in + while (snd t.(!i) = None) do + match fst t.(!i) with + | Interval (start, stop) -> + if k <= start then i := left_child !i + else if k >= stop then i:= right_child !i + else raise Not_found + | Universe -> raise Not_found + done; + match fst t.(!i) with + | Interval (start, stop) -> + if k >= start && k <= stop then + match snd t.(!i) with + | Some v -> v + | None -> assert false + else + raise Not_found + | Universe -> assert false + + diff --git a/lib/segmenttree.mli b/lib/segmenttree.mli new file mode 100644 index 0000000000..4aea13e9ad --- /dev/null +++ b/lib/segmenttree.mli @@ -0,0 +1,20 @@ +(** This module is a very simple implementation of "segment trees". + + A segment tree of type ['a t] represents a mapping from a union of + disjoint segments to some values of type 'a. +*) + +(** A mapping from a union of disjoint segments to some values of type ['a]. *) +type 'a t + +(** [make [(i1, j1), v1; (i2, j2), v2; ...] creates a mapping that + associates to every integer [x] the value [v1] if [i1 <= x <= j1], + [v2] if [i2 <= x <= j2], and so one. + Precondition: the segments must be sorted. *) +val make : ((int * int) * 'a) list -> 'a t + +(** [lookup k t] looks for an image for key [k] in the interval tree [t]. + Raise [Not_found] if it fails. *) +val lookup : int -> 'a t -> 'a + + diff --git a/lib/unicodetable.ml b/lib/unicodetable.ml index af8c351d57..f4e978d695 100644 --- a/lib/unicodetable.ml +++ b/lib/unicodetable.ml @@ -2201,3 +2201,419 @@ let so = [ (0x1D18C,0x1D1A9); (0x1D1AE,0x1D1DD) ] + +(* Conversion to lower case. *) +let to_lower = [ + (0x00041,0x0005A), `Delta (32); + (0x000C0,0x000D6), `Delta (32); + (0x000D8,0x000DE), `Delta (32); + (0x00100,0x00100), `Abs (0x00101); + (0x00102,0x00102), `Abs (0x00103); + (0x00104,0x00104), `Abs (0x00105); + (0x00106,0x00106), `Abs (0x00107); + (0x00108,0x00108), `Abs (0x00109); + (0x0010A,0x0010A), `Abs (0x0010B); + (0x0010C,0x0010C), `Abs (0x0010D); + (0x0010E,0x0010E), `Abs (0x0010F); + (0x00110,0x00110), `Abs (0x00111); + (0x00112,0x00112), `Abs (0x00113); + (0x00114,0x00114), `Abs (0x00115); + (0x00116,0x00116), `Abs (0x00117); + (0x00118,0x00118), `Abs (0x00119); + (0x0011A,0x0011A), `Abs (0x0011B); + (0x0011C,0x0011C), `Abs (0x0011D); + (0x0011E,0x0011E), `Abs (0x0011F); + (0x00120,0x00120), `Abs (0x00121); + (0x00122,0x00122), `Abs (0x00123); + (0x00124,0x00124), `Abs (0x00125); + (0x00126,0x00126), `Abs (0x00127); + (0x00128,0x00128), `Abs (0x00129); + (0x0012A,0x0012A), `Abs (0x0012B); + (0x0012C,0x0012C), `Abs (0x0012D); + (0x0012E,0x0012E), `Abs (0x0012F); + (0x00130,0x00130), `Abs (0x00069); + (0x00132,0x00132), `Abs (0x00133); + (0x00134,0x00134), `Abs (0x00135); + (0x00136,0x00136), `Abs (0x00137); + (0x00139,0x00139), `Abs (0x0013A); + (0x0013B,0x0013B), `Abs (0x0013C); + (0x0013D,0x0013D), `Abs (0x0013E); + (0x0013F,0x0013F), `Abs (0x00140); + (0x00141,0x00141), `Abs (0x00142); + (0x00143,0x00143), `Abs (0x00144); + (0x00145,0x00145), `Abs (0x00146); + (0x00147,0x00147), `Abs (0x00148); + (0x0014A,0x0014A), `Abs (0x0014B); + (0x0014C,0x0014C), `Abs (0x0014D); + (0x0014E,0x0014E), `Abs (0x0014F); + (0x00150,0x00150), `Abs (0x00151); + (0x00152,0x00152), `Abs (0x00153); + (0x00154,0x00154), `Abs (0x00155); + (0x00156,0x00156), `Abs (0x00157); + (0x00158,0x00158), `Abs (0x00159); + (0x0015A,0x0015A), `Abs (0x0015B); + (0x0015C,0x0015C), `Abs (0x0015D); + (0x0015E,0x0015E), `Abs (0x0015F); + (0x00160,0x00160), `Abs (0x00161); + (0x00162,0x00162), `Abs (0x00163); + (0x00164,0x00164), `Abs (0x00165); + (0x00166,0x00166), `Abs (0x00167); + (0x00168,0x00168), `Abs (0x00169); + (0x0016A,0x0016A), `Abs (0x0016B); + (0x0016C,0x0016C), `Abs (0x0016D); + (0x0016E,0x0016E), `Abs (0x0016F); + (0x00170,0x00170), `Abs (0x00171); + (0x00172,0x00172), `Abs (0x00173); + (0x00174,0x00174), `Abs (0x00175); + (0x00176,0x00176), `Abs (0x00177); + (0x00178,0x00178), `Abs (0x000FF); + (0x00179,0x00179), `Abs (0x0017A); + (0x0017B,0x0017B), `Abs (0x0017C); + (0x0017D,0x0017D), `Abs (0x0017E); + (0x00181,0x00181), `Abs (0x00253); + (0x00182,0x00182), `Abs (0x00183); + (0x00184,0x00184), `Abs (0x00185); + (0x00186,0x00186), `Abs (0x00254); + (0x00187,0x00187), `Abs (0x00188); + (0x00189,0x0018A), `Delta (205); + (0x0018B,0x0018B), `Abs (0x0018C); + (0x0018E,0x0018E), `Abs (0x001DD); + (0x0018F,0x0018F), `Abs (0x00259); + (0x00190,0x00190), `Abs (0x0025B); + (0x00191,0x00191), `Abs (0x00192); + (0x00193,0x00193), `Abs (0x00260); + (0x00194,0x00194), `Abs (0x00263); + (0x00196,0x00196), `Abs (0x00269); + (0x00197,0x00197), `Abs (0x00268); + (0x00198,0x00198), `Abs (0x00199); + (0x0019C,0x0019C), `Abs (0x0026F); + (0x0019D,0x0019D), `Abs (0x00272); + (0x0019F,0x0019F), `Abs (0x00275); + (0x001A0,0x001A0), `Abs (0x001A1); + (0x001A2,0x001A2), `Abs (0x001A3); + (0x001A4,0x001A4), `Abs (0x001A5); + (0x001A6,0x001A6), `Abs (0x00280); + (0x001A7,0x001A7), `Abs (0x001A8); + (0x001A9,0x001A9), `Abs (0x00283); + (0x001AC,0x001AC), `Abs (0x001AD); + (0x001AE,0x001AE), `Abs (0x00288); + (0x001AF,0x001AF), `Abs (0x001B0); + (0x001B1,0x001B2), `Delta (217); + (0x001B3,0x001B3), `Abs (0x001B4); + (0x001B5,0x001B5), `Abs (0x001B6); + (0x001B7,0x001B7), `Abs (0x00292); + (0x001B8,0x001B8), `Abs (0x001B9); + (0x001BC,0x001BC), `Abs (0x001BD); + (0x001C4,0x001C4), `Abs (0x001C6); + (0x001C7,0x001C7), `Abs (0x001C9); + (0x001CA,0x001CA), `Abs (0x001CC); + (0x001CD,0x001CD), `Abs (0x001CE); + (0x001CF,0x001CF), `Abs (0x001D0); + (0x001D1,0x001D1), `Abs (0x001D2); + (0x001D3,0x001D3), `Abs (0x001D4); + (0x001D5,0x001D5), `Abs (0x001D6); + (0x001D7,0x001D7), `Abs (0x001D8); + (0x001D9,0x001D9), `Abs (0x001DA); + (0x001DB,0x001DB), `Abs (0x001DC); + (0x001DE,0x001DE), `Abs (0x001DF); + (0x001E0,0x001E0), `Abs (0x001E1); + (0x001E2,0x001E2), `Abs (0x001E3); + (0x001E4,0x001E4), `Abs (0x001E5); + (0x001E6,0x001E6), `Abs (0x001E7); + (0x001E8,0x001E8), `Abs (0x001E9); + (0x001EA,0x001EA), `Abs (0x001EB); + (0x001EC,0x001EC), `Abs (0x001ED); + (0x001EE,0x001EE), `Abs (0x001EF); + (0x001F1,0x001F1), `Abs (0x001F3); + (0x001F4,0x001F4), `Abs (0x001F5); + (0x001F6,0x001F6), `Abs (0x00195); + (0x001F7,0x001F7), `Abs (0x001BF); + (0x001F8,0x001F8), `Abs (0x001F9); + (0x001FA,0x001FA), `Abs (0x001FB); + (0x001FC,0x001FC), `Abs (0x001FD); + (0x001FE,0x001FE), `Abs (0x001FF); + (0x00200,0x00200), `Abs (0x00201); + (0x00202,0x00202), `Abs (0x00203); + (0x00204,0x00204), `Abs (0x00205); + (0x00206,0x00206), `Abs (0x00207); + (0x00208,0x00208), `Abs (0x00209); + (0x0020A,0x0020A), `Abs (0x0020B); + (0x0020C,0x0020C), `Abs (0x0020D); + (0x0020E,0x0020E), `Abs (0x0020F); + (0x00210,0x00210), `Abs (0x00211); + (0x00212,0x00212), `Abs (0x00213); + (0x00214,0x00214), `Abs (0x00215); + (0x00216,0x00216), `Abs (0x00217); + (0x00218,0x00218), `Abs (0x00219); + (0x0021A,0x0021A), `Abs (0x0021B); + (0x0021C,0x0021C), `Abs (0x0021D); + (0x0021E,0x0021E), `Abs (0x0021F); + (0x00220,0x00220), `Abs (0x0019E); + (0x00222,0x00222), `Abs (0x00223); + (0x00224,0x00224), `Abs (0x00225); + (0x00226,0x00226), `Abs (0x00227); + (0x00228,0x00228), `Abs (0x00229); + (0x0022A,0x0022A), `Abs (0x0022B); + (0x0022C,0x0022C), `Abs (0x0022D); + (0x0022E,0x0022E), `Abs (0x0022F); + (0x00230,0x00230), `Abs (0x00231); + (0x00232,0x00232), `Abs (0x00233); + (0x00386,0x00386), `Abs (0x003AC); + (0x00388,0x0038A), `Delta (37); + (0x0038C,0x0038C), `Abs (0x003CC); + (0x0038E,0x0038F), `Delta (63); + (0x00391,0x003A1), `Delta (32); + (0x003A3,0x003AB), `Delta (32); + (0x003D8,0x003D8), `Abs (0x003D9); + (0x003DA,0x003DA), `Abs (0x003DB); + (0x003DC,0x003DC), `Abs (0x003DD); + (0x003DE,0x003DE), `Abs (0x003DF); + (0x003E0,0x003E0), `Abs (0x003E1); + (0x003E2,0x003E2), `Abs (0x003E3); + (0x003E4,0x003E4), `Abs (0x003E5); + (0x003E6,0x003E6), `Abs (0x003E7); + (0x003E8,0x003E8), `Abs (0x003E9); + (0x003EA,0x003EA), `Abs (0x003EB); + (0x003EC,0x003EC), `Abs (0x003ED); + (0x003EE,0x003EE), `Abs (0x003EF); + (0x003F4,0x003F4), `Abs (0x003B8); + (0x00400,0x0040F), `Delta (80); + (0x00410,0x0042F), `Delta (32); + (0x00460,0x00460), `Abs (0x00461); + (0x00462,0x00462), `Abs (0x00463); + (0x00464,0x00464), `Abs (0x00465); + (0x00466,0x00466), `Abs (0x00467); + (0x00468,0x00468), `Abs (0x00469); + (0x0046A,0x0046A), `Abs (0x0046B); + (0x0046C,0x0046C), `Abs (0x0046D); + (0x0046E,0x0046E), `Abs (0x0046F); + (0x00470,0x00470), `Abs (0x00471); + (0x00472,0x00472), `Abs (0x00473); + (0x00474,0x00474), `Abs (0x00475); + (0x00476,0x00476), `Abs (0x00477); + (0x00478,0x00478), `Abs (0x00479); + (0x0047A,0x0047A), `Abs (0x0047B); + (0x0047C,0x0047C), `Abs (0x0047D); + (0x0047E,0x0047E), `Abs (0x0047F); + (0x00480,0x00480), `Abs (0x00481); + (0x0048A,0x0048A), `Abs (0x0048B); + (0x0048C,0x0048C), `Abs (0x0048D); + (0x0048E,0x0048E), `Abs (0x0048F); + (0x00490,0x00490), `Abs (0x00491); + (0x00492,0x00492), `Abs (0x00493); + (0x00494,0x00494), `Abs (0x00495); + (0x00496,0x00496), `Abs (0x00497); + (0x00498,0x00498), `Abs (0x00499); + (0x0049A,0x0049A), `Abs (0x0049B); + (0x0049C,0x0049C), `Abs (0x0049D); + (0x0049E,0x0049E), `Abs (0x0049F); + (0x004A0,0x004A0), `Abs (0x004A1); + (0x004A2,0x004A2), `Abs (0x004A3); + (0x004A4,0x004A4), `Abs (0x004A5); + (0x004A6,0x004A6), `Abs (0x004A7); + (0x004A8,0x004A8), `Abs (0x004A9); + (0x004AA,0x004AA), `Abs (0x004AB); + (0x004AC,0x004AC), `Abs (0x004AD); + (0x004AE,0x004AE), `Abs (0x004AF); + (0x004B0,0x004B0), `Abs (0x004B1); + (0x004B2,0x004B2), `Abs (0x004B3); + (0x004B4,0x004B4), `Abs (0x004B5); + (0x004B6,0x004B6), `Abs (0x004B7); + (0x004B8,0x004B8), `Abs (0x004B9); + (0x004BA,0x004BA), `Abs (0x004BB); + (0x004BC,0x004BC), `Abs (0x004BD); + (0x004BE,0x004BE), `Abs (0x004BF); + (0x004C1,0x004C1), `Abs (0x004C2); + (0x004C3,0x004C3), `Abs (0x004C4); + (0x004C5,0x004C5), `Abs (0x004C6); + (0x004C7,0x004C7), `Abs (0x004C8); + (0x004C9,0x004C9), `Abs (0x004CA); + (0x004CB,0x004CB), `Abs (0x004CC); + (0x004CD,0x004CD), `Abs (0x004CE); + (0x004D0,0x004D0), `Abs (0x004D1); + (0x004D2,0x004D2), `Abs (0x004D3); + (0x004D4,0x004D4), `Abs (0x004D5); + (0x004D6,0x004D6), `Abs (0x004D7); + (0x004D8,0x004D8), `Abs (0x004D9); + (0x004DA,0x004DA), `Abs (0x004DB); + (0x004DC,0x004DC), `Abs (0x004DD); + (0x004DE,0x004DE), `Abs (0x004DF); + (0x004E0,0x004E0), `Abs (0x004E1); + (0x004E2,0x004E2), `Abs (0x004E3); + (0x004E4,0x004E4), `Abs (0x004E5); + (0x004E6,0x004E6), `Abs (0x004E7); + (0x004E8,0x004E8), `Abs (0x004E9); + (0x004EA,0x004EA), `Abs (0x004EB); + (0x004EC,0x004EC), `Abs (0x004ED); + (0x004EE,0x004EE), `Abs (0x004EF); + (0x004F0,0x004F0), `Abs (0x004F1); + (0x004F2,0x004F2), `Abs (0x004F3); + (0x004F4,0x004F4), `Abs (0x004F5); + (0x004F8,0x004F8), `Abs (0x004F9); + (0x00500,0x00500), `Abs (0x00501); + (0x00502,0x00502), `Abs (0x00503); + (0x00504,0x00504), `Abs (0x00505); + (0x00506,0x00506), `Abs (0x00507); + (0x00508,0x00508), `Abs (0x00509); + (0x0050A,0x0050A), `Abs (0x0050B); + (0x0050C,0x0050C), `Abs (0x0050D); + (0x0050E,0x0050E), `Abs (0x0050F); + (0x00531,0x00556), `Delta (48); + (0x01E00,0x01E00), `Abs (0x01E01); + (0x01E02,0x01E02), `Abs (0x01E03); + (0x01E04,0x01E04), `Abs (0x01E05); + (0x01E06,0x01E06), `Abs (0x01E07); + (0x01E08,0x01E08), `Abs (0x01E09); + (0x01E0A,0x01E0A), `Abs (0x01E0B); + (0x01E0C,0x01E0C), `Abs (0x01E0D); + (0x01E0E,0x01E0E), `Abs (0x01E0F); + (0x01E10,0x01E10), `Abs (0x01E11); + (0x01E12,0x01E12), `Abs (0x01E13); + (0x01E14,0x01E14), `Abs (0x01E15); + (0x01E16,0x01E16), `Abs (0x01E17); + (0x01E18,0x01E18), `Abs (0x01E19); + (0x01E1A,0x01E1A), `Abs (0x01E1B); + (0x01E1C,0x01E1C), `Abs (0x01E1D); + (0x01E1E,0x01E1E), `Abs (0x01E1F); + (0x01E20,0x01E20), `Abs (0x01E21); + (0x01E22,0x01E22), `Abs (0x01E23); + (0x01E24,0x01E24), `Abs (0x01E25); + (0x01E26,0x01E26), `Abs (0x01E27); + (0x01E28,0x01E28), `Abs (0x01E29); + (0x01E2A,0x01E2A), `Abs (0x01E2B); + (0x01E2C,0x01E2C), `Abs (0x01E2D); + (0x01E2E,0x01E2E), `Abs (0x01E2F); + (0x01E30,0x01E30), `Abs (0x01E31); + (0x01E32,0x01E32), `Abs (0x01E33); + (0x01E34,0x01E34), `Abs (0x01E35); + (0x01E36,0x01E36), `Abs (0x01E37); + (0x01E38,0x01E38), `Abs (0x01E39); + (0x01E3A,0x01E3A), `Abs (0x01E3B); + (0x01E3C,0x01E3C), `Abs (0x01E3D); + (0x01E3E,0x01E3E), `Abs (0x01E3F); + (0x01E40,0x01E40), `Abs (0x01E41); + (0x01E42,0x01E42), `Abs (0x01E43); + (0x01E44,0x01E44), `Abs (0x01E45); + (0x01E46,0x01E46), `Abs (0x01E47); + (0x01E48,0x01E48), `Abs (0x01E49); + (0x01E4A,0x01E4A), `Abs (0x01E4B); + (0x01E4C,0x01E4C), `Abs (0x01E4D); + (0x01E4E,0x01E4E), `Abs (0x01E4F); + (0x01E50,0x01E50), `Abs (0x01E51); + (0x01E52,0x01E52), `Abs (0x01E53); + (0x01E54,0x01E54), `Abs (0x01E55); + (0x01E56,0x01E56), `Abs (0x01E57); + (0x01E58,0x01E58), `Abs (0x01E59); + (0x01E5A,0x01E5A), `Abs (0x01E5B); + (0x01E5C,0x01E5C), `Abs (0x01E5D); + (0x01E5E,0x01E5E), `Abs (0x01E5F); + (0x01E60,0x01E60), `Abs (0x01E61); + (0x01E62,0x01E62), `Abs (0x01E63); + (0x01E64,0x01E64), `Abs (0x01E65); + (0x01E66,0x01E66), `Abs (0x01E67); + (0x01E68,0x01E68), `Abs (0x01E69); + (0x01E6A,0x01E6A), `Abs (0x01E6B); + (0x01E6C,0x01E6C), `Abs (0x01E6D); + (0x01E6E,0x01E6E), `Abs (0x01E6F); + (0x01E70,0x01E70), `Abs (0x01E71); + (0x01E72,0x01E72), `Abs (0x01E73); + (0x01E74,0x01E74), `Abs (0x01E75); + (0x01E76,0x01E76), `Abs (0x01E77); + (0x01E78,0x01E78), `Abs (0x01E79); + (0x01E7A,0x01E7A), `Abs (0x01E7B); + (0x01E7C,0x01E7C), `Abs (0x01E7D); + (0x01E7E,0x01E7E), `Abs (0x01E7F); + (0x01E80,0x01E80), `Abs (0x01E81); + (0x01E82,0x01E82), `Abs (0x01E83); + (0x01E84,0x01E84), `Abs (0x01E85); + (0x01E86,0x01E86), `Abs (0x01E87); + (0x01E88,0x01E88), `Abs (0x01E89); + (0x01E8A,0x01E8A), `Abs (0x01E8B); + (0x01E8C,0x01E8C), `Abs (0x01E8D); + (0x01E8E,0x01E8E), `Abs (0x01E8F); + (0x01E90,0x01E90), `Abs (0x01E91); + (0x01E92,0x01E92), `Abs (0x01E93); + (0x01E94,0x01E94), `Abs (0x01E95); + (0x01EA0,0x01EA0), `Abs (0x01EA1); + (0x01EA2,0x01EA2), `Abs (0x01EA3); + (0x01EA4,0x01EA4), `Abs (0x01EA5); + (0x01EA6,0x01EA6), `Abs (0x01EA7); + (0x01EA8,0x01EA8), `Abs (0x01EA9); + (0x01EAA,0x01EAA), `Abs (0x01EAB); + (0x01EAC,0x01EAC), `Abs (0x01EAD); + (0x01EAE,0x01EAE), `Abs (0x01EAF); + (0x01EB0,0x01EB0), `Abs (0x01EB1); + (0x01EB2,0x01EB2), `Abs (0x01EB3); + (0x01EB4,0x01EB4), `Abs (0x01EB5); + (0x01EB6,0x01EB6), `Abs (0x01EB7); + (0x01EB8,0x01EB8), `Abs (0x01EB9); + (0x01EBA,0x01EBA), `Abs (0x01EBB); + (0x01EBC,0x01EBC), `Abs (0x01EBD); + (0x01EBE,0x01EBE), `Abs (0x01EBF); + (0x01EC0,0x01EC0), `Abs (0x01EC1); + (0x01EC2,0x01EC2), `Abs (0x01EC3); + (0x01EC4,0x01EC4), `Abs (0x01EC5); + (0x01EC6,0x01EC6), `Abs (0x01EC7); + (0x01EC8,0x01EC8), `Abs (0x01EC9); + (0x01ECA,0x01ECA), `Abs (0x01ECB); + (0x01ECC,0x01ECC), `Abs (0x01ECD); + (0x01ECE,0x01ECE), `Abs (0x01ECF); + (0x01ED0,0x01ED0), `Abs (0x01ED1); + (0x01ED2,0x01ED2), `Abs (0x01ED3); + (0x01ED4,0x01ED4), `Abs (0x01ED5); + (0x01ED6,0x01ED6), `Abs (0x01ED7); + (0x01ED8,0x01ED8), `Abs (0x01ED9); + (0x01EDA,0x01EDA), `Abs (0x01EDB); + (0x01EDC,0x01EDC), `Abs (0x01EDD); + (0x01EDE,0x01EDE), `Abs (0x01EDF); + (0x01EE0,0x01EE0), `Abs (0x01EE1); + (0x01EE2,0x01EE2), `Abs (0x01EE3); + (0x01EE4,0x01EE4), `Abs (0x01EE5); + (0x01EE6,0x01EE6), `Abs (0x01EE7); + (0x01EE8,0x01EE8), `Abs (0x01EE9); + (0x01EEA,0x01EEA), `Abs (0x01EEB); + (0x01EEC,0x01EEC), `Abs (0x01EED); + (0x01EEE,0x01EEE), `Abs (0x01EEF); + (0x01EF0,0x01EF0), `Abs (0x01EF1); + (0x01EF2,0x01EF2), `Abs (0x01EF3); + (0x01EF4,0x01EF4), `Abs (0x01EF5); + (0x01EF6,0x01EF6), `Abs (0x01EF7); + (0x01EF8,0x01EF8), `Abs (0x01EF9); + (0x01F08,0x01F0F), `Delta (-8); + (0x01F18,0x01F1D), `Delta (-8); + (0x01F28,0x01F2F), `Delta (-8); + (0x01F38,0x01F3F), `Delta (-8); + (0x01F48,0x01F4D), `Delta (-8); + (0x01F59,0x01F59), `Abs (0x01F51); + (0x01F5B,0x01F5B), `Abs (0x01F53); + (0x01F5D,0x01F5D), `Abs (0x01F55); + (0x01F5F,0x01F5F), `Abs (0x01F57); + (0x01F68,0x01F6F), `Delta (-8); + (0x01FB8,0x01FB9), `Delta (-8); + (0x01FBA,0x01FBB), `Delta (-74); + (0x01FC8,0x01FCB), `Delta (-86); + (0x01FD8,0x01FD9), `Delta (-8); + (0x01FDA,0x01FDB), `Delta (-100); + (0x01FE8,0x01FE9), `Delta (-8); + (0x01FEA,0x01FEB), `Delta (-112); + (0x01FEC,0x01FEC), `Abs (0x01FE5); + (0x01FF8,0x01FF9), `Delta (-128); + (0x01FFA,0x01FFB), `Delta (-126); + (0x02126,0x02126), `Abs (0x003C9); + (0x0212A,0x0212A), `Abs (0x0006B); + (0x0212B,0x0212B), `Abs (0x000E5); + (0x0FF21,0x0FF3A), `Delta (32); + (0x10400,0x10425), `Delta (40); + (0x001C5,0x001C5), `Abs (0x001C6); + (0x001C8,0x001C8), `Abs (0x001C9); + (0x001CB,0x001CB), `Abs (0x001CC); + (0x001F2,0x001F2), `Abs (0x001F3); + (0x01F88,0x01F8F), `Delta (-8); + (0x01F98,0x01F9F), `Delta (-8); + (0x01FA8,0x01FAF), `Delta (-8); + (0x01FBC,0x01FBC), `Abs (0x01FB3); + (0x01FCC,0x01FCC), `Abs (0x01FC3); + (0x01FFC,0x01FFC), `Abs (0x01FF3); + (0x02160,0x0216F), `Delta (16) +] + diff --git a/lib/util.ml b/lib/util.ml index 69a1e9f8bb..b8ceea3ea0 100644 --- a/lib/util.ml +++ b/lib/util.ml @@ -347,120 +347,13 @@ let check_ident_gen handle s = let check_ident_soft = check_ident_gen warning let check_ident = check_ident_gen error -let lowercase_unicode s unicode = - match unicode land 0x1F000 with - | 0x0 -> - begin match unicode with - (* utf-8 Basic Latin underscore *) - | x when x = 0x005F -> x - (* utf-8 Basic Latin letters *) - | x when 0x0041 <= x & x <= 0x005A -> x + 32 - | x when 0x0061 <= x & x <= 0x007A -> x - (* utf-8 Latin-1 non breaking space U00A0 *) - | 0x00A0 as x -> x - (* utf-8 Latin-1 letters U00C0-00D6 *) - | x when 0x00C0 <= x & x <= 0x00D6 -> x + 32 - (* utf-8 Latin-1 letters U00D8-00F6 *) - | x when 0x00D8 <= x & x <= 0x00DE -> x + 32 - | x when 0x00E0 <= x & x <= 0x00F6 -> x - (* utf-8 Latin-1 letters U00F8-00FF *) - | x when 0x00F8 <= x & x <= 0x00FF -> x - (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *) - | x when 0x0100 <= x & x <= 0x017F -> - if x mod 2 = 1 then x else x + 1 - | x when 0x0180 <= x & x <= 0x0241 -> - warning ("Unable to decide which lowercase letter to map to "^s); x - (* utf-8 Phonetic letters U0250-02AF *) - | x when 0x0250 <= x & x <= 0x02AF -> x - (* utf-8 what do to with diacritics U0300-U036F ? *) - (* utf-8 Greek letters U0380-03FF *) - | x when 0x0380 <= x & x <= 0x0385 -> x - | 0x0386 -> 0x03AC - | x when 0x0388 <= x & x <= 0x038A -> x + 37 - | 0x038C -> 0x03CC - | x when 0x038E <= x & x <= 0x038F -> x + 63 - | x when 0x0390 <= x & x <= 0x03AB & x <> 0x03A2 -> x + 32 - (* utf-8 Greek lowercase letters U03B0-03CE *) - | x when 0x03AC <= x & x <= 0x03CE -> x - | x when 0x03CF <= x & x <= 0x03FF -> - warning ("Unable to decide which lowercase letter to map to "^s); x - (* utf-8 Cyrillic letters U0400-0481 *) - | x when 0x0400 <= x & x <= 0x040F -> x + 80 - | x when 0x0410 <= x & x <= 0x042F -> x + 32 - | x when 0x0430 <= x & x <= 0x045F -> x - | x when 0x0460 <= x & x <= 0x0481 -> - if x mod 2 = 1 then x else x + 1 - (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *) - | x when 0x048A <= x & x <= 0x04F9 & x <> 0x04CF -> - if x mod 2 = 1 then x else x + 1 - (* utf-8 Cyrillic supplement letters U0500-U050F *) - | x when 0x0500 <= x & x <= 0x050F -> - if x mod 2 = 1 then x else x + 1 - (* utf-8 Hebrew letters U05D0-05EA *) - | x when 0x05D0 <= x & x <= 0x05EA -> x - (* utf-8 Arabic letters U0621-064A *) - | x when 0x0621 <= x & x <= 0x064A -> x - (* utf-8 Arabic supplement letters U0750-076D *) - | x when 0x0750 <= x & x <= 0x076D -> x - | _ -> raise UnsupportedUtf8 - end - | 0x1000 -> - begin match unicode with - (* utf-8 Georgian U10A0-10FF (has holes) *) - | x when 0x10A0 <= x & x <= 0x10FF -> x - (* utf-8 Hangul Jamo U1100-11FF (has holes) *) - | x when 0x1100 <= x & x <= 0x11FF -> x - (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *) - | x when 0x1E00 <= x & x <= 0x1E95 -> - if x mod 2 = 1 then x else x + 1 - | x when 0x1E96 <= x & x <= 0x1E9B -> x - | x when 0x1EA0 <= x & x <= 0x1EF9 -> - if x mod 2 = 1 then x else x + 1 - | _ -> raise UnsupportedUtf8 - end - | 0x2000 -> - begin match unicode with - (* utf-8 general punctuation U2080-2089 *) - (* Hyphens *) - | x when 0x2010 <= x & x <= 0x2011 -> x - (* utf-8 letter-like U2100-214F *) - | 0x2102 (* double-struck C *) -> Char.code 'x' - | 0x2115 (* double-struck N *) -> Char.code 'n' - | 0x2119 (* double-struck P *) -> Char.code 'x' - | 0x211A (* double-struck Q *) -> Char.code 'x' - | 0x211D (* double-struck R *) -> Char.code 'r' - | 0x2124 (* double-struck Z *) -> Char.code 'x' - | x when 0x2100 <= x & x <= 0x214F -> - warning ("Unable to decide which lowercase letter to map to "^s); x - | _ -> raise UnsupportedUtf8 - end - | _ -> - begin match unicode with - (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *) - | x when 0x3040 <= x & x <= 0x30FF -> x - (* utf-8 Unified CJK Ideographs U4E00-9FA5 *) - | x when 0x4E00 <= x & x <= 0x9FA5 -> x - (* utf-8 Hangul syllables UAC00-D7AF *) - | x when 0xAC00 <= x & x <= 0xD7AF -> x - (* utf-8 Gothic U10330-1034A *) - | x when 0x10330 <= x & x <= 0x1034A -> x - (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *) - | x when 0x1D6A8 <= x & x <= 0x1D7C9 -> - let a = (x - 0x1D6A8) mod 58 in - if a <= 16 or (18 <= a & a <= 24) - then x + 26 (* all but nabla and theta symbol *) - else x - | x when 0x1D538 <= x & x <= 0x1D56B -> - (* Use ordinary lowercase in both small and capital double-struck *) - (x - 0x1D538) mod 26 + Char.code 'a' - | x when 0x1D468 <= x & x <= 0x1D6A3 -> (* General case *) - if (x - 0x1D400 / 26) mod 2 = 0 then x + 26 else x - | x when 0x1D400 <= x & x <= 0x1D7CB -> (* fallback *) - x - (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *) - | x when 0x1D7CE <= x & x <= 0x1D7FF -> x - | _ -> raise UnsupportedUtf8 - end +let lowercase_unicode s unicode = + let tree = Segmenttree.make Unicodetable.to_lower in + try + match Segmenttree.lookup unicode tree with + | `Abs c -> c + | `Delta d -> unicode + d + with Not_found -> unicode let lowercase_first_char_utf8 s = assert (s <> ""); diff --git a/parsing/grammar.mllib b/parsing/grammar.mllib index 7c3ec11374..248a8ad9ad 100644 --- a/parsing/grammar.mllib +++ b/parsing/grammar.mllib @@ -5,6 +5,7 @@ Pp_control Pp Compat Flags +Segmenttree Unicodetable Util Bigint |
