diff options
| author | herbelin | 2008-06-06 09:29:20 +0000 |
|---|---|---|
| committer | herbelin | 2008-06-06 09:29:20 +0000 |
| commit | 64c9874e12872a08bfcce470b5e03813c5cff586 (patch) | |
| tree | 42a6e0060354c7f9327fd8349e0c1c8394ae7ccf /lib | |
| parent | 9953deaa45c642301a6cd7202b486c45923dece8 (diff) | |
Correction terminologie et ajout plage unicode 1D400-1D7FF (mathematical
alphanumerical symbols) suite à remarques de Arnaud.
Correction bugs d'affichage de l'option -translate.
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@11059 85f007b7-540e-0410-9357-904b9bb8a0f7
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/util.ml | 133 | ||||
| -rw-r--r-- | lib/util.mli | 4 |
2 files changed, 78 insertions, 59 deletions
diff --git a/lib/util.ml b/lib/util.ml index 26be66cb9e..5a59058596 100644 --- a/lib/util.ml +++ b/lib/util.ml @@ -119,129 +119,133 @@ module Stringset = Set.Make(struct type t = string let compare = compare end) module Stringmap = Map.Make(struct type t = string let compare = compare end) -type utf8_status = Utf8Letter | Utf8IdentPart | Utf8Symbol +type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol exception UnsupportedUtf8 -let classify_utf8 unicode = +let classify_unicode unicode = match unicode land 0x1F000 with | 0x0 -> begin match unicode with (* utf-8 Basic Latin underscore *) - | x when x = 0x005F -> Utf8Letter + | x when x = 0x005F -> UnicodeLetter (* utf-8 Basic Latin letters *) - | x when 0x0041 <= x & x <= 0x005A -> Utf8Letter - | x when 0x0061 <= x & x <= 0x007A -> Utf8Letter + | x when 0x0041 <= x & x <= 0x005A -> UnicodeLetter + | x when 0x0061 <= x & x <= 0x007A -> UnicodeLetter (* utf-8 Basic Latin digits and quote *) - | x when 0x0030 <= x & x <= 0x0039 or x = 0x0027 -> Utf8IdentPart + | x when 0x0030 <= x & x <= 0x0039 or x = 0x0027 -> UnicodeIdentPart (* utf-8 Basic Latin symbols *) - | x when x <= 0x007F -> Utf8Symbol + | x when x <= 0x007F -> UnicodeSymbol (* utf-8 Latin-1 non breaking space U00A0 *) - | 0x00A0 -> Utf8Letter + | 0x00A0 -> UnicodeLetter (* utf-8 Latin-1 symbols U00A1-00BF *) - | x when 0x00A0 <= x & x <= 0x00BF -> Utf8Symbol + | x when 0x00A0 <= x & x <= 0x00BF -> UnicodeSymbol (* utf-8 Latin-1 letters U00C0-00D6 *) - | x when 0x00C0 <= x & x <= 0x00D6 -> Utf8Letter + | x when 0x00C0 <= x & x <= 0x00D6 -> UnicodeLetter (* utf-8 Latin-1 symbol U00D7 *) - | 0x00D7 -> Utf8Symbol + | 0x00D7 -> UnicodeSymbol (* utf-8 Latin-1 letters U00D8-00F6 *) - | x when 0x00D8 <= x & x <= 0x00F6 -> Utf8Letter + | x when 0x00D8 <= x & x <= 0x00F6 -> UnicodeLetter (* utf-8 Latin-1 symbol U00F7 *) - | 0x00F7 -> Utf8Symbol + | 0x00F7 -> UnicodeSymbol (* utf-8 Latin-1 letters U00F8-00FF *) - | x when 0x00F8 <= x & x <= 0x00FF -> Utf8Letter + | x when 0x00F8 <= x & x <= 0x00FF -> UnicodeLetter (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *) - | x when 0x0100 <= x & x <= 0x0241 -> Utf8Letter + | x when 0x0100 <= x & x <= 0x0241 -> UnicodeLetter (* utf-8 Phonetic letters U0250-02AF *) - | x when 0x0250 <= x & x <= 0x02AF -> Utf8Letter + | x when 0x0250 <= x & x <= 0x02AF -> UnicodeLetter (* utf-8 what do to with diacritics U0300-U036F ? *) (* utf-8 Greek letters U0380-03FF *) - | x when 0x0380 <= x & x <= 0x03FF -> Utf8Letter + | x when 0x0380 <= x & x <= 0x03FF -> UnicodeLetter (* utf-8 Cyrillic letters U0400-0481 *) - | x when 0x0400 <= x & x <= 0x0481 -> Utf8Letter + | x when 0x0400 <= x & x <= 0x0481 -> UnicodeLetter (* utf-8 Cyrillic symbol U0482 *) - | 0x0482 -> Utf8Symbol + | 0x0482 -> UnicodeSymbol (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *) (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *) - | x when 0x048A <= x & x <= 0x04F9 -> Utf8Letter + | x when 0x048A <= x & x <= 0x04F9 -> UnicodeLetter (* utf-8 Cyrillic supplement letters U0500-U050F *) - | x when 0x0500 <= x & x <= 0x050F -> Utf8Letter + | x when 0x0500 <= x & x <= 0x050F -> UnicodeLetter (* utf-8 Hebrew letters U05D0-05EA *) - | x when 0x05D0 <= x & x <= 0x05EA -> Utf8Letter + | x when 0x05D0 <= x & x <= 0x05EA -> UnicodeLetter (* utf-8 Arabic letters U0621-064A *) - | x when 0x0621 <= x & x <= 0x064A -> Utf8Letter + | x when 0x0621 <= x & x <= 0x064A -> UnicodeLetter (* utf-8 Arabic supplement letters U0750-076D *) - | x when 0x0750 <= x & x <= 0x076D -> Utf8Letter + | x when 0x0750 <= x & x <= 0x076D -> UnicodeLetter | _ -> raise UnsupportedUtf8 end | 0x1000 -> begin match unicode with (* utf-8 Georgian U10A0-10FF (has holes) *) - | x when 0x10A0 <= x & x <= 0x10FF -> Utf8Letter + | x when 0x10A0 <= x & x <= 0x10FF -> UnicodeLetter (* utf-8 Hangul Jamo U1100-11FF (has holes) *) - | x when 0x1100 <= x & x <= 0x11FF -> Utf8Letter + | x when 0x1100 <= x & x <= 0x11FF -> UnicodeLetter (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *) - | x when 0x1E00 <= x & x <= 0x1E9B -> Utf8Letter - | x when 0x1EA0 <= x & x <= 0x1EF9 -> Utf8Letter + | x when 0x1E00 <= x & x <= 0x1E9B -> UnicodeLetter + | x when 0x1EA0 <= x & x <= 0x1EF9 -> UnicodeLetter | _ -> raise UnsupportedUtf8 end | 0x2000 -> begin match unicode with (* utf-8 general punctuation U2080-2089 *) (* Hyphens *) - | x when 0x2010 <= x & x <= 0x2011 -> Utf8Letter + | x when 0x2010 <= x & x <= 0x2011 -> UnicodeLetter (* Dashes and other symbols *) - | x when 0x2012 <= x & x <= 0x2027 -> Utf8Symbol + | x when 0x2012 <= x & x <= 0x2027 -> UnicodeSymbol (* Per mille and per ten thousand signs *) - | x when 0x2030 <= x & x <= 0x2031 -> Utf8Symbol + | x when 0x2030 <= x & x <= 0x2031 -> UnicodeSymbol (* Prime letters *) - | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> Utf8IdentPart + | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> UnicodeIdentPart (* Miscellaneous punctuation *) - | x when 0x2039 <= x & x <= 0x2056 -> Utf8Symbol - | x when 0x2058 <= x & x <= 0x205E -> Utf8Symbol + | x when 0x2039 <= x & x <= 0x2056 -> UnicodeSymbol + | x when 0x2058 <= x & x <= 0x205E -> UnicodeSymbol (* Invisible mathematical operators *) - | x when 0x2061 <= x & x <= 0x2063 -> Utf8Symbol + | x when 0x2061 <= x & x <= 0x2063 -> UnicodeSymbol (* utf-8 superscript U2070-207C *) - | x when 0x2070 <= x & x <= 0x207C -> Utf8Symbol + | x when 0x2070 <= x & x <= 0x207C -> UnicodeSymbol (* utf-8 subscript U2080-2089 *) - | x when 0x2080 <= x & x <= 0x2089 -> Utf8IdentPart + | x when 0x2080 <= x & x <= 0x2089 -> UnicodeIdentPart (* utf-8 letter-like U2100-214F *) - | x when 0x2100 <= x & x <= 0x214F -> Utf8Letter + | x when 0x2100 <= x & x <= 0x214F -> UnicodeLetter (* utf-8 number-forms U2153-2183 *) - | x when 0x2153 <= x & x <= 0x2183 -> Utf8Symbol + | x when 0x2153 <= x & x <= 0x2183 -> UnicodeSymbol (* utf-8 arrows A U2190-21FF *) (* utf-8 mathematical operators U2200-22FF *) (* utf-8 miscellaneous technical U2300-23FF *) - | x when 0x2190 <= x & x <= 0x23FF -> Utf8Symbol + | x when 0x2190 <= x & x <= 0x23FF -> UnicodeSymbol (* utf-8 box drawing U2500-257F has ceiling, etc. *) (* utf-8 block elements U2580-259F *) (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *) (* utf-8 miscellaneous symbols U2600-26FF *) - | x when 0x2500 <= x & x <= 0x26FF -> Utf8Symbol + | x when 0x2500 <= x & x <= 0x26FF -> UnicodeSymbol (* utf-8 arrows B U2900-297F *) - | x when 0x2900 <= x & x <= 0x297F -> Utf8Symbol + | x when 0x2900 <= x & x <= 0x297F -> UnicodeSymbol (* utf-8 mathematical operators U2A00-2AFF *) - | x when 0x2A00 <= x & x <= 0x2AFF -> Utf8Symbol + | x when 0x2A00 <= x & x <= 0x2AFF -> UnicodeSymbol (* utf-8 bold symbols U2768-U2775 *) - | x when 0x2768 <= x & x <= 0x2775 -> Utf8Symbol + | x when 0x2768 <= x & x <= 0x2775 -> UnicodeSymbol (* utf-8 arrows and brackets U27E0-U27FF *) - | x when 0x27E0 <= x & x <= 0x27FF -> Utf8Symbol + | x when 0x27E0 <= x & x <= 0x27FF -> UnicodeSymbol (* utf-8 brackets, braces and parentheses *) - | x when 0x2980 <= x & x <= 0x299F -> Utf8Symbol + | x when 0x2980 <= x & x <= 0x299F -> UnicodeSymbol (* utf-8 miscellaneous including double-plus U29F0-U29FF *) - | x when 0x29F0 <= x & x <= 0x29FF -> Utf8Symbol + | x when 0x29F0 <= x & x <= 0x29FF -> UnicodeSymbol | _ -> raise UnsupportedUtf8 end | _ -> begin match unicode with (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *) - | x when 0x3040 <= x & x <= 0x30FF -> Utf8Letter + | x when 0x3040 <= x & x <= 0x30FF -> UnicodeLetter (* utf-8 Unified CJK Ideographs U4E00-9FA5 *) - | x when 0x4E00 <= x & x <= 0x9FA5 -> Utf8Letter + | x when 0x4E00 <= x & x <= 0x9FA5 -> UnicodeLetter (* utf-8 Hangul syllables UAC00-D7AF *) - | x when 0xAC00 <= x & x <= 0xD7AF -> Utf8Letter + | x when 0xAC00 <= x & x <= 0xD7AF -> UnicodeLetter (* utf-8 Gothic U10330-1034A *) - | x when 0x10330 <= x & x <= 0x1034A -> Utf8Letter + | x when 0x10330 <= x & x <= 0x1034A -> UnicodeLetter + (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *) + | x when 0x1D400 <= x & x <= 0x1D7CB -> UnicodeLetter + (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *) + | x when 0x1D7CE <= x & x <= 0x1D7FF -> UnicodeIdentPart | _ -> raise UnsupportedUtf8 end @@ -299,14 +303,14 @@ let check_ident s = let i = ref 0 in if s <> ".." then try let j, n = next_utf8 s 0 in - match classify_utf8 n with - | Utf8Letter -> + match classify_unicode n with + | UnicodeLetter -> i := !i + j; begin try while true do let j, n = next_utf8 s !i in - match classify_utf8 n with - | Utf8Letter | Utf8IdentPart -> i := !i + j + match classify_unicode n with + | UnicodeLetter | UnicodeIdentPart -> i := !i + j | _ -> error ("invalid character "^(String.sub s !i j)^" in identifier "^s) done @@ -317,7 +321,7 @@ let check_ident s = | UnsupportedUtf8 -> error (s^": unsupported character in utf8 sequence") | Invalid_argument _ -> error (s^": invalid utf8 sequence") -let lowercase_utf8 s unicode = +let lowercase_unicode s unicode = match unicode land 0x1F000 with | 0x0 -> begin match unicode with @@ -414,13 +418,28 @@ let lowercase_utf8 s unicode = | x when 0xAC00 <= x & x <= 0xD7AF -> x (* utf-8 Gothic U10330-1034A *) | x when 0x10330 <= x & x <= 0x1034A -> x + (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *) + | x when 0x1D6A8 <= x & x <= 0x1D7C9 -> + let a = (x - 0x1D6A8) mod 58 in + if a <= 16 or (18 <= a & a <= 24) + then x + 26 (* all but nabla and theta symbol *) + else x + | x when 0x1D538 <= x & x <= 0x1D56B -> + (* Use ordinary lowercase in both small and capital double-struck *) + (x - 0x1D538) mod 26 + Char.code 'a' + | x when 0x1D468 <= x & x <= 0x1D6A3 -> (* General case *) + if (x - 0x1D400 / 26) mod 2 = 0 then x + 26 else x + | x when 0x1D400 <= x & x <= 0x1D7CB -> (* fallback *) + x + (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *) + | x when 0x1D7CE <= x & x <= 0x1D7FF -> x | _ -> raise UnsupportedUtf8 end let lowercase_first_char_utf8 s = assert (s <> ""); let j, n = next_utf8 s 0 in - utf8_of_unicode (lowercase_utf8 (String.sub s 0 j) n) + utf8_of_unicode (lowercase_unicode (String.sub s 0 j) n) (* Lists *) diff --git a/lib/util.mli b/lib/util.mli index 0cbd2fa0e6..e715feca3f 100644 --- a/lib/util.mli +++ b/lib/util.mli @@ -78,11 +78,11 @@ val parse_loadpath : string -> string list module Stringset : Set.S with type elt = string module Stringmap : Map.S with type key = string -type utf8_status = Utf8Letter | Utf8IdentPart | Utf8Symbol +type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol exception UnsupportedUtf8 -val classify_utf8 : int -> utf8_status +val classify_unicode : int -> utf8_status val check_ident : string -> unit val lowercase_first_char_utf8 : string -> string |
