Correction terminologie et ajout plage unicode 1D400-1D7FF (mathematical

alphanumerical symbols) suite à remarques de Arnaud. Correction bugs d'affichage de l'option -translate. git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@11059 85f007b7-540e-0410-9357-904b9bb8a0f7
author: herbelin 2008-06-06 09:29:20 +0000
committer: herbelin 2008-06-06 09:29:20 +0000
commit: 64c9874e12872a08bfcce470b5e03813c5cff586 (patch)
tree: 42a6e0060354c7f9327fd8349e0c1c8394ae7ccf /lib
parent: 9953deaa45c642301a6cd7202b486c45923dece8 (diff)
2 files changed, 78 insertions, 59 deletions
diff --git a/lib/util.ml b/lib/util.ml
index 26be66cb9e..5a59058596 100644
--- a/lib/util.ml
+++ b/lib/util.ml
@@ -119,129 +119,133 @@ module Stringset = Set.Make(struct type t = string let compare = compare end)
 
 module Stringmap = Map.Make(struct type t = string let compare = compare end)
 
-type utf8_status = Utf8Letter | Utf8IdentPart | Utf8Symbol
+type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol
 
 exception UnsupportedUtf8
 
-let classify_utf8 unicode =
+let classify_unicode unicode =
     match unicode land 0x1F000 with
     | 0x0 ->
     begin match unicode with
     (* utf-8 Basic Latin underscore *)
-    | x when x = 0x005F -> Utf8Letter
+    | x when x = 0x005F -> UnicodeLetter
     (* utf-8 Basic Latin letters *)
-    | x when 0x0041 <= x & x <= 0x005A -> Utf8Letter
-    | x when 0x0061 <= x & x <= 0x007A -> Utf8Letter
+    | x when 0x0041 <= x & x <= 0x005A -> UnicodeLetter
+    | x when 0x0061 <= x & x <= 0x007A -> UnicodeLetter
     (* utf-8 Basic Latin digits and quote *)
-    | x when 0x0030 <= x & x <= 0x0039 or x = 0x0027 -> Utf8IdentPart
+    | x when 0x0030 <= x & x <= 0x0039 or x = 0x0027 -> UnicodeIdentPart
     (* utf-8 Basic Latin symbols *)
-    | x when x <= 0x007F -> Utf8Symbol
+    | x when x <= 0x007F -> UnicodeSymbol
     (* utf-8 Latin-1 non breaking space U00A0 *)
-    | 0x00A0 -> Utf8Letter
+    | 0x00A0 -> UnicodeLetter
     (* utf-8 Latin-1 symbols U00A1-00BF *)
-    | x when 0x00A0 <= x & x <= 0x00BF -> Utf8Symbol
+    | x when 0x00A0 <= x & x <= 0x00BF -> UnicodeSymbol
     (* utf-8 Latin-1 letters U00C0-00D6 *)
-    | x when 0x00C0 <= x & x <= 0x00D6 -> Utf8Letter
+    | x when 0x00C0 <= x & x <= 0x00D6 -> UnicodeLetter
     (* utf-8 Latin-1 symbol U00D7 *)
-    | 0x00D7 -> Utf8Symbol
+    | 0x00D7 -> UnicodeSymbol
     (* utf-8 Latin-1 letters U00D8-00F6 *)
-    | x when 0x00D8 <= x & x <= 0x00F6 -> Utf8Letter
+    | x when 0x00D8 <= x & x <= 0x00F6 -> UnicodeLetter
     (* utf-8 Latin-1 symbol U00F7 *)
-    | 0x00F7 -> Utf8Symbol
+    | 0x00F7 -> UnicodeSymbol
     (* utf-8 Latin-1 letters U00F8-00FF *)
-    | x when 0x00F8 <= x & x <= 0x00FF -> Utf8Letter
+    | x when 0x00F8 <= x & x <= 0x00FF -> UnicodeLetter
     (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *)
-    | x when 0x0100 <= x & x <= 0x0241 -> Utf8Letter
+    | x when 0x0100 <= x & x <= 0x0241 -> UnicodeLetter
     (* utf-8 Phonetic letters U0250-02AF *)
-    | x when 0x0250 <= x & x <= 0x02AF -> Utf8Letter
+    | x when 0x0250 <= x & x <= 0x02AF -> UnicodeLetter
     (* utf-8 what do to with diacritics U0300-U036F ? *)
     (* utf-8 Greek letters U0380-03FF *)
-    | x when 0x0380 <= x & x <= 0x03FF -> Utf8Letter
+    | x when 0x0380 <= x & x <= 0x03FF -> UnicodeLetter
     (* utf-8 Cyrillic letters U0400-0481 *)
-    | x when 0x0400 <= x & x <= 0x0481 -> Utf8Letter
+    | x when 0x0400 <= x & x <= 0x0481 -> UnicodeLetter
     (* utf-8 Cyrillic symbol U0482 *)
-    | 0x0482 -> Utf8Symbol
+    | 0x0482 -> UnicodeSymbol
     (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *)
     (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *)
-    | x when 0x048A <= x & x <= 0x04F9 -> Utf8Letter
+    | x when 0x048A <= x & x <= 0x04F9 -> UnicodeLetter
     (* utf-8 Cyrillic supplement letters U0500-U050F *)
-    | x when 0x0500 <= x & x <= 0x050F -> Utf8Letter
+    | x when 0x0500 <= x & x <= 0x050F -> UnicodeLetter
     (* utf-8 Hebrew letters U05D0-05EA *)
-    | x when 0x05D0 <= x & x <= 0x05EA -> Utf8Letter
+    | x when 0x05D0 <= x & x <= 0x05EA -> UnicodeLetter
     (* utf-8 Arabic letters U0621-064A *)
-    | x when 0x0621 <= x & x <= 0x064A -> Utf8Letter
+    | x when 0x0621 <= x & x <= 0x064A -> UnicodeLetter
     (* utf-8 Arabic supplement letters U0750-076D *)
-    | x when 0x0750 <= x & x <= 0x076D -> Utf8Letter
+    | x when 0x0750 <= x & x <= 0x076D -> UnicodeLetter
     | _ -> raise UnsupportedUtf8
     end
     | 0x1000 ->
     begin match unicode with
     (* utf-8 Georgian U10A0-10FF (has holes) *)
-    | x when 0x10A0 <= x & x <= 0x10FF -> Utf8Letter
+    | x when 0x10A0 <= x & x <= 0x10FF -> UnicodeLetter
     (* utf-8 Hangul Jamo U1100-11FF (has holes) *)
-    | x when 0x1100 <= x & x <= 0x11FF -> Utf8Letter
+    | x when 0x1100 <= x & x <= 0x11FF -> UnicodeLetter
     (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *)
-    | x when 0x1E00 <= x & x <= 0x1E9B -> Utf8Letter
-    | x when 0x1EA0 <= x & x <= 0x1EF9 -> Utf8Letter
+    | x when 0x1E00 <= x & x <= 0x1E9B -> UnicodeLetter
+    | x when 0x1EA0 <= x & x <= 0x1EF9 -> UnicodeLetter
     | _ -> raise UnsupportedUtf8
     end
     | 0x2000 ->
     begin match unicode with
     (* utf-8 general punctuation U2080-2089 *)
     (* Hyphens *)
-    | x when 0x2010 <= x & x <= 0x2011 -> Utf8Letter 
+    | x when 0x2010 <= x & x <= 0x2011 -> UnicodeLetter 
     (* Dashes and other symbols *)
-    | x when 0x2012 <= x & x <= 0x2027 -> Utf8Symbol
+    | x when 0x2012 <= x & x <= 0x2027 -> UnicodeSymbol
     (* Per mille and per ten thousand signs *)
-    | x when 0x2030 <= x & x <= 0x2031 -> Utf8Symbol
+    | x when 0x2030 <= x & x <= 0x2031 -> UnicodeSymbol
     (* Prime letters *)
-    | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> Utf8IdentPart
+    | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> UnicodeIdentPart
     (* Miscellaneous punctuation *)
-    | x when 0x2039 <= x & x <= 0x2056 -> Utf8Symbol
-    | x when 0x2058 <= x & x <= 0x205E -> Utf8Symbol
+    | x when 0x2039 <= x & x <= 0x2056 -> UnicodeSymbol
+    | x when 0x2058 <= x & x <= 0x205E -> UnicodeSymbol
     (* Invisible mathematical operators *)
-    | x when 0x2061 <= x & x <= 0x2063 -> Utf8Symbol
+    | x when 0x2061 <= x & x <= 0x2063 -> UnicodeSymbol
     (* utf-8 superscript U2070-207C *) 
-    | x when 0x2070 <= x & x <= 0x207C -> Utf8Symbol
+    | x when 0x2070 <= x & x <= 0x207C -> UnicodeSymbol
     (* utf-8 subscript U2080-2089 *) 
-    | x when 0x2080 <= x & x <= 0x2089 -> Utf8IdentPart
+    | x when 0x2080 <= x & x <= 0x2089 -> UnicodeIdentPart
     (* utf-8 letter-like U2100-214F *)
-    | x when 0x2100 <= x & x <= 0x214F -> Utf8Letter
+    | x when 0x2100 <= x & x <= 0x214F -> UnicodeLetter
     (* utf-8 number-forms U2153-2183 *)
-    | x when 0x2153 <= x & x <= 0x2183 -> Utf8Symbol
+    | x when 0x2153 <= x & x <= 0x2183 -> UnicodeSymbol
     (* utf-8 arrows A U2190-21FF *)
     (* utf-8 mathematical operators U2200-22FF *)
     (* utf-8 miscellaneous technical U2300-23FF *)
-    | x when 0x2190 <= x & x <= 0x23FF -> Utf8Symbol
+    | x when 0x2190 <= x & x <= 0x23FF -> UnicodeSymbol
     (* utf-8 box drawing U2500-257F has ceiling, etc. *)
     (* utf-8 block elements U2580-259F *)
     (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *)
     (* utf-8 miscellaneous symbols U2600-26FF *)
-    | x when 0x2500 <= x & x <= 0x26FF -> Utf8Symbol
+    | x when 0x2500 <= x & x <= 0x26FF -> UnicodeSymbol
     (* utf-8 arrows B U2900-297F *)
-    | x when 0x2900 <= x & x <= 0x297F -> Utf8Symbol
+    | x when 0x2900 <= x & x <= 0x297F -> UnicodeSymbol
     (* utf-8 mathematical operators U2A00-2AFF *)
-    | x when 0x2A00 <= x & x <= 0x2AFF -> Utf8Symbol
+    | x when 0x2A00 <= x & x <= 0x2AFF -> UnicodeSymbol
     (* utf-8 bold symbols U2768-U2775 *)
-    | x when 0x2768 <= x & x <= 0x2775 -> Utf8Symbol
+    | x when 0x2768 <= x & x <= 0x2775 -> UnicodeSymbol
     (* utf-8 arrows and brackets U27E0-U27FF *)
-    | x when 0x27E0 <= x & x <= 0x27FF -> Utf8Symbol
+    | x when 0x27E0 <= x & x <= 0x27FF -> UnicodeSymbol
     (* utf-8 brackets, braces and parentheses *)
-    | x when 0x2980 <= x & x <= 0x299F -> Utf8Symbol
+    | x when 0x2980 <= x & x <= 0x299F -> UnicodeSymbol
     (* utf-8 miscellaneous including double-plus U29F0-U29FF *)
-    | x when 0x29F0 <= x & x <= 0x29FF -> Utf8Symbol
+    | x when 0x29F0 <= x & x <= 0x29FF -> UnicodeSymbol
     | _ -> raise UnsupportedUtf8
     end
     | _ ->
     begin match unicode with
     (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *)
-    | x when 0x3040 <= x & x <= 0x30FF -> Utf8Letter
+    | x when 0x3040 <= x & x <= 0x30FF -> UnicodeLetter
     (* utf-8 Unified CJK Ideographs U4E00-9FA5 *)
-    | x when 0x4E00 <= x & x <= 0x9FA5 -> Utf8Letter
+    | x when 0x4E00 <= x & x <= 0x9FA5 -> UnicodeLetter
     (* utf-8 Hangul syllables UAC00-D7AF *)
-    | x when 0xAC00 <= x & x <= 0xD7AF -> Utf8Letter
+    | x when 0xAC00 <= x & x <= 0xD7AF -> UnicodeLetter
     (* utf-8 Gothic U10330-1034A *)
-    | x when 0x10330 <= x & x <= 0x1034A -> Utf8Letter
+    | x when 0x10330 <= x & x <= 0x1034A -> UnicodeLetter
+    (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *)
+    | x when 0x1D400 <= x & x <= 0x1D7CB -> UnicodeLetter
+    (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *)
+    | x when 0x1D7CE <= x & x <= 0x1D7FF -> UnicodeIdentPart
     | _ -> raise UnsupportedUtf8
     end
 
@@ -299,14 +303,14 @@ let check_ident s =
   let i = ref 0 in
   if s <> ".." then try
     let j, n = next_utf8 s 0 in
-    match classify_utf8 n with
-    | Utf8Letter ->
+    match classify_unicode n with
+    | UnicodeLetter ->
 	i := !i + j;
 	begin try
 	  while true do
 	    let j, n = next_utf8 s !i in
-	    match classify_utf8 n with
-	    | Utf8Letter | Utf8IdentPart -> i := !i + j
+	    match classify_unicode n with
+	    | UnicodeLetter | UnicodeIdentPart -> i := !i + j
 	    | _ -> error
 		("invalid character "^(String.sub s !i j)^" in identifier "^s)
 	  done
@@ -317,7 +321,7 @@ let check_ident s =
   | UnsupportedUtf8 -> error (s^": unsupported character in utf8 sequence")
   | Invalid_argument _ -> error (s^": invalid utf8 sequence")
 
-let lowercase_utf8 s unicode =
+let lowercase_unicode s unicode =
     match unicode land 0x1F000 with
     | 0x0 ->
     begin match unicode with
@@ -414,13 +418,28 @@ let lowercase_utf8 s unicode =
     | x when 0xAC00 <= x & x <= 0xD7AF -> x
     (* utf-8 Gothic U10330-1034A *)
     | x when 0x10330 <= x & x <= 0x1034A -> x
+    (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *)
+    | x when 0x1D6A8 <= x & x <= 0x1D7C9 ->
+        let a = (x - 0x1D6A8) mod 58 in
+        if a <= 16 or (18 <= a & a <= 24)
+        then x + 26 (* all but nabla and theta symbol *)
+        else x
+    | x when 0x1D538 <= x & x <= 0x1D56B ->
+        (* Use ordinary lowercase in both small and capital double-struck *)
+        (x - 0x1D538) mod 26 + Char.code 'a'
+    | x when 0x1D468 <= x & x <= 0x1D6A3 -> (* General case *)
+        if (x - 0x1D400 / 26) mod 2 = 0 then x + 26 else x
+    | x when 0x1D400 <= x & x <= 0x1D7CB -> (* fallback *)
+        x
+    (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *)
+    | x when 0x1D7CE <= x & x <= 0x1D7FF -> x
     | _ -> raise UnsupportedUtf8
     end
 
 let lowercase_first_char_utf8 s =
   assert (s <> "");
   let j, n = next_utf8 s 0 in
-  utf8_of_unicode (lowercase_utf8 (String.sub s 0 j) n)
+  utf8_of_unicode (lowercase_unicode (String.sub s 0 j) n)
 
 (* Lists *)
 
diff --git a/lib/util.mli b/lib/util.mli
index 0cbd2fa0e6..e715feca3f 100644
--- a/lib/util.mli
+++ b/lib/util.mli
@@ -78,11 +78,11 @@ val parse_loadpath : string -> string list
 module Stringset : Set.S with type elt = string
 module Stringmap : Map.S with type key = string
 
-type utf8_status = Utf8Letter | Utf8IdentPart | Utf8Symbol
+type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol
 
 exception UnsupportedUtf8
 
-val classify_utf8 : int -> utf8_status
+val classify_unicode : int -> utf8_status
 val check_ident : string -> unit
 val lowercase_first_char_utf8 : string -> string
author	herbelin	2008-06-06 09:29:20 +0000
committer	herbelin	2008-06-06 09:29:20 +0000
commit	64c9874e12872a08bfcce470b5e03813c5cff586 (patch)
tree	42a6e0060354c7f9327fd8349e0c1c8394ae7ccf /lib
parent	9953deaa45c642301a6cd7202b486c45923dece8 (diff)