diff options
| author | herbelin | 2008-06-06 09:29:20 +0000 |
|---|---|---|
| committer | herbelin | 2008-06-06 09:29:20 +0000 |
| commit | 64c9874e12872a08bfcce470b5e03813c5cff586 (patch) | |
| tree | 42a6e0060354c7f9327fd8349e0c1c8394ae7ccf | |
| parent | 9953deaa45c642301a6cd7202b486c45923dece8 (diff) | |
Correction terminologie et ajout plage unicode 1D400-1D7FF (mathematical
alphanumerical symbols) suite à remarques de Arnaud.
Correction bugs d'affichage de l'option -translate.
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@11059 85f007b7-540e-0410-9357-904b9bb8a0f7
| -rw-r--r-- | lib/util.ml | 133 | ||||
| -rw-r--r-- | lib/util.mli | 4 | ||||
| -rw-r--r-- | parsing/lexer.ml4 | 14 | ||||
| -rw-r--r-- | parsing/ppvernac.ml | 14 |
4 files changed, 93 insertions, 72 deletions
diff --git a/lib/util.ml b/lib/util.ml index 26be66cb9e..5a59058596 100644 --- a/lib/util.ml +++ b/lib/util.ml @@ -119,129 +119,133 @@ module Stringset = Set.Make(struct type t = string let compare = compare end) module Stringmap = Map.Make(struct type t = string let compare = compare end) -type utf8_status = Utf8Letter | Utf8IdentPart | Utf8Symbol +type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol exception UnsupportedUtf8 -let classify_utf8 unicode = +let classify_unicode unicode = match unicode land 0x1F000 with | 0x0 -> begin match unicode with (* utf-8 Basic Latin underscore *) - | x when x = 0x005F -> Utf8Letter + | x when x = 0x005F -> UnicodeLetter (* utf-8 Basic Latin letters *) - | x when 0x0041 <= x & x <= 0x005A -> Utf8Letter - | x when 0x0061 <= x & x <= 0x007A -> Utf8Letter + | x when 0x0041 <= x & x <= 0x005A -> UnicodeLetter + | x when 0x0061 <= x & x <= 0x007A -> UnicodeLetter (* utf-8 Basic Latin digits and quote *) - | x when 0x0030 <= x & x <= 0x0039 or x = 0x0027 -> Utf8IdentPart + | x when 0x0030 <= x & x <= 0x0039 or x = 0x0027 -> UnicodeIdentPart (* utf-8 Basic Latin symbols *) - | x when x <= 0x007F -> Utf8Symbol + | x when x <= 0x007F -> UnicodeSymbol (* utf-8 Latin-1 non breaking space U00A0 *) - | 0x00A0 -> Utf8Letter + | 0x00A0 -> UnicodeLetter (* utf-8 Latin-1 symbols U00A1-00BF *) - | x when 0x00A0 <= x & x <= 0x00BF -> Utf8Symbol + | x when 0x00A0 <= x & x <= 0x00BF -> UnicodeSymbol (* utf-8 Latin-1 letters U00C0-00D6 *) - | x when 0x00C0 <= x & x <= 0x00D6 -> Utf8Letter + | x when 0x00C0 <= x & x <= 0x00D6 -> UnicodeLetter (* utf-8 Latin-1 symbol U00D7 *) - | 0x00D7 -> Utf8Symbol + | 0x00D7 -> UnicodeSymbol (* utf-8 Latin-1 letters U00D8-00F6 *) - | x when 0x00D8 <= x & x <= 0x00F6 -> Utf8Letter + | x when 0x00D8 <= x & x <= 0x00F6 -> UnicodeLetter (* utf-8 Latin-1 symbol U00F7 *) - | 0x00F7 -> Utf8Symbol + | 0x00F7 -> UnicodeSymbol (* utf-8 Latin-1 letters U00F8-00FF *) - | x when 0x00F8 <= x & x <= 0x00FF -> Utf8Letter + | x when 0x00F8 <= x & x <= 0x00FF -> UnicodeLetter (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *) - | x when 0x0100 <= x & x <= 0x0241 -> Utf8Letter + | x when 0x0100 <= x & x <= 0x0241 -> UnicodeLetter (* utf-8 Phonetic letters U0250-02AF *) - | x when 0x0250 <= x & x <= 0x02AF -> Utf8Letter + | x when 0x0250 <= x & x <= 0x02AF -> UnicodeLetter (* utf-8 what do to with diacritics U0300-U036F ? *) (* utf-8 Greek letters U0380-03FF *) - | x when 0x0380 <= x & x <= 0x03FF -> Utf8Letter + | x when 0x0380 <= x & x <= 0x03FF -> UnicodeLetter (* utf-8 Cyrillic letters U0400-0481 *) - | x when 0x0400 <= x & x <= 0x0481 -> Utf8Letter + | x when 0x0400 <= x & x <= 0x0481 -> UnicodeLetter (* utf-8 Cyrillic symbol U0482 *) - | 0x0482 -> Utf8Symbol + | 0x0482 -> UnicodeSymbol (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *) (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *) - | x when 0x048A <= x & x <= 0x04F9 -> Utf8Letter + | x when 0x048A <= x & x <= 0x04F9 -> UnicodeLetter (* utf-8 Cyrillic supplement letters U0500-U050F *) - | x when 0x0500 <= x & x <= 0x050F -> Utf8Letter + | x when 0x0500 <= x & x <= 0x050F -> UnicodeLetter (* utf-8 Hebrew letters U05D0-05EA *) - | x when 0x05D0 <= x & x <= 0x05EA -> Utf8Letter + | x when 0x05D0 <= x & x <= 0x05EA -> UnicodeLetter (* utf-8 Arabic letters U0621-064A *) - | x when 0x0621 <= x & x <= 0x064A -> Utf8Letter + | x when 0x0621 <= x & x <= 0x064A -> UnicodeLetter (* utf-8 Arabic supplement letters U0750-076D *) - | x when 0x0750 <= x & x <= 0x076D -> Utf8Letter + | x when 0x0750 <= x & x <= 0x076D -> UnicodeLetter | _ -> raise UnsupportedUtf8 end | 0x1000 -> begin match unicode with (* utf-8 Georgian U10A0-10FF (has holes) *) - | x when 0x10A0 <= x & x <= 0x10FF -> Utf8Letter + | x when 0x10A0 <= x & x <= 0x10FF -> UnicodeLetter (* utf-8 Hangul Jamo U1100-11FF (has holes) *) - | x when 0x1100 <= x & x <= 0x11FF -> Utf8Letter + | x when 0x1100 <= x & x <= 0x11FF -> UnicodeLetter (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *) - | x when 0x1E00 <= x & x <= 0x1E9B -> Utf8Letter - | x when 0x1EA0 <= x & x <= 0x1EF9 -> Utf8Letter + | x when 0x1E00 <= x & x <= 0x1E9B -> UnicodeLetter + | x when 0x1EA0 <= x & x <= 0x1EF9 -> UnicodeLetter | _ -> raise UnsupportedUtf8 end | 0x2000 -> begin match unicode with (* utf-8 general punctuation U2080-2089 *) (* Hyphens *) - | x when 0x2010 <= x & x <= 0x2011 -> Utf8Letter + | x when 0x2010 <= x & x <= 0x2011 -> UnicodeLetter (* Dashes and other symbols *) - | x when 0x2012 <= x & x <= 0x2027 -> Utf8Symbol + | x when 0x2012 <= x & x <= 0x2027 -> UnicodeSymbol (* Per mille and per ten thousand signs *) - | x when 0x2030 <= x & x <= 0x2031 -> Utf8Symbol + | x when 0x2030 <= x & x <= 0x2031 -> UnicodeSymbol (* Prime letters *) - | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> Utf8IdentPart + | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> UnicodeIdentPart (* Miscellaneous punctuation *) - | x when 0x2039 <= x & x <= 0x2056 -> Utf8Symbol - | x when 0x2058 <= x & x <= 0x205E -> Utf8Symbol + | x when 0x2039 <= x & x <= 0x2056 -> UnicodeSymbol + | x when 0x2058 <= x & x <= 0x205E -> UnicodeSymbol (* Invisible mathematical operators *) - | x when 0x2061 <= x & x <= 0x2063 -> Utf8Symbol + | x when 0x2061 <= x & x <= 0x2063 -> UnicodeSymbol (* utf-8 superscript U2070-207C *) - | x when 0x2070 <= x & x <= 0x207C -> Utf8Symbol + | x when 0x2070 <= x & x <= 0x207C -> UnicodeSymbol (* utf-8 subscript U2080-2089 *) - | x when 0x2080 <= x & x <= 0x2089 -> Utf8IdentPart + | x when 0x2080 <= x & x <= 0x2089 -> UnicodeIdentPart (* utf-8 letter-like U2100-214F *) - | x when 0x2100 <= x & x <= 0x214F -> Utf8Letter + | x when 0x2100 <= x & x <= 0x214F -> UnicodeLetter (* utf-8 number-forms U2153-2183 *) - | x when 0x2153 <= x & x <= 0x2183 -> Utf8Symbol + | x when 0x2153 <= x & x <= 0x2183 -> UnicodeSymbol (* utf-8 arrows A U2190-21FF *) (* utf-8 mathematical operators U2200-22FF *) (* utf-8 miscellaneous technical U2300-23FF *) - | x when 0x2190 <= x & x <= 0x23FF -> Utf8Symbol + | x when 0x2190 <= x & x <= 0x23FF -> UnicodeSymbol (* utf-8 box drawing U2500-257F has ceiling, etc. *) (* utf-8 block elements U2580-259F *) (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *) (* utf-8 miscellaneous symbols U2600-26FF *) - | x when 0x2500 <= x & x <= 0x26FF -> Utf8Symbol + | x when 0x2500 <= x & x <= 0x26FF -> UnicodeSymbol (* utf-8 arrows B U2900-297F *) - | x when 0x2900 <= x & x <= 0x297F -> Utf8Symbol + | x when 0x2900 <= x & x <= 0x297F -> UnicodeSymbol (* utf-8 mathematical operators U2A00-2AFF *) - | x when 0x2A00 <= x & x <= 0x2AFF -> Utf8Symbol + | x when 0x2A00 <= x & x <= 0x2AFF -> UnicodeSymbol (* utf-8 bold symbols U2768-U2775 *) - | x when 0x2768 <= x & x <= 0x2775 -> Utf8Symbol + | x when 0x2768 <= x & x <= 0x2775 -> UnicodeSymbol (* utf-8 arrows and brackets U27E0-U27FF *) - | x when 0x27E0 <= x & x <= 0x27FF -> Utf8Symbol + | x when 0x27E0 <= x & x <= 0x27FF -> UnicodeSymbol (* utf-8 brackets, braces and parentheses *) - | x when 0x2980 <= x & x <= 0x299F -> Utf8Symbol + | x when 0x2980 <= x & x <= 0x299F -> UnicodeSymbol (* utf-8 miscellaneous including double-plus U29F0-U29FF *) - | x when 0x29F0 <= x & x <= 0x29FF -> Utf8Symbol + | x when 0x29F0 <= x & x <= 0x29FF -> UnicodeSymbol | _ -> raise UnsupportedUtf8 end | _ -> begin match unicode with (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *) - | x when 0x3040 <= x & x <= 0x30FF -> Utf8Letter + | x when 0x3040 <= x & x <= 0x30FF -> UnicodeLetter (* utf-8 Unified CJK Ideographs U4E00-9FA5 *) - | x when 0x4E00 <= x & x <= 0x9FA5 -> Utf8Letter + | x when 0x4E00 <= x & x <= 0x9FA5 -> UnicodeLetter (* utf-8 Hangul syllables UAC00-D7AF *) - | x when 0xAC00 <= x & x <= 0xD7AF -> Utf8Letter + | x when 0xAC00 <= x & x <= 0xD7AF -> UnicodeLetter (* utf-8 Gothic U10330-1034A *) - | x when 0x10330 <= x & x <= 0x1034A -> Utf8Letter + | x when 0x10330 <= x & x <= 0x1034A -> UnicodeLetter + (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *) + | x when 0x1D400 <= x & x <= 0x1D7CB -> UnicodeLetter + (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *) + | x when 0x1D7CE <= x & x <= 0x1D7FF -> UnicodeIdentPart | _ -> raise UnsupportedUtf8 end @@ -299,14 +303,14 @@ let check_ident s = let i = ref 0 in if s <> ".." then try let j, n = next_utf8 s 0 in - match classify_utf8 n with - | Utf8Letter -> + match classify_unicode n with + | UnicodeLetter -> i := !i + j; begin try while true do let j, n = next_utf8 s !i in - match classify_utf8 n with - | Utf8Letter | Utf8IdentPart -> i := !i + j + match classify_unicode n with + | UnicodeLetter | UnicodeIdentPart -> i := !i + j | _ -> error ("invalid character "^(String.sub s !i j)^" in identifier "^s) done @@ -317,7 +321,7 @@ let check_ident s = | UnsupportedUtf8 -> error (s^": unsupported character in utf8 sequence") | Invalid_argument _ -> error (s^": invalid utf8 sequence") -let lowercase_utf8 s unicode = +let lowercase_unicode s unicode = match unicode land 0x1F000 with | 0x0 -> begin match unicode with @@ -414,13 +418,28 @@ let lowercase_utf8 s unicode = | x when 0xAC00 <= x & x <= 0xD7AF -> x (* utf-8 Gothic U10330-1034A *) | x when 0x10330 <= x & x <= 0x1034A -> x + (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *) + | x when 0x1D6A8 <= x & x <= 0x1D7C9 -> + let a = (x - 0x1D6A8) mod 58 in + if a <= 16 or (18 <= a & a <= 24) + then x + 26 (* all but nabla and theta symbol *) + else x + | x when 0x1D538 <= x & x <= 0x1D56B -> + (* Use ordinary lowercase in both small and capital double-struck *) + (x - 0x1D538) mod 26 + Char.code 'a' + | x when 0x1D468 <= x & x <= 0x1D6A3 -> (* General case *) + if (x - 0x1D400 / 26) mod 2 = 0 then x + 26 else x + | x when 0x1D400 <= x & x <= 0x1D7CB -> (* fallback *) + x + (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *) + | x when 0x1D7CE <= x & x <= 0x1D7FF -> x | _ -> raise UnsupportedUtf8 end let lowercase_first_char_utf8 s = assert (s <> ""); let j, n = next_utf8 s 0 in - utf8_of_unicode (lowercase_utf8 (String.sub s 0 j) n) + utf8_of_unicode (lowercase_unicode (String.sub s 0 j) n) (* Lists *) diff --git a/lib/util.mli b/lib/util.mli index 0cbd2fa0e6..e715feca3f 100644 --- a/lib/util.mli +++ b/lib/util.mli @@ -78,11 +78,11 @@ val parse_loadpath : string -> string list module Stringset : Set.S with type elt = string module Stringmap : Map.S with type key = string -type utf8_status = Utf8Letter | Utf8IdentPart | Utf8Symbol +type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol exception UnsupportedUtf8 -val classify_utf8 : int -> utf8_status +val classify_unicode : int -> utf8_status val check_ident : string -> unit val lowercase_first_char_utf8 : string -> string diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 index b26f8c77ed..b04ab971e4 100644 --- a/parsing/lexer.ml4 +++ b/parsing/lexer.ml4 @@ -124,7 +124,7 @@ let lookup_utf8_tail c cs = (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F) | _ -> error_utf8 cs in - try classify_utf8 unicode, n + try classify_unicode unicode, n with UnsupportedUtf8 -> error_unsupported_unicode_character n cs let lookup_utf8 cs = @@ -149,8 +149,8 @@ let check_ident str = loop_id true s | [< s >] -> match lookup_utf8 s with - | Utf8Token (Utf8Letter, n) -> njunk n s; loop_id true s - | Utf8Token (Utf8IdentPart, n) when intail -> njunk n s; loop_id true s + | Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s + | Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s | EmptyStream -> () | Utf8Token _ | AsciiChar -> bad_token str in @@ -217,7 +217,7 @@ let rec ident_tail len = parser ident_tail (store len c) s | [< s >] -> match lookup_utf8 s with - | Utf8Token ((Utf8IdentPart | Utf8Letter), n) -> + | Utf8Token ((UnicodeIdentPart | UnicodeLetter), n) -> ident_tail (nstore n len s) s | _ -> len @@ -374,7 +374,7 @@ let parse_after_dot bp c = (constructor, get_buff len) | [< s >] -> match lookup_utf8 s with - | Utf8Token (Utf8Letter, n) -> + | Utf8Token (UnicodeLetter, n) -> (constructor, get_buff (ident_tail (nstore n 0 s) s)) | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s) @@ -413,13 +413,13 @@ let rec next_token = parser bp t | [< s >] -> match lookup_utf8 s with - | Utf8Token (Utf8Letter, n) -> + | Utf8Token (UnicodeLetter, n) -> let len = ident_tail (nstore n 0 s) s in let id = get_buff len in let ep = Stream.count s in comment_stop bp; (try ("",find_keyword id) with Not_found -> ("IDENT",id)), (bp, ep) - | AsciiChar | Utf8Token ((Utf8Symbol | Utf8IdentPart), _) -> + | AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) -> let t = process_chars bp (Stream.next s) s in comment_stop bp; t | EmptyStream -> diff --git a/parsing/ppvernac.ml b/parsing/ppvernac.ml index c4b5c2d3e8..3ec225ddce 100644 --- a/parsing/ppvernac.ml +++ b/parsing/ppvernac.ml @@ -534,11 +534,13 @@ let rec pr_vernac = function | Some cc -> str" :=" ++ spc() ++ cc)) | VernacStartTheoremProof (ki,l,_,_) -> - hov 1 (pr_thm_token ki ++ spc() ++ - prlist_with_sep (fun () -> str "with ") (fun (id,(bl,c)) -> hov 0 - (pr_opt pr_lident id ++ spc() ++ - (match bl with [] -> mt() | _ -> pr_binders bl ++ spc()) ++ - str":" ++ pr_spc_lconstr c)) l) + let pr_statement head (id,(bl,c)) = + hov 0 + (head ++ spc () ++ pr_opt pr_lident id ++ spc() ++ + (match bl with [] -> mt() | _ -> pr_binders bl ++ spc()) ++ + str":" ++ pr_spc_lconstr c) in + hov 1 (pr_statement (pr_thm_token ki) (List.hd l) ++ + prlist (pr_statement (str "with ")) (List.tl l)) | VernacEndProof Admitted -> str"Admitted" | VernacEndProof (Proved (opac,o)) -> (match o with @@ -821,7 +823,7 @@ let rec pr_vernac = function | VernacDeclareImplicits (local,q,None) -> hov 2 (str"Implicit Arguments" ++ spc() ++ pr_reference q) | VernacDeclareImplicits (local,q,Some imps) -> - hov 1 (str"Implicit Arguments" ++ pr_non_globality local ++ + hov 1 (str"Implicit Arguments " ++ pr_non_globality local ++ spc() ++ pr_reference q ++ spc() ++ str"[" ++ prlist_with_sep sep pr_explanation imps ++ str"]") | VernacReserve (idl,c) -> |
