diff options
| author | herbelin | 2003-02-27 16:00:16 +0000 |
|---|---|---|
| committer | herbelin | 2003-02-27 16:00:16 +0000 |
| commit | 538aff1e33de17e3aa840402731dd6ba483f1cdb (patch) | |
| tree | 7c1cfc15ff0a1def6b0d249de5b72d0dcf9e7747 | |
| parent | c7b40c6ce35eb54f708d5d91ef264f6be92949c0 (diff) | |
Le lexeur et Notation savent reconnaître si un unicode des blocs
0380-03FF (lettres grecques) 2100-214F (symboles assimilés à des
lettres), 2200-22FF (opérateurs mathématiques), 2300-23FF (symboles
techniques divers) et 2600-26FF (symboles divers) est un caractère
spécial ou non lorsque encodé en utf-8.
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@3712 85f007b7-540e-0410-9357-904b9bb8a0f7
| -rw-r--r-- | parsing/lexer.ml4 | 147 | ||||
| -rw-r--r-- | parsing/lexer.mli | 5 | ||||
| -rw-r--r-- | toplevel/metasyntax.ml | 7 |
3 files changed, 123 insertions, 36 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 index 2d84593bc2..362a066f58 100644 --- a/parsing/lexer.ml4 +++ b/parsing/lexer.ml4 @@ -77,8 +77,11 @@ let check_special_token str = let check_ident str = let rec loop = parser - | [< ''$' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246' - | '\248'..'\255' | '0'..'9' | ''' | '_'; s >] -> loop s + | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] -> loop s + (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) + | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop s + (* iso 8859-1 accentuated letters *) + | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255'); s >] -> loop s | [< _ = Stream.empty >] -> () | [< >] -> bad_token str in @@ -103,20 +106,34 @@ let add_keyword str = let is_keyword s = try let _ = ttree_find !keywords s in true with Not_found -> false +let is_normal_token str = + if String.length str > 0 then + match str.[0] with + | ' ' | '\n' | '\r' | '\t' | '0'..'9' | '"' -> bad_token str + | '_' | '$' | 'a'..'z' | 'A'..'Z' -> true + (* utf-8 symbols of the form "E2 xx xx" [E2=226] *) + | '\226' when String.length str > 2 -> + (match str.[1], str.[2] with + | ('\132', '\128'..'\191') | ('\133', '\128'..'\143') -> + (* utf8 letter-like unicode 2100-214F *) true + | (('\136'..'\143' | '\152'..'\155'),_) -> + (* utf8 mathematical operators unicode 2200-22FF *) + (* utf8 miscellaneous technical unicode 2300-23FF *) + (* utf8 miscellaneous symbols unicode 2600-26FF *) + false + | _ -> + (* default to iso 8859-1 "â" *) + true) + (* iso 8859-1 accentuated letters *) + | '\192'..'\214' | '\216'..'\246' | '\248'..'\255' -> true + | _ -> false + else + bad_token str (* Adding a new token (keyword or special token). *) let add_token (con, str) = match con with | "" -> - let normal_token = - if String.length str > 0 then - match str.[0] with - | ' ' | '\n' | '\r' | '\t' | '0'..'9' | '"' -> bad_token str - | '_' | '$' | 'a'..'z' | 'A'..'Z' -> true - | _ -> false - else - true - in - if normal_token then add_keyword str else add_special_token str + if is_normal_token str then add_keyword str else add_special_token str | "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI" -> () | _ -> @@ -178,8 +195,13 @@ let get_buff len = String.sub !buff 0 len (* The classical lexer: idents, numbers, quoted strings, comments *) let rec ident len = parser - | [< ' ('a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246' - |'\248'..'\255' | '0'..'9' | ''' | '_' | '@' as c); s >] -> + | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' | '@' as c); s >] -> + ident (store len c) s + (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) + | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2) ; s >] -> + ident (store (store len c1) c2) s + (* iso 8859-1 accentuated letters *) + | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); s >] -> ident (store len c) s | [< >] -> len @@ -247,19 +269,25 @@ let rec comment bp = parser (* Parse a special token, using the [token_tree] *) +let progress_special c = function + | None -> None + | Some tt -> try Some (CharMap.find c tt.branch) with Not_found -> None + +let rec special tt cs = match tt with + | None -> None + | Some tt -> + match + match Stream.peek cs with + | Some c -> + (try Some (CharMap.find c tt.branch) with Not_found -> None) + | None -> None + with + | Some _ as tt' -> Stream.junk cs; special tt' cs + | None -> tt.node + let process_chars bp c cs = - let rec proc_rec tt = - match - match Stream.peek cs with - | Some c -> - (try Some (CharMap.find c tt.branch) with Not_found -> None) - | None -> None - with - | Some tt' -> Stream.junk cs; proc_rec tt' - | None -> tt.node - in let t = - try proc_rec (CharMap.find c !token_tree.branch) + try special (Some (CharMap.find c !token_tree.branch)) cs with Not_found -> !token_tree.node in let ep = Stream.count cs in @@ -267,6 +295,29 @@ let process_chars bp c cs = | Some t -> (("", t), (bp, ep)) | None -> err (bp, ep) Undefined_token +type token_226_tail = + | TokSymbol of string option + | TokIdent of string + +let parse_226_tail tk = parser + | [< ''\132' as c2; ' ('\128'..'\191' as c3); + (* utf8 letter-like unicode 2100-214F *) + len = ident (store (store (store 0 '\226') c2) c3) >] -> + TokIdent (get_buff len) + | [< ''\133' as c2; ' ('\128'..'\143' as c3); + (* utf8 letter-like unicode 2100-214F *) + len = ident (store (store (store 0 '\226') c2) c3) >] -> + TokIdent (get_buff len) + | [< ' ('\136'..'\143' | '\152'..'\155' as c2); 'c3; + (* utf8 mathematical operators unicode 2200-22FF *) + (* utf8 miscellaneous technical unicode 2300-23FF *) + (* utf8 miscellaneous symbols unicode 2600-26FF *) + t = special (progress_special c3 (progress_special c2 + (progress_special '\226' tk))) >] -> + TokSymbol t + | [< len = ident (store 0 '\226') >] -> + TokIdent (get_buff len) + (* Parse a token in a char stream *) let rec next_token = parser bp @@ -281,16 +332,48 @@ let rec next_token = parser bp | [< ''$'; len = ident (store 0 '$') >] ep -> (("METAIDENT", get_buff len), (bp,ep)) | [< ''.' as c; t = parser - | [< ' ('_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' - | '\216'..'\246' | '\248'..'\255' as c); - len = ident (store 0 c) >] -> current:=BeVernac ""; ("FIELD", get_buff len) + | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); len = ident (store 0 c) >] -> + ("FIELD", get_buff len) + (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) + | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); + len = ident (store (store 0 c1) c2) >] -> + ("FIELD", get_buff len) + (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *) + | [< ''\226' as c1; t = parse_226_tail + (progress_special '.' (Some !token_tree)) >] ep -> + (match t with + | TokSymbol (Some t) -> ("", t) + | TokSymbol None -> err (bp, ep) Undefined_token + | TokIdent t -> ("FIELD", t)) + (* iso 8859-1 accentuated letters *) + | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); + len = ident (store 0 c) >] -> + ("FIELD", get_buff len) (* | [< >] -> ("", ".") >] ep -> (t, (bp,ep)) *) - | [< (t,_) = process_chars bp c >] -> t >] ep -> current:=BeVernac ""; (t, (bp,ep)) - | [< ' ('_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' - | '\216'..'\246' | '\248'..'\255' as c); - len = ident (store 0 c) >] ep -> + | [< (t,_) = process_chars bp c >] -> t >] ep -> + current:=BeVernac ""; (t, (bp,ep)) + | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); len = ident (store 0 c) >] ep -> + let id = get_buff len in current:=InVernac; + (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) + (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) + | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); + len = ident (store (store 0 c1) c2) >] ep -> + let id = get_buff len in current:=InVernac; + (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) + (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *) + | [< ''\226' as c1; t = parse_226_tail (Some !token_tree) >] ep -> + (match t with + | TokSymbol (Some t) -> ("", t), (bp, ep) + | TokSymbol None -> err (bp, ep) Undefined_token + | TokIdent id -> + current:=InVernac; + (try ("", find_keyword id) with Not_found -> ("IDENT", id)), + (bp, ep)) + (* iso 8859-1 accentuated letters *) + | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); + len = ident (store 0 c) >] ep -> let id = get_buff len in current:=InVernac; (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> diff --git a/parsing/lexer.mli b/parsing/lexer.mli index b722e8b0c9..b36f1ec6e1 100644 --- a/parsing/lexer.mli +++ b/parsing/lexer.mli @@ -24,6 +24,11 @@ val is_keyword : string -> bool val func : char Stream.t -> (string * string) Stream.t * (int -> int * int) +val check_ident : string -> unit +val check_special_token : string -> unit + +val is_normal_token : string -> bool + val add_token : Token.pattern -> unit val tparse : string * string -> ((string * string) Stream.t -> string) option diff --git a/toplevel/metasyntax.ml b/toplevel/metasyntax.ml index d41ce28036..fbbae0a1b6 100644 --- a/toplevel/metasyntax.ml +++ b/toplevel/metasyntax.ml @@ -391,15 +391,13 @@ let quote x = else x -let is_symbol = function String s -> not (is_letter s.[0]) | _ -> false - let rec find_symbols c_current c_next c_last vars = function | [] -> (vars, []) - | String x :: sl when is_letter x.[0] -> + | String x :: sl when Lexer.is_normal_token x -> + Lexer.check_ident x; let id = Names.id_of_string x in if List.mem_assoc id vars then error ("Variable "^x^" occurs more than once"); -(* let prec = if List.exists is_symbol sl then c_current else c_last in*) let prec = if sl <> [] then c_current else c_last in let (vars,l) = find_symbols c_next c_next c_last vars sl in ((id,prec)::vars, NonTerminal id :: l) @@ -413,6 +411,7 @@ let rec find_symbols c_current c_next c_last vars = function (vars, NonTerminal (prec, meta) :: l) *) | String s :: sl -> + Lexer.check_special_token s; let (vars,l) = find_symbols c_next c_next c_last vars sl in (vars, Terminal (strip s) :: l) | WhiteSpace n :: sl -> |
