aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorherbelin2003-02-27 16:00:16 +0000
committerherbelin2003-02-27 16:00:16 +0000
commit538aff1e33de17e3aa840402731dd6ba483f1cdb (patch)
tree7c1cfc15ff0a1def6b0d249de5b72d0dcf9e7747
parentc7b40c6ce35eb54f708d5d91ef264f6be92949c0 (diff)
Le lexeur et Notation savent reconnaître si un unicode des blocs
0380-03FF (lettres grecques) 2100-214F (symboles assimilés à des lettres), 2200-22FF (opérateurs mathématiques), 2300-23FF (symboles techniques divers) et 2600-26FF (symboles divers) est un caractère spécial ou non lorsque encodé en utf-8. git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@3712 85f007b7-540e-0410-9357-904b9bb8a0f7
-rw-r--r--parsing/lexer.ml4147
-rw-r--r--parsing/lexer.mli5
-rw-r--r--toplevel/metasyntax.ml7
3 files changed, 123 insertions, 36 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 2d84593bc2..362a066f58 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -77,8 +77,11 @@ let check_special_token str =
let check_ident str =
let rec loop = parser
- | [< ''$' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246'
- | '\248'..'\255' | '0'..'9' | ''' | '_'; s >] -> loop s
+ | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] -> loop s
+ (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
+ | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop s
+ (* iso 8859-1 accentuated letters *)
+ | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255'); s >] -> loop s
| [< _ = Stream.empty >] -> ()
| [< >] -> bad_token str
in
@@ -103,20 +106,34 @@ let add_keyword str =
let is_keyword s =
try let _ = ttree_find !keywords s in true with Not_found -> false
+let is_normal_token str =
+ if String.length str > 0 then
+ match str.[0] with
+ | ' ' | '\n' | '\r' | '\t' | '0'..'9' | '"' -> bad_token str
+ | '_' | '$' | 'a'..'z' | 'A'..'Z' -> true
+ (* utf-8 symbols of the form "E2 xx xx" [E2=226] *)
+ | '\226' when String.length str > 2 ->
+ (match str.[1], str.[2] with
+ | ('\132', '\128'..'\191') | ('\133', '\128'..'\143') ->
+ (* utf8 letter-like unicode 2100-214F *) true
+ | (('\136'..'\143' | '\152'..'\155'),_) ->
+ (* utf8 mathematical operators unicode 2200-22FF *)
+ (* utf8 miscellaneous technical unicode 2300-23FF *)
+ (* utf8 miscellaneous symbols unicode 2600-26FF *)
+ false
+ | _ ->
+ (* default to iso 8859-1 "â" *)
+ true)
+ (* iso 8859-1 accentuated letters *)
+ | '\192'..'\214' | '\216'..'\246' | '\248'..'\255' -> true
+ | _ -> false
+ else
+ bad_token str
(* Adding a new token (keyword or special token). *)
let add_token (con, str) = match con with
| "" ->
- let normal_token =
- if String.length str > 0 then
- match str.[0] with
- | ' ' | '\n' | '\r' | '\t' | '0'..'9' | '"' -> bad_token str
- | '_' | '$' | 'a'..'z' | 'A'..'Z' -> true
- | _ -> false
- else
- true
- in
- if normal_token then add_keyword str else add_special_token str
+ if is_normal_token str then add_keyword str else add_special_token str
| "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI"
-> ()
| _ ->
@@ -178,8 +195,13 @@ let get_buff len = String.sub !buff 0 len
(* The classical lexer: idents, numbers, quoted strings, comments *)
let rec ident len = parser
- | [< ' ('a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246'
- |'\248'..'\255' | '0'..'9' | ''' | '_' | '@' as c); s >] ->
+ | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' | '@' as c); s >] ->
+ ident (store len c) s
+ (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
+ | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2) ; s >] ->
+ ident (store (store len c1) c2) s
+ (* iso 8859-1 accentuated letters *)
+ | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); s >] ->
ident (store len c) s
| [< >] -> len
@@ -247,19 +269,25 @@ let rec comment bp = parser
(* Parse a special token, using the [token_tree] *)
+let progress_special c = function
+ | None -> None
+ | Some tt -> try Some (CharMap.find c tt.branch) with Not_found -> None
+
+let rec special tt cs = match tt with
+ | None -> None
+ | Some tt ->
+ match
+ match Stream.peek cs with
+ | Some c ->
+ (try Some (CharMap.find c tt.branch) with Not_found -> None)
+ | None -> None
+ with
+ | Some _ as tt' -> Stream.junk cs; special tt' cs
+ | None -> tt.node
+
let process_chars bp c cs =
- let rec proc_rec tt =
- match
- match Stream.peek cs with
- | Some c ->
- (try Some (CharMap.find c tt.branch) with Not_found -> None)
- | None -> None
- with
- | Some tt' -> Stream.junk cs; proc_rec tt'
- | None -> tt.node
- in
let t =
- try proc_rec (CharMap.find c !token_tree.branch)
+ try special (Some (CharMap.find c !token_tree.branch)) cs
with Not_found -> !token_tree.node
in
let ep = Stream.count cs in
@@ -267,6 +295,29 @@ let process_chars bp c cs =
| Some t -> (("", t), (bp, ep))
| None -> err (bp, ep) Undefined_token
+type token_226_tail =
+ | TokSymbol of string option
+ | TokIdent of string
+
+let parse_226_tail tk = parser
+ | [< ''\132' as c2; ' ('\128'..'\191' as c3);
+ (* utf8 letter-like unicode 2100-214F *)
+ len = ident (store (store (store 0 '\226') c2) c3) >] ->
+ TokIdent (get_buff len)
+ | [< ''\133' as c2; ' ('\128'..'\143' as c3);
+ (* utf8 letter-like unicode 2100-214F *)
+ len = ident (store (store (store 0 '\226') c2) c3) >] ->
+ TokIdent (get_buff len)
+ | [< ' ('\136'..'\143' | '\152'..'\155' as c2); 'c3;
+ (* utf8 mathematical operators unicode 2200-22FF *)
+ (* utf8 miscellaneous technical unicode 2300-23FF *)
+ (* utf8 miscellaneous symbols unicode 2600-26FF *)
+ t = special (progress_special c3 (progress_special c2
+ (progress_special '\226' tk))) >] ->
+ TokSymbol t
+ | [< len = ident (store 0 '\226') >] ->
+ TokIdent (get_buff len)
+
(* Parse a token in a char stream *)
let rec next_token = parser bp
@@ -281,16 +332,48 @@ let rec next_token = parser bp
| [< ''$'; len = ident (store 0 '$') >] ep ->
(("METAIDENT", get_buff len), (bp,ep))
| [< ''.' as c; t = parser
- | [< ' ('_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214'
- | '\216'..'\246' | '\248'..'\255' as c);
- len = ident (store 0 c) >] -> current:=BeVernac ""; ("FIELD", get_buff len)
+ | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); len = ident (store 0 c) >] ->
+ ("FIELD", get_buff len)
+ (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
+ | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2);
+ len = ident (store (store 0 c1) c2) >] ->
+ ("FIELD", get_buff len)
+ (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
+ | [< ''\226' as c1; t = parse_226_tail
+ (progress_special '.' (Some !token_tree)) >] ep ->
+ (match t with
+ | TokSymbol (Some t) -> ("", t)
+ | TokSymbol None -> err (bp, ep) Undefined_token
+ | TokIdent t -> ("FIELD", t))
+ (* iso 8859-1 accentuated letters *)
+ | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c);
+ len = ident (store 0 c) >] ->
+ ("FIELD", get_buff len)
(*
| [< >] -> ("", ".") >] ep -> (t, (bp,ep))
*)
- | [< (t,_) = process_chars bp c >] -> t >] ep -> current:=BeVernac ""; (t, (bp,ep))
- | [< ' ('_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214'
- | '\216'..'\246' | '\248'..'\255' as c);
- len = ident (store 0 c) >] ep ->
+ | [< (t,_) = process_chars bp c >] -> t >] ep ->
+ current:=BeVernac ""; (t, (bp,ep))
+ | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); len = ident (store 0 c) >] ep ->
+ let id = get_buff len in current:=InVernac;
+ (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
+ (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
+ | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2);
+ len = ident (store (store 0 c1) c2) >] ep ->
+ let id = get_buff len in current:=InVernac;
+ (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
+ (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
+ | [< ''\226' as c1; t = parse_226_tail (Some !token_tree) >] ep ->
+ (match t with
+ | TokSymbol (Some t) -> ("", t), (bp, ep)
+ | TokSymbol None -> err (bp, ep) Undefined_token
+ | TokIdent id ->
+ current:=InVernac;
+ (try ("", find_keyword id) with Not_found -> ("IDENT", id)),
+ (bp, ep))
+ (* iso 8859-1 accentuated letters *)
+ | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c);
+ len = ident (store 0 c) >] ep ->
let id = get_buff len in current:=InVernac;
(try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
| [< ' ('0'..'9' as c); len = number (store 0 c) >] ep ->
diff --git a/parsing/lexer.mli b/parsing/lexer.mli
index b722e8b0c9..b36f1ec6e1 100644
--- a/parsing/lexer.mli
+++ b/parsing/lexer.mli
@@ -24,6 +24,11 @@ val is_keyword : string -> bool
val func : char Stream.t -> (string * string) Stream.t * (int -> int * int)
+val check_ident : string -> unit
+val check_special_token : string -> unit
+
+val is_normal_token : string -> bool
+
val add_token : Token.pattern -> unit
val tparse : string * string -> ((string * string) Stream.t -> string) option
diff --git a/toplevel/metasyntax.ml b/toplevel/metasyntax.ml
index d41ce28036..fbbae0a1b6 100644
--- a/toplevel/metasyntax.ml
+++ b/toplevel/metasyntax.ml
@@ -391,15 +391,13 @@ let quote x =
else
x
-let is_symbol = function String s -> not (is_letter s.[0]) | _ -> false
-
let rec find_symbols c_current c_next c_last vars = function
| [] -> (vars, [])
- | String x :: sl when is_letter x.[0] ->
+ | String x :: sl when Lexer.is_normal_token x ->
+ Lexer.check_ident x;
let id = Names.id_of_string x in
if List.mem_assoc id vars then
error ("Variable "^x^" occurs more than once");
-(* let prec = if List.exists is_symbol sl then c_current else c_last in*)
let prec = if sl <> [] then c_current else c_last in
let (vars,l) = find_symbols c_next c_next c_last vars sl in
((id,prec)::vars, NonTerminal id :: l)
@@ -413,6 +411,7 @@ let rec find_symbols c_current c_next c_last vars = function
(vars, NonTerminal (prec, meta) :: l)
*)
| String s :: sl ->
+ Lexer.check_special_token s;
let (vars,l) = find_symbols c_next c_next c_last vars sl in
(vars, Terminal (strip s) :: l)
| WhiteSpace n :: sl ->