aboutsummaryrefslogtreecommitdiff
path: root/parsing/lexer.ml4
diff options
context:
space:
mode:
Diffstat (limited to 'parsing/lexer.ml4')
-rw-r--r--parsing/lexer.ml4147
1 files changed, 115 insertions, 32 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 2d84593bc2..362a066f58 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -77,8 +77,11 @@ let check_special_token str =
let check_ident str =
let rec loop = parser
- | [< ''$' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246'
- | '\248'..'\255' | '0'..'9' | ''' | '_'; s >] -> loop s
+ | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] -> loop s
+ (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
+ | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop s
+ (* iso 8859-1 accentuated letters *)
+ | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255'); s >] -> loop s
| [< _ = Stream.empty >] -> ()
| [< >] -> bad_token str
in
@@ -103,20 +106,34 @@ let add_keyword str =
let is_keyword s =
try let _ = ttree_find !keywords s in true with Not_found -> false
+let is_normal_token str =
+ if String.length str > 0 then
+ match str.[0] with
+ | ' ' | '\n' | '\r' | '\t' | '0'..'9' | '"' -> bad_token str
+ | '_' | '$' | 'a'..'z' | 'A'..'Z' -> true
+ (* utf-8 symbols of the form "E2 xx xx" [E2=226] *)
+ | '\226' when String.length str > 2 ->
+ (match str.[1], str.[2] with
+ | ('\132', '\128'..'\191') | ('\133', '\128'..'\143') ->
+ (* utf8 letter-like unicode 2100-214F *) true
+ | (('\136'..'\143' | '\152'..'\155'),_) ->
+ (* utf8 mathematical operators unicode 2200-22FF *)
+ (* utf8 miscellaneous technical unicode 2300-23FF *)
+ (* utf8 miscellaneous symbols unicode 2600-26FF *)
+ false
+ | _ ->
+ (* default to iso 8859-1 "â" *)
+ true)
+ (* iso 8859-1 accentuated letters *)
+ | '\192'..'\214' | '\216'..'\246' | '\248'..'\255' -> true
+ | _ -> false
+ else
+ bad_token str
(* Adding a new token (keyword or special token). *)
let add_token (con, str) = match con with
| "" ->
- let normal_token =
- if String.length str > 0 then
- match str.[0] with
- | ' ' | '\n' | '\r' | '\t' | '0'..'9' | '"' -> bad_token str
- | '_' | '$' | 'a'..'z' | 'A'..'Z' -> true
- | _ -> false
- else
- true
- in
- if normal_token then add_keyword str else add_special_token str
+ if is_normal_token str then add_keyword str else add_special_token str
| "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI"
-> ()
| _ ->
@@ -178,8 +195,13 @@ let get_buff len = String.sub !buff 0 len
(* The classical lexer: idents, numbers, quoted strings, comments *)
let rec ident len = parser
- | [< ' ('a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246'
- |'\248'..'\255' | '0'..'9' | ''' | '_' | '@' as c); s >] ->
+ | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' | '@' as c); s >] ->
+ ident (store len c) s
+ (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
+ | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2) ; s >] ->
+ ident (store (store len c1) c2) s
+ (* iso 8859-1 accentuated letters *)
+ | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); s >] ->
ident (store len c) s
| [< >] -> len
@@ -247,19 +269,25 @@ let rec comment bp = parser
(* Parse a special token, using the [token_tree] *)
+let progress_special c = function
+ | None -> None
+ | Some tt -> try Some (CharMap.find c tt.branch) with Not_found -> None
+
+let rec special tt cs = match tt with
+ | None -> None
+ | Some tt ->
+ match
+ match Stream.peek cs with
+ | Some c ->
+ (try Some (CharMap.find c tt.branch) with Not_found -> None)
+ | None -> None
+ with
+ | Some _ as tt' -> Stream.junk cs; special tt' cs
+ | None -> tt.node
+
let process_chars bp c cs =
- let rec proc_rec tt =
- match
- match Stream.peek cs with
- | Some c ->
- (try Some (CharMap.find c tt.branch) with Not_found -> None)
- | None -> None
- with
- | Some tt' -> Stream.junk cs; proc_rec tt'
- | None -> tt.node
- in
let t =
- try proc_rec (CharMap.find c !token_tree.branch)
+ try special (Some (CharMap.find c !token_tree.branch)) cs
with Not_found -> !token_tree.node
in
let ep = Stream.count cs in
@@ -267,6 +295,29 @@ let process_chars bp c cs =
| Some t -> (("", t), (bp, ep))
| None -> err (bp, ep) Undefined_token
+type token_226_tail =
+ | TokSymbol of string option
+ | TokIdent of string
+
+let parse_226_tail tk = parser
+ | [< ''\132' as c2; ' ('\128'..'\191' as c3);
+ (* utf8 letter-like unicode 2100-214F *)
+ len = ident (store (store (store 0 '\226') c2) c3) >] ->
+ TokIdent (get_buff len)
+ | [< ''\133' as c2; ' ('\128'..'\143' as c3);
+ (* utf8 letter-like unicode 2100-214F *)
+ len = ident (store (store (store 0 '\226') c2) c3) >] ->
+ TokIdent (get_buff len)
+ | [< ' ('\136'..'\143' | '\152'..'\155' as c2); 'c3;
+ (* utf8 mathematical operators unicode 2200-22FF *)
+ (* utf8 miscellaneous technical unicode 2300-23FF *)
+ (* utf8 miscellaneous symbols unicode 2600-26FF *)
+ t = special (progress_special c3 (progress_special c2
+ (progress_special '\226' tk))) >] ->
+ TokSymbol t
+ | [< len = ident (store 0 '\226') >] ->
+ TokIdent (get_buff len)
+
(* Parse a token in a char stream *)
let rec next_token = parser bp
@@ -281,16 +332,48 @@ let rec next_token = parser bp
| [< ''$'; len = ident (store 0 '$') >] ep ->
(("METAIDENT", get_buff len), (bp,ep))
| [< ''.' as c; t = parser
- | [< ' ('_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214'
- | '\216'..'\246' | '\248'..'\255' as c);
- len = ident (store 0 c) >] -> current:=BeVernac ""; ("FIELD", get_buff len)
+ | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); len = ident (store 0 c) >] ->
+ ("FIELD", get_buff len)
+ (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
+ | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2);
+ len = ident (store (store 0 c1) c2) >] ->
+ ("FIELD", get_buff len)
+ (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
+ | [< ''\226' as c1; t = parse_226_tail
+ (progress_special '.' (Some !token_tree)) >] ep ->
+ (match t with
+ | TokSymbol (Some t) -> ("", t)
+ | TokSymbol None -> err (bp, ep) Undefined_token
+ | TokIdent t -> ("FIELD", t))
+ (* iso 8859-1 accentuated letters *)
+ | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c);
+ len = ident (store 0 c) >] ->
+ ("FIELD", get_buff len)
(*
| [< >] -> ("", ".") >] ep -> (t, (bp,ep))
*)
- | [< (t,_) = process_chars bp c >] -> t >] ep -> current:=BeVernac ""; (t, (bp,ep))
- | [< ' ('_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214'
- | '\216'..'\246' | '\248'..'\255' as c);
- len = ident (store 0 c) >] ep ->
+ | [< (t,_) = process_chars bp c >] -> t >] ep ->
+ current:=BeVernac ""; (t, (bp,ep))
+ | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); len = ident (store 0 c) >] ep ->
+ let id = get_buff len in current:=InVernac;
+ (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
+ (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
+ | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2);
+ len = ident (store (store 0 c1) c2) >] ep ->
+ let id = get_buff len in current:=InVernac;
+ (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
+ (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
+ | [< ''\226' as c1; t = parse_226_tail (Some !token_tree) >] ep ->
+ (match t with
+ | TokSymbol (Some t) -> ("", t), (bp, ep)
+ | TokSymbol None -> err (bp, ep) Undefined_token
+ | TokIdent id ->
+ current:=InVernac;
+ (try ("", find_keyword id) with Not_found -> ("IDENT", id)),
+ (bp, ep))
+ (* iso 8859-1 accentuated letters *)
+ | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c);
+ len = ident (store 0 c) >] ep ->
let id = get_buff len in current:=InVernac;
(try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
| [< ' ('0'..'9' as c); len = number (store 0 c) >] ep ->