From a66b57ba4bba866bb626bde2b6fe3b762347eb3e Mon Sep 17 00:00:00 2001 From: Pierre-Marie Pédrot Date: Mon, 9 May 2016 16:33:21 +0200 Subject: Rename Lexer -> CLexer. --- .gitignore | 2 +- dev/printers.mllib | 2 +- grammar/argextend.ml4 | 2 +- grammar/grammar.mllib | 2 +- parsing/cLexer.ml4 | 695 +++++++++++++++++++++++++++++++++++++++++++++++++ parsing/cLexer.mli | 38 +++ parsing/compat.ml4 | 2 +- parsing/egramcoq.ml | 6 +- parsing/g_constr.ml4 | 2 +- parsing/g_prim.ml4 | 2 +- parsing/g_tactic.ml4 | 2 +- parsing/g_vernac.ml4 | 2 +- parsing/lexer.ml4 | 695 ------------------------------------------------- parsing/lexer.mli | 38 --- parsing/parsing.mllib | 2 +- parsing/pcoq.ml4 | 4 +- tactics/tacinterp.ml | 2 +- toplevel/cerrors.ml | 2 +- toplevel/coqloop.ml | 2 +- toplevel/coqtop.ml | 4 +- toplevel/metasyntax.ml | 14 +- toplevel/vernac.ml | 8 +- 22 files changed, 764 insertions(+), 764 deletions(-) create mode 100644 parsing/cLexer.ml4 create mode 100644 parsing/cLexer.mli delete mode 100644 parsing/lexer.ml4 delete mode 100644 parsing/lexer.mli diff --git a/.gitignore b/.gitignore index 3adb9c67c1..6d843661ff 100644 --- a/.gitignore +++ b/.gitignore @@ -127,7 +127,7 @@ grammar/tacextend.ml grammar/vernacextend.ml grammar/argextend.ml parsing/pcoq.ml -parsing/lexer.ml +parsing/cLexer.ml plugins/setoid_ring/newring.ml plugins/field/field.ml plugins/nsatz/nsatz.ml diff --git a/dev/printers.mllib b/dev/printers.mllib index ad9a5d75e6..e054c1bc70 100644 --- a/dev/printers.mllib +++ b/dev/printers.mllib @@ -151,7 +151,7 @@ States Genprint Tok -Lexer +CLexer Ppextend Pputils Ppannotation diff --git a/grammar/argextend.ml4 b/grammar/argextend.ml4 index cb0f7d2d31..b3c71a703c 100644 --- a/grammar/argextend.ml4 +++ b/grammar/argextend.ml4 @@ -287,7 +287,7 @@ EXTEND GramNonTerminal (!@loc, t, g, Some (Names.Id.of_string s)) | s = STRING -> if String.length s > 0 && Util.is_letter s.[0] then - Lexer.add_keyword s; + CLexer.add_keyword s; GramTerminal s ] ] ; diff --git a/grammar/grammar.mllib b/grammar/grammar.mllib index 71e5b8ae2c..1864fae5fc 100644 --- a/grammar/grammar.mllib +++ b/grammar/grammar.mllib @@ -50,7 +50,7 @@ Constrexpr_ops Compat Tok -Lexer +CLexer Pcoq G_prim G_tactic diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4 new file mode 100644 index 0000000000..5d96873f31 --- /dev/null +++ b/parsing/cLexer.ml4 @@ -0,0 +1,695 @@ +(************************************************************************) +(* v * The Coq Proof Assistant / The Coq Development Team *) +(* char -> int = compare end +module CharMap = Map.Make (CharOrd) + +type ttree = { + node : string option; + branch : ttree CharMap.t } + +let empty_ttree = { node = None; branch = CharMap.empty } + +let ttree_add ttree str = + let rec insert tt i = + if i == String.length str then + {node = Some str; branch = tt.branch} + else + let c = str.[i] in + let br = + match try Some (CharMap.find c tt.branch) with Not_found -> None with + | Some tt' -> + CharMap.add c (insert tt' (i + 1)) (CharMap.remove c tt.branch) + | None -> + let tt' = {node = None; branch = CharMap.empty} in + CharMap.add c (insert tt' (i + 1)) tt.branch + in + { node = tt.node; branch = br } + in + insert ttree 0 + +(* Search a string in a dictionary: raises [Not_found] + if the word is not present. *) + +let ttree_find ttree str = + let rec proc_rec tt i = + if i == String.length str then tt + else proc_rec (CharMap.find str.[i] tt.branch) (i+1) + in + proc_rec ttree 0 + +(* Removes a string from a dictionary: returns an equal dictionary + if the word not present. *) +let ttree_remove ttree str = + let rec remove tt i = + if i == String.length str then + {node = None; branch = tt.branch} + else + let c = str.[i] in + let br = + match try Some (CharMap.find c tt.branch) with Not_found -> None with + | Some tt' -> + CharMap.add c (remove tt' (i + 1)) (CharMap.remove c tt.branch) + | None -> tt.branch + in + { node = tt.node; branch = br } + in + remove ttree 0 + + +(* Errors occurring while lexing (explained as "Lexer error: ...") *) + +module Error = struct + + type t = + | Illegal_character + | Unterminated_comment + | Unterminated_string + | Undefined_token + | Bad_token of string + | UnsupportedUnicode of int + + exception E of t + + let to_string x = + "Syntax Error: Lexer: " ^ + (match x with + | Illegal_character -> "Illegal character" + | Unterminated_comment -> "Unterminated comment" + | Unterminated_string -> "Unterminated string" + | Undefined_token -> "Undefined token" + | Bad_token tok -> Format.sprintf "Bad token %S" tok + | UnsupportedUnicode x -> + Printf.sprintf "Unsupported Unicode character (0x%x)" x) + + (* Require to fix the Camlp4 signature *) + let print ppf x = Pp.pp_with ppf (Pp.str (to_string x)) + +end +open Error + +let err loc str = Loc.raise (Loc.make_loc loc) (Error.E str) + +let bad_token str = raise (Error.E (Bad_token str)) + +(* Lexer conventions on tokens *) + +type token_kind = + | Utf8Token of (Unicode.status * int) + | AsciiChar + | EmptyStream + +let error_unsupported_unicode_character n unicode cs = + let bp = Stream.count cs in + err (bp,bp+n) (UnsupportedUnicode unicode) + +let error_utf8 cs = + let bp = Stream.count cs in + Stream.junk cs; (* consume the char to avoid read it and fail again *) + err (bp, bp+1) Illegal_character + +let utf8_char_size cs = function + (* Utf8 leading byte *) + | '\x00'..'\x7F' -> 1 + | '\xC0'..'\xDF' -> 2 + | '\xE0'..'\xEF' -> 3 + | '\xF0'..'\xF7' -> 4 + | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) -> error_utf8 cs + +let njunk n = Util.repeat n Stream.junk + +let check_utf8_trailing_byte cs c = + if not (Int.equal (Char.code c land 0xC0) 0x80) then error_utf8 cs + +(* Recognize utf8 blocks (of length less than 4 bytes) *) +(* but don't certify full utf8 compliance (e.g. no emptyness check) *) +let lookup_utf8_tail c cs = + let c1 = Char.code c in + if Int.equal (c1 land 0x40) 0 || Int.equal (c1 land 0x38) 0x38 then error_utf8 cs + else + let n, unicode = + if Int.equal (c1 land 0x20) 0 then + match Stream.npeek 2 cs with + | [_;c2] -> + check_utf8_trailing_byte cs c2; + 2, (c1 land 0x1F) lsl 6 + (Char.code c2 land 0x3F) + | _ -> error_utf8 cs + else if Int.equal (c1 land 0x10) 0 then + match Stream.npeek 3 cs with + | [_;c2;c3] -> + check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3; + 3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 + + (Char.code c3 land 0x3F) + | _ -> error_utf8 cs + else match Stream.npeek 4 cs with + | [_;c2;c3;c4] -> + check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3; + check_utf8_trailing_byte cs c4; + 4, (c1 land 0x07) lsl 18 + (Char.code c2 land 0x3F) lsl 12 + + (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F) + | _ -> error_utf8 cs + in + try Unicode.classify unicode, n + with Unicode.Unsupported -> + njunk n cs; error_unsupported_unicode_character n unicode cs + +let lookup_utf8 cs = + match Stream.peek cs with + | Some ('\x00'..'\x7F') -> AsciiChar + | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs) + | None -> EmptyStream + +let unlocated f x = f x + (** FIXME: should we still unloc the exception? *) +(* try f x with Loc.Exc_located (_, exc) -> raise exc *) + +let check_keyword str = + let rec loop_symb = parser + | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str + | [< s >] -> + match unlocated lookup_utf8 s with + | Utf8Token (_,n) -> njunk n s; loop_symb s + | AsciiChar -> Stream.junk s; loop_symb s + | EmptyStream -> () + in + loop_symb (Stream.of_string str) + +let check_keyword_to_add s = + try check_keyword s + with Error.E (UnsupportedUnicode unicode) -> + Flags.if_verbose msg_warning + (strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x \ + which will not be parsable." s unicode)) + +let check_ident str = + let rec loop_id intail = parser + | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] -> + loop_id true s + | [< ' ('0'..'9' | ''') when intail; s >] -> + loop_id true s + | [< s >] -> + match unlocated lookup_utf8 s with + | Utf8Token (Unicode.Letter, n) -> njunk n s; loop_id true s + | Utf8Token (Unicode.IdentPart, n) when intail -> + njunk n s; + loop_id true s + | EmptyStream -> () + | Utf8Token _ | AsciiChar -> bad_token str + in + loop_id false (Stream.of_string str) + +let is_ident str = + try let _ = check_ident str in true with Error.E _ -> false + +(* Keyword and symbol dictionary *) +let token_tree = ref empty_ttree + +let is_keyword s = + try match (ttree_find !token_tree s).node with None -> false | Some _ -> true + with Not_found -> false + +let add_keyword str = + if not (is_keyword str) then + begin + check_keyword_to_add str; + token_tree := ttree_add !token_tree str + end + +let remove_keyword str = + token_tree := ttree_remove !token_tree str + +(* Freeze and unfreeze the state of the lexer *) +type frozen_t = ttree + +let freeze () = !token_tree +let unfreeze tt = (token_tree := tt) + +(* The string buffering machinery *) + +let buff = ref (String.create 80) + +let store len x = + if len >= String.length !buff then + buff := !buff ^ String.create (String.length !buff); + !buff.[len] <- x; + succ len + +let rec nstore n len cs = + if n>0 then nstore (n-1) (store len (Stream.next cs)) cs else len + +let get_buff len = String.sub !buff 0 len + +(* The classical lexer: idents, numbers, quoted strings, comments *) + +let rec ident_tail len = parser + | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] -> + ident_tail (store len c) s + | [< s >] -> + match lookup_utf8 s with + | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) -> + ident_tail (nstore n len s) s + | _ -> len + +let rec number len = parser + | [< ' ('0'..'9' as c); s >] -> number (store len c) s + | [< >] -> len + +let rec string in_comments bp len = parser + | [< ''"'; esc=(parser [<''"' >] -> true | [< >] -> false); s >] -> + if esc then string in_comments bp (store len '"') s else len + | [< ''('; s >] -> + (parser + | [< ''*'; s >] -> + string + (Option.map succ in_comments) + bp (store (store len '(') '*') + s + | [< >] -> + string in_comments bp (store len '(') s) s + | [< ''*'; s >] -> + (parser + | [< '')'; s >] -> + let () = match in_comments with + | Some 0 -> + msg_warning + (strbrk + "Not interpreting \"*)\" as the end of current \ + non-terminated comment because it occurs in a \ + non-terminated string of the comment.") + | _ -> () + in + let in_comments = Option.map pred in_comments in + string in_comments bp (store (store len '*') ')') s + | [< >] -> + string in_comments bp (store len '*') s) s + | [< 'c; s >] -> string in_comments bp (store len c) s + | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string + +(* Hook for exporting comment into xml theory files *) +let (f_xml_output_comment, xml_output_comment) = Hook.make ~default:ignore () + +(* Utilities for comments in beautify *) +let comment_begin = ref None +let comm_loc bp = match !comment_begin with +| None -> comment_begin := Some bp +| _ -> () + +let current = Buffer.create 8192 +let between_com = ref true + +type com_state = int option * string * bool +let restore_com_state (o,s,b) = + comment_begin := o; + Buffer.clear current; Buffer.add_string current s; + between_com := b +let dflt_com = (None,"",true) +let com_state () = + let s = (!comment_begin, Buffer.contents current, !between_com) in + restore_com_state dflt_com; s + +let real_push_char c = Buffer.add_char current c + +(* Add a char if it is between two commands, if it is a newline or + if the last char is not a space itself. *) +let push_char c = + if + !between_com || List.mem c ['\n';'\r'] || + (List.mem c [' ';'\t']&& + (Int.equal (Buffer.length current) 0 || + not (let s = Buffer.contents current in + List.mem s.[String.length s - 1] [' ';'\t';'\n';'\r']))) + then + real_push_char c + +let push_string s = Buffer.add_string current s + +let null_comment s = + let rec null i = + i<0 || (List.mem s.[i] [' ';'\t';'\n';'\r'] && null (i-1)) in + null (String.length s - 1) + +let comment_stop ep = + let current_s = Buffer.contents current in + if !Flags.xml_export && Buffer.length current > 0 && + (!between_com || not(null_comment current_s)) then + Hook.get f_xml_output_comment current_s; + (if Flags.do_beautify() && Buffer.length current > 0 && + (!between_com || not(null_comment current_s)) then + let bp = match !comment_begin with + Some bp -> bp + | None -> + msgerrnl(str "No begin location for comment '" + ++ str current_s ++str"' ending at " + ++ int ep); + ep-1 in + Pp.comments := ((bp,ep),current_s) :: !Pp.comments); + Buffer.clear current; + comment_begin := None; + between_com := false + +(* Does not unescape!!! *) +let rec comm_string bp = parser + | [< ''"' >] -> push_string "\"" + | [< ''\\'; _ = + (parser [< ' ('"' | '\\' as c) >] -> + let () = match c with + | '"' -> real_push_char c + | _ -> () + in + real_push_char c + | [< >] -> real_push_char '\\'); s >] + -> comm_string bp s + | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string + | [< 'c; s >] -> real_push_char c; comm_string bp s + +let rec comment bp = parser bp2 + | [< ''('; + _ = (parser + | [< ''*'; s >] -> push_string "(*"; comment bp s + | [< >] -> push_string "(" ); + s >] -> comment bp s + | [< ''*'; + _ = parser + | [< '')' >] -> push_string "*)"; + | [< s >] -> real_push_char '*'; comment bp s >] -> () + | [< ''"'; s >] -> + if Flags.do_beautify() then (push_string"\"";comm_string bp2 s) + else ignore (string (Some 0) bp2 0 s); + comment bp s + | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment + | [< 'z; s >] -> real_push_char z; comment bp s + +(* Parse a special token, using the [token_tree] *) + +(* Peek as much utf-8 lexemes as possible *) +(* and retain the longest valid special token obtained *) +let rec progress_further last nj tt cs = + try progress_from_byte last nj tt cs (List.nth (Stream.npeek (nj+1) cs) nj) + with Failure _ -> last + +and update_longest_valid_token last nj tt cs = + match tt.node with + | Some _ as last' -> + stream_njunk nj cs; + progress_further last' 0 tt cs + | None -> + progress_further last nj tt cs + +(* nj is the number of char peeked since last valid token *) +(* n the number of char in utf8 block *) +and progress_utf8 last nj n c tt cs = + try + let tt = CharMap.find c tt.branch in + if Int.equal n 1 then + update_longest_valid_token last (nj+n) tt cs + else + match Util.List.skipn (nj+1) (Stream.npeek (nj+n) cs) with + | l when Int.equal (List.length l) (n - 1) -> + List.iter (check_utf8_trailing_byte cs) l; + let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in + update_longest_valid_token last (nj+n) tt cs + | _ -> + error_utf8 cs + with Not_found -> + last + +and progress_from_byte last nj tt cs c = + progress_utf8 last nj (utf8_char_size cs c) c tt cs + +let find_keyword id s = + let tt = ttree_find !token_tree id in + match progress_further tt.node 0 tt s with + | None -> raise Not_found + | Some c -> KEYWORD c + +let process_sequence bp c cs = + let rec aux n cs = + match Stream.peek cs with + | Some c' when c == c' -> Stream.junk cs; aux (n+1) cs + | _ -> BULLET (String.make n c), (bp, Stream.count cs) + in + aux 1 cs + +(* Must be a special token *) +let process_chars bp c cs = + let t = progress_from_byte None (-1) !token_tree cs c in + let ep = Stream.count cs in + match t with + | Some t -> (KEYWORD t, (bp, ep)) + | None -> + let ep' = bp + utf8_char_size cs c in + njunk (ep' - ep) cs; + err (bp, ep') Undefined_token + +let token_of_special c s = match c with + | '$' -> METAIDENT s + | '.' -> FIELD s + | _ -> assert false + +(* Parse what follows a dot / a dollar *) + +let parse_after_special c bp = + parser + | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail (store 0 d) >] -> + token_of_special c (get_buff len) + | [< s >] -> + match lookup_utf8 s with + | Utf8Token (Unicode.Letter, n) -> + token_of_special c (get_buff (ident_tail (nstore n 0 s) s)) + | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s) + +(* Parse what follows a question mark *) + +let parse_after_qmark bp s = + match Stream.peek s with + | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK + | None -> KEYWORD "?" + | _ -> + match lookup_utf8 s with + | Utf8Token (Unicode.Letter, _) -> LEFTQMARK + | AsciiChar | Utf8Token _ | EmptyStream -> + fst (process_chars bp '?' s) + +let blank_or_eof cs = + match Stream.peek cs with + | None -> true + | Some (' ' | '\t' | '\n' |'\r') -> true + | _ -> false + +(* Parse a token in a char stream *) + +let rec next_token = parser bp + | [< '' ' | '\t' | '\n' |'\r' as c; s >] -> + comm_loc bp; push_char c; next_token s + | [< ''$' as c; t = parse_after_special c bp >] ep -> + comment_stop bp; (t, (ep, bp)) + | [< ''.' as c; t = parse_after_special c bp; s >] ep -> + comment_stop bp; + (* We enforce that "." should either be part of a larger keyword, + for instance ".(", or followed by a blank or eof. *) + let () = match t with + | KEYWORD ("." | "...") -> + if not (blank_or_eof s) then err (bp,ep+1) Undefined_token; + between_com := true; + | _ -> () + in + (t, (bp,ep)) + | [< ' ('-'|'+'|'*' as c); s >] -> + let t,new_between_com = + if !between_com then process_sequence bp c s,true + else process_chars bp c s,false + in + comment_stop bp; between_com := new_between_com; t + | [< ''?'; s >] ep -> + let t = parse_after_qmark bp s in comment_stop bp; (t, (ep, bp)) + | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); + len = ident_tail (store 0 c); s >] ep -> + let id = get_buff len in + comment_stop bp; + (try find_keyword id s with Not_found -> IDENT id), (bp, ep) + | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> + comment_stop bp; + (INT (get_buff len), (bp, ep)) + | [< ''\"'; len = string None bp 0 >] ep -> + comment_stop bp; + (STRING (get_buff len), (bp, ep)) + | [< ' ('(' as c); + t = parser + | [< ''*'; s >] -> + comm_loc bp; + push_string "(*"; + comment bp s; + next_token s + | [< t = process_chars bp c >] -> comment_stop bp; t >] -> + t + | [< s >] -> + match lookup_utf8 s with + | Utf8Token (Unicode.Letter, n) -> + let len = ident_tail (nstore n 0 s) s in + let id = get_buff len in + let ep = Stream.count s in + comment_stop bp; + (try find_keyword id s with Not_found -> IDENT id), (bp, ep) + | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart), _) -> + let t = process_chars bp (Stream.next s) s in + let new_between_com = match t with + (KEYWORD ("{"|"}"),_) -> !between_com | _ -> false in + comment_stop bp; between_com := new_between_com; t + | EmptyStream -> + comment_stop bp; (EOI, (bp, bp + 1)) + +(* (* Debug: uncomment this for tracing tokens seen by coq...*) +let next_token s = + let (t,(bp,ep)) = next_token s in Printf.eprintf "[%s]\n%!" (Tok.to_string t); + (t,(bp,ep)) +*) + +(* Location table system for creating tables associating a token count + to its location in a char stream (the source) *) + +let locerr () = invalid_arg "Lexer: location function" + +let loct_create () = Hashtbl.create 207 + +let loct_func loct i = + try Hashtbl.find loct i with Not_found -> locerr () + +let loct_add loct i loc = Hashtbl.add loct i loc + +let current_location_table = ref (loct_create ()) + +type location_table = (int, CompatLoc.t) Hashtbl.t +let location_table () = !current_location_table +let restore_location_table t = current_location_table := t + +(** {6 The lexer of Coq} *) + +(** Note: removing a token. + We do nothing because [remove_token] is called only when removing a grammar + rule with [Grammar.delete_rule]. The latter command is called only when + unfreezing the state of the grammar entries (see GRAMMAR summary, file + env/metasyntax.ml). Therefore, instead of removing tokens one by one, + we unfreeze the state of the lexer. This restores the behaviour of the + lexer. B.B. *) + +IFDEF CAMLP5 THEN + +type te = Tok.t + +(** Names of tokens, for this lexer, used in Grammar error messages *) + +let token_text = function + | ("", t) -> "'" ^ t ^ "'" + | ("IDENT", "") -> "identifier" + | ("IDENT", t) -> "'" ^ t ^ "'" + | ("INT", "") -> "integer" + | ("INT", s) -> "'" ^ s ^ "'" + | ("STRING", "") -> "string" + | ("EOI", "") -> "end of input" + | (con, "") -> con + | (con, prm) -> con ^ " \"" ^ prm ^ "\"" + +let func cs = + let loct = loct_create () in + let ts = + Stream.from + (fun i -> + let (tok, loc) = next_token cs in + loct_add loct i (make_loc loc); Some tok) + in + current_location_table := loct; + (ts, loct_func loct) + +let lexer = { + Token.tok_func = func; + Token.tok_using = + (fun pat -> match Tok.of_pattern pat with + | KEYWORD s -> add_keyword s + | _ -> ()); + Token.tok_removing = (fun _ -> ()); + Token.tok_match = Tok.match_pattern; + Token.tok_comm = None; + Token.tok_text = token_text } + +ELSE (* official camlp4 for ocaml >= 3.10 *) + +module M_ = Camlp4.ErrorHandler.Register (Error) + +module Loc = CompatLoc +module Token = struct + include Tok (* Cf. tok.ml *) + module Loc = CompatLoc + module Error = Camlp4.Struct.EmptyError + module Filter = struct + type token_filter = (Tok.t * Loc.t) Stream.t -> (Tok.t * Loc.t) Stream.t + type t = unit + let mk _is_kwd = () + let keyword_added () kwd _ = add_keyword kwd + let keyword_removed () _ = () + let filter () x = x + let define_filter () _ = () + end +end + +let mk () _init_loc(*FIXME*) cs = + let loct = loct_create () in + let rec self = + parser i + [< (tok, loc) = next_token; s >] -> + let loc = make_loc loc in + loct_add loct i loc; + [< '(tok, loc); self s >] + | [< >] -> [< >] + in current_location_table := loct; self cs + +END + +(** Terminal symbols interpretation *) + +let is_ident_not_keyword s = + is_ident s && not (is_keyword s) + +let is_number s = + let rec aux i = + Int.equal (String.length s) i || + match s.[i] with '0'..'9' -> aux (i+1) | _ -> false + in aux 0 + +let strip s = + let len = + let rec loop i len = + if Int.equal i (String.length s) then len + else if s.[i] == ' ' then loop (i + 1) len + else loop (i + 1) (len + 1) + in + loop 0 0 + in + if len == String.length s then s + else + let s' = String.create len in + let rec loop i i' = + if i == String.length s then s' + else if s.[i] == ' ' then loop (i + 1) i' + else begin s'.[i'] <- s.[i]; loop (i + 1) (i' + 1) end + in + loop 0 0 + +let terminal s = + let s = strip s in + let () = match s with "" -> Errors.error "empty token." | _ -> () in + if is_ident_not_keyword s then IDENT s + else if is_number s then INT s + else KEYWORD s diff --git a/parsing/cLexer.mli b/parsing/cLexer.mli new file mode 100644 index 0000000000..24b0ec8471 --- /dev/null +++ b/parsing/cLexer.mli @@ -0,0 +1,38 @@ +(************************************************************************) +(* v * The Coq Proof Assistant / The Coq Development Team *) +(* unit +val remove_keyword : string -> unit +val is_keyword : string -> bool + +(* val location_function : int -> Loc.t *) + +(** for coqdoc *) +type location_table +val location_table : unit -> location_table +val restore_location_table : location_table -> unit + +val check_ident : string -> unit +val is_ident : string -> bool +val check_keyword : string -> unit + +type frozen_t +val freeze : unit -> frozen_t +val unfreeze : frozen_t -> unit + +type com_state +val com_state: unit -> com_state +val restore_com_state: com_state -> unit + +val xml_output_comment : (string -> unit) Hook.t + +val terminal : string -> Tok.t + +(** The lexer of Coq: *) + +include Compat.LexerSig diff --git a/parsing/compat.ml4 b/parsing/compat.ml4 index 17038ab5fc..1481c31f45 100644 --- a/parsing/compat.ml4 +++ b/parsing/compat.ml4 @@ -135,7 +135,7 @@ let to_coq_position = function END -(** Signature of Lexer *) +(** Signature of CLexer *) IFDEF CAMLP5 THEN diff --git a/parsing/egramcoq.ml b/parsing/egramcoq.ml index b0bbdd8136..4c77433038 100644 --- a/parsing/egramcoq.ml +++ b/parsing/egramcoq.ml @@ -317,9 +317,9 @@ let recover_constr_grammar ntn prec = (* Summary functions: the state of the lexer is included in that of the parser. Because the grammar affects the set of keywords when adding or removing grammar rules. *) -type frozen_t = (int * all_grammar_command) list * Lexer.frozen_t +type frozen_t = (int * all_grammar_command) list * CLexer.frozen_t -let freeze _ : frozen_t = (!grammar_state, Lexer.freeze ()) +let freeze _ : frozen_t = (!grammar_state, CLexer.freeze ()) (* We compare the current state of the grammar and the state to unfreeze, by computing the longest common suffixes *) @@ -335,7 +335,7 @@ let unfreeze (grams, lex) = remove_grammars n; remove_levels n; grammar_state := common; - Lexer.unfreeze lex; + CLexer.unfreeze lex; List.iter extend_grammar (List.rev_map snd redo) (** No need to provide an init function : the grammar state is diff --git a/parsing/g_constr.ml4 b/parsing/g_constr.ml4 index 5edb7b8082..73b088b467 100644 --- a/parsing/g_constr.ml4 +++ b/parsing/g_constr.ml4 @@ -29,7 +29,7 @@ let constr_kw = "Prop"; "Set"; "Type"; ".("; "_"; ".."; "`{"; "`("; "{|"; "|}" ] -let _ = List.iter Lexer.add_keyword constr_kw +let _ = List.iter CLexer.add_keyword constr_kw let mk_cast = function (c,(_,None)) -> c diff --git a/parsing/g_prim.ml4 b/parsing/g_prim.ml4 index 5297c163bc..20fbccb2db 100644 --- a/parsing/g_prim.ml4 +++ b/parsing/g_prim.ml4 @@ -15,7 +15,7 @@ open Pcoq open Pcoq.Prim let prim_kw = ["{"; "}"; "["; "]"; "("; ")"; "'"] -let _ = List.iter Lexer.add_keyword prim_kw +let _ = List.iter CLexer.add_keyword prim_kw let local_make_qualid l id = make_qualid (DirPath.make l) id diff --git a/parsing/g_tactic.ml4 b/parsing/g_tactic.ml4 index 2a00a17640..8f3d04eeb6 100644 --- a/parsing/g_tactic.ml4 +++ b/parsing/g_tactic.ml4 @@ -25,7 +25,7 @@ open Pcoq let all_with delta = Redops.make_red_flag [FBeta;FIota;FZeta;delta] let tactic_kw = [ "->"; "<-" ; "by" ] -let _ = List.iter Lexer.add_keyword tactic_kw +let _ = List.iter CLexer.add_keyword tactic_kw let err () = raise Stream.Failure diff --git a/parsing/g_vernac.ml4 b/parsing/g_vernac.ml4 index f3766a7d79..b19ad8807e 100644 --- a/parsing/g_vernac.ml4 +++ b/parsing/g_vernac.ml4 @@ -27,7 +27,7 @@ open Pcoq.Vernac_ open Pcoq.Module let vernac_kw = [ ";"; ","; ">->"; ":<"; "<:"; "where"; "at" ] -let _ = List.iter Lexer.add_keyword vernac_kw +let _ = List.iter CLexer.add_keyword vernac_kw (* Rem: do not join the different GEXTEND into one, it breaks native *) (* compilation on PowerPC and Sun architectures *) diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 deleted file mode 100644 index 5d96873f31..0000000000 --- a/parsing/lexer.ml4 +++ /dev/null @@ -1,695 +0,0 @@ -(************************************************************************) -(* v * The Coq Proof Assistant / The Coq Development Team *) -(* char -> int = compare end -module CharMap = Map.Make (CharOrd) - -type ttree = { - node : string option; - branch : ttree CharMap.t } - -let empty_ttree = { node = None; branch = CharMap.empty } - -let ttree_add ttree str = - let rec insert tt i = - if i == String.length str then - {node = Some str; branch = tt.branch} - else - let c = str.[i] in - let br = - match try Some (CharMap.find c tt.branch) with Not_found -> None with - | Some tt' -> - CharMap.add c (insert tt' (i + 1)) (CharMap.remove c tt.branch) - | None -> - let tt' = {node = None; branch = CharMap.empty} in - CharMap.add c (insert tt' (i + 1)) tt.branch - in - { node = tt.node; branch = br } - in - insert ttree 0 - -(* Search a string in a dictionary: raises [Not_found] - if the word is not present. *) - -let ttree_find ttree str = - let rec proc_rec tt i = - if i == String.length str then tt - else proc_rec (CharMap.find str.[i] tt.branch) (i+1) - in - proc_rec ttree 0 - -(* Removes a string from a dictionary: returns an equal dictionary - if the word not present. *) -let ttree_remove ttree str = - let rec remove tt i = - if i == String.length str then - {node = None; branch = tt.branch} - else - let c = str.[i] in - let br = - match try Some (CharMap.find c tt.branch) with Not_found -> None with - | Some tt' -> - CharMap.add c (remove tt' (i + 1)) (CharMap.remove c tt.branch) - | None -> tt.branch - in - { node = tt.node; branch = br } - in - remove ttree 0 - - -(* Errors occurring while lexing (explained as "Lexer error: ...") *) - -module Error = struct - - type t = - | Illegal_character - | Unterminated_comment - | Unterminated_string - | Undefined_token - | Bad_token of string - | UnsupportedUnicode of int - - exception E of t - - let to_string x = - "Syntax Error: Lexer: " ^ - (match x with - | Illegal_character -> "Illegal character" - | Unterminated_comment -> "Unterminated comment" - | Unterminated_string -> "Unterminated string" - | Undefined_token -> "Undefined token" - | Bad_token tok -> Format.sprintf "Bad token %S" tok - | UnsupportedUnicode x -> - Printf.sprintf "Unsupported Unicode character (0x%x)" x) - - (* Require to fix the Camlp4 signature *) - let print ppf x = Pp.pp_with ppf (Pp.str (to_string x)) - -end -open Error - -let err loc str = Loc.raise (Loc.make_loc loc) (Error.E str) - -let bad_token str = raise (Error.E (Bad_token str)) - -(* Lexer conventions on tokens *) - -type token_kind = - | Utf8Token of (Unicode.status * int) - | AsciiChar - | EmptyStream - -let error_unsupported_unicode_character n unicode cs = - let bp = Stream.count cs in - err (bp,bp+n) (UnsupportedUnicode unicode) - -let error_utf8 cs = - let bp = Stream.count cs in - Stream.junk cs; (* consume the char to avoid read it and fail again *) - err (bp, bp+1) Illegal_character - -let utf8_char_size cs = function - (* Utf8 leading byte *) - | '\x00'..'\x7F' -> 1 - | '\xC0'..'\xDF' -> 2 - | '\xE0'..'\xEF' -> 3 - | '\xF0'..'\xF7' -> 4 - | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) -> error_utf8 cs - -let njunk n = Util.repeat n Stream.junk - -let check_utf8_trailing_byte cs c = - if not (Int.equal (Char.code c land 0xC0) 0x80) then error_utf8 cs - -(* Recognize utf8 blocks (of length less than 4 bytes) *) -(* but don't certify full utf8 compliance (e.g. no emptyness check) *) -let lookup_utf8_tail c cs = - let c1 = Char.code c in - if Int.equal (c1 land 0x40) 0 || Int.equal (c1 land 0x38) 0x38 then error_utf8 cs - else - let n, unicode = - if Int.equal (c1 land 0x20) 0 then - match Stream.npeek 2 cs with - | [_;c2] -> - check_utf8_trailing_byte cs c2; - 2, (c1 land 0x1F) lsl 6 + (Char.code c2 land 0x3F) - | _ -> error_utf8 cs - else if Int.equal (c1 land 0x10) 0 then - match Stream.npeek 3 cs with - | [_;c2;c3] -> - check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3; - 3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 + - (Char.code c3 land 0x3F) - | _ -> error_utf8 cs - else match Stream.npeek 4 cs with - | [_;c2;c3;c4] -> - check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3; - check_utf8_trailing_byte cs c4; - 4, (c1 land 0x07) lsl 18 + (Char.code c2 land 0x3F) lsl 12 + - (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F) - | _ -> error_utf8 cs - in - try Unicode.classify unicode, n - with Unicode.Unsupported -> - njunk n cs; error_unsupported_unicode_character n unicode cs - -let lookup_utf8 cs = - match Stream.peek cs with - | Some ('\x00'..'\x7F') -> AsciiChar - | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs) - | None -> EmptyStream - -let unlocated f x = f x - (** FIXME: should we still unloc the exception? *) -(* try f x with Loc.Exc_located (_, exc) -> raise exc *) - -let check_keyword str = - let rec loop_symb = parser - | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str - | [< s >] -> - match unlocated lookup_utf8 s with - | Utf8Token (_,n) -> njunk n s; loop_symb s - | AsciiChar -> Stream.junk s; loop_symb s - | EmptyStream -> () - in - loop_symb (Stream.of_string str) - -let check_keyword_to_add s = - try check_keyword s - with Error.E (UnsupportedUnicode unicode) -> - Flags.if_verbose msg_warning - (strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x \ - which will not be parsable." s unicode)) - -let check_ident str = - let rec loop_id intail = parser - | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] -> - loop_id true s - | [< ' ('0'..'9' | ''') when intail; s >] -> - loop_id true s - | [< s >] -> - match unlocated lookup_utf8 s with - | Utf8Token (Unicode.Letter, n) -> njunk n s; loop_id true s - | Utf8Token (Unicode.IdentPart, n) when intail -> - njunk n s; - loop_id true s - | EmptyStream -> () - | Utf8Token _ | AsciiChar -> bad_token str - in - loop_id false (Stream.of_string str) - -let is_ident str = - try let _ = check_ident str in true with Error.E _ -> false - -(* Keyword and symbol dictionary *) -let token_tree = ref empty_ttree - -let is_keyword s = - try match (ttree_find !token_tree s).node with None -> false | Some _ -> true - with Not_found -> false - -let add_keyword str = - if not (is_keyword str) then - begin - check_keyword_to_add str; - token_tree := ttree_add !token_tree str - end - -let remove_keyword str = - token_tree := ttree_remove !token_tree str - -(* Freeze and unfreeze the state of the lexer *) -type frozen_t = ttree - -let freeze () = !token_tree -let unfreeze tt = (token_tree := tt) - -(* The string buffering machinery *) - -let buff = ref (String.create 80) - -let store len x = - if len >= String.length !buff then - buff := !buff ^ String.create (String.length !buff); - !buff.[len] <- x; - succ len - -let rec nstore n len cs = - if n>0 then nstore (n-1) (store len (Stream.next cs)) cs else len - -let get_buff len = String.sub !buff 0 len - -(* The classical lexer: idents, numbers, quoted strings, comments *) - -let rec ident_tail len = parser - | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] -> - ident_tail (store len c) s - | [< s >] -> - match lookup_utf8 s with - | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) -> - ident_tail (nstore n len s) s - | _ -> len - -let rec number len = parser - | [< ' ('0'..'9' as c); s >] -> number (store len c) s - | [< >] -> len - -let rec string in_comments bp len = parser - | [< ''"'; esc=(parser [<''"' >] -> true | [< >] -> false); s >] -> - if esc then string in_comments bp (store len '"') s else len - | [< ''('; s >] -> - (parser - | [< ''*'; s >] -> - string - (Option.map succ in_comments) - bp (store (store len '(') '*') - s - | [< >] -> - string in_comments bp (store len '(') s) s - | [< ''*'; s >] -> - (parser - | [< '')'; s >] -> - let () = match in_comments with - | Some 0 -> - msg_warning - (strbrk - "Not interpreting \"*)\" as the end of current \ - non-terminated comment because it occurs in a \ - non-terminated string of the comment.") - | _ -> () - in - let in_comments = Option.map pred in_comments in - string in_comments bp (store (store len '*') ')') s - | [< >] -> - string in_comments bp (store len '*') s) s - | [< 'c; s >] -> string in_comments bp (store len c) s - | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string - -(* Hook for exporting comment into xml theory files *) -let (f_xml_output_comment, xml_output_comment) = Hook.make ~default:ignore () - -(* Utilities for comments in beautify *) -let comment_begin = ref None -let comm_loc bp = match !comment_begin with -| None -> comment_begin := Some bp -| _ -> () - -let current = Buffer.create 8192 -let between_com = ref true - -type com_state = int option * string * bool -let restore_com_state (o,s,b) = - comment_begin := o; - Buffer.clear current; Buffer.add_string current s; - between_com := b -let dflt_com = (None,"",true) -let com_state () = - let s = (!comment_begin, Buffer.contents current, !between_com) in - restore_com_state dflt_com; s - -let real_push_char c = Buffer.add_char current c - -(* Add a char if it is between two commands, if it is a newline or - if the last char is not a space itself. *) -let push_char c = - if - !between_com || List.mem c ['\n';'\r'] || - (List.mem c [' ';'\t']&& - (Int.equal (Buffer.length current) 0 || - not (let s = Buffer.contents current in - List.mem s.[String.length s - 1] [' ';'\t';'\n';'\r']))) - then - real_push_char c - -let push_string s = Buffer.add_string current s - -let null_comment s = - let rec null i = - i<0 || (List.mem s.[i] [' ';'\t';'\n';'\r'] && null (i-1)) in - null (String.length s - 1) - -let comment_stop ep = - let current_s = Buffer.contents current in - if !Flags.xml_export && Buffer.length current > 0 && - (!between_com || not(null_comment current_s)) then - Hook.get f_xml_output_comment current_s; - (if Flags.do_beautify() && Buffer.length current > 0 && - (!between_com || not(null_comment current_s)) then - let bp = match !comment_begin with - Some bp -> bp - | None -> - msgerrnl(str "No begin location for comment '" - ++ str current_s ++str"' ending at " - ++ int ep); - ep-1 in - Pp.comments := ((bp,ep),current_s) :: !Pp.comments); - Buffer.clear current; - comment_begin := None; - between_com := false - -(* Does not unescape!!! *) -let rec comm_string bp = parser - | [< ''"' >] -> push_string "\"" - | [< ''\\'; _ = - (parser [< ' ('"' | '\\' as c) >] -> - let () = match c with - | '"' -> real_push_char c - | _ -> () - in - real_push_char c - | [< >] -> real_push_char '\\'); s >] - -> comm_string bp s - | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string - | [< 'c; s >] -> real_push_char c; comm_string bp s - -let rec comment bp = parser bp2 - | [< ''('; - _ = (parser - | [< ''*'; s >] -> push_string "(*"; comment bp s - | [< >] -> push_string "(" ); - s >] -> comment bp s - | [< ''*'; - _ = parser - | [< '')' >] -> push_string "*)"; - | [< s >] -> real_push_char '*'; comment bp s >] -> () - | [< ''"'; s >] -> - if Flags.do_beautify() then (push_string"\"";comm_string bp2 s) - else ignore (string (Some 0) bp2 0 s); - comment bp s - | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment - | [< 'z; s >] -> real_push_char z; comment bp s - -(* Parse a special token, using the [token_tree] *) - -(* Peek as much utf-8 lexemes as possible *) -(* and retain the longest valid special token obtained *) -let rec progress_further last nj tt cs = - try progress_from_byte last nj tt cs (List.nth (Stream.npeek (nj+1) cs) nj) - with Failure _ -> last - -and update_longest_valid_token last nj tt cs = - match tt.node with - | Some _ as last' -> - stream_njunk nj cs; - progress_further last' 0 tt cs - | None -> - progress_further last nj tt cs - -(* nj is the number of char peeked since last valid token *) -(* n the number of char in utf8 block *) -and progress_utf8 last nj n c tt cs = - try - let tt = CharMap.find c tt.branch in - if Int.equal n 1 then - update_longest_valid_token last (nj+n) tt cs - else - match Util.List.skipn (nj+1) (Stream.npeek (nj+n) cs) with - | l when Int.equal (List.length l) (n - 1) -> - List.iter (check_utf8_trailing_byte cs) l; - let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in - update_longest_valid_token last (nj+n) tt cs - | _ -> - error_utf8 cs - with Not_found -> - last - -and progress_from_byte last nj tt cs c = - progress_utf8 last nj (utf8_char_size cs c) c tt cs - -let find_keyword id s = - let tt = ttree_find !token_tree id in - match progress_further tt.node 0 tt s with - | None -> raise Not_found - | Some c -> KEYWORD c - -let process_sequence bp c cs = - let rec aux n cs = - match Stream.peek cs with - | Some c' when c == c' -> Stream.junk cs; aux (n+1) cs - | _ -> BULLET (String.make n c), (bp, Stream.count cs) - in - aux 1 cs - -(* Must be a special token *) -let process_chars bp c cs = - let t = progress_from_byte None (-1) !token_tree cs c in - let ep = Stream.count cs in - match t with - | Some t -> (KEYWORD t, (bp, ep)) - | None -> - let ep' = bp + utf8_char_size cs c in - njunk (ep' - ep) cs; - err (bp, ep') Undefined_token - -let token_of_special c s = match c with - | '$' -> METAIDENT s - | '.' -> FIELD s - | _ -> assert false - -(* Parse what follows a dot / a dollar *) - -let parse_after_special c bp = - parser - | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail (store 0 d) >] -> - token_of_special c (get_buff len) - | [< s >] -> - match lookup_utf8 s with - | Utf8Token (Unicode.Letter, n) -> - token_of_special c (get_buff (ident_tail (nstore n 0 s) s)) - | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s) - -(* Parse what follows a question mark *) - -let parse_after_qmark bp s = - match Stream.peek s with - | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK - | None -> KEYWORD "?" - | _ -> - match lookup_utf8 s with - | Utf8Token (Unicode.Letter, _) -> LEFTQMARK - | AsciiChar | Utf8Token _ | EmptyStream -> - fst (process_chars bp '?' s) - -let blank_or_eof cs = - match Stream.peek cs with - | None -> true - | Some (' ' | '\t' | '\n' |'\r') -> true - | _ -> false - -(* Parse a token in a char stream *) - -let rec next_token = parser bp - | [< '' ' | '\t' | '\n' |'\r' as c; s >] -> - comm_loc bp; push_char c; next_token s - | [< ''$' as c; t = parse_after_special c bp >] ep -> - comment_stop bp; (t, (ep, bp)) - | [< ''.' as c; t = parse_after_special c bp; s >] ep -> - comment_stop bp; - (* We enforce that "." should either be part of a larger keyword, - for instance ".(", or followed by a blank or eof. *) - let () = match t with - | KEYWORD ("." | "...") -> - if not (blank_or_eof s) then err (bp,ep+1) Undefined_token; - between_com := true; - | _ -> () - in - (t, (bp,ep)) - | [< ' ('-'|'+'|'*' as c); s >] -> - let t,new_between_com = - if !between_com then process_sequence bp c s,true - else process_chars bp c s,false - in - comment_stop bp; between_com := new_between_com; t - | [< ''?'; s >] ep -> - let t = parse_after_qmark bp s in comment_stop bp; (t, (ep, bp)) - | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); - len = ident_tail (store 0 c); s >] ep -> - let id = get_buff len in - comment_stop bp; - (try find_keyword id s with Not_found -> IDENT id), (bp, ep) - | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> - comment_stop bp; - (INT (get_buff len), (bp, ep)) - | [< ''\"'; len = string None bp 0 >] ep -> - comment_stop bp; - (STRING (get_buff len), (bp, ep)) - | [< ' ('(' as c); - t = parser - | [< ''*'; s >] -> - comm_loc bp; - push_string "(*"; - comment bp s; - next_token s - | [< t = process_chars bp c >] -> comment_stop bp; t >] -> - t - | [< s >] -> - match lookup_utf8 s with - | Utf8Token (Unicode.Letter, n) -> - let len = ident_tail (nstore n 0 s) s in - let id = get_buff len in - let ep = Stream.count s in - comment_stop bp; - (try find_keyword id s with Not_found -> IDENT id), (bp, ep) - | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart), _) -> - let t = process_chars bp (Stream.next s) s in - let new_between_com = match t with - (KEYWORD ("{"|"}"),_) -> !between_com | _ -> false in - comment_stop bp; between_com := new_between_com; t - | EmptyStream -> - comment_stop bp; (EOI, (bp, bp + 1)) - -(* (* Debug: uncomment this for tracing tokens seen by coq...*) -let next_token s = - let (t,(bp,ep)) = next_token s in Printf.eprintf "[%s]\n%!" (Tok.to_string t); - (t,(bp,ep)) -*) - -(* Location table system for creating tables associating a token count - to its location in a char stream (the source) *) - -let locerr () = invalid_arg "Lexer: location function" - -let loct_create () = Hashtbl.create 207 - -let loct_func loct i = - try Hashtbl.find loct i with Not_found -> locerr () - -let loct_add loct i loc = Hashtbl.add loct i loc - -let current_location_table = ref (loct_create ()) - -type location_table = (int, CompatLoc.t) Hashtbl.t -let location_table () = !current_location_table -let restore_location_table t = current_location_table := t - -(** {6 The lexer of Coq} *) - -(** Note: removing a token. - We do nothing because [remove_token] is called only when removing a grammar - rule with [Grammar.delete_rule]. The latter command is called only when - unfreezing the state of the grammar entries (see GRAMMAR summary, file - env/metasyntax.ml). Therefore, instead of removing tokens one by one, - we unfreeze the state of the lexer. This restores the behaviour of the - lexer. B.B. *) - -IFDEF CAMLP5 THEN - -type te = Tok.t - -(** Names of tokens, for this lexer, used in Grammar error messages *) - -let token_text = function - | ("", t) -> "'" ^ t ^ "'" - | ("IDENT", "") -> "identifier" - | ("IDENT", t) -> "'" ^ t ^ "'" - | ("INT", "") -> "integer" - | ("INT", s) -> "'" ^ s ^ "'" - | ("STRING", "") -> "string" - | ("EOI", "") -> "end of input" - | (con, "") -> con - | (con, prm) -> con ^ " \"" ^ prm ^ "\"" - -let func cs = - let loct = loct_create () in - let ts = - Stream.from - (fun i -> - let (tok, loc) = next_token cs in - loct_add loct i (make_loc loc); Some tok) - in - current_location_table := loct; - (ts, loct_func loct) - -let lexer = { - Token.tok_func = func; - Token.tok_using = - (fun pat -> match Tok.of_pattern pat with - | KEYWORD s -> add_keyword s - | _ -> ()); - Token.tok_removing = (fun _ -> ()); - Token.tok_match = Tok.match_pattern; - Token.tok_comm = None; - Token.tok_text = token_text } - -ELSE (* official camlp4 for ocaml >= 3.10 *) - -module M_ = Camlp4.ErrorHandler.Register (Error) - -module Loc = CompatLoc -module Token = struct - include Tok (* Cf. tok.ml *) - module Loc = CompatLoc - module Error = Camlp4.Struct.EmptyError - module Filter = struct - type token_filter = (Tok.t * Loc.t) Stream.t -> (Tok.t * Loc.t) Stream.t - type t = unit - let mk _is_kwd = () - let keyword_added () kwd _ = add_keyword kwd - let keyword_removed () _ = () - let filter () x = x - let define_filter () _ = () - end -end - -let mk () _init_loc(*FIXME*) cs = - let loct = loct_create () in - let rec self = - parser i - [< (tok, loc) = next_token; s >] -> - let loc = make_loc loc in - loct_add loct i loc; - [< '(tok, loc); self s >] - | [< >] -> [< >] - in current_location_table := loct; self cs - -END - -(** Terminal symbols interpretation *) - -let is_ident_not_keyword s = - is_ident s && not (is_keyword s) - -let is_number s = - let rec aux i = - Int.equal (String.length s) i || - match s.[i] with '0'..'9' -> aux (i+1) | _ -> false - in aux 0 - -let strip s = - let len = - let rec loop i len = - if Int.equal i (String.length s) then len - else if s.[i] == ' ' then loop (i + 1) len - else loop (i + 1) (len + 1) - in - loop 0 0 - in - if len == String.length s then s - else - let s' = String.create len in - let rec loop i i' = - if i == String.length s then s' - else if s.[i] == ' ' then loop (i + 1) i' - else begin s'.[i'] <- s.[i]; loop (i + 1) (i' + 1) end - in - loop 0 0 - -let terminal s = - let s = strip s in - let () = match s with "" -> Errors.error "empty token." | _ -> () in - if is_ident_not_keyword s then IDENT s - else if is_number s then INT s - else KEYWORD s diff --git a/parsing/lexer.mli b/parsing/lexer.mli deleted file mode 100644 index 24b0ec8471..0000000000 --- a/parsing/lexer.mli +++ /dev/null @@ -1,38 +0,0 @@ -(************************************************************************) -(* v * The Coq Proof Assistant / The Coq Development Team *) -(* unit -val remove_keyword : string -> unit -val is_keyword : string -> bool - -(* val location_function : int -> Loc.t *) - -(** for coqdoc *) -type location_table -val location_table : unit -> location_table -val restore_location_table : location_table -> unit - -val check_ident : string -> unit -val is_ident : string -> bool -val check_keyword : string -> unit - -type frozen_t -val freeze : unit -> frozen_t -val unfreeze : frozen_t -> unit - -type com_state -val com_state: unit -> com_state -val restore_com_state: com_state -> unit - -val xml_output_comment : (string -> unit) Hook.t - -val terminal : string -> Tok.t - -(** The lexer of Coq: *) - -include Compat.LexerSig diff --git a/parsing/parsing.mllib b/parsing/parsing.mllib index a0cb831931..0e1c79c91b 100644 --- a/parsing/parsing.mllib +++ b/parsing/parsing.mllib @@ -1,6 +1,6 @@ Tok Compat -Lexer +CLexer Pcoq Egramml Egramcoq diff --git a/parsing/pcoq.ml4 b/parsing/pcoq.ml4 index 28dc74e81b..f01e25c616 100644 --- a/parsing/pcoq.ml4 +++ b/parsing/pcoq.ml4 @@ -18,7 +18,7 @@ open Tok (* necessary for camlp4 *) (** The parser of Coq *) -module G = GrammarMake (Lexer) +module G = GrammarMake (CLexer) (* TODO: this is a workaround, since there isn't such [warning_verbose] in new camlp4. In camlp5, this ref @@ -51,7 +51,7 @@ ELSE | tok -> Stoken ((=) tok, to_string tok) END -let gram_token_of_string s = gram_token_of_token (Lexer.terminal s) +let gram_token_of_string s = gram_token_of_token (CLexer.terminal s) let camlp4_verbosity silent f x = let a = !warning_verbose in diff --git a/tactics/tacinterp.ml b/tactics/tacinterp.ml index 5ecc46d670..a7acdf7594 100644 --- a/tactics/tacinterp.ml +++ b/tactics/tacinterp.ml @@ -491,7 +491,7 @@ let interp_fresh_id ist env sigma l = String.concat "" (List.map (function | ArgArg s -> s | ArgVar (_,id) -> Id.to_string (interp_ident ist env sigma id)) l) in - let s = if Lexer.is_keyword s then s^"0" else s in + let s = if CLexer.is_keyword s then s^"0" else s in Id.of_string s in Tactics.fresh_id_in_env avoid id env diff --git a/toplevel/cerrors.ml b/toplevel/cerrors.ml index 600683d359..d8ee5d90da 100644 --- a/toplevel/cerrors.ml +++ b/toplevel/cerrors.ml @@ -35,7 +35,7 @@ let explain_exn_default = function (* Basic interaction exceptions *) | Stream.Error txt -> hov 0 (str "Syntax error: " ++ str txt ++ str ".") | Compat.Token.Error txt -> hov 0 (str "Syntax error: " ++ str txt ++ str ".") - | Lexer.Error.E err -> hov 0 (str (Lexer.Error.to_string err)) + | CLexer.Error.E err -> hov 0 (str (CLexer.Error.to_string err)) | Sys_error msg -> hov 0 (str "System error: " ++ guill msg) | Out_of_memory -> hov 0 (str "Out of memory.") | Stack_overflow -> hov 0 (str "Stack overflow.") diff --git a/toplevel/coqloop.ml b/toplevel/coqloop.ml index 063ed89643..040c339249 100644 --- a/toplevel/coqloop.ml +++ b/toplevel/coqloop.ml @@ -288,7 +288,7 @@ let rec discard_to_dot () = try Gram.entry_parse parse_to_dot top_buffer.tokens with - | Compat.Token.Error _ | Lexer.Error.E _ -> discard_to_dot () + | Compat.Token.Error _ | CLexer.Error.E _ -> discard_to_dot () | End_of_input -> raise End_of_input | e when Errors.noncritical e -> () diff --git a/toplevel/coqtop.ml b/toplevel/coqtop.ml index 9e1a76bbd6..08b38a7037 100644 --- a/toplevel/coqtop.ml +++ b/toplevel/coqtop.ml @@ -230,11 +230,11 @@ let compile_files () = | [vf] -> compile_file vf (* One compilation : no need to save init state *) | l -> let init_state = States.freeze ~marshallable:`No in - let coqdoc_init_state = Lexer.location_table () in + let coqdoc_init_state = CLexer.location_table () in List.iter (fun vf -> States.unfreeze init_state; - Lexer.restore_location_table coqdoc_init_state; + CLexer.restore_location_table coqdoc_init_state; compile_file vf) (List.rev l) diff --git a/toplevel/metasyntax.ml b/toplevel/metasyntax.ml index 2ccd27acb9..c33726550d 100644 --- a/toplevel/metasyntax.ml +++ b/toplevel/metasyntax.ml @@ -31,7 +31,7 @@ open Nameops (**********************************************************************) (* Tokens *) -let cache_token (_,s) = Lexer.add_keyword s +let cache_token (_,s) = CLexer.add_keyword s let inToken : string -> obj = declare_object {(default_object "TOKEN") with @@ -428,7 +428,7 @@ let rec interp_list_parser hd = function (* To protect alphabetic tokens and quotes from being seen as variables *) let quote_notation_token x = let n = String.length x in - let norm = Lexer.is_ident x in + let norm = CLexer.is_ident x in if (n > 0 && norm) || (n > 2 && x.[0] == '\'') then "'"^x^"'" else x @@ -436,7 +436,7 @@ let rec raw_analyze_notation_tokens = function | [] -> [] | String ".." :: sl -> NonTerminal ldots_var :: raw_analyze_notation_tokens sl | String "_" :: _ -> error "_ must be quoted." - | String x :: sl when Lexer.is_ident x -> + | String x :: sl when CLexer.is_ident x -> NonTerminal (Names.Id.of_string x) :: raw_analyze_notation_tokens sl | String s :: sl -> Terminal (String.drop_simple_quotes s) :: raw_analyze_notation_tokens sl @@ -725,7 +725,7 @@ let rec define_keywords_aux = function | GramConstrNonTerminal(e,Some _) as n1 :: GramConstrTerminal(IDENT k) :: l when is_not_small_constr e -> Flags.if_verbose msg_info (str "Identifier '" ++ str k ++ str "' now a keyword"); - Lexer.add_keyword k; + CLexer.add_keyword k; n1 :: GramConstrTerminal(KEYWORD k) :: define_keywords_aux l | n :: l -> n :: define_keywords_aux l | [] -> [] @@ -734,7 +734,7 @@ let rec define_keywords_aux = function let define_keywords = function | GramConstrTerminal(IDENT k)::l -> Flags.if_verbose msg_info (str "Identifier '" ++ str k ++ str "' now a keyword"); - Lexer.add_keyword k; + CLexer.add_keyword k; GramConstrTerminal(KEYWORD k) :: define_keywords_aux l | l -> define_keywords_aux l @@ -763,12 +763,12 @@ let make_production etyps symbols = let typ = List.assoc m etyps in distribute [GramConstrNonTerminal (typ, Some m)] ll | Terminal s -> - distribute [GramConstrTerminal (Lexer.terminal s)] ll + distribute [GramConstrTerminal (CLexer.terminal s)] ll | Break _ -> ll | SProdList (x,sl) -> let tkl = List.flatten - (List.map (function Terminal s -> [Lexer.terminal s] + (List.map (function Terminal s -> [CLexer.terminal s] | Break _ -> [] | _ -> anomaly (Pp.str "Found a non terminal token in recursive notation separator")) sl) in match List.assoc x etyps with diff --git a/toplevel/vernac.ml b/toplevel/vernac.ml index 7c4920dfb8..a5f502503b 100644 --- a/toplevel/vernac.ml +++ b/toplevel/vernac.ml @@ -164,17 +164,17 @@ let save_translator_coqdoc () = (* translator state *) let ch = !chan_beautify in let cl = !Pp.comments in - let cs = Lexer.com_state() in + let cs = CLexer.com_state() in (* end translator state *) - let coqdocstate = Lexer.location_table () in + let coqdocstate = CLexer.location_table () in ch,cl,cs,coqdocstate let restore_translator_coqdoc (ch,cl,cs,coqdocstate) = if !Flags.beautify_file then close_out !chan_beautify; chan_beautify := ch; Pp.comments := cl; - Lexer.restore_com_state cs; - Lexer.restore_location_table coqdocstate + CLexer.restore_com_state cs; + CLexer.restore_location_table coqdocstate (* For coqtop -time, we display the position in the file, and a glimpse of the executed command *) -- cgit v1.2.3