From 3e99b3807b4380cbb6b875fa6c67a8ee921b2494 Mon Sep 17 00:00:00 2001 From: Pierre-Marie Pédrot Date: Mon, 15 Oct 2018 23:11:03 +0200 Subject: Removing the Camlp5 macros from CLexer. We partially hand-translated so as to result in the minimal diff possible. --- .gitignore | 1 - parsing/cLexer.ml | 815 +++++++++++++++++++++++++++++++++++++++++++++++++++++ parsing/cLexer.ml4 | 728 ----------------------------------------------- 3 files changed, 815 insertions(+), 729 deletions(-) create mode 100644 parsing/cLexer.ml delete mode 100644 parsing/cLexer.ml4 diff --git a/.gitignore b/.gitignore index 63da6b4d0e..f741135211 100644 --- a/.gitignore +++ b/.gitignore @@ -135,7 +135,6 @@ coqpp/coqpp_parse.mli g_*.ml lib/coqProject_file.ml -parsing/cLexer.ml plugins/ltac/coretactics.ml plugins/ltac/extratactics.ml plugins/ltac/extraargs.ml diff --git a/parsing/cLexer.ml b/parsing/cLexer.ml new file mode 100644 index 0000000000..2230dfc47c --- /dev/null +++ b/parsing/cLexer.ml @@ -0,0 +1,815 @@ +(************************************************************************) +(* * The Coq Proof Assistant / The Coq Development Team *) +(* v * INRIA, CNRS and contributors - Copyright 1999-2018 *) +(* "" +| Loc.InFile f -> f + +let coq_file_of_ploc_file s = + if s = "" then Loc.ToplevelInput else Loc.InFile s + +let from_coqloc fname line_nb bol_pos bp ep = + Ploc.make_loc (ploc_file_of_coq_file fname) line_nb bol_pos (bp, ep) "" + +let to_coqloc loc = + { Loc.fname = coq_file_of_ploc_file (Ploc.file_name loc); + Loc.line_nb = Ploc.line_nb loc; + Loc.bol_pos = Ploc.bol_pos loc; + Loc.bp = Ploc.first_pos loc; + Loc.ep = Ploc.last_pos loc; + Loc.line_nb_last = Ploc.line_nb_last loc; + Loc.bol_pos_last = Ploc.bol_pos_last loc; } + +let (!@) = to_coqloc + +(* Dictionaries: trees annotated with string options, each node being a map + from chars to dictionaries (the subtrees). A trie, in other words. *) + +module CharOrd = struct type t = char let compare : char -> char -> int = compare end +module CharMap = Map.Make (CharOrd) + +type ttree = { + node : string option; + branch : ttree CharMap.t } + +let empty_ttree = { node = None; branch = CharMap.empty } + +let ttree_add ttree str = + let rec insert tt i = + if i == String.length str then + {node = Some str; branch = tt.branch} + else + let c = str.[i] in + let br = + match try Some (CharMap.find c tt.branch) with Not_found -> None with + | Some tt' -> + CharMap.add c (insert tt' (i + 1)) (CharMap.remove c tt.branch) + | None -> + let tt' = {node = None; branch = CharMap.empty} in + CharMap.add c (insert tt' (i + 1)) tt.branch + in + { node = tt.node; branch = br } + in + insert ttree 0 + +(* Search a string in a dictionary: raises [Not_found] + if the word is not present. *) + +let ttree_find ttree str = + let rec proc_rec tt i = + if i == String.length str then tt + else proc_rec (CharMap.find str.[i] tt.branch) (i+1) + in + proc_rec ttree 0 + +(* Removes a string from a dictionary: returns an equal dictionary + if the word not present. *) +let ttree_remove ttree str = + let rec remove tt i = + if i == String.length str then + {node = None; branch = tt.branch} + else + let c = str.[i] in + let br = + match try Some (CharMap.find c tt.branch) with Not_found -> None with + | Some tt' -> + CharMap.add c (remove tt' (i + 1)) (CharMap.remove c tt.branch) + | None -> tt.branch + in + { node = tt.node; branch = br } + in + remove ttree 0 + +let ttree_elements ttree = + let rec elts tt accu = + let accu = match tt.node with + | None -> accu + | Some s -> CString.Set.add s accu + in + CharMap.fold (fun _ tt accu -> elts tt accu) tt.branch accu + in + elts ttree CString.Set.empty + +(* Errors occurring while lexing (explained as "Lexer error: ...") *) + +module Error = struct + + type t = + | Illegal_character + | Unterminated_comment + | Unterminated_string + | Undefined_token + | Bad_token of string + + exception E of t + + let to_string x = + "Syntax Error: Lexer: " ^ + (match x with + | Illegal_character -> "Illegal character" + | Unterminated_comment -> "Unterminated comment" + | Unterminated_string -> "Unterminated string" + | Undefined_token -> "Undefined token" + | Bad_token tok -> Format.sprintf "Bad token %S" tok) + +end +open Error + +let err loc str = Loc.raise ~loc:(to_coqloc loc) (Error.E str) + +let bad_token str = raise (Error.E (Bad_token str)) + +(* Update a loc without allocating an intermediate pair *) +let set_loc_pos loc bp ep = + Ploc.sub loc (bp - Ploc.first_pos loc) (ep - bp) + +(* Increase line number by 1 and update position of beginning of line *) +let bump_loc_line loc bol_pos = + Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb loc + 1) bol_pos + (Ploc.first_pos loc, Ploc.last_pos loc) (Ploc.comment loc) + +(* Same as [bump_loc_line], but for the last line in location *) +(* For an obscure reason, camlp5 does not give an easy way to set line_nb_stop, + so we have to resort to a hack merging two locations. *) +(* Warning: [bump_loc_line_last] changes the end position. You may need to call + [set_loc_pos] to fix it. *) +let bump_loc_line_last loc bol_pos = + let loc' = + Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb_last loc + 1) bol_pos + (Ploc.first_pos loc + 1, Ploc.last_pos loc + 1) (Ploc.comment loc) + in + Ploc.encl loc loc' + +(* For some reason, the [Ploc.after] function of Camlp5 does not update line + numbers, so we define our own function that does it. *) +let after loc = + let line_nb = Ploc.line_nb_last loc in + let bol_pos = Ploc.bol_pos_last loc in + Ploc.make_loc (Ploc.file_name loc) line_nb bol_pos + (Ploc.last_pos loc, Ploc.last_pos loc) (Ploc.comment loc) + +(** Lexer conventions on tokens *) + +type token_kind = + | Utf8Token of (Unicode.status * int) + | AsciiChar + | EmptyStream + +let error_utf8 loc cs = + let bp = Stream.count cs in + Stream.junk cs; (* consume the char to avoid read it and fail again *) + let loc = set_loc_pos loc bp (bp+1) in + err loc Illegal_character + +let utf8_char_size loc cs = function + (* Utf8 leading byte *) + | '\x00'..'\x7F' -> 1 + | '\xC0'..'\xDF' -> 2 + | '\xE0'..'\xEF' -> 3 + | '\xF0'..'\xF7' -> 4 + | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) -> error_utf8 loc cs + +let njunk n = Util.repeat n Stream.junk + +let check_utf8_trailing_byte loc cs c = + if not (Int.equal (Char.code c land 0xC0) 0x80) then error_utf8 loc cs + +(* Recognize utf8 blocks (of length less than 4 bytes) *) +(* but don't certify full utf8 compliance (e.g. no emptyness check) *) +let lookup_utf8_tail loc c cs = + let c1 = Char.code c in + if Int.equal (c1 land 0x40) 0 || Int.equal (c1 land 0x38) 0x38 then error_utf8 loc cs + else + let n, unicode = + if Int.equal (c1 land 0x20) 0 then + match Stream.npeek 2 cs with + | [_;c2] -> + check_utf8_trailing_byte loc cs c2; + 2, (c1 land 0x1F) lsl 6 + (Char.code c2 land 0x3F) + | _ -> error_utf8 loc cs + else if Int.equal (c1 land 0x10) 0 then + match Stream.npeek 3 cs with + | [_;c2;c3] -> + check_utf8_trailing_byte loc cs c2; + check_utf8_trailing_byte loc cs c3; + 3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 + + (Char.code c3 land 0x3F) + | _ -> error_utf8 loc cs + else match Stream.npeek 4 cs with + | [_;c2;c3;c4] -> + check_utf8_trailing_byte loc cs c2; + check_utf8_trailing_byte loc cs c3; + check_utf8_trailing_byte loc cs c4; + 4, (c1 land 0x07) lsl 18 + (Char.code c2 land 0x3F) lsl 12 + + (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F) + | _ -> error_utf8 loc cs + in + Utf8Token (Unicode.classify unicode, n) + +let lookup_utf8 loc cs = + match Stream.peek cs with + | Some ('\x00'..'\x7F') -> AsciiChar + | Some ('\x80'..'\xFF' as c) -> lookup_utf8_tail loc c cs + | None -> EmptyStream + +let unlocated f x = f x + (** FIXME: should we still unloc the exception? *) +(* try f x with Loc.Exc_located (_, exc) -> raise exc *) + +let check_keyword str = + let rec loop_symb s = match Stream.peek s with + | Some (' ' | '\n' | '\r' | '\t' | '"') -> + Stream.junk s; + bad_token str + | _ -> + match unlocated lookup_utf8 Ploc.dummy s with + | Utf8Token (_,n) -> njunk n s; loop_symb s + | AsciiChar -> Stream.junk s; loop_symb s + | EmptyStream -> () + in + loop_symb (Stream.of_string str) + +let check_ident str = + let rec loop_id intail s = match Stream.peek s with + | Some ('a'..'z' | 'A'..'Z' | '_') -> + Stream.junk s; + loop_id true s + | Some ('0'..'9' | '\'') when intail -> + Stream.junk s; + loop_id true s + | _ -> + match unlocated lookup_utf8 Ploc.dummy s with + | Utf8Token (st, n) when not intail && Unicode.is_valid_ident_initial st -> njunk n s; loop_id true s + | Utf8Token (st, n) when intail && Unicode.is_valid_ident_trailing st -> + njunk n s; + loop_id true s + | EmptyStream -> () + | Utf8Token _ | AsciiChar -> bad_token str + in + loop_id false (Stream.of_string str) + +let is_ident str = + try let _ = check_ident str in true with Error.E _ -> false + +(* Keyword and symbol dictionary *) +let token_tree = ref empty_ttree + +let is_keyword s = + try match (ttree_find !token_tree s).node with None -> false | Some _ -> true + with Not_found -> false + +let add_keyword str = + if not (is_keyword str) then + begin + check_keyword str; + token_tree := ttree_add !token_tree str + end + +let remove_keyword str = + token_tree := ttree_remove !token_tree str + +let keywords () = ttree_elements !token_tree + +(* Freeze and unfreeze the state of the lexer *) +type keyword_state = ttree + +let get_keyword_state () = !token_tree +let set_keyword_state tt = (token_tree := tt) + +(* The string buffering machinery *) + +let buff = ref (Bytes.create 80) + +let store len x = + let open Bytes in + if len >= length !buff then + buff := cat !buff (create (length !buff)); + set !buff len x; + succ len + +let rec nstore n len cs = + if n>0 then nstore (n-1) (store len (Stream.next cs)) cs else len + +let get_buff len = Bytes.sub_string !buff 0 len + +(* The classical lexer: idents, numbers, quoted strings, comments *) + +let warn_unrecognized_unicode = + CWarnings.create ~name:"unrecognized-unicode" ~category:"parsing" + (fun (u,id) -> + strbrk (Printf.sprintf "Not considering unicode character \"%s\" of unknown \ + lexical status as part of identifier \"%s\"." u id)) + +let rec ident_tail loc len s = match Stream.peek s with + | Some ('a'..'z' | 'A'..'Z' | '0'..'9' | '\'' | '_' as c) -> + Stream.junk s; + ident_tail loc (store len c) s + | _ -> + match lookup_utf8 loc s with + | Utf8Token (st, n) when Unicode.is_valid_ident_trailing st -> + ident_tail loc (nstore n len s) s + | Utf8Token (st, n) when Unicode.is_unknown st -> + let id = get_buff len in + let u = String.concat "" (List.map (String.make 1) (Stream.npeek n s)) in + warn_unrecognized_unicode ~loc:!@loc (u,id); len + | _ -> len + +let rec number len s = match Stream.peek s with + | Some ('0'..'9' as c) -> Stream.junk s; number (store len c) s + | _ -> len + +let warn_comment_terminator_in_string = + CWarnings.create ~name:"comment-terminator-in-string" ~category:"parsing" + (fun () -> + (strbrk + "Not interpreting \"*)\" as the end of current \ + non-terminated comment because it occurs in a \ + non-terminated string of the comment.")) + +(* If the string being lexed is in a comment, [comm_level] is Some i with i the + current level of comments nesting. Otherwise, [comm_level] is None. *) +let rec string loc ~comm_level bp len s = match Stream.peek s with + | Some '"' -> + Stream.junk s; + let esc = + match Stream.peek s with + Some '"' -> Stream.junk s; true + | _ -> false + in + if esc then string loc ~comm_level bp (store len '"') s else (loc, len) + | Some '(' -> + Stream.junk s; + (fun s -> match Stream.peek s with + | Some '*' -> + Stream.junk s; + let comm_level = Option.map succ comm_level in + string loc ~comm_level + bp (store (store len '(') '*') + s + | _ -> + string loc ~comm_level bp (store len '(') s) s + | Some '*' -> + Stream.junk s; + (fun s -> match Stream.peek s with + | Some ')' -> + Stream.junk s; + let () = match comm_level with + | Some 0 -> + warn_comment_terminator_in_string ~loc:!@loc () + | _ -> () + in + let comm_level = Option.map pred comm_level in + string loc ~comm_level bp (store (store len '*') ')') s + | _ -> + string loc ~comm_level bp (store len '*') s) s + | Some ('\n' as c) -> + Stream.junk s; + let ep = Stream.count s in + (* If we are parsing a comment, the string if not part of a token so we + update the first line of the location. Otherwise, we update the last + line. *) + let loc = + if Option.has_some comm_level then bump_loc_line loc ep + else bump_loc_line_last loc ep + in + string loc ~comm_level bp (store len c) s + | Some c -> + Stream.junk s; + string loc ~comm_level bp (store len c) s + | _ -> + let _ = Stream.empty s in + let ep = Stream.count s in + let loc = set_loc_pos loc bp ep in + err loc Unterminated_string + +(* To associate locations to a file name *) +let current_file = ref Loc.ToplevelInput + +(* Utilities for comments in beautify *) +let comment_begin = ref None +let comm_loc bp = match !comment_begin with +| None -> comment_begin := Some bp +| _ -> () + +let comments = ref [] +let current_comment = Buffer.create 8192 +let between_commands = ref true + +(* The state of the lexer visible from outside *) +type lexer_state = int option * string * bool * ((int * int) * string) list * Loc.source + +let init_lexer_state f = (None,"",true,[],f) +let set_lexer_state (o,s,b,c,f) = + comment_begin := o; + Buffer.clear current_comment; Buffer.add_string current_comment s; + between_commands := b; + comments := c; + current_file := f +let get_lexer_state () = + (!comment_begin, Buffer.contents current_comment, !between_commands, !comments, !current_file) +let drop_lexer_state () = + set_lexer_state (init_lexer_state Loc.ToplevelInput) + +let get_comment_state (_,_,_,c,_) = c + +let real_push_char c = Buffer.add_char current_comment c + +(* Add a char if it is between two commands, if it is a newline or + if the last char is not a space itself. *) +let push_char c = + if + !between_commands || List.mem c ['\n';'\r'] || + (List.mem c [' ';'\t']&& + (Int.equal (Buffer.length current_comment) 0 || + not (let s = Buffer.contents current_comment in + List.mem s.[String.length s - 1] [' ';'\t';'\n';'\r']))) + then + real_push_char c + +let push_string s = Buffer.add_string current_comment s + +let null_comment s = + let rec null i = + i<0 || (List.mem s.[i] [' ';'\t';'\n';'\r'] && null (i-1)) in + null (String.length s - 1) + +let comment_stop ep = + let current_s = Buffer.contents current_comment in + (if !Flags.beautify && Buffer.length current_comment > 0 && + (!between_commands || not(null_comment current_s)) then + let bp = match !comment_begin with + Some bp -> bp + | None -> + Feedback.msg_notice + (str "No begin location for comment '" + ++ str current_s ++str"' ending at " + ++ int ep); + ep-1 in + comments := ((bp,ep),current_s) :: !comments); + Buffer.clear current_comment; + comment_begin := None; + between_commands := false + +let rec comment loc bp s = + let bp2 = Stream.count s in + match Stream.peek s with + Some '(' -> + Stream.junk s; + let loc = + try + match Stream.peek s with + Some '*' -> + Stream.junk s; + push_string "(*"; comment loc bp s + | _ -> push_string "("; loc + with Stream.Failure -> raise (Stream.Error "") + in + comment loc bp s + | Some '*' -> + Stream.junk s; + begin try + match Stream.peek s with + Some ')' -> Stream.junk s; push_string "*)"; loc + | _ -> real_push_char '*'; comment loc bp s + with Stream.Failure -> raise (Stream.Error "") + end + | Some '"' -> + Stream.junk s; + let loc, len = string loc ~comm_level:(Some 0) bp2 0 s in + push_string "\""; push_string (get_buff len); push_string "\""; + comment loc bp s + | _ -> + match try Some (Stream.empty s) with Stream.Failure -> None with + | Some _ -> + let ep = Stream.count s in + let loc = set_loc_pos loc bp ep in + err loc Unterminated_comment + | _ -> + match Stream.peek s with + Some ('\n' as z) -> + Stream.junk s; + let ep = Stream.count s in + real_push_char z; comment (bump_loc_line loc ep) bp s + | Some z -> + Stream.junk s; + real_push_char z; comment loc bp s + | _ -> raise Stream.Failure + +(* Parse a special token, using the [token_tree] *) + +(* Peek as much utf-8 lexemes as possible *) +(* and retain the longest valid special token obtained *) +let rec progress_further loc last nj tt cs = + try progress_from_byte loc last nj tt cs (List.nth (Stream.npeek (nj+1) cs) nj) + with Failure _ -> last + +and update_longest_valid_token loc last nj tt cs = + match tt.node with + | Some _ as last' -> + stream_njunk nj cs; + progress_further loc last' 0 tt cs + | None -> + progress_further loc last nj tt cs + +(* nj is the number of char peeked since last valid token *) +(* n the number of char in utf8 block *) +and progress_utf8 loc last nj n c tt cs = + try + let tt = CharMap.find c tt.branch in + if Int.equal n 1 then + update_longest_valid_token loc last (nj+n) tt cs + else + match Util.List.skipn (nj+1) (Stream.npeek (nj+n) cs) with + | l when Int.equal (List.length l) (n - 1) -> + List.iter (check_utf8_trailing_byte loc cs) l; + let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in + update_longest_valid_token loc last (nj+n) tt cs + | _ -> + error_utf8 loc cs + with Not_found -> + last + +and progress_from_byte loc last nj tt cs c = + progress_utf8 loc last nj (utf8_char_size loc cs c) c tt cs + +let find_keyword loc id s = + let tt = ttree_find !token_tree id in + match progress_further loc tt.node 0 tt s with + | None -> raise Not_found + | Some c -> KEYWORD c + +let process_sequence loc bp c cs = + let rec aux n cs = + match Stream.peek cs with + | Some c' when c == c' -> Stream.junk cs; aux (n+1) cs + | _ -> BULLET (String.make n c), set_loc_pos loc bp (Stream.count cs) + in + aux 1 cs + +(* Must be a special token *) +let process_chars loc bp c cs = + let t = progress_from_byte loc None (-1) !token_tree cs c in + let ep = Stream.count cs in + match t with + | Some t -> (KEYWORD t, set_loc_pos loc bp ep) + | None -> + let ep' = bp + utf8_char_size loc cs c in + njunk (ep' - ep) cs; + let loc = set_loc_pos loc bp ep' in + err loc Undefined_token + +(* Parse what follows a dot *) + +let parse_after_dot loc c bp s = match Stream.peek s with + | Some ('a'..'z' | 'A'..'Z' | '_' as d) -> + Stream.junk s; + let len = + try ident_tail loc (store 0 d) s with + Stream.Failure -> raise (Stream.Error "") + in + let field = get_buff len in + (try find_keyword loc ("."^field) s with Not_found -> FIELD field) + | _ -> + match lookup_utf8 loc s with + | Utf8Token (st, n) when Unicode.is_valid_ident_initial st -> + let len = ident_tail loc (nstore n 0 s) s in + let field = get_buff len in + (try find_keyword loc ("."^field) s with Not_found -> FIELD field) + | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp c s) + +(* Parse what follows a question mark *) + +let parse_after_qmark loc bp s = + match Stream.peek s with + | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK + | None -> KEYWORD "?" + | _ -> + match lookup_utf8 loc s with + | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK + | AsciiChar | Utf8Token _ | EmptyStream -> + fst (process_chars loc bp '?' s) + +let blank_or_eof cs = + match Stream.peek cs with + | None -> true + | Some (' ' | '\t' | '\n' |'\r') -> true + | _ -> false + +(* Parse a token in a char stream *) + +let rec next_token loc s = + let bp = Stream.count s in + match Stream.peek s with + | Some ('\n' as c) -> + Stream.junk s; + let ep = Stream.count s in + comm_loc bp; push_char c; next_token (bump_loc_line loc ep) s + | Some (' ' | '\t' | '\r' as c) -> + Stream.junk s; + comm_loc bp; push_char c; next_token loc s + | Some ('.' as c) -> + Stream.junk s; + let t = + try parse_after_dot loc c bp s with + Stream.Failure -> raise (Stream.Error "") + in + let ep = Stream.count s in + comment_stop bp; + (* We enforce that "." should either be part of a larger keyword, + for instance ".(", or followed by a blank or eof. *) + let () = match t with + | KEYWORD ("." | "...") -> + if not (blank_or_eof s) then + err (set_loc_pos loc bp (ep+1)) Undefined_token; + between_commands := true; + | _ -> () + in + t, set_loc_pos loc bp ep + | Some ('-' | '+' | '*' as c) -> + Stream.junk s; + let t,new_between_commands = + if !between_commands then process_sequence loc bp c s, true + else process_chars loc bp c s,false + in + comment_stop bp; between_commands := new_between_commands; t + | Some '?' -> + Stream.junk s; + let ep = Stream.count s in + let t = parse_after_qmark loc bp s in + comment_stop bp; (t, set_loc_pos loc bp ep) + | Some ('a'..'z' | 'A'..'Z' | '_' as c) -> + Stream.junk s; + let len = + try ident_tail loc (store 0 c) s with + Stream.Failure -> raise (Stream.Error "") + in + let ep = Stream.count s in + let id = get_buff len in + comment_stop bp; + (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep + | Some ('0'..'9' as c) -> + Stream.junk s; + let len = + try number (store 0 c) s with + Stream.Failure -> raise (Stream.Error "") + in + let ep = Stream.count s in + comment_stop bp; + (INT (get_buff len), set_loc_pos loc bp ep) + | Some '\"' -> + Stream.junk s; + let (loc, len) = + try string loc ~comm_level:None bp 0 s with + Stream.Failure -> raise (Stream.Error "") + in + let ep = Stream.count s in + comment_stop bp; + (STRING (get_buff len), set_loc_pos loc bp ep) + | Some ('(' as c) -> + Stream.junk s; + begin try + match Stream.peek s with + | Some '*' -> + Stream.junk s; + comm_loc bp; + push_string "(*"; + let loc = comment loc bp s in next_token loc s + | _ -> let t = process_chars loc bp c s in comment_stop bp; t + with Stream.Failure -> raise (Stream.Error "") + end + | Some ('{' | '}' as c) -> + Stream.junk s; + let ep = Stream.count s in + let t,new_between_commands = + if !between_commands then (KEYWORD (String.make 1 c), set_loc_pos loc bp ep), true + else process_chars loc bp c s, false + in + comment_stop bp; between_commands := new_between_commands; t + | _ -> + match lookup_utf8 loc s with + | Utf8Token (st, n) when Unicode.is_valid_ident_initial st -> + let len = ident_tail loc (nstore n 0 s) s in + let id = get_buff len in + let ep = Stream.count s in + comment_stop bp; + (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep + | AsciiChar | Utf8Token _ -> + let t = process_chars loc bp (Stream.next s) s in + comment_stop bp; t + | EmptyStream -> + comment_stop bp; (EOI, set_loc_pos loc bp (bp+1)) + +(* (* Debug: uncomment this for tracing tokens seen by coq...*) +let next_token loc s = + let (t,loc as r) = next_token loc s in + Printf.eprintf "(line %i, %i-%i)[%s]\n%!" (Ploc.line_nb loc) (Ploc.first_pos loc) (Ploc.last_pos loc) (Tok.to_string t); + r *) + +(* Location table system for creating tables associating a token count + to its location in a char stream (the source) *) + +let locerr () = invalid_arg "Lexer: location function" + +let loct_create () = Hashtbl.create 207 + +let loct_func loct i = + try Hashtbl.find loct i with Not_found -> locerr () + +let loct_add loct i loc = Hashtbl.add loct i loc + +(** {6 The lexer of Coq} *) + +(** Note: removing a token. + We do nothing because [remove_token] is called only when removing a grammar + rule with [Grammar.delete_rule]. The latter command is called only when + unfreezing the state of the grammar entries (see GRAMMAR summary, file + env/metasyntax.ml). Therefore, instead of removing tokens one by one, + we unfreeze the state of the lexer. This restores the behaviour of the + lexer. B.B. *) + +type te = Tok.t + +(** Names of tokens, for this lexer, used in Grammar error messages *) + +let token_text = function + | ("", t) -> "'" ^ t ^ "'" + | ("IDENT", "") -> "identifier" + | ("IDENT", t) -> "'" ^ t ^ "'" + | ("INT", "") -> "integer" + | ("INT", s) -> "'" ^ s ^ "'" + | ("STRING", "") -> "string" + | ("EOI", "") -> "end of input" + | (con, "") -> con + | (con, prm) -> con ^ " \"" ^ prm ^ "\"" + +let func cs = + let loct = loct_create () in + let cur_loc = ref (from_coqloc !current_file 1 0 0 0) in + let ts = + Stream.from + (fun i -> + let (tok, loc) = next_token !cur_loc cs in + cur_loc := after loc; + loct_add loct i loc; Some tok) + in + (ts, loct_func loct) + +let lexer = { + Token.tok_func = func; + Token.tok_using = + (fun pat -> match Tok.of_pattern pat with + | KEYWORD s -> add_keyword s + | _ -> ()); + Token.tok_removing = (fun _ -> ()); + Token.tok_match = Tok.match_pattern; + Token.tok_comm = None; + Token.tok_text = token_text } + +(** Terminal symbols interpretation *) + +let is_ident_not_keyword s = + is_ident s && not (is_keyword s) + +let is_number s = + let rec aux i = + Int.equal (String.length s) i || + match s.[i] with '0'..'9' -> aux (i+1) | _ -> false + in aux 0 + +let strip s = + let len = + let rec loop i len = + if Int.equal i (String.length s) then len + else if s.[i] == ' ' then loop (i + 1) len + else loop (i + 1) (len + 1) + in + loop 0 0 + in + if len == String.length s then s + else + let s' = Bytes.create len in + let rec loop i i' = + if i == String.length s then s' + else if s.[i] == ' ' then loop (i + 1) i' + else begin Bytes.set s' i' s.[i]; loop (i + 1) (i' + 1) end + in + Bytes.to_string (loop 0 0) + +let terminal s = + let s = strip s in + let () = match s with "" -> failwith "empty token." | _ -> () in + if is_ident_not_keyword s then IDENT s + else if is_number s then INT s + else KEYWORD s diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4 deleted file mode 100644 index 9c421f5b76..0000000000 --- a/parsing/cLexer.ml4 +++ /dev/null @@ -1,728 +0,0 @@ -(************************************************************************) -(* * The Coq Proof Assistant / The Coq Development Team *) -(* v * INRIA, CNRS and contributors - Copyright 1999-2018 *) -(* "" -| Loc.InFile f -> f - -let coq_file_of_ploc_file s = - if s = "" then Loc.ToplevelInput else Loc.InFile s - -let from_coqloc fname line_nb bol_pos bp ep = - Ploc.make_loc (ploc_file_of_coq_file fname) line_nb bol_pos (bp, ep) "" - -let to_coqloc loc = - { Loc.fname = coq_file_of_ploc_file (Ploc.file_name loc); - Loc.line_nb = Ploc.line_nb loc; - Loc.bol_pos = Ploc.bol_pos loc; - Loc.bp = Ploc.first_pos loc; - Loc.ep = Ploc.last_pos loc; - Loc.line_nb_last = Ploc.line_nb_last loc; - Loc.bol_pos_last = Ploc.bol_pos_last loc; } - -let (!@) = to_coqloc - -(* Dictionaries: trees annotated with string options, each node being a map - from chars to dictionaries (the subtrees). A trie, in other words. *) - -module CharOrd = struct type t = char let compare : char -> char -> int = compare end -module CharMap = Map.Make (CharOrd) - -type ttree = { - node : string option; - branch : ttree CharMap.t } - -let empty_ttree = { node = None; branch = CharMap.empty } - -let ttree_add ttree str = - let rec insert tt i = - if i == String.length str then - {node = Some str; branch = tt.branch} - else - let c = str.[i] in - let br = - match try Some (CharMap.find c tt.branch) with Not_found -> None with - | Some tt' -> - CharMap.add c (insert tt' (i + 1)) (CharMap.remove c tt.branch) - | None -> - let tt' = {node = None; branch = CharMap.empty} in - CharMap.add c (insert tt' (i + 1)) tt.branch - in - { node = tt.node; branch = br } - in - insert ttree 0 - -(* Search a string in a dictionary: raises [Not_found] - if the word is not present. *) - -let ttree_find ttree str = - let rec proc_rec tt i = - if i == String.length str then tt - else proc_rec (CharMap.find str.[i] tt.branch) (i+1) - in - proc_rec ttree 0 - -(* Removes a string from a dictionary: returns an equal dictionary - if the word not present. *) -let ttree_remove ttree str = - let rec remove tt i = - if i == String.length str then - {node = None; branch = tt.branch} - else - let c = str.[i] in - let br = - match try Some (CharMap.find c tt.branch) with Not_found -> None with - | Some tt' -> - CharMap.add c (remove tt' (i + 1)) (CharMap.remove c tt.branch) - | None -> tt.branch - in - { node = tt.node; branch = br } - in - remove ttree 0 - -let ttree_elements ttree = - let rec elts tt accu = - let accu = match tt.node with - | None -> accu - | Some s -> CString.Set.add s accu - in - CharMap.fold (fun _ tt accu -> elts tt accu) tt.branch accu - in - elts ttree CString.Set.empty - -(* Errors occurring while lexing (explained as "Lexer error: ...") *) - -module Error = struct - - type t = - | Illegal_character - | Unterminated_comment - | Unterminated_string - | Undefined_token - | Bad_token of string - - exception E of t - - let to_string x = - "Syntax Error: Lexer: " ^ - (match x with - | Illegal_character -> "Illegal character" - | Unterminated_comment -> "Unterminated comment" - | Unterminated_string -> "Unterminated string" - | Undefined_token -> "Undefined token" - | Bad_token tok -> Format.sprintf "Bad token %S" tok) - -end -open Error - -let err loc str = Loc.raise ~loc:(to_coqloc loc) (Error.E str) - -let bad_token str = raise (Error.E (Bad_token str)) - -(* Update a loc without allocating an intermediate pair *) -let set_loc_pos loc bp ep = - Ploc.sub loc (bp - Ploc.first_pos loc) (ep - bp) - -(* Increase line number by 1 and update position of beginning of line *) -let bump_loc_line loc bol_pos = - Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb loc + 1) bol_pos - (Ploc.first_pos loc, Ploc.last_pos loc) (Ploc.comment loc) - -(* Same as [bump_loc_line], but for the last line in location *) -(* For an obscure reason, camlp5 does not give an easy way to set line_nb_stop, - so we have to resort to a hack merging two locations. *) -(* Warning: [bump_loc_line_last] changes the end position. You may need to call - [set_loc_pos] to fix it. *) -let bump_loc_line_last loc bol_pos = - let loc' = - Ploc.make_loc (Ploc.file_name loc) (Ploc.line_nb_last loc + 1) bol_pos - (Ploc.first_pos loc + 1, Ploc.last_pos loc + 1) (Ploc.comment loc) - in - Ploc.encl loc loc' - -(* For some reason, the [Ploc.after] function of Camlp5 does not update line - numbers, so we define our own function that does it. *) -let after loc = - let line_nb = Ploc.line_nb_last loc in - let bol_pos = Ploc.bol_pos_last loc in - Ploc.make_loc (Ploc.file_name loc) line_nb bol_pos - (Ploc.last_pos loc, Ploc.last_pos loc) (Ploc.comment loc) - -(** Lexer conventions on tokens *) - -type token_kind = - | Utf8Token of (Unicode.status * int) - | AsciiChar - | EmptyStream - -let error_utf8 loc cs = - let bp = Stream.count cs in - Stream.junk cs; (* consume the char to avoid read it and fail again *) - let loc = set_loc_pos loc bp (bp+1) in - err loc Illegal_character - -let utf8_char_size loc cs = function - (* Utf8 leading byte *) - | '\x00'..'\x7F' -> 1 - | '\xC0'..'\xDF' -> 2 - | '\xE0'..'\xEF' -> 3 - | '\xF0'..'\xF7' -> 4 - | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) -> error_utf8 loc cs - -let njunk n = Util.repeat n Stream.junk - -let check_utf8_trailing_byte loc cs c = - if not (Int.equal (Char.code c land 0xC0) 0x80) then error_utf8 loc cs - -(* Recognize utf8 blocks (of length less than 4 bytes) *) -(* but don't certify full utf8 compliance (e.g. no emptyness check) *) -let lookup_utf8_tail loc c cs = - let c1 = Char.code c in - if Int.equal (c1 land 0x40) 0 || Int.equal (c1 land 0x38) 0x38 then error_utf8 loc cs - else - let n, unicode = - if Int.equal (c1 land 0x20) 0 then - match Stream.npeek 2 cs with - | [_;c2] -> - check_utf8_trailing_byte loc cs c2; - 2, (c1 land 0x1F) lsl 6 + (Char.code c2 land 0x3F) - | _ -> error_utf8 loc cs - else if Int.equal (c1 land 0x10) 0 then - match Stream.npeek 3 cs with - | [_;c2;c3] -> - check_utf8_trailing_byte loc cs c2; - check_utf8_trailing_byte loc cs c3; - 3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 + - (Char.code c3 land 0x3F) - | _ -> error_utf8 loc cs - else match Stream.npeek 4 cs with - | [_;c2;c3;c4] -> - check_utf8_trailing_byte loc cs c2; - check_utf8_trailing_byte loc cs c3; - check_utf8_trailing_byte loc cs c4; - 4, (c1 land 0x07) lsl 18 + (Char.code c2 land 0x3F) lsl 12 + - (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F) - | _ -> error_utf8 loc cs - in - Utf8Token (Unicode.classify unicode, n) - -let lookup_utf8 loc cs = - match Stream.peek cs with - | Some ('\x00'..'\x7F') -> AsciiChar - | Some ('\x80'..'\xFF' as c) -> lookup_utf8_tail loc c cs - | None -> EmptyStream - -let unlocated f x = f x - (** FIXME: should we still unloc the exception? *) -(* try f x with Loc.Exc_located (_, exc) -> raise exc *) - -let check_keyword str = - let rec loop_symb = parser - | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str - | [< s >] -> - match unlocated lookup_utf8 Ploc.dummy s with - | Utf8Token (_,n) -> njunk n s; loop_symb s - | AsciiChar -> Stream.junk s; loop_symb s - | EmptyStream -> () - in - loop_symb (Stream.of_string str) - -let check_ident str = - let rec loop_id intail = parser - | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] -> - loop_id true s - | [< ' ('0'..'9' | ''') when intail; s >] -> - loop_id true s - | [< s >] -> - match unlocated lookup_utf8 Ploc.dummy s with - | Utf8Token (st, n) when not intail && Unicode.is_valid_ident_initial st -> njunk n s; loop_id true s - | Utf8Token (st, n) when intail && Unicode.is_valid_ident_trailing st -> - njunk n s; - loop_id true s - | EmptyStream -> () - | Utf8Token _ | AsciiChar -> bad_token str - in - loop_id false (Stream.of_string str) - -let is_ident str = - try let _ = check_ident str in true with Error.E _ -> false - -(* Keyword and symbol dictionary *) -let token_tree = ref empty_ttree - -let is_keyword s = - try match (ttree_find !token_tree s).node with None -> false | Some _ -> true - with Not_found -> false - -let add_keyword str = - if not (is_keyword str) then - begin - check_keyword str; - token_tree := ttree_add !token_tree str - end - -let remove_keyword str = - token_tree := ttree_remove !token_tree str - -let keywords () = ttree_elements !token_tree - -(* Freeze and unfreeze the state of the lexer *) -type keyword_state = ttree - -let get_keyword_state () = !token_tree -let set_keyword_state tt = (token_tree := tt) - -(* The string buffering machinery *) - -let buff = ref (Bytes.create 80) - -let store len x = - let open Bytes in - if len >= length !buff then - buff := cat !buff (create (length !buff)); - set !buff len x; - succ len - -let rec nstore n len cs = - if n>0 then nstore (n-1) (store len (Stream.next cs)) cs else len - -let get_buff len = Bytes.sub_string !buff 0 len - -(* The classical lexer: idents, numbers, quoted strings, comments *) - -let warn_unrecognized_unicode = - CWarnings.create ~name:"unrecognized-unicode" ~category:"parsing" - (fun (u,id) -> - strbrk (Printf.sprintf "Not considering unicode character \"%s\" of unknown \ - lexical status as part of identifier \"%s\"." u id)) - -let rec ident_tail loc len = parser - | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] -> - ident_tail loc (store len c) s - | [< s >] -> - match lookup_utf8 loc s with - | Utf8Token (st, n) when Unicode.is_valid_ident_trailing st -> - ident_tail loc (nstore n len s) s - | Utf8Token (st, n) when Unicode.is_unknown st -> - let id = get_buff len in - let u = String.concat "" (List.map (String.make 1) (Stream.npeek n s)) in - warn_unrecognized_unicode ~loc:!@loc (u,id); len - | _ -> len - -let rec number len = parser - | [< ' ('0'..'9' as c); s >] -> number (store len c) s - | [< >] -> len - -let warn_comment_terminator_in_string = - CWarnings.create ~name:"comment-terminator-in-string" ~category:"parsing" - (fun () -> - (strbrk - "Not interpreting \"*)\" as the end of current \ - non-terminated comment because it occurs in a \ - non-terminated string of the comment.")) - -(* If the string being lexed is in a comment, [comm_level] is Some i with i the - current level of comments nesting. Otherwise, [comm_level] is None. *) -let rec string loc ~comm_level bp len = parser - | [< ''"'; esc=(parser [<''"' >] -> true | [< >] -> false); s >] -> - if esc then string loc ~comm_level bp (store len '"') s else (loc, len) - | [< ''('; s >] -> - (parser - | [< ''*'; s >] -> - let comm_level = Option.map succ comm_level in - string loc ~comm_level - bp (store (store len '(') '*') - s - | [< >] -> - string loc ~comm_level bp (store len '(') s) s - | [< ''*'; s >] -> - (parser - | [< '')'; s >] -> - let () = match comm_level with - | Some 0 -> - warn_comment_terminator_in_string ~loc:!@loc () - | _ -> () - in - let comm_level = Option.map pred comm_level in - string loc ~comm_level bp (store (store len '*') ')') s - | [< >] -> - string loc ~comm_level bp (store len '*') s) s - | [< ''\n' as c; s >] ep -> - (* If we are parsing a comment, the string if not part of a token so we - update the first line of the location. Otherwise, we update the last - line. *) - let loc = - if Option.has_some comm_level then bump_loc_line loc ep - else bump_loc_line_last loc ep - in - string loc ~comm_level bp (store len c) s - | [< 'c; s >] -> string loc ~comm_level bp (store len c) s - | [< _ = Stream.empty >] ep -> - let loc = set_loc_pos loc bp ep in - err loc Unterminated_string - -(* To associate locations to a file name *) -let current_file = ref Loc.ToplevelInput - -(* Utilities for comments in beautify *) -let comment_begin = ref None -let comm_loc bp = match !comment_begin with -| None -> comment_begin := Some bp -| _ -> () - -let comments = ref [] -let current_comment = Buffer.create 8192 -let between_commands = ref true - -(* The state of the lexer visible from outside *) -type lexer_state = int option * string * bool * ((int * int) * string) list * Loc.source - -let init_lexer_state f = (None,"",true,[],f) -let set_lexer_state (o,s,b,c,f) = - comment_begin := o; - Buffer.clear current_comment; Buffer.add_string current_comment s; - between_commands := b; - comments := c; - current_file := f -let get_lexer_state () = - (!comment_begin, Buffer.contents current_comment, !between_commands, !comments, !current_file) -let drop_lexer_state () = - set_lexer_state (init_lexer_state Loc.ToplevelInput) - -let get_comment_state (_,_,_,c,_) = c - -let real_push_char c = Buffer.add_char current_comment c - -(* Add a char if it is between two commands, if it is a newline or - if the last char is not a space itself. *) -let push_char c = - if - !between_commands || List.mem c ['\n';'\r'] || - (List.mem c [' ';'\t']&& - (Int.equal (Buffer.length current_comment) 0 || - not (let s = Buffer.contents current_comment in - List.mem s.[String.length s - 1] [' ';'\t';'\n';'\r']))) - then - real_push_char c - -let push_string s = Buffer.add_string current_comment s - -let null_comment s = - let rec null i = - i<0 || (List.mem s.[i] [' ';'\t';'\n';'\r'] && null (i-1)) in - null (String.length s - 1) - -let comment_stop ep = - let current_s = Buffer.contents current_comment in - (if !Flags.beautify && Buffer.length current_comment > 0 && - (!between_commands || not(null_comment current_s)) then - let bp = match !comment_begin with - Some bp -> bp - | None -> - Feedback.msg_notice - (str "No begin location for comment '" - ++ str current_s ++str"' ending at " - ++ int ep); - ep-1 in - comments := ((bp,ep),current_s) :: !comments); - Buffer.clear current_comment; - comment_begin := None; - between_commands := false - -let rec comment loc bp = parser bp2 - | [< ''('; - loc = (parser - | [< ''*'; s >] -> push_string "(*"; comment loc bp s - | [< >] -> push_string "("; loc ); - s >] -> comment loc bp s - | [< ''*'; - loc = parser - | [< '')' >] -> push_string "*)"; loc - | [< s >] -> real_push_char '*'; comment loc bp s >] -> loc - | [< ''"'; s >] -> - let loc, len = string loc ~comm_level:(Some 0) bp2 0 s in - push_string "\""; push_string (get_buff len); push_string "\""; - comment loc bp s - | [< _ = Stream.empty >] ep -> - let loc = set_loc_pos loc bp ep in - err loc Unterminated_comment - | [< ''\n' as z; s >] ep -> real_push_char z; comment (bump_loc_line loc ep) bp s - | [< 'z; s >] -> real_push_char z; comment loc bp s - -(* Parse a special token, using the [token_tree] *) - -(* Peek as much utf-8 lexemes as possible *) -(* and retain the longest valid special token obtained *) -let rec progress_further loc last nj tt cs = - try progress_from_byte loc last nj tt cs (List.nth (Stream.npeek (nj+1) cs) nj) - with Failure _ -> last - -and update_longest_valid_token loc last nj tt cs = - match tt.node with - | Some _ as last' -> - stream_njunk nj cs; - progress_further loc last' 0 tt cs - | None -> - progress_further loc last nj tt cs - -(* nj is the number of char peeked since last valid token *) -(* n the number of char in utf8 block *) -and progress_utf8 loc last nj n c tt cs = - try - let tt = CharMap.find c tt.branch in - if Int.equal n 1 then - update_longest_valid_token loc last (nj+n) tt cs - else - match Util.List.skipn (nj+1) (Stream.npeek (nj+n) cs) with - | l when Int.equal (List.length l) (n - 1) -> - List.iter (check_utf8_trailing_byte loc cs) l; - let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in - update_longest_valid_token loc last (nj+n) tt cs - | _ -> - error_utf8 loc cs - with Not_found -> - last - -and progress_from_byte loc last nj tt cs c = - progress_utf8 loc last nj (utf8_char_size loc cs c) c tt cs - -let find_keyword loc id s = - let tt = ttree_find !token_tree id in - match progress_further loc tt.node 0 tt s with - | None -> raise Not_found - | Some c -> KEYWORD c - -let process_sequence loc bp c cs = - let rec aux n cs = - match Stream.peek cs with - | Some c' when c == c' -> Stream.junk cs; aux (n+1) cs - | _ -> BULLET (String.make n c), set_loc_pos loc bp (Stream.count cs) - in - aux 1 cs - -(* Must be a special token *) -let process_chars loc bp c cs = - let t = progress_from_byte loc None (-1) !token_tree cs c in - let ep = Stream.count cs in - match t with - | Some t -> (KEYWORD t, set_loc_pos loc bp ep) - | None -> - let ep' = bp + utf8_char_size loc cs c in - njunk (ep' - ep) cs; - let loc = set_loc_pos loc bp ep' in - err loc Undefined_token - -(* Parse what follows a dot *) - -let parse_after_dot loc c bp = - parser - | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail loc (store 0 d); s >] -> - let field = get_buff len in - (try find_keyword loc ("."^field) s with Not_found -> FIELD field) - | [< s >] -> - match lookup_utf8 loc s with - | Utf8Token (st, n) when Unicode.is_valid_ident_initial st -> - let len = ident_tail loc (nstore n 0 s) s in - let field = get_buff len in - (try find_keyword loc ("."^field) s with Not_found -> FIELD field) - | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp c s) - -(* Parse what follows a question mark *) - -let parse_after_qmark loc bp s = - match Stream.peek s with - | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK - | None -> KEYWORD "?" - | _ -> - match lookup_utf8 loc s with - | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK - | AsciiChar | Utf8Token _ | EmptyStream -> - fst (process_chars loc bp '?' s) - -let blank_or_eof cs = - match Stream.peek cs with - | None -> true - | Some (' ' | '\t' | '\n' |'\r') -> true - | _ -> false - -(* Parse a token in a char stream *) - -let rec next_token loc = parser bp - | [< ''\n' as c; s >] ep -> - comm_loc bp; push_char c; next_token (bump_loc_line loc ep) s - | [< '' ' | '\t' | '\r' as c; s >] -> - comm_loc bp; push_char c; next_token loc s - | [< ''.' as c; t = parse_after_dot loc c bp; s >] ep -> - comment_stop bp; - (* We enforce that "." should either be part of a larger keyword, - for instance ".(", or followed by a blank or eof. *) - let () = match t with - | KEYWORD ("." | "...") -> - if not (blank_or_eof s) then - err (set_loc_pos loc bp (ep+1)) Undefined_token; - between_commands := true; - | _ -> () - in - (t, set_loc_pos loc bp ep) - | [< ' ('-'|'+'|'*' as c); s >] -> - let t,new_between_commands = - if !between_commands then process_sequence loc bp c s, true - else process_chars loc bp c s,false - in - comment_stop bp; between_commands := new_between_commands; t - | [< ''?'; s >] ep -> - let t = parse_after_qmark loc bp s in - comment_stop bp; (t, set_loc_pos loc bp ep) - | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); - len = ident_tail loc (store 0 c); s >] ep -> - let id = get_buff len in - comment_stop bp; - (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep - | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> - comment_stop bp; - (INT (get_buff len), set_loc_pos loc bp ep) - | [< ''\"'; (loc,len) = string loc ~comm_level:None bp 0 >] ep -> - comment_stop bp; - (STRING (get_buff len), set_loc_pos loc bp ep) - | [< ' ('(' as c); - t = parser - | [< ''*'; s >] -> - comm_loc bp; - push_string "(*"; - let loc = comment loc bp s in next_token loc s - | [< t = process_chars loc bp c >] -> comment_stop bp; t >] -> - t - | [< ' ('{' | '}' as c); s >] ep -> - let t,new_between_commands = - if !between_commands then (KEYWORD (String.make 1 c), set_loc_pos loc bp ep), true - else process_chars loc bp c s, false - in - comment_stop bp; between_commands := new_between_commands; t - | [< s >] -> - match lookup_utf8 loc s with - | Utf8Token (st, n) when Unicode.is_valid_ident_initial st -> - let len = ident_tail loc (nstore n 0 s) s in - let id = get_buff len in - let ep = Stream.count s in - comment_stop bp; - (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep - | AsciiChar | Utf8Token _ -> - let t = process_chars loc bp (Stream.next s) s in - comment_stop bp; t - | EmptyStream -> - comment_stop bp; (EOI, set_loc_pos loc bp (bp+1)) - -(* (* Debug: uncomment this for tracing tokens seen by coq...*) -let next_token loc s = - let (t,loc as r) = next_token loc s in - Printf.eprintf "(line %i, %i-%i)[%s]\n%!" (Ploc.line_nb loc) (Ploc.first_pos loc) (Ploc.last_pos loc) (Tok.to_string t); - r *) - -(* Location table system for creating tables associating a token count - to its location in a char stream (the source) *) - -let locerr () = invalid_arg "Lexer: location function" - -let loct_create () = Hashtbl.create 207 - -let loct_func loct i = - try Hashtbl.find loct i with Not_found -> locerr () - -let loct_add loct i loc = Hashtbl.add loct i loc - -(** {6 The lexer of Coq} *) - -(** Note: removing a token. - We do nothing because [remove_token] is called only when removing a grammar - rule with [Grammar.delete_rule]. The latter command is called only when - unfreezing the state of the grammar entries (see GRAMMAR summary, file - env/metasyntax.ml). Therefore, instead of removing tokens one by one, - we unfreeze the state of the lexer. This restores the behaviour of the - lexer. B.B. *) - -type te = Tok.t - -(** Names of tokens, for this lexer, used in Grammar error messages *) - -let token_text = function - | ("", t) -> "'" ^ t ^ "'" - | ("IDENT", "") -> "identifier" - | ("IDENT", t) -> "'" ^ t ^ "'" - | ("INT", "") -> "integer" - | ("INT", s) -> "'" ^ s ^ "'" - | ("STRING", "") -> "string" - | ("EOI", "") -> "end of input" - | (con, "") -> con - | (con, prm) -> con ^ " \"" ^ prm ^ "\"" - -let func cs = - let loct = loct_create () in - let cur_loc = ref (from_coqloc !current_file 1 0 0 0) in - let ts = - Stream.from - (fun i -> - let (tok, loc) = next_token !cur_loc cs in - cur_loc := after loc; - loct_add loct i loc; Some tok) - in - (ts, loct_func loct) - -let lexer = { - Token.tok_func = func; - Token.tok_using = - (fun pat -> match Tok.of_pattern pat with - | KEYWORD s -> add_keyword s - | _ -> ()); - Token.tok_removing = (fun _ -> ()); - Token.tok_match = Tok.match_pattern; - Token.tok_comm = None; - Token.tok_text = token_text } - -(** Terminal symbols interpretation *) - -let is_ident_not_keyword s = - is_ident s && not (is_keyword s) - -let is_number s = - let rec aux i = - Int.equal (String.length s) i || - match s.[i] with '0'..'9' -> aux (i+1) | _ -> false - in aux 0 - -let strip s = - let len = - let rec loop i len = - if Int.equal i (String.length s) then len - else if s.[i] == ' ' then loop (i + 1) len - else loop (i + 1) (len + 1) - in - loop 0 0 - in - if len == String.length s then s - else - let s' = Bytes.create len in - let rec loop i i' = - if i == String.length s then s' - else if s.[i] == ' ' then loop (i + 1) i' - else begin Bytes.set s' i' s.[i]; loop (i + 1) (i' + 1) end - in - Bytes.to_string (loop 0 0) - -let terminal s = - let s = strip s in - let () = match s with "" -> failwith "empty token." | _ -> () in - if is_ident_not_keyword s then IDENT s - else if is_number s then INT s - else KEYWORD s -- cgit v1.2.3 From 43afa49511033d7bbe5dec72e2e861930f8f9126 Mon Sep 17 00:00:00 2001 From: Pierre-Marie Pédrot Date: Thu, 18 Oct 2018 11:29:13 +0200 Subject: Adding a rule to generate grammar.cma. Since the removal of the dependency in camlp5 for CLexer, it looks like this file was never generated, leading to installation failure. We add it as a dependency of the install step. --- Makefile.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.build b/Makefile.build index 4d19f9a2e1..77fcfc0abf 100644 --- a/Makefile.build +++ b/Makefile.build @@ -406,7 +406,7 @@ grammar/%.cmi: grammar/%.mli .PHONY: coqbinaries coqbyte -coqbinaries: $(TOPBINOPT) $(COQTOPEXE) $(CHICKEN) $(CSDPCERT) $(FAKEIDE) +coqbinaries: $(TOPBINOPT) $(COQTOPEXE) $(CHICKEN) $(CSDPCERT) $(FAKEIDE) $(GRAMMARCMA) coqbyte: $(TOPBYTE) $(CHICKENBYTE) # Special rule for coqtop, we imitate `ocamlopt` can delete the target -- cgit v1.2.3