diff options
| author | Pierre-Marie Pédrot | 2019-03-31 23:07:48 +0200 |
|---|---|---|
| committer | Pierre-Marie Pédrot | 2019-03-31 23:07:48 +0200 |
| commit | 5dd3c18f4e50eef53ae4413b7b80951f17edad5f (patch) | |
| tree | 0a104d6afc109b7b89c8997d1afb3bc9f1e89dc3 /parsing | |
| parent | cb502e44aac8328ffd6c37429e050a01f72b2c53 (diff) | |
| parent | f832476404a29d7791c1a6d7970988d3b2e3ad9e (diff) | |
Merge PR #9733: [lexer] keyword protected quotation token for arbitrary text
Ack-by: Zimmi48
Ack-by: ejgallego
Ack-by: gares
Reviewed-by: ppedrot
Diffstat (limited to 'parsing')
| -rw-r--r-- | parsing/cLexer.ml | 104 | ||||
| -rw-r--r-- | parsing/cLexer.mli | 28 | ||||
| -rw-r--r-- | parsing/extend.ml | 6 | ||||
| -rw-r--r-- | parsing/notation_gram.ml | 2 | ||||
| -rw-r--r-- | parsing/pcoq.ml | 19 | ||||
| -rw-r--r-- | parsing/tok.ml | 85 | ||||
| -rw-r--r-- | parsing/tok.mli | 21 |
7 files changed, 179 insertions, 86 deletions
diff --git a/parsing/cLexer.ml b/parsing/cLexer.ml index 503cfcdb4f..33890545da 100644 --- a/parsing/cLexer.ml +++ b/parsing/cLexer.ml @@ -19,16 +19,19 @@ open Gramlib module CharOrd = struct type t = char let compare : char -> char -> int = compare end module CharMap = Map.Make (CharOrd) +type starts_quotation = NoQuotation | Quotation + type ttree = { - node : string option; - branch : ttree CharMap.t } + node : (string * starts_quotation) option; + branch : ttree CharMap.t; +} let empty_ttree = { node = None; branch = CharMap.empty } -let ttree_add ttree str = +let ttree_add ttree (str,quot) = let rec insert tt i = if i == String.length str then - {node = Some str; branch = tt.branch} + {node = Some (str,quot); branch = tt.branch} else let c = str.[i] in let br = @@ -75,7 +78,7 @@ let ttree_elements ttree = let rec elts tt accu = let accu = match tt.node with | None -> accu - | Some s -> CString.Set.add s accu + | Some (s,_) -> CString.Set.add s accu in CharMap.fold (fun _ tt accu -> elts tt accu) tt.branch accu in @@ -259,11 +262,11 @@ let is_keyword s = try match (ttree_find !token_tree s).node with None -> false | Some _ -> true with Not_found -> false -let add_keyword str = +let add_keyword ?(quotation=NoQuotation) str = if not (is_keyword str) then begin check_keyword str; - token_tree := ttree_add !token_tree str + token_tree := ttree_add !token_tree (str,quotation) end let remove_keyword str = @@ -529,11 +532,62 @@ and progress_utf8 loc last nj n c tt cs = and progress_from_byte loc last nj tt cs c = progress_utf8 loc last nj (utf8_char_size loc cs c) c tt cs +type marker = Delimited of int * char list * char list | ImmediateAsciiIdent + +let peek_marker_len b e s = + let rec peek n = + match stream_nth n s with + | c -> if c = b then peek (n+1) else n, List.make n b, List.make n e + | exception Stream.Failure -> n, List.make n b, List.make n e + in + let len, start, stop = peek 0 in + if len = 0 then raise Stream.Failure + else Delimited (len, start, stop) + +let peek_marker s = + match stream_nth 0 s with + | '(' -> peek_marker_len '(' ')' s + | '[' -> peek_marker_len '[' ']' s + | '{' -> peek_marker_len '{' '}' s + | ('a'..'z' | 'A'..'Z' | '_') -> ImmediateAsciiIdent + | _ -> raise Stream.Failure + +let parse_quotation loc s = + match peek_marker s with + | ImmediateAsciiIdent -> + let c = Stream.next s in + let len = + try ident_tail loc (store 0 c) s with + Stream.Failure -> raise (Stream.Error "") + in + get_buff len + | Delimited (lenmarker, bmarker, emarker) -> + let b = Buffer.create 80 in + let commit1 c = Buffer.add_char b c; Stream.junk s in + let commit l = List.iter commit1 l in + let rec quotation depth = + match Stream.npeek lenmarker s with + | l when l = bmarker -> + commit l; + quotation (depth + 1) + | l when l = emarker -> + commit l; + if depth > 1 then quotation (depth - 1) + | c :: cs -> + commit1 c; + quotation depth + | [] -> raise Stream.Failure + in + quotation 0; + Buffer.contents b + + let find_keyword loc id s = let tt = ttree_find !token_tree id in match progress_further loc tt.node 0 tt s with | None -> raise Not_found - | Some c -> KEYWORD c + | Some (c,NoQuotation) -> KEYWORD c + | Some (c,Quotation) -> QUOTATION(c, parse_quotation loc s) let process_sequence loc bp c cs = let rec aux n cs = @@ -548,7 +602,8 @@ let process_chars ~diff_mode loc bp c cs = let t = progress_from_byte loc None (-1) !token_tree cs c in let ep = Stream.count cs in match t with - | Some t -> (KEYWORD t, set_loc_pos loc bp ep) + | Some (t,NoQuotation) -> (KEYWORD t, set_loc_pos loc bp ep) + | Some (c,Quotation) -> (QUOTATION(c, parse_quotation loc cs), set_loc_pos loc bp ep) | None -> let ep' = bp + utf8_char_size loc cs c in if diff_mode then begin @@ -740,15 +795,15 @@ type te = Tok.t (** Names of tokens, for this lexer, used in Grammar error messages *) let token_text = function - | ("", t) -> "'" ^ t ^ "'" - | ("IDENT", "") -> "identifier" - | ("IDENT", t) -> "'" ^ t ^ "'" - | ("INT", "") -> "integer" - | ("INT", s) -> "'" ^ s ^ "'" - | ("STRING", "") -> "string" - | ("EOI", "") -> "end of input" - | (con, "") -> con - | (con, prm) -> con ^ " \"" ^ prm ^ "\"" + | ("", Some t) -> "'" ^ t ^ "'" + | ("IDENT", None) -> "identifier" + | ("IDENT", Some t) -> "'" ^ t ^ "'" + | ("INT", None) -> "integer" + | ("INT", Some s) -> "'" ^ s ^ "'" + | ("STRING", None) -> "string" + | ("EOI", None) -> "end of input" + | (con, None) -> con + | (con, Some prm) -> con ^ " \"" ^ prm ^ "\"" let func next_token ?loc cs = let loct = loct_create () in @@ -765,9 +820,10 @@ let func next_token ?loc cs = let make_lexer ~diff_mode = { Plexing.tok_func = func (next_token ~diff_mode); Plexing.tok_using = - (fun pat -> match Tok.of_pattern pat with - | KEYWORD s -> add_keyword s - | _ -> ()); + (fun pat -> match Tok.is_keyword pat with + | Some (false,s) -> add_keyword ~quotation:NoQuotation s + | Some (true,s) -> add_keyword ~quotation:Quotation s + | None -> ()); Plexing.tok_removing = (fun _ -> ()); Plexing.tok_match = Tok.match_pattern; Plexing.tok_text = token_text } @@ -807,6 +863,6 @@ let strip s = let terminal s = let s = strip s in let () = match s with "" -> failwith "empty token." | _ -> () in - if is_ident_not_keyword s then IDENT s - else if is_number s then INT s - else KEYWORD s + if is_ident_not_keyword s then "IDENT", Some s + else if is_number s then "INT", Some s + else "", Some s diff --git a/parsing/cLexer.mli b/parsing/cLexer.mli index 807f37a1a4..6d5a621877 100644 --- a/parsing/cLexer.mli +++ b/parsing/cLexer.mli @@ -8,8 +8,32 @@ (* * (see LICENSE file for the text of the license) *) (************************************************************************) +(** When one registers a keyword she can declare it starts a quotation. + In particular using QUOTATION("name:") in a grammar rule + declares "name:" as a keyword and the token QUOTATION is + matched whenever the keyword is followed by an identifier or a + parenthesized text. Eg + + constr:x + string:[....] + ltac:(....) + ltac:{....} + + The delimiter is made of 1 or more occurrences of the same parenthesis, + eg ((.....)) or [[[[....]]]]. The idea being that if the text happens to + contain the closing delimiter, one can make the delimiter longer and avoid + confusion (no escaping). Eg + + string:[[ .. ']' .. ]] + + + Nesting the delimiter is allowed, eg ((..((...))..)) is OK. + + Keywords don't need to end in ':' *) +type starts_quotation = NoQuotation | Quotation + (** This should be functional but it is not due to the interface *) -val add_keyword : string -> unit +val add_keyword : ?quotation:starts_quotation -> string -> unit val remove_keyword : string -> unit val is_keyword : string -> bool val keywords : unit -> CString.Set.t @@ -21,7 +45,7 @@ val get_keyword_state : unit -> keyword_state val check_ident : string -> unit val is_ident : string -> bool val check_keyword : string -> unit -val terminal : string -> Tok.t +val terminal : string -> Tok.pattern (** The lexer of Coq: *) diff --git a/parsing/extend.ml b/parsing/extend.ml index 9b5537d7f6..ff879dd1c2 100644 --- a/parsing/extend.ml +++ b/parsing/extend.ml @@ -44,7 +44,7 @@ type simple_constr_prod_entry_key = (** Entries used in productions (in right-hand-side of grammar rules), to parse non-terminals *) -type binder_entry_kind = ETBinderOpen | ETBinderClosed of Tok.t list +type binder_entry_kind = ETBinderOpen | ETBinderClosed of Tok.pattern list type binder_target = ForBinder | ForTerm @@ -54,7 +54,7 @@ type constr_prod_entry_key = | ETProdBigint (* Parsed as an (unbounded) integer *) | ETProdConstr of Constrexpr.notation_entry * (production_level * production_position) (* Parsed as constr or pattern, or a subentry of those *) | ETProdPattern of int (* Parsed as pattern as a binder (as subpart of a constr) *) - | ETProdConstrList of (production_level * production_position) * Tok.t list (* Parsed as non-empty list of constr *) + | ETProdConstrList of (production_level * production_position) * Tok.pattern list (* Parsed as non-empty list of constr *) | ETProdBinderList of binder_entry_kind (* Parsed as non-empty list of local binders *) (** {5 AST for user-provided entries} *) @@ -80,7 +80,7 @@ type ('a,'b,'c) ty_user_symbol = (** {5 Type-safe grammar extension} *) type ('self, 'a) symbol = -| Atoken : Tok.t -> ('self, string) symbol +| Atoken : Tok.pattern -> ('self, string) symbol | Alist1 : ('self, 'a) symbol -> ('self, 'a list) symbol | Alist1sep : ('self, 'a) symbol * ('self, _) symbol -> ('self, 'a list) symbol | Alist0 : ('self, 'a) symbol -> ('self, 'a list) symbol diff --git a/parsing/notation_gram.ml b/parsing/notation_gram.ml index fc5feba58b..a50f8d69e3 100644 --- a/parsing/notation_gram.ml +++ b/parsing/notation_gram.ml @@ -21,7 +21,7 @@ type level = Constrexpr.notation_entry * precedence * tolerability list * constr (* first argument is InCustomEntry s for custom entries *) type grammar_constr_prod_item = - | GramConstrTerminal of Tok.t + | GramConstrTerminal of Tok.pattern | GramConstrNonTerminal of Extend.constr_prod_entry_key * Id.t option | GramConstrListMark of int * bool * int (* tells action rule to make a list of the n previous parsed items; diff --git a/parsing/pcoq.ml b/parsing/pcoq.ml index 9241205755..f2f5d17da3 100644 --- a/parsing/pcoq.ml +++ b/parsing/pcoq.ml @@ -119,25 +119,10 @@ struct end module Symbols : sig - val stoken : Tok.t -> ('s, string) G.ty_symbol val slist0sep : ('s, 'a) G.ty_symbol -> ('s, 'b) G.ty_symbol -> ('s, 'a list) G.ty_symbol val slist1sep : ('s, 'a) G.ty_symbol -> ('s, 'b) G.ty_symbol -> ('s, 'a list) G.ty_symbol end = struct - let stoken tok = - let pattern = match tok with - | Tok.KEYWORD s -> "", s - | Tok.IDENT s -> "IDENT", s - | Tok.PATTERNIDENT s -> "PATTERNIDENT", s - | Tok.FIELD s -> "FIELD", s - | Tok.INT s -> "INT", s - | Tok.STRING s -> "STRING", s - | Tok.LEFTQMARK -> "LEFTQMARK", "" - | Tok.BULLET s -> "BULLET", s - | Tok.EOI -> "EOI", "" - in - G.s_token pattern - let slist0sep x y = G.s_list0sep x y false let slist1sep x y = G.s_list1sep x y false end @@ -158,7 +143,7 @@ end type ('s, 'a, 'r) casted_rule = Casted : ('s, 'b, 'r) G.ty_rule * ('a -> 'b) -> ('s, 'a, 'r) casted_rule let rec symbol_of_prod_entry_key : type s a. (s, a) symbol -> (s, a) G.ty_symbol = function -| Atoken t -> Symbols.stoken t +| Atoken t -> G.s_token t | Alist1 s -> G.s_list1 (symbol_of_prod_entry_key s) | Alist1sep (s,sep) -> Symbols.slist1sep (symbol_of_prod_entry_key s) (symbol_of_prod_entry_key sep) @@ -303,7 +288,7 @@ let make_rule r = [None, None, r] let eoi_entry en = let e = Entry.create ((Gram.Entry.name en) ^ "_eoi") in - let symbs = G.r_next (G.r_next G.r_stop (G.s_nterm en)) (Symbols.stoken Tok.EOI) in + let symbs = G.r_next (G.r_next G.r_stop (G.s_nterm en)) (G.s_token Tok.pattern_for_EOI) in let act = fun _ x loc -> x in let warning msg = Feedback.msg_warning Pp.(str msg) in Gram.safe_extend ~warning:(Some warning) e None (make_rule [G.production (symbs, act)]); diff --git a/parsing/tok.ml b/parsing/tok.ml index 03825e350f..f0846dcf90 100644 --- a/parsing/tok.ml +++ b/parsing/tok.ml @@ -21,6 +21,7 @@ type t = | STRING of string | LEFTQMARK | BULLET of string + | QUOTATION of string * string | EOI let equal t1 t2 = match t1, t2 with @@ -34,6 +35,7 @@ let equal t1 t2 = match t1, t2 with | LEFTQMARK, LEFTQMARK -> true | BULLET s1, BULLET s2 -> string_equal s1 s2 | EOI, EOI -> true +| QUOTATION(s1,t1), QUOTATION(s2,t2) -> string_equal s1 s2 && string_equal t1 t2 | _ -> false let extract_string diff_mode = function @@ -58,6 +60,7 @@ let extract_string diff_mode = function | INT s -> s | LEFTQMARK -> "?" | BULLET s -> s + | QUOTATION(_,s) -> s | EOI -> "" let to_string = function @@ -69,12 +72,27 @@ let to_string = function | STRING s -> Format.sprintf "STRING %S" s | LEFTQMARK -> "LEFTQMARK" | BULLET s -> Format.sprintf "BULLET %S" s + | QUOTATION(lbl,s) -> lbl ^ s | EOI -> "EOI" let match_keyword kwd = function | KEYWORD kwd' when kwd = kwd' -> true | _ -> false +(* Invariant, txt is "ident" or a well parenthesized "{{....}}" *) +let trim_quotation txt = + let len = String.length txt in + if len = 0 then None, txt + else + let c = txt.[0] in + if c = '(' || c = '[' || c = '{' then + let rec aux n = + if n < len && txt.[n] = c then aux (n+1) + else Some c, String.sub txt n (len - (2*n)) + in + aux 0 + else None, txt + (* Needed to fix Camlp5 signature. Cannot use Pp because of silly Tox -> Compat -> Pp dependency *) let print ppf tok = Format.pp_print_string ppf (to_string tok) @@ -82,41 +100,38 @@ let print ppf tok = Format.pp_print_string ppf (to_string tok) (** For camlp5, conversion from/to [Plexing.pattern], and a match function analoguous to [Plexing.default_match] *) -let of_pattern = function - | "", s -> KEYWORD s - | "IDENT", s -> IDENT s - | "PATTERNIDENT", s -> PATTERNIDENT s - | "FIELD", s -> FIELD s - | "INT", s -> INT s - | "STRING", s -> STRING s - | "LEFTQMARK", _ -> LEFTQMARK - | "BULLET", s -> BULLET s - | "EOI", _ -> EOI - | _ -> failwith "Tok.of_pattern: not a constructor" +type pattern = string * string option + +let is_keyword = function + | "", Some s -> Some (false,s) + | "QUOTATION", Some s -> Some (true,s) + | _ -> None -let to_pattern = function - | KEYWORD s -> "", s - | IDENT s -> "IDENT", s - | PATTERNIDENT s -> "PATTERNIDENT", s - | FIELD s -> "FIELD", s - | INT s -> "INT", s - | STRING s -> "STRING", s - | LEFTQMARK -> "LEFTQMARK", "" - | BULLET s -> "BULLET", s - | EOI -> "EOI", "" +let pattern_for_EOI = ("EOI",None) +let pattern_for_KEYWORD s = ("",Some s) +let pattern_for_IDENT s = ("IDENT",Some s) -let match_pattern = +let match_pattern (key, value) = let err () = raise Stream.Failure in - function - | "", "" -> (function KEYWORD s -> s | _ -> err ()) - | "IDENT", "" -> (function IDENT s -> s | _ -> err ()) - | "PATTERNIDENT", "" -> (function PATTERNIDENT s -> s | _ -> err ()) - | "FIELD", "" -> (function FIELD s -> s | _ -> err ()) - | "INT", "" -> (function INT s -> s | _ -> err ()) - | "STRING", "" -> (function STRING s -> s | _ -> err ()) - | "LEFTQMARK", "" -> (function LEFTQMARK -> "" | _ -> err ()) - | "BULLET", "" -> (function BULLET s -> s | _ -> err ()) - | "EOI", "" -> (function EOI -> "" | _ -> err ()) - | pat -> - let tok = of_pattern pat in - function tok' -> if equal tok tok' then snd pat else err () + let cond x = + match value with + | None -> x + | Some value -> if string_equal value x then x else err () in + match key with + | "" -> (function KEYWORD s -> cond s | _ -> err ()) + | "IDENT" when value <> None -> (function (IDENT s | KEYWORD s) -> cond s | _ -> err ()) + | "IDENT" -> (function IDENT s -> cond s | _ -> err ()) + | "PATTERNIDENT" -> (function PATTERNIDENT s -> cond s | _ -> err ()) + | "FIELD" -> (function FIELD s -> cond s | _ -> err ()) + | "INT" -> (function INT s -> cond s | _ -> err ()) + | "STRING" -> (function STRING s -> cond s | _ -> err ()) + | "LEFTQMARK" -> (function LEFTQMARK -> cond "" | _ -> err ()) + | "BULLET" -> (function BULLET s -> cond s | _ -> err ()) + | "QUOTATION" -> (function + | QUOTATION(lbl,s) -> + begin match value with + | None -> assert false + | Some lbl1 -> if string_equal lbl1 lbl then s else err () end + | _ -> err ()) + | "EOI" -> (function EOI -> cond "" | _ -> err ()) + | p -> CErrors.anomaly Pp.(str "Tok: unknown pattern " ++ str p) diff --git a/parsing/tok.mli b/parsing/tok.mli index 5750096a28..8a255be06f 100644 --- a/parsing/tok.mli +++ b/parsing/tok.mli @@ -19,6 +19,7 @@ type t = | STRING of string | LEFTQMARK | BULLET of string + | QUOTATION of string * string | EOI val equal : t -> t -> bool @@ -29,7 +30,19 @@ val to_string : t -> string val print : Format.formatter -> t -> unit val match_keyword : string -> t -> bool -(** for camlp5 *) -val of_pattern : string*string -> t -val to_pattern : t -> string*string -val match_pattern : string*string -> t -> string +(** Utility function for the test returned by a QUOTATION token: + It returns the delimiter parenthesis, if any, and the text + without delimiters. Eg `{{{ text }}}` -> Some '{', ` text ` *) +val trim_quotation : string -> char option * string + +(** for camlp5, + eg GRAMMAR EXTEND ..... [ IDENT "x" -> .... END + is a pattern ("IDENT", Some "x") +*) +type pattern = string * string option (* = Plexing.pattern *) + +val is_keyword : pattern -> (bool * string) option +val pattern_for_EOI : pattern +val pattern_for_KEYWORD : string -> pattern +val pattern_for_IDENT : string -> pattern +val match_pattern : pattern -> t -> string |
