aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--parsing/cLexer.ml59
-rw-r--r--parsing/cLexer.mli11
-rw-r--r--parsing/tok.ml25
-rw-r--r--printing/proof_diffs.ml3
-rw-r--r--test-suite/unit-tests/printing/proof_diffs_test.ml35
5 files changed, 82 insertions, 51 deletions
diff --git a/parsing/cLexer.ml b/parsing/cLexer.ml
index c2b7fa117d..49d6cf01d9 100644
--- a/parsing/cLexer.ml
+++ b/parsing/cLexer.ml
@@ -548,20 +548,27 @@ let process_sequence loc bp c cs =
aux 1 cs
(* Must be a special token *)
-let process_chars loc bp c cs =
+let process_chars ~diff_mode loc bp c cs =
let t = progress_from_byte loc None (-1) !token_tree cs c in
let ep = Stream.count cs in
match t with
| Some t -> (KEYWORD t, set_loc_pos loc bp ep)
| None ->
let ep' = bp + utf8_char_size loc cs c in
- njunk (ep' - ep) cs;
- let loc = set_loc_pos loc bp ep' in
- err loc Undefined_token
+ if diff_mode then begin
+ let len = ep' - bp in
+ ignore (store 0 c);
+ ignore (nstore (len - 1) 1 cs);
+ IDENT (get_buff len), set_loc_pos loc bp ep
+ end else begin
+ njunk (ep' - ep) cs;
+ let loc = set_loc_pos loc bp ep' in
+ err loc Undefined_token
+ end
(* Parse what follows a dot *)
-let parse_after_dot loc c bp s = match Stream.peek s with
+let parse_after_dot ~diff_mode loc c bp s = match Stream.peek s with
| Some ('a'..'z' | 'A'..'Z' | '_' as d) ->
Stream.junk s;
let len =
@@ -576,11 +583,11 @@ let parse_after_dot loc c bp s = match Stream.peek s with
let len = ident_tail loc (nstore n 0 s) s in
let field = get_buff len in
(try find_keyword loc ("."^field) s with Not_found -> FIELD field)
- | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp c s)
+ | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars ~diff_mode loc bp c s)
(* Parse what follows a question mark *)
-let parse_after_qmark loc bp s =
+let parse_after_qmark ~diff_mode loc bp s =
match Stream.peek s with
| Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK
| None -> KEYWORD "?"
@@ -588,7 +595,7 @@ let parse_after_qmark loc bp s =
match lookup_utf8 loc s with
| Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK
| AsciiChar | Utf8Token _ | EmptyStream ->
- fst (process_chars loc bp '?' s)
+ fst (process_chars ~diff_mode loc bp '?' s)
let blank_or_eof cs =
match Stream.peek cs with
@@ -598,20 +605,20 @@ let blank_or_eof cs =
(* Parse a token in a char stream *)
-let rec next_token loc s =
+let rec next_token ~diff_mode loc s =
let bp = Stream.count s in
match Stream.peek s with
| Some ('\n' as c) ->
Stream.junk s;
let ep = Stream.count s in
- comm_loc bp; push_char c; next_token (bump_loc_line loc ep) s
+ comm_loc bp; push_char c; next_token ~diff_mode (bump_loc_line loc ep) s
| Some (' ' | '\t' | '\r' as c) ->
Stream.junk s;
- comm_loc bp; push_char c; next_token loc s
+ comm_loc bp; push_char c; next_token ~diff_mode loc s
| Some ('.' as c) ->
Stream.junk s;
let t =
- try parse_after_dot loc c bp s with
+ try parse_after_dot ~diff_mode loc c bp s with
Stream.Failure -> raise (Stream.Error "")
in
let ep = Stream.count s in
@@ -630,13 +637,13 @@ let rec next_token loc s =
Stream.junk s;
let t,new_between_commands =
if !between_commands then process_sequence loc bp c s, true
- else process_chars loc bp c s,false
+ else process_chars ~diff_mode loc bp c s,false
in
comment_stop bp; between_commands := new_between_commands; t
| Some '?' ->
Stream.junk s;
let ep = Stream.count s in
- let t = parse_after_qmark loc bp s in
+ let t = parse_after_qmark ~diff_mode loc bp s in
comment_stop bp; (t, set_loc_pos loc bp ep)
| Some ('a'..'z' | 'A'..'Z' | '_' as c) ->
Stream.junk s;
@@ -670,12 +677,16 @@ let rec next_token loc s =
Stream.junk s;
begin try
match Stream.peek s with
+ | Some '*' when diff_mode ->
+ Stream.junk s;
+ let ep = Stream.count s in
+ (IDENT "(*", set_loc_pos loc bp ep)
| Some '*' ->
Stream.junk s;
comm_loc bp;
push_string "(*";
- let loc = comment loc bp s in next_token loc s
- | _ -> let t = process_chars loc bp c s in comment_stop bp; t
+ let loc = comment loc bp s in next_token ~diff_mode loc s
+ | _ -> let t = process_chars ~diff_mode loc bp c s in comment_stop bp; t
with Stream.Failure -> raise (Stream.Error "")
end
| Some ('{' | '}' as c) ->
@@ -683,7 +694,7 @@ let rec next_token loc s =
let ep = Stream.count s in
let t,new_between_commands =
if !between_commands then (KEYWORD (String.make 1 c), set_loc_pos loc bp ep), true
- else process_chars loc bp c s, false
+ else process_chars ~diff_mode loc bp c s, false
in
comment_stop bp; between_commands := new_between_commands; t
| _ ->
@@ -695,14 +706,14 @@ let rec next_token loc s =
comment_stop bp;
(try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
| AsciiChar | Utf8Token _ ->
- let t = process_chars loc bp (Stream.next s) s in
+ let t = process_chars ~diff_mode loc bp (Stream.next s) s in
comment_stop bp; t
| EmptyStream ->
comment_stop bp; (EOI, set_loc_pos loc bp (bp+1))
(* (* Debug: uncomment this for tracing tokens seen by coq...*)
-let next_token loc s =
- let (t,loc as r) = next_token loc s in
+let next_token ~diff_mode loc s =
+ let (t,loc as r) = next_token ~diff_mode loc s in
Printf.eprintf "(line %i, %i-%i)[%s]\n%!" (Ploc.line_nb loc) (Ploc.first_pos loc) (Ploc.last_pos loc) (Tok.to_string t);
r *)
@@ -743,7 +754,7 @@ let token_text = function
| (con, "") -> con
| (con, prm) -> con ^ " \"" ^ prm ^ "\""
-let func cs =
+let func next_token cs =
let loct = loct_create () in
let cur_loc = ref (Loc.create !current_file 1 0 0 0) in
let ts =
@@ -755,8 +766,8 @@ let func cs =
in
(ts, loct_func loct)
-let lexer = {
- Plexing.tok_func = func;
+let make_lexer ~diff_mode = {
+ Plexing.tok_func = func (next_token ~diff_mode);
Plexing.tok_using =
(fun pat -> match Tok.of_pattern pat with
| KEYWORD s -> add_keyword s
@@ -765,6 +776,8 @@ let lexer = {
Plexing.tok_match = Tok.match_pattern;
Plexing.tok_text = token_text }
+let lexer = make_lexer ~diff_mode:false
+
(** Terminal symbols interpretation *)
let is_ident_not_keyword s =
diff --git a/parsing/cLexer.mli b/parsing/cLexer.mli
index c0ebdd45ef..af3fd7f318 100644
--- a/parsing/cLexer.mli
+++ b/parsing/cLexer.mli
@@ -56,3 +56,14 @@ val set_lexer_state : lexer_state -> unit
val get_lexer_state : unit -> lexer_state
val drop_lexer_state : unit -> unit
val get_comment_state : lexer_state -> ((int * int) * string) list
+
+(** Create a lexer. true enables alternate handling for computing diffs.
+It ensures that, ignoring white space, the concatenated tokens equal the input
+string. Specifically:
+- for strings, return the enclosing quotes as tokens and treat the quoted value
+as if it was unquoted, possibly becoming multiple tokens
+- for comments, return the "(*" as a token and treat the contents of the comment as if
+it was not in a comment, possibly becoming multiple tokens
+- return any unrecognized Ascii or UTF-8 character as a string
+*)
+val make_lexer : diff_mode:bool -> Tok.t Gramlib.Plexing.lexer
diff --git a/parsing/tok.ml b/parsing/tok.ml
index c0d5b6742d..03825e350f 100644
--- a/parsing/tok.ml
+++ b/parsing/tok.ml
@@ -40,18 +40,19 @@ let extract_string diff_mode = function
| KEYWORD s -> s
| IDENT s -> s
| STRING s ->
- if diff_mode then
- let escape_quotes s =
- let len = String.length s in
- let buf = Buffer.create len in
- for i = 0 to len-1 do
- let ch = String.get s i in
- Buffer.add_char buf ch;
- if ch = '"' then Buffer.add_char buf '"' else ()
- done;
- Buffer.contents buf
- in
- "\"" ^ (escape_quotes s) ^ "\"" else s
+ if diff_mode then
+ let escape_quotes s =
+ let len = String.length s in
+ let buf = Buffer.create len in
+ for i = 0 to len-1 do
+ let ch = String.get s i in
+ Buffer.add_char buf ch;
+ if ch = '"' then Buffer.add_char buf '"' else ()
+ done;
+ Buffer.contents buf
+ in
+ "\"" ^ (escape_quotes s) ^ "\""
+ else s
| PATTERNIDENT s -> s
| FIELD s -> if diff_mode then "." ^ s else s
| INT s -> s
diff --git a/printing/proof_diffs.ml b/printing/proof_diffs.ml
index b280ce909b..c1ea067567 100644
--- a/printing/proof_diffs.ml
+++ b/printing/proof_diffs.ml
@@ -88,7 +88,8 @@ let tokenize_string s =
let st = CLexer.get_lexer_state () in
try
let istr = Stream.of_string s in
- let lex = CLexer.lexer.Gramlib.Plexing.tok_func istr in
+ let lexer = CLexer.make_lexer ~diff_mode:true in
+ let lex = lexer.Gramlib.Plexing.tok_func istr in
let toks = stream_tok [] (fst lex) in
CLexer.set_lexer_state st;
toks
diff --git a/test-suite/unit-tests/printing/proof_diffs_test.ml b/test-suite/unit-tests/printing/proof_diffs_test.ml
index 7f9e6cc6e0..d0b8d21b69 100644
--- a/test-suite/unit-tests/printing/proof_diffs_test.ml
+++ b/test-suite/unit-tests/printing/proof_diffs_test.ml
@@ -51,23 +51,28 @@ let t () =
assert_equal ~msg:"has `Removed" ~printer:string_of_bool true has_removed
let _ = add_test "diff_str add/remove" t
-(* example of a limitation, not really a test *)
-let t () =
- try
- let _ = diff_str "a" ">" in
- assert_failure "unlexable string gives an exception"
- with _ -> ()
-let _ = add_test "diff_str unlexable" t
-
-(* problematic examples for tokenize_string:
- comments omitted
- quoted string loses quote marks (are escapes supported/handled?)
- char constant split into 2
+(* lexer tweaks:
+ comments are lexed as multiple tokens
+ strings tokens include begin/end quotes and embedded ""
+ single multibyte characters returned even if they're not keywords
+
+ inputs that give a lexer failure (but no use case needs them yet):
+ ".12"
+ unterminated string
+ invalid UTF-8 sequences
*)
let t () =
- List.iter (fun x -> cprintf "'%s' " x) (tokenize_string "(* comment *) \"string\" 'c' xx");
- cprintf "\n"
-let _ = add_test "tokenize_string examples" t
+ let str = "(* comment.field *) ?id () \"str\"\"ing\" \\ := Ж > ∃ 'c' xx" in
+ let toks = tokenize_string str in
+ (*List.iter (fun x -> cprintf "'%s' " x) toks;*)
+ (*cprintf "\n";*)
+ let str_no_white = String.concat "" (String.split_on_char ' ' str) in
+ assert_equal ~printer:(fun x -> x) str_no_white (String.concat "" toks);
+ List.iter (fun s ->
+ assert_equal ~msg:("'" ^ s ^ "' is a single token") ~printer:string_of_bool true (List.mem s toks))
+ [ "(*"; "()"; ":="]
+
+let _ = add_test "tokenize_string/diff_mode in lexer" t
open Pp