From 70a139733dfaa42d665a85736ee4324d926723ed Mon Sep 17 00:00:00 2001
From: Jim Fehrle
Date: Sun, 7 Oct 2018 15:14:56 -0700
Subject: Make diffs work for more input strings

Diff code uses the lexer to recognize tokens in the inputs, which can be
Pp.t's or strings.  To add the highlights in the Pp.t, the diff code
matches characters in the input to characters in the tokens.  Current
code fails for inputs containing quote marks or "(*" because the quote
marks and comments don't appear in the tokens.  This commit adds a "diff
mode" to the lexer to return those characters, making the diff routine
more robust.
---
 parsing/cLexer.ml                                  | 59 +++++++++++++---------
 parsing/cLexer.mli                                 | 11 ++++
 parsing/tok.ml                                     | 25 ++++-----
 printing/proof_diffs.ml                            |  3 +-
 test-suite/unit-tests/printing/proof_diffs_test.ml | 35 +++++++------
 5 files changed, 82 insertions(+), 51 deletions(-)

diff --git a/parsing/cLexer.ml b/parsing/cLexer.ml
index c2b7fa117d..49d6cf01d9 100644
--- a/parsing/cLexer.ml
+++ b/parsing/cLexer.ml
@@ -548,20 +548,27 @@ let process_sequence loc bp c cs =
   aux 1 cs
 
 (* Must be a special token *)
-let process_chars loc bp c cs =
+let process_chars ~diff_mode loc bp c cs =
   let t = progress_from_byte loc None (-1) !token_tree cs c in
   let ep = Stream.count cs in
   match t with
     | Some t -> (KEYWORD t, set_loc_pos loc bp ep)
     | None ->
         let ep' = bp + utf8_char_size loc cs c in
-        njunk (ep' - ep) cs;
-	let loc = set_loc_pos loc bp ep' in
-        err loc Undefined_token
+        if diff_mode then begin
+          let len = ep' - bp in
+          ignore (store 0 c);
+          ignore (nstore (len - 1) 1 cs);
+          IDENT (get_buff len), set_loc_pos loc bp ep
+        end else begin
+          njunk (ep' - ep) cs;
+          let loc = set_loc_pos loc bp ep' in
+          err loc Undefined_token
+        end
 
 (* Parse what follows a dot *)
 
-let parse_after_dot loc c bp s = match Stream.peek s with
+let parse_after_dot ~diff_mode loc c bp s = match Stream.peek s with
   | Some ('a'..'z' | 'A'..'Z' | '_' as d) ->
       Stream.junk s;
       let len =
@@ -576,11 +583,11 @@ let parse_after_dot loc c bp s = match Stream.peek s with
           let len = ident_tail loc (nstore n 0 s) s in
           let field = get_buff len in
           (try find_keyword loc ("."^field) s with Not_found -> FIELD field)
-      | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp c s)
+      | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars ~diff_mode loc bp c s)
 
 (* Parse what follows a question mark *)
 
-let parse_after_qmark loc bp s =
+let parse_after_qmark ~diff_mode loc bp s =
   match Stream.peek s with
     | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK
     | None -> KEYWORD "?"
@@ -588,7 +595,7 @@ let parse_after_qmark loc bp s =
         match lookup_utf8 loc s with
           | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK
           | AsciiChar | Utf8Token _ | EmptyStream ->
-            fst (process_chars loc bp '?' s)
+            fst (process_chars ~diff_mode loc bp '?' s)
 
 let blank_or_eof cs =
   match Stream.peek cs with
@@ -598,20 +605,20 @@ let blank_or_eof cs =
 
 (* Parse a token in a char stream *)
 
-let rec next_token loc s =
+let rec next_token ~diff_mode loc s =
   let bp = Stream.count s in
   match Stream.peek s with
   | Some ('\n' as c) ->
       Stream.junk s;
       let ep = Stream.count s in
-      comm_loc bp; push_char c; next_token (bump_loc_line loc ep) s
+      comm_loc bp; push_char c; next_token ~diff_mode (bump_loc_line loc ep) s
   | Some (' ' | '\t' | '\r' as c) ->
       Stream.junk s;
-      comm_loc bp; push_char c; next_token loc s
+      comm_loc bp; push_char c; next_token ~diff_mode loc s
   | Some ('.' as c) ->
       Stream.junk s;
       let t =
-        try parse_after_dot loc c bp s with
+        try parse_after_dot ~diff_mode loc c bp s with
           Stream.Failure -> raise (Stream.Error "")
       in
       let ep = Stream.count s in
@@ -630,13 +637,13 @@ let rec next_token loc s =
       Stream.junk s;
       let t,new_between_commands =
         if !between_commands then process_sequence loc bp c s, true
-        else process_chars loc bp c s,false
+        else process_chars ~diff_mode loc bp c s,false
       in
       comment_stop bp; between_commands := new_between_commands; t
   | Some '?' ->
       Stream.junk s;
       let ep = Stream.count s in
-      let t = parse_after_qmark loc bp s in
+      let t = parse_after_qmark ~diff_mode loc bp s in
       comment_stop bp; (t, set_loc_pos loc bp ep)
   | Some ('a'..'z' | 'A'..'Z' | '_' as c) ->
       Stream.junk s;
@@ -670,12 +677,16 @@ let rec next_token loc s =
       Stream.junk s;
       begin try
         match Stream.peek s with
+        | Some '*' when diff_mode ->
+            Stream.junk s;
+            let ep = Stream.count s in
+            (IDENT "(*", set_loc_pos loc bp ep)
         | Some '*' ->
             Stream.junk s;
             comm_loc bp;
             push_string "(*";
-            let loc = comment loc bp s in next_token loc s
-        | _ -> let t = process_chars loc bp c s in comment_stop bp; t
+            let loc = comment loc bp s in next_token ~diff_mode loc s
+        | _ -> let t = process_chars ~diff_mode loc bp c s in comment_stop bp; t
       with Stream.Failure -> raise (Stream.Error "")
       end
   | Some ('{' | '}' as c) ->
@@ -683,7 +694,7 @@ let rec next_token loc s =
       let ep = Stream.count s in
       let t,new_between_commands =
         if !between_commands then (KEYWORD (String.make 1 c), set_loc_pos loc bp ep), true
-        else process_chars loc bp c s, false
+        else process_chars ~diff_mode loc bp c s, false
       in
       comment_stop bp; between_commands := new_between_commands; t
   | _ ->
@@ -695,14 +706,14 @@ let rec next_token loc s =
             comment_stop bp;
             (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
         | AsciiChar | Utf8Token _ ->
-            let t = process_chars loc bp (Stream.next s) s in
+            let t = process_chars ~diff_mode loc bp (Stream.next s) s in
             comment_stop bp; t
         | EmptyStream ->
             comment_stop bp; (EOI, set_loc_pos loc bp (bp+1))
 
 (* (* Debug: uncomment this for tracing tokens seen by coq...*)
-let next_token loc s =
-  let (t,loc as r) = next_token loc s in
+let next_token ~diff_mode loc s =
+  let (t,loc as r) = next_token ~diff_mode loc s in
   Printf.eprintf "(line %i, %i-%i)[%s]\n%!" (Ploc.line_nb loc) (Ploc.first_pos loc) (Ploc.last_pos loc) (Tok.to_string t);
   r *)
 
@@ -743,7 +754,7 @@ let token_text = function
   | (con, "") -> con
   | (con, prm) -> con ^ " \"" ^ prm ^ "\""
 
-let func cs =
+let func next_token cs =
   let loct = loct_create () in
   let cur_loc = ref (Loc.create !current_file 1 0 0 0) in
   let ts =
@@ -755,8 +766,8 @@ let func cs =
   in
   (ts, loct_func loct)
 
-let lexer = {
-  Plexing.tok_func = func;
+let make_lexer ~diff_mode = {
+  Plexing.tok_func = func (next_token ~diff_mode);
   Plexing.tok_using =
     (fun pat -> match Tok.of_pattern pat with
        | KEYWORD s -> add_keyword s
@@ -765,6 +776,8 @@ let lexer = {
   Plexing.tok_match = Tok.match_pattern;
   Plexing.tok_text = token_text }
 
+let lexer = make_lexer ~diff_mode:false
+
 (** Terminal symbols interpretation *)
 
 let is_ident_not_keyword s =
diff --git a/parsing/cLexer.mli b/parsing/cLexer.mli
index c0ebdd45ef..af3fd7f318 100644
--- a/parsing/cLexer.mli
+++ b/parsing/cLexer.mli
@@ -56,3 +56,14 @@ val set_lexer_state : lexer_state -> unit
 val get_lexer_state : unit -> lexer_state
 val drop_lexer_state : unit -> unit
 val get_comment_state : lexer_state -> ((int * int) * string) list
+
+(** Create a lexer.  true enables alternate handling for computing diffs.
+It ensures that, ignoring white space, the concatenated tokens equal the input
+string.  Specifically:
+- for strings, return the enclosing quotes as tokens and treat the quoted value
+as if it was unquoted, possibly becoming multiple tokens
+- for comments, return the "(*" as a token and treat the contents of the comment as if
+it was not in a comment, possibly becoming multiple tokens
+- return any unrecognized Ascii or UTF-8 character as a string
+*)
+val make_lexer : diff_mode:bool -> Tok.t Gramlib.Plexing.lexer
diff --git a/parsing/tok.ml b/parsing/tok.ml
index c0d5b6742d..03825e350f 100644
--- a/parsing/tok.ml
+++ b/parsing/tok.ml
@@ -40,18 +40,19 @@ let extract_string diff_mode = function
   | KEYWORD s -> s
   | IDENT s -> s
   | STRING s ->
-   if diff_mode then
-    let escape_quotes s =
-      let len = String.length s in
-      let buf = Buffer.create len in
-      for i = 0 to len-1 do
-        let ch = String.get s i in
-        Buffer.add_char buf ch;
-        if ch = '"' then Buffer.add_char buf '"' else ()
-      done;
-      Buffer.contents buf
-    in
-    "\"" ^ (escape_quotes s) ^ "\"" else s
+    if diff_mode then
+      let escape_quotes s =
+        let len = String.length s in
+        let buf = Buffer.create len in
+        for i = 0 to len-1 do
+          let ch = String.get s i in
+          Buffer.add_char buf ch;
+          if ch = '"' then Buffer.add_char buf '"' else ()
+        done;
+        Buffer.contents buf
+      in
+      "\"" ^ (escape_quotes s) ^ "\""
+    else s
   | PATTERNIDENT s -> s
   | FIELD s -> if diff_mode then "." ^ s else s
   | INT s -> s
diff --git a/printing/proof_diffs.ml b/printing/proof_diffs.ml
index b280ce909b..c1ea067567 100644
--- a/printing/proof_diffs.ml
+++ b/printing/proof_diffs.ml
@@ -88,7 +88,8 @@ let tokenize_string s =
   let st = CLexer.get_lexer_state () in
   try
     let istr = Stream.of_string s in
-    let lex = CLexer.lexer.Gramlib.Plexing.tok_func istr in
+    let lexer = CLexer.make_lexer ~diff_mode:true in
+    let lex = lexer.Gramlib.Plexing.tok_func istr in
     let toks = stream_tok [] (fst lex) in
     CLexer.set_lexer_state st;
     toks
diff --git a/test-suite/unit-tests/printing/proof_diffs_test.ml b/test-suite/unit-tests/printing/proof_diffs_test.ml
index 7f9e6cc6e0..d0b8d21b69 100644
--- a/test-suite/unit-tests/printing/proof_diffs_test.ml
+++ b/test-suite/unit-tests/printing/proof_diffs_test.ml
@@ -51,23 +51,28 @@ let t () =
   assert_equal ~msg:"has `Removed" ~printer:string_of_bool true has_removed
 let _ = add_test "diff_str add/remove" t
 
-(* example of a limitation, not really a test *)
-let t () =
-  try
-    let _ = diff_str "a" "&gt;" in
-    assert_failure "unlexable string gives an exception"
-  with _ -> ()
-let _ = add_test "diff_str unlexable" t
-
-(* problematic examples for tokenize_string:
-   comments omitted
-   quoted string loses quote marks (are escapes supported/handled?)
-   char constant split into 2
+(* lexer tweaks:
+   comments are lexed as multiple tokens
+   strings tokens include begin/end quotes and embedded ""
+   single multibyte characters returned even if they're not keywords
+
+   inputs that give a lexer failure (but no use case needs them yet):
+     ".12"
+     unterminated string
+     invalid UTF-8 sequences
    *)
 let t () =
-  List.iter (fun x -> cprintf "'%s' " x) (tokenize_string "(* comment *) \"string\" 'c' xx");
-  cprintf "\n"
-let _ = add_test "tokenize_string examples" t
+  let str = "(* comment.field *) ?id () \"str\"\"ing\" \\ := Ж &gt; ∃ 'c' xx" in
+  let toks = tokenize_string str in
+  (*List.iter (fun x -> cprintf "'%s' " x) toks;*)
+  (*cprintf "\n";*)
+  let str_no_white = String.concat "" (String.split_on_char ' ' str) in
+  assert_equal ~printer:(fun x -> x) str_no_white (String.concat "" toks);
+  List.iter (fun s ->
+    assert_equal ~msg:("'" ^ s ^ "' is a single token") ~printer:string_of_bool true (List.mem s toks))
+    [ "(*"; "()"; ":="]
+
+let _ = add_test "tokenize_string/diff_mode in lexer" t
 
 open Pp
 
-- 
cgit v1.2.3