 parsing/cLexer.ml                                  | 59
 parsing/cLexer.mli                                 | 11
 parsing/tok.ml                                     | 25
 printing/proof_diffs.ml                            |  3
 test-suite/unit-tests/printing/proof_diffs_test.ml | 35
 5 files changed, 82 insertions, 51 deletions
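
This commit threads a diff_mode flag through the lexer: CLexer.make_lexer ~diff_mode:true builds a lexer whose tokens, ignoring whitespace, concatenate back to the input, which is what the proof-diff highlighter needs. A minimal sketch of driving the new entry point over a string, modeled on the tokenize_string hunk in printing/proof_diffs.ml below (the collect loop and the Tok.EOI termination are assumptions about the Gramlib token stream, not code from this patch):

(* Sketch: lex a whole string with the diff-mode lexer introduced below.
   Assumes the stream produced by tok_func yields Tok.t values and ends
   with Tok.EOI, as next_token below suggests. *)
let tokens_of_string (s : string) : Tok.t list =
  let lexer = CLexer.make_lexer ~diff_mode:true in
  let (toks, _loc_of) = lexer.Gramlib.Plexing.tok_func (Stream.of_string s) in
  let rec collect acc =
    match Stream.next toks with
    | Tok.EOI -> List.rev acc       (* end of input: done *)
    | t -> collect (t :: acc)       (* keep accumulating tokens *)
  in
  collect []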
diff --git a/parsing/cLexer.ml b/parsing/cLexer.ml
index c2b7fa117d..49d6cf01d9 100644
--- a/parsing/cLexer.ml
+++ b/parsing/cLexer.ml
@@ -548,20 +548,27 @@ let process_sequence loc bp c cs =
   aux 1 cs
 
 (* Must be a special token *)
-let process_chars loc bp c cs =
+let process_chars ~diff_mode loc bp c cs =
   let t = progress_from_byte loc None (-1) !token_tree cs c in
   let ep = Stream.count cs in
   match t with
   | Some t -> (KEYWORD t, set_loc_pos loc bp ep)
   | None ->
     let ep' = bp + utf8_char_size loc cs c in
-    njunk (ep' - ep) cs;
-    let loc = set_loc_pos loc bp ep' in
-    err loc Undefined_token
+    if diff_mode then begin
+      let len = ep' - bp in
+      ignore (store 0 c);
+      ignore (nstore (len - 1) 1 cs);
+      IDENT (get_buff len), set_loc_pos loc bp ep
+    end else begin
+      njunk (ep' - ep) cs;
+      let loc = set_loc_pos loc bp ep' in
+      err loc Undefined_token
+    end
 
 (* Parse what follows a dot *)
-let parse_after_dot loc c bp s = match Stream.peek s with
+let parse_after_dot ~diff_mode loc c bp s = match Stream.peek s with
   | Some ('a'..'z' | 'A'..'Z' | '_' as d) ->
     Stream.junk s;
     let len =
@@ -576,11 +583,11 @@ let parse_after_dot loc c bp s = match Stream.peek s with
       let len = ident_tail loc (nstore n 0 s) s in
       let field = get_buff len in
       (try find_keyword loc ("."^field) s with Not_found -> FIELD field)
-  | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp c s)
+  | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars ~diff_mode loc bp c s)
 
 (* Parse what follows a question mark *)
-let parse_after_qmark loc bp s =
+let parse_after_qmark ~diff_mode loc bp s =
   match Stream.peek s with
   | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK
   | None -> KEYWORD "?"
   | _ ->
@@ -588,7 +595,7 @@ let parse_after_qmark loc bp s =
     match lookup_utf8 loc s with
     | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK
     | AsciiChar | Utf8Token _ | EmptyStream ->
-      fst (process_chars loc bp '?' s)
+      fst (process_chars ~diff_mode loc bp '?' s)
 
 let blank_or_eof cs =
   match Stream.peek cs with
@@ -598,20 +605,20 @@ let blank_or_eof cs =
 
 (* Parse a token in a char stream *)
 
-let rec next_token loc s =
+let rec next_token ~diff_mode loc s =
   let bp = Stream.count s in
   match Stream.peek s with
   | Some ('\n' as c) ->
     Stream.junk s;
     let ep = Stream.count s in
-    comm_loc bp; push_char c; next_token (bump_loc_line loc ep) s
+    comm_loc bp; push_char c; next_token ~diff_mode (bump_loc_line loc ep) s
   | Some (' ' | '\t' | '\r' as c) ->
     Stream.junk s;
-    comm_loc bp; push_char c; next_token loc s
+    comm_loc bp; push_char c; next_token ~diff_mode loc s
   | Some ('.' as c) ->
     Stream.junk s;
     let t =
-      try parse_after_dot loc c bp s with
+      try parse_after_dot ~diff_mode loc c bp s with
        Stream.Failure -> raise (Stream.Error "")
    in
    let ep = Stream.count s in
@@ -630,13 +637,13 @@ let rec next_token loc s =
     Stream.junk s;
     let t,new_between_commands =
       if !between_commands then process_sequence loc bp c s, true
-      else process_chars loc bp c s,false
+      else process_chars ~diff_mode loc bp c s,false
     in
     comment_stop bp; between_commands := new_between_commands; t
   | Some '?' ->
     Stream.junk s;
     let ep = Stream.count s in
-    let t = parse_after_qmark loc bp s in
+    let t = parse_after_qmark ~diff_mode loc bp s in
     comment_stop bp; (t, set_loc_pos loc bp ep)
   | Some ('a'..'z' | 'A'..'Z' | '_' as c) ->
     Stream.junk s;
@@ -670,12 +677,16 @@ let rec next_token loc s =
     Stream.junk s;
     begin try
       match Stream.peek s with
+      | Some '*' when diff_mode ->
+        Stream.junk s;
+        let ep = Stream.count s in
+        (IDENT "(*", set_loc_pos loc bp ep)
       | Some '*' ->
         Stream.junk s;
        comm_loc bp;
        push_string "(*";
-        let loc = comment loc bp s in next_token loc s
-      | _ -> let t = process_chars loc bp c s in comment_stop bp; t
+        let loc = comment loc bp s in next_token ~diff_mode loc s
+      | _ -> let t = process_chars ~diff_mode loc bp c s in comment_stop bp; t
     with Stream.Failure -> raise (Stream.Error "") end
   | Some ('{' | '}' as c) ->
@@ -683,7 +694,7 @@
     let ep = Stream.count s in
     let t,new_between_commands =
       if !between_commands then (KEYWORD (String.make 1 c), set_loc_pos loc bp ep), true
-      else process_chars loc bp c s, false
+      else process_chars ~diff_mode loc bp c s, false
     in
     comment_stop bp; between_commands := new_between_commands; t
   | _ ->
@@ -695,14 +706,14 @@
       comment_stop bp;
       (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
     | AsciiChar | Utf8Token _ ->
-      let t = process_chars loc bp (Stream.next s) s in
+      let t = process_chars ~diff_mode loc bp (Stream.next s) s in
       comment_stop bp; t
     | EmptyStream ->
       comment_stop bp; (EOI, set_loc_pos loc bp (bp+1))
 
 (* (* Debug: uncomment this for tracing tokens seen by coq...*)
-let next_token loc s =
-  let (t,loc as r) = next_token loc s in
+let next_token ~diff_mode loc s =
+  let (t,loc as r) = next_token ~diff_mode loc s in
   Printf.eprintf "(line %i, %i-%i)[%s]\n%!" (Ploc.line_nb loc) (Ploc.first_pos loc) (Ploc.last_pos loc) (Tok.to_string t);
   r
 *)
@@ -743,7 +754,7 @@ let token_text = function
   | (con, "") -> con
   | (con, prm) -> con ^ " \"" ^ prm ^ "\""
 
-let func cs =
+let func next_token cs =
   let loct = loct_create () in
   let cur_loc = ref (Loc.create !current_file 1 0 0 0) in
   let ts =
@@ -755,8 +766,8 @@ let func cs =
   in
   (ts, loct_func loct)
 
-let lexer = {
-  Plexing.tok_func = func;
+let make_lexer ~diff_mode = {
+  Plexing.tok_func = func (next_token ~diff_mode);
   Plexing.tok_using = (fun pat ->
     match Tok.of_pattern pat with
     | KEYWORD s -> add_keyword s
@@ -765,6 +776,8 @@ let lexer = {
   Plexing.tok_match = Tok.match_pattern;
   Plexing.tok_text = token_text }
 
+let lexer = make_lexer ~diff_mode:false
+
 (** Terminal symbols interpretation *)
 
 let is_ident_not_keyword s =
diff --git a/parsing/cLexer.mli b/parsing/cLexer.mli
index c0ebdd45ef..af3fd7f318 100644
--- a/parsing/cLexer.mli
+++ b/parsing/cLexer.mli
@@ -56,3 +56,14 @@ val set_lexer_state : lexer_state -> unit
 val get_lexer_state : unit -> lexer_state
 val drop_lexer_state : unit -> unit
 val get_comment_state : lexer_state -> ((int * int) * string) list
+
+(** Create a lexer. true enables alternate handling for computing diffs.
+It ensures that, ignoring white space, the concatenated tokens equal the input
+string. Specifically:
+- for strings, return the enclosing quotes as tokens and treat the quoted value
+as if it was unquoted, possibly becoming multiple tokens
+- for comments, return the "(*" as a token and treat the contents of the comment as if
+it was not in a comment, possibly becoming multiple tokens
+- return any unrecognized Ascii or UTF-8 character as a string
+*)
+val make_lexer : diff_mode:bool -> Tok.t Gramlib.Plexing.lexer
diff --git a/parsing/tok.ml b/parsing/tok.ml
index c0d5b6742d..03825e350f 100644
--- a/parsing/tok.ml
+++ b/parsing/tok.ml
@@ -40,18 +40,19 @@ let extract_string diff_mode = function
   | KEYWORD s -> s
   | IDENT s -> s
   | STRING s ->
-      if diff_mode then
-        let escape_quotes s =
-          let len = String.length s in
-          let buf = Buffer.create len in
-          for i = 0 to len-1 do
-            let ch = String.get s i in
-            Buffer.add_char buf ch;
-            if ch = '"' then Buffer.add_char buf '"' else ()
-          done;
-          Buffer.contents buf
-        in
-        "\"" ^ (escape_quotes s) ^ "\"" else s
+    if diff_mode then
+      let escape_quotes s =
+        let len = String.length s in
+        let buf = Buffer.create len in
+        for i = 0 to len-1 do
+          let ch = String.get s i in
+          Buffer.add_char buf ch;
+          if ch = '"' then Buffer.add_char buf '"' else ()
+        done;
+        Buffer.contents buf
+      in
+      "\"" ^ (escape_quotes s) ^ "\""
+    else s
   | PATTERNIDENT s -> s
   | FIELD s -> if diff_mode then "." ^ s else s
   | INT s -> s
diff --git a/printing/proof_diffs.ml b/printing/proof_diffs.ml
index b280ce909b..c1ea067567 100644
--- a/printing/proof_diffs.ml
+++ b/printing/proof_diffs.ml
@@ -88,7 +88,8 @@ let tokenize_string s =
   let st = CLexer.get_lexer_state () in
   try
     let istr = Stream.of_string s in
-    let lex = CLexer.lexer.Gramlib.Plexing.tok_func istr in
+    let lexer = CLexer.make_lexer ~diff_mode:true in
+    let lex = lexer.Gramlib.Plexing.tok_func istr in
     let toks = stream_tok [] (fst lex) in
     CLexer.set_lexer_state st;
     toks
diff --git a/test-suite/unit-tests/printing/proof_diffs_test.ml b/test-suite/unit-tests/printing/proof_diffs_test.ml
index 7f9e6cc6e0..d0b8d21b69 100644
--- a/test-suite/unit-tests/printing/proof_diffs_test.ml
+++ b/test-suite/unit-tests/printing/proof_diffs_test.ml
@@ -51,23 +51,28 @@ let t () =
   assert_equal ~msg:"has `Removed" ~printer:string_of_bool true has_removed
 let _ = add_test "diff_str add/remove" t
 
-(* example of a limitation, not really a test *)
-let t () =
-  try
-    let _ = diff_str "a" ">" in
-    assert_failure "unlexable string gives an exception"
-  with _ -> ()
-let _ = add_test "diff_str unlexable" t
-
-(* problematic examples for tokenize_string:
-   comments omitted
-   quoted string loses quote marks (are escapes supported/handled?)
-   char constant split into 2
+(* lexer tweaks:
+   comments are lexed as multiple tokens
+   string tokens include begin/end quotes and embedded ""
+   single multibyte characters returned even if they're not keywords
+
+   inputs that give a lexer failure (but no use case needs them yet):
+   ".12"
+   unterminated string
+   invalid UTF-8 sequences
 *)
 let t () =
-  List.iter (fun x -> cprintf "'%s' " x) (tokenize_string "(* comment *) \"string\" 'c' xx");
-  cprintf "\n"
-let _ = add_test "tokenize_string examples" t
+  let str = "(* comment.field *) ?id () \"str\"\"ing\" \\ := Ж > ∃ 'c' xx" in
+  let toks = tokenize_string str in
+  (*List.iter (fun x -> cprintf "'%s' " x) toks;*)
+  (*cprintf "\n";*)
+  let str_no_white = String.concat "" (String.split_on_char ' ' str) in
+  assert_equal ~printer:(fun x -> x) str_no_white (String.concat "" toks);
+  List.iter (fun s ->
+      assert_equal ~msg:("'" ^ s ^ "' is a single token") ~printer:string_of_bool true (List.mem s toks))
+    [ "(*"; "()"; ":="]
+
+let _ = add_test "tokenize_string/diff_mode in lexer" t
 
 open Pp
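
The rewritten test pins down the invariant diff mode provides: ignoring spaces, concatenating the lexed token texts reproduces the input. A standalone restatement of that property (round_trips is an illustrative name; tokenize_string is the proof_diffs.ml helper patched above, which, as the test uses it, returns the token texts as strings):

(* Sketch of the round-trip property asserted by the test above. *)
let round_trips (input : string) : bool =
  let no_spaces s = String.concat "" (String.split_on_char ' ' s) in
  String.equal (no_spaces input) (String.concat "" (tokenize_string input))

For instance, with the test input str above, round_trips str should hold: "(*" comes back as its own token, strings keep their quotes and doubled "", and a multibyte character like Ж or ∃ is returned whole even though it is not a keyword, while the ordinary lexer would drop the comment and the quote marks.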
