 parsing/cLexer.ml                                  | 59
 parsing/cLexer.mli                                 | 11
 parsing/tok.ml                                     | 25
 printing/proof_diffs.ml                            |  3
 test-suite/unit-tests/printing/proof_diffs_test.ml | 35
 5 files changed, 82 insertions, 51 deletions
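
This commit threads a diff_mode flag through the lexer: CLexer.make_lexer ~diff_mode:true builds a lexer whose tokens, ignoring whitespace, concatenate back to the input, which is what the proof-diff highlighter needs. A minimal sketch of driving the new entry point over a string, modeled on the tokenize_string hunk in printing/proof_diffs.ml below (the collect loop and the Tok.EOI termination are assumptions about the Gramlib token stream, not code from this patch):

(* Sketch: lex a whole string with the diff-mode lexer introduced below.
   Assumes the stream produced by tok_func yields Tok.t values and ends
   with Tok.EOI, as next_token below suggests. *)
let tokens_of_string (s : string) : Tok.t list =
  let lexer = CLexer.make_lexer ~diff_mode:true in
  let (toks, _loc_of) = lexer.Gramlib.Plexing.tok_func (Stream.of_string s) in
  let rec collect acc =
    match Stream.next toks with
    | Tok.EOI -> List.rev acc       (* end of input: done *)
    | t -> collect (t :: acc)       (* keep accumulating tokens *)
  in
  collect []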
diff --git a/parsing/cLexer.ml b/parsing/cLexer.ml
index c2b7fa117d..49d6cf01d9 100644
--- a/parsing/cLexer.ml
+++ b/parsing/cLexer.ml
@@ -548,20 +548,27 @@ let process_sequence loc bp c cs =
   aux 1 cs
 
 (* Must be a special token *)
-let process_chars loc bp c cs =
+let process_chars ~diff_mode loc bp c cs =
   let t = progress_from_byte loc None (-1) !token_tree cs c in
   let ep = Stream.count cs in
   match t with
   | Some t -> (KEYWORD t, set_loc_pos loc bp ep)
   | None ->
     let ep' = bp + utf8_char_size loc cs c in
-    njunk (ep' - ep) cs;
-    let loc = set_loc_pos loc bp ep' in
-    err loc Undefined_token
+    if diff_mode then begin
+      let len = ep' - bp in
+      ignore (store 0 c);
+      ignore (nstore (len - 1) 1 cs);
+      IDENT (get_buff len), set_loc_pos loc bp ep
+    end else begin
+      njunk (ep' - ep) cs;
+      let loc = set_loc_pos loc bp ep' in
+      err loc Undefined_token
+    end
 
 (* Parse what follows a dot *)
-let parse_after_dot loc c bp s = match Stream.peek s with
+let parse_after_dot ~diff_mode loc c bp s = match Stream.peek s with
   | Some ('a'..'z' | 'A'..'Z' | '_' as d) ->
     Stream.junk s;
     let len =
@@ -576,11 +583,11 @@ let parse_after_dot loc c bp s = match Stream.peek s with
       let len = ident_tail loc (nstore n 0 s) s in
       let field = get_buff len in
       (try find_keyword loc ("."^field) s with Not_found -> FIELD field)
-  | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp c s)
+  | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars ~diff_mode loc bp c s)
 
 (* Parse what follows a question mark *)
-let parse_after_qmark loc bp s =
+let parse_after_qmark ~diff_mode loc bp s =
   match Stream.peek s with
   | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK
   | None -> KEYWORD "?"
   | _ ->
@@ -588,7 +595,7 @@ let parse_after_qmark loc bp s =
     match lookup_utf8 loc s with
     | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK
     | AsciiChar | Utf8Token _ | EmptyStream ->
-      fst (process_chars loc bp '?' s)
+      fst (process_chars ~diff_mode loc bp '?' s)
 
 let blank_or_eof cs =
   match Stream.peek cs with
@@ -598,20 +605,20 @@ let blank_or_eof cs =
 
 (* Parse a token in a char stream *)
 
-let rec next_token loc s =
+let rec next_token ~diff_mode loc s =
   let bp = Stream.count s in
   match Stream.peek s with
   | Some ('\n' as c) ->
     Stream.junk s;
     let ep = Stream.count s in
-    comm_loc bp; push_char c; next_token (bump_loc_line loc ep) s
+    comm_loc bp; push_char c; next_token ~diff_mode (bump_loc_line loc ep) s
   | Some (' ' | '\t' | '\r' as c) ->
     Stream.junk s;
-    comm_loc bp; push_char c; next_token loc s
+    comm_loc bp; push_char c; next_token ~diff_mode loc s
   | Some ('.' as c) ->
     Stream.junk s;
     let t =
-      try parse_after_dot loc c bp s with
+      try parse_after_dot ~diff_mode loc c bp s with
        Stream.Failure -> raise (Stream.Error "")
    in
    let ep = Stream.count s in
@@ -630,13 +637,13 @@ let rec next_token loc s =
     Stream.junk s;
     let t,new_between_commands =
       if !between_commands then process_sequence loc bp c s, true
-      else process_chars loc bp c s,false
+      else process_chars ~diff_mode loc bp c s,false
     in
     comment_stop bp; between_commands := new_between_commands; t
   | Some '?' ->
     Stream.junk s;
     let ep = Stream.count s in
-    let t = parse_after_qmark loc bp s in
+    let t = parse_after_qmark ~diff_mode loc bp s in
     comment_stop bp; (t, set_loc_pos loc bp ep)
   | Some ('a'..'z' | 'A'..'Z' | '_' as c) ->
     Stream.junk s;
@@ -670,12 +677,16 @@ let rec next_token loc s =
     Stream.junk s;
     begin try
       match Stream.peek s with
+      | Some '*' when diff_mode ->
+        Stream.junk s;
+        let ep = Stream.count s in
+        (IDENT "(*", set_loc_pos loc bp ep)
       | Some '*' ->
         Stream.junk s;
        comm_loc bp;
        push_string "(*";
-        let loc = comment loc bp s in next_token loc s
-      | _ -> let t = process_chars loc bp c s in comment_stop bp; t
+        let loc = comment loc bp s in next_token ~diff_mode loc s
+      | _ -> let t = process_chars ~diff_mode loc bp c s in comment_stop bp; t
     with Stream.Failure -> raise (Stream.Error "") end
   | Some ('{' | '}' as c) ->
@@ -683,7 +694,7 @@
     let ep = Stream.count s in
     let t,new_between_commands =
       if !between_commands then (KEYWORD (String.make 1 c), set_loc_pos loc bp ep), true
-      else process_chars loc bp c s, false
+      else process_chars ~diff_mode loc bp c s, false
     in
     comment_stop bp; between_commands := new_between_commands; t
   | _ ->
@@ -695,14 +706,14 @@
       comment_stop bp;
       (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
     | AsciiChar | Utf8Token _ ->
-      let t = process_chars loc bp (Stream.next s) s in
+      let t = process_chars ~diff_mode loc bp (Stream.next s) s in
       comment_stop bp; t
     | EmptyStream ->
       comment_stop bp; (EOI, set_loc_pos loc bp (bp+1))
 
 (* (* Debug: uncomment this for tracing tokens seen by coq...*)
-let next_token loc s =
-  let (t,loc as r) = next_token loc s in
+let next_token ~diff_mode loc s =
+  let (t,loc as r) = next_token ~diff_mode loc s in
   Printf.eprintf "(line %i, %i-%i)[%s]\n%!" (Ploc.line_nb loc) (Ploc.first_pos loc) (Ploc.last_pos loc) (Tok.to_string t);
   r
 *)
@@ -743,7 +754,7 @@ let token_text = function
   | (con, "") -> con
   | (con, prm) -> con ^ " \"" ^ prm ^ "\""
 
-let func cs =
+let func next_token cs =
   let loct = loct_create () in
   let cur_loc = ref (Loc.create !current_file 1 0 0 0) in
   let ts =
@@ -755,8 +766,8 @@ let func cs =
   in
   (ts, loct_func loct)
 
-let lexer = {
-  Plexing.tok_func = func;
+let make_lexer ~diff_mode = {
+  Plexing.tok_func = func (next_token ~diff_mode);
   Plexing.tok_using = (fun pat ->
     match Tok.of_pattern pat with
     | KEYWORD s -> add_keyword s
@@ -765,6 +776,8 @@ let lexer = {
   Plexing.tok_match = Tok.match_pattern;
   Plexing.tok_text = token_text }
 
+let lexer = make_lexer ~diff_mode:false
+
 (** Terminal symbols interpretation *)
 
 let is_ident_not_keyword s =
diff --git a/parsing/cLexer.mli b/parsing/cLexer.mli
index c0ebdd45ef..af3fd7f318 100644
--- a/parsing/cLexer.mli
+++ b/parsing/cLexer.mli
@@ -56,3 +56,14 @@ val set_lexer_state : lexer_state -> unit
 val get_lexer_state : unit -> lexer_state
 val drop_lexer_state : unit -> unit
 val get_comment_state : lexer_state -> ((int * int) * string) list
+
+(** Create a lexer. true enables alternate handling for computing diffs.
+It ensures that, ignoring white space, the concatenated tokens equal the input
+string. Specifically:
+- for strings, return the enclosing quotes as tokens and treat the quoted value
+as if it was unquoted, possibly becoming multiple tokens
+- for comments, return the "(*" as a token and treat the contents of the comment as if
+it was not in a comment, possibly becoming multiple tokens
+- return any unrecognized Ascii or UTF-8 character as a string
+*)
+val make_lexer : diff_mode:bool -> Tok.t Gramlib.Plexing.lexer
diff --git a/parsing/tok.ml b/parsing/tok.ml
index c0d5b6742d..03825e350f 100644
--- a/parsing/tok.ml
+++ b/parsing/tok.ml
@@ -40,18 +40,19 @@ let extract_string diff_mode = function
   | KEYWORD s -> s
   | IDENT s -> s
   | STRING s ->
-      if diff_mode then
-        let escape_quotes s =
-          let len = String.length s in
-          let buf = Buffer.create len in
-          for i = 0 to len-1 do
-            let ch = String.get s i in
-            Buffer.add_char buf ch;
-            if ch = '"' then Buffer.add_char buf '"' else ()
-          done;
-          Buffer.contents buf
-        in
-        "\"" ^ (escape_quotes s) ^ "\"" else s
+    if diff_mode then
+      let escape_quotes s =
+        let len = String.length s in
+        let buf = Buffer.create len in
+        for i = 0 to len-1 do
+          let ch = String.get s i in
+          Buffer.add_char buf ch;
+          if ch = '"' then Buffer.add_char buf '"' else ()
+        done;
+        Buffer.contents buf
+      in
+      "\"" ^ (escape_quotes s) ^ "\""
+    else s
   | PATTERNIDENT s -> s
   | FIELD s -> if diff_mode then "." ^ s else s
   | INT s -> s
diff --git a/printing/proof_diffs.ml b/printing/proof_diffs.ml
index b280ce909b..c1ea067567 100644
--- a/printing/proof_diffs.ml
+++ b/printing/proof_diffs.ml
@@ -88,7 +88,8 @@ let tokenize_string s =
   let st = CLexer.get_lexer_state () in
   try
     let istr = Stream.of_string s in
-    let lex = CLexer.lexer.Gramlib.Plexing.tok_func istr in
+    let lexer = CLexer.make_lexer ~diff_mode:true in
+    let lex = lexer.Gramlib.Plexing.tok_func istr in
     let toks = stream_tok [] (fst lex) in
     CLexer.set_lexer_state st;
     toks
diff --git a/test-suite/unit-tests/printing/proof_diffs_test.ml b/test-suite/unit-tests/printing/proof_diffs_test.ml
index 7f9e6cc6e0..d0b8d21b69 100644
--- a/test-suite/unit-tests/printing/proof_diffs_test.ml
+++ b/test-suite/unit-tests/printing/proof_diffs_test.ml
@@ -51,23 +51,28 @@ let t () =
   assert_equal ~msg:"has `Removed" ~printer:string_of_bool true has_removed
 let _ = add_test "diff_str add/remove" t
 
-(* example of a limitation, not really a test *)
-let t () =
-  try
-    let _ = diff_str "a" ">" in
-    assert_failure "unlexable string gives an exception"
-  with _ -> ()
-let _ = add_test "diff_str unlexable" t
-
-(* problematic examples for tokenize_string:
-   comments omitted
-   quoted string loses quote marks (are escapes supported/handled?)
-   char constant split into 2
+(* lexer tweaks:
+   comments are lexed as multiple tokens
+   string tokens include begin/end quotes and embedded ""
+   single multibyte characters returned even if they're not keywords
+
+   inputs that give a lexer failure (but no use case needs them yet):
+   ".12"
+   unterminated string
+   invalid UTF-8 sequences
 *)
 let t () =
-  List.iter (fun x -> cprintf "'%s' " x) (tokenize_string "(* comment *) \"string\" 'c' xx");
-  cprintf "\n"
-let _ = add_test "tokenize_string examples" t
+  let str = "(* comment.field *) ?id () \"str\"\"ing\" \\ := Ж > ∃ 'c' xx" in
+  let toks = tokenize_string str in
+  (*List.iter (fun x -> cprintf "'%s' " x) toks;*)
+  (*cprintf "\n";*)
+  let str_no_white = String.concat "" (String.split_on_char ' ' str) in
+  assert_equal ~printer:(fun x -> x) str_no_white (String.concat "" toks);
+  List.iter (fun s ->
+      assert_equal ~msg:("'" ^ s ^ "' is a single token") ~printer:string_of_bool true (List.mem s toks))
+    [ "(*"; "()"; ":="]
+
+let _ = add_test "tokenize_string/diff_mode in lexer" t
 
 open Pp
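
The rewritten test pins down the invariant diff mode provides: ignoring spaces, concatenating the lexed token texts reproduces the input. A standalone restatement of that property (round_trips is an illustrative name; tokenize_string is the proof_diffs.ml helper patched above, which, as the test uses it, returns the token texts as strings):

(* Sketch of the round-trip property asserted by the test above. *)
let round_trips (input : string) : bool =
  let no_spaces s = String.concat "" (String.split_on_char ' ' s) in
  String.equal (no_spaces input) (String.concat "" (tokenize_string input))

For instance, with the test input str above, round_trips str should hold: "(*" comes back as its own token, strings keep their quotes and doubled "", and a multibyte character like Ж or ∃ is returned whole even though it is not a keyword, while the ordinary lexer would drop the comment and the quote marks.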
