diff options
| author | Hugo Herbelin | 2016-10-13 15:57:14 +0200 |
|---|---|---|
| committer | Hugo Herbelin | 2016-10-17 20:22:17 +0200 |
| commit | 57c6ffd23836364168ffd1c66dbddbecf830c7c6 (patch) | |
| tree | 051f302085ef8a33f90bd0533053d158e1b29b6d | |
| parent | 4204581ccb8bdf0f6c4298029c010c6deb643594 (diff) | |
Stopping warning on unrecognized unicode character in notation (fixing #5136).
The warning was pointless since the notation was accepted and parsed
anyway.
We now treat unrecognized unicode characters like ordinary
undefined tokens (e.g. "#" in a bare Coq).
For instance, "aₚ", or ".ₚ", or "?ₚ" now fail with "Undefined token"
rather than "Unsupported Unicode character".
| -rw-r--r-- | lib/unicode.ml | 8 | ||||
| -rw-r--r-- | lib/unicode.mli | 12 | ||||
| -rw-r--r-- | parsing/cLexer.ml4 | 26 |
3 files changed, 10 insertions, 36 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml index dc852d9819..ced5e258c2 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -8,9 +8,7 @@ (** Unicode utilities *) -type status = Letter | IdentPart | Symbol - -exception Unsupported +type status = Letter | IdentPart | Symbol | Unknown (* The following table stores classes of Unicode characters that are used by the lexer. There are 3 different classes so 2 bits are @@ -29,6 +27,7 @@ let mask i = function | Letter -> 1 lsl ((i land 7) lsl 1) (* 01 *) | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *) | Symbol -> 3 lsl ((i land 7) lsl 1) (* 11 *) + | Unknown -> 0 lsl ((i land 7) lsl 1) (* 00 *) (* Helper to reset 2 bits in a word. *) let reset_mask i = @@ -55,7 +54,7 @@ let lookup x = if v = 1 then Letter else if v = 2 then IdentPart else if v = 3 then Symbol - else raise Unsupported + else Unknown (* [classify] discriminates between 3 different kinds of symbols based on the standard unicode classification (extracted from @@ -215,7 +214,6 @@ let ident_refutation s = |x -> x with | End_of_input -> Some (true,"The empty string is not an identifier.") - | Unsupported -> Some (true,s^": unsupported character in utf8 sequence.") | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.") let lowercase_unicode = diff --git a/lib/unicode.mli b/lib/unicode.mli index 1f8bd44eee..2609e1968f 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -8,22 +8,16 @@ (** Unicode utilities *) -type status = Letter | IdentPart | Symbol +type status = Letter | IdentPart | Symbol | Unknown -(** This exception is raised when UTF-8 the input string contains unsupported UTF-8 characters. *) -exception Unsupported - -(** Classify a unicode char into 3 classes. - @raise Unsupported if the input string contains unsupported UTF-8 characters. *) +(** Classify a unicode char into 3 classes or unknown. *) val classify : int -> status (** Return [None] if a given string can be used as a (Coq) identifier. - Return [Some (b,s)] otherwise, where [s] is an explanation and [b] is severity. - @raise Unsupported if the input string contains unsupported UTF-8 characters. *) + Return [Some (b,s)] otherwise, where [s] is an explanation and [b] is severity. *) val ident_refutation : string -> (bool * string) option (** First char of a string, converted to lowercase - @raise Unsupported if the input string contains unsupported UTF-8 characters. @raise Assert_failure if the input string is empty. *) val lowercase_first_char : string -> string diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4 index e59b9630fb..6a343f50ee 100644 --- a/parsing/cLexer.ml4 +++ b/parsing/cLexer.ml4 @@ -112,11 +112,6 @@ type token_kind = | AsciiChar | EmptyStream -let error_unsupported_unicode_character loc n unicode cs = - let bp = Stream.count cs in - let loc = set_loc_pos loc bp (bp+n) in - err loc (UnsupportedUnicode unicode) - let error_utf8 loc cs = let bp = Stream.count cs in Stream.junk cs; (* consume the char to avoid read it and fail again *) @@ -166,14 +161,12 @@ let lookup_utf8_tail loc c cs = (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F) | _ -> error_utf8 loc cs in - try Unicode.classify unicode, n - with Unicode.Unsupported -> - njunk n cs; error_unsupported_unicode_character loc n unicode cs + Utf8Token (Unicode.classify unicode, n) let lookup_utf8 loc cs = match Stream.peek cs with | Some ('\x00'..'\x7F') -> AsciiChar - | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail loc c cs) + | Some ('\x80'..'\xFF' as c) -> lookup_utf8_tail loc c cs | None -> EmptyStream let unlocated f x = f x @@ -191,17 +184,6 @@ let check_keyword str = in loop_symb (Stream.of_string str) -let warn_unparsable_keyword = - CWarnings.create ~name:"unparsable-keyword" ~category:"parsing" - (fun (s,unicode) -> - strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x \ - which will not be parsable." s unicode)) - -let check_keyword_to_add s = - try check_keyword s - with Error.E (UnsupportedUnicode unicode) -> - warn_unparsable_keyword (s,unicode) - let check_ident str = let rec loop_id intail = parser | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] -> @@ -232,7 +214,7 @@ let is_keyword s = let add_keyword str = if not (is_keyword str) then begin - check_keyword_to_add str; + check_keyword str; token_tree := ttree_add !token_tree str end @@ -599,7 +581,7 @@ let rec next_token loc = parser bp let ep = Stream.count s in comment_stop bp; (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep - | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart), _) -> + | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart | Unicode.Unknown), _) -> let t = process_chars loc bp (Stream.next s) s in let new_between_commands = match t with (KEYWORD ("{"|"}"),_) -> !between_commands | _ -> false in |
