Stopping warning on unrecognized unicode character in notation (fixing #5136).

The warning was pointless since the notation was accepted and parsed anyway. We now treat unrecognized unicode characters like ordinary undefined tokens (e.g. "#" in a bare Coq). For instance, "aₚ", or ".ₚ", or "?ₚ" now fail with "Undefined token" rather than "Unsupported Unicode character".
author: Hugo Herbelin 2016-10-13 15:57:14 +0200
committer: Hugo Herbelin 2016-10-17 20:22:17 +0200
commit: 57c6ffd23836364168ffd1c66dbddbecf830c7c6 (patch)
tree: 051f302085ef8a33f90bd0533053d158e1b29b6d
parent: 4204581ccb8bdf0f6c4298029c010c6deb643594 (diff)
3 files changed, 10 insertions, 36 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml
index dc852d9819..ced5e258c2 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -8,9 +8,7 @@
 
 (** Unicode utilities *)
 
-type status = Letter | IdentPart | Symbol
-
-exception Unsupported
+type status = Letter | IdentPart | Symbol | Unknown
 
 (* The following table stores classes of Unicode characters that
    are used by the lexer. There are 3 different classes so 2 bits are
@@ -29,6 +27,7 @@ let mask i = function
   | Letter    -> 1 lsl ((i land 7) lsl 1) (* 01 *)
   | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
   | Symbol    -> 3 lsl ((i land 7) lsl 1) (* 11 *)
+  | Unknown   -> 0 lsl ((i land 7) lsl 1) (* 00 *)
 
 (* Helper to reset 2 bits in a word. *)
 let reset_mask i =
@@ -55,7 +54,7 @@ let lookup x =
     if      v = 1 then Letter
     else if v = 2 then IdentPart
     else if v = 3 then Symbol
-    else raise Unsupported
+    else Unknown
 
 (* [classify] discriminates between 3 different kinds of
    symbols based on the standard unicode classification (extracted from
@@ -215,7 +214,6 @@ let ident_refutation s =
         |x -> x
   with
   | End_of_input -> Some (true,"The empty string is not an identifier.")
-  | Unsupported -> Some (true,s^": unsupported character in utf8 sequence.")
   | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
 
 let lowercase_unicode =
diff --git a/lib/unicode.mli b/lib/unicode.mli
index 1f8bd44eee..2609e1968f 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -8,22 +8,16 @@
 
 (** Unicode utilities *)
 
-type status = Letter | IdentPart | Symbol
+type status = Letter | IdentPart | Symbol | Unknown
 
-(** This exception is raised when UTF-8 the input string contains unsupported UTF-8 characters. *)
-exception Unsupported
-
-(** Classify a unicode char into 3 classes.
-    @raise Unsupported if the input string contains unsupported UTF-8 characters. *)
+(** Classify a unicode char into 3 classes or unknown. *)
 val classify : int -> status
 
 (** Return [None] if a given string can be used as a (Coq) identifier.
-    Return [Some (b,s)] otherwise, where [s] is an explanation and [b] is severity.
-    @raise Unsupported if the input string contains unsupported UTF-8 characters. *)
+    Return [Some (b,s)] otherwise, where [s] is an explanation and [b] is severity. *)
 val ident_refutation : string -> (bool * string) option
 
 (** First char of a string, converted to lowercase
-    @raise Unsupported if the input string contains unsupported UTF-8 characters.
     @raise Assert_failure if the input string is empty. *)
 val lowercase_first_char : string -> string
 
diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4
index e59b9630fb..6a343f50ee 100644
--- a/parsing/cLexer.ml4
+++ b/parsing/cLexer.ml4
@@ -112,11 +112,6 @@ type token_kind =
   | AsciiChar
   | EmptyStream
 
-let error_unsupported_unicode_character loc n unicode cs =
-  let bp = Stream.count cs in
-  let loc = set_loc_pos loc bp (bp+n) in
-  err loc (UnsupportedUnicode unicode)
-
 let error_utf8 loc cs =
   let bp = Stream.count cs in
   Stream.junk cs; (* consume the char to avoid read it and fail again *)
@@ -166,14 +161,12 @@ let lookup_utf8_tail loc c cs =
           (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
       | _ -> error_utf8 loc cs
     in
-    try Unicode.classify unicode, n
-    with Unicode.Unsupported ->
-      njunk n cs; error_unsupported_unicode_character loc n unicode cs
+    Utf8Token (Unicode.classify unicode, n)
 
 let lookup_utf8 loc cs =
   match Stream.peek cs with
     | Some ('\x00'..'\x7F') -> AsciiChar
-    | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail loc c cs)
+    | Some ('\x80'..'\xFF' as c) -> lookup_utf8_tail loc c cs
     | None -> EmptyStream
 
 let unlocated f x = f x
@@ -191,17 +184,6 @@ let check_keyword str =
   in
   loop_symb (Stream.of_string str)
 
-let warn_unparsable_keyword =
-  CWarnings.create ~name:"unparsable-keyword" ~category:"parsing"
-         (fun (s,unicode) ->
-          strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x \
-                                  which will not be parsable." s unicode))
-            
-let check_keyword_to_add s =
-  try check_keyword s
-  with Error.E (UnsupportedUnicode unicode) ->
-    warn_unparsable_keyword (s,unicode)
-
 let check_ident str =
   let rec loop_id intail = parser
     | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] ->
@@ -232,7 +214,7 @@ let is_keyword s =
 let add_keyword str =
   if not (is_keyword str) then
     begin
-      check_keyword_to_add str;
+      check_keyword str;
       token_tree := ttree_add !token_tree str
     end
 
@@ -599,7 +581,7 @@ let rec next_token loc = parser bp
             let ep = Stream.count s in
             comment_stop bp;
             (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
-        | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart), _) ->
+        | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart | Unicode.Unknown), _) ->
             let t = process_chars loc bp (Stream.next s) s in
             let new_between_commands = match t with
               (KEYWORD ("{"|"}"),_) -> !between_commands | _ -> false in
author	Hugo Herbelin	2016-10-13 15:57:14 +0200
committer	Hugo Herbelin	2016-10-17 20:22:17 +0200
commit	57c6ffd23836364168ffd1c66dbddbecf830c7c6 (patch)
tree	051f302085ef8a33f90bd0533053d158e1b29b6d
parent	4204581ccb8bdf0f6c4298029c010c6deb643594 (diff)