Improving error messages in the presence of utf-8 characters

- A more clever strategy than the use of null spaces (in revision 12058) for collapsing multi-byte utf-8 characters into one character (toplevel.ml, 8.3 and trunk only) - Fixing discard_to_dot in the presence of iterated lexing failures - Made the lexer state aligned with the start of utf-8 chars instead of staying in the middle of multi-byte chars when a token is not recognized (lexer.ml4) git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@12891 85f007b7-540e-0410-9357-904b9bb8a0f7
author: herbelin 2010-03-30 11:40:02 +0000
committer: herbelin 2010-03-30 11:40:02 +0000
commit: 6cbbf1f3cb363256a27d6d8d20684a4510768464 (patch)
tree: 459ed5abd87f83a5c8bdc6a665d5778eec5790b0 /parsing
parent: a4a492ca1c1fe2caa3f5de785fe08662d9520725 (diff)
1 files changed, 14 insertions, 9 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 8ec6b886ee..4b15e200ca 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -103,6 +103,14 @@ let error_utf8 cs =
   Stream.junk cs; (* consume the char to avoid read it and fail again *)
   err (bp, bp+1) Illegal_character
 
+let utf8_char_size cs = function
+  (* Utf8 leading byte *)
+  | '\x00'..'\x7F' -> 1
+  | '\xC0'..'\xDF' -> 2
+  | '\xE0'..'\xEF' -> 3
+  | '\xF0'..'\xF7' -> 4
+  | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) -> error_utf8 cs
+
 let njunk n = Util.repeat n Stream.junk
 
 let check_utf8_trailing_byte cs c =
@@ -377,14 +385,8 @@ and progress_utf8 last nj n c tt cs =
   with Not_found ->
     last
 
-and progress_from_byte last nj tt cs = function
-  (* Utf8 leading byte *)
-  | '\x00'..'\x7F' as c -> progress_utf8 last nj 1 c tt cs
-  | '\xC0'..'\xDF' as c -> progress_utf8 last nj 2 c tt cs
-  | '\xE0'..'\xEF' as c -> progress_utf8 last nj 3 c tt cs
-  | '\xF0'..'\xF7' as c -> progress_utf8 last nj 4 c tt cs
-  | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) ->
-      error_utf8 cs
+and progress_from_byte last nj tt cs c =
+  progress_utf8 last nj (utf8_char_size cs c) c tt cs
 
 let find_keyword id s =
   let tt = ttree_find !token_tree id in
@@ -398,7 +400,10 @@ let process_chars bp c cs =
   let ep = Stream.count cs in
   match t with
     | Some t -> (("", t), (bp, ep))
-    | None -> err (bp, ep) Undefined_token
+    | None ->
+	let ep' = bp + utf8_char_size cs c in
+	njunk (ep' - ep) cs;
+	err (bp, ep') Undefined_token
 
 let parse_after_dollar bp =
   parser
author	herbelin	2010-03-30 11:40:02 +0000
committer	herbelin	2010-03-30 11:40:02 +0000
commit	6cbbf1f3cb363256a27d6d8d20684a4510768464 (patch)
tree	459ed5abd87f83a5c8bdc6a665d5778eec5790b0 /parsing
parent	a4a492ca1c1fe2caa3f5de785fe08662d9520725 (diff)