Unicode.ascii_of_ident is now truly injective

A non-ASCII character is now converted to _UUxxxx_, where xxxx is its Unicode code point in hexadecimal. Any preexisting _UU substring in the identifier is converted to _UUU.

The switch from __Uxxxx_ to _UUxxxx_ is cosmetic: it just helps the extraction (fewer __ in names). But the other part of the patch (detection of preexisting _UU substrings) is critical to make ascii_of_ident truly injective and to avoid the following kind of proof of False via native_compute:

  Definition α := 1.
  Definition __U03b1_ := 2.
  Lemma oups : False.
  Proof.
    assert (α = __U03b1_). { native_compute. reflexivity. }
    discriminate.
  Qed.
author: Pierre Letouzey 2016-05-19 15:18:26 +0200
committer: Pierre Letouzey 2016-05-19 15:18:26 +0200
commit: 244d7a9aafe7ad613dd2095ca3126560cb3ea1d0 (patch)
tree: 26102e433f0072ab32f724fa231693510119c37b /lib
parent: c14e6eebc6c3696623a440cd7eaa4a8d8fe4f492 (diff)
2 files changed, 27 insertions, 16 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 938e8f1a99..7aa8d9d513 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -240,14 +240,24 @@ let is_basic_ascii s =
   !ok
 
 let ascii_of_ident s =
-  if is_basic_ascii s then s else
-    let i = ref 0 and out = ref "" in
-    begin try while true do
+  let len = String.length s in
+  let has_UU i =
+    i+2 < len && s.[i]='_' && s.[i+1]='U' && s.[i+2]='U'
+  in
+  let i = ref 0 in
+  while !i < len && Char.code s.[!i] < 128 && not (has_UU !i) do
+    incr i
+  done;
+  if !i = len then s else
+    let out = Buffer.create (2*len) in
+    Buffer.add_substring out s 0 !i;
+    while !i < len do
       let j, n = next_utf8 s !i in
-      out :=
-        if n >= 128
-        then Printf.sprintf "%s__U%04x_" !out n
-        else Printf.sprintf "%s%c" !out s.[!i];
-      i := !i + j
-    done with End_of_input -> () end;
-    !out
+      if n >= 128 then
+        (Printf.bprintf out "_UU%04x_" n; i := !i + j)
+      else if has_UU !i then
+        (Buffer.add_string out "_UUU"; i := !i + 3)
+      else
+        (Buffer.add_char out s.[!i]; incr i)
+    done;
+    Buffer.contents out
diff --git a/lib/unicode.mli b/lib/unicode.mli
index b8a11e2945..aaf455dec5 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -27,14 +27,15 @@ val ident_refutation : string -> (bool * string) option
     @raise Assert_failure if the input string is empty. *)
 val lowercase_first_char : string -> string
 
-(** Return [true] if all UTF-8 characters in the input string are just plain ASCII characters.
-    Returns [false] otherwise. *)
+(** Return [true] if all UTF-8 characters in the input string are just plain
+    ASCII characters. Returns [false] otherwise. *)
 val is_basic_ascii : string -> bool
 
-(** [ascii_of_ident s] maps UTF-8 string to a string composed solely from ASCII characters.
-    Those UTF-8 characters which do not have their ASCII counterparts are
-    translated to ["__Uxxxx_"] where {i xxxx} are four hexadecimal digits.
-    @raise Unsupported if the input string contains unsupported UTF-8 characters. *)
+(** [ascii_of_ident s] maps UTF-8 string to a string composed solely from ASCII
+    characters. The non-ASCII characters are translated to ["_UUxxxx_"] where
+    {i xxxx} is the Unicode index of the character in hexadecimal (from four
+    to six hex digits). To avoid potential name clashes, any preexisting
+    substring ["_UU"] is turned into ["_UUU"]. *)
 val ascii_of_ident : string -> string
 
 (** Validate an UTF-8 string *)
author	Pierre Letouzey	2016-05-19 15:18:26 +0200
committer	Pierre Letouzey	2016-05-19 15:18:26 +0200
commit	244d7a9aafe7ad613dd2095ca3126560cb3ea1d0 (patch)
tree	26102e433f0072ab32f724fa231693510119c37b /lib
parent	c14e6eebc6c3696623a440cd7eaa4a8d8fe4f492 (diff)