From d2f9a457d0bb2fd11ac7d5f6587174a79ca9c4b6 Mon Sep 17 00:00:00 2001 From: Pierre Letouzey Date: Thu, 19 May 2016 15:18:26 +0200 Subject: Unicode.ascii_of_ident is now truly injective A non-ASCII char is now converted to _UUxxxx_ with xxxx being its Unicode code point in hexadecimal. In addition, any preexisting _UU substring in the ident is converted to _UUU. The switch from __Uxxxx_ to _UUxxxx_ is cosmetic; it just helps the extraction (fewer __ in names). But the other part of the patch (detection of preexisting _UU substrings) is critical to make ascii_of_ident truly injective and avoid the following kind of proof of False via native_compute: Definition α := 1. Definition __U03b1_ := 2. Lemma oups : False. Proof. assert (α = __U03b1_). { native_compute. reflexivity. } discriminate. Qed. Conflicts: lib/unicode.mli --- lib/unicode.ml | 30 ++++++++++++++++++++---------- lib/unicode.mli | 9 ++++++++- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/lib/unicode.ml b/lib/unicode.ml index cfaa73cc11..0dc4238ee4 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -235,14 +235,24 @@ let is_basic_ascii s = !ok let ascii_of_ident s = - if is_basic_ascii s then s else - let i = ref 0 and out = ref "" in - begin try while true do + let len = String.length s in + let has_UU i = + i+2 < len && s.[i]='_' && s.[i+1]='U' && s.[i+2]='U' + in + let i = ref 0 in + while !i < len && Char.code s.[!i] < 128 && not (has_UU !i) do + incr i + done; + if !i = len then s else + let out = Buffer.create (2*len) in + Buffer.add_substring out s 0 !i; + while !i < len do let j, n = next_utf8 s !i in - out := - if n >= 128 - then Printf.sprintf "%s__U%04x_" !out n - else Printf.sprintf "%s%c" !out s.[!i]; - i := !i + j - done with End_of_input -> () end; - !out + if n >= 128 then + (Printf.bprintf out "_UU%04x_" n; i := !i + j) + else if has_UU !i then + (Buffer.add_string out "_UUU"; i := !i + 3) + else + (Buffer.add_char out s.[!i]; incr i) + done; + Buffer.contents out diff --git 
a/lib/unicode.mli b/lib/unicode.mli index 65e75a20d6..00211164fb 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -23,8 +23,15 @@ val ident_refutation : string -> (bool * string) option (** First char of a string, converted to lowercase *) val lowercase_first_char : string -> string -(** For extraction, turn a unicode string into an ascii-only one *) +(** Returns [true] if all UTF-8 characters in the input string are plain + ASCII characters, and [false] otherwise. *) val is_basic_ascii : string -> bool + +(** [ascii_of_ident s] maps a UTF-8 string to a string composed solely of ASCII + characters. Each non-ASCII character is translated to ["_UUxxxx_"], where + {i xxxx} is the Unicode code point of the character in hexadecimal (from four + to six hex digits). To avoid potential name clashes, any preexisting + substring ["_UU"] is turned into ["_UUU"]. *) val ascii_of_ident : string -> string (** Validate an UTF-8 string *) -- cgit v1.2.3