aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode.ml
diff options
context:
space:
mode:
author Pierre Letouzey 2016-05-19 15:18:26 +0200
committer Pierre Letouzey 2016-05-19 15:18:26 +0200
commit 244d7a9aafe7ad613dd2095ca3126560cb3ea1d0 (patch)
tree 26102e433f0072ab32f724fa231693510119c37b /lib/unicode.ml
parent c14e6eebc6c3696623a440cd7eaa4a8d8fe4f492 (diff)
Unicode.ascii_of_ident is now truly injective
A non-ASCII char is now converted to _UUxxxx_, where xxxx is its Unicode index in hexadecimal. Any preexisting _UU substring in the ident is converted to _UUU. The switch from __Uxxxx_ to _UUxxxx_ is cosmetic; it just helps the extraction (fewer __ in names). But the other part of the patch (detection of preexisting _UU substrings) is critical to make ascii_of_ident truly injective and to avoid the following kind of proof of False via native_compute: Definition α := 1. Definition __U03b1_ := 2. Lemma oups : False. Proof. assert (α = __U03b1_). { native_compute. reflexivity. } discriminate. Qed.
Diffstat (limited to 'lib/unicode.ml')
-rw-r--r-- lib/unicode.ml 30
1 files changed, 20 insertions, 10 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 938e8f1a99..7aa8d9d513 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -240,14 +240,24 @@ let is_basic_ascii s =
!ok
let ascii_of_ident s =
- if is_basic_ascii s then s else
- let i = ref 0 and out = ref "" in
- begin try while true do
+ let len = String.length s in
+ let has_UU i =
+ i+2 < len && s.[i]='_' && s.[i+1]='U' && s.[i+2]='U'
+ in
+ let i = ref 0 in
+ while !i < len && Char.code s.[!i] < 128 && not (has_UU !i) do
+ incr i
+ done;
+ if !i = len then s else
+ let out = Buffer.create (2*len) in
+ Buffer.add_substring out s 0 !i;
+ while !i < len do
let j, n = next_utf8 s !i in
- out :=
- if n >= 128
- then Printf.sprintf "%s__U%04x_" !out n
- else Printf.sprintf "%s%c" !out s.[!i];
- i := !i + j
- done with End_of_input -> () end;
- !out
+ if n >= 128 then
+ (Printf.bprintf out "_UU%04x_" n; i := !i + j)
+ else if has_UU !i then
+ (Buffer.add_string out "_UUU"; i := !i + 3)
+ else
+ (Buffer.add_char out s.[!i]; incr i)
+ done;
+ Buffer.contents out