aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPierre-Marie Pédrot2016-06-14 00:08:14 +0200
committerPierre-Marie Pédrot2016-06-14 00:09:30 +0200
commit236627cdf081e51fce3bc54fdee4e40d4f6ca85e (patch)
tree161c930e2efacd1198fed18f208e9f4fdba1b8fc
parent45de05d0ea9740f14c58dfd67436ddbea03c6a49 (diff)
Moving UTF-8 related functions to Unicode module.
-rw-r--r--lib/pp.ml32
-rw-r--r--lib/pp.mli3
-rw-r--r--lib/unicode.ml70
-rw-r--r--lib/unicode.mli6
-rw-r--r--ltacprof/profile_ltac.ml1
5 files changed, 77 insertions, 35 deletions
diff --git a/lib/pp.ml b/lib/pp.ml
index 8687878646..d07f01b906 100644
--- a/lib/pp.ml
+++ b/lib/pp.ml
@@ -168,38 +168,6 @@ let utf8_length s =
done ;
!cnt
-(* Variant of String.sub for UTF8 character positions *)
-let utf8_sub s start_u len_u =
- let len_b = String.length s
- and end_u = start_u + len_u
- and cnt = ref 0
- and nc = ref 0
- and p = ref 0 in
- let start_b = ref len_b in
- while !p < len_b && !cnt < end_u do
- if !cnt <= start_u then start_b := !p ;
- begin
- match s.[!p] with
- | '\000'..'\127' -> nc := 0 (* ascii char *)
- | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
- | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
- | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
- | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
- | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
- | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
- | '\254'..'\255' -> nc := 0 (* invalid byte *)
- end ;
- incr p ;
- while !p < len_b && !nc > 0 do
- match s.[!p] with
- | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
- | _ (* not a continuation byte *) -> nc := 0
- done ;
- incr cnt
- done ;
- let end_b = !p in
- String.sub s !start_b (end_b - !start_b)
-
(* formatting commands *)
let str s = Glue.atom(Ppcmd_print (Str_def s))
let stras (i, s) = Glue.atom(Ppcmd_print (Str_len (s, i)))
diff --git a/lib/pp.mli b/lib/pp.mli
index 56b82e4489..a18744c376 100644
--- a/lib/pp.mli
+++ b/lib/pp.mli
@@ -100,9 +100,6 @@ val close_tag : unit -> std_ppcmds
val string_of_ppcmds : std_ppcmds -> string
-val utf8_length : string -> int
-val utf8_sub : string -> int -> int -> string
-
(** {6 Printing combinators} *)
val pr_comma : unit -> std_ppcmds
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 7aa8d9d513..dc852d9819 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -261,3 +261,73 @@ let ascii_of_ident s =
(Buffer.add_char out s.[!i]; incr i)
done;
Buffer.contents out
+
+(* Compute length of an UTF-8 encoded string
+ Rem 1 : utf8_length <= String.length (equal if pure ascii)
+ Rem 2 : if used for an iso8859_1 encoded string, the result is
+ wrong in very rare cases. Such a wrong case corresponds to any
+ sequence of a character in range 192..253 immediately followed by a
+ character in range 128..191 (typical case in french is "déçu" which
+ is counted 3 instead of 4); then no real harm to use always
+ utf8_length even if using an iso8859_1 encoding *)
+
+(** FIXME: duplicate code with Pp *)
+
+let utf8_length s =
+ let len = String.length s
+ and cnt = ref 0
+ and nc = ref 0
+ and p = ref 0 in
+ while !p < len do
+ begin
+ match s.[!p] with
+ | '\000'..'\127' -> nc := 0 (* ascii char *)
+ | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+ | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+ | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+ | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+ | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
+ | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
+ | '\254'..'\255' -> nc := 0 (* invalid byte *)
+ end ;
+ incr p ;
+ while !p < len && !nc > 0 do
+ match s.[!p] with
+ | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+ | _ (* not a continuation byte *) -> nc := 0
+ done ;
+ incr cnt
+ done ;
+ !cnt
+
+(* Variant of String.sub for UTF8 character positions *)
+let utf8_sub s start_u len_u =
+ let len_b = String.length s
+ and end_u = start_u + len_u
+ and cnt = ref 0
+ and nc = ref 0
+ and p = ref 0 in
+ let start_b = ref len_b in
+ while !p < len_b && !cnt < end_u do
+ if !cnt <= start_u then start_b := !p ;
+ begin
+ match s.[!p] with
+ | '\000'..'\127' -> nc := 0 (* ascii char *)
+ | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+ | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+ | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+ | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+ | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
+ | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
+ | '\254'..'\255' -> nc := 0 (* invalid byte *)
+ end ;
+ incr p ;
+ while !p < len_b && !nc > 0 do
+ match s.[!p] with
+ | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+ | _ (* not a continuation byte *) -> nc := 0
+ done ;
+ incr cnt
+ done ;
+ let end_b = !p in
+ String.sub s !start_b (end_b - !start_b)
diff --git a/lib/unicode.mli b/lib/unicode.mli
index aaf455dec5..1f8bd44eee 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -40,3 +40,9 @@ val ascii_of_ident : string -> string
(** Validate an UTF-8 string *)
val is_utf8 : string -> bool
+
+(** Return the length of a valid UTF-8 string. *)
+val utf8_length : string -> int
+
+(** Variant of {!String.sub} for UTF-8 strings. *)
+val utf8_sub : string -> int -> int -> string
diff --git a/ltacprof/profile_ltac.ml b/ltacprof/profile_ltac.ml
index 97d74ad80f..41bde12b18 100644
--- a/ltacprof/profile_ltac.ml
+++ b/ltacprof/profile_ltac.ml
@@ -1,3 +1,4 @@
+open Unicode
open Pp
open Printer
open Util