diff options
| author | Pierre-Marie Pédrot | 2016-06-14 00:08:14 +0200 |
|---|---|---|
| committer | Pierre-Marie Pédrot | 2016-06-14 00:09:30 +0200 |
| commit | 236627cdf081e51fce3bc54fdee4e40d4f6ca85e (patch) | |
| tree | 161c930e2efacd1198fed18f208e9f4fdba1b8fc | |
| parent | 45de05d0ea9740f14c58dfd67436ddbea03c6a49 (diff) | |
Moving UTF-8 related functions to Unicode module.
| -rw-r--r-- | lib/pp.ml | 32 | ||||
| -rw-r--r-- | lib/pp.mli | 3 | ||||
| -rw-r--r-- | lib/unicode.ml | 70 | ||||
| -rw-r--r-- | lib/unicode.mli | 6 | ||||
| -rw-r--r-- | ltacprof/profile_ltac.ml | 1 |
5 files changed, 77 insertions, 35 deletions
@@ -168,38 +168,6 @@ let utf8_length s = done ; !cnt -(* Variant of String.sub for UTF8 character positions *) -let utf8_sub s start_u len_u = - let len_b = String.length s - and end_u = start_u + len_u - and cnt = ref 0 - and nc = ref 0 - and p = ref 0 in - let start_b = ref len_b in - while !p < len_b && !cnt < end_u do - if !cnt <= start_u then start_b := !p ; - begin - match s.[!p] with - | '\000'..'\127' -> nc := 0 (* ascii char *) - | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *) - | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) - | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) - | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) - | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) - | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) - | '\254'..'\255' -> nc := 0 (* invalid byte *) - end ; - incr p ; - while !p < len_b && !nc > 0 do - match s.[!p] with - | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc - | _ (* not a continuation byte *) -> nc := 0 - done ; - incr cnt - done ; - let end_b = !p in - String.sub s !start_b (end_b - !start_b) - (* formatting commands *) let str s = Glue.atom(Ppcmd_print (Str_def s)) let stras (i, s) = Glue.atom(Ppcmd_print (Str_len (s, i))) diff --git a/lib/pp.mli b/lib/pp.mli index 56b82e4489..a18744c376 100644 --- a/lib/pp.mli +++ b/lib/pp.mli @@ -100,9 +100,6 @@ val close_tag : unit -> std_ppcmds val string_of_ppcmds : std_ppcmds -> string -val utf8_length : string -> int -val utf8_sub : string -> int -> int -> string - (** {6 Printing combinators} *) val pr_comma : unit -> std_ppcmds diff --git a/lib/unicode.ml b/lib/unicode.ml index 7aa8d9d513..dc852d9819 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -261,3 +261,73 @@ let ascii_of_ident s = (Buffer.add_char out s.[!i]; incr i) done; Buffer.contents out + +(* Compute length of an UTF-8 encoded string + Rem 1 : utf8_length <= String.length (equal if pure ascii) + Rem 2 : if used for an iso8859_1 encoded string, the result is + wrong in very rare cases. Such a wrong case corresponds to any + sequence of a character in range 192..253 immediately followed by a + character in range 128..191 (typical case in french is "déçu" which + is counted 3 instead of 4); then no real harm to use always + utf8_length even if using an iso8859_1 encoding *) + +(** FIXME: duplicate code with Pp *) + +let utf8_length s = + let len = String.length s + and cnt = ref 0 + and nc = ref 0 + and p = ref 0 in + while !p < len do + begin + match s.[!p] with + | '\000'..'\127' -> nc := 0 (* ascii char *) + | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *) + | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) + | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) + | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) + | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) + | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) + | '\254'..'\255' -> nc := 0 (* invalid byte *) + end ; + incr p ; + while !p < len && !nc > 0 do + match s.[!p] with + | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc + | _ (* not a continuation byte *) -> nc := 0 + done ; + incr cnt + done ; + !cnt + +(* Variant of String.sub for UTF8 character positions *) +let utf8_sub s start_u len_u = + let len_b = String.length s + and end_u = start_u + len_u + and cnt = ref 0 + and nc = ref 0 + and p = ref 0 in + let start_b = ref len_b in + while !p < len_b && !cnt < end_u do + if !cnt <= start_u then start_b := !p ; + begin + match s.[!p] with + | '\000'..'\127' -> nc := 0 (* ascii char *) + | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *) + | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) + | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) + | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) + | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) + | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) + | '\254'..'\255' -> nc := 0 (* invalid byte *) + end ; + incr p ; + while !p < len_b && !nc > 0 do + match s.[!p] with + | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc + | _ (* not a continuation byte *) -> nc := 0 + done ; + incr cnt + done ; + let end_b = !p in + String.sub s !start_b (end_b - !start_b) diff --git a/lib/unicode.mli b/lib/unicode.mli index aaf455dec5..1f8bd44eee 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -40,3 +40,9 @@ val ascii_of_ident : string -> string (** Validate an UTF-8 string *) val is_utf8 : string -> bool + +(** Return the length of a valid UTF-8 string. *) +val utf8_length : string -> int + +(** Variant of {!String.sub} for UTF-8 strings. *) +val utf8_sub : string -> int -> int -> string diff --git a/ltacprof/profile_ltac.ml b/ltacprof/profile_ltac.ml index 97d74ad80f..41bde12b18 100644 --- a/ltacprof/profile_ltac.ml +++ b/ltacprof/profile_ltac.ml @@ -1,3 +1,4 @@ +open Unicode open Pp open Printer open Util |
