diff options
| author | Matej Kosik | 2015-12-16 12:55:40 +0100 |
|---|---|---|
| committer | Matej Kosik | 2015-12-18 15:57:49 +0100 |
| commit | 5174ee7e118d2bc57fc2d8a6619101735af79b16 (patch) | |
| tree | d387c3bcf3f7358b7aefbfa5e3d556b46f8b32ec /lib | |
| parent | ca42472322013714050b98756aeaa222908fbe67 (diff) | |
COMMENTS: added to the "Unicode" module.
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/unicode.ml | 7 | ||||
| -rw-r--r-- | lib/unicode.mli | 22 |
2 files changed, 22 insertions, 7 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml index 1765e93dcd..05998bb804 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -18,7 +18,7 @@ exception Unsupported to simplify the masking process. (This choice seems to be a good trade-off between speed and space after some benchmarks.) *) -(* A 256ko table, initially filled with zeros. *) +(* A 256 KiB table, initially filled with zeros. *) let table = Array.make (1 lsl 17) 0 (* Associate a 2-bit pattern to each status at position [i]. @@ -147,6 +147,11 @@ let utf8_of_unicode n = s end +(* If [s] is some UTF-8 encoded string + and [i] is a position of some UTF-8 character within [s] + then [next_utf8 s i] returns [(j,n)] where: + - [j] indicates the position of the next UTF-8 character + - [n] represents the UTF-8 character at index [i] *) let next_utf8 s i = let err () = invalid_arg "utf8" in let l = String.length s - i in diff --git a/lib/unicode.mli b/lib/unicode.mli index 098f6c919d..eb75f00c28 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -10,19 +10,29 @@ type status = Letter | IdentPart | Symbol +(** This exception is raised when the input string contains unsupported UTF-8 characters. *) exception Unsupported -(** Classify a unicode char into 3 classes, or raise [Unsupported] *) +(** Classify a unicode char into 3 classes. + @raise Unsupported if the input string contains unsupported UTF-8 characters. *) val classify : int -> status -(** Check whether a given string be used as a legal identifier. - - [None] means yes - - [Some (b,s)] means no, with explanation [s] and severity [b] *) +(** Return [None] if a given string can be used as a (Coq) identifier. + Return [Some (b,s)] otherwise, where [s] is an explanation and [b] is severity. + @raise Unsupported if the input string contains unsupported UTF-8 characters. 
*) val ident_refutation : string -> (bool * string) option -(** First char of a string, converted to lowercase *) +(** First char of a string, converted to lowercase + @raise Unsupported if the input string contains unsupported UTF-8 characters. + @raise Assert_failure if the input string is empty. *) val lowercase_first_char : string -> string -(** For extraction, turn a unicode string into an ascii-only one *) +(** Return [true] if all UTF-8 characters in the input string are just plain ASCII characters. + Return [false] otherwise. *) val is_basic_ascii : string -> bool + +(** [ascii_of_ident s] maps a UTF-8 string to a string composed solely of ASCII characters. + Those UTF-8 characters which do not have their ASCII counterparts are + translated to ["__Uxxxx_"] where {i xxxx} are four hexadecimal digits. + @raise Unsupported if the input string contains unsupported UTF-8 characters. *) val ascii_of_ident : string -> string |
