aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorHugo Herbelin2017-10-04 22:00:19 +0200
committerHugo Herbelin2017-10-05 08:36:50 +0200
commit526791d917f9b0804376eae02a462a3b32dd7cba (patch)
treed9ad17bf4c1e287627acc3183bdbabf88b49438c /lib
parent9a2bb3a6d12a082c61dfda62be53c195fe3cb57c (diff)
Distinguishing pseudo-letters out of the set of unicode letters.
This includes _ and insecable space which can be used in idents and this allows more precise heuristics.
Diffstat (limited to 'lib')
-rw-r--r--lib/unicode.ml53
-rw-r--r--lib/unicode.mli11
2 files changed, 43 insertions, 21 deletions
diff --git a/lib/unicode.ml b/lib/unicode.ml
index 9e79f1048b..528d6434c1 100644
--- a/lib/unicode.ml
+++ b/lib/unicode.ml
@@ -8,13 +8,14 @@
(** Unicode utilities *)
-type status = Letter | IdentPart | Symbol | Unknown
+type status = Letter | IdentPart | Symbol | IdentSep | Unknown
(* The following table stores classes of Unicode characters that
- are used by the lexer. There are 3 different classes so 2 bits are
- allocated for each character. We only use 16 bits over the 31 bits
- to simplify the masking process. (This choice seems to be a good
- trade-off between speed and space after some benchmarks.) *)
+ are used by the lexer. There are 5 different classes so 3 bits
+ are allocated for each character. We encode the masks of 8
+ characters per word, thus using 24 bits over the 31 available
+ bits. (This choice seems to be a good trade-off between speed
+ and space after some benchmarks.) *)
(* A 256 KiB table, initially filled with zeros. *)
let table = Array.make (1 lsl 17) 0
@@ -24,14 +25,15 @@ let table = Array.make (1 lsl 17) 0
define the position of the pattern in the word.
Notice that pattern "00" means "undefined". *)
let mask i = function
- | Letter -> 1 lsl ((i land 7) lsl 1) (* 01 *)
- | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
- | Symbol -> 3 lsl ((i land 7) lsl 1) (* 11 *)
- | Unknown -> 0 lsl ((i land 7) lsl 1) (* 00 *)
+ | Letter -> 1 lsl ((i land 7) * 3) (* 001 *)
+ | IdentPart -> 2 lsl ((i land 7) * 3) (* 010 *)
+ | Symbol -> 3 lsl ((i land 7) * 3) (* 011 *)
+ | IdentSep -> 4 lsl ((i land 7) * 3) (* 100 *)
+ | Unknown -> 0 lsl ((i land 7) * 3) (* 000 *)
-(* Helper to reset 2 bits in a word. *)
+(* Helper to reset 3 bits in a word. *)
let reset_mask i =
- lnot (3 lsl ((i land 7) lsl 1))
+ lnot (7 lsl ((i land 7) * 3))
(* Initialize the lookup table from a list of segments, assigning
a status to every character of each segment. The order of these
@@ -50,13 +52,14 @@ let mk_lookup_table_from_unicode_tables_for status tables =
(* Look up into the table and interpret the found pattern. *)
let lookup x =
- let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in
+ let v = (table.(x lsr 3) lsr ((x land 7) * 3)) land 7 in
if v = 1 then Letter
else if v = 2 then IdentPart
else if v = 3 then Symbol
+ else if v = 4 then IdentSep
else Unknown
-(* [classify] discriminates between 3 different kinds of
+(* [classify] discriminates between 5 different kinds of
symbols based on the standard unicode classification (extracted from
Camomile). *)
let classify =
@@ -107,7 +110,7 @@ let classify =
[(0x02074, 0x02079)]; (* Superscript 4-9. *)
single 0x0002E; (* Dot. *)
];
- mk_lookup_table_from_unicode_tables_for Letter
+ mk_lookup_table_from_unicode_tables_for IdentSep
[
single 0x005F; (* Underscore. *)
single 0x00A0; (* Non breaking space. *)
@@ -165,22 +168,32 @@ let is_utf8 s =
(* Check the well-formedness of an identifier *)
+let is_valid_ident_initial = function
+ | Letter | IdentSep -> true
+ | IdentPart | Symbol | Unknown -> false
+
let initial_refutation j n s =
- match classify n with
- | Letter -> None
- | _ ->
+ if is_valid_ident_initial (classify n) then None
+ else
let c = String.sub s 0 j in
Some (false,
"Invalid character '"^c^"' at beginning of identifier \""^s^"\".")
+let is_valid_ident_trailing = function
+ | Letter | IdentSep | IdentPart -> true
+ | Symbol | Unknown -> false
+
let trailing_refutation i j n s =
- match classify n with
- | Letter | IdentPart -> None
- | _ ->
+ if is_valid_ident_trailing (classify n) then None
+ else
let c = String.sub s i j in
Some (false,
"Invalid character '"^c^"' in identifier \""^s^"\".")
+let is_unknown = function
+ | Unknown -> true
+ | Letter | IdentSep | IdentPart | Symbol -> false
+
let ident_refutation s =
if s = ".." then None else try
let j, n = next_utf8 s 0 in
diff --git a/lib/unicode.mli b/lib/unicode.mli
index c7d7424801..a608d5f026 100644
--- a/lib/unicode.mli
+++ b/lib/unicode.mli
@@ -8,7 +8,7 @@
(** Unicode utilities *)
-type status = Letter | IdentPart | Symbol | Unknown
+type status
(** Classify a unicode char into 3 classes or unknown. *)
val classify : int -> status
@@ -17,6 +17,15 @@ val classify : int -> status
Return [Some (b,s)] otherwise, where [s] is an explanation and [b] is severity. *)
val ident_refutation : string -> (bool * string) option
+(** Tells if a valid initial character for an identifier *)
+val is_valid_ident_initial : status -> bool
+
+(** Tells if a valid non-initial character for an identifier *)
+val is_valid_ident_trailing : status -> bool
+
+(** Tells if a character is unclassified *)
+val is_unknown : status -> bool
+
(** First char of a string, converted to lowercase
@raise Assert_failure if the input string is empty. *)
val lowercase_first_char : string -> string