diff options
Diffstat (limited to 'parsing/lexer.ml4')
| -rw-r--r-- | parsing/lexer.ml4 | 96 |
1 files changed, 44 insertions, 52 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 index 6424cb3d9c..0a0228ac04 100644 --- a/parsing/lexer.ml4 +++ b/parsing/lexer.ml4 @@ -68,71 +68,64 @@ exception Error of error let bad_token str = raise (Error (Bad_token str)) let check_special_token str = - let rec loop = parser + let rec loop_symb = parser | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str | [< _ = Stream.empty >] -> () - | [< '_ ; s >] -> loop s + | [< '_ ; s >] -> loop_symb s in - loop (Stream.of_string str) + loop_symb (Stream.of_string str) let check_ident str = - let rec loop = parser - | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] -> loop s + let first_letter = function + (''' | '0'..'9') -> false + | _ -> true in + let rec loop_id = parser + | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] -> + loop_id s (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) - | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop s - (* iso 8859-1 accentuated letters *) - | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255'); s >] -> loop s + | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop_id s + | [< ''\226'; 'c2; 'c3; s >] -> + (match c2, c3 with + (* utf8 letter-like unicode 2100-214F *) + | (('\132', '\128'..'\191') | ('\133', '\128'..'\143')) -> + loop_id s + (* utf8 symbols (see [parse_226_tail]) *) + | (('\134'..'\143' | '\152'..'\155' + | '\164'..'\165' | '\168'..'\171'),_) -> + bad_token str + | _ -> (* default to iso 8859-1 "â" *) + if !Options.v7 then loop_id [< 'c2; 'c3; s >] + else bad_token str) + (* iso 8859-1 accentuated letters *) + | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255'); s >] -> + if !Options.v7 then loop_id s else bad_token str | [< _ = Stream.empty >] -> () | [< >] -> bad_token str in - loop (Stream.of_string str) + if String.length str > 0 && first_letter str.[0] then + loop_id (Stream.of_string str) + else + bad_token str -(* Special token dictionary *) -let token_tree = ref empty_ttree +let check_keyword str = + try check_special_token str + with Error _ -> check_ident str -let add_special_token str = - check_special_token str; - token_tree := ttree_add !token_tree str +(* Keyword and symbol dictionary *) +let token_tree = ref empty_ttree -(* Keyword identifier dictionary *) -let keywords = ref empty_ttree +let find_keyword s = ttree_find !token_tree s -let find_keyword s = ttree_find !keywords s +let is_keyword s = + try let _ = ttree_find !token_tree s in true with Not_found -> false let add_keyword str = - check_ident str; - keywords := ttree_add !keywords str - -let is_keyword s = - try let _ = ttree_find !keywords s in true with Not_found -> false - -let is_normal_token str = - if String.length str > 0 then - match str.[0] with - | ' ' | '\n' | '\r' | '\t' | '"' -> bad_token str - | '$' | 'a'..'z' | 'A'..'Z' | '_' -> true - (* utf-8 symbols of the form "E2 xx xx" [E2=226] *) - | '\226' when String.length str > 2 -> - (match str.[1], str.[2] with - | ('\132', '\128'..'\191') | ('\133', '\128'..'\143') -> - (* utf8 letter-like unicode 2100-214F *) true - | (('\134'..'\143' | '\152'..'\155' - | '\164'..'\165' | '\168'..'\171'),_) -> - (* utf8 symbols (see [parse_226_tail] *) - false - | _ -> - (* default to iso 8859-1 "â" *) - !Options.v7) - (* iso 8859-1 accentuated letters *) - | '\192'..'\214' | '\216'..'\246' | '\248'..'\255' -> !Options.v7 - | _ -> false - else - bad_token str + check_keyword str; + token_tree := ttree_add !token_tree str (* Adding a new token (keyword or special token). *) let add_token (con, str) = match con with - | "" -> - if is_normal_token str then add_keyword str else add_special_token str + | "" -> add_keyword str | "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI" -> () | _ -> @@ -141,16 +134,15 @@ the constructor \"" ^ con ^ "\" is not recognized by Lexer")) (* Freeze and unfreeze the state of the lexer *) -type frozen_t = ttree * ttree +type frozen_t = ttree -let freeze () = (!keywords, !token_tree) +let freeze () = !token_tree -let unfreeze (kw,tt) = - keywords := kw; +let unfreeze tt = token_tree := tt let init () = - unfreeze (empty_ttree, empty_ttree) + unfreeze empty_ttree let _ = init() |
