summaryrefslogtreecommitdiff
path: root/src/lexer.mll
diff options
context:
space:
mode:
Diffstat (limited to 'src/lexer.mll')
-rw-r--r--src/lexer.mll210
1 files changed, 210 insertions, 0 deletions
diff --git a/src/lexer.mll b/src/lexer.mll
new file mode 100644
index 00000000..551544c7
--- /dev/null
+++ b/src/lexer.mll
@@ -0,0 +1,210 @@
+(**************************************************************************)
+(* Lem *)
+(* *)
+(* Dominic Mulligan, University of Cambridge *)
+(* Francesco Zappa Nardelli, INRIA Paris-Rocquencourt *)
+(* Gabriel Kerneis, University of Cambridge *)
+(* Kathy Gray, University of Cambridge *)
+(* Peter Boehm, University of Cambridge (while working on Lem) *)
+(* Peter Sewell, University of Cambridge *)
+(* Scott Owens, University of Kent *)
+(* Thomas Tuerk, University of Cambridge *)
+(* *)
+(* The Lem sources are copyright 2010-2013 *)
+(* by the UK authors above and Institut National de Recherche en *)
+(* Informatique et en Automatique (INRIA). *)
+(* *)
+(* All files except ocaml-lib/pmap.{ml,mli} and ocaml-libpset.{ml,mli} *)
+(* are distributed under the license below. The former are distributed *)
+(* under the LGPLv2, as in the LICENSE file. *)
+(* *)
+(* *)
+(* Redistribution and use in source and binary forms, with or without *)
+(* modification, are permitted provided that the following conditions *)
+(* are met: *)
+(* 1. Redistributions of source code must retain the above copyright *)
+(* notice, this list of conditions and the following disclaimer. *)
+(* 2. Redistributions in binary form must reproduce the above copyright *)
+(* notice, this list of conditions and the following disclaimer in the *)
+(* documentation and/or other materials provided with the distribution. *)
+(* 3. The names of the authors may not be used to endorse or promote *)
+(* products derived from this software without specific prior written *)
+(* permission. *)
+(* *)
+(* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS *)
+(* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *)
+(* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE *)
+(* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY *)
+(* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL *)
+(* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE *)
+(* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS *)
+(* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER *)
+(* IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR *)
+(* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *)
+(* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *)
+(**************************************************************************)
+
+{
+open Parser
+module M = Map.Make(String)
+exception LexError of char * Lexing.position
+
+let (^^^) = Ulib.Text.(^^^)
+let r = Ulib.Text.of_latin1
+(* Available as Scanf.unescaped since OCaml 4.0 but 3.12 is still common *)
+let unescaped s = Scanf.sscanf ("\"" ^ s ^ "\"") "%S%!" (fun x -> x)
+
+let kw_table =
+ List.fold_left
+ (fun r (x,y) -> M.add x y r)
+ M.empty
+ [
+ ("and", (fun x -> And(x)));
+ ("as", (fun x -> As(x)));
+ ("case", (fun x -> Case(x)));
+ ("const", (fun x -> Const(x)));
+ ("default", (fun x -> Default(x)));
+ ("enum", (fun x -> Enum(x)));
+ ("else", (fun x -> Else(x)));
+ ("false", (fun x -> False(x)));
+ ("forall", (fun x -> Forall(x)));
+ ("function", (fun x -> Function_(x)));
+ ("if", (fun x -> If_(x)));
+ ("in", (fun x -> In(x)));
+ ("IN", (fun x -> IN(x,r"IN")));
+ ("let", (fun x -> Let_(x)));
+ ("rec", (fun x -> Rec(x)));
+ ("register", (fun x -> Register(x)));
+ ("struct", (fun x -> Struct(x)));
+ ("switch", (fun x -> Switch(x)));
+ ("then", (fun x -> Then(x)));
+ ("true", (fun x -> True(x)));
+ ("type", (fun x -> Type(x)));
+ ("typedef", (fun x -> Typedef(x)));
+ ("union", (fun x -> Union(x)))
+ ("with", (fun x -> With(x)));
+ ("val", (fun x -> Val(x)));
+]
+
+}
+
+let ws = [' ''\t']+
+let letter = ['a'-'z''A'-'Z']
+let digit = ['0'-'9']
+let binarydigit = ['0'-'1']
+let hexdigit = ['0'-'9''A'-'F''a'-'f']
+let alphanum = letter|digit
+let startident = letter|'_'
+let ident = alphanum|['_''\'']
+let oper_char = ['!''$''%''&''*''+''-''.''/'':''<''=''>''?''@''^''|''~']
+let safe_com1 = [^'*''('')''\n']
+let safe_com2 = [^'*''(''\n']
+let com_help = "("*safe_com2 | "*"*"("+safe_com2 | "*"*safe_com1
+let com_body = com_help*"*"*
+let escape_sequence = ('\\' ['\\''\"''\'''n''t''b''r']) | ('\\' digit digit digit) | ('\\' 'x' hexdigit hexdigit)
+
+rule token skips = parse
+ | ws as i
+ { token (Ast.Ws(Ulib.Text.of_latin1 i)::skips) lexbuf }
+ | "\n"
+ { Lexing.new_line lexbuf;
+ token (Ast.Nl::skips) lexbuf }
+
+ | "&" { (Amp(Some(skips),r"&")) }
+ | "@" { (At(Some(skips),r"@")) }
+ | "|" { (Bar(Some(skips))) }
+ | "^" { (Carrot(Some(skips),r"^")) }
+ | ":" { (Colon(Some(skips))) }
+ | "," { (Comma(Some(skips))) }
+ | "." { (Dot(Some(skips))) }
+ | "\" { (Div(Some(skips),r"\")) }
+ | "=" { (Eq(Some(skips),r"=")) }
+ | "!" { (Excl(Some(skips),r"!")) }
+ | ">" { (Gt(Some(skips),r">")) }
+ | "<" { (Lt(Some(skips),r"<")) }
+ | "+" { (Plus(Some(skips),r"+")) }
+ | ";" { (Semi(Some(skips))) }
+ | "*" { (Star(Some(skips),r"*")) }
+ | "~" { (Tilde(Some(skips),r"~")) }
+ | "_" { (Under(Some(skips))) }
+ | "{" { (Lcurly(Some(skips))) }
+ | "}" { (Rcurly(Some(skips))) }
+ | "(" { (Lparen(Some(skips))) }
+ | ")" { (Rparen(Some(skips))) }
+ | "[" { (Lsquare(Some(skips))) }
+ | "]" { (Rsquare(Some(skips))) }
+ | "&&" as i { (AmpAmp(Some(skips),Ulib.Text.of_latin1 i)) }
+ | "->" { (Arrow(Some(skips))) }
+ | "||" { (BarBar(Some(skips))) }
+ | "|>" { (BarGt(Some(skips))) }
+ | "|]" { (BarSquare(Some(skips))) }
+ | "::" as i { (ColonColon(Some(skips),Ulib.Text.of_latin1 i)) }
+ | ".." { (DotDot(Some(skips)) }
+ | "==" { (EqEq(Some(skips),r"==")) }
+ | "!=" { (ExclEq(Some(skips),r"!=")) }
+ | ">=" { (GtEq(Some(skips),r">=")) }
+ | ">=+" { (GtEqPlus(Some(skips),r">=+")) }
+ | ">>" { (GtGt(Some(skips),r">>")) }
+ | ">>>" { (GtGtGt(Some(skips),r">>")) }
+ | ">+" { (GtPlus(Some(skips),r">+")) }
+ | "<=" { (LtEq(Some(skips),r"<=")) }
+ | "<=+" { (LtEqPlus(Some(skips),r"<=+")) }
+ | "<<" { (LtLt(Some(skips),r"<<")) }
+ | "<<<" { (LtLtLt(Some(skips),r"<<<")) }
+ | "<+" { (LtPlus(Some(skips),r"<+")) }
+ | "**" { (StarStar(Some(skips),r"**")) }
+
+ | "(*"
+ { token (Ast.Com(Ast.Comment(comment lexbuf))::skips) lexbuf }
+
+ | startident ident* as i { if M.mem i kw_table then
+ (M.find i kw_table) (Some(skips))
+ else
+ Id(Some(skips), Ulib.Text.of_latin1 i) }
+
+ | "\\\\" ([^' ' '\t' '\n']+ as i) { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+
+ | ['!''?''~'] oper_char* as i { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+
+ | "**" oper_char* as i { (StarStar(Some(skips), Ulib.Text.of_latin1 i)) }
+ | ['/''%'] oper_char* as i { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+ | "*" oper_char+ as i { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+ | ['+''-'] oper_char* as i { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+ | ">=" oper_char+ as i { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+ | ['@''^'] oper_char* as i { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+ | ['=''<''>''|''&''$'] oper_char* as i { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+ | digit+ as i { (Num(Some(skips),int_of_string i)) }
+ | "0b" (binarydigit+ as i) { (Bin(Some(skips), i)) }
+ | "0x" (hexdigit+ as i) { (Hex(Some(skips), i)) }
+ | '"' { (String(Some(skips),
+ string (Lexing.lexeme_start_p lexbuf) (Buffer.create 10) lexbuf)) }
+ | eof { (Eof(Some(skips))) }
+ | _ as c { raise (LexError(c, Lexing.lexeme_start_p lexbuf)) }
+
+
+and comment = parse
+ | (com_body "("* as i) "(*" { let c1 = comment lexbuf in
+ let c2 = comment lexbuf in
+ Ast.Chars(Ulib.Text.of_latin1 i) :: Ast.Comment(c1) :: c2}
+ | (com_body as i) "*)" { [Ast.Chars(Ulib.Text.of_latin1 i)] }
+ | com_body "("* "\n" as i { Lexing.new_line lexbuf;
+ (Ast.Chars(Ulib.Text.of_latin1 i) :: comment lexbuf) }
+ | _ as c { raise (LexError(c, Lexing.lexeme_start_p lexbuf)) }
+ | eof { [] }
+
+and string pos b = parse
+ | ([^'"''\n''\\']*'\n' as i) { Lexing.new_line lexbuf;
+ Buffer.add_string b i;
+ string pos b lexbuf }
+ | ([^'"''\n''\\']* as i) { Buffer.add_string b i; string pos b lexbuf }
+ | escape_sequence as i { Buffer.add_string b i; string pos b lexbuf }
+ | '\\' '\n' ws { Lexing.new_line lexbuf; string pos b lexbuf }
+ | '\\' { raise (Reporting_basic.Fatal_error (Reporting_basic.Err_syntax (pos,
+ "illegal backslash escape in string"))) }
+ | '"' { let s = unescaped(Buffer.contents b) in
+ try Ulib.UTF8.validate s; s
+ with Ulib.UTF8.Malformed_code ->
+ raise (Reporting_basic.Fatal_error (Reporting_basic.Err_syntax (pos,
+ "String literal is not valid utf8"))) }
+ | eof { raise (Reporting_basic.Fatal_error (Reporting_basic.Err_syntax (pos,
+ "String literal not terminated"))) }