1 files changed, 210 insertions, 0 deletions
diff --git a/src/lexer.mll b/src/lexer.mll
new file mode 100644
index 00000000..551544c7
--- /dev/null
+++ b/src/lexer.mll
@@ -0,0 +1,210 @@
+(**************************************************************************)
+(*                        Lem                                             *)
+(*                                                                        *)
+(*          Dominic Mulligan, University of Cambridge                     *)
+(*          Francesco Zappa Nardelli, INRIA Paris-Rocquencourt            *)
+(*          Gabriel Kerneis, University of Cambridge                      *)
+(*          Kathy Gray, University of Cambridge                           *)
+(*          Peter Boehm, University of Cambridge (while working on Lem)   *)
+(*          Peter Sewell, University of Cambridge                         *)
+(*          Scott Owens, University of Kent                               *)
+(*          Thomas Tuerk, University of Cambridge                         *)
+(*                                                                        *)
+(*  The Lem sources are copyright 2010-2013                               *)
+(*  by the UK authors above and Institut National de Recherche en         *)
+(*  Informatique et en Automatique (INRIA).                               *)
+(*                                                                        *)
+(*  All files except ocaml-lib/pmap.{ml,mli} and ocaml-libpset.{ml,mli}   *)
+(*  are distributed under the license below.  The former are distributed  *)
+(*  under the LGPLv2, as in the LICENSE file.                             *)
+(*                                                                        *)
+(*                                                                        *)
+(*  Redistribution and use in source and binary forms, with or without    *)
+(*  modification, are permitted provided that the following conditions    *)
+(*  are met:                                                              *)
+(*  1. Redistributions of source code must retain the above copyright     *)
+(*  notice, this list of conditions and the following disclaimer.         *)
+(*  2. Redistributions in binary form must reproduce the above copyright  *)
+(*  notice, this list of conditions and the following disclaimer in the   *)
+(*  documentation and/or other materials provided with the distribution.  *)
+(*  3. The names of the authors may not be used to endorse or promote     *)
+(*  products derived from this software without specific prior written    *)
+(*  permission.                                                           *)
+(*                                                                        *)
+(*  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS    *)
+(*  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED     *)
+(*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE    *)
+(*  ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY       *)
+(*  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL    *)
+(*  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE     *)
+(*  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS         *)
+(*  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER  *)
+(*  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR       *)
+(*  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN   *)
+(*  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                         *)
+(**************************************************************************)
+
+{
+open Parser
+module M = Map.Make(String)
+exception LexError of char * Lexing.position
+
+let (^^^) = Ulib.Text.(^^^)
+let r = Ulib.Text.of_latin1
+(* Available as Scanf.unescaped since OCaml 4.0 but 3.12 is still common *)
+let unescaped s = Scanf.sscanf ("\"" ^ s ^ "\"") "%S%!" (fun x -> x)
+
+let kw_table = 
+  List.fold_left
+    (fun r (x,y) -> M.add x y r)
+    M.empty
+    [
+     ("and",                     (fun x -> And(x)));
+     ("as",                      (fun x -> As(x)));
+     ("case",			 (fun x -> Case(x)));
+     ("const",			 (fun x -> Const(x)));
+     ("default",		 (fun x -> Default(x)));
+     ("enum",			 (fun x -> Enum(x)));
+     ("else",                    (fun x -> Else(x)));
+     ("false",                   (fun x -> False(x)));
+     ("forall",                  (fun x -> Forall(x)));
+     ("function",                (fun x -> Function_(x)));
+     ("if",                      (fun x -> If_(x)));
+     ("in",			 (fun x -> In(x)));
+     ("IN",                      (fun x -> IN(x,r"IN")));     
+     ("let",                     (fun x -> Let_(x)));
+     ("rec",			 (fun x -> Rec(x)));
+     ("register",		 (fun x -> Register(x)));
+     ("struct",                  (fun x -> Struct(x)));
+     ("switch",			 (fun x -> Switch(x)));
+     ("then",                    (fun x -> Then(x)));
+     ("true",                    (fun x -> True(x)));
+     ("type",                    (fun x -> Type(x)));
+     ("typedef",		 (fun x -> Typedef(x)));
+     ("union",			 (fun x -> Union(x)))
+     ("with",                    (fun x -> With(x)));
+     ("val",                     (fun x -> Val(x)));
+]
+
+}
+
+let ws = [' ''\t']+
+let letter = ['a'-'z''A'-'Z']
+let digit = ['0'-'9']
+let binarydigit = ['0'-'1']
+let hexdigit = ['0'-'9''A'-'F''a'-'f']
+let alphanum = letter|digit
+let startident = letter|'_'
+let ident = alphanum|['_''\'']
+let oper_char = ['!''$''%''&''*''+''-''.''/'':''<''=''>''?''@''^''|''~']
+let safe_com1 = [^'*''('')''\n']
+let safe_com2 = [^'*''(''\n']
+let com_help = "("*safe_com2 | "*"*"("+safe_com2 | "*"*safe_com1
+let com_body = com_help*"*"*
+let escape_sequence = ('\\' ['\\''\"''\'''n''t''b''r']) | ('\\' digit digit digit) | ('\\' 'x' hexdigit hexdigit)
+
+rule token skips = parse
+  | ws as i
+    { token (Ast.Ws(Ulib.Text.of_latin1 i)::skips) lexbuf }
+  | "\n"
+    { Lexing.new_line lexbuf;
+      token (Ast.Nl::skips) lexbuf } 
+  
+  | "&"					{ (Amp(Some(skips),r"&")) }
+  | "@"					{ (At(Some(skips),r"@")) }
+  | "|"                                 { (Bar(Some(skips))) }
+  | "^"					{ (Carrot(Some(skips),r"^")) }
+  | ":"                                 { (Colon(Some(skips))) }
+  | ","                                 { (Comma(Some(skips))) }
+  | "."                                 { (Dot(Some(skips))) }
+  | "\"					{ (Div(Some(skips),r"\")) }
+  | "="                                 { (Eq(Some(skips),r"=")) }
+  | "!"					{ (Excl(Some(skips),r"!")) }
+  | ">"					{ (Gt(Some(skips),r">")) }
+  | "<"					{ (Lt(Some(skips),r"<")) }
+  | "+"					{ (Plus(Some(skips),r"+")) }
+  | ";"                                 { (Semi(Some(skips))) }
+  | "*"                                 { (Star(Some(skips),r"*")) }
+  | "~"					{ (Tilde(Some(skips),r"~")) }
+  | "_"                                 { (Under(Some(skips))) }
+  | "{"                                 { (Lcurly(Some(skips))) }
+  | "}"                                 { (Rcurly(Some(skips))) }
+  | "("                                 { (Lparen(Some(skips))) }
+  | ")"                                 { (Rparen(Some(skips))) }
+  | "["                                 { (Lsquare(Some(skips))) }
+  | "]"                                 { (Rsquare(Some(skips))) }
+  | "&&" as i                           { (AmpAmp(Some(skips),Ulib.Text.of_latin1 i)) }
+  | "->"                                { (Arrow(Some(skips))) }
+  | "||"                                { (BarBar(Some(skips))) }
+  | "|>"                                { (BarGt(Some(skips))) }
+  | "|]"				{ (BarSquare(Some(skips))) }
+  | "::" as i                           { (ColonColon(Some(skips),Ulib.Text.of_latin1 i)) }
+  | ".."				{ (DotDot(Some(skips)) }  
+  | "=="				{ (EqEq(Some(skips),r"==")) }
+  | "!="				{ (ExclEq(Some(skips),r"!=")) }
+  | ">="				{ (GtEq(Some(skips),r">=")) }
+  | ">=+"				{ (GtEqPlus(Some(skips),r">=+")) }
+  | ">>"				{ (GtGt(Some(skips),r">>")) }
+  | ">>>"				{ (GtGtGt(Some(skips),r">>")) }
+  | ">+"				{ (GtPlus(Some(skips),r">+")) }
+  | "<="				{ (LtEq(Some(skips),r"<=")) }
+  | "<=+"				{ (LtEqPlus(Some(skips),r"<=+")) }
+  | "<<"				{ (LtLt(Some(skips),r"<<")) }
+  | "<<<"				{ (LtLtLt(Some(skips),r"<<<")) }
+  | "<+"				{ (LtPlus(Some(skips),r"<+")) }
+  | "**"				{ (StarStar(Some(skips),r"**")) }
+
+  | "(*"                           
+    { token (Ast.Com(Ast.Comment(comment lexbuf))::skips) lexbuf }
+
+  | startident ident* as i              { if M.mem i kw_table then
+                                            (M.find i kw_table) (Some(skips))
+                                          else
+                                            Id(Some(skips), Ulib.Text.of_latin1 i) }
+
+  | "\\\\" ([^' ' '\t' '\n']+ as i)     { (Id(Some(skips), Ulib.Text.of_latin1 i)) } 
+
+  | ['!''?''~'] oper_char* as i         { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+
+  | "**" oper_char* as i                { (StarStar(Some(skips), Ulib.Text.of_latin1 i)) }
+  | ['/''%'] oper_char* as i         { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+  | "*" oper_char+ as i         { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+  | ['+''-'] oper_char* as i            { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+  | ">=" oper_char+ as i   		{ (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+  | ['@''^'] oper_char* as i            { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+  | ['=''<''>''|''&''$'] oper_char* as i { (Id(Some(skips), Ulib.Text.of_latin1 i)) }
+  | digit+ as i                         { (Num(Some(skips),int_of_string i)) }
+  | "0b" (binarydigit+ as i)		{ (Bin(Some(skips), i)) }
+  | "0x" (hexdigit+ as i) 		{ (Hex(Some(skips), i)) }
+  | '"'                                 { (String(Some(skips),
+                                           string (Lexing.lexeme_start_p lexbuf) (Buffer.create 10) lexbuf)) }
+  | eof                                 { (Eof(Some(skips))) }
+  | _  as c                             { raise (LexError(c, Lexing.lexeme_start_p lexbuf)) }
+
+
+and comment = parse
+  | (com_body "("* as i) "(*"           { let c1 = comment lexbuf in
+                                          let c2 = comment lexbuf in
+                                            Ast.Chars(Ulib.Text.of_latin1 i) :: Ast.Comment(c1) :: c2}
+  | (com_body as i) "*)"                { [Ast.Chars(Ulib.Text.of_latin1 i)] }
+  | com_body "("* "\n" as i             { Lexing.new_line lexbuf; 
+                                          (Ast.Chars(Ulib.Text.of_latin1 i) :: comment lexbuf) }
+  | _  as c                             { raise (LexError(c, Lexing.lexeme_start_p lexbuf)) }
+  | eof                                 { [] }
+
+and string pos b = parse
+  | ([^'"''\n''\\']*'\n' as i)          { Lexing.new_line lexbuf;
+                                          Buffer.add_string b i;
+                                          string pos b lexbuf }
+  | ([^'"''\n''\\']* as i)              { Buffer.add_string b i; string pos b lexbuf }
+  | escape_sequence as i                { Buffer.add_string b i; string pos b lexbuf }
+  | '\\' '\n' ws                        { Lexing.new_line lexbuf; string pos b lexbuf }
+  | '\\'                                { raise (Reporting_basic.Fatal_error (Reporting_basic.Err_syntax (pos,
+                                            "illegal backslash escape in string"))) }
+  | '"'                                 { let s = unescaped(Buffer.contents b) in
+                                          try Ulib.UTF8.validate s; s
+                                          with Ulib.UTF8.Malformed_code ->
+                                            raise (Reporting_basic.Fatal_error (Reporting_basic.Err_syntax (pos,
+                                              "String literal is not valid utf8"))) }
+  | eof                                 { raise (Reporting_basic.Fatal_error (Reporting_basic.Err_syntax (pos,
+                                            "String literal not terminated"))) }