summary refs log tree commit diff
path: root/source/mod_student/lexer.mll
blob: bfc279a946f4f1d01a4e45667079d8bc7ea5247b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
{
  open Xi_lib
  open Parser
  open Parser_utils

  (* Lexing z biblioteki standardowej ocamla *)
  open Lexing

  (* As is usual in YACC-like tools, the lexer depends on the parser: the data
   * type of tokens is defined by the module generated from grammar.mly. We
   * define an alias for the token type for the sake of the Xi_lib.Iface
   * interfaces. *)
  type token = Parser.token

  (* Error handling: abort lexing by raising InvalidToken. The exception
   * carries the location built from pos by mkLocation together with the
   * offending text. This function never returns normally. *)
  let handleError pos token =
    raise (InvalidToken (mkLocation pos, token))
      
  (* vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 
   * Miejsce na twój kod w Ocamlu
   *)
(* Convert a string into the list of its characters, in order.
 * The empty string yields the empty list. *)
let explode s =
  let chars = ref [] in
  (* Walk from the last index down so that consing keeps the original order. *)
  for idx = String.length s - 1 downto 0 do
    chars := s.[idx] :: !chars
  done;
  !chars
(* Build a string from a list of characters, preserving their order.
 * The empty list yields the empty string. *)
let implode l =
  let buf = Bytes.create (List.length l) in
  (* Store the character found at list position i into byte i. *)
  List.iteri (fun i c -> Bytes.set buf i c) l;
  Bytes.to_string buf

  (* Decode the escape sequences in the body of a string literal.
   *
   * str is the text between the quotes. Recognised sequences are a backslash
   * followed by: another backslash (yields one backslash), the letter n
   * (yields a newline) or a double quote (yields a double quote). Every other
   * character is copied unchanged, including a lone backslash that is the very
   * last character of str.
   *
   * Raises Failure with the message used below when a backslash is followed
   * by any other character.
   *
   * The string is scanned by index and the result is accumulated in a Buffer,
   * so the work is iterative and stack usage does not grow with the length of
   * the literal. *)
  let unescape str =
    let len = String.length str in
    let out = Buffer.create len in
    let rec scan i =
      if i >= len then Buffer.contents out
      else if str.[i] <> '\\' || i + 1 >= len then begin
        (* Ordinary character, or a backslash with nothing after it. *)
        Buffer.add_char out str.[i];
        scan (i + 1)
      end else begin
        (match str.[i + 1] with
         | '\\' -> Buffer.add_char out '\\'
         | 'n' -> Buffer.add_char out '\n'
         | '"' -> Buffer.add_char out '"'
         | _ -> failwith "unsupported escape sequence");
        (* Skip both the backslash and the character it escapes. *)
        scan (i + 2)
      end
    in
    scan 0
      
  (* Decode the body of a character literal (the text between the quotes).
   *
   * A two-character body made of a backslash followed by a backslash, the
   * letter n or a single quote yields the backslash, newline or single quote
   * character respectively. A one-character body yields that character.
   *
   * Raises Failure for an empty body, for any other body that starts with a
   * backslash, and for any other body longer than one character. *)
  let unescape_chr chr =
    match chr with
    | "\\\\" -> '\\'
    | "\\n" -> '\n'
    | "\\'" -> '\''
    | "" -> failwith "empty char literal"
    | _ when String.length chr = 1 -> chr.[0]
    | _ when chr.[0] = '\\' -> failwith "unsupported escape sequence"
    | _ -> failwith "too long character literal"

  (* ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     ----------------------------------------------------------------------------- *)

  }
  
  (* vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 
   * Miejsce na nazwane wyrażenia regularne
   *)

  (* Character classes shared by the definitions below. *)
  let digit         = ['0'-'9']
  let ident_start   = ['a'-'z' 'A'-'Z' '_']
  let ident_rest    = (ident_start | digit)

  (* A letter or underscore followed by letters, digits and underscores. *)
  let identifier    = ident_start ident_rest*
  (* A non-empty run of decimal digits. *)
  let integer       = digit+
  (* Body of a string literal: any characters other than newline, double quote
   * and backslash, mixed with the supported backslash escapes. *)
  let str           = ([^ '\n' '"' '\\'] | '\\' ['n' '\\' '"'])*
  (* Body of a character literal: exactly one plain character or one supported
   * backslash escape. *)
  let chr           = ([^ '\n' '\'' '\\'] | '\\' ['n' '\\' '\''])
  
  (* ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     ----------------------------------------------------------------------------- *)


  (* Main lexer rule: reads one token from lexbuf and returns it.
   * Line breaks, blanks and line comments are skipped. Input that matches no
   * rule is reported through handleError. *)
  rule token = parse
      (* ocamllex does not keep track of line numbers by itself. Every newline
       * has to be reported with Lexing.new_line so that the positions stored
       * in lexbuf stay accurate. *)
      | ['\n']
      { new_line lexbuf; token lexbuf }

      (* Start of a line comment: continue in the auxiliary rule. *)
      | "//"
      { line_comment lexbuf }

      | eof
      { EOF }

      (* vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 
       * Place for your rules
       *)

      (* Literals. The parentheses around the as-binding are required: as binds
       * more loosely than concatenation, so without them the bound name would
       * also capture the opening quote. *)
      | '"' (str as e) '"'  { STRING (unescape e) }
      | "'" (chr as e) "'"  { CHAR (unescape_chr e) }

      (* Punctuation and operators. The longest match wins, so two-character
       * operators take priority over their one-character prefixes. *)
      | ","       { COMMA }
      | ";"       { SEMICOLON }
      | "("       { LPAREN }
      | ")"       { RPAREN }
      | "{"       { LBRACKET }
      | "}"       { RBRACKET }
      | "["       { LSBRACKET }
      | "]"       { RSBRACKET }
      | ":"       { COLON }
      | "+"       { OP_PLUS }
      | "-"       { OP_MINUS }
      | "*"       { OP_MULT }
      | "/"       { OP_DIV }
      | "%"       { OP_REM }
      | "&"       { OP_AND }
      | "|"       { OP_OR }
      | "=="      { OP_EQ }
      | "!="      { OP_NEQ }
      | "<="      { OP_LE }
      | ">="      { OP_GE }
      | "<"       { OP_LT }
      | ">"       { OP_GT }
      | "!"       { OP_NOT }
      | "="       { ASSIGN }

      (* Keywords. They are listed before the identifier rule so that, for
       * matches of equal length, the keyword is chosen. *)
      | "if"      { IF }
      | "else"    { ELSE }
      | "while"   { WHILE }
      | "return"  { RETURN }
      | "length"  { LENGTH }
      | "int"     { T_INT }
      | "bool"    { T_BOOL }
      | "_"       { UNDERSCORE }
      | identifier as id  { IDENTIFIER id }

      (* A literal that does not fit in an OCaml int makes int_of_string raise
       * Failure. Report it as an invalid token at its position instead of
       * letting a bare Failure escape from the lexer. *)
      | integer as i
      { try INT (int_of_string i)
        with Failure _ -> handleError (Lexing.lexeme_start_p lexbuf) i }

      (* ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
         ----------------------------------------------------------------------------- *)

      (* Skip runs of blanks: spaces, tabs and carriage returns. Newlines are
       * handled by the first rule so that line counting stays correct. *)
      | [' ' '\t' '\r']+ { token lexbuf }

      (* Anything else is not part of the language. *)
      | _
      { handleError (Lexing.lexeme_start_p lexbuf) (Lexing.lexeme lexbuf) }

  (* Auxiliary rule that skips the remainder of a line comment. *)
  and line_comment = parse
      (* End of the comment: record the new line and resume normal lexing. *)
      | '\n'
      { new_line lexbuf; token lexbuf }

      (* Some editors do not put a newline after the last line of a file, so a
       * comment may also be terminated by the end of input. *)
      | eof
      { EOF }

      (* Every other character belongs to the comment and is discarded. *)
      | [^ '\n']
      { line_comment lexbuf }