@@ -27,26 +27,18 @@ class Lexer
27
27
*/
28
28
public function tokenize ($ expression )
29
29
{
30
- $ expression = str_replace (array ("\r\n" , "\r" ) , "\n" , $ expression );
30
+ $ expression = str_replace (array ("\r" , " \n" , "\t" , "\v" , "\f" ), ' ' , $ expression );
31
31
$ cursor = 0 ;
32
32
$ tokens = array ();
33
33
$ brackets = array ();
34
- $ operatorRegex = $ this ->getOperatorRegex ();
35
34
$ end = strlen ($ expression );
36
35
37
36
while ($ cursor < $ end ) {
38
- if (preg_match ('/\s+/A ' , $ expression , $ match , null , $ cursor )) {
39
- // whitespace
40
- $ cursor += strlen ($ match [0 ]);
41
- } elseif (preg_match ($ operatorRegex , $ expression , $ match , null , $ cursor )) {
42
- // operators
43
- $ tokens [] = new Token (Token::OPERATOR_TYPE , $ match [0 ], $ cursor + 1 );
44
- $ cursor += strlen ($ match [0 ]);
45
- } elseif (preg_match ('/[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*/A ' , $ expression , $ match , null , $ cursor )) {
46
- // names
47
- $ tokens [] = new Token (Token::NAME_TYPE , $ match [0 ], $ cursor + 1 );
48
- $ cursor += strlen ($ match [0 ]);
49
- } elseif (preg_match ('/[0-9]+(?:\.[0-9]+)?/A ' , $ expression , $ match , null , $ cursor )) {
37
+ while (' ' == $ expression [$ cursor ]) {
38
+ ++$ cursor ;
39
+ }
40
+
41
+ if (preg_match ('/[0-9]+(?:\.[0-9]+)?/A ' , $ expression , $ match , null , $ cursor )) {
50
42
// numbers
51
43
$ number = (float ) $ match [0 ]; // floats
52
44
if (ctype_digit ($ match [0 ]) && $ number <= PHP_INT_MAX ) {
@@ -81,6 +73,14 @@ public function tokenize($expression)
81
73
// strings
82
74
$ tokens [] = new Token (Token::STRING_TYPE , stripcslashes (substr ($ match [0 ], 1 , -1 )), $ cursor + 1 );
83
75
$ cursor += strlen ($ match [0 ]);
76
+ } elseif (preg_match ('/not in(?=[\s(])|\!\=\=|not(?=[\s(])|and(?=[\s(])|\=\=\=|\>\=|or(?=[\s(])|\<\=|\*\*|\.\.|in(?=[\s(])|&&|\|\||\!~|\=~|\=\=|\!\=|\*|~|%|\/|\>|\||\!|\^|&|\+|\<|\-/A ' , $ expression , $ match , null , $ cursor )) {
77
+ // operators
78
+ $ tokens [] = new Token (Token::OPERATOR_TYPE , $ match [0 ], $ cursor + 1 );
79
+ $ cursor += strlen ($ match [0 ]);
80
+ } elseif (preg_match ('/[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*/A ' , $ expression , $ match , null , $ cursor )) {
81
+ // names
82
+ $ tokens [] = new Token (Token::NAME_TYPE , $ match [0 ], $ cursor + 1 );
83
+ $ cursor += strlen ($ match [0 ]);
84
84
} else {
85
85
// unlexable
86
86
throw new SyntaxError (sprintf ('Unexpected character "%s" ' , $ expression [$ cursor ]), $ cursor );
@@ -96,24 +96,4 @@ public function tokenize($expression)
96
96
97
97
return new TokenStream ($ tokens );
98
98
}
99
-
100
- private function getOperatorRegex ()
101
- {
102
- $ operators = array (
103
- 'not ' , '! ' , '- ' , '+ ' ,
104
- 'or ' , '|| ' , '&& ' , 'and ' , '| ' , '^ ' , '& ' , '== ' , '=== ' , '!= ' , '!== ' , '< ' , '> ' , '>= ' , '<= ' , 'not in ' , 'in ' , '.. ' , '+ ' , '- ' , '~ ' , '* ', '/ ' , '% ' , '=~ ' , '!~ ' , '** ' ,
105
- );
106
-
107
- $ operators = array_combine ($ operators , array_map ('strlen ' , $ operators ));
108
- arsort ($ operators );
109
-
110
- $ regex = array ();
111
- foreach ($ operators as $ operator => $ length ) {
112
- // an operator that ends with a character must be followed by
113
- // a whitespace or a parenthesis
114
- $ regex [] = preg_quote ($ operator , '/ ' ).(ctype_alpha ($ operator [$ length - 1 ]) ? '(?=[\s()]) ' : '' );
115
- }
116
-
117
- return '/ ' .implode ('| ' , $ regex ).'/A ' ;
118
- }
119
99
}
0 commit comments