grammars: x{min,max} repetition operator (#6640) · ggml-org/llama.cpp@55b2d08 · GitHub
[go: up one dir, main page]

Skip to content

Commit 55b2d08

Browse files
ochafik and HanClinto authored
grammars: x{min,max} repetition operator (#6640)
* grammars: x{min,max} repetition operator + tweak +/*/? to avoid duplication of original over alternates * grammars: handle `x{n}` and fix `x{n,n}` * grammars: document new repetition operators * grammars: uniform use of int for min & max * grammars: refactor parser test * grammar: parsing tests w/ natural pretty print of updated expectations * grammars: much prettier print of expectations (+ TEST_GRAMMAR_PARSER_PRINT_ALL=1 to force all) * grammars: improve test pretty print again * grammars: pretty print rules and chars * grammars: fix copy rule skipping * grammars: disallow `a{,}` (not allowed in regexps) * Update common/grammar-parser.cpp Co-authored-by: Clint Herron <hanclinto@gmail.com> * grammars: fix copy rule skipping (again) & display of expectations * grammars: more test cases * grammars: update reps parsing to bring ? / * / + closer to before * json: use new GBNF repetitions{m,n} syntax * grammars: update performance gotchas w/ repetition advice * Update examples/json_schema_to_grammar.py Co-authored-by: Clint Herron <hanclinto@gmail.com> * Update examples/server/public/json-schema-to-grammar.mjs Co-authored-by: Clint Herron <hanclinto@gmail.com> * grammars: comment on rule repetitions * grammars: ensure unambiguous number alternatives * grammar: nit typo switched error msgs * grammar: nit numbering in comment * json: update numeric rule to be unambiguous * Apply suggestions from code review Co-authored-by: Clint Herron <hanclinto@gmail.com> * Update examples/server/public/json-schema-to-grammar.mjs Co-authored-by: Clint Herron <hanclinto@gmail.com> * json: fix integral-part * grammar: add repetition tests --------- Co-authored-by: Clint Herron <hanclinto@gmail.com>
1 parent f5d7b26 commit 55b2d08

9 files changed

+726
-408
lines changed

common/grammar-parser.cpp

Lines changed: 107 additions & 31 deletions
10000
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,12 @@ namespace grammar_parser {
4646
state.rules[rule_id] = rule;
4747
}
4848

49+
// Returns true when c is an ASCII decimal digit, i.e. in the range '0'..'9'.
static bool is_digit_char(char c) {
50+
return '0' <= c && c <= '9';
51+
}
52+
4953
static bool is_word_char(char c) {
50-
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
54+
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
5155
}
5256

5357
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {
99103
return pos;
100104
}
101105

106+
// Scans the run of decimal digits that begins at src and returns a pointer to
// the first character after that run. The numeric value is not computed here;
// the caller converts the [src, returned pointer) range itself.
// Throws std::runtime_error if src does not start with a digit.
static const char * parse_int(const char * src) {
107+
const char * pos = src;
108+
// advance over every consecutive digit character
while (is_digit_char(*pos)) {
109+
pos++;
110+
}
111+
// no progress means there was not a single digit at src
if (pos == src) {
112+
throw std::runtime_error(std::string("expecting integer at ") + src);
113+
}
114+
return pos;
115+
}
116+
102117
static std::pair<uint32_t, const char *> parse_char(const char * src) {
103118
if (*src == '\\') {
104119
switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {
137152
bool is_nested) {
138153
size_t last_sym_start = out_elements.size();
139154
const char * pos = src;
155+
156+
// Rewrites the most recently parsed symbol (the elements of out_elements from
// last_sym_start to the end) so that it repeats between min_times and
// max_times times. A negative max_times means "no upper bound".
// Helper rules are registered with add_rule() under ids obtained from
// generate_symbol_id(). Throws std::runtime_error when there is no preceding
// symbol to apply the repetition to.
// NOTE(review): if 0 <= max_times < min_times, n_opt below is negative, so no
// optional rules are emitted and the symbol ends up repeated exactly
// min_times times instead of the input being rejected. Confirm whether the
// caller is expected to validate the range.
auto handle_repetitions = [&](int min_times, int max_times) {
157+
158+
if (last_sym_start == out_elements.size()) {
159+
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
160+
}
161+
162+
// apply transformation to previous symbol (last_sym_start to end) according to
163+
// the following rewrite rules:
164+
// S{m,n} --> S S S (m times) S'(n-m)
165+
// S'(x) ::= S S'(x-1) |
166+
// (... n-m definitions of these S' rules ...)
167+
// S'(1) ::= S |
168+
// S{m,} --> S S S (m times) S'
169+
// S' ::= S S' |
170+
// S* --> S{0,}
171+
// --> S' ::= S S' |
172+
// S+ --> S{1,}
173+
// --> S S'
174+
// S' ::= S S' |
175+
// S? --> S{0,1}
176+
// --> S'
177+
// S' ::= S |
178+
179+
// snapshot the elements of the symbol; out_elements is resized or appended to below
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
180+
if (min_times == 0) {
181+
// no mandatory occurrence: drop the symbol, any occurrence then comes
// only from the optional rule reference appended further down
out_elements.resize(last_sym_start);
182+
} else {
183+
// Repeat the previous elements (min_times - 1) times
184+
for (int i = 1; i < min_times; i++) {
185+
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
186+
}
187+
}
188+
189+
uint32_t last_rec_rule_id = 0;
190+
// number of helper rules to generate: a single self-recursive rule when
// unbounded, otherwise one rule per optional occurrence
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
191+
192+
std::vector<llama_grammar_element> rec_rule(previous_elements);
193+
for (int i = 0; i < n_opt; i++) {
194+
// cut back to just the copied symbol elements before building this rule
rec_rule.resize(previous_elements.size());
195+
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
196+
// unbounded: reference the rule being defined (recursion);
// bounded: chain to the rule generated in the previous iteration
if (i > 0 || max_times < 0) {
197+
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
198+
}
199+
// ALT immediately followed by END adds an empty alternative, which makes
// this occurrence optional
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
200+
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
201+
add_rule(state, rec_rule_id, rec_rule);
202+
last_rec_rule_id = rec_rule_id;
203+
}
204+
// reference the last generated helper rule from the original sequence
if (n_opt > 0) {
205+
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
206+
}
207+
};
208+
140209
while (*pos) {
141210
if (*pos == '"') { // literal string
142211
pos++;
@@ -197,40 +266,47 @@ namespace grammar_parser {
197266
throw std::runtime_error(std::string("expecting ')' at ") + pos);
198267
}
199268
pos = parse_space(pos + 1, is_nested);
200-
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
201-
if (last_sym_start == out_elements.size()) {
202-
throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
203-
}
269+
} else if (*pos == '*') {
270+
pos = parse_space(pos + 1, is_nested);
271+
handle_repetitions(0, -1);
272+
} else if (*pos == '+') {
273+
pos = parse_space(pos + 1, is_nested);
274+
handle_repetitions(1, -1);
275+
} else if (*pos == '?') {
276+
pos = parse_space(pos + 1, is_nested);
277+
handle_repetitions(0, 1);
278+
} else if (*pos == '{') {
279+
pos = parse_space(pos + 1, is_nested);
204280

205-
// apply transformation to previous symbol (last_sym_start to end) according to
206-
// rewrite rules:
207-
// S* --> S' ::= S S' |
208-
// S+ --> S' ::= S S' | S
209-
// S? --> S' ::= S |
210-
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
211-
std::vector<llama_grammar_element> sub_rule;
212-
// add preceding symbol to generated rule
213-
sub_rule.insert(
214-
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
215-
if (*pos == '*' || *pos == '+') {
216-
// cause generated rule to recurse
217-
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
218-
}
219-
// mark start of alternate def
220-
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
221-
if (*pos == '+') {
222-
// add preceding symbol as alternate only for '+' (otherwise empty)
223-
sub_rule.insert(
224-
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
281+
if (!is_digit_char(*pos)) {
282+
throw std::runtime_error(std::string("expecting an int at ") + pos);
225283
}
226-
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
227-
add_rule(state, sub_rule_id, sub_rule);
284+
const char * int_end = parse_int(pos);
285+
int min_times = std::stoul(std::string(pos, int_end - pos));
286+
pos = parse_space(int_end, is_nested);
228287

229-
// in original rule, replace previous symbol with reference to generated rule
230-
out_elements.resize(last_sym_start);
231-
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
288+
int max_times = -1;
232289

233-
pos = parse_space(pos + 1, is_nested);
290+
if (*pos == '}') {
291+
max_times = min_times;
292+
pos = parse_space(pos + 1, is_nested);
293+
} else if (*pos == ',') {
294+
pos = parse_space(pos + 1, is_nested);
295+
296+
if (is_digit_char(*pos)) {
297+
const char * int_end = parse_int(pos);
298+
max_times = std::stoul(std::string(pos, int_end - pos));
299+
pos = parse_space(int_end, is_nested);
300+
}
301+
302+
if (*pos != '}') {
303+
throw std::runtime_error(std::string("expecting '}' at ") + pos);
304+
}
305+
pos = parse_space(pos + 1, is_nested);
306+
} else {
307+
throw std::runtime_error(std::string("expecting ',' at ") + pos);
308+
}
309+
handle_repetitions(min_times, max_times);
234310
} else {
235311
break;
236312
}

common/json-schema-to-grammar.cpp

Lines changed: 20 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -16,58 +16,27 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
1616

1717
static std::string repeat(const std::string & str, size_t n);
1818

19-
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
20-
if (separator_rule.empty()) {
21-
if (min_items == 0 && max_items == 1) {
22-
return item_rule + "?";
23-
} else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
24-
return item_rule + "+";
25-
}
26-
}
19+
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
20+
auto has_max = max_items != std::numeric_limits<int>::max();
2721

28-
std::string result;
29-
if (min_items > 0) {
30-
if (item_rule_is_literal && separator_rule.empty()) {
31-
result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
32-
} else {
33-
std::vector<std::string> items(min_items, item_rule);
34-
result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
35-
}
22+
if (min_items == 0 && max_items == 1) {
23+
return item_rule + "?";
3624
}
3725

38-
std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
39-
auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
40-
41-
if (up_to_n == 0) {
42-
return "";
43-
} else if (up_to_n == 1) {
44-
return "(" + content + ")?";
45-
} else if (!separator_rule.empty() && !prefix_with_sep) {
46-
return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
26+
if (separator_rule.empty()) {
27+
if (min_items == 1 && !has_max) {
28+
return item_rule + "+";
29+
} else if (min_items == 0 && !has_max) {
30+
return item_rule + "*";
4731
} else {
48-
std::string res = repeat("(" + content + " ", up_to_n);
49-
// strip trailing space
50-
res = res.substr(0, res.length() - 1);
51-
res += repeat(")?", up_to_n);
52-
return res;
32+
return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
5333
}
54-
};
55-
56-
if (min_items > 0 && max_items != min_items) {
57-
result += " ";
5834
}
5935

60-
if (max_items != std::numeric_limits<int>::max()) {
61-
result += opt_repetitions(max_items - min_items, min_items > 0);
62-
} else {
63-
std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
64-
if (min_items == 0 && !separator_rule.empty()) {
65-
result = "(" + item_rule + " " + item_operator + "*)?";
66-
} else {
67-
result += item_operator + "*";
68-
}
36+
auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
37+
if (min_items == 0) {
38+
result = "(" + result + ")?";
6939
}
70-
7140
return result;
7241
}
7342

@@ -78,30 +47,24 @@ struct BuiltinRule {
7847
std::vector<std::string> deps;
7948
};
8049

81-
const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
82-
8350
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
8451
{"boolean", {"(\"true\" | \"false\") space", {}}},
85-
{"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
86-
{"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
52+
{"decimal-part", {"[0-9]{1,16}", {}}},
53+
{"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
8754
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
8855
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
8956
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
9057
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
9158
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
92-
{"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
93-
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
94-
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
95-
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
96-
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
97-
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
59+
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
60+
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
9861
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
9962
{"null", {"\"null\" space", {}}},
10063
};
10164

10265
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
103-
{"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
104-
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
66+
{"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
67+
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
10568
{"date-time", {"date \"T\" time", {"date", "time"}}},
10669
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
10770
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -385,8 +348,7 @@ class SchemaConverter {
385348
sub_is_literal ? "\"" + sub + "\"" : sub,
386349
min_times,
387350
max_times,
388-
"",
389-
sub_is_literal
351+
""
390352
);
391353
seq.back().second = false;
392354
} else {

0 commit comments

Comments
 (0)
0