Use SAX model for stage 2 · JavaScriptExpert/simdjson@03d54f8 · GitHub
[go: up one dir, main page]

Skip to content

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 03d54f8

Browse files
committed
Use SAX model for stage 2
1 parent 553e6d7 commit 03d54f8

File tree

3 files changed

+246
-160
lines changed

3 files changed

+246
-160
lines changed

src/generic/stage2/logger.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ namespace logger {
2828
if (LOG_ENABLED) {
2929
log_depth = 0;
3030
printf("\n");
31-
printf("| %-*s | %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#", 5, "Tape#");
32-
printf("|%.*s|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES, 5+2, DASHES);
31+
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
32+
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
3333
}
3434
}
3535

@@ -71,7 +71,7 @@ namespace logger {
7171
} else {
7272
printf("| %-*s ", LOG_INDEX_LEN, "");
7373
}
74-
printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
74+
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
7575
printf("| %-s ", detail);
7676
printf("|\n");
7777
}

src/generic/stage2/structural_parser.h

Lines changed: 48 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -3,226 +3,109 @@
33
// We assume the file in which it is included already includes
44
// "simdjson/stage2.h" (this simplifies amalgamation)
55

6-
#include "generic/stage2/tape_writer.h"
76
#include "generic/stage2/logger.h"
8-
#include "generic/stage2/atomparsing.h"
97
#include "generic/stage2/structural_iterator.h"
108

119
namespace { // Make everything here private
1210
namespace SIMDJSON_IMPLEMENTATION {
1311
namespace stage2 {
1412

13+
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
14+
15+
template<typename T>
1516
struct structural_parser : structural_iterator {
16-
/** Lets you append to the tape */
17-
tape_writer tape;
18-
/** Next write location in the string buf for stage 2 parsing */
19-
uint8_t *current_string_buf_loc;
17+
/** Receiver that actually parses the strings and builds the tape */
18+
T builder;
2019
/** Current depth (nested objects and arrays) */
2120
uint32_t depth{0};
2221

2322
// For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
2423
really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
2524
: structural_iterator(_parser, start_structural_index),
26-
tape{parser.doc->tape.get()},
27-
current_string_buf_loc{parser.doc->string_buf.get()} {
28-
}
29-
30-
WARN_UNUSED really_inline error_code start_scope(bool is_array) {
31-
depth++;
32-
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
33-
parser.containing_scope[depth].tape_index = next_tape_index();
34-
parser.containing_scope[depth].count = 0;
35-
tape.skip(); // We don't actually *write* the start element until the end.
36-
parser.is_array[depth] = is_array;
37-
return SUCCESS;
25+
builder{parser.doc->tape.get(), parser.doc->string_buf.get()} {
3826
}
3927

4028
WARN_UNUSED really_inline error_code start_document() {
41-
log_start_value("document");
42-
parser.containing_scope[depth].tape_index = next_tape_index();
43-
parser.containing_scope[depth].count = 0;
44-
tape.skip(); // We don't actually *write* the start element until the end.
29+
builder.start_document(*this);
4530
parser.is_array[depth] = false;
4631
return SUCCESS;
4732
}
48-
4933
WARN_UNUSED really_inline error_code start_object() {
50-
log_start_value("object");
51-
return start_scope(false);
34+
depth++;
35+
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
36+
builder.start_object(*this);
37+
parser.is_array[depth] = false;
38+
return SUCCESS;
5239
}
53-
5440
WARN_UNUSED really_inline error_code start_array() {
55-
log_start_value("array");
56-
return start_scope(true);
57-
}
58-
59-
// this function is responsible for annotating the start of the scope
60-
really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
61-
// SIMDJSON_ASSUME(depth > 0);
62-
// Write the ending tape element, pointing at the start location
63-
const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
64-
tape.append(start_tape_index, end);
65-
// Write the start tape element, pointing at the end location (and including count)
66-
// count can overflow if it exceeds 24 bits... so we saturate
67-
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
68-
const uint32_t count = parser.containing_scope[depth].count;
69-
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
70-
tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
71-
depth--;
72-
}
73-
74-
really_inline uint32_t next_tape_index() {
75-
return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
41+
depth++;
42+
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
43+
builder.start_array(*this);
44+
parser.is_array[depth] = true;
45+
return SUCCESS;
7646
}
77-
7847
really_inline void end_object() {
79-
log_end_value("object");
80-
end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
48+
builder.end_object(*this);
49+
depth--;
8150
}
8251
really_inline void end_array() {
83-
log_end_value("array");
84-
end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
52+
builder.end_array(*this);
53+
depth--;
8554
}
8655
really_inline void end_document() {
87-
log_end_value("document");
88-
constexpr uint32_t start_tape_index = 0;
89-
tape.append(start_tape_index, internal::tape_type::ROOT);
90-
tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index(), internal::tape_type::ROOT);
56+
builder.end_document(*this);
9157
}
9258

93-
really_inline void empty_container(internal::tape_type start, internal::tape_type end) {
94-
auto start_index = next_tape_index();
95-
tape.append(start_index+2, start);
96-
tape.append(start_index, end);
97-
}
9859
WARN_UNUSED really_inline bool empty_object() {
9960
if (peek_next_char() == '}') {
10061
advance_char();
101-
log_value("empty object");
102-
empty_container(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
62+
builder.empty_object(*this);
10363
return true;
10464
}
10565
return false;
10666
}
10767
WARN_UNUSED really_inline bool empty_array() {
10868
if (peek_next_char() == ']') {
10969
advance_char();
110-
log_value("empty array");
111-
empty_container(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
70+
builder.empty_array(*this);
11271
return true;
11372
}
11473
return false;
11574
}
11675

117-
// increment_count increments the count of keys in an object or values in an array.
11876
really_inline void increment_count() {
119-
parser.containing_scope[depth].count++; // we have a key value pair in the object at parser.depth - 1
120-
}
121-
122-
really_inline uint8_t *on_start_string() noexcept {
123-
// we advance the point, accounting for the fact that we have a NULL termination
124-
tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
125-
return current_string_buf_loc + sizeof(uint32_t);
126-
}
127-
128-
really_inline void on_end_string(uint8_t *dst) noexcept {
129-
uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
130-
// TODO check for overflow in case someone has a crazy string (>=4GB?)
131-
// But only add the overflow check when the document itself exceeds 4GB
132-
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
133-
memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
134-
// NULL termination is still handy if you expect all your strings to
135-
// be NULL terminated? It comes at a small cost
136-
*dst = 0;
137-
current_string_buf_loc = dst + 1;
77+
builder.increment_count(*this);
13878
}
13979

14080
WARN_UNUSED really_inline error_code parse_key(const uint8_t *key) {
141-
return parse_string(key, true);
142-
}
143-
WARN_UNUSED really_inline error_code parse_string(const uint8_t *value, bool key = false) {
144-
log_value(key ? "key" : "string");
145-
uint8_t *dst = on_start_string();
146-
dst = stringparsing::parse_string(value, dst);
147-
if (dst == nullptr) {
148-
log_error("Invalid escape in string");
149-
return STRING_ERROR;
150-
}
151-
on_end_string(dst);
152-
return SUCCESS;
81+
return builder.parse_key(*this, key);
82+
}
83+
WARN_UNUSED really_inline error_code parse_string(const uint8_t *value) {
84+
return builder.parse_string(*this, value);
15385
}
154-
15586
WARN_UNUSED really_inline error_code parse_number(const uint8_t *value) {
156-
log_value("number");
157-
if (!numberparsing::parse_number(value, tape)) { log_error("Invalid number"); return NUMBER_ERROR; }
158-
return SUCCESS;
87+
return builder.parse_number(*this, value);
15988
}
160-
161-
really_inline error_code parse_root_number(const uint8_t *value) {
162-
//
163-
// We need to make a copy to make sure that the string is space terminated.
164-
// This is not about padding the input, which should already padded up
165-
// to len + SIMDJSON_PADDING. However, we have no control at this stage
166-
// on how the padding was done. What if the input string was padded with nulls?
167-
// It is quite common for an input string to have an extra null character (C string).
168-
// We do not want to allow 9\0 (where \0 is the null character) inside a JSON
169-
// document, but the string "9\0" by itself is fine. So we make a copy and
170-
// pad the input with spaces when we know that there is just one input element.
171-
// This copy is relatively expensive, but it will almost never be called in
172-
// practice unless you are in the strange scenario where you have many JSON
173-
// documents made of single atoms.
174-
//
175-
uint8_t *copy = static_cast<uint8_t *>(malloc(remaining_len() + SIMDJSON_PADDING));
176-
if (copy == nullptr) {
177-
return MEMALLOC;
178-
}
179-
memcpy(copy, value, remaining_len());
180-
memset(copy + remaining_len(), ' ', SIMDJSON_PADDING);
181-
error_code error = parse_number(copy);
182-
free(copy);
183-
return error;
89+
WARN_UNUSED really_inline error_code parse_root_number(const uint8_t *value) {
90+
return builder.parse_root_number(*this, value);
18491
}
185-
18692
WARN_UNUSED really_inline error_code parse_true_atom(const uint8_t *value) {
187-
log_value("true");
188-
if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
189-
tape.append(0, internal::tape_type::TRUE_VALUE);
190-
return SUCCESS;
93+
return builder.parse_true_atom(*this, value);
19194
}
192-
19395
WARN_UNUSED really_inline error_code parse_root_true_atom(const uint8_t *value) {
194-
log_value("true");
195-
if (!atomparsing::is_valid_true_atom(value, remaining_len())) { return T_ATOM_ERROR; }
196-
tape.append(0, internal::tape_type::TRUE_VALUE);
197-
return SUCCESS;
96+
return builder.parse_root_true_atom(*this, value);
19897
}
199-
20098
WARN_UNUSED really_inline error_code parse_false_atom(const uint8_t *value) {
201-
log_value("false");
202-
if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
203-
tape.append(0, internal::tape_type::FALSE_VALUE);
204-
return SUCCESS;
99+
return builder.parse_false_atom(*this, value);
205100
}
206-
207101
WARN_UNUSED really_inline error_code parse_root_false_atom(const uint8_t *value) {
208-
log_value("false");
209-
if (!atomparsing::is_valid_false_atom(value, remaining_len())) { return F_ATOM_ERROR; }
210-
tape.append(0, internal::tape_type::FALSE_VALUE);
211-
return SUCCESS;
102+
return builder.parse_root_false_atom(*this, value);
212103
}
213-
214104
WARN_UNUSED really_inline error_code parse_null_atom(const uint8_t *value) {
215-
log_value("null");
216-
if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
217-
tape.append(0, internal::tape_type::NULL_VALUE);
218-
return SUCCESS;
105+
return builder.parse_null_atom(*this, value);
219106
}
220-
221107
WARN_UNUSED really_inline error_code parse_root_null_atom(const uint8_t *value) {
222-
log_value("null");
223-
if (!atomparsing::is_valid_null_atom(value, remaining_len())) { return N_ATOM_ERROR; }
224-
tape.append(0, internal::tape_type::NULL_VALUE);
225-
return SUCCESS;
108+
return builder.parse_root_null_atom(*this, value);
226109
}
227110

228111
WARN_UNUSED really_inline error_code start() {
@@ -266,12 +149,20 @@ struct structural_parser : structural_iterator {
266149
}
267150
}; // struct structural_parser
268151

269-
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
152+
} // namespace stage2
153+
} // namespace SIMDJSON_IMPLEMENTATION
154+
} // unnamed namespace
155+
156+
#include "generic/stage2/tape_builder.h"
157+
158+
namespace { // Make everything here private
159+
namespace SIMDJSON_IMPLEMENTATION {
160+
namespace stage2 {
270161

271162
template<bool STREAMING>
272163
WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
273164
dom_parser.doc = &doc;
274-
stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
165+
stage2::structural_parser<stage2::tape_builder> parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
275166
SIMDJSON_TRY( parser.start() );
276167

277168
//

0 commit comments

Comments
 (0)
0