8000 Fixes issue 600 (#614) · JavaScriptExpert/simdjson@8769e42 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8769e42

Browse files
authored
Fixes issue 600 (simdjson#614)
* Fixes issue 600
1 parent 7cde65a commit 8769e42

File tree

3 files changed

+141
-76
lines changed

3 files changed

+141
-76
lines changed

include/simdjson/document.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,8 @@ class document::element : protected internal::tape_ref {
613613
really_inline bool is_number() const noexcept;
614614
/** Whether this is a JSON integer (e.g. 1 or -1, but *not* 1.0 or 1e2) */
615615
really_inline bool is_integer() const noexcept;
616+
/** Whether this is a JSON number but not an integer (e.g. 1.0 or 1e2, but *not* 1 or -1) */
617+
really_inline bool is_float() const noexcept;
616618
/** Whether this is a JSON string (e.g. "abc") */
617619
really_inline bool is_string() const noexcept;
618620
/** Whether this is a JSON array (e.g. []) */

include/simdjson/inline/document.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,9 @@ really_inline bool document::element::is_bool() const noexcept {
914914
/** True when the tape entry is any numeric type: unsigned integer, signed integer, or double. */
really_inline bool document::element::is_number() const noexcept {
  const auto tag = type();
  return tag == internal::tape_type::UINT64 || tag == internal::tape_type::INT64 || tag == internal::tape_type::DOUBLE;
}
917+
/** True only when the tape entry is a double, i.e. a number that is not one of the integer tape types. */
really_inline bool document::element::is_float() const noexcept {
  const auto tag = type();
  return tag == internal::tape_type::DOUBLE;
}
917920
/** True when the tape entry is an integer type, either unsigned or signed 64-bit. */
really_inline bool document::element::is_integer() const noexcept {
  const auto tag = type();
  return tag == internal::tape_type::UINT64 || tag == internal::tape_type::INT64;
}

tools/jsonstats.cpp

Lines changed: 136 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -31,83 +31,118 @@ struct stat_s {
3131
size_t false_count;
3232
size_t byte_count;
3333
size_t structural_indexes_count;
34+
size_t key_count;
35+
size_t key_maximum_length;
36+
size_t maximum_depth;
37+
size_t ascii_key_count;
38+
size_t ascii_string_count;
39+
size_t maximum_object_size;
40+
size_t maximum_array_size;
41+
size_t string_maximum_length;
3442
bool valid;
3543
};
3644

3745
using stat_t = struct stat_s;
3846

47+
// Reports whether every byte of `v` is 7-bit ASCII (value below 128).
// An empty view is considered ASCII.
bool is_ascii(const std::string_view &v) {
  for (const char c : v) {
    // Go through unsigned char so bytes with the high bit set compare as
    // 128..255 regardless of whether plain char is signed.
    if (static_cast<unsigned char>(c) > 127) {
      return false;
    }
  }
  return true;
}
55+
56+
void recurse(simdjson::document::element element, stat_t &s, size_t depth) {
57+
if (depth > s.maximum_depth) {
58+
s.maximum_depth = depth;
59+
}
60+
if (element.is_array()) {
61+
s.array_count++;
62+
auto [array, array_error] = element.as_array();
63+
if (!array_error) {
64+
size_t counter = 0;
65+
for (auto child : array) {
66+
counter++;
67+
recurse(child, s, depth + 1);
68+
}
69+
if (counter > s.maximum_array_size) {
70+
s.maximum_array_size = counter;
71+
}
72+
}
73+
} else if (element.is_object()) {
74+
s.object_count++;
75+
auto [object, object_error] = element.as_object();
76+
if (!object_error) {
77+
size_t counter = 0;
78+
for (auto [key, value] : object) {
79+
counter++;
80+
if (is_ascii(key)) {
81+
s.ascii_key_count++;
82+
s.ascii_string_count++;
83+
}
84+
if (key.size() > s.key_maximum_length) {
85+
s.key_maximum_length = key.size();
86+
}
87+
if (key.size() > s.string_maximum_length) {
88+
s.string_maximum_length = key.size();
89+
}
90+
s.string_count++;
91+
s.key_count++;
92+
recurse(value, s, depth + 1);
93+
}
94+
if (counter > s.maximum_object_size) {
95+
s.maximum_object_size = counter;
96+
}
97+
}
98+
} else {
99+
if (element.is_float()) {
100+
s.float_count++;
101+
} else if (element.is_integer()) {
102+
s.integer_count++;
103+
} else if (element.is_bool()) {
104+
if (element.as_bool()) {
105+
s.true_count++;
106+
} else {
107+
s.false_count++;
108+
}
109+
} else if (element.is_null()) {
110+
s.null_count++;
111+
} else if (element.is_string()) {
112+
s.string_count++;
113+
if (is_ascii(element.as_string())) {
114+
s.ascii_string_count++;
115+
}
116+
const std::string_view strval = element.as_string();
117+
if (strval.size() > s.string_maximum_length) {
118+
s.string_maximum_length = strval.size();
119+
}
120+
} else {
121+
throw std::runtime_error("unrecognized node.");
122+
}
123+
}
124+
}
125+
39126
stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
40-
stat_t answer;
41-
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
42-
answer.valid = pj.is_valid();
43-
if (!answer.valid) {
44-
std::cerr << pj.get_error_message() << std::endl;
45-
return answer;
127+
stat_t s{};
128+
simdjson::document::parser parser;
129+
auto [doc, error] = parser.parse(p);
130+
if (error) {
131+
s.valid = false;
132+
std::cerr << error << std::endl;
133+
return s;
46134
}
47-
answer.backslash_count =
135+
s.valid = true;
136+
s.backslash_count =
48137
count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
49-
answer.non_ascii_byte_count = count_nonasciibytes(
138+
s.non_ascii_byte_count = count_nonasciibytes(
50139
reinterpret_cast<const uint8_t *>(p.data()), p.size());
51-
answer.byte_count = p.size();
52-
answer.integer_count = 0;
53-
answer.float_count = 0;
54-
answer.object_count = 0;
55-
answer.array_count = 0;
56-
answer.null_count = 0;
57-
answer.true_count = 0;
58-
answer.false_count = 0;
59-
answer.string_count = 0;
60-
answer.structural_indexes_count = pj.n_structural_indexes;
61-
size_t tape_idx = 0;
62-
uint64_t tape_val = pj.doc.tape[tape_idx++];
63-
uint8_t type = (tape_val >> 56);
64-
size_t how_many = 0;
65-
assert(type == 'r');
66-
how_many = tape_val & simdjson::internal::JSON_VALUE_MASK;
67-
for (; tape_idx < how_many; tape_idx++) {
68-
tape_val = pj.doc.tape[tape_idx];
69-
// uint64_t payload = tape_val & simdjson::internal::JSON_VALUE_MASK;
70-
type = (tape_val >> 56);
71-
switch (type) {
72-
case 'l': // we have a long int
73-
answer.integer_count++;
74-
tape_idx++; // skipping the integer
75-
break;
76-
case 'u': // we have a long uint
77-
answer.integer_count++;
78-
tape_idx++; // skipping the integer
79-
break;
80-
case 'd': // we have a double
81-
answer.float_count++;
82-
tape_idx++; // skipping the double
83-
break;
84-
case 'n': // we have a null
85-
answer.null_count++;
86-
break;
87-
case 't': // we have a true
88-
answer.true_count++;
89-
break;
90-
case 'f': // we have a false
91-
answer.false_count++;
92-
break;
93-
case '{': // we have an object
94-
answer.object_count++;
95-
break;
96-
case '}': // we end an object
97-
break;
98-
case '[': // we start an array
99-
answer.array_count++;
100-
break;
101-
case ']': // we end an array
102-
break;
103-
case '"': // we have a string
104-
answer.string_count++;
105-
break;
106-
default:
107-
break; // ignore
108-
}
109-
}
110-
return answer;
140+
s.byte_count = p.size();
141+
s.structural_indexes_count = parser.n_structural_indexes;
142+
143+
// simdjson::document::iterator iter(doc);
144+
recurse(doc.root(), s, 0);
145+
return s;
111146
}
112147

113148
int main(int argc, char *argv[]) {
@@ -122,6 +157,7 @@ int main(int argc, char *argv[]) {
122157
std::cerr << "warning: ignoring everything after " << argv[myoptind + 1]
123158
<< std::endl;
124159
}
160+
125161
auto [p, error] = simdjson::padded_string::load(filename);
126162
if (error) {
127163
std::cerr << "Could not load the file " << filename << std::endl;
@@ -132,13 +168,37 @@ int main(int argc, char *argv[]) {
132168
std::cerr << "not a valid JSON" << std::endl;
133169
return EXIT_FAILURE;
134170
}
171+
// Future work: the proper way to do what follows would be to create
172+
// a JSON object and then to serialize it.
135173

136-
printf("# integer_count float_count string_count backslash_count "
137-
"non_ascii_byte_count object_count array_count null_count true_count "
138-
"false_count byte_count structural_indexes_count\n");
139-
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count,
140-
s.float_count, s.string_count, s.backslash_count,
141-
s.non_ascii_byte_count, s.object_count, s.array_count, s.null_count,
142-
s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
143-
return EXIT_SUCCESS;
174+
printf(R"({
175+
"integer_count" = %10zu,
176+
"float_count" = %10zu,
177+
"string_count" = %10zu,
178+
"ascii_string_count" = %10zu,
179+
"string_maximum_length" = %10zu,
180+
"backslash_count" = %10zu,
181+
"non_ascii_byte_count" = %10zu,
182+
"object_count" = %10zu,
183+
"maximum_object_size" = %10zu,
184+
"array_count" = %10zu,
185+
"maximum_array_size" = %10zu,
186+
"null_count" = %10zu,
187+
"true_count" = %10zu,
188+
"false_count" = %10zu,
189+
"byte_count" = %10zu,
190+
"structural_indexes_count" = %10zu,
191+
"key_count" = %10zu,
192+
"ascii_key_count" = %10zu,
193+
"key_maximum_length" = %10zu,
194+
"maximum_depth" = %10zu
144195
}
196+
)",
197+
s.integer_count, s.float_count, s.string_count, s.ascii_string_count,
198+
s.string_maximum_length, s.backslash_count, s.non_ascii_byte_count,
199+
s.object_count, s.maximum_object_size, s.array_count,
200+
s.maximum_array_size, s.null_count, s.true_count, s.false_count,
201+
s.byte_count, s.structural_indexes_count, s.key_count,
202+
s.ascii_key_count, s.key_maximum_length, s.maximum_depth);
203+
return EXIT_SUCCESS;
204+
}

0 commit comments

Comments
 (0)
0