@@ -31,83 +31,118 @@ struct stat_s {
31
31
size_t false_count;
32
32
size_t byte_count;
33
33
size_t structural_indexes_count;
34
+ size_t key_count;
35
+ size_t key_maximum_length;
36
+ size_t maximum_depth;
37
+ size_t ascii_key_count;
38
+ size_t ascii_string_count;
39
+ size_t maximum_object_size;
40
+ size_t maximum_array_size;
41
+ size_t string_maximum_length;
34
42
bool valid;
35
43
};
36
44
37
45
// Convenience alias so the statistics record (struct stat_s) can be named stat_t.
using stat_t = struct stat_s ;
38
46
47
// Returns true when every byte of `v` is below 128, i.e. the view holds only
// 7-bit ASCII. An empty view is reported as ASCII.
bool is_ascii(const std::string_view &v) {
  for (const char c : v) {
    // char may be signed: compare through unsigned char so that bytes with
    // the high bit set are seen as >= 128 rather than as negative values.
    if (static_cast<unsigned char>(c) >= 128) {
      return false;
    }
  }
  return true;
}
55
+
56
+ void recurse (simdjson::document::element element, stat_t &s, size_t depth) {
57
+ if (depth > s.maximum_depth ) {
58
+ s.maximum_depth = depth;
59
+ }
60
+ if (element.is_array ()) {
61
+ s.array_count ++;
62
+ auto [array, array_error] = element.as_array ();
63
+ if (!array_error) {
64
+ size_t counter = 0 ;
65
+ for (auto child : array) {
66
+ counter++;
67
+ recurse (child, s, depth + 1 );
68
+ }
69
+ if (counter > s.maximum_array_size ) {
70
+ s.maximum_array_size = counter;
71
+ }
72
+ }
73
+ } else if (element.is_object ()) {
74
+ s.object_count ++;
75
+ auto [object, object_error] = element.as_object ();
76
+ if (!object_error) {
77
+ size_t counter = 0 ;
78
+ for (auto [key, value] : object) {
79
+ counter++;
80
+ if (is_ascii (key)) {
81
+ s.ascii_key_count ++;
82
+ s.ascii_string_count ++;
83
+ }
84
+ if (key.size () > s.key_maximum_length ) {
85
+ s.key_maximum_length = key.size ();
86
+ }
87
+ if (key.size () > s.string_maximum_length ) {
88
+ s.string_maximum_length = key.size ();
89
+ }
90
+ s.string_count ++;
91
+ s.key_count ++;
92
+ recurse (value, s, depth + 1 );
93
+ }
94
+ if (counter > s.maximum_object_size ) {
95
+ s.maximum_object_size = counter;
96
+ }
97
+ }
98
+ } else {
99
+ if (element.is_float ()) {
100
+ s.float_count ++;
101
+ } else if (element.is_integer ()) {
102
+ s.integer_count ++;
103
+ } else if (element.is_bool ()) {
104
+ if (element.as_bool ()) {
105
+ s.true_count ++;
106
+ } else {
107
+ s.false_count ++;
108
+ }
109
+ } else if (element.is_null ()) {
110
+ s.null_count ++;
111
+ } else if (element.is_string ()) {
112
+ s.string_count ++;
113
+ if (is_ascii (element.as_string ())) {
114
+ s.ascii_string_count ++;
115
+ }
116
+ const std::string_view strval = element.as_string ();
117
+ if (strval.size () > s.string_maximum_length ) {
118
+ s.string_maximum_length = strval.size ();
119
+ }
120
+ } else {
121
+ throw std::runtime_error (" unrecognized node." );
122
+ }
123
+ }
124
+ }
125
+
39
126
// Parses the JSON text in `p` and returns the statistics gathered from it.
//
// The record starts value-initialised (`stat_t s{}`). If parsing reports an
// error, the error is written to std::cerr and the record is returned with
// `valid` set to false and nothing else filled in. Otherwise `valid` is true,
// the byte-level counters are computed over the raw input bytes,
// `structural_indexes_count` is read from the parser, and the per-value
// statistics are collected by recurse() starting at the document root with
// depth 0.
stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
  stat_t s{};
  simdjson::document::parser parser;
  auto [doc, error] = parser.parse(p);
  if (error) {
    s.valid = false;
    std::cerr << error << std::endl;
    return s;
  }
  s.valid = true;

  // Counters computed directly on the input bytes, independent of the parse.
  s.backslash_count =
      count_backslash(reinterpret_cast<const uint8_t *>(p.data()), p.size());
  s.non_ascii_byte_count = count_nonasciibytes(
      reinterpret_cast<const uint8_t *>(p.data()), p.size());
  s.byte_count = p.size();
  s.structural_indexes_count = parser.n_structural_indexes;

  // Counters that require walking the parsed document.
  recurse(doc.root(), s, 0);
  return s;
}
112
147
113
148
int main (int argc, char *argv[]) {
@@ -122,6 +157,7 @@ int main(int argc, char *argv[]) {
122
157
std::cerr << " warning: ignoring everything after " << argv[myoptind + 1 ]
123
158
<< std::endl;
124
159
}
160
+
125
161
auto [p, error] = simdjson::padded_string::load (filename);
126
162
if (error) {
127
163
std::cerr << " Could not load the file " << filename << std::endl;
@@ -132,13 +168,37 @@ int main(int argc, char *argv[]) {
132
168
std::cerr << " not a valid JSON" << std::endl;
133
169
return EXIT_FAILURE;
134
170
}
171
+ // Future work: the proper way to do what follows would be to create
172
+ // a JSON object and then to serialize it.
135
173
136
- printf (" # integer_count float_count string_count backslash_count "
137
- " non_ascii_byte_count object_count array_count null_count true_count "
138
- " false_count byte_count structural_indexes_count\n " );
139
- printf (" %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n " , s.integer_count ,
140
- s.float_count , s.string_count , s.backslash_count ,
141
- s.non_ascii_byte_count , s.object_count , s.array_count , s.null_count ,
142
- s.true_count , s.false_count , s.byte_count , s.structural_indexes_count );
143
- return EXIT_SUCCESS;
174
+ printf (R"( {
175
+ "integer_count" = %10zu,
176
+ "float_count" = %10zu,
177
+ "string_count" = %10zu,
178
+ "ascii_string_count" = %10zu,
179
+ "string_maximum_length" = %10zu,
180
+ "backslash_count" = %10zu,
181
+ "non_ascii_byte_count" = %10zu,
182
+ "object_count" = %10zu,
183
+ "maximum_object_size" = %10zu,
184
+ "array_count" = %10zu,
185
+ "maximum_array_size" = %10zu,
186
+ "null_count" = %10zu,
187
+ "true_count" = %10zu,
188
+ "false_count" = %10zu,
189
+ "byte_count" = %10zu,
190
+ "structural_indexes_count" = %10zu,
191
+ "key_count" = %10zu,
192
+ "ascii_key_count" = %10zu,
193
+ "key_maximum_length" = %10zu,
194
+ "maximum_depth" = %10zu
144
195
}
196
+ )" ,
197
+ s.integer_count , s.float_count , s.string_count , s.ascii_string_count ,
198
+ s.string_maximum_length , s.backslash_count , s.non_ascii_byte_count ,
199
+ s.object_count , s.maximum_object_size , s.array_count ,
200
+ s.maximum_array_size , s.null_count , s.true_count , s.false_count ,
201 + s.byte_count , s.structural_indexes_count , s.key_count ,
202
+ s.ascii_key_count , s.key_maximum_length , s.maximum_depth );
203
+ return EXIT_SUCCESS;
204
+ }
0 commit comments