Add AMD64 build_bitmask · JavaScriptExpert/simdjson@441963c · GitHub
[go: up one dir, main page]

Skip to content

Commit 441963c

Browse files
committed
Add AMD64 build_bitmask
1 parent cf4ae61 commit 441963c

File tree

5 files changed

+96
-97
lines changed

5 files changed

+96
-97
lines changed

singleheader/amalgamation_demo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
1+
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
22

33
#include <iostream>
44
#include "simdjson.h"

singleheader/simdjson.cpp

Lines changed: 44 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
1+
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
22
#include "simdjson.h"
33

44
/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
@@ -574,22 +574,38 @@ struct simd_input<Architecture::ARM64> {
574574
this->i3 = vld1q_u8(ptr + 48);
575575
}
576576

577+
template <typename F>
578+
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
579+
uint8x16_t r0 = chunk_to_mask(this->i0);
580+
uint8x16_t r1 = chunk_to_mask(this->i1);
581+
uint8x16_t r2 = chunk_to_mask(this->i2);
582+
uint8x16_t r3 = chunk_to_mask(this->i3);
583+
return neon_movemask_bulk(r0, r1, r2, r3);
584+
}
585+
586+
template <typename F>
587+
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
588+
simd_input<Architecture::ARM64> result = {
589+
map_chunk(this->i0),
590+
map_chunk(this->i1),
591+
map_chunk(this->i2),
592+
map_chunk(this->i3)
593+
};
594+
return result;
595+
}
596+
577597
really_inline uint64_t eq(uint8_t m) {
578598
const uint8x16_t mask = vmovq_n_u8(m);
579-
uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
580-
uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
581-
uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
582-
uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
583-
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
599+
return this->build_bitmask([&](uint8x16_t chunk) {
600+
return vceqq_u8(chunk, mask);
601+
});
584602
}
585603

586604
really_inline uint64_t lteq(uint8_t m) {
587605
const uint8x16_t mask = vmovq_n_u8(m);
588-
uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
589-
uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
590-
uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
591-
uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
592-
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
606+
return this->build_bitmask([&](uint8x16_t chunk) {
607+
return vcleq_u8(chunk, mask);
608+
});
593609
}
594610

595611
}; // struct simd_input
@@ -1467,45 +1483,25 @@ really_inline void find_whitespace_and_structurals(
14671483
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
14681484
const uint8x16_t high_nibble_mask =
14691485
(uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
1470-
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
1471-
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
14721486
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
14731487

1474-
uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
1475-
uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
1476-
uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
1477-
uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
1478-
uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
1479-
1480-
uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
1481-
uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
1482-
uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
1483-
uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
1484-
uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
1485-
1486-
uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
1487-
uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
1488-
uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
1489-
uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
1490-
uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
1491-
1492-
uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
1493-
uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
1494-
uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
1495-
uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
1496-
uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
1497-
1498-
uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
1499-
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
1500-
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
1501-
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
1502-
structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
1503-
1504-
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
1505-
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
1506-
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
1507-
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
1508-
whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
1488+
simd_input<ARCHITECTURE> v = in.map([&](auto chunk) {
1489+
uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
1490+
uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
1491+
uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
1492+
uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
1493+
return vandq_u8(shuf_lo, shuf_hi);
1494+
});
1495+
1496+
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
1497+
structurals = v.build_bitmask([&](auto chunk) {
1498+
return vtstq_u8(chunk, structural_shufti_mask);
1499+
});
1500+
1501+
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
1502+
whitespace = v.build_bitmask([&](auto chunk) {
1503+
return vtstq_u8(chunk, whitespace_shufti_mask);
1504+
});
15091505
}
15101506

15111507
// This file contains a non-architecture-specific version of "flatten" used in stage1.

singleheader/simdjson.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on Fri Aug 23 10:23:28 DST 2019. Do not edit! */
1+
/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
22
/* begin file include/simdjson/simdjson_version.h */
33
// /include/simdjson/simdjson_version.h automatically generated by release.py,
44
// do not change by hand

src/arm64/simd_input.h

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,45 @@ struct simd_input<Architecture::ARM64> {
4646
this->i3 = vld1q_u8(ptr + 48);
4747
}
4848

49+
really_inline simd_input(uint8x16_t i0, uint8x16_t i1, uint8x16_t i2, uint8x16_t i3) {
50+
this->i0 = i0;
51+
this->i1 = i1;
52+
this->i2 = i2;
53+
this->i3 = i3;
54+
}
55+
56+
template <typename F>
57+
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
58+
uint8x16_t r0 = chunk_to_mask(this->i0);
59+
uint8x16_t r1 = chunk_to_mask(this->i1);
60+
uint8x16_t r2 = chunk_to_mask(this->i2);
61+
uint8x16_t r3 = chunk_to_mask(this->i3);
62+
return neon_movemask_bulk(r0, r1, r2, r3);
63+
}
64+
65+
template <typename F>
66+
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
67+
simd_input<Architecture::ARM64> result = {
68+
map_chunk(this->i0),
69+
map_chunk(this->i1),
70+
map_chunk(this->i2),
71+
map_chunk(this->i3)
72+
};
73+
return result;
74+
}
75+
4976
really_inline uint64_t eq(uint8_t m) {
5077
const uint8x16_t mask = vmovq_n_u8(m);
51-
uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
52-
uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
53-
uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
54-
uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
55-
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
78+
return this->build_bitmask([&](uint8x16_t chunk) {
79+
return vceqq_u8(chunk, mask);
80+
});
5681
}
5782

5883
really_inline uint64_t lteq(uint8_t m) {
5984
const uint8x16_t mask = vmovq_n_u8(m);
60-
uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
61-
uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
62-
uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
63-
uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
64-
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
85+
return this->build_bitmask([&](uint8x16_t chunk) {
86+
return vcleq_u8(chunk, mask);
87+
});
6588
}
6689

6790
}; // struct simd_input

src/arm64/stage1_find_marks.h

Lines changed: 17 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -28,45 +28,25 @@ really_inline void find_whitespace_and_structurals(
2828
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
2929
const uint8x16_t high_nibble_mask =
3030
(uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
31-
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
32-
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
3331
const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
3432

35-
uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
36-
uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
37-
uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
38-
uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
39-
uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
40-
41-
uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
42-
uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
43-
uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
44-
uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
45-
uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
46-
47-
uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
48-
uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
49-
uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
50-
uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
51-
uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
52-
53-
uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
54-
uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
55-
uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
56-
uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
57-
uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
58-
59-
uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
60-
uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
61-
uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
62-
uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
63-
structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
64-
65-
uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
66-
uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
67-
uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
68-
uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
69-
whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
33+
simd_input<ARCHITECTURE> v = in.map([&](auto chunk) {
34+
uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
35+
uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
36+
uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
37+
uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
38+
return vandq_u8(shuf_lo, shuf_hi);
39+
});
40+
41+
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
42+
structurals = v.build_bitmask([&](auto chunk) {
43+
return vtstq_u8(chunk, structural_shufti_mask);
44+
});
45+
46+
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
47+
whitespace = v.build_bitmask([&](auto chunk) {
48+
return vtstq_u8(chunk, whitespace_shufti_mask);
49+
});
7050
}
7151

7252
#include "generic/stage1_find_marks_flatten.h"

0 commit comments

Comments
 (0)
0