diff --git a/tinygraph/tinygraph-bits.c b/tinygraph/tinygraph-bits.c index 1a1a531..1435891 100644 --- a/tinygraph/tinygraph-bits.c +++ b/tinygraph/tinygraph-bits.c @@ -192,7 +192,7 @@ uint32_t tinygraph_bits_select_512(const uint64_t *p, uint32_t n) { uint32_t count = 0; for (uint32_t i = 0; i < 7; ++i) { - uint32_t block = tinygraph_bits_count(p[i]); + const uint32_t block = tinygraph_bits_count(p[i]); if ((count + block) > n) { return i * 64 + tinygraph_bits_select(p[i], n - count); diff --git a/tinygraph/tinygraph-bitset.c b/tinygraph/tinygraph-bitset.c index 4ca02de..422b217 100644 --- a/tinygraph/tinygraph-bitset.c +++ b/tinygraph/tinygraph-bitset.c @@ -3,6 +3,7 @@ #include #include +#include "tinygraph-align.h" #include "tinygraph-utils.h" #include "tinygraph-bitset.h" @@ -33,7 +34,7 @@ tinygraph_bitset* tinygraph_bitset_construct(uint64_t size) { uint64_t num_blocks = ceil(size / 64.f); - uint64_t *blocks = calloc(num_blocks, sizeof(uint64_t)); + uint64_t *blocks = tinygraph_align_malloc(64, num_blocks * sizeof(uint64_t)); if (!blocks) { free(out); @@ -41,6 +42,8 @@ tinygraph_bitset* tinygraph_bitset_construct(uint64_t size) { return NULL; } + memset(blocks, 0, num_blocks * sizeof(uint64_t)); + out->blocks = blocks; out->blocks_len = num_blocks; @@ -53,12 +56,13 @@ tinygraph_bitset* tinygraph_bitset_copy(const tinygraph_bitset * const bitset) { return NULL; } - tinygraph_bitset *copy = tinygraph_bitset_construct(bitset->blocks_len); + tinygraph_bitset *copy = tinygraph_bitset_construct(bitset->size); if (!copy) { return NULL; } + TINYGRAPH_ASSERT(copy->size == bitset->size); TINYGRAPH_ASSERT(copy->blocks_len == bitset->blocks_len); if (bitset->blocks_len > 0) { @@ -79,7 +83,7 @@ void tinygraph_bitset_destruct(tinygraph_bitset * const bitset) { TINYGRAPH_ASSERT(bitset->blocks || bitset->blocks_len == 0); - free(bitset->blocks); + tinygraph_align_free(bitset->blocks); bitset->blocks = NULL; bitset->blocks_len = 0; @@ -91,6 +95,7 @@ void tinygraph_bitset_destruct(tinygraph_bitset * const bitset) { void tinygraph_bitset_set_at(tinygraph_bitset * const bitset, uint64_t i) { TINYGRAPH_ASSERT(bitset); TINYGRAPH_ASSERT(bitset->blocks_len > 0); + TINYGRAPH_ASSERT(i < bitset->size); TINYGRAPH_ASSERT((i >> 6) < bitset->blocks_len); bitset->blocks[i >> 6] |= (UINT64_C(1) << (i & UINT64_C(63))); @@ -100,6 +105,7 @@ void tinygraph_bitset_set_at(tinygraph_bitset * const bitset, uint64_t i) { bool tinygraph_bitset_get_at(const tinygraph_bitset * const bitset, uint64_t i) { TINYGRAPH_ASSERT(bitset); TINYGRAPH_ASSERT(bitset->blocks_len > 0); + TINYGRAPH_ASSERT(i < bitset->size); TINYGRAPH_ASSERT((i >> 6) < bitset->blocks_len); return (bitset->blocks[i >> 6] & (UINT64_C(1) << (i & UINT64_C(63)))) != 0; @@ -124,6 +130,13 @@ void tinygraph_bitset_clear(tinygraph_bitset * const bitset) { } +const uint64_t * tinygraph_bitset_get_data(const tinygraph_bitset * const bitset) { + TINYGRAPH_ASSERT(bitset); + + return bitset->blocks; +} + + void tinygraph_bitset_not(tinygraph_bitset * const bitset) { TINYGRAPH_ASSERT(bitset); diff --git a/tinygraph/tinygraph-bitset.h b/tinygraph/tinygraph-bitset.h index 343d554..a3117c8 100644 --- a/tinygraph/tinygraph-bitset.h +++ b/tinygraph/tinygraph-bitset.h @@ -36,6 +36,9 @@ uint64_t tinygraph_bitset_get_size(tinygraph_bitset_const_s bitset); void tinygraph_bitset_clear(tinygraph_bitset_s bitset); +TINYGRAPH_WARN_UNUSED +const uint64_t * tinygraph_bitset_get_data(tinygraph_bitset_const_s bitset); + void tinygraph_bitset_not(tinygraph_bitset_s bitset); void tinygraph_bitset_and(tinygraph_bitset_s bitset, tinygraph_bitset_const_s other); void tinygraph_bitset_or(tinygraph_bitset_s bitset, tinygraph_bitset_const_s other); diff --git a/tinygraph/tinygraph-rs.c b/tinygraph/tinygraph-rs.c new file mode 100644 index 0000000..28d5ec5 --- /dev/null +++ b/tinygraph/tinygraph-rs.c @@ -0,0 +1,313 @@ +#include +#include +#include +#include + +#include "tinygraph-array.h" +#include "tinygraph-utils.h" +#include "tinygraph-bits.h" +#include "tinygraph-rs.h" + +// A succinct rank-select datastructure over a bitset. +// +// rank1(bits, n): sum of 1s in bits before position n +// +// select1(bits, n): position of nth 1 bit set in bits +// +// We implement a very simple rank-select datastructure +// with room for improvement in terms of space and runtime +// but the benefit is a relatively simple implementation. +// +// For rank: We make use of recent learnings from [1] and +// [3]: We use basic blocks of 512 bits (cache line size) +// and within these basic blocks make use of repeated +// popcnt instructions. This is similar to rank9 from [4] +// but without the 9-bit second level index and with the +// popcnt hardware instruction instead of broadword tricks. +// With a cache line aligned bitvector we should see a max +// of two cache misses: one for the basic block and then one +// for the remainder in the bitvector itself. Potential +// for improvements are listed in [1] and [2] e.g. the +// use of multi level rank inventories. +// +// For select: We make use of recent learnings from [1] and +// [3] where we sample every e.g. 8192'th bit set and from +// there use a scan over the rank inventory to speed up +// the search. The final cache line select works using +// the recent pdep hardware instruction. Note that worst +// case runtime is in O(n) for adversarily bitvector cases +// such as: a very large bit vector but only one bit set at +// the very start and one bit set at the very end. +// Potential for improvements are listed in [1] and [2]. +// In addition one idea could be tuning that 8192 constant +// e.g. based on the bitvector's load factor (how many bits +// are set) to make sure scanning ranks stays reasonable. +// The simple-select datastructure in [3] is especially +// tuned for select (does not rank) and seems to be strong +// in runtime but also needs more space. +// +// +// [1] SPIDER: Improved Succinct Rank and Select Performance +// https://arxiv.org/abs/2405.05214 +// +// [2] Engineering Compact Data Structures for Rank and Select Queries on Bit Vectors +// https://arxiv.org/abs/2206.01149 +// +// [3] A Fast x86 Implementation of Select +// https://arxiv.org/abs/1706.00990 +// +// [4] Broadword Implementation of Rank/Select Queries +// https://vigna.di.unimi.it/ftp/papers/Broadword.pdf + + +typedef struct tinygraph_rs { + + // a view onto the read-only bitset, max 2^32 bits supported + tinygraph_bitset_const_s bitset; + + // the pre-computed ranks for each cache line of size 512 bits + tinygraph_array_s ranks; + + // position of the 8192'th set bit, to speed up select search + tinygraph_array_s samples; + + // total bits set in bitset, mostly used for checks + uint32_t popcount; + +} tinygraph_rs; + + +tinygraph_rs* tinygraph_rs_construct(tinygraph_bitset_const_s bitset) { + TINYGRAPH_ASSERT(bitset); + TINYGRAPH_ASSERT(tinygraph_bitset_get_size(bitset) <= UINT32_MAX); + + // TODO: For now this only works with cache line aligned memory and + // a bitset the size of a cache line multiple; for internal use this + // is fine and makes our code simpler and more efficient. In case you + // want to re-use this code you'll need to handle the residuals on + // your own, especially all rank/select_512 functions need changing. + TINYGRAPH_ASSERT(tinygraph_bitset_get_size(bitset) % 512 == 0); + TINYGRAPH_ASSERT(((uintptr_t)tinygraph_bitset_get_data(bitset) % 64) == 0); + + tinygraph_rs *out = malloc(sizeof(tinygraph_rs)); + + if (!out) { + return NULL; + } + + // rank1 + + const uint64_t * const data = tinygraph_bitset_get_data(bitset); + const uint32_t n = tinygraph_bitset_get_size(bitset) / 512; + + tinygraph_array_s ranks = tinygraph_array_construct(n); + + if (!ranks) { + free(out); + + return NULL; + } + + tinygraph_array_s samples = tinygraph_array_construct(0); + + if (!samples) { + tinygraph_array_destruct(ranks); + free(out); + + return NULL; + } + + // Our data structure can be constructed in a single linear scan + // over the bitvector: for every block of 512 bits we store the + // computed rank up to this position in the ranks array. And for + // every 8192th bit set we store its position in the samples array. + // + // By iterating one cache line at a time we can make use of our + // highly tuned rank512 and select512 implementations. + + uint32_t rank = 0; + uint32_t sample = 0; + + for (uint32_t i = 0; i < n; ++i) { + const uint32_t count = tinygraph_bits_count_512(data + (i * 8)); + + rank += count; + tinygraph_array_set_at(ranks, i, rank); + + sample += count; + + if (sample >= 8192) { + const uint32_t p = tinygraph_bits_select_512(data + (i * 8), 8192 - (sample - count) - 1); + + const bool ok = tinygraph_array_push(samples, i * 512 + p); + + if (!ok) { + tinygraph_array_destruct(ranks); + tinygraph_array_destruct(samples); + free(out); + return NULL; + } + + sample = sample - 8192; + } + } + + // There might be a residual left smaller than a cache line + // of 512 bits. We take care of this case explicitly here + // for the select samples. + + const uint32_t r = tinygraph_bitset_get_size(bitset) % 512; + + if (r != 0) { + const uint32_t count = tinygraph_bits_rank_512(data + (n * 8), r); + + rank += count; + sample += count; + + if (sample >= 8192) { + const uint32_t p = tinygraph_bits_select_512(data + (n * 8), 8192 - (sample - count) - 1); + + const bool ok = tinygraph_array_push(samples, n * 512 + p); + + if (!ok) { + tinygraph_array_destruct(ranks); + tinygraph_array_destruct(samples); + free(out); + return NULL; + } + } + } + + *out = (tinygraph_rs) { + .bitset = bitset, + .ranks = ranks, + .samples = samples, + .popcount = rank, + }; + + return out; +} + + +void tinygraph_rs_destruct(tinygraph_rs * const rs) { + if (!rs) { + return; + } + + tinygraph_array_destruct(rs->ranks); + tinygraph_array_destruct(rs->samples); + + rs->ranks = NULL; + rs->samples = NULL; + + free(rs); +} + + +uint32_t tinygraph_rs_rank(const tinygraph_rs * const rs, uint32_t n) { + TINYGRAPH_ASSERT(rs); + TINYGRAPH_ASSERT(rs->bitset); + TINYGRAPH_ASSERT(rs->ranks); + TINYGRAPH_ASSERT(n <= tinygraph_bitset_get_size(rs->bitset)); + + if (n > tinygraph_bitset_get_size(rs->bitset)) { + TINYGRAPH_UNREACHABLE(); + } + + // We can compute rank from two parts: the pre-computed + // ranks for each cache line of size 512, and in case + // we rank inside a cache line adding the remaining + // residual from ranking the bits inside the cache line. + + const uint64_t * const data = tinygraph_bitset_get_data(rs->bitset); + + const uint32_t p = n / 512; + const uint32_t r = n % 512; + + if (p == 0) { + return tinygraph_bits_rank_512(data, r); + } else { + return tinygraph_array_get_at(rs->ranks, p - 1) + + tinygraph_bits_rank_512(data + (p * 8), r); + } +} + + +uint32_t tinygraph_rs_select(const tinygraph_rs * const rs, uint32_t n) { + TINYGRAPH_ASSERT(rs); + TINYGRAPH_ASSERT(rs->bitset); + TINYGRAPH_ASSERT(rs->ranks); + TINYGRAPH_ASSERT(rs->samples); + TINYGRAPH_ASSERT(n < rs->popcount); + + if (n >= rs->popcount) { + TINYGRAPH_UNREACHABLE(); + } + + // To select the nth set bit on the bitset + // + // 1. Use the samples array to find two positions + // where the nth bit has to be within. This range + // in general is unrestricted and in adversarial + // cases the range might span the whole bitset. + // That is why select is in O(n) but fast in + // practice and works fine for our use cases. + // + // 2. Use the ranks array to search for a block in + // the range where the nth bit must occur. + // + // 3. Within the 512 bit block where the nth bit + // must occur, rely on our highly tuned + // select512 implementations. + + const uint64_t * const data = tinygraph_bitset_get_data(rs->bitset); + + if (tinygraph_array_get_size(rs->ranks) == 0) { + return tinygraph_bits_select_512(data, n); + } + + // TODO: There is room for improvement when scanning + // over the ranks array e.g. using a different search. + // + // - We are running a linear scan over the ranks array + // but might benefit from a binary search or a more + // clever heuristic based on the distance + // + // - Even in our linear scan we could benefit from + // skipping multiple steps based on the knowledge + // that in a single block there can only ever be + // 512 bits set at best; if we search for the nth + // bit with n > 512 we can skip one block (repeat) + + const uint32_t p = n / 8192; + const uint32_t first = (p == 0) ? 0 : tinygraph_array_get_at(rs->samples, p - 1); + + uint32_t count = 0; + + for (uint32_t it = first / 512, last = tinygraph_array_get_size(rs->ranks); it < last; ++it) { + const uint32_t block = tinygraph_array_get_at(rs->ranks, it); + + if (block > n) { + return (it * 512) + tinygraph_bits_select_512(data + (it * 8), n - count); + } + + count = block; + } + + TINYGRAPH_ASSERT(false); + TINYGRAPH_UNREACHABLE(); + + return tinygraph_bitset_get_size(rs->bitset); +} + + +void tinygraph_rs_print_internal(const tinygraph_rs * const rs) { + TINYGRAPH_ASSERT(rs); + TINYGRAPH_ASSERT(rs->ranks); + TINYGRAPH_ASSERT(rs->samples); + + fprintf(stderr, "rs internals\n"); + + tinygraph_array_print_internal(rs->ranks); + tinygraph_array_print_internal(rs->samples); +} diff --git a/tinygraph/tinygraph-rs.h b/tinygraph/tinygraph-rs.h new file mode 100644 index 0000000..c706ce0 --- /dev/null +++ b/tinygraph/tinygraph-rs.h @@ -0,0 +1,33 @@ +#ifndef TINYGRAPH_RS_H +#define TINYGRAPH_RS_H + +#include + +#include "tinygraph-bitset.h" +#include "tinygraph-utils.h" + +/* + * Succinct rank-select data structure + * answering rank1 and select1 queries + * over a bitset. + */ + +typedef struct tinygraph_rs* tinygraph_rs_s; +typedef const struct tinygraph_rs* tinygraph_rs_const_s; + + +TINYGRAPH_WARN_UNUSED +tinygraph_rs_s tinygraph_rs_construct(tinygraph_bitset_const_s bitset); + +void tinygraph_rs_destruct(tinygraph_rs_s rs); + +TINYGRAPH_WARN_UNUSED +uint32_t tinygraph_rs_rank(tinygraph_rs_const_s rs, uint32_t n); + +TINYGRAPH_WARN_UNUSED +uint32_t tinygraph_rs_select(tinygraph_rs_const_s rs, uint32_t n); + +void tinygraph_rs_print_internal(tinygraph_rs_const_s rs); + + +#endif diff --git a/tinygraph/tinygraph-tests.c b/tinygraph/tinygraph-tests.c index 946bfad..2993085 100644 --- a/tinygraph/tinygraph-tests.c +++ b/tinygraph/tinygraph-tests.c @@ -15,6 +15,7 @@ #include "tinygraph-bits.h" #include "tinygraph-eliasfano.h" #include "tinygraph-align.h" +#include "tinygraph-rs.h" void test1(void) { @@ -891,6 +892,73 @@ void test25(void) { } } +void test26(void) { + tinygraph_bitset_s bits = tinygraph_bitset_construct(4096); // zeros + assert(bits); + + + tinygraph_rs_s rs1 = tinygraph_rs_construct(bits); + assert(rs1); + + for (uint32_t i = 0; i <= 4096; ++i) { + assert(tinygraph_rs_rank(rs1, i) == 0); + } + + tinygraph_rs_destruct(rs1); + + + tinygraph_bitset_not(bits); // ones + + + tinygraph_rs_s rs2 = tinygraph_rs_construct(bits); + assert(rs2); + + for (uint32_t i = 0; i <= 4096; ++i) { + assert(tinygraph_rs_rank(rs2, i) == i); + } + + tinygraph_rs_destruct(rs2); + + + tinygraph_bitset_destruct(bits); +} + + +void test27(void) { + tinygraph_bitset_s bits = tinygraph_bitset_construct(UINT32_C(1) << 15); // zeros + assert(bits); + + tinygraph_bitset_not(bits); // ones + + tinygraph_rs_s rs1 = tinygraph_rs_construct(bits); + assert(rs1); + + for (uint32_t i = 0; i < (UINT32_C(1) << 15); ++i) { + assert(tinygraph_rs_select(rs1, i) == i); + } + + tinygraph_rs_destruct(rs1); + tinygraph_bitset_destruct(bits); +} + + +void test28(void) { + tinygraph_bitset_s bits = tinygraph_bitset_construct(UINT32_C(4294966784)); + assert(bits); + + tinygraph_bitset_set_at(bits, 0); + tinygraph_bitset_set_at(bits, UINT32_C(4294966783)); + + tinygraph_rs_s rs1 = tinygraph_rs_construct(bits); + assert(rs1); + + assert(tinygraph_rs_select(rs1, 0) == 0); + assert(tinygraph_rs_select(rs1, 1) == UINT32_C(4294966783)); + + tinygraph_rs_destruct(rs1); + tinygraph_bitset_destruct(bits); +} + int main(void) { test1(); @@ -918,4 +986,7 @@ int main(void) { test23(); test24(); test25(); + test26(); + test27(); + test28(); }