diff --git a/Builds/VisualStudio/stellar-core.vcxproj b/Builds/VisualStudio/stellar-core.vcxproj index e9b43505cb..9573209abc 100644 --- a/Builds/VisualStudio/stellar-core.vcxproj +++ b/Builds/VisualStudio/stellar-core.vcxproj @@ -873,7 +873,6 @@ exit /b 0 - diff --git a/Builds/VisualStudio/stellar-core.vcxproj.filters b/Builds/VisualStudio/stellar-core.vcxproj.filters index d2174e0530..596e96c518 100644 --- a/Builds/VisualStudio/stellar-core.vcxproj.filters +++ b/Builds/VisualStudio/stellar-core.vcxproj.filters @@ -1754,9 +1754,6 @@ util\xdrquery - - lib - lib diff --git a/COPYING b/COPYING index 8d18079b1f..d68c3a3316 100644 --- a/COPYING +++ b/COPYING @@ -37,14 +37,14 @@ lib/asio: Licensed under the Boost Software License 1.0 (http://www.boost.org/LICENSE_1_0.txt) -lib/bloom_filter.hpp: +lib/binaryfusefilter.h: - The C++ Open Bloom Filter Library, - Copyright 2000 Arash Partow - (arash at partow dot net) - (https://www.partow.net/programming/bloomfilter/index.html) - Licensed under the MIT license - (http://opensource.org/licenses/MIT) + The FastFilter library, + Copyright 2023 Daniel Lemire + (daniel at lemire dot me) + () + Licensed under the Apache License, Version 2.0 + () lib/catch.hpp: diff --git a/Cargo.lock b/Cargo.lock index 3db2612ac2..5d8c975640 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -940,8 +940,8 @@ dependencies = [ [[package]] name = "soroban-builtin-sdk-macros" -version = "21.1.0" -source = "git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84" +version = "21.2.0" +source = "git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693" dependencies = [ "itertools", "proc-macro2", @@ -966,17 +966,17 @@ dependencies = [ [[package]] name = "soroban-env-common" -version = "21.1.0" -source = 
"git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84" +version = "21.2.0" +source = "git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693" dependencies = [ "crate-git-revision", "ethnum", "num-derive", "num-traits", - "soroban-env-macros 21.1.0", + "soroban-env-macros 21.2.0", "soroban-wasmi", "static_assertions", - "stellar-xdr 21.1.0", + "stellar-xdr 21.2.0", "tracy-client", "wasmparser", ] @@ -1008,8 +1008,8 @@ dependencies = [ [[package]] name = "soroban-env-host" -version = "21.1.0" -source = "git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84" +version = "21.2.0" +source = "git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693" dependencies = [ "curve25519-dalek", "ecdsa", @@ -1029,8 +1029,8 @@ dependencies = [ "sec1", "sha2", "sha3", - "soroban-builtin-sdk-macros 21.1.0", - "soroban-env-common 21.1.0", + "soroban-builtin-sdk-macros 21.2.0", + "soroban-env-common 21.2.0", "soroban-wasmi", "static_assertions", "stellar-strkey", @@ -1054,35 +1054,35 @@ dependencies = [ [[package]] name = "soroban-env-macros" -version = "21.1.0" -source = "git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84" +version = "21.2.0" +source = "git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693" dependencies = [ "itertools", "proc-macro2", "quote", "serde", "serde_json", - "stellar-xdr 21.1.0", + "stellar-xdr 21.2.0", "syn", ] [[package]] name = "soroban-synth-wasm" -version = "21.1.0" -source = 
"git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84" +version = "21.2.0" +source = "git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693" dependencies = [ "arbitrary", - "soroban-env-common 21.1.0", - "soroban-env-macros 21.1.0", - "stellar-xdr 21.1.0", + "soroban-env-common 21.2.0", + "soroban-env-macros 21.2.0", + "stellar-xdr 21.2.0", "wasm-encoder", "wasmparser", ] [[package]] name = "soroban-test-wasms" -version = "21.1.0" -source = "git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84" +version = "21.2.0" +source = "git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693" [[package]] name = "soroban-wasmi" @@ -1131,7 +1131,7 @@ dependencies = [ "rand", "rustc-simple-version", "soroban-env-host 20.3.0", - "soroban-env-host 21.1.0", + "soroban-env-host 21.2.0", "soroban-synth-wasm", "soroban-test-wasms", "toml", @@ -1163,9 +1163,8 @@ dependencies = [ [[package]] name = "stellar-xdr" -version = "21.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec43c9c5ae7ec7b6ac9e263b6d5b9e3781aa05ba3a1c05f6e70701c5c6600665" +version = "21.2.0" +source = "git+https://github.com/stellar/rs-stellar-xdr?rev=ae805d0f8c28ca86327a834eea9ce7d29b0a63bb#ae805d0f8c28ca86327a834eea9ce7d29b0a63bb" dependencies = [ "base64", "crate-git-revision", diff --git a/Cargo.toml b/Cargo.toml index b1a57b6b1b..b9a789b853 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ members = ["src/rust", "lib/tracy-client-sys"] [patch.crates-io] tracy-client-sys = { path = "lib/tracy-client-sys" } +stellar-xdr = { git = "https://github.com/stellar/rs-stellar-xdr", rev = "ae805d0f8c28ca86327a834eea9ce7d29b0a63bb" } 
#[patch."https://github.com/stellar/rs-soroban-env"] #soroban-env-common = { path = "../rs-soroban-env/soroban-env-common" } @@ -13,7 +14,6 @@ tracy-client-sys = { path = "lib/tracy-client-sys" } # stellar-xdr = { path = "../rs-stellar-xdr/" } # [patch.crates-io] -# stellar-xdr = { git = "https://github.com/stellar/rs-stellar-xdr", rev = "2775f4b6" } # wasmi = { package = "soroban-wasmi", git = "https://github.com/stellar/wasmi", rev = "862b32f5" } # soroban-env-common = { git = "https://github.com/stellar/rs-soroban-env", rev = "eda2ab70" } # soroban-native-sdk-macros = { git = "https://github.com/stellar/rs-soroban-env", rev = "eda2ab70" } diff --git a/lib/binaryfusefilter.h b/lib/binaryfusefilter.h new file mode 100644 index 0000000000..e7c59793e0 --- /dev/null +++ b/lib/binaryfusefilter.h @@ -0,0 +1,627 @@ +#ifndef BINARYFUSEFILTER_H +#define BINARYFUSEFILTER_H +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "util/siphash.h" +#include "xdr/Stellar-types.h" + +#include +#include + +typedef std::array binary_fuse_seed_t; + +#ifndef XOR_MAX_ITERATIONS +#define XOR_MAX_ITERATIONS \ + 1000000 // probability of success should always be > 0.5, so with this many + // iterations, it is essentially impossible to fail +#endif + +/** + * We start with a few utilities. + ***/ +static inline uint64_t +sip_hash24(uint64_t key, binary_fuse_seed_t const& seed) +{ + SipHash24 hasher(seed.data()); + hasher.update(reinterpret_cast(&key), sizeof(key)); + return hasher.digest(); +} +static inline uint64_t +binary_fuse_rotl64(uint64_t n, unsigned int c) +{ + return (n << (c & 63)) | (n >> ((-c) & 63)); +} +static inline uint32_t +binary_fuse_reduce(uint32_t hash, uint32_t n) +{ + // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + return (uint32_t)(((uint64_t)hash * n) >> 32); +} + +/** + * We need a decent random number generator. 
/**
 * SplitMix64 generator step.
 *
 * Advances the caller's state by a fixed odd increment and returns the mixed
 * 64-bit output. The state pointed to by `seed` is modified in place.
 */
static inline uint64_t
binary_fuse_rng_splitmix64(uint64_t* seed)
{
    *seed += UINT64_C(0x9E3779B97F4A7C15);
    uint64_t z = *seed;
    z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
    z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB);
    return z ^ (z >> 31);
}

// binary_fuse_mulhi(a, b) returns the upper 64 bits of the 128-bit product
// a * b. One implementation is selected per toolchain.
// #ifdefs adapted from:
// https://stackoverflow.com/a/50958815
#ifdef __SIZEOF_INT128__ // compilers supporting __uint128, e.g., gcc, clang
static inline uint64_t
binary_fuse_mulhi(uint64_t a, uint64_t b)
{
    return static_cast<uint64_t>((static_cast<__uint128_t>(a) * b) >> 64);
}
#elif defined(_M_X64) || defined(_M_ARM64) // MSVC
// NOTE: the guard previously tested `_MARM64`, which MSVC never defines; the
// documented macro is `_M_ARM64`. With the typo, ARM64 MSVC builds silently
// fell through to the portable implementation below.
// __umulh is an MSVC intrinsic (declared in <intrin.h>).
static inline uint64_t
binary_fuse_mulhi(uint64_t a, uint64_t b)
{
    return __umulh(a, b);
}
#elif defined(_M_IA64) // also MSVC
static inline uint64_t
binary_fuse_mulhi(uint64_t a, uint64_t b)
{
    unsigned __int64 hi;
    (void)_umul128(a, b, &hi);
    return hi;
}
#else // portable implementation using uint64_t
static inline uint64_t
binary_fuse_mulhi(uint64_t a, uint64_t b)
{
    // Adapted from:
    // https://stackoverflow.com/a/51587262

    /*
        This is implementing schoolbook multiplication:

                a1 a0
        X       b1 b0
        -------------
                   00  LOW PART
        -------------
                00
             10 10     MIDDLE PART
        +       01
        -------------
             01
        + 11 11        HIGH PART
        -------------
    */

    const uint64_t a0 = static_cast<uint32_t>(a);
    const uint64_t a1 = a >> 32;
    const uint64_t b0 = static_cast<uint32_t>(b);
    const uint64_t b1 = b >> 32;
    const uint64_t p11 = a1 * b1;
    const uint64_t p01 = a0 * b1;
    const uint64_t p10 = a1 * b0;
    const uint64_t p00 = a0 * b0;

    // 64-bit product + two 32-bit values
    const uint64_t middle = p10 + (p00 >> 32) + static_cast<uint32_t>(p01);

    /*
        Proof that 64-bit products can accumulate two more 32-bit values
        without overflowing:

        Max 32-bit value is 2^32 - 1.
        PSum = (2^32-1) * (2^32-1) + (2^32-1) + (2^32-1)
             = 2^64 - 2^32 - 2^32 + 1 + 2^32 - 1 + 2^32 - 1
             = 2^64 - 1
        Therefore the high half below cannot overflow regardless of input.
    */

    // high half
    return p11 + (middle >> 32) + (p01 >> 32);

    // low half (which we don't care about, but here it is)
    // (middle << 32) | (uint32_t) p00;
}
#endif

/**
 * Segment length (a power of two) for a filter of the given arity holding
 * `size` keys. Arities other than 3 and 4 get a fixed 65536.
 *
 * `size` is fed to log(), so callers must pass a value large enough for the
 * resulting shift amount to be non-negative (size == 0 is not meaningful).
 */
static inline uint32_t
binary_fuse_calculate_segment_length(uint32_t arity, uint32_t size)
{
    // These parameters are very sensitive. Replacing 'floor' by 'round' can
    // substantially affect the construction time.
    if (arity == 3)
    {
        const double exponent =
            std::floor(std::log(static_cast<double>(size)) / std::log(3.33) +
                       2.25);
        return static_cast<uint32_t>(1) << static_cast<int>(exponent);
    }
    if (arity == 4)
    {
        const double exponent =
            std::floor(std::log(static_cast<double>(size)) / std::log(2.91) -
                       0.5);
        return static_cast<uint32_t>(1) << static_cast<int>(exponent);
    }
    return 65536;
}

// Returns the larger of the two values (returns `a` when they compare equal).
static inline double
binary_fuse_max(double a, double b)
{
    return (a < b) ? b : a;
}

/**
 * Ratio of fingerprint slots to keys for the given arity and key count.
 * Arities other than 3 and 4 get a fixed factor of 2.0.
 *
 * The formula divides by log(size), so size == 1 yields an infinite factor.
 */
static inline double
binary_fuse_calculate_size_factor(uint32_t arity, uint32_t size)
{
    if (arity == 3)
    {
        return binary_fuse_max(
            1.125, 0.875 + 0.25 * std::log(1000000.0) /
                               std::log(static_cast<double>(size)));
    }
    if (arity == 4)
    {
        return binary_fuse_max(
            1.075, 0.77 + 0.305 * std::log(600000.0) /
                              std::log(static_cast<double>(size)));
    }
    return 2.0;
}

// Reduces x modulo 3 for inputs in [0, 5]: values above 2 have 3 subtracted
// once, everything else is returned unchanged.
static inline uint8_t
binary_fuse_mod3(uint8_t x)
{
    return x > 2 ? static_cast<uint8_t>(x - 3) : x;
}
x - 3 : x; +} + +////////////////// +// fuseT +////////////////// + +static inline uint64_t +binary_fuse_fingerprint(uint64_t hash) +{ + return hash ^ (hash >> 32); +} + +template class binary_fuse_t +{ + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v, + "Binary Fuse Filter only supports 8, 16, or 32 bit width"); + + private: + binary_fuse_seed_t Seed; + uint32_t SegmentLength; + uint32_t SegmentLengthMask; + uint32_t SegmentCount; + uint32_t SegmentCountLength; + uint32_t ArrayLength; + std::vector Fingerprints; + + struct binary_hashes_t + { + uint32_t h0; + uint32_t h1; + uint32_t h2; + }; + + binary_hashes_t + hash_batch(uint64_t hash) const + { + uint64_t hi = binary_fuse_mulhi(hash, SegmentCountLength); + binary_hashes_t ans; + ans.h0 = (uint32_t)hi; + ans.h1 = ans.h0 + SegmentLength; + ans.h2 = ans.h1 + SegmentLength; + ans.h1 ^= (uint32_t)(hash >> 18) & SegmentLengthMask; + ans.h2 ^= (uint32_t)(hash)&SegmentLengthMask; + return ans; + } + + uint32_t + binary_fuse_hash(int index, uint64_t hash) const + { + uint64_t h = binary_fuse_mulhi(hash, SegmentCountLength); + h += index * SegmentLength; + // keep the lower 36 bits + uint64_t hh = hash & ((1UL << 36) - 1); + // index 0: right shift by 36; index 1: right shift by 18; index 2: no + // shift + h ^= (size_t)((hh >> (36 - 18 * index)) & SegmentLengthMask); + return h; + } + + // Construct the filter, returns true on success, false on failure. + // The algorithm fails when there is insufficient memory. + // For best performance, the caller should ensure that there are not + // too many duplicated keys. + // While highly improbable, it is possible that the population fails, at + // which point the seed must be rotated. 
keys will be sorted and duplicates + // removed if any duplicate keys exist + [[nodiscard]] bool + populate(std::vector& keys, binary_fuse_seed_t rngSeed) + { + ZoneScoped; + if (keys.size() > std::numeric_limits::max()) + { + throw std::runtime_error("size should be at most 2^32"); + } + + uint32_t size = keys.size(); + ZoneValue(static_cast(size)); + + Seed = rngSeed; + + std::vector reverseOrder(size + 1); + uint32_t capacity = ArrayLength; + + std::vector alone(capacity); + std::vector t2count(capacity); + std::vector reverseH(size); + std::vector t2hash(capacity); + + uint32_t blockBits = 1; + while (((uint32_t)1 << blockBits) < SegmentCount) + { + blockBits += 1; + } + uint32_t block = ((uint32_t)1 << blockBits); + + std::vector startPos(1 << blockBits); + uint32_t h012[5]; + + reverseOrder[size] = 1; + for (int loop = 0; true; ++loop) + { + if (loop + 1 > XOR_MAX_ITERATIONS) + { + // The probability of this happening is lower than the + // the cosmic-ray probability (i.e., a cosmic ray corrupts your + // system). + return false; + } + + for (uint32_t i = 0; i < block; i++) + { + // important : i * size would overflow as a 32-bit number in + // some cases. 
+ startPos[i] = ((uint64_t)i * size) >> blockBits; + } + + uint64_t maskblock = block - 1; + for (uint32_t i = 0; i < size; i++) + { + uint64_t hash = sip_hash24(keys[i], Seed); + uint64_t segment_index = hash >> (64 - blockBits); + while (reverseOrder[startPos[segment_index]] != 0) + { + segment_index++; + segment_index &= maskblock; + } + reverseOrder[startPos[segment_index]] = hash; + startPos[segment_index]++; + } + int error = 0; + uint32_t duplicates = 0; + for (uint32_t i = 0; i < size; i++) + { + uint64_t hash = reverseOrder[i]; + uint32_t h0 = binary_fuse_hash(0, hash); + t2count[h0] += 4; + t2hash[h0] ^= hash; + uint32_t h1 = binary_fuse_hash(1, hash); + t2count[h1] += 4; + t2count[h1] ^= 1; + t2hash[h1] ^= hash; + uint32_t h2 = binary_fuse_hash(2, hash); + t2count[h2] += 4; + t2hash[h2] ^= hash; + t2count[h2] ^= 2; + if ((t2hash[h0] & t2hash[h1] & t2hash[h2]) == 0) + { + if (((t2hash[h0] == 0) && (t2count[h0] == 8)) || + ((t2hash[h1] == 0) && (t2count[h1] == 8)) || + ((t2hash[h2] == 0) && (t2count[h2] == 8))) + { + duplicates += 1; + t2count[h0] -= 4; + t2hash[h0] ^= hash; + t2count[h1] -= 4; + t2count[h1] ^= 1; + t2hash[h1] ^= hash; + t2count[h2] -= 4; + t2count[h2] ^= 2; + t2hash[h2] ^= hash; + } + } + error = (t2count[h0] < 4) ? 1 : error; + error = (t2count[h1] < 4) ? 1 : error; + error = (t2count[h2] < 4) ? 1 : error; + } + if (error) + { + std::fill(reverseOrder.begin(), reverseOrder.end(), 0); + std::fill(t2count.begin(), t2count.end(), 0); + std::fill(t2hash.begin(), t2hash.end(), 0); + + // Rotate seed deterministically + auto seedIndex = loop / crypto_shorthash_KEYBYTES; + Seed[seedIndex]++; + } + + // End of key addition + uint32_t Qsize = 0; + // Add sets with one key to the queue. + for (uint32_t i = 0; i < capacity; i++) + { + alone[Qsize] = i; + Qsize += ((t2count[i] >> 2) == 1) ? 
1 : 0; + } + uint32_t stacksize = 0; + while (Qsize > 0) + { + Qsize--; + uint32_t index = alone[Qsize]; + if ((t2count[index] >> 2) == 1) + { + uint64_t hash = t2hash[index]; + + // h012[0] = binary_fuse16_hash(0, hash); + h012[1] = binary_fuse_hash(1, hash); + h012[2] = binary_fuse_hash(2, hash); + h012[3] = binary_fuse_hash(0, hash); // == h012[0]; + h012[4] = h012[1]; + uint8_t found = t2count[index] & 3; + reverseH[stacksize] = found; + reverseOrder[stacksize] = hash; + stacksize++; + uint32_t other_index1 = h012[found + 1]; + alone[Qsize] = other_index1; + Qsize += ((t2count[other_index1] >> 2) == 2 ? 1 : 0); + + t2count[other_index1] -= 4; + t2count[other_index1] ^= binary_fuse_mod3(found + 1); + t2hash[other_index1] ^= hash; + + uint32_t other_index2 = h012[found + 2]; + alone[Qsize] = other_index2; + Qsize += ((t2count[other_index2] >> 2) == 2 ? 1 : 0); + t2count[other_index2] -= 4; + t2count[other_index2] ^= binary_fuse_mod3(found + 2); + t2hash[other_index2] ^= hash; + } + } + if (stacksize + duplicates == size) + { + // success + size = stacksize; + break; + } + else if (duplicates > 0) + { + // Sort keys and remove duplicates + std::sort(keys.begin(), keys.end()); + keys.erase(std::unique(keys.begin(), keys.end()), keys.end()); + size = keys.size(); + } + + // Reset everything except for the last entry in reverseOrder + std::fill_n(reverseOrder.begin(), size, 0); + std::fill(t2count.begin(), t2count.end(), 0); + std::fill(t2hash.begin(), t2hash.end(), 0); + + // Rotate seed deterministically + auto seedIndex = loop / crypto_shorthash_KEYBYTES; + Seed[seedIndex]++; + } + + for (uint32_t i = size - 1; i < size; i--) + { + // the hash of the key we insert next + uint64_t hash = reverseOrder[i]; + T xor2 = binary_fuse_fingerprint(hash); + uint8_t found = reverseH[i]; + h012[0] = binary_fuse_hash(0, hash); + h012[1] = binary_fuse_hash(1, hash); + h012[2] = binary_fuse_hash(2, hash); + h012[3] = h012[0]; + h012[4] = h012[1]; + Fingerprints[h012[found]] = 
xor2 ^ Fingerprints[h012[found + 1]] ^ + Fingerprints[h012[found + 2]]; + } + + return true; + } + + public: + // allocate enough capacity for a set containing up to 'size' elements + // size should be at least 2. + explicit binary_fuse_t(uint32_t size, std::vector& keys, + binary_fuse_seed_t rngSeed) + { + if (size < 2) + { + throw std::runtime_error("size should be at least 2"); + } + + uint32_t arity = 3; + SegmentLength = binary_fuse_calculate_segment_length(arity, size); + if (SegmentLength > 262144) + { + SegmentLength = 262144; + } + SegmentLengthMask = SegmentLength - 1; + double sizeFactor = binary_fuse_calculate_size_factor(arity, size); + uint32_t capacity = (uint32_t)(round((double)size * sizeFactor)); + uint32_t initSegmentCount = + (capacity + SegmentLength - 1) / SegmentLength - (arity - 1); + ArrayLength = (initSegmentCount + arity - 1) * SegmentLength; + SegmentCount = (ArrayLength + SegmentLength - 1) / SegmentLength; + if (SegmentCount <= arity - 1) + { + SegmentCount = 1; + } + else + { + SegmentCount = SegmentCount - (arity - 1); + } + ArrayLength = (SegmentCount + arity - 1) * SegmentLength; + SegmentCountLength = SegmentCount * SegmentLength; + Fingerprints.resize(ArrayLength); + + if (!populate(keys, rngSeed)) + { + throw std::runtime_error("BinaryFuseFilter failed to populate"); + } + } + + explicit binary_fuse_t(stellar::SerializedBinaryFuseFilter const& xdrFilter) + : SegmentLength(xdrFilter.segmentLength) + , SegmentLengthMask(xdrFilter.segementLengthMask) + , SegmentCount(xdrFilter.segmentCount) + , SegmentCountLength(xdrFilter.segmentCountLength) + , ArrayLength(xdrFilter.fingerprintLength) + { + std::copy(xdrFilter.filterSeed.seed.begin(), + xdrFilter.filterSeed.seed.end(), Seed.begin()); + + // Convert vector to vector + Fingerprints.reserve(ArrayLength); + for (size_t elem = 0; elem < ArrayLength; ++elem) + { + T value = 0; + auto pos = elem * sizeof(T); + + for (auto byte_i = 0; byte_i < sizeof(T); ++byte_i) + { + value |= 
static_cast(xdrFilter.fingerprints[pos + byte_i]) + << (byte_i * 8); + } + + Fingerprints.push_back(value); + } + } + + // Report if the key is in the set, with false positive rate. + bool + contain(uint64_t key) const + { + ZoneScoped; + uint64_t hash = sip_hash24(key, Seed); + T f = binary_fuse_fingerprint(hash); + binary_hashes_t hashes = hash_batch(hash); + f ^= Fingerprints[hashes.h0] ^ Fingerprints[hashes.h1] ^ + Fingerprints[hashes.h2]; + return f == 0; + } + + // report memory usage + size_t + size_in_bytes() const + { + return ArrayLength * sizeof(T) + sizeof(*this); + } + + void + copyTo(stellar::SerializedBinaryFuseFilter& xdrFilter) const + { + if constexpr (std::is_same::value) + { + xdrFilter.type = stellar::BINARY_FUSE_FILTER_8_BIT; + } + else if constexpr (std::is_same::value) + { + xdrFilter.type = stellar::BINARY_FUSE_FILTER_16_BIT; + } + else if constexpr (std::is_same::value) + { + xdrFilter.type = stellar::BINARY_FUSE_FILTER_32_BIT; + } + else + { + static_assert(!sizeof(T), "Invalid BinaryFuseFilter type"); + } + + std::copy(Seed.begin(), Seed.end(), xdrFilter.filterSeed.seed.begin()); + xdrFilter.segmentLength = SegmentLength; + xdrFilter.segementLengthMask = SegmentLengthMask; + xdrFilter.segmentCount = SegmentCount; + xdrFilter.segmentCountLength = SegmentCountLength; + xdrFilter.fingerprintLength = ArrayLength; + + // We need to convert the in-memory vector into a vector + xdrFilter.fingerprints.reserve(ArrayLength * sizeof(T)); + for (T f : Fingerprints) + { + for (size_t byte_i = 0; byte_i < sizeof(T); ++byte_i) + { + xdrFilter.fingerprints.push_back((f >> (byte_i * 8)) & 0xFF); + } + } + } + + bool + operator==(binary_fuse_t const& other) const + { + return SegmentLength == other.SegmentLength && + SegmentLengthMask == other.SegmentLengthMask && + SegmentCount == other.SegmentCount && + SegmentCountLength == other.SegmentCountLength && + ArrayLength == other.ArrayLength && Seed == other.Seed && + Fingerprints == other.Fingerprints; + } 
+}; + +template struct binary_fuse_create_return +{ + using type = T; +}; + +template +typename binary_fuse_create_return::type +binary_fuse_create_from_serialized_xdr( + stellar::SerializedBinaryFuseFilter const& xdrFilter) +{ + if constexpr (std::is_same::value) + { + return binary_fuse_t(xdrFilter); + } + else if constexpr (std::is_same::value) + { + return binary_fuse_t(xdrFilter); + } + else if constexpr (std::is_same::value) + { + return binary_fuse_t(xdrFilter); + } + else + { + static_assert(!sizeof(T), "Invalid BinaryFuseFilter type"); + } +} + +#endif \ No newline at end of file diff --git a/lib/bloom_filter.hpp b/lib/bloom_filter.hpp deleted file mode 100644 index 7ad8fec20d..0000000000 --- a/lib/bloom_filter.hpp +++ /dev/null @@ -1,645 +0,0 @@ -/* - ********************************************************************* - * * - * Open Bloom Filter * - * * - * Author: Arash Partow - 2000 * - * URL: http://www.partow.net * - * URL: http://www.partow.net/programming/hashfunctions/index.html * - * * - * Copyright notice: * - * Free use of the Open Bloom Filter Library is permitted under the * - * guidelines and in accordance with the MIT License. 
* - * http://www.opensource.org/licenses/MIT * - * * - ********************************************************************* - */ - -#ifndef INCLUDE_BLOOM_FILTER_HPP -#define INCLUDE_BLOOM_FILTER_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "util/siphash.h" -#include -#include -#include -#include - -static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) - -static const unsigned char bit_mask[bits_per_char] = { - 0x01, // 00000001 - 0x02, // 00000010 - 0x04, // 00000100 - 0x08, // 00001000 - 0x10, // 00010000 - 0x20, // 00100000 - 0x40, // 01000000 - 0x80 // 10000000 -}; - -class bloom_parameters -{ - public: - typedef std::array rand_t; - - bloom_parameters() - : minimum_size(1) - , maximum_size(std::numeric_limits::max()) - , minimum_number_of_hashes(1) - , maximum_number_of_hashes(std::numeric_limits::max()) - , projected_element_count(10000) - , false_positive_probability(1.0 / projected_element_count) - , random_seed() - { - } - - virtual ~bloom_parameters() - { - } - - inline bool - operator!() - { - return (minimum_size > maximum_size) || - (minimum_number_of_hashes > maximum_number_of_hashes) || - (minimum_number_of_hashes < 1) || - (0 == maximum_number_of_hashes) || - (0 == projected_element_count) || - (false_positive_probability < 0.0) || - (std::numeric_limits::infinity() == - std::abs(false_positive_probability)) || - (random_seed == rand_t{}); - } - - // Allowable min/max size of the bloom filter in bits - unsigned long long int minimum_size; - unsigned long long int maximum_size; - - // Allowable min/max number of hash functions - unsigned int minimum_number_of_hashes; - unsigned int maximum_number_of_hashes; - - // The approximate number of elements to be inserted - // into the bloom filter, should be within one order - // of magnitude. The default is 10000. 
- unsigned long long int projected_element_count; - - // The approximate false positive probability expected - // from the bloom filter. The default is assumed to be - // the reciprocal of the projected_element_count. - double false_positive_probability; - - rand_t random_seed; - - struct optimal_parameters_t - { - optimal_parameters_t() : number_of_hashes(0), table_size(0) - { - } - - unsigned int number_of_hashes; - unsigned long long int table_size; - }; - - optimal_parameters_t optimal_parameters; - - virtual bool - compute_optimal_parameters() - { - /* - Note: - The following will attempt to find the number of hash functions - and minimum amount of storage bits required to construct a bloom - filter consistent with the user defined false positive probability - and estimated element insertion count. - */ - - if (!(*this)) - return false; - - double min_m = std::numeric_limits::infinity(); - double min_k = 0.0; - double k = 1.0; - - while (k < 1000.0) - { - const double numerator = (-k * projected_element_count); - const double denominator = - std::log(1.0 - std::pow(false_positive_probability, 1.0 / k)); - - const double curr_m = numerator / denominator; - - if (curr_m < min_m) - { - min_m = curr_m; - min_k = k; - } - - k += 1.0; - } - - optimal_parameters_t& optp = optimal_parameters; - - optp.number_of_hashes = static_cast(min_k); - - optp.table_size = static_cast(min_m); - - optp.table_size += - (((optp.table_size % bits_per_char) != 0) - ? 
(bits_per_char - (optp.table_size % bits_per_char)) - : 0); - - if (optp.number_of_hashes < minimum_number_of_hashes) - optp.number_of_hashes = minimum_number_of_hashes; - else if (optp.number_of_hashes > maximum_number_of_hashes) - optp.number_of_hashes = maximum_number_of_hashes; - - if (optp.table_size < minimum_size) - optp.table_size = minimum_size; - else if (optp.table_size > maximum_size) - optp.table_size = maximum_size; - - return true; - } -}; - -class bloom_filter -{ - protected: - typedef unsigned int bloom_type; - typedef unsigned char cell_type; - typedef std::vector table_type; - - public: - bloom_filter() - : hash_count_(0) - , table_size_(0) - , projected_element_count_(0) - , inserted_element_count_(0) - , random_seed_() - , desired_false_positive_probability_(0.0) - { - } - - bloom_filter(const bloom_parameters& p) - : projected_element_count_(p.projected_element_count) - , inserted_element_count_(0) - , random_seed_(p.random_seed) - , desired_false_positive_probability_(p.false_positive_probability) - { - hash_count_ = p.optimal_parameters.number_of_hashes; - table_size_ = p.optimal_parameters.table_size; - - bit_table_.resize(table_size_ / bits_per_char, - static_cast(0x00)); - } - - bloom_filter(const bloom_filter& filter) - { - this->operator=(filter); - } - - inline bool - operator==(const bloom_filter& f) const - { - if (this != &f) - { - return (hash_count_ == f.hash_count_) && - (table_size_ == f.table_size_) && - (bit_table_.size() == f.bit_table_.size()) && - (projected_element_count_ == f.projected_element_count_) && - (inserted_element_count_ == f.inserted_element_count_) && - (random_seed_ == f.random_seed_) && - (desired_false_positive_probability_ == - f.desired_false_positive_probability_) && - (bit_table_ == f.bit_table_); - } - else - return true; - } - - inline bool - operator!=(const bloom_filter& f) const - { - return !operator==(f); - } - - inline bloom_filter& - operator=(const bloom_filter& f) - { - if (this != &f) - { - 
hash_count_ = f.hash_count_; - table_size_ = f.table_size_; - bit_table_ = f.bit_table_; - - projected_element_count_ = f.projected_element_count_; - inserted_element_count_ = f.inserted_element_count_; - - random_seed_ = f.random_seed_; - - desired_false_positive_probability_ = - f.desired_false_positive_probability_; - } - - return *this; - } - - virtual ~bloom_filter() - { - } - - inline bool - operator!() const - { - return (0 == table_size_); - } - - inline void - clear() - { - std::fill(bit_table_.begin(), bit_table_.end(), - static_cast(0x00)); - inserted_element_count_ = 0; - } - - inline void - insert(const unsigned char* key_begin, const std::size_t& length) - { - std::size_t bit_index = 0; - std::size_t bit = 0; - - for (auto i = 0u; i < hash_count_; ++i) - { - compute_indices(hash_ap(key_begin, length, i), bit_index, bit); - - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; - } - - ++inserted_element_count_; - } - - template - inline void - insert(const T& t) - { - // Note: T must be a C++ POD type. 
- insert(reinterpret_cast(&t), sizeof(T)); - } - - inline void - insert(const std::string& key) - { - insert(reinterpret_cast(key.data()), key.size()); - } - - inline void - insert(const char* data, const std::size_t& length) - { - insert(reinterpret_cast(data), length); - } - - template - inline void - insert(const InputIterator begin, const InputIterator end) - { - InputIterator itr = begin; - - while (end != itr) - { - insert(*(itr++)); - } - } - - inline virtual bool - contains(const unsigned char* key_begin, const std::size_t length) const - { - ZoneScoped; - std::size_t bit_index = 0; - std::size_t bit = 0; - - for (auto i = 0u; i < hash_count_; ++i) - { - compute_indices(hash_ap(key_begin, length, i), bit_index, bit); - - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != - bit_mask[bit]) - { - return false; - } - } - - return true; - } - - template - inline bool - contains(const T& t) const - { - return contains(reinterpret_cast(&t), - static_cast(sizeof(T))); - } - - inline bool - contains(const std::string& key) const - { - return contains(reinterpret_cast(key.c_str()), - key.size()); - } - - inline bool - contains(const char* data, const std::size_t& length) const - { - return contains(reinterpret_cast(data), length); - } - - template - inline InputIterator - contains_all(const InputIterator begin, const InputIterator end) const - { - InputIterator itr = begin; - - while (end != itr) - { - if (!contains(*itr)) - { - return itr; - } - - ++itr; - } - - return end; - } - - template - inline InputIterator - contains_none(const InputIterator begin, const InputIterator end) const - { - InputIterator itr = begin; - - while (end != itr) - { - if (contains(*itr)) - { - return itr; - } - - ++itr; - } - - return end; - } - - inline virtual unsigned long long int - size() const - { - return table_size_; - } - - inline unsigned long long int - element_count() const - { - return inserted_element_count_; - } - - inline double - effective_fpp() const - { - 
/* - Note: - The effective false positive probability is calculated using the - designated table size and hash function count in conjunction with - the current number of inserted elements - not the user defined - predicated/expected number of inserted elements. - */ - return std::pow(1.0 - std::exp(-1.0 * hash_count_ * - inserted_element_count_ / size()), - 1.0 * hash_count_); - } - - inline bloom_filter& - operator&=(const bloom_filter& f) - { - /* intersection */ - if ((hash_count_ == f.hash_count_) && (table_size_ == f.table_size_) && - (random_seed_ == f.random_seed_)) - { - for (std::size_t i = 0; i < bit_table_.size(); ++i) - { - bit_table_[i] &= f.bit_table_[i]; - } - } - - return *this; - } - - inline bloom_filter& - operator|=(const bloom_filter& f) - { - /* union */ - if ((hash_count_ == f.hash_count_) && (table_size_ == f.table_size_) && - (random_seed_ == f.random_seed_)) - { - for (std::size_t i = 0; i < bit_table_.size(); ++i) - { - bit_table_[i] |= f.bit_table_[i]; - } - } - - return *this; - } - - inline bloom_filter& - operator^=(const bloom_filter& f) - { - /* difference */ - if ((hash_count_ == f.hash_count_) && (table_size_ == f.table_size_) && - (random_seed_ == f.random_seed_)) - { - for (std::size_t i = 0; i < bit_table_.size(); ++i) - { - bit_table_[i] ^= f.bit_table_[i]; - } - } - - return *this; - } - - inline const cell_type* - table() const - { - return bit_table_.data(); - } - - inline std::size_t - hash_count() - { - return hash_count_; - } - - template - void - serialize(Archive& ar) - { - ar(bit_table_, hash_count_, table_size_, projected_element_count_, - inserted_element_count_, random_seed_, - desired_false_positive_probability_); - } - - protected: - inline virtual void - compute_indices(const bloom_type& hash, std::size_t& bit_index, - std::size_t& bit) const - { - bit_index = hash % table_size_; - bit = bit_index % bits_per_char; - } - - inline bloom_type - hash_ap(const unsigned char* begin, std::size_t remaining_length, - 
bloom_type hash_num) const - { - SipHash24 sh(random_seed_.data()); - sh.update(reinterpret_cast(&hash_num), sizeof(hash_num)); - sh.update(begin, remaining_length); - return static_cast(sh.digest() & - std::numeric_limits::max()); - } - - std::vector bit_table_; - unsigned int hash_count_; - unsigned long long int table_size_; - unsigned long long int projected_element_count_; - unsigned long long int inserted_element_count_; - bloom_parameters::rand_t random_seed_; - double desired_false_positive_probability_; -}; - -inline bloom_filter -operator&(const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result &= b; - return result; -} - -inline bloom_filter -operator|(const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result |= b; - return result; -} - -inline bloom_filter -operator^(const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result ^= b; - return result; -} - -class compressible_bloom_filter : public bloom_filter -{ - public: - compressible_bloom_filter(const bloom_parameters& p) : bloom_filter(p) - { - size_list.push_back(table_size_); - } - - inline unsigned long long int - size() const - { - return size_list.back(); - } - - inline bool - compress(const double& percentage) - { - if ((percentage < 0.0) || (percentage >= 100.0)) - { - return false; - } - - unsigned long long int original_table_size = size_list.back(); - unsigned long long int new_table_size = - static_cast( - (size_list.back() * (1.0 - (percentage / 100.0)))); - - new_table_size -= new_table_size % bits_per_char; - - if ((bits_per_char > new_table_size) || - (new_table_size >= original_table_size)) - { - return false; - } - - desired_false_positive_probability_ = effective_fpp(); - - const unsigned long long int new_tbl_raw_size = - new_table_size / bits_per_char; - - table_type tmp(new_tbl_raw_size); - - std::copy(bit_table_.begin(), bit_table_.begin() + new_tbl_raw_size, - tmp.begin()); - - typedef 
table_type::iterator itr_t; - - itr_t itr = bit_table_.begin() + (new_table_size / bits_per_char); - itr_t end = bit_table_.begin() + (original_table_size / bits_per_char); - itr_t itr_tmp = tmp.begin(); - - while (end != itr) - { - *(itr_tmp++) |= (*itr++); - } - - std::swap(bit_table_, tmp); - - size_list.push_back(new_table_size); - - return true; - } - - private: - inline void - compute_indices(const bloom_type& hash, std::size_t& bit_index, - std::size_t& bit) const - { - bit_index = hash; - - for (std::size_t i = 0; i < size_list.size(); ++i) - { - bit_index %= size_list[i]; - } - - bit = bit_index % bits_per_char; - } - - std::vector size_list; -}; - -#endif - -/* - Note 1: - If it can be guaranteed that bits_per_char will be of the form 2^n then - the following optimization can be used: - - bit_table_[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; - - Note 2: - For performance reasons where possible when allocating memory it should - be aligned (aligned_alloc) according to the architecture being used. 
-*/ \ No newline at end of file diff --git a/src/bucket/BucketIndex.h b/src/bucket/BucketIndex.h index 91aa53fa6c..3509fd7dd2 100644 --- a/src/bucket/BucketIndex.h +++ b/src/bucket/BucketIndex.h @@ -74,7 +74,7 @@ class BucketIndex : public NonMovableOrCopyable IndividualIndex::const_iterator>; inline static const std::string DB_BACKEND_STATE = "bl"; - inline static const uint32_t BUCKET_INDEX_VERSION = 2; + inline static const uint32_t BUCKET_INDEX_VERSION = 3; // Returns true if LedgerEntryType not supported by BucketListDB static bool typeNotSupported(LedgerEntryType t); diff --git a/src/bucket/BucketIndexImpl.cpp b/src/bucket/BucketIndexImpl.cpp index f28c05a665..c879cfa6ee 100644 --- a/src/bucket/BucketIndexImpl.cpp +++ b/src/bucket/BucketIndexImpl.cpp @@ -5,15 +5,15 @@ #include "bucket/BucketIndexImpl.h" #include "bucket/Bucket.h" #include "bucket/BucketManager.h" +#include "crypto/Hex.h" #include "crypto/ShortHash.h" #include "main/Config.h" +#include "util/BinaryFuseFilter.h" #include "util/Fs.h" #include "util/LogSlowExecution.h" #include "util/Logging.h" -#include "util/XDRCereal.h" #include "util/XDRStream.h" -#include "lib/bloom_filter.hpp" #include #include #include @@ -22,7 +22,9 @@ #include #include +#include #include +#include namespace stellar { @@ -77,45 +79,13 @@ BucketIndexImpl::BucketIndexImpl(BucketManager& bm, auto timer = LogSlowExecution("Indexing bucket"); mData.pageSize = pageSize; - size_t const estimatedLedgerEntrySize = - xdr::xdr_traits::serial_size(BucketEntry{}); - auto fileSize = fs::size(filename.string()); - auto estimatedNumElems = fileSize / estimatedLedgerEntrySize; - - // Initialize bloom filter for range index + // We don't have a good way of estimating IndividualIndex size since + // keys are variable size, so only reserve range indexes since we know + // the page size ahead of time if constexpr (std::is_same::value) { - ZoneNamedN(bloomInit, "bloomInit", true); - bloom_parameters params; - params.projected_element_count = 
estimatedNumElems; - - // Our target false positive rate is 0.1% even though we set the - // bloom filter false positive rate to 0.05%. We do this because our - // entry count estimation can be an underestimation (we assume every - // BucketEntry is an account LiveEntry, but TTL and DEADENTRY are - // smaller). If we gave a larger entry count estimate, the size of - // our bloom filter would significantly increase. Instead, by - // setting the desired false positive rate to 0.05%, the bloom - // filter size stays approximately the same and we give ourselves an - // additional 10% of wiggle room on the estimation. - params.false_positive_probability = 0.0005; // 0.05% - - params.random_seed = shortHash::getShortHashInitKey(); - params.compute_optimal_parameters(); - mData.filter = std::make_unique(params); + auto fileSize = fs::size(filename); auto estimatedIndexEntries = fileSize / mData.pageSize; - CLOG_DEBUG( - Bucket, - "Bloom filter initialized with params: projected element count " - "{} false positive probability: {}, number of hashes: {}, " - "table size: {}", - params.projected_element_count, - params.false_positive_probability, - params.optimal_parameters.number_of_hashes, - params.optimal_parameters.table_size); - - // We don't have a good way of estimating IndividualIndex size, so - // only reserve range indexes mData.keysToOffset.reserve(estimatedIndexEntries); } @@ -126,6 +96,10 @@ BucketIndexImpl::BucketIndexImpl(BucketManager& bm, BucketEntry be; size_t iter = 0; size_t count = 0; + + std::vector keyHashes; + auto seed = shortHash::getShortHashInitKey(); + while (in && in.readOne(be)) { // peridocially check if bucket manager is exiting to stop indexing @@ -173,6 +147,11 @@ BucketIndexImpl::BucketIndexImpl(BucketManager& bm, if constexpr (std::is_same::value) { + auto keyBuf = xdr::xdr_to_opaque(key); + SipHash24 hasher(seed.data()); + hasher.update(keyBuf.data(), keyBuf.size()); + keyHashes.emplace_back(hasher.digest()); + if (pos >= pageUpperBound) 
{ pageUpperBound = @@ -186,9 +165,6 @@ BucketIndexImpl::BucketIndexImpl(BucketManager& bm, releaseAssert(rangeEntry.upperBound < key); rangeEntry.upperBound = key; } - - auto keybuf = xdr::xdr_to_opaque(key); - mData.filter->insert(keybuf.data(), keybuf.size()); } else { @@ -199,6 +175,16 @@ BucketIndexImpl::BucketIndexImpl(BucketManager& bm, pos = in.pos(); } + if constexpr (std::is_same::value) + { + // Binary Fuse filter requires at least 2 elements + if (keyHashes.size() > 1) + { + mData.filter = + std::make_unique(keyHashes, seed); + } + } + CLOG_DEBUG(Bucket, "Indexed {} positions in {}", mData.keysToOffset.size(), filename.filename()); ZoneValue(static_cast(count)); @@ -433,9 +419,7 @@ BucketIndexImpl::scan(Iterator start, LedgerKey const& k) const // If the key is not in the bloom filter or in the lower bounded index // entry, return nullopt markBloomLookup(); - auto keybuf = xdr::xdr_to_opaque(k); - if ((mData.filter && - !mData.filter->contains(keybuf.data(), keybuf.size())) || + if ((mData.filter && !mData.filter->contains(k)) || keyIter == mData.keysToOffset.end() || keyNotInIndexEntry(k, keyIter->first)) { @@ -546,7 +530,7 @@ BucketIndexImpl::operator==(BucketIndex const& inRaw) const { releaseAssert(mData.filter); releaseAssert(in.mData.filter); - if (*(mData.filter) != *(in.mData.filter)) + if (!(*(mData.filter) == *(in.mData.filter))) { return false; } diff --git a/src/bucket/BucketIndexImpl.h b/src/bucket/BucketIndexImpl.h index b265582fc7..51bd562342 100644 --- a/src/bucket/BucketIndexImpl.h +++ b/src/bucket/BucketIndexImpl.h @@ -6,11 +6,12 @@ #include "bucket/BucketIndex.h" #include "medida/meter.h" +#include "util/BinaryFuseFilter.h" +#include "xdr/Stellar-types.h" #include #include - -class bloom_filter; +#include namespace stellar { @@ -30,7 +31,7 @@ template class BucketIndexImpl : public BucketIndex { IndexT keysToOffset{}; std::streamoff pageSize{}; - std::unique_ptr filter{}; + std::unique_ptr filter{}; std::map> assetToPoolID{}; template 
diff --git a/src/bucket/test/BucketIndexTests.cpp b/src/bucket/test/BucketIndexTests.cpp index 35e9d64c8c..7ca0737644 100644 --- a/src/bucket/test/BucketIndexTests.cpp +++ b/src/bucket/test/BucketIndexTests.cpp @@ -5,10 +5,10 @@ // This file contains tests for the BucketIndex and higher-level operations // concerning key-value lookup based on the BucketList. -#include "bucket/BucketIndexImpl.h" #include "bucket/BucketList.h" #include "bucket/BucketListSnapshot.h" #include "bucket/BucketManager.h" +#include "bucket/BucketSnapshotManager.h" #include "bucket/test/BucketTestUtils.h" #include "ledger/test/LedgerTestUtils.h" #include "lib/catch.hpp" @@ -16,10 +16,6 @@ #include "main/Config.h" #include "test/test.h" -#include "lib/bloom_filter.hpp" - -#include "util/XDRCereal.h" - using namespace stellar; using namespace BucketTestUtils; @@ -541,7 +537,7 @@ TEST_CASE("ContractData key with same ScVal", "[bucket][bucketindex]") testAllIndexTypes(f); } -TEST_CASE("serialize bucket indexes", "[bucket][bucketindex][!hide]") +TEST_CASE("serialize bucket indexes", "[bucket][bucketindex]") { Config cfg(getTestConfig(0, Config::TESTDB_ON_DISK_SQLITE)); @@ -549,12 +545,13 @@ TEST_CASE("serialize bucket indexes", "[bucket][bucketindex][!hide]") cfg.BUCKETLIST_DB_INDEX_CUTOFF = 0; cfg.DEPRECATED_SQL_LEDGER_STATE = false; cfg.BUCKETLIST_DB_PERSIST_INDEX = true; + cfg.INVARIANT_CHECKS = {}; // Node is not a validator, so indexes will persist cfg.NODE_IS_VALIDATOR = false; cfg.FORCE_SCP = false; - auto test = BucketIndexTest(cfg); + auto test = BucketIndexTest(cfg, /*levels=*/3); test.buildGeneralTest(); auto buckets = test.getBM().getBucketListReferencedBuckets(); diff --git a/src/protocol-curr/xdr b/src/protocol-curr/xdr index 70180d5e8d..78ef986077 160000 --- a/src/protocol-curr/xdr +++ b/src/protocol-curr/xdr @@ -1 +1 @@ -Subproject commit 70180d5e8d9caee9e8645ed8a38c36a8cf403cd9 +Subproject commit 78ef9860777bd53e75a8ce8b978131cade26b321 diff --git a/src/protocol-next/xdr 
b/src/protocol-next/xdr index 38629ee46f..c3f1a272dc 160000 --- a/src/protocol-next/xdr +++ b/src/protocol-next/xdr @@ -1 +1 @@ -Subproject commit 38629ee46fbc7dfb034e00c38b2ddb954c6dd41d +Subproject commit c3f1a272dcc9351fe23891c2857414f7befa8831 diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 4cd45d8f57..345c338679 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -33,10 +33,10 @@ itertools = "=0.11.0" # N. [dependencies.soroban-env-host-curr] -version = "=21.1.0" +version = "=21.2.0" git = "https://github.com/stellar/rs-soroban-env" package = "soroban-env-host" -rev = "8d76e4037417b80dc25c979cc44b8e1281e8ad84" +rev = "8809852dcf8489f99407a5ceac12625ee3d14693" # This copy of the soroban host is _optional_ and only enabled during protocol # transitions. When transitioning from protocol N to N+1, the `curr` copy @@ -63,14 +63,14 @@ package = "soroban-env-host" rev = "93120b6b32cd910fcc224cbf6aec1333f771a8bc" [dependencies.soroban-test-wasms] -version = "=21.1.0" +version = "=21.2.0" git = "https://github.com/stellar/rs-soroban-env" -rev = "8d76e4037417b80dc25c979cc44b8e1281e8ad84" +rev = "8809852dcf8489f99407a5ceac12625ee3d14693" [dependencies.soroban-synth-wasm] -version = "=21.1.0" +version = "=21.2.0" git = "https://github.com/stellar/rs-soroban-env" -rev = "8d76e4037417b80dc25c979cc44b8e1281e8ad84" +rev = "8809852dcf8489f99407a5ceac12625ee3d14693" [dependencies.cargo-lock] version = "=9.0.0" diff --git a/src/rust/src/host-dep-tree-curr.txt b/src/rust/src/host-dep-tree-curr.txt index bbd9fc7572..0623e7483d 100644 --- a/src/rust/src/host-dep-tree-curr.txt +++ b/src/rust/src/host-dep-tree-curr.txt @@ -1,4 +1,4 @@ -soroban-env-host 21.1.0 git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84 +soroban-env-host 21.2.0 git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693 ├── wasmparser 
0.116.1 checksum:a58e28b80dd8340cb07b8242ae654756161f6fc8d0038123d679b7b99964fa50 │ ├── semver 1.0.17 checksum:bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed │ │ └── serde 1.0.192 checksum:bca2a08484b285dcb282d0f67b26cadc0df8b19f8c12502c13d966bf9482f001 @@ -102,10 +102,10 @@ soroban-env-host 21.1.0 git+https://github.com/stellar/rs-soroban-env?rev=8d76e4 │ ├── wasmi_arena 0.4.0 git+https://github.com/stellar/wasmi?rev=0ed3f3dee30dc41ebe21972399e0a73a41944aa0#0ed3f3dee30dc41ebe21972399e0a73a41944aa0 │ ├── spin 0.9.8 checksum:6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67 │ └── smallvec 1.10.0 checksum:a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0 -├── soroban-env-common 21.1.0 git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84 +├── soroban-env-common 21.2.0 git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693 │ ├── wasmparser 0.116.1 checksum:a58e28b80dd8340cb07b8242ae654756161f6fc8d0038123d679b7b99964fa50 │ ├── tracy-client 0.16.4 checksum:82da0d50d9df1106619b1e5b118f39de779f7d8b9c3504485b291cb16fabd20f -│ ├── stellar-xdr 21.1.0 checksum:ec43c9c5ae7ec7b6ac9e263b6d5b9e3781aa05ba3a1c05f6e70701c5c6600665 +│ ├── stellar-xdr 21.2.0 git+https://github.com/stellar/rs-stellar-xdr?rev=ae805d0f8c28ca86327a834eea9ce7d29b0a63bb#ae805d0f8c28ca86327a834eea9ce7d29b0a63bb │ │ ├── stellar-strkey 0.0.8 checksum:12d2bf45e114117ea91d820a846fd1afbe3ba7d717988fee094ce8227a3bf8bd │ │ ├── hex 0.4.3 checksum:7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70 │ │ ├── escape-bytes 0.1.1 checksum:2bfcf67fea2815c2fc3b90873fae90957be12ff417335dfadc7f52927feb03b2 @@ -113,9 +113,9 @@ soroban-env-host 21.1.0 git+https://github.com/stellar/rs-soroban-env?rev=8d76e4 │ │ └── base64 0.13.1 checksum:9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8 │ ├── 
static_assertions 1.1.0 checksum:a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f │ ├── soroban-wasmi 0.31.1-soroban.20.0.1 git+https://github.com/stellar/wasmi?rev=0ed3f3dee30dc41ebe21972399e0a73a41944aa0#0ed3f3dee30dc41ebe21972399e0a73a41944aa0 -│ ├── soroban-env-macros 21.1.0 git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84 +│ ├── soroban-env-macros 21.2.0 git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693 │ │ ├── syn 2.0.39 checksum:23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a -│ │ ├── stellar-xdr 21.1.0 checksum:ec43c9c5ae7ec7b6ac9e263b6d5b9e3781aa05ba3a1c05f6e70701c5c6600665 +│ │ ├── stellar-xdr 21.2.0 git+https://github.com/stellar/rs-stellar-xdr?rev=ae805d0f8c28ca86327a834eea9ce7d29b0a63bb#ae805d0f8c28ca86327a834eea9ce7d29b0a63bb │ │ ├── serde_json 1.0.108 checksum:3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b │ │ ├── serde 1.0.192 checksum:bca2a08484b285dcb282d0f67b26cadc0df8b19f8c12502c13d966bf9482f001 │ │ ├── quote 1.0.33 checksum:5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae @@ -129,7 +129,7 @@ soroban-env-host 21.1.0 git+https://github.com/stellar/rs-soroban-env?rev=8d76e4 │ │ └── proc-macro2 1.0.69 checksum:134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da │ ├── ethnum 1.5.0 checksum:b90ca2580b73ab6a1f724b76ca11ab632df820fd6040c336200d2c1df7b3c82c │ └── crate-git-revision 0.0.6 checksum:c521bf1f43d31ed2f73441775ed31935d77901cb3451e44b38a1c1612fcbaf98 -├── soroban-builtin-sdk-macros 21.1.0 git+https://github.com/stellar/rs-soroban-env?rev=8d76e4037417b80dc25c979cc44b8e1281e8ad84#8d76e4037417b80dc25c979cc44b8e1281e8ad84 +├── soroban-builtin-sdk-macros 21.2.0 git+https://github.com/stellar/rs-soroban-env?rev=8809852dcf8489f99407a5ceac12625ee3d14693#8809852dcf8489f99407a5ceac12625ee3d14693 │ ├── syn 2.0.39 
checksum:23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a │ ├── quote 1.0.33 checksum:5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae │ ├── proc-macro2 1.0.69 checksum:134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da diff --git a/src/util/BinaryFuseFilter.cpp b/src/util/BinaryFuseFilter.cpp new file mode 100644 index 0000000000..df31386764 --- /dev/null +++ b/src/util/BinaryFuseFilter.cpp @@ -0,0 +1,52 @@ +// Copyright 2024 Stellar Development Foundation and contributors. Licensed +// under the Apache License, Version 2.0. See the COPYING file at the root +// of this distribution or at http://www.apache.org/licenses/LICENSE-2.0 + +#include "util/BinaryFuseFilter.h" +#include "util/siphash.h" +#include +#include + +namespace stellar +{ + +template +BinaryFuseFilter::BinaryFuseFilter(std::vector& keyHashes, + binary_fuse_seed_t const& seed) + : mFilter(keyHashes.size(), keyHashes, seed), mHashSeed(seed) +{ +} + +template +BinaryFuseFilter::BinaryFuseFilter( + SerializedBinaryFuseFilter const& xdrFilter) + : mFilter(xdrFilter), mHashSeed([&] { + binary_fuse_seed_t s{}; + std::copy(xdrFilter.inputHashSeed.seed.begin(), + xdrFilter.inputHashSeed.seed.end(), s.begin()); + return s; + }()) +{ +} + +template +bool +BinaryFuseFilter::contains(LedgerKey const& key) const +{ + SipHash24 hasher(mHashSeed.data()); + auto keybuf = xdr::xdr_to_opaque(key); + hasher.update(keybuf.data(), keybuf.size()); + return mFilter.contain(hasher.digest()); +} + +template +bool +BinaryFuseFilter::operator==(BinaryFuseFilter const& other) const +{ + return mFilter == other.mFilter && mHashSeed == other.mHashSeed; +} + +template class BinaryFuseFilter; +template class BinaryFuseFilter; +template class BinaryFuseFilter; +} \ No newline at end of file diff --git a/src/util/BinaryFuseFilter.h b/src/util/BinaryFuseFilter.h new file mode 100644 index 0000000000..30e22b522e --- /dev/null +++ b/src/util/BinaryFuseFilter.h @@ -0,0 +1,91 @@ +#pragma once 
+ +// Copyright 2024 Stellar Development Foundation and contributors. Licensed +// under the Apache License, Version 2.0. See the COPYING file at the root +// of this distribution or at http://www.apache.org/licenses/LICENSE-2.0 + +#include "lib/binaryfusefilter.h" +#include "util/NonCopyable.h" +#include "xdr/Stellar-ledger-entries.h" +#include "xdr/Stellar-types.h" + +#include +#include + +namespace stellar +{ + +// This class is a wrapper around the binary_fuse_t library that provides +// serialization for the XDR BinaryFuseFilter type and provides a deterministic +// LedgerKey interface. +template class BinaryFuseFilter : public NonMovableOrCopyable +{ + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v, + "Binary Fuse Filter only supports 8, 16, or 32 bit width"); + + private: + binary_fuse_t const mFilter; + + // Note: This is the seed used to hash keys going into the filter, and we + // need to preserve it + // to hash keys we're looking up in the filter. The filter also has its own + // seed determining its overall structure, which _starts_ with this seed, + // but which may be rotated a bit during repeated attempts to + // successfully populate the filter. So we have to keep both seeds. + binary_fuse_seed_t const mHashSeed; + + public: + // keyHashes is the SipHash24 digest on the keys to insert into the filter. 
+ // Seed is the random seed used to initialize the hash function + explicit BinaryFuseFilter(std::vector& keyHashes, + binary_fuse_seed_t const& seed); + explicit BinaryFuseFilter(SerializedBinaryFuseFilter const& xdrFilter); + + bool contains(LedgerKey const& key) const; + + bool operator==(BinaryFuseFilter const& other) const; + + template + void + save(Archive& archive) const + { + SerializedBinaryFuseFilter xdrFilter; + std::copy(mHashSeed.begin(), mHashSeed.end(), + xdrFilter.inputHashSeed.seed.begin()); + + mFilter.copyTo(xdrFilter); + archive(xdrFilter); + } + + template + void + load(Archive& archive) + { + SerializedBinaryFuseFilter xdrFilter; + archive(xdrFilter); + } + + template + static void + load_and_construct(Archive& ar, + cereal::construct>& construct) + { + SerializedBinaryFuseFilter xdrFilter; + ar(xdrFilter); + construct(xdrFilter); + } +}; + +// False positive rate: 1/256 +// Approximate bits per entry: 9 +typedef BinaryFuseFilter BinaryFuseFilter8; + +// False positive rate: 1/65536 +// Approximate bits per entry: 18 +typedef BinaryFuseFilter BinaryFuseFilter16; + +// False positive rate: 1 / 4 billion +// Approximate bits per entry: 36 +typedef BinaryFuseFilter BinaryFuseFilter32; +} \ No newline at end of file diff --git a/src/util/test/BinaryFuseTests.cpp b/src/util/test/BinaryFuseTests.cpp new file mode 100644 index 0000000000..d098746b25 --- /dev/null +++ b/src/util/test/BinaryFuseTests.cpp @@ -0,0 +1,137 @@ +// Copyright 2024 Stellar Development Foundation and contributors. Licensed +// under the Apache License, Version 2.0. 
See the COPYING file at the root +// of this distribution or at http://www.apache.org/licenses/LICENSE-2.0 + +#include "crypto/ShortHash.h" +#include "lib/catch.hpp" +#include "util/BinaryFuseFilter.h" +#include "util/types.h" +#include "xdr/Stellar-types.h" +#include +#include +#include +#include +#include +#include + +using namespace stellar; + +static auto ledgerKeyGenerator = autocheck::such_that( + [](LedgerKey const& k) { return k.type() != CONFIG_SETTING; }, + autocheck::generator()); + +template +void +testFilter(double expectedFalsePositiveRate) +{ + LedgerKeySet keys; + for (size_t size = 100; size <= 1'000'000; size *= 100) + { + while (keys.size() < size) + { + auto k = ledgerKeyGenerator(); + keys.insert(k); + } + + auto seed = shortHash::getShortHashInitKey(); + + std::vector hashes; + hashes.reserve(keys.size()); + for (auto const& k : keys) + { + auto keyBuf = xdr::xdr_to_opaque(k); + + SipHash24 hasher(seed.data()); + hasher.update(keyBuf.data(), keyBuf.size()); + hashes.emplace_back(hasher.digest()); + } + + // Test in-memory filter, serialize, deserialize, then test again + std::stringstream ss; + FilterT inMemoryFilter(hashes, seed); + { + for (auto const& k : keys) + { + REQUIRE(inMemoryFilter.contains(k)); + } + + cereal::BinaryOutputArchive oarchive(ss); + oarchive(inMemoryFilter); + } + + SerializedBinaryFuseFilter xdrFilter; + { + cereal::BinaryInputArchive iarchive(ss); + iarchive(xdrFilter); + } + + FilterT deserializedFilter(xdrFilter); + REQUIRE(deserializedFilter == inMemoryFilter); + + // Repeat correctness check for deserialized filter + for (auto const& k : keys) + { + REQUIRE(deserializedFilter.contains(k)); + } + + size_t randomMatches = 0; + size_t trials = 2'000'000; + for (size_t i = 0; i < trials; i++) + { + LedgerKey randomKey; + do + { + randomKey = ledgerKeyGenerator(); + } while (keys.find(randomKey) != keys.end()); + + // Make sure both filters have identical behavior + if (inMemoryFilter.contains(randomKey)) + { + 
REQUIRE(deserializedFilter.contains(randomKey)); + ++randomMatches; + } + else + { + REQUIRE(!deserializedFilter.contains(randomKey)); + } + } + + if (expectedFalsePositiveRate == 0) + { + // 32 bit filter has too small a false positive rate to test, + // allow up to one false positive for test stability + REQUIRE(randomMatches <= 1); + } + else + { + // False positive rate should be within 15% of the expected rate + // The fpp is so small, we have to give a relatively large margin + // to account for float imprecision + double fpp = randomMatches * 1.0 / trials; + double upperBound = expectedFalsePositiveRate * 1.15; + REQUIRE(fpp < upperBound); + } + } +} + +TEST_CASE("binary fuse filter", "[BinaryFuseFilter][!hide]") +{ + + SECTION("8 bit") + { + auto epsilon = 1ul << 8; + testFilter(1.0 / epsilon); + } + + SECTION("16 bit") + { + auto epsilon = 1ul << 16; + testFilter(1.0 / epsilon); + } + + SECTION("32 bit") + { + // The actual false positive rate is 1/ 4 billion + testFilter(0); + } +} \ No newline at end of file