| 1 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,145 @@ |
| 1 |
+#include <string.h> |
|
| 2 |
+#include <stdio.h> |
|
| 3 |
+ |
|
| 4 |
+#include "theft.h" |
|
| 5 |
+#include "theft_bloom.h" |
|
| 6 |
+ |
|
| 7 |
+#define DEFAULT_BLOOM_BITS 17 |
|
| 8 |
+#define DEBUG_BLOOM_FILTER 0 |
|
| 9 |
+#define LOG2_BITS_PER_BYTE 3 |
|
| 10 |
+#define HASH_COUNT 4 |
|
| 11 |
+ |
|
| 12 |
+static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte); |
|
| 13 |
+ |
|
| 14 |
+/* Initialize a bloom filter. */ |
|
| 15 |
+struct theft_bloom *theft_bloom_init(uint8_t bit_size2) {
|
|
| 16 |
+ size_t sz = 1 << (bit_size2 - LOG2_BITS_PER_BYTE); |
|
| 17 |
+ struct theft_bloom *b = malloc(sizeof(struct theft_bloom) + sz); |
|
| 18 |
+ if (b) {
|
|
| 19 |
+ b->size = sz; |
|
| 20 |
+ b->bit_count = bit_size2; |
|
| 21 |
+ memset(b->bits, 0, sz); |
|
| 22 |
+ } |
|
| 23 |
+ return b; |
|
| 24 |
+} |
|
| 25 |
+ |
|
| 26 |
+/* Hash data and mark it in the bloom filter. */ |
|
| 27 |
+void theft_bloom_mark(struct theft_bloom *b, uint8_t *data, size_t data_size) {
|
|
| 28 |
+ uint64_t hash = theft_hash_onepass(data, data_size); |
|
| 29 |
+ uint8_t bc = b->bit_count; |
|
| 30 |
+ uint64_t mask = (1 << bc) - 1; |
|
| 31 |
+ |
|
| 32 |
+ /* Use HASH_COUNT distinct slices of MASK bits as hashes for the bloom filter. */ |
|
| 33 |
+ int bit_inc = (64 - bc) / HASH_COUNT; |
|
| 34 |
+ |
|
| 35 |
+ for (int i = 0; i < (64 - bc); i += bit_inc) {
|
|
| 36 |
+ uint64_t v = (hash & (mask << i)) >> i; |
|
| 37 |
+ size_t offset = v / 8; |
|
| 38 |
+ uint8_t bit = 1 << (v & 0x07); |
|
| 39 |
+ b->bits[offset] |= bit; |
|
| 40 |
+ } |
|
| 41 |
+} |
|
| 42 |
+ |
|
| 43 |
+/* Check whether the data's hash is in the bloom filter. */ |
|
| 44 |
+bool theft_bloom_check(struct theft_bloom *b, uint8_t *data, size_t data_size) {
|
|
| 45 |
+ uint64_t hash = theft_hash_onepass(data, data_size); |
|
| 46 |
+ uint8_t bc = b->bit_count; |
|
| 47 |
+ uint64_t mask = (1 << bc) - 1; |
|
| 48 |
+ |
|
| 49 |
+ int bit_inc = (64 - bc) / HASH_COUNT; |
|
| 50 |
+ |
|
| 51 |
+ for (int i = 0; i < (64 - bc); i += bit_inc) {
|
|
| 52 |
+ uint64_t v = (hash & (mask << i)) >> i; |
|
| 53 |
+ size_t offset = v / 8; |
|
| 54 |
+ uint8_t bit = 1 << (v & 0x07); |
|
| 55 |
+ if (0 == (b->bits[offset] & bit)) { return false; }
|
|
| 56 |
+ } |
|
| 57 |
+ |
|
| 58 |
+ return true; |
|
| 59 |
+} |
|
| 60 |
+ |
|
| 61 |
+/* Free the bloom filter. */ |
|
| 62 |
+void theft_bloom_free(struct theft_bloom *b) { free(b); }
|
|
| 63 |
+ |
|
| 64 |
+/* Dump the bloom filter's contents. (Debugging.) */ |
|
| 65 |
+void theft_bloom_dump(struct theft_bloom *b) {
|
|
| 66 |
+ uint8_t counts[256]; |
|
| 67 |
+ memset(counts, 0xFF, sizeof(counts)); |
|
| 68 |
+ |
|
| 69 |
+ size_t total = 0; |
|
| 70 |
+ uint16_t row_total = 0; |
|
| 71 |
+ |
|
| 72 |
+ for (size_t i = 0; i < b->size; i++) {
|
|
| 73 |
+ uint8_t count = get_bits_set_count(counts, b->bits[i]); |
|
| 74 |
+ total += count; |
|
| 75 |
+ row_total += count; |
|
| 76 |
+ #if DEBUG_BLOOM_FILTER > 1 |
|
| 77 |
+ char c = (count == 0 ? '.' : '0' + count); |
|
| 78 |
+ printf("%c", c);
|
|
| 79 |
+ if ((i & 63) == 0 || i == b->size - 1) {
|
|
| 80 |
+ printf(" - %2.1f%%\n",
|
|
| 81 |
+ (100 * row_total) / (64.0 * 8)); |
|
| 82 |
+ row_total = 0; |
|
| 83 |
+ } |
|
| 84 |
+ #endif |
|
| 85 |
+ } |
|
| 86 |
+ |
|
| 87 |
+ #if DEBUG_BLOOM_FILTER |
|
| 88 |
+ printf(" -- bloom filter: %zd of %zd bits set (%2d%%)\n",
|
|
| 89 |
+ total, 8*b->size, (int)((100.0 * total)/(8.0*b->size))); |
|
| 90 |
+ #endif |
|
| 91 |
+ |
|
| 92 |
+ /* If the total number of bits set is > the number of bytes |
|
| 93 |
+ * in the table (i.e., > 1/8 full) then warn the user. */ |
|
| 94 |
+ if (total > b->size) {
|
|
| 95 |
+ fprintf(stderr, "\nWARNING: bloom filter is %zd%% full, " |
|
| 96 |
+ "larger bloom_bits value recommended.\n", |
|
| 97 |
+ (size_t)((100 * total) / (8 * b->size))); |
|
| 98 |
+ } |
|
| 99 |
+} |
|
| 100 |
+ |
|
| 101 |
+/* Recommend a bloom filter size for a given number of trials. */ |
|
| 102 |
+uint8_t theft_bloom_recommendation(int trials) {
|
|
| 103 |
+ /* With a preferred priority of false positives under 0.1%, |
|
| 104 |
+ * the required number of bits m in the bloom filter is: |
|
| 105 |
+ * m = -lg(0.001)/(lg(2)^2) == 14.378 ~= 14, |
|
| 106 |
+ * so we want an array with 1 << ceil(log2(14*trials)) cells. |
|
| 107 |
+ * |
|
| 108 |
+ * Note: The above formula is for the *ideal* number of hash |
|
| 109 |
+ * functions, but we're using a hardcoded count. It appears to work |
|
| 110 |
+ * well enough in practice, though, and this can be adjusted later |
|
| 111 |
+ * without impacting the API. */ |
|
| 112 |
+ #define TRIAL_MULTIPILER 14 |
|
| 113 |
+ uint8_t res = DEFAULT_BLOOM_BITS; |
|
| 114 |
+ |
|
| 115 |
+ const uint8_t min = THEFT_BLOOM_BITS_MIN - LOG2_BITS_PER_BYTE; |
|
| 116 |
+ const uint8_t max = THEFT_BLOOM_BITS_MAX - LOG2_BITS_PER_BYTE; |
|
| 117 |
+ |
|
| 118 |
+ for (uint8_t i = min; i < max; i++) {
|
|
| 119 |
+ int32_t v = (1 << i); |
|
| 120 |
+ if (v > (TRIAL_MULTIPILER * trials)) {
|
|
| 121 |
+ res = i + LOG2_BITS_PER_BYTE; |
|
| 122 |
+ break; |
|
| 123 |
+ } |
|
| 124 |
+ } |
|
| 125 |
+ |
|
| 126 |
+ #if DEBUG_BLOOM_FILTER |
|
| 127 |
+ size_t sz = 1 << (res - LOG2_BITS_PER_BYTE); |
|
| 128 |
+ printf("Using %zd bytes for bloom filter: %d trials -> bit_size2 %u\n",
|
|
| 129 |
+ sizeof(struct theft_bloom) + sz, trials, res); |
|
| 130 |
+ #endif |
|
| 131 |
+ |
|
| 132 |
+ return res; |
|
| 133 |
+} |
|
| 134 |
+ |
|
| 135 |
+/* Check a byte->bits set table, and lazily populate it. */ |
|
| 136 |
+static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte) {
|
|
| 137 |
+ uint8_t v = counts[byte]; |
|
| 138 |
+ if (v != 0xFF) { return v; }
|
|
| 139 |
+ uint8_t t = 0; |
|
| 140 |
+ for (uint8_t i = 0; i < 8; i++) {
|
|
| 141 |
+ if (byte & (1 << i)) { t++; }
|
|
| 142 |
+ } |
|
| 143 |
+ counts[byte] = t; |
|
| 144 |
+ return t; |
|
| 145 |
+} |