Browse code

Add theft

John Hawthorn authored on 03/04/2017 09:11:20
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,145 @@
1
+#include <string.h>
2
+#include <stdio.h>
3
+
4
+#include "theft.h"
5
+#include "theft_bloom.h"
6
+
7
+#define DEFAULT_BLOOM_BITS 17
8
+#define DEBUG_BLOOM_FILTER 0
9
+#define LOG2_BITS_PER_BYTE 3
10
+#define HASH_COUNT 4
11
+
12
+static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte);
13
+
14
+/* Initialize a bloom filter. */
15
+struct theft_bloom *theft_bloom_init(uint8_t bit_size2) {
16
+    size_t sz = 1 << (bit_size2 - LOG2_BITS_PER_BYTE);
17
+    struct theft_bloom *b = malloc(sizeof(struct theft_bloom) + sz);
18
+    if (b) {
19
+        b->size = sz;
20
+        b->bit_count = bit_size2;
21
+        memset(b->bits, 0, sz);
22
+    }
23
+    return b;
24
+}
25
+
26
+/* Hash data and mark it in the bloom filter. */
27
+void theft_bloom_mark(struct theft_bloom *b, uint8_t *data, size_t data_size) {
28
+    uint64_t hash = theft_hash_onepass(data, data_size);
29
+    uint8_t bc = b->bit_count;
30
+    uint64_t mask = (1 << bc) - 1;
31
+
32
+    /* Use HASH_COUNT distinct slices of MASK bits as hashes for the bloom filter. */
33
+    int bit_inc = (64 - bc) / HASH_COUNT;
34
+
35
+    for (int i = 0; i < (64 - bc); i += bit_inc) {
36
+        uint64_t v = (hash & (mask << i)) >> i;
37
+        size_t offset = v / 8;
38
+        uint8_t bit = 1 << (v & 0x07);
39
+        b->bits[offset] |= bit;
40
+    }
41
+}
42
+
43
+/* Check whether the data's hash is in the bloom filter. */
44
+bool theft_bloom_check(struct theft_bloom *b, uint8_t *data, size_t data_size) {
45
+    uint64_t hash = theft_hash_onepass(data, data_size);
46
+    uint8_t bc = b->bit_count;
47
+    uint64_t mask = (1 << bc) - 1;
48
+
49
+    int bit_inc = (64 - bc) / HASH_COUNT;
50
+
51
+    for (int i = 0; i < (64 - bc); i += bit_inc) {
52
+        uint64_t v = (hash & (mask << i)) >> i;
53
+        size_t offset = v / 8;
54
+        uint8_t bit = 1 << (v & 0x07);
55
+        if (0 == (b->bits[offset] & bit)) { return false; }
56
+    }
57
+
58
+    return true;
59
+}
60
+
61
+/* Free the bloom filter. */
62
+void theft_bloom_free(struct theft_bloom *b) { free(b); }
63
+
64
+/* Dump the bloom filter's contents. (Debugging.) */
65
+void theft_bloom_dump(struct theft_bloom *b) {
66
+    uint8_t counts[256];
67
+    memset(counts, 0xFF, sizeof(counts));
68
+
69
+    size_t total = 0;
70
+    uint16_t row_total = 0;
71
+    
72
+    for (size_t i = 0; i < b->size; i++) {
73
+        uint8_t count = get_bits_set_count(counts, b->bits[i]);
74
+        total += count;
75
+        row_total += count;
76
+        #if DEBUG_BLOOM_FILTER > 1
77
+        char c = (count == 0 ? '.' : '0' + count);
78
+        printf("%c", c);
79
+        if ((i & 63) == 0 || i == b->size - 1) {
80
+            printf(" - %2.1f%%\n",
81
+                (100 * row_total) / (64.0 * 8));
82
+            row_total = 0;
83
+        }
84
+        #endif
85
+    }
86
+
87
+    #if DEBUG_BLOOM_FILTER
88
+    printf(" -- bloom filter: %zd of %zd bits set (%2d%%)\n",
89
+        total, 8*b->size, (int)((100.0 * total)/(8.0*b->size)));
90
+    #endif
91
+
92
+    /* If the total number of bits set is > the number of bytes
93
+     * in the table (i.e., > 1/8 full) then warn the user. */
94
+    if (total > b->size) {
95
+        fprintf(stderr, "\nWARNING: bloom filter is %zd%% full, "
96
+            "larger bloom_bits value recommended.\n",
97
+            (size_t)((100 * total) / (8 * b->size)));
98
+    }
99
+}
100
+
101
+/* Recommend a bloom filter size for a given number of trials. */
102
+uint8_t theft_bloom_recommendation(int trials) {
103
+    /* With a preferred priority of false positives under 0.1%,
104
+     * the required number of bits m in the bloom filter is:
105
+     *     m = -lg(0.001)/(lg(2)^2) == 14.378 ~= 14,
106
+     * so we want an array with 1 << ceil(log2(14*trials)) cells.
107
+     *
108
+     * Note: The above formula is for the *ideal* number of hash
109
+     * functions, but we're using a hardcoded count. It appears to work
110
+     * well enough in practice, though, and this can be adjusted later
111
+     * without impacting the API. */
112
+    #define TRIAL_MULTIPILER 14
113
+    uint8_t res = DEFAULT_BLOOM_BITS;
114
+
115
+    const uint8_t min = THEFT_BLOOM_BITS_MIN - LOG2_BITS_PER_BYTE;
116
+    const uint8_t max = THEFT_BLOOM_BITS_MAX - LOG2_BITS_PER_BYTE;
117
+
118
+    for (uint8_t i = min; i < max; i++) {
119
+        int32_t v = (1 << i);
120
+        if (v > (TRIAL_MULTIPILER * trials)) {
121
+            res = i + LOG2_BITS_PER_BYTE;
122
+            break;
123
+        }
124
+    }
125
+
126
+    #if DEBUG_BLOOM_FILTER
127
+    size_t sz = 1 << (res - LOG2_BITS_PER_BYTE);
128
+    printf("Using %zd bytes for bloom filter: %d trials -> bit_size2 %u\n",
129
+        sizeof(struct theft_bloom) + sz, trials, res);
130
+    #endif
131
+
132
+    return res;
133
+}
134
+
135
+/* Check a byte->bits set table, and lazily populate it. */
136
+static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte) {
137
+    uint8_t v = counts[byte];
138
+    if (v != 0xFF) { return v; }
139
+    uint8_t t = 0;
140
+    for (uint8_t i = 0; i < 8; i++) {
141
+        if (byte & (1 << i)) { t++; }
142
+    }
143
+    counts[byte] = t;
144
+    return t;
145
+}