bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2018 Dovecot authors, see the included COPYING file */
bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#include "lib.h"
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#include "bloomfilter.h"
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#include "murmurhash3.h"
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#include "md5.h"
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#include "randgen.h"
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#include <math.h>
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomistruct bloomfilter {
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi pool_t pool;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi int refcount;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi size_t size;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi size_t total_added;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi unsigned int nk;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi uint32_t seed;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bloomfilter_hash_func_t *const *k;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi uint8_t *bitmap;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi};
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#define BITMAP_HAS_BIT(map, idx) (((map)[((idx)/CHAR_BIT)] & (0x1<<((idx)%CHAR_BIT))) != 0)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#define BITMAP_SET_BIT(map, idx) ((map)[((idx)/CHAR_BIT)] |= (0x1<<((idx)%CHAR_BIT)))
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#define BLOOMFILTER_HASH_BYTES 16
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi/* use only murmurhash3 by default */
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomibloomfilter_hash_func_t *const bloomfilter_default_functions[] = {
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bloomfilter_murmur3_hash,
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi NULL
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi};
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomistatic inline size_t
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomibloomfilter_hash_fold(unsigned char result[STATIC_ARRAY BLOOMFILTER_HASH_BYTES],
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi uint32_t seed)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#ifdef _LP64
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi /* rolls 128 bit result into a 64 bit result by xoring the first 64 bits
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi and seed, and remaining 64 bits. */
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return be64_to_cpu_unaligned(&result[0]) ^
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi be64_to_cpu_unaligned(&result[8]) ^
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi (((size_t)seed) << 32);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#else
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi /* rolls 128 bit result into a 32 bit result by folding
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi all the successive 32 bit values into one together with seed. */
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return be32_to_cpu_unaligned(&result[0]) ^
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi be32_to_cpu_unaligned(&result[4]) ^
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi be32_to_cpu_unaligned(&result[8]) ^
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi be32_to_cpu_unaligned(&result[12]) ^
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi seed;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi#endif
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomisize_t bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi unsigned char result[MURMURHASH3_128_RESULTBYTES];
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi murmurhash3_128(data, len, seed, result);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi /* murmur includes seed already */
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return bloomfilter_hash_fold(result, 0);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomisize_t bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi unsigned char result[MD5_RESULTLEN];
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi md5_get_digest(data, len, result);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return bloomfilter_hash_fold(result, seed);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomistruct bloomfilter *
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomibloomfilter_create(pool_t pool, size_t size,
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bloomfilter_hash_func_t *const *hash_functions)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi struct bloomfilter *bf = p_new(pool, struct bloomfilter, 1);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi i_assert(size > 0);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bf->pool = pool;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi /* allocate extra byte to round up result */
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bf->bitmap = p_malloc(pool, size/CHAR_BIT + 1);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bf->k = hash_functions;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bf->size = size;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi while(*hash_functions != NULL) {
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bf->nk++;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi hash_functions++;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi }
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi i_assert(bf->nk > 0);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi random_fill(&bf->seed, sizeof(bf->seed));
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bf->refcount = 1;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return bf;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomivoid bloomfilter_ref(struct bloomfilter *bf)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi i_assert(bf->refcount > 0);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bf->refcount++;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomivoid bloomfilter_unref(struct bloomfilter **_bf)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi struct bloomfilter *bf = *_bf;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi if (*_bf == NULL)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi *_bf = NULL;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi i_assert(bf->refcount > 0);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi if (--bf->refcount > 0)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi /* in case system pool was used .. */
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi p_free(bf->pool, bf->bitmap);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi p_free(bf->pool, bf);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomisize_t bloomfilter_estimated_item_count(struct bloomfilter *bf)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return bf->total_added;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomibool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi i_assert(data != NULL || len == 0);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bloomfilter_hash_func_t *const *k = bf->k;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi for(;*k != NULL; k++) {
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi size_t result;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi result = (*k)(data, len, bf->seed) % bf->size;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi if (!BITMAP_HAS_BIT(bf->bitmap, result))
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return FALSE;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi }
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi return TRUE;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomivoid bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi{
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi i_assert(data != NULL || len == 0);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bloomfilter_hash_func_t *const *k = bf->k;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi /* total added will cap at size_t, because it's an estimate */
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi if (bf->total_added < (size_t)-1)
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi bf->total_added++;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi for(;*k != NULL; k++) {
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi size_t result;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi result = (*k)(data, len, bf->seed) % bf->size;
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi BITMAP_SET_BIT(bf->bitmap, result);
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi }
44d29bbfa14662dc1b2873f1a570a0767f827cd1Aki Tuomi}