bloom-filter: code cleanup
Change-Id: I72cec068778d26c62b05060030013f148ef7c5da
diff --git a/PSync/detail/bloom-filter.cpp b/PSync/detail/bloom-filter.cpp
index f700f16..89447c0 100644
--- a/PSync/detail/bloom-filter.cpp
+++ b/PSync/detail/bloom-filter.cpp
@@ -47,22 +47,19 @@
#include "PSync/detail/util.hpp"
#include <ndn-cxx/util/exception.hpp>
-#include <ndn-cxx/util/logger.hpp>
#include <algorithm>
#include <cmath>
#include <cstddef>
-#include <iterator>
-#include <limits>
#include <cstdlib>
-
-// https://github.com/ArashPartow/bloom
-
-NDN_LOG_INIT(psync.BloomFilter);
+#include <limits>
namespace psync {
-static const std::size_t bits_per_char = 0x08;
+// https://github.com/ArashPartow/bloom
+
+static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
+
static const unsigned char bit_mask[bits_per_char] = {
0x01, //00000001
0x02, //00000010
@@ -74,89 +71,154 @@
0x80 //10000000
};
-BloomParameters::BloomParameters()
-: minimum_size(1)
-, maximum_size(std::numeric_limits<unsigned int>::max())
-, minimum_number_of_hashes(1)
-, maximum_number_of_hashes(std::numeric_limits<unsigned int>::max())
-, projected_element_count(200)
-, false_positive_probability(1.0 / projected_element_count)
-, random_seed(0xA5A5A5A55A5A5A5AULL)
-{}
-
-bool
-BloomParameters::compute_optimal_parameters()
+class bloom_parameters
{
- if (!(*this)) {
- return false;
- }
+public:
- double min_m = std::numeric_limits<double>::infinity();
- double min_k = 0.0;
- double curr_m = 0.0;
- double k = 1.0;
+ bloom_parameters()
+ : minimum_size(1),
+ maximum_size(std::numeric_limits<unsigned int>::max()),
+ minimum_number_of_hashes(1),
+ maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
+ projected_element_count(200),
+ false_positive_probability(1.0 / projected_element_count),
+ random_seed(0xA5A5A5A55A5A5A5AULL)
+ {}
- while (k < 1000.0)
- {
- double numerator = (- k * projected_element_count);
- double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
- curr_m = numerator / denominator;
- if (curr_m < min_m)
- {
- min_m = curr_m;
- min_k = k;
- }
- k += 1.0;
- }
+ inline bool operator!()
+ {
+ return (minimum_size > maximum_size) ||
+ (minimum_number_of_hashes > maximum_number_of_hashes) ||
+ (minimum_number_of_hashes < 1) ||
+ (0 == maximum_number_of_hashes) ||
+ (0 == projected_element_count) ||
+ (false_positive_probability < 0.0) ||
+ (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
+ (0 == random_seed) ||
+ (0xFFFFFFFFFFFFFFFFULL == random_seed);
+ }
- optimal_parameters_t& optp = optimal_parameters;
+ // Allowable min/max size of the bloom filter in bits
+ unsigned int minimum_size;
+ unsigned int maximum_size;
- optp.number_of_hashes = static_cast<unsigned int>(min_k);
- optp.table_size = static_cast<unsigned int>(min_m);
- optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
+ // Allowable min/max number of hash functions
+ unsigned int minimum_number_of_hashes;
+ unsigned int maximum_number_of_hashes;
- if (optp.number_of_hashes < minimum_number_of_hashes)
- optp.number_of_hashes = minimum_number_of_hashes;
- else if (optp.number_of_hashes > maximum_number_of_hashes)
- optp.number_of_hashes = maximum_number_of_hashes;
+ // The approximate number of elements to be inserted
+ // into the bloom filter, should be within one order
+ // of magnitude. The default is 200.
+ unsigned int projected_element_count;
- if (optp.table_size < minimum_size)
- optp.table_size = minimum_size;
- else if (optp.table_size > maximum_size)
- optp.table_size = maximum_size;
+ // The approximate false positive probability expected
+ // from the bloom filter. The default is assumed to be
+ // the reciprocal of the projected_element_count.
+ double false_positive_probability;
- return true;
-}
+ unsigned long long int random_seed;
-BloomFilter::BloomFilter()
- : bit_table_(0)
- , salt_count_(0)
- , table_size_(0)
- , raw_table_size_(0)
- , projected_element_count_(0)
- , inserted_element_count_(0)
- , random_seed_(0)
- , desired_false_positive_probability_(0.0)
-{}
+ struct optimal_parameters_t
+ {
+ optimal_parameters_t()
+ : number_of_hashes(0),
+ table_size(0)
+ {}
-BloomFilter::BloomFilter(const BloomParameters& p)
- : bit_table_(0)
- , projected_element_count_(p.projected_element_count)
- , inserted_element_count_(0)
- , random_seed_((p.random_seed * 0xA5A5A5A5) + 1)
- , desired_false_positive_probability_(p.false_positive_probability)
+ unsigned int number_of_hashes;
+ unsigned int table_size;
+ };
+
+ optimal_parameters_t optimal_parameters;
+
+ bool compute_optimal_parameters()
+ {
+ /*
+ Note:
+ The following will attempt to find the number of hash functions
+ and minimum amount of storage bits required to construct a bloom
+ filter consistent with the user defined false positive probability
+ and estimated element insertion count.
+ */
+
+ if (!(*this))
+ return false;
+
+ double min_m = std::numeric_limits<double>::infinity();
+ double min_k = 0.0;
+ double k = 1.0;
+
+ while (k < 1000.0)
+ {
+ const double numerator = (- k * projected_element_count);
+ const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
+
+ const double curr_m = numerator / denominator;
+
+ if (curr_m < min_m)
+ {
+ min_m = curr_m;
+ min_k = k;
+ }
+
+ k += 1.0;
+ }
+
+ optimal_parameters_t& optp = optimal_parameters;
+
+ optp.number_of_hashes = static_cast<unsigned int>(min_k);
+
+ optp.table_size = static_cast<unsigned int>(min_m);
+
+ optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
+
+ if (optp.number_of_hashes < minimum_number_of_hashes)
+ optp.number_of_hashes = minimum_number_of_hashes;
+ else if (optp.number_of_hashes > maximum_number_of_hashes)
+ optp.number_of_hashes = maximum_number_of_hashes;
+
+ if (optp.table_size < minimum_size)
+ optp.table_size = minimum_size;
+ else if (optp.table_size > maximum_size)
+ optp.table_size = maximum_size;
+
+ return true;
+ }
+
+};
+
+
+BloomFilter::BloomFilter(const bloom_parameters& p)
+: projected_element_count_(p.projected_element_count),
+ inserted_element_count_(0),
+ random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
+ desired_false_positive_probability_(p.false_positive_probability)
{
salt_count_ = p.optimal_parameters.number_of_hashes;
table_size_ = p.optimal_parameters.table_size;
+
generate_unique_salt();
- raw_table_size_ = table_size_ / bits_per_char;
- //bit_table_ = new cell_type[static_cast<std::size_t>(raw_table_size_)];
- bit_table_.resize(static_cast<std::size_t>(raw_table_size_), 0x00);
+
+ bit_table_.resize(table_size_ / bits_per_char, static_cast<cell_type>(0x00));
+}
+
+static bloom_parameters
+makeParameters(unsigned int projected_element_count,
+ double false_positive_probability)
+{
+ bloom_parameters p;
+ p.projected_element_count = projected_element_count;
+ p.false_positive_probability = false_positive_probability;
+
+ if (!p.compute_optimal_parameters()) {
+ NDN_THROW(BloomFilter::Error("Bloom filter parameters are not correct!"));
+ }
+ return p;
}
BloomFilter::BloomFilter(unsigned int projected_element_count,
double false_positive_probability)
- : BloomFilter(getParameters(projected_element_count, false_positive_probability))
+ : BloomFilter(makeParameters(projected_element_count, false_positive_probability))
{
}
@@ -165,42 +227,25 @@
const ndn::name::Component& bfName)
: BloomFilter(projected_element_count, false_positive_probability)
{
- std::vector<BloomFilter::cell_type> table(bfName.value_begin(), bfName.value_end());
-
- if (table.size() != raw_table_size_) {
- NDN_THROW(Error("Received BloomFilter cannot be decoded!"));
+ std::vector<cell_type> table(bfName.value_begin(), bfName.value_end());
+ if (table.size() != table_size_ / bits_per_char) {
+ NDN_THROW(Error("Bloom filter cannot be decoded!"));
}
- bit_table_ = table;
-}
-
-BloomParameters
-BloomFilter::getParameters(unsigned int projected_element_count,
- double false_positive_probability)
-{
- BloomParameters opt;
- opt.false_positive_probability = false_positive_probability;
- opt.projected_element_count = projected_element_count;
-
- if (!opt) {
- NDN_LOG_WARN("Bloom parameters are not correct!");
- }
-
- opt.compute_optimal_parameters();
- return opt;
+ bit_table_ = std::move(table);
}
void
BloomFilter::appendToName(ndn::Name& name) const
{
name.appendNumber(projected_element_count_);
- name.appendNumber((int)(desired_false_positive_probability_ * 1000));
+ name.appendNumber(static_cast<uint64_t>(desired_false_positive_probability_ * 1000));
name.append(bit_table_.begin(), bit_table_.end());
}
void
BloomFilter::clear()
{
- bit_table_.resize(static_cast<std::size_t>(raw_table_size_), 0x00);
+ std::fill(bit_table_.begin(), bit_table_.end(), static_cast<cell_type>(0x00));
inserted_element_count_ = 0;
}
@@ -208,12 +253,15 @@
BloomFilter::insert(const std::string& key)
{
std::size_t bit_index = 0;
- std::size_t bit = 0;
+ std::size_t bit = 0;
+
for (std::size_t i = 0; i < salt_.size(); ++i)
{
- compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
- bit_table_[bit_index/bits_per_char] |= bit_mask[bit];
+ compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
+
+ bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
}
+
++inserted_element_count_;
}
@@ -221,12 +269,14 @@
BloomFilter::contains(const std::string& key) const
{
std::size_t bit_index = 0;
- std::size_t bit = 0;
+ std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
- if ((bit_table_[bit_index/bits_per_char] & bit_mask[bit]) != bit_mask[bit]) {
+
+ if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ {
return false;
}
}
@@ -234,10 +284,11 @@
return true;
}
-std::vector <BloomFilter::cell_type>
-BloomFilter::table() const
+void
+BloomFilter::compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
{
- return bit_table_;
+ bit_index = hash % table_size_;
+ bit = bit_index % bits_per_char;
}
void
@@ -250,6 +301,7 @@
hash function with different values seems to be adequate.
*/
const unsigned int predef_salt_count = 128;
+
static const bloom_type predef_salt[predef_salt_count] =
{
0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
@@ -291,19 +343,31 @@
std::copy(predef_salt,
predef_salt + salt_count_,
std::back_inserter(salt_));
- for (unsigned int i = 0; i < salt_.size(); ++i)
+
+ for (std::size_t i = 0; i < salt_.size(); ++i)
{
+ /*
+ Note:
+ This is done to integrate the user defined random seed,
+ so as to allow for the generation of unique bloom filter
+ instances.
+ */
salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
}
}
else
{
- std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
+ std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));
+
srand(static_cast<unsigned int>(random_seed_));
+
while (salt_.size() < salt_count_)
{
bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
- if (0 == current_salt) continue;
+
+ if (0 == current_salt)
+ continue;
+
if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
{
salt_.push_back(current_salt);
@@ -312,39 +376,4 @@
}
}
-void
-BloomFilter::compute_indices(const bloom_type& hash,
- std::size_t& bit_index, std::size_t& bit) const
-{
- bit_index = hash % table_size_;
- bit = bit_index % bits_per_char;
-}
-
-bool
-operator==(const BloomFilter& bf1, const BloomFilter& bf2)
-{
- auto table1 = bf1.table();
- auto table2 = bf2.table();
-
- if (table1.size() != table2.size()) {
- return false;
- }
-
- for (size_t i = 0; i < table1.size(); i++) {
- if (table1[i] != table2[i]) {
- return false;
- }
- }
- return true;
-}
-
-std::ostream&
-operator<<(std::ostream& out, const BloomFilter& bf)
-{
- for (const auto& element : bf.table()) {
- out << unsigned(element);
- }
- return out;
-}
-
} // namespace psync
diff --git a/PSync/detail/bloom-filter.hpp b/PSync/detail/bloom-filter.hpp
index f7e3726..ff0d213 100644
--- a/PSync/detail/bloom-filter.hpp
+++ b/PSync/detail/bloom-filter.hpp
@@ -47,63 +47,17 @@
#define PSYNC_DETAIL_BLOOM_FILTER_HPP
#include <ndn-cxx/name.hpp>
+#include <ndn-cxx/util/string-helper.hpp>
#include <string>
#include <vector>
-#include <cmath>
-#include <cstdlib>
namespace psync {
-struct optimal_parameters_t
-{
- optimal_parameters_t()
- : number_of_hashes(0),
- table_size(0)
- {}
-
- unsigned int number_of_hashes;
- unsigned int table_size;
-};
-
-class BloomParameters
-{
-public:
- BloomParameters();
-
- bool
- compute_optimal_parameters();
-
- bool operator!() const
- {
- return (minimum_size > maximum_size) ||
- (minimum_number_of_hashes > maximum_number_of_hashes) ||
- (minimum_number_of_hashes < 1) ||
- (0 == maximum_number_of_hashes) ||
- (0 == projected_element_count) ||
- (false_positive_probability < 0.0) ||
- (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
- (0 == random_seed) ||
- (0xFFFFFFFFFFFFFFFFULL == random_seed);
- }
-
- unsigned int minimum_size;
- unsigned int maximum_size;
- unsigned int minimum_number_of_hashes;
- unsigned int maximum_number_of_hashes;
- unsigned int projected_element_count;
- double false_positive_probability;
- unsigned long long int random_seed;
- optimal_parameters_t optimal_parameters;
-};
+class bloom_parameters;
class BloomFilter
{
-protected:
- typedef uint32_t bloom_type;
- typedef uint8_t cell_type;
- typedef std::vector <cell_type>::iterator Iterator;
-
public:
class Error : public std::runtime_error
{
@@ -111,9 +65,7 @@
using std::runtime_error::runtime_error;
};
- BloomFilter();
-
- explicit BloomFilter(const BloomParameters& p);
+ BloomFilter() = default;
BloomFilter(unsigned int projected_element_count,
double false_positive_probability);
@@ -122,10 +74,6 @@
double false_positive_probability,
const ndn::name::Component& bfName);
- BloomParameters
- getParameters(unsigned int projected_element_count,
- double false_positive_probability);
-
/**
* @brief Append our bloom filter to the given name
*
@@ -146,35 +94,53 @@
bool
contains(const std::string& key) const;
- std::vector<cell_type>
- table() const;
-
private:
+ typedef uint32_t bloom_type;
+ typedef uint8_t cell_type;
+
+ explicit
+ BloomFilter(const bloom_parameters& p);
+
+ void
+ compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const;
+
void
generate_unique_salt();
- void
- compute_indices(const bloom_type& hash,
- std::size_t& bit_index, std::size_t& bit) const;
+private: // non-member operators
+ // NOTE: the following "hidden friend" operators are available via
+ // argument-dependent lookup only and must be defined inline.
+
+ friend bool
+ operator==(const BloomFilter& lhs, const BloomFilter& rhs)
+ {
+ return lhs.bit_table_ == rhs.bit_table_;
+ }
+
+ friend bool
+ operator!=(const BloomFilter& lhs, const BloomFilter& rhs)
+ {
+ return lhs.bit_table_ != rhs.bit_table_;
+ }
+
+ friend std::ostream&
+ operator<<(std::ostream& os, const BloomFilter& bf)
+ {
+ ndn::printHex(os, bf.bit_table_.data(), bf.bit_table_.size(), false);
+ return os;
+ }
private:
- std::vector <bloom_type> salt_;
- std::vector <cell_type> bit_table_;
- unsigned int salt_count_;
- unsigned int table_size_; // 8 * raw_table_size;
- unsigned int raw_table_size_;
- unsigned int projected_element_count_;
- unsigned int inserted_element_count_;
- unsigned long long int random_seed_;
- double desired_false_positive_probability_;
+ std::vector<bloom_type> salt_;
+ std::vector<cell_type> bit_table_;
+ unsigned int salt_count_ = 0;
+ unsigned int table_size_ = 0;
+ unsigned int projected_element_count_ = 0;
+ unsigned int inserted_element_count_ = 0;
+ unsigned long long int random_seed_ = 0;
+ double desired_false_positive_probability_ = 0.0;
};
-bool
-operator==(const BloomFilter& bf1, const BloomFilter& bf2);
-
-std::ostream&
-operator<<(std::ostream& out, const BloomFilter& bf);
-
} // namespace psync
#endif // PSYNC_DETAIL_BLOOM_FILTER_HPP