# 利用bloom filter算法处理大规模数据过滤

2014/08/15 14:28

Bloom Filter是由Bloom在1970年提出的一种快速查找算法，通过多个hash算法来共同判断某个元素是否在某个集合内。可以用于网络爬虫的url重复过滤、垃圾邮件的过滤等等。

bloom filter主要的难点其实在于估算： 保证指定误判率的情况下，到底需要多少个hash函数，多少的存储空间。

p = (1 - ((1 - 1/ m) ^ kn))^k ~= (1 - e^(-kn/m))^k


k = (m / n) * ln2


p = (1 - e ^-((m/nln2)n/m))^(m/nln2)


lnp = -m/n * (ln2)^2


s = m/n = -lnp / (ln2 * ln2) = -log2(p) / ln2
k = s * ln2 = -log2(p)


p < 0.1: k = 3.321928, m/n = 4.79
p < 0.01: k = 6.643856, m/n = 9.58
p < 0.001: k = 9.965784, m/n = 14.37
p < 0.0001: k = 13.287712, m/n = 19.170117


p = (1 - e^(-kn/m))^k


p = (1 - e^(-k/s))^k


lnp = k * ln(1 - e^(-k/s))


(lnp) / k = ln(1 - e^(-k/s))


e^((lnp) / k) = 1 - e^(-k/s)


e^(-k/s) = 1 - e^((lnp) / k) = 1 - (e^lnp)^(1/k) = 1 - p^(1/k)


-k/s = ln(1 - p^(1/k))


s = -k / ln(1 - p^(1/k))


s = -k / ln(1 - c)


s ~= -k / (-c-0.5c^2) = 2k / (2c + c * c)


c = p^(1/k)
s = m / n = 2k / (2c + c * c)


p < 0.1 and k = 1: s = m/n = 9.523810
p < 0.1 and k = 2: s = m/n = 5.461082
p < 0.1 and k = 3: s = m/n = 5.245850, space ~= 6.3MB
p < 0.1 and k = 4: s = m/n = 5.552045, space ~= 6.6MB

p < 0.01 and k = 1: s = m/n = 99.502488
p < 0.01 and k = 2: s = m/n = 19.047619
p < 0.01 and k = 3: s = m/n = 12.570636, space ~= 15MB
p < 0.01 and k = 4: s = m/n = 10.922165, space ~= 13MB

p < 0.001 and k = 1: s = m/n = 999.500250
p < 0.001 and k = 2: s = m/n = 62.261118
p < 0.001 and k = 3: s = m/n = 28.571429, space ~= 34MB
p < 0.001 and k = 4: s = m/n = 20.656961, space ~= 24.6MB

p < 0.0001 and k = 1: s = m/n = 9999.500025
p < 0.0001 and k = 2: s = m/n = 199.004975
p < 0.0001 and k = 3: s = m/n = 63.167063, space ~= 75.3MB
p < 0.0001 and k = 4: s = m/n = 38.095238, space ~= 45.4MB
p < 0.0001 and k = 5: s = m/n = 29.231432, space ~= 24.8MB


p < 0.01 and k = 3的情况下，其实际误判率为：0.004965
p < 0.001 and k = 3的情况下，其实际误判率为：0.000967


/* 初始化bloom filter
*
* TB_BLOOM_FILTER_PROBABILITY_0_01: 预定义的误判率，接近0.01
* 注：由于内部使用位移数来表示：1 / 2^6 = 0.015625 ~= 0.01
* 所以实际传入的误判率，有可能稍微大一点，但是还是相当接近的
*
* 3：为k值，hash函数的个数，最大不超过15个
*
* count：指定的元素规模数
*
* tb_item_func_long()：容器的元素类型，主要是用其内定的hash函数，如果要自定义hash函数，可以替换:
*
* tb_size_t tb_xxxxxx_hash(tb_item_func_t* func, tb_cpointer_t data, tb_size_t mask, tb_size_t index)
* {
*      return compute_hash(data, index) & mask;
* }
*
* tb_item_func_t func = tb_item_func_long();
* func.hash = tb_xxxxxx_hash;
*
* 来进行
*/
tb_bloom_filter_ref_t filter = tb_bloom_filter_init(TB_BLOOM_FILTER_PROBABILITY_0_01, 3, count, tb_item_func_long());

if (filter)
{
tb_size_t i = 0;
for (i = 0; i < count; i++)
{
// 产生随机数
tb_long_t value = tb_random();

// 设置值到filter内，如果不存在，则返回tb_true表示设置成功
if (tb_bloom_filter_set(filter, (tb_cpointer_t)value))
{
// 添加元素成功，之前元素不存在
// 不会存在误判
}
else
{
// 添加失败，添加的元素已经存在
// 这里可能会存在误判
}

// 仅仅判断元素是否存在
if (tb_bloom_filter_get(filter, (tb_cpointer_t)data)
{
// 元素已经存在
// 这里可能会存在误判
}
else
{
// 元素不存在
// 不会存在误判
}
}

// 退出filter
tb_bloom_filter_exit(filter);
}

// 常用预定义的误判率，也可以指定其他值，注：必须是位移数，而不是实际值
typedef enum __tb_bloom_filter_probability_e
{
TB_BLOOM_FILTER_PROBABILITY_0_1         = 3 ///!< 1 / 2^3 = 0.125 ~= 0.1
,   TB_BLOOM_FILTER_PROBABILITY_0_01        = 6 ///!< 1 / 2^6 = 0.015625 ~= 0.01
,   TB_BLOOM_FILTER_PROBABILITY_0_001       = 10 ///!< 1 / 2^10 = 0.0009765625 ~= 0.001
,   TB_BLOOM_FILTER_PROBABILITY_0_0001      = 13 ///!< 1 / 2^13 = 0.0001220703125 ~= 0.0001
,   TB_BLOOM_FILTER_PROBABILITY_0_00001     = 16 ///!< 1 / 2^16 = 0.0000152587890625 ~= 0.00001
,   TB_BLOOM_FILTER_PROBABILITY_0_000001    = 20 ///!< 1 / 2^20 = 0.00000095367431640625 ~= 0.000001

}tb_bloom_filter_probability_e;


0 评论
12 收藏
0