Inbloom 是跨语言的 Bloom filter 实现。Bloom filter 是由 Howard Bloom 在 1970 年提出的二进制向量数据结构,它具有很好的空间和时间效率,被用来检测一个元素是不是集合中的一个成员。如果检测结果为是,该元素不一定在集合中;但如果 检测结果为否,该元素一定不在集合中。因此Bloom filter具有100%的召回率。这样每个检测请求返回有“在集合内(可能错误)”和“不在集合内(绝对不在集合内)”两种情况,可见 Bloom filter 是牺牲了正确率和时间以节省空间。
示例代码:
Python:
import inbloom import base64 import requests # Basic usage bf = inbloom.Filter(entries=100, error=0.01) bf.add("abc") bf.add("def") assert bf.contains("abc") assert bf.contains("def") assert not bf.contains("ghi") bf2 = inbloom.Filter(entries=100, error=0.01, data=bf.buffer()) assert bf2.contains("abc") assert bf2.contains("def") assert not bf2.contains("ghi") # Serialization payload = 'Yg0AZAAAABQAAAAAACAAEAAIAAAAAAAAIAAQAAgABAA=' assert base64.b64encode(inbloom.dump(inbloom.load(base64.b64decode(payload)))) == payload # Sending it over HTTP serialized = base64.b64encode(inbloom.dump(bf)) requests.get('http://api.endpoint.me', params={'filter': serialized})
Go:
// create a blank filter - expecting 20 members and an error rate of 1/100 f, err := NewFilter(20, 0.01) if err != nil { panic(err) } // the size of the filter fmt.Println(f.Len()) // insert some values f.Add("foo") f.Add("bar") // test for existence of keys fmt.Println(f.Contains("foo")) fmt.Println(f.Contains("wat")) fmt.Println("marshaled data:", f.MarshalBase64()) // Output: // 24 // true // false // marshaled data: oU4AZAAAABQAAAAAAEIAABEAGAQAAgAgAAAwEAAJAAA= // a 20 cardinality 0.01 precision filter with "foo" and "bar" in it data := "oU4AZAAAABQAAAAAAEIAABEAGAQAAgAgAAAwEAAJAAA=" // load it from base64 f, err := UnmarshalBase64(data) if err != nil { panic(err) } // test it... fmt.Println(f.Contains("foo")) fmt.Println(f.Contains("wat")) fmt.Println(f.Len()) // dump to pure binary fmt.Printf("%x\n", f.Marshal()) // Output: // true // false // 24 // a14e006400000014000000000042000011001804000200200000301000090000
Java:
// The basics BloomFilter bf = new BloomFilter(20, 0.01); bf.add("foo"); bf.add("bar"); assertTrue(bf.contains("foo")); assertTrue(bf.contains("bar")); assertFalse(bf.contains("baz")); BloomFilter bf2 = new BloomFilter(bf.bf, bf.entries, bf.error); assertTrue(bf2.contains("foo")); assertTrue(bf2.contains("bar")); assertFalse(bf2.contains("baz")); // Serialization String serialized = BinAscii.hexlify(BloomFilter.dump(bf)); System.out.printf("Serialized: %s\n", serialized); String hexPayload = "620d006400000014000000000020001000080000000000002000100008000400"; BloomFilter deserialized = BloomFilter.load(BinAscii.unhexlify(hexPayload)); String dump = BinAscii.hexlify(BloomFilter.dump(deserialized)); System.out.printf("Re-Serialized: %s\n", dump); assertEquals(dump.toLowerCase(), hexPayload); assertEquals(deserialized.entries, 20); assertEquals(deserialized.error, 0.01); assertTrue(deserialized.contains("abc"));