/**
 * Fills the given store file writer with {@code NUM_KV} randomly generated
 * key-values. Each random ordered key is split into a fixed-size row prefix,
 * a random-length column family, and the remaining bytes as the qualifier.
 */
private void writeStoreFile(StoreFile.Writer writer) throws IOException {
  final int rowLength = 32;
  for (int keyIndex = 0; keyIndex < NUM_KV; ++keyIndex) {
    byte[] keyBytes = TestHFileWriterV2.randomOrderedKey(rand, keyIndex);
    byte[] valueBytes = TestHFileWriterV2.randomValue(rand);
    // The family takes a random share of the bytes after the row; the
    // qualifier gets whatever is left over.
    int familyLength = rand.nextInt(keyBytes.length - rowLength + 1);
    int qualifierOffset = rowLength + familyLength;
    int qualifierLength = keyBytes.length - rowLength - familyLength;
    // Draw timestamp before key type to keep the Random sequence identical.
    long timestamp = rand.nextLong();
    writer.append(new KeyValue(
        keyBytes, 0, rowLength,
        keyBytes, rowLength, familyLength,
        keyBytes, qualifierOffset, qualifierLength,
        timestamp, generateKeyType(rand),
        valueBytes, 0, valueBytes.length));
  }
}
/**
 * Generates {@code n} random key-values and returns them sorted by
 * {@link KeyValue#COMPARATOR}.
 *
 * @param rand source of randomness for the generated key-values
 * @param n    number of key-values to create
 * @return a sorted list of {@code n} random key-values
 */
private List<KeyValue> createSortedKeyValues(Random rand, int n) {
  List<KeyValue> result = new ArrayList<KeyValue>(n);
  for (int remaining = n; remaining > 0; --remaining) {
    result.add(TestHFileWriterV2.randomKeyValue(rand));
  }
  Collections.sort(result, KeyValue.COMPARATOR);
  return result;
}
/**
 * Asks the scanner's Bloom filter whether the given row/qualifier pair might
 * be present, by issuing a single-row Scan against a mocked Store for the
 * test column family.
 */
private boolean isInBloom(StoreFileScanner scanner, byte[] row, byte[] qualifier) {
  byte[] family = Bytes.toBytes(TestHFileWriterV2.COLUMN_FAMILY_NAME);
  Scan singleRowScan = new Scan(row, row);
  singleRowScan.addColumn(family, qualifier);
  // Mock just enough of a Store for shouldUseScanner() to resolve the family.
  HColumnDescriptor columnDescriptor = mock(HColumnDescriptor.class);
  when(columnDescriptor.getName()).thenReturn(family);
  Store mockedStore = mock(Store.class);
  when(mockedStore.getFamily()).thenReturn(columnDescriptor);
  // Long.MIN_VALUE presumably disables TTL-based filtering — TODO confirm.
  return scanner.shouldUseScanner(singleRowScan, mockedStore, Long.MIN_VALUE);
}
/**
 * Opens the store file written for test case {@code t} with Bloom type
 * {@code bt} and verifies its compound Bloom filter: no false negatives for
 * the key-values actually written, and a false-positive rate within
 * statistical bounds for random queries (checked both with and without
 * "fake lookup" mode).
 *
 * NOTE(review): this file appears to contain a second copy of this method
 * (same name and signature, differing only in the StoreFile constructor) —
 * looks like a merge/diff artifact; the class cannot compile with both.
 *
 * @param t      index into the per-test-case parameter arrays (e.g. NUM_KV)
 * @param bt     the Bloom filter type the store file was written with
 * @param kvs    the key-values that were written to the store file
 * @param sfPath path of the store file to read back
 */
private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs,
    Path sfPath) throws IOException {
  StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt);
  StoreFile.Reader r = sf.createReader();
  final boolean pread = true; // does not really matter
  StoreFileScanner scanner = r.getStoreFileScanner(true, pread);

  {
    // Test for false negatives (not allowed).
    int numChecked = 0;
    for (KeyValue kv : kvs) {
      byte[] row = kv.getRow();
      boolean present = isInBloom(scanner, row, kv.getQualifier());
      assertTrue(testIdMsg + " Bloom filter false negative on row "
          + Bytes.toStringBinary(row) + " after " + numChecked
          + " successful checks", present);
      ++numChecked;
    }
  }

  // Test for false positives (some percentage allowed). We test in two modes:
  // "fake lookup" which ignores the key distribution, and production mode.
  for (boolean fakeLookupEnabled : new boolean[] { true, false }) {
    ByteBloomFilter.setFakeLookupMode(fakeLookupEnabled);
    try {
      String fakeLookupModeStr = ", fake lookup is "
          + (fakeLookupEnabled ? "enabled" : "disabled");
      CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter();
      cbf.enableTestingStats();
      int numFalsePos = 0;
      // Fixed seed so the query sequence (and thus the measured rate) is
      // reproducible across runs.
      Random rand = new Random(EVALUATION_SEED);
      int nTrials = NUM_KV[t] * 10;
      for (int i = 0; i < nTrials; ++i) {
        byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand);
        if (isInBloom(scanner, query, bt, rand)) {
          numFalsePos += 1;
        }
      }
      double falsePosRate = numFalsePos * 1.0 / nTrials;
      LOG.debug(String.format(testIdMsg
          + " False positives: %d out of %d (%f)",
          numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr);

      // Check for obvious Bloom filter crashes.
      assertTrue("False positive is too high: " + falsePosRate + " (greater "
          + "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr,
          falsePosRate < TOO_HIGH_ERROR_RATE);

      // Now a more precise check to see if the false positive rate is not
      // too high. The reason we use a relaxed restriction for the real-world
      // case as opposed to the "fake lookup" case is that our hash functions
      // are not completely independent.
      double maxZValue = fakeLookupEnabled ? 1.96 : 2.5;
      validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf,
          fakeLookupModeStr);

      // For checking the lower bound we need to eliminate the last chunk,
      // because it is frequently smaller and the false positive rate in it
      // is too low. This does not help if there is only one under-sized
      // chunk, though.
      int nChunks = cbf.getNumChunks();
      if (nChunks > 1) {
        numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1);
        nTrials -= cbf.getNumQueriesForTesting(nChunks - 1);
        falsePosRate = numFalsePos * 1.0 / nTrials;
        LOG.info(testIdMsg + " False positive rate without last chunk is "
            + falsePosRate + fakeLookupModeStr);
      }
      // Lower-bound check: a rate too far BELOW expectation (z < -2.58)
      // would suggest the filter is over-sized or mis-measured.
      validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf,
          fakeLookupModeStr);
    } finally {
      // Always restore production lookup mode, even on assertion failure.
      ByteBloomFilter.setFakeLookupMode(false);
    }
  }

  r.close(true); // end of test so evictOnClose
}
/**
 * Bloom membership check used for false-positive sampling: pairs the given
 * row with a freshly generated random qualifier and delegates to the
 * row/qualifier overload.
 *
 * NOTE(review): the {@code bt} parameter is unused here; it is kept so the
 * signature matches the call sites.
 */
private boolean isInBloom(StoreFileScanner scanner, byte[] row, BloomType bt,
    Random rand) {
  byte[] randomQualifier = TestHFileWriterV2.randomRowOrQualifier(rand);
  return isInBloom(scanner, row, randomQualifier);
}
/**
 * Opens the store file written for test case {@code t} with Bloom type
 * {@code bt} (using a no-op data block encoder) and verifies its compound
 * Bloom filter: no false negatives for the key-values actually written, and
 * a false-positive rate within statistical bounds for random queries
 * (checked both with and without "fake lookup" mode).
 *
 * NOTE(review): this file appears to contain another copy of this method
 * (same name and signature, without NoOpDataBlockEncoder in the StoreFile
 * constructor) — looks like a merge/diff artifact; the class cannot compile
 * with both.
 *
 * @param t      index into the per-test-case parameter arrays (e.g. NUM_KV)
 * @param bt     the Bloom filter type the store file was written with
 * @param kvs    the key-values that were written to the store file
 * @param sfPath path of the store file to read back
 */
private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs,
    Path sfPath) throws IOException {
  StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt,
      NoOpDataBlockEncoder.INSTANCE);
  StoreFile.Reader r = sf.createReader();
  final boolean pread = true; // does not really matter
  StoreFileScanner scanner = r.getStoreFileScanner(true, pread);

  {
    // Test for false negatives (not allowed).
    int numChecked = 0;
    for (KeyValue kv : kvs) {
      byte[] row = kv.getRow();
      boolean present = isInBloom(scanner, row, kv.getQualifier());
      assertTrue(testIdMsg + " Bloom filter false negative on row "
          + Bytes.toStringBinary(row) + " after " + numChecked
          + " successful checks", present);
      ++numChecked;
    }
  }

  // Test for false positives (some percentage allowed). We test in two modes:
  // "fake lookup" which ignores the key distribution, and production mode.
  for (boolean fakeLookupEnabled : new boolean[] { true, false }) {
    ByteBloomFilter.setFakeLookupMode(fakeLookupEnabled);
    try {
      String fakeLookupModeStr = ", fake lookup is "
          + (fakeLookupEnabled ? "enabled" : "disabled");
      CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter();
      cbf.enableTestingStats();
      int numFalsePos = 0;
      // Fixed seed so the query sequence (and thus the measured rate) is
      // reproducible across runs.
      Random rand = new Random(EVALUATION_SEED);
      int nTrials = NUM_KV[t] * 10;
      for (int i = 0; i < nTrials; ++i) {
        byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand);
        if (isInBloom(scanner, query, bt, rand)) {
          numFalsePos += 1;
        }
      }
      double falsePosRate = numFalsePos * 1.0 / nTrials;
      LOG.debug(String.format(testIdMsg
          + " False positives: %d out of %d (%f)",
          numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr);

      // Check for obvious Bloom filter crashes.
      assertTrue("False positive is too high: " + falsePosRate + " (greater "
          + "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr,
          falsePosRate < TOO_HIGH_ERROR_RATE);

      // Now a more precise check to see if the false positive rate is not
      // too high. The reason we use a relaxed restriction for the real-world
      // case as opposed to the "fake lookup" case is that our hash functions
      // are not completely independent.
      double maxZValue = fakeLookupEnabled ? 1.96 : 2.5;
      validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf,
          fakeLookupModeStr);

      // For checking the lower bound we need to eliminate the last chunk,
      // because it is frequently smaller and the false positive rate in it
      // is too low. This does not help if there is only one under-sized
      // chunk, though.
      int nChunks = cbf.getNumChunks();
      if (nChunks > 1) {
        numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1);
        nTrials -= cbf.getNumQueriesForTesting(nChunks - 1);
        falsePosRate = numFalsePos * 1.0 / nTrials;
        LOG.info(testIdMsg + " False positive rate without last chunk is "
            + falsePosRate + fakeLookupModeStr);
      }
      // Lower-bound check: a rate too far BELOW expectation (z < -2.58)
      // would suggest the filter is over-sized or mis-measured.
      validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf,
          fakeLookupModeStr);
    } finally {
      // Always restore production lookup mode, even on assertion failure.
      ByteBloomFilter.setFakeLookupMode(false);
    }
  }

  r.close(true); // end of test so evictOnClose
}