|
||||
0001 /* 0002 * Licensed to the Apache Software Foundation (ASF) under one or more 0003 * contributor license agreements. See the NOTICE file distributed with 0004 * this work for additional information regarding copyright ownership. 0005 * The ASF licenses this file to You under the Apache License, Version 2.0 0006 * (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * 0009 * http://www.apache.org/licenses/LICENSE-2.0 0010 * 0011 * Unless required by applicable law or agreed to in writing, software 0012 * distributed under the License is distributed on an "AS IS" BASIS, 0013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 0014 * See the License for the specific language governing permissions and 0015 * limitations under the License. 0016 */ 0017 0018 package org.apache.spark.util.sketch; 0019 0020 import java.io.IOException; 0021 import java.io.InputStream; 0022 import java.io.OutputStream; 0023 0024 /** 0025 * A Bloom filter is a space-efficient probabilistic data structure that offers an approximate 0026 * containment test with one-sided error: if it claims that an item is contained in it, this 0027 * might be in error, but if it claims that an item is <i>not</i> contained in it, then this is 0028 * definitely true. Currently supported data types include: 0029 * <ul> 0030 * <li>{@link Byte}</li> 0031 * <li>{@link Short}</li> 0032 * <li>{@link Integer}</li> 0033 * <li>{@link Long}</li> 0034 * <li>{@link String}</li> 0035 * </ul> 0036 * The false positive probability ({@code FPP}) of a Bloom filter is defined as the probability that 0037 * {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that has 0038 * not actually been put in the {@code BloomFilter}. 0039 * 0040 * The implementation is largely based on the {@code BloomFilter} class from Guava. 0041 */ 0042 public abstract class BloomFilter { 0043 0044 public enum Version { 0045 /** 0046 * {@code BloomFilter} binary format version 1. All values written in big-endian order: 0047 * <ul> 0048 * <li>Version number, always 1 (32 bit)</li> 0049 * <li>Number of hash functions (32 bit)</li> 0050 * <li>Total number of words of the underlying bit array (32 bit)</li> 0051 * <li>The words/longs (numWords * 64 bit)</li> 0052 * </ul> 0053 */ 0054 V1(1); 0055 0056 private final int versionNumber; 0057 0058 Version(int versionNumber) { 0059 this.versionNumber = versionNumber; 0060 } 0061 0062 int getVersionNumber() { 0063 return versionNumber; 0064 } 0065 } 0066 0067 /** 0068 * Returns the probability that {@linkplain #mightContain(Object)} erroneously return {@code true} 0069 * for an object that has not actually been put in the {@code BloomFilter}. 0070 * 0071 * Ideally, this number should be close to the {@code fpp} parameter passed in 0072 * {@linkplain #create(long, double)}, or smaller. If it is significantly higher, it is usually 0073 * the case that too many items (more than expected) have been put in the {@code BloomFilter}, 0074 * degenerating it. 0075 */ 0076 public abstract double expectedFpp(); 0077 0078 /** 0079 * Returns the number of bits in the underlying bit array. 0080 */ 0081 public abstract long bitSize(); 0082 0083 /** 0084 * Puts an item into this {@code BloomFilter}. Ensures that subsequent invocations of 0085 * {@linkplain #mightContain(Object)} with the same item will always return {@code true}. 0086 * 0087 * @return true if the bloom filter's bits changed as a result of this operation. If the bits 0088 * changed, this is <i>definitely</i> the first time {@code object} has been added to the 0089 * filter. If the bits haven't changed, this <i>might</i> be the first time {@code object} 0090 * has been added to the filter. Note that {@code put(t)} always returns the 0091 * <i>opposite</i> result to what {@code mightContain(t)} would have returned at the time 0092 * it is called. 0093 */ 0094 public abstract boolean put(Object item); 0095 0096 /** 0097 * A specialized variant of {@link #put(Object)} that only supports {@code String} items. 0098 */ 0099 public abstract boolean putString(String item); 0100 0101 /** 0102 * A specialized variant of {@link #put(Object)} that only supports {@code long} items. 0103 */ 0104 public abstract boolean putLong(long item); 0105 0106 /** 0107 * A specialized variant of {@link #put(Object)} that only supports byte array items. 0108 */ 0109 public abstract boolean putBinary(byte[] item); 0110 0111 /** 0112 * Determines whether a given bloom filter is compatible with this bloom filter. For two 0113 * bloom filters to be compatible, they must have the same bit size. 0114 * 0115 * @param other The bloom filter to check for compatibility. 0116 */ 0117 public abstract boolean isCompatible(BloomFilter other); 0118 0119 /** 0120 * Combines this bloom filter with another bloom filter by performing a bitwise OR of the 0121 * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the 0122 * bloom filters are appropriately sized to avoid saturating them. 0123 * 0124 * @param other The bloom filter to combine this bloom filter with. It is not mutated. 0125 * @throws IncompatibleMergeException if {@code isCompatible(other) == false} 0126 */ 0127 public abstract BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeException; 0128 0129 /** 0130 * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter, 0131 * {@code false} if this is <i>definitely</i> not the case. 0132 */ 0133 public abstract boolean mightContain(Object item); 0134 0135 /** 0136 * A specialized variant of {@link #mightContain(Object)} that only tests {@code String} items. 0137 */ 0138 public abstract boolean mightContainString(String item); 0139 0140 /** 0141 * A specialized variant of {@link #mightContain(Object)} that only tests {@code long} items. 0142 */ 0143 public abstract boolean mightContainLong(long item); 0144 0145 /** 0146 * A specialized variant of {@link #mightContain(Object)} that only tests byte array items. 0147 */ 0148 public abstract boolean mightContainBinary(byte[] item); 0149 0150 /** 0151 * Writes out this {@link BloomFilter} to an output stream in binary format. It is the caller's 0152 * responsibility to close the stream. 0153 */ 0154 public abstract void writeTo(OutputStream out) throws IOException; 0155 0156 /** 0157 * Reads in a {@link BloomFilter} from an input stream. It is the caller's responsibility to close 0158 * the stream. 0159 */ 0160 public static BloomFilter readFrom(InputStream in) throws IOException { 0161 return BloomFilterImpl.readFrom(in); 0162 } 0163 0164 /** 0165 * Computes the optimal k (number of hashes per item inserted in Bloom filter), given the 0166 * expected insertions and total number of bits in the Bloom filter. 0167 * 0168 * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula. 0169 * 0170 * @param n expected insertions (must be positive) 0171 * @param m total number of bits in Bloom filter (must be positive) 0172 */ 0173 private static int optimalNumOfHashFunctions(long n, long m) { 0174 // (m / n) * log(2), but avoid truncation due to division! 0175 return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); 0176 } 0177 0178 /** 0179 * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified 0180 * expected insertions, the required false positive probability. 0181 * 0182 * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula. 0183 * 0184 * @param n expected insertions (must be positive) 0185 * @param p false positive rate (must be 0 < p < 1) 0186 */ 0187 private static long optimalNumOfBits(long n, double p) { 0188 return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); 0189 } 0190 0191 static final double DEFAULT_FPP = 0.03; 0192 0193 /** 0194 * Creates a {@link BloomFilter} with the expected number of insertions and a default expected 0195 * false positive probability of 3%. 0196 * 0197 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified, 0198 * will result in its saturation, and a sharp deterioration of its false positive probability. 0199 */ 0200 public static BloomFilter create(long expectedNumItems) { 0201 return create(expectedNumItems, DEFAULT_FPP); 0202 } 0203 0204 /** 0205 * Creates a {@link BloomFilter} with the expected number of insertions and expected false 0206 * positive probability. 0207 * 0208 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified, 0209 * will result in its saturation, and a sharp deterioration of its false positive probability. 0210 */ 0211 public static BloomFilter create(long expectedNumItems, double fpp) { 0212 if (fpp <= 0D || fpp >= 1D) { 0213 throw new IllegalArgumentException( 0214 "False positive probability must be within range (0.0, 1.0)" 0215 ); 0216 } 0217 0218 return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp)); 0219 } 0220 0221 /** 0222 * Creates a {@link BloomFilter} with given {@code expectedNumItems} and {@code numBits}, it will 0223 * pick an optimal {@code numHashFunctions} which can minimize {@code fpp} for the bloom filter. 0224 */ 0225 public static BloomFilter create(long expectedNumItems, long numBits) { 0226 if (expectedNumItems <= 0) { 0227 throw new IllegalArgumentException("Expected insertions must be positive"); 0228 } 0229 0230 if (numBits <= 0) { 0231 throw new IllegalArgumentException("Number of bits must be positive"); 0232 } 0233 0234 return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits); 0235 } 0236 }
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.1.0 LXR engine. The LXR team |