Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Licensed to the Apache Software Foundation (ASF) under one or more
0003  * contributor license agreements.  See the NOTICE file distributed with
0004  * this work for additional information regarding copyright ownership.
0005  * The ASF licenses this file to You under the Apache License, Version 2.0
0006  * (the "License"); you may not use this file except in compliance with
0007  * the License.  You may obtain a copy of the License at
0008  *
0009  *    http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  */
0017 
0018 package org.apache.spark.util.sketch;
0019 
0020 import java.io.IOException;
0021 import java.io.InputStream;
0022 import java.io.OutputStream;
0023 
0024 /**
0025  * A Bloom filter is a space-efficient probabilistic data structure that offers an approximate
0026  * containment test with one-sided error: if it claims that an item is contained in it, this
0027  * might be in error, but if it claims that an item is <i>not</i> contained in it, then this is
0028  * definitely true. Currently supported data types include:
0029  * <ul>
0030  *   <li>{@link Byte}</li>
0031  *   <li>{@link Short}</li>
0032  *   <li>{@link Integer}</li>
0033  *   <li>{@link Long}</li>
0034  *   <li>{@link String}</li>
0035  * </ul>
0036  * The false positive probability ({@code FPP}) of a Bloom filter is defined as the probability that
0037  * {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that has
0038  * not actually been put in the {@code BloomFilter}.
0039  *
0040  * The implementation is largely based on the {@code BloomFilter} class from Guava.
0041  */
0042 public abstract class BloomFilter {
0043 
0044   public enum Version {
0045     /**
0046      * {@code BloomFilter} binary format version 1. All values written in big-endian order:
0047      * <ul>
0048      *   <li>Version number, always 1 (32 bit)</li>
0049      *   <li>Number of hash functions (32 bit)</li>
0050      *   <li>Total number of words of the underlying bit array (32 bit)</li>
0051      *   <li>The words/longs (numWords * 64 bit)</li>
0052      * </ul>
0053      */
0054     V1(1);
0055 
0056     private final int versionNumber;
0057 
0058     Version(int versionNumber) {
0059       this.versionNumber = versionNumber;
0060     }
0061 
0062     int getVersionNumber() {
0063       return versionNumber;
0064     }
0065   }
0066 
0067   /**
0068    * Returns the probability that {@linkplain #mightContain(Object)} erroneously return {@code true}
0069    * for an object that has not actually been put in the {@code BloomFilter}.
0070    *
0071    * Ideally, this number should be close to the {@code fpp} parameter passed in
0072    * {@linkplain #create(long, double)}, or smaller. If it is significantly higher, it is usually
0073    * the case that too many items (more than expected) have been put in the {@code BloomFilter},
0074    * degenerating it.
0075    */
0076   public abstract double expectedFpp();
0077 
0078   /**
0079    * Returns the number of bits in the underlying bit array.
0080    */
0081   public abstract long bitSize();
0082 
0083   /**
0084    * Puts an item into this {@code BloomFilter}. Ensures that subsequent invocations of
0085    * {@linkplain #mightContain(Object)} with the same item will always return {@code true}.
0086    *
0087    * @return true if the bloom filter's bits changed as a result of this operation. If the bits
0088    *     changed, this is <i>definitely</i> the first time {@code object} has been added to the
0089    *     filter. If the bits haven't changed, this <i>might</i> be the first time {@code object}
0090    *     has been added to the filter. Note that {@code put(t)} always returns the
0091    *     <i>opposite</i> result to what {@code mightContain(t)} would have returned at the time
0092    *     it is called.
0093    */
0094   public abstract boolean put(Object item);
0095 
0096   /**
0097    * A specialized variant of {@link #put(Object)} that only supports {@code String} items.
0098    */
0099   public abstract boolean putString(String item);
0100 
0101   /**
0102    * A specialized variant of {@link #put(Object)} that only supports {@code long} items.
0103    */
0104   public abstract boolean putLong(long item);
0105 
0106   /**
0107    * A specialized variant of {@link #put(Object)} that only supports byte array items.
0108    */
0109   public abstract boolean putBinary(byte[] item);
0110 
0111   /**
0112    * Determines whether a given bloom filter is compatible with this bloom filter. For two
0113    * bloom filters to be compatible, they must have the same bit size.
0114    *
0115    * @param other The bloom filter to check for compatibility.
0116    */
0117   public abstract boolean isCompatible(BloomFilter other);
0118 
0119   /**
0120    * Combines this bloom filter with another bloom filter by performing a bitwise OR of the
0121    * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the
0122    * bloom filters are appropriately sized to avoid saturating them.
0123    *
0124    * @param other The bloom filter to combine this bloom filter with. It is not mutated.
0125    * @throws IncompatibleMergeException if {@code isCompatible(other) == false}
0126    */
0127   public abstract BloomFilter mergeInPlace(BloomFilter other) throws IncompatibleMergeException;
0128 
0129   /**
0130    * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter,
0131    * {@code false} if this is <i>definitely</i> not the case.
0132    */
0133   public abstract boolean mightContain(Object item);
0134 
0135   /**
0136    * A specialized variant of {@link #mightContain(Object)} that only tests {@code String} items.
0137    */
0138   public abstract boolean mightContainString(String item);
0139 
0140   /**
0141    * A specialized variant of {@link #mightContain(Object)} that only tests {@code long} items.
0142    */
0143   public abstract boolean mightContainLong(long item);
0144 
0145   /**
0146    * A specialized variant of {@link #mightContain(Object)} that only tests byte array items.
0147    */
0148   public abstract boolean mightContainBinary(byte[] item);
0149 
0150   /**
0151    * Writes out this {@link BloomFilter} to an output stream in binary format. It is the caller's
0152    * responsibility to close the stream.
0153    */
0154   public abstract void writeTo(OutputStream out) throws IOException;
0155 
0156   /**
0157    * Reads in a {@link BloomFilter} from an input stream. It is the caller's responsibility to close
0158    * the stream.
0159    */
0160   public static BloomFilter readFrom(InputStream in) throws IOException {
0161     return BloomFilterImpl.readFrom(in);
0162   }
0163 
0164   /**
0165    * Computes the optimal k (number of hashes per item inserted in Bloom filter), given the
0166    * expected insertions and total number of bits in the Bloom filter.
0167    *
0168    * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
0169    *
0170    * @param n expected insertions (must be positive)
0171    * @param m total number of bits in Bloom filter (must be positive)
0172    */
0173   private static int optimalNumOfHashFunctions(long n, long m) {
0174     // (m / n) * log(2), but avoid truncation due to division!
0175     return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
0176   }
0177 
0178   /**
0179    * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
0180    * expected insertions, the required false positive probability.
0181    *
0182    * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
0183    *
0184    * @param n expected insertions (must be positive)
0185    * @param p false positive rate (must be 0 < p < 1)
0186    */
0187   private static long optimalNumOfBits(long n, double p) {
0188     return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
0189   }
0190 
0191   static final double DEFAULT_FPP = 0.03;
0192 
0193   /**
0194    * Creates a {@link BloomFilter} with the expected number of insertions and a default expected
0195    * false positive probability of 3%.
0196    *
0197    * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
0198    * will result in its saturation, and a sharp deterioration of its false positive probability.
0199    */
0200   public static BloomFilter create(long expectedNumItems) {
0201     return create(expectedNumItems, DEFAULT_FPP);
0202   }
0203 
0204   /**
0205    * Creates a {@link BloomFilter} with the expected number of insertions and expected false
0206    * positive probability.
0207    *
0208    * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
0209    * will result in its saturation, and a sharp deterioration of its false positive probability.
0210    */
0211   public static BloomFilter create(long expectedNumItems, double fpp) {
0212     if (fpp <= 0D || fpp >= 1D) {
0213       throw new IllegalArgumentException(
0214         "False positive probability must be within range (0.0, 1.0)"
0215       );
0216     }
0217 
0218     return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
0219   }
0220 
0221   /**
0222    * Creates a {@link BloomFilter} with given {@code expectedNumItems} and {@code numBits}, it will
0223    * pick an optimal {@code numHashFunctions} which can minimize {@code fpp} for the bloom filter.
0224    */
0225   public static BloomFilter create(long expectedNumItems, long numBits) {
0226     if (expectedNumItems <= 0) {
0227       throw new IllegalArgumentException("Expected insertions must be positive");
0228     }
0229 
0230     if (numBits <= 0) {
0231       throw new IllegalArgumentException("Number of bits must be positive");
0232     }
0233 
0234     return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
0235   }
0236 }