Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Licensed to the Apache Software Foundation (ASF) under one or more
0003  * contributor license agreements.  See the NOTICE file distributed with
0004  * this work for additional information regarding copyright ownership.
0005  * The ASF licenses this file to You under the Apache License, Version 2.0
0006  * (the "License"); you may not use this file except in compliance with
0007  * the License.  You may obtain a copy of the License at
0008  *
0009  *    http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  */
0017 
0018 package org.apache.spark.util.sketch;
0019 
0020 import java.io.ByteArrayInputStream;
0021 import java.io.IOException;
0022 import java.io.InputStream;
0023 import java.io.OutputStream;
0024 
0025 /**
0026  * A Count-min sketch is a probabilistic data structure used for cardinality estimation using
0027  * sub-linear space.  Currently, supported data types include:
0028  * <ul>
0029  *   <li>{@link Byte}</li>
0030  *   <li>{@link Short}</li>
0031  *   <li>{@link Integer}</li>
0032  *   <li>{@link Long}</li>
0033  *   <li>{@link String}</li>
0034  * </ul>
0035  * A {@link CountMinSketch} is initialized with a random seed, and a pair of parameters:
0036  * <ol>
0037  *   <li>relative error (or {@code eps}), and
0038  *   <li>confidence (or {@code delta})
0039  * </ol>
0040  * Suppose you want to estimate the number of times an element {@code x} has appeared in a data
0041  * stream so far.  With probability {@code delta}, the estimate of this frequency is within the
0042  * range {@code true frequency <= estimate <= true frequency + eps * N}, where {@code N} is the
0043  * total count of items have appeared the data stream so far.
0044  *
0045  * Under the cover, a {@link CountMinSketch} is essentially a two-dimensional {@code long} array
0046  * with depth {@code d} and width {@code w}, where
0047  * <ul>
0048  *   <li>{@code d = ceil(2 / eps)}</li>
0049  *   <li>{@code w = ceil(-log(1 - confidence) / log(2))}</li>
0050  * </ul>
0051  *
0052  * This implementation is largely based on the {@code CountMinSketch} class from stream-lib.
0053  */
0054 public abstract class CountMinSketch {
0055 
0056   public enum Version {
0057     /**
0058      * {@code CountMinSketch} binary format version 1.  All values written in big-endian order:
0059      * <ul>
0060      *   <li>Version number, always 1 (32 bit)</li>
0061      *   <li>Total count of added items (64 bit)</li>
0062      *   <li>Depth (32 bit)</li>
0063      *   <li>Width (32 bit)</li>
0064      *   <li>Hash functions (depth * 64 bit)</li>
0065      *   <li>
0066      *     Count table
0067      *     <ul>
0068      *       <li>Row 0 (width * 64 bit)</li>
0069      *       <li>Row 1 (width * 64 bit)</li>
0070      *       <li>...</li>
0071      *       <li>Row {@code depth - 1} (width * 64 bit)</li>
0072      *     </ul>
0073      *   </li>
0074      * </ul>
0075      */
0076     V1(1);
0077 
0078     private final int versionNumber;
0079 
0080     Version(int versionNumber) {
0081       this.versionNumber = versionNumber;
0082     }
0083 
0084     int getVersionNumber() {
0085       return versionNumber;
0086     }
0087   }
0088 
0089   /**
0090    * Returns the relative error (or {@code eps}) of this {@link CountMinSketch}.
0091    */
0092   public abstract double relativeError();
0093 
0094   /**
0095    * Returns the confidence (or {@code delta}) of this {@link CountMinSketch}.
0096    */
0097   public abstract double confidence();
0098 
0099   /**
0100    * Depth of this {@link CountMinSketch}.
0101    */
0102   public abstract int depth();
0103 
0104   /**
0105    * Width of this {@link CountMinSketch}.
0106    */
0107   public abstract int width();
0108 
0109   /**
0110    * Total count of items added to this {@link CountMinSketch} so far.
0111    */
0112   public abstract long totalCount();
0113 
0114   /**
0115    * Increments {@code item}'s count by one.
0116    */
0117   public abstract void add(Object item);
0118 
0119   /**
0120    * Increments {@code item}'s count by {@code count}.
0121    */
0122   public abstract void add(Object item, long count);
0123 
0124   /**
0125    * Increments {@code item}'s count by one.
0126    */
0127   public abstract void addLong(long item);
0128 
0129   /**
0130    * Increments {@code item}'s count by {@code count}.
0131    */
0132   public abstract void addLong(long item, long count);
0133 
0134   /**
0135    * Increments {@code item}'s count by one.
0136    */
0137   public abstract void addString(String item);
0138 
0139   /**
0140    * Increments {@code item}'s count by {@code count}.
0141    */
0142   public abstract void addString(String item, long count);
0143 
0144   /**
0145    * Increments {@code item}'s count by one.
0146    */
0147   public abstract void addBinary(byte[] item);
0148 
0149   /**
0150    * Increments {@code item}'s count by {@code count}.
0151    */
0152   public abstract void addBinary(byte[] item, long count);
0153 
0154   /**
0155    * Returns the estimated frequency of {@code item}.
0156    */
0157   public abstract long estimateCount(Object item);
0158 
0159   /**
0160    * Merges another {@link CountMinSketch} with this one in place.
0161    *
0162    * Note that only Count-Min sketches with the same {@code depth}, {@code width}, and random seed
0163    * can be merged.
0164    *
0165    * @exception IncompatibleMergeException if the {@code other} {@link CountMinSketch} has
0166    *            incompatible depth, width, relative-error, confidence, or random seed.
0167    */
0168   public abstract CountMinSketch mergeInPlace(CountMinSketch other)
0169       throws IncompatibleMergeException;
0170 
0171   /**
0172    * Writes out this {@link CountMinSketch} to an output stream in binary format. It is the caller's
0173    * responsibility to close the stream.
0174    */
0175   public abstract void writeTo(OutputStream out) throws IOException;
0176 
0177   /**
0178    * Serializes this {@link CountMinSketch} and returns the serialized form.
0179    */
0180   public abstract byte[] toByteArray() throws IOException;
0181 
0182   /**
0183    * Reads in a {@link CountMinSketch} from an input stream. It is the caller's responsibility to
0184    * close the stream.
0185    */
0186   public static CountMinSketch readFrom(InputStream in) throws IOException {
0187     return CountMinSketchImpl.readFrom(in);
0188   }
0189 
0190   /**
0191    * Reads in a {@link CountMinSketch} from a byte array.
0192    */
0193   public static CountMinSketch readFrom(byte[] bytes) throws IOException {
0194     try (InputStream in = new ByteArrayInputStream(bytes)) {
0195       return readFrom(in);
0196     }
0197   }
0198 
0199   /**
0200    * Creates a {@link CountMinSketch} with given {@code depth}, {@code width}, and random
0201    * {@code seed}.
0202    *
0203    * @param depth depth of the Count-min Sketch, must be positive
0204    * @param width width of the Count-min Sketch, must be positive
0205    * @param seed random seed
0206    */
0207   public static CountMinSketch create(int depth, int width, int seed) {
0208     return new CountMinSketchImpl(depth, width, seed);
0209   }
0210 
0211   /**
0212    * Creates a {@link CountMinSketch} with given relative error ({@code eps}), {@code confidence},
0213    * and random {@code seed}.
0214    *
0215    * @param eps relative error, must be positive
0216    * @param confidence confidence, must be positive and less than 1.0
0217    * @param seed random seed
0218    */
0219   public static CountMinSketch create(double eps, double confidence, int seed) {
0220     return new CountMinSketchImpl(eps, confidence, seed);
0221   }
0222 }