|
||||
0001 /* 0002 * Licensed to the Apache Software Foundation (ASF) under one or more 0003 * contributor license agreements. See the NOTICE file distributed with 0004 * this work for additional information regarding copyright ownership. 0005 * The ASF licenses this file to You under the Apache License, Version 2.0 0006 * (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * 0009 * http://www.apache.org/licenses/LICENSE-2.0 0010 * 0011 * Unless required by applicable law or agreed to in writing, software 0012 * distributed under the License is distributed on an "AS IS" BASIS, 0013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 0014 * See the License for the specific language governing permissions and 0015 * limitations under the License. 0016 */ 0017 0018 package org.apache.spark.util.sketch; 0019 0020 import java.io.ByteArrayInputStream; 0021 import java.io.IOException; 0022 import java.io.InputStream; 0023 import java.io.OutputStream; 0024 0025 /** 0026 * A Count-min sketch is a probabilistic data structure used for cardinality estimation using 0027 * sub-linear space. Currently, supported data types include: 0028 * <ul> 0029 * <li>{@link Byte}</li> 0030 * <li>{@link Short}</li> 0031 * <li>{@link Integer}</li> 0032 * <li>{@link Long}</li> 0033 * <li>{@link String}</li> 0034 * </ul> 0035 * A {@link CountMinSketch} is initialized with a random seed, and a pair of parameters: 0036 * <ol> 0037 * <li>relative error (or {@code eps}), and 0038 * <li>confidence (or {@code delta}) 0039 * </ol> 0040 * Suppose you want to estimate the number of times an element {@code x} has appeared in a data 0041 * stream so far. With probability {@code delta}, the estimate of this frequency is within the 0042 * range {@code true frequency <= estimate <= true frequency + eps * N}, where {@code N} is the 0043 * total count of items have appeared the data stream so far. 0044 * 0045 * Under the cover, a {@link CountMinSketch} is essentially a two-dimensional {@code long} array 0046 * with depth {@code d} and width {@code w}, where 0047 * <ul> 0048 * <li>{@code d = ceil(2 / eps)}</li> 0049 * <li>{@code w = ceil(-log(1 - confidence) / log(2))}</li> 0050 * </ul> 0051 * 0052 * This implementation is largely based on the {@code CountMinSketch} class from stream-lib. 0053 */ 0054 public abstract class CountMinSketch { 0055 0056 public enum Version { 0057 /** 0058 * {@code CountMinSketch} binary format version 1. All values written in big-endian order: 0059 * <ul> 0060 * <li>Version number, always 1 (32 bit)</li> 0061 * <li>Total count of added items (64 bit)</li> 0062 * <li>Depth (32 bit)</li> 0063 * <li>Width (32 bit)</li> 0064 * <li>Hash functions (depth * 64 bit)</li> 0065 * <li> 0066 * Count table 0067 * <ul> 0068 * <li>Row 0 (width * 64 bit)</li> 0069 * <li>Row 1 (width * 64 bit)</li> 0070 * <li>...</li> 0071 * <li>Row {@code depth - 1} (width * 64 bit)</li> 0072 * </ul> 0073 * </li> 0074 * </ul> 0075 */ 0076 V1(1); 0077 0078 private final int versionNumber; 0079 0080 Version(int versionNumber) { 0081 this.versionNumber = versionNumber; 0082 } 0083 0084 int getVersionNumber() { 0085 return versionNumber; 0086 } 0087 } 0088 0089 /** 0090 * Returns the relative error (or {@code eps}) of this {@link CountMinSketch}. 0091 */ 0092 public abstract double relativeError(); 0093 0094 /** 0095 * Returns the confidence (or {@code delta}) of this {@link CountMinSketch}. 0096 */ 0097 public abstract double confidence(); 0098 0099 /** 0100 * Depth of this {@link CountMinSketch}. 0101 */ 0102 public abstract int depth(); 0103 0104 /** 0105 * Width of this {@link CountMinSketch}. 0106 */ 0107 public abstract int width(); 0108 0109 /** 0110 * Total count of items added to this {@link CountMinSketch} so far. 0111 */ 0112 public abstract long totalCount(); 0113 0114 /** 0115 * Increments {@code item}'s count by one. 0116 */ 0117 public abstract void add(Object item); 0118 0119 /** 0120 * Increments {@code item}'s count by {@code count}. 0121 */ 0122 public abstract void add(Object item, long count); 0123 0124 /** 0125 * Increments {@code item}'s count by one. 0126 */ 0127 public abstract void addLong(long item); 0128 0129 /** 0130 * Increments {@code item}'s count by {@code count}. 0131 */ 0132 public abstract void addLong(long item, long count); 0133 0134 /** 0135 * Increments {@code item}'s count by one. 0136 */ 0137 public abstract void addString(String item); 0138 0139 /** 0140 * Increments {@code item}'s count by {@code count}. 0141 */ 0142 public abstract void addString(String item, long count); 0143 0144 /** 0145 * Increments {@code item}'s count by one. 0146 */ 0147 public abstract void addBinary(byte[] item); 0148 0149 /** 0150 * Increments {@code item}'s count by {@code count}. 0151 */ 0152 public abstract void addBinary(byte[] item, long count); 0153 0154 /** 0155 * Returns the estimated frequency of {@code item}. 0156 */ 0157 public abstract long estimateCount(Object item); 0158 0159 /** 0160 * Merges another {@link CountMinSketch} with this one in place. 0161 * 0162 * Note that only Count-Min sketches with the same {@code depth}, {@code width}, and random seed 0163 * can be merged. 0164 * 0165 * @exception IncompatibleMergeException if the {@code other} {@link CountMinSketch} has 0166 * incompatible depth, width, relative-error, confidence, or random seed. 0167 */ 0168 public abstract CountMinSketch mergeInPlace(CountMinSketch other) 0169 throws IncompatibleMergeException; 0170 0171 /** 0172 * Writes out this {@link CountMinSketch} to an output stream in binary format. It is the caller's 0173 * responsibility to close the stream. 0174 */ 0175 public abstract void writeTo(OutputStream out) throws IOException; 0176 0177 /** 0178 * Serializes this {@link CountMinSketch} and returns the serialized form. 0179 */ 0180 public abstract byte[] toByteArray() throws IOException; 0181 0182 /** 0183 * Reads in a {@link CountMinSketch} from an input stream. It is the caller's responsibility to 0184 * close the stream. 0185 */ 0186 public static CountMinSketch readFrom(InputStream in) throws IOException { 0187 return CountMinSketchImpl.readFrom(in); 0188 } 0189 0190 /** 0191 * Reads in a {@link CountMinSketch} from a byte array. 0192 */ 0193 public static CountMinSketch readFrom(byte[] bytes) throws IOException { 0194 try (InputStream in = new ByteArrayInputStream(bytes)) { 0195 return readFrom(in); 0196 } 0197 } 0198 0199 /** 0200 * Creates a {@link CountMinSketch} with given {@code depth}, {@code width}, and random 0201 * {@code seed}. 0202 * 0203 * @param depth depth of the Count-min Sketch, must be positive 0204 * @param width width of the Count-min Sketch, must be positive 0205 * @param seed random seed 0206 */ 0207 public static CountMinSketch create(int depth, int width, int seed) { 0208 return new CountMinSketchImpl(depth, width, seed); 0209 } 0210 0211 /** 0212 * Creates a {@link CountMinSketch} with given relative error ({@code eps}), {@code confidence}, 0213 * and random {@code seed}. 0214 * 0215 * @param eps relative error, must be positive 0216 * @param confidence confidence, must be positive and less than 1.0 0217 * @param seed random seed 0218 */ 0219 public static CountMinSketch create(double eps, double confidence, int seed) { 0220 return new CountMinSketchImpl(eps, confidence, seed); 0221 } 0222 }
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.1.0 LXR engine. The LXR team |