|
||||
0001 /* 0002 * Licensed to the Apache Software Foundation (ASF) under one or more 0003 * contributor license agreements. See the NOTICE file distributed with 0004 * this work for additional information regarding copyright ownership. 0005 * The ASF licenses this file to You under the Apache License, Version 2.0 0006 * (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * 0009 * http://www.apache.org/licenses/LICENSE-2.0 0010 * 0011 * Unless required by applicable law or agreed to in writing, software 0012 * distributed under the License is distributed on an "AS IS" BASIS, 0013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 0014 * See the License for the specific language governing permissions and 0015 * limitations under the License. 0016 */ 0017 package org.apache.spark.sql.vectorized; 0018 0019 import org.apache.spark.annotation.Evolving; 0020 import org.apache.spark.sql.types.DataType; 0021 import org.apache.spark.sql.types.Decimal; 0022 import org.apache.spark.unsafe.types.CalendarInterval; 0023 import org.apache.spark.unsafe.types.UTF8String; 0024 0025 /** 0026 * An interface representing in-memory columnar data in Spark. This interface defines the main APIs 0027 * to access the data, as well as their batched versions. The batched versions are considered to be 0028 * faster and preferable whenever possible. 0029 * 0030 * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values 0031 * in this ColumnVector. 0032 * 0033 * Spark only calls specific `get` method according to the data type of this {@link ColumnVector}, 0034 * e.g. if it's int type, Spark is guaranteed to only call {@link #getInt(int)} or 0035 * {@link #getInts(int, int)}. 0036 * 0037 * ColumnVector supports all the data types including nested types. To handle nested types, 0038 * ColumnVector can have children and is a tree structure. 
Please refer to {@link #getStruct(int)}, 0039 * {@link #getArray(int)} and {@link #getMap(int)} for the details about how to implement nested 0040 * types. 0041 * 0042 * ColumnVector is expected to be reused during the entire data loading process, to avoid allocating 0043 * memory again and again. 0044 * 0045 * ColumnVector is meant to maximize CPU efficiency but not to minimize storage footprint. 0046 * Implementations should prefer computing efficiency over storage efficiency when design the 0047 * format. Since it is expected to reuse the ColumnVector instance while loading data, the storage 0048 * footprint is negligible. 0049 */ 0050 @Evolving 0051 public abstract class ColumnVector implements AutoCloseable { 0052 0053 /** 0054 * Returns the data type of this column vector. 0055 */ 0056 public final DataType dataType() { return type; } 0057 0058 /** 0059 * Cleans up memory for this column vector. The column vector is not usable after this. 0060 * 0061 * This overwrites `AutoCloseable.close` to remove the `throws` clause, as column vector is 0062 * in-memory and we don't expect any exception to happen during closing. 0063 */ 0064 @Override 0065 public abstract void close(); 0066 0067 /** 0068 * Returns true if this column vector contains any null values. 0069 */ 0070 public abstract boolean hasNull(); 0071 0072 /** 0073 * Returns the number of nulls in this column vector. 0074 */ 0075 public abstract int numNulls(); 0076 0077 /** 0078 * Returns whether the value at rowId is NULL. 0079 */ 0080 public abstract boolean isNullAt(int rowId); 0081 0082 /** 0083 * Returns the boolean type value for rowId. The return value is undefined and can be anything, 0084 * if the slot for rowId is null. 0085 */ 0086 public abstract boolean getBoolean(int rowId); 0087 0088 /** 0089 * Gets boolean type values from [rowId, rowId + count). The return values for the null slots 0090 * are undefined and can be anything. 
0091 */ 0092 public boolean[] getBooleans(int rowId, int count) { 0093 boolean[] res = new boolean[count]; 0094 for (int i = 0; i < count; i++) { 0095 res[i] = getBoolean(rowId + i); 0096 } 0097 return res; 0098 } 0099 0100 /** 0101 * Returns the byte type value for rowId. The return value is undefined and can be anything, 0102 * if the slot for rowId is null. 0103 */ 0104 public abstract byte getByte(int rowId); 0105 0106 /** 0107 * Gets byte type values from [rowId, rowId + count). The return values for the null slots 0108 * are undefined and can be anything. 0109 */ 0110 public byte[] getBytes(int rowId, int count) { 0111 byte[] res = new byte[count]; 0112 for (int i = 0; i < count; i++) { 0113 res[i] = getByte(rowId + i); 0114 } 0115 return res; 0116 } 0117 0118 /** 0119 * Returns the short type value for rowId. The return value is undefined and can be anything, 0120 * if the slot for rowId is null. 0121 */ 0122 public abstract short getShort(int rowId); 0123 0124 /** 0125 * Gets short type values from [rowId, rowId + count). The return values for the null slots 0126 * are undefined and can be anything. 0127 */ 0128 public short[] getShorts(int rowId, int count) { 0129 short[] res = new short[count]; 0130 for (int i = 0; i < count; i++) { 0131 res[i] = getShort(rowId + i); 0132 } 0133 return res; 0134 } 0135 0136 /** 0137 * Returns the int type value for rowId. The return value is undefined and can be anything, 0138 * if the slot for rowId is null. 0139 */ 0140 public abstract int getInt(int rowId); 0141 0142 /** 0143 * Gets int type values from [rowId, rowId + count). The return values for the null slots 0144 * are undefined and can be anything. 0145 */ 0146 public int[] getInts(int rowId, int count) { 0147 int[] res = new int[count]; 0148 for (int i = 0; i < count; i++) { 0149 res[i] = getInt(rowId + i); 0150 } 0151 return res; 0152 } 0153 0154 /** 0155 * Returns the long type value for rowId. 
The return value is undefined and can be anything, 0156 * if the slot for rowId is null. 0157 */ 0158 public abstract long getLong(int rowId); 0159 0160 /** 0161 * Gets long type values from [rowId, rowId + count). The return values for the null slots 0162 * are undefined and can be anything. 0163 */ 0164 public long[] getLongs(int rowId, int count) { 0165 long[] res = new long[count]; 0166 for (int i = 0; i < count; i++) { 0167 res[i] = getLong(rowId + i); 0168 } 0169 return res; 0170 } 0171 0172 /** 0173 * Returns the float type value for rowId. The return value is undefined and can be anything, 0174 * if the slot for rowId is null. 0175 */ 0176 public abstract float getFloat(int rowId); 0177 0178 /** 0179 * Gets float type values from [rowId, rowId + count). The return values for the null slots 0180 * are undefined and can be anything. 0181 */ 0182 public float[] getFloats(int rowId, int count) { 0183 float[] res = new float[count]; 0184 for (int i = 0; i < count; i++) { 0185 res[i] = getFloat(rowId + i); 0186 } 0187 return res; 0188 } 0189 0190 /** 0191 * Returns the double type value for rowId. The return value is undefined and can be anything, 0192 * if the slot for rowId is null. 0193 */ 0194 public abstract double getDouble(int rowId); 0195 0196 /** 0197 * Gets double type values from [rowId, rowId + count). The return values for the null slots 0198 * are undefined and can be anything. 0199 */ 0200 public double[] getDoubles(int rowId, int count) { 0201 double[] res = new double[count]; 0202 for (int i = 0; i < count; i++) { 0203 res[i] = getDouble(rowId + i); 0204 } 0205 return res; 0206 } 0207 0208 /** 0209 * Returns the struct type value for rowId. If the slot for rowId is null, it should return null. 0210 * 0211 * To support struct type, implementations must implement {@link #getChild(int)} and make this 0212 * vector a tree structure. 
The number of child vectors must be same as the number of fields of 0213 * the struct type, and each child vector is responsible to store the data for its corresponding 0214 * struct field. 0215 */ 0216 public final ColumnarRow getStruct(int rowId) { 0217 if (isNullAt(rowId)) return null; 0218 return new ColumnarRow(this, rowId); 0219 } 0220 0221 /** 0222 * Returns the array type value for rowId. If the slot for rowId is null, it should return null. 0223 * 0224 * To support array type, implementations must construct an {@link ColumnarArray} and return it in 0225 * this method. {@link ColumnarArray} requires a {@link ColumnVector} that stores the data of all 0226 * the elements of all the arrays in this vector, and an offset and length which points to a range 0227 * in that {@link ColumnVector}, and the range represents the array for rowId. Implementations 0228 * are free to decide where to put the data vector and offsets and lengths. For example, we can 0229 * use the first child vector as the data vector, and store offsets and lengths in 2 int arrays in 0230 * this vector. 0231 */ 0232 public abstract ColumnarArray getArray(int rowId); 0233 0234 /** 0235 * Returns the map type value for rowId. If the slot for rowId is null, it should return null. 0236 * 0237 * In Spark, map type value is basically a key data array and a value data array. A key from the 0238 * key array with a index and a value from the value array with the same index contribute to 0239 * an entry of this map type value. 0240 * 0241 * To support map type, implementations must construct a {@link ColumnarMap} and return it in 0242 * this method. 
{@link ColumnarMap} requires a {@link ColumnVector} that stores the data of all 0243 * the keys of all the maps in this vector, and another {@link ColumnVector} that stores the data 0244 * of all the values of all the maps in this vector, and a pair of offset and length which 0245 * specify the range of the key/value array that belongs to the map type value at rowId. 0246 */ 0247 public abstract ColumnarMap getMap(int ordinal); 0248 0249 /** 0250 * Returns the decimal type value for rowId. If the slot for rowId is null, it should return null. 0251 */ 0252 public abstract Decimal getDecimal(int rowId, int precision, int scale); 0253 0254 /** 0255 * Returns the string type value for rowId. If the slot for rowId is null, it should return null. 0256 * Note that the returned UTF8String may point to the data of this column vector, please copy it 0257 * if you want to keep it after this column vector is freed. 0258 */ 0259 public abstract UTF8String getUTF8String(int rowId); 0260 0261 /** 0262 * Returns the binary type value for rowId. If the slot for rowId is null, it should return null. 0263 */ 0264 public abstract byte[] getBinary(int rowId); 0265 0266 /** 0267 * Returns the calendar interval type value for rowId. If the slot for rowId is null, it should 0268 * return null. 0269 * 0270 * In Spark, calendar interval type value is basically two integer values representing the number 0271 * of months and days in this interval, and a long value representing the number of microseconds 0272 * in this interval. An interval type vector is the same as a struct type vector with 3 fields: 0273 * `months`, `days` and `microseconds`. 0274 * 0275 * To support interval type, implementations must implement {@link #getChild(int)} and define 3 0276 * child vectors: the first child vector is an int type vector, containing all the month values of 0277 * all the interval values in this vector. 
The second child vector is an int type vector, 0278 * containing all the day values of all the interval values in this vector. The third child vector 0279 * is a long type vector, containing all the microsecond values of all the interval values in this 0280 * vector. 0281 */ 0282 public final CalendarInterval getInterval(int rowId) { 0283 if (isNullAt(rowId)) return null; 0284 final int months = getChild(0).getInt(rowId); 0285 final int days = getChild(1).getInt(rowId); 0286 final long microseconds = getChild(2).getLong(rowId); 0287 return new CalendarInterval(months, days, microseconds); 0288 } 0289 0290 /** 0291 * @return child [[ColumnVector]] at the given ordinal. 0292 */ 0293 public abstract ColumnVector getChild(int ordinal); 0294 0295 /** 0296 * Data type for this column. 0297 */ 0298 protected DataType type; 0299 0300 /** 0301 * Sets up the data type of this column vector. 0302 */ 0303 protected ColumnVector(DataType type) { 0304 this.type = type; 0305 } 0306 }
// [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ General search ]
// This page was automatically generated by the 2.1.0 LXR engine. The LXR team