Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Licensed to the Apache Software Foundation (ASF) under one or more
0003  * contributor license agreements.  See the NOTICE file distributed with
0004  * this work for additional information regarding copyright ownership.
0005  * The ASF licenses this file to You under the Apache License, Version 2.0
0006  * (the "License"); you may not use this file except in compliance with
0007  * the License.  You may obtain a copy of the License at
0008  *
0009  *    http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  */
0017 package org.apache.spark.sql.vectorized;
0018 
0019 import org.apache.spark.annotation.Evolving;
0020 import org.apache.spark.sql.types.DataType;
0021 import org.apache.spark.sql.types.Decimal;
0022 import org.apache.spark.unsafe.types.CalendarInterval;
0023 import org.apache.spark.unsafe.types.UTF8String;
0024 
0025 /**
0026  * An interface representing in-memory columnar data in Spark. This interface defines the main APIs
0027  * to access the data, as well as their batched versions. The batched versions are considered to be
0028  * faster and preferable whenever possible.
0029  *
0030  * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values
0031  * in this ColumnVector.
0032  *
0033  * Spark only calls specific `get` method according to the data type of this {@link ColumnVector},
0034  * e.g. if it's int type, Spark is guaranteed to only call {@link #getInt(int)} or
0035  * {@link #getInts(int, int)}.
0036  *
0037  * ColumnVector supports all the data types including nested types. To handle nested types,
0038  * ColumnVector can have children and is a tree structure. Please refer to {@link #getStruct(int)},
0039  * {@link #getArray(int)} and {@link #getMap(int)} for the details about how to implement nested
0040  * types.
0041  *
0042  * ColumnVector is expected to be reused during the entire data loading process, to avoid allocating
0043  * memory again and again.
0044  *
0045  * ColumnVector is meant to maximize CPU efficiency but not to minimize storage footprint.
0046  * Implementations should prefer computing efficiency over storage efficiency when design the
0047  * format. Since it is expected to reuse the ColumnVector instance while loading data, the storage
0048  * footprint is negligible.
0049  */
0050 @Evolving
0051 public abstract class ColumnVector implements AutoCloseable {
0052 
0053   /**
0054    * Returns the data type of this column vector.
0055    */
0056   public final DataType dataType() { return type; }
0057 
0058   /**
0059    * Cleans up memory for this column vector. The column vector is not usable after this.
0060    *
0061    * This overwrites `AutoCloseable.close` to remove the `throws` clause, as column vector is
0062    * in-memory and we don't expect any exception to happen during closing.
0063    */
0064   @Override
0065   public abstract void close();
0066 
0067   /**
0068    * Returns true if this column vector contains any null values.
0069    */
0070   public abstract boolean hasNull();
0071 
0072   /**
0073    * Returns the number of nulls in this column vector.
0074    */
0075   public abstract int numNulls();
0076 
0077   /**
0078    * Returns whether the value at rowId is NULL.
0079    */
0080   public abstract boolean isNullAt(int rowId);
0081 
0082   /**
0083    * Returns the boolean type value for rowId. The return value is undefined and can be anything,
0084    * if the slot for rowId is null.
0085    */
0086   public abstract boolean getBoolean(int rowId);
0087 
0088   /**
0089    * Gets boolean type values from [rowId, rowId + count). The return values for the null slots
0090    * are undefined and can be anything.
0091    */
0092   public boolean[] getBooleans(int rowId, int count) {
0093     boolean[] res = new boolean[count];
0094     for (int i = 0; i < count; i++) {
0095       res[i] = getBoolean(rowId + i);
0096     }
0097     return res;
0098   }
0099 
0100   /**
0101    * Returns the byte type value for rowId. The return value is undefined and can be anything,
0102    * if the slot for rowId is null.
0103    */
0104   public abstract byte getByte(int rowId);
0105 
0106   /**
0107    * Gets byte type values from [rowId, rowId + count). The return values for the null slots
0108    * are undefined and can be anything.
0109    */
0110   public byte[] getBytes(int rowId, int count) {
0111     byte[] res = new byte[count];
0112     for (int i = 0; i < count; i++) {
0113       res[i] = getByte(rowId + i);
0114     }
0115     return res;
0116   }
0117 
0118   /**
0119    * Returns the short type value for rowId. The return value is undefined and can be anything,
0120    * if the slot for rowId is null.
0121    */
0122   public abstract short getShort(int rowId);
0123 
0124   /**
0125    * Gets short type values from [rowId, rowId + count). The return values for the null slots
0126    * are undefined and can be anything.
0127    */
0128   public short[] getShorts(int rowId, int count) {
0129     short[] res = new short[count];
0130     for (int i = 0; i < count; i++) {
0131       res[i] = getShort(rowId + i);
0132     }
0133     return res;
0134   }
0135 
0136   /**
0137    * Returns the int type value for rowId. The return value is undefined and can be anything,
0138    * if the slot for rowId is null.
0139    */
0140   public abstract int getInt(int rowId);
0141 
0142   /**
0143    * Gets int type values from [rowId, rowId + count). The return values for the null slots
0144    * are undefined and can be anything.
0145    */
0146   public int[] getInts(int rowId, int count) {
0147     int[] res = new int[count];
0148     for (int i = 0; i < count; i++) {
0149       res[i] = getInt(rowId + i);
0150     }
0151     return res;
0152   }
0153 
0154   /**
0155    * Returns the long type value for rowId. The return value is undefined and can be anything,
0156    * if the slot for rowId is null.
0157    */
0158   public abstract long getLong(int rowId);
0159 
0160   /**
0161    * Gets long type values from [rowId, rowId + count). The return values for the null slots
0162    * are undefined and can be anything.
0163    */
0164   public long[] getLongs(int rowId, int count) {
0165     long[] res = new long[count];
0166     for (int i = 0; i < count; i++) {
0167       res[i] = getLong(rowId + i);
0168     }
0169     return res;
0170   }
0171 
0172   /**
0173    * Returns the float type value for rowId. The return value is undefined and can be anything,
0174    * if the slot for rowId is null.
0175    */
0176   public abstract float getFloat(int rowId);
0177 
0178   /**
0179    * Gets float type values from [rowId, rowId + count). The return values for the null slots
0180    * are undefined and can be anything.
0181    */
0182   public float[] getFloats(int rowId, int count) {
0183     float[] res = new float[count];
0184     for (int i = 0; i < count; i++) {
0185       res[i] = getFloat(rowId + i);
0186     }
0187     return res;
0188   }
0189 
0190   /**
0191    * Returns the double type value for rowId. The return value is undefined and can be anything,
0192    * if the slot for rowId is null.
0193    */
0194   public abstract double getDouble(int rowId);
0195 
0196   /**
0197    * Gets double type values from [rowId, rowId + count). The return values for the null slots
0198    * are undefined and can be anything.
0199    */
0200   public double[] getDoubles(int rowId, int count) {
0201     double[] res = new double[count];
0202     for (int i = 0; i < count; i++) {
0203       res[i] = getDouble(rowId + i);
0204     }
0205     return res;
0206   }
0207 
0208   /**
0209    * Returns the struct type value for rowId. If the slot for rowId is null, it should return null.
0210    *
0211    * To support struct type, implementations must implement {@link #getChild(int)} and make this
0212    * vector a tree structure. The number of child vectors must be same as the number of fields of
0213    * the struct type, and each child vector is responsible to store the data for its corresponding
0214    * struct field.
0215    */
0216   public final ColumnarRow getStruct(int rowId) {
0217     if (isNullAt(rowId)) return null;
0218     return new ColumnarRow(this, rowId);
0219   }
0220 
0221   /**
0222    * Returns the array type value for rowId. If the slot for rowId is null, it should return null.
0223    *
0224    * To support array type, implementations must construct an {@link ColumnarArray} and return it in
0225    * this method. {@link ColumnarArray} requires a {@link ColumnVector} that stores the data of all
0226    * the elements of all the arrays in this vector, and an offset and length which points to a range
0227    * in that {@link ColumnVector}, and the range represents the array for rowId. Implementations
0228    * are free to decide where to put the data vector and offsets and lengths. For example, we can
0229    * use the first child vector as the data vector, and store offsets and lengths in 2 int arrays in
0230    * this vector.
0231    */
0232   public abstract ColumnarArray getArray(int rowId);
0233 
0234   /**
0235    * Returns the map type value for rowId. If the slot for rowId is null, it should return null.
0236    *
0237    * In Spark, map type value is basically a key data array and a value data array. A key from the
0238    * key array with a index and a value from the value array with the same index contribute to
0239    * an entry of this map type value.
0240    *
0241    * To support map type, implementations must construct a {@link ColumnarMap} and return it in
0242    * this method. {@link ColumnarMap} requires a {@link ColumnVector} that stores the data of all
0243    * the keys of all the maps in this vector, and another {@link ColumnVector} that stores the data
0244    * of all the values of all the maps in this vector, and a pair of offset and length which
0245    * specify the range of the key/value array that belongs to the map type value at rowId.
0246    */
0247   public abstract ColumnarMap getMap(int ordinal);
0248 
0249   /**
0250    * Returns the decimal type value for rowId. If the slot for rowId is null, it should return null.
0251    */
0252   public abstract Decimal getDecimal(int rowId, int precision, int scale);
0253 
0254   /**
0255    * Returns the string type value for rowId. If the slot for rowId is null, it should return null.
0256    * Note that the returned UTF8String may point to the data of this column vector, please copy it
0257    * if you want to keep it after this column vector is freed.
0258    */
0259   public abstract UTF8String getUTF8String(int rowId);
0260 
0261   /**
0262    * Returns the binary type value for rowId. If the slot for rowId is null, it should return null.
0263    */
0264   public abstract byte[] getBinary(int rowId);
0265 
0266   /**
0267    * Returns the calendar interval type value for rowId. If the slot for rowId is null, it should
0268    * return null.
0269    *
0270    * In Spark, calendar interval type value is basically two integer values representing the number
0271    * of months and days in this interval, and a long value representing the number of microseconds
0272    * in this interval. An interval type vector is the same as a struct type vector with 3 fields:
0273    * `months`, `days` and `microseconds`.
0274    *
0275    * To support interval type, implementations must implement {@link #getChild(int)} and define 3
0276    * child vectors: the first child vector is an int type vector, containing all the month values of
0277    * all the interval values in this vector. The second child vector is an int type vector,
0278    * containing all the day values of all the interval values in this vector. The third child vector
0279    * is a long type vector, containing all the microsecond values of all the interval values in this
0280    * vector.
0281    */
0282   public final CalendarInterval getInterval(int rowId) {
0283     if (isNullAt(rowId)) return null;
0284     final int months = getChild(0).getInt(rowId);
0285     final int days = getChild(1).getInt(rowId);
0286     final long microseconds = getChild(2).getLong(rowId);
0287     return new CalendarInterval(months, days, microseconds);
0288   }
0289 
0290   /**
0291    * @return child [[ColumnVector]] at the given ordinal.
0292    */
0293   public abstract ColumnVector getChild(int ordinal);
0294 
0295   /**
0296    * Data type for this column.
0297    */
0298   protected DataType type;
0299 
0300   /**
0301    * Sets up the data type of this column vector.
0302    */
0303   protected ColumnVector(DataType type) {
0304     this.type = type;
0305   }
0306 }