/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connector.read;

import org.apache.spark.annotation.Evolving;
import org.apache.spark.sql.connector.catalog.Table;
import org.apache.spark.sql.connector.catalog.TableCapability;
import org.apache.spark.sql.connector.read.streaming.ContinuousStream;
import org.apache.spark.sql.connector.read.streaming.MicroBatchStream;
import org.apache.spark.sql.types.StructType;

/**
 * A logical representation of a data source scan. This interface is used to provide logical
 * information, such as what the actual read schema is.
 * <p>
 * This logical representation is shared between batch scans, micro-batch streaming scans, and
 * continuous streaming scans. Data sources must implement the corresponding methods in this
 * interface to match what the table promises to support. For example, {@link #toBatch()} must
 * be implemented if the {@link Table} that creates this {@link Scan} reports
 * {@link TableCapability#BATCH_READ} support in its {@link Table#capabilities()}.
 * </p>
 *
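 * <p>
 * A minimal sketch of a batch-only implementation (the names {@code MyScan} and
 * {@code MyBatch} are illustrative, not part of this API):
 * </p>
 * <pre>{@code
 * class MyScan implements Scan {
 *   public StructType readSchema() {
 *     return new StructType().add("id", DataTypes.LongType);
 *   }
 *
 *   public Batch toBatch() {
 *     return new MyBatch(); // plans InputPartitions and a PartitionReaderFactory
 *   }
 * }
 * }</pre>
 *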
 * @since 3.0.0
 */
@Evolving
public interface Scan {

  /**
   * Returns the actual schema of this data source scan, which may differ from the physical
   * schema of the underlying storage, as column pruning or other optimizations may happen.
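   * <p>
   * For example, if the underlying storage has columns {@code (id, name, data)} but the query
   * only needs {@code id}, a column-pruning implementation could report (illustrative sketch):
   * </p>
   * <pre>{@code
   * public StructType readSchema() {
   *   // pruned from the physical schema (id, name, data)
   *   return new StructType().add("id", DataTypes.LongType);
   * }
   * }</pre>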
   */
  StructType readSchema();

  /**
   * A description string of this scan, which may include information such as what filters are
   * configured for this scan and the values of important options such as the path. The
   * description doesn't need to include {@link #readSchema()}, as Spark already knows it.
   * <p>
   * By default this returns the class name of the implementation. Please override it to provide a
   * meaningful description.
   * </p>
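   * <p>
   * A hypothetical override ({@code path} and {@code pushedFilters} are illustrative fields,
   * not part of this API):
   * </p>
   * <pre>{@code
   * public String description() {
   *   return "my-source scan, path=" + path + ", pushed filters=" + pushedFilters;
   * }
   * }</pre>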
   */
  default String description() {
    return this.getClass().toString();
  }

  /**
   * Returns the physical representation of this scan for a batch query. By default this method
   * throws an exception; data sources must override it to provide an implementation if the
   * {@link Table} that creates this scan reports {@link TableCapability#BATCH_READ} support in
   * its {@link Table#capabilities()}.
   *
   * @throws UnsupportedOperationException if batch scans are not supported
   */
  default Batch toBatch() {
    throw new UnsupportedOperationException(description() + ": Batch scans are not supported");
  }

  /**
   * Returns the physical representation of this scan for a streaming query with micro-batch
   * mode. By default this method throws an exception; data sources must override it to provide
   * an implementation if the {@link Table} that creates this scan reports
   * {@link TableCapability#MICRO_BATCH_READ} support in its {@link Table#capabilities()}.
   *
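   * <p>
   * A hypothetical sketch ({@code MyMicroBatchStream} is an illustrative class, not part of
   * this API):
   * </p>
   * <pre>{@code
   * public MicroBatchStream toMicroBatchStream(String checkpointLocation) {
   *   // offsets for failure recovery are persisted under checkpointLocation
   *   return new MyMicroBatchStream(readSchema(), checkpointLocation);
   * }
   * }</pre>
   *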
   * @param checkpointLocation a path to Hadoop FS scratch space that can be used for failure
   *                           recovery. Data streams for the same logical source in the same
   *                           query will be given the same checkpointLocation.
   *
   * @throws UnsupportedOperationException if micro-batch scans are not supported
   */
  default MicroBatchStream toMicroBatchStream(String checkpointLocation) {
    throw new UnsupportedOperationException(description() + ": Micro-batch scans are not supported");
  }

  /**
   * Returns the physical representation of this scan for a streaming query with continuous
   * mode. By default this method throws an exception; data sources must override it to provide
   * an implementation if the {@link Table} that creates this scan reports
   * {@link TableCapability#CONTINUOUS_READ} support in its {@link Table#capabilities()}.
   *
   * @param checkpointLocation a path to Hadoop FS scratch space that can be used for failure
   *                           recovery. Data streams for the same logical source in the same
   *                           query will be given the same checkpointLocation.
   *
   * @throws UnsupportedOperationException if continuous scans are not supported
   */
  default ContinuousStream toContinuousStream(String checkpointLocation) {
    throw new UnsupportedOperationException(description() + ": Continuous scans are not supported");
  }
}