/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connector.read.streaming;

import org.apache.spark.annotation.Evolving;
import org.apache.spark.sql.connector.read.InputPartition;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.apache.spark.sql.connector.read.PartitionReaderFactory;
import org.apache.spark.sql.connector.read.Scan;

/**
 * A {@link SparkDataStream} for streaming queries that run in micro-batch mode.
 *
 * @since 3.0.0
 */
@Evolving
public interface MicroBatchStream extends SparkDataStream {

  /**
   * Returns the most recent offset available.
   */
  Offset latestOffset();

  /**
   * Returns an array of {@link InputPartition input partitions} given the start and end offsets.
   * Each {@link InputPartition} represents a data split that can be processed by one Spark task.
   * The number of input partitions returned here is the same as the number of RDD partitions
   * this scan outputs.
   * <p>
   * If the {@link Scan} supports filter pushdown, this stream is likely configured with a filter
   * and is responsible for creating splits for that filter rather than for a full scan.
   * </p>
   * <p>
   * This method is called once per micro-batch in this data stream, launching one Spark job for
   * each call.
   * </p>
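   * <p>
   * A minimal sketch, assuming a hypothetical {@code LongOffset} wrapper around a long value and
   * an equally hypothetical {@code RangePartition} split type:
   * </p>
   * <pre>{@code
   * public InputPartition[] planInputPartitions(Offset start, Offset end) {
   *   long lo = ((LongOffset) start).value();
   *   long hi = ((LongOffset) end).value();
   *   int numSplits = 4;  // illustrative; a real source sizes splits from its data layout
   *   InputPartition[] parts = new InputPartition[numSplits];
   *   for (int i = 0; i < numSplits; i++) {
   *     long from = lo + (hi - lo) * i / numSplits;
   *     long to = lo + (hi - lo) * (i + 1) / numSplits;
   *     parts[i] = new RangePartition(from, to);
   *   }
   *   return parts;
   * }
   * }</pre>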
   */
  InputPartition[] planInputPartitions(Offset start, Offset end);

  /**
   * Returns a factory to create a {@link PartitionReader} for each {@link InputPartition}.
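   * <p>
   * The returned factory is serialized and sent to executors, so it must not capture
   * driver-only state. A minimal sketch that replays the offsets of the hypothetical
   * {@code RangePartition} above as single-column rows ({@code GenericInternalRow} is used
   * here as the concrete {@code InternalRow}):
   * </p>
   * <pre>{@code
   * class RangeReaderFactory implements PartitionReaderFactory {
   *   public PartitionReader<InternalRow> createReader(InputPartition partition) {
   *     RangePartition p = (RangePartition) partition;
   *     return new PartitionReader<InternalRow>() {
   *       private long current = p.from() - 1;
   *       public boolean next() { return ++current < p.to(); }
   *       public InternalRow get() { return new GenericInternalRow(new Object[]{current}); }
   *       public void close() { }
   *     };
   *   }
   * }
   * }</pre>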
   */
  PartitionReaderFactory createReaderFactory();
}