0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 A simple example demonstrating Spark SQL data sources.
0020 Run with:
0021 ./bin/spark-submit examples/src/main/python/sql/datasource.py
0022 """
0023 from __future__ import print_function
0024
0025 from pyspark.sql import SparkSession
0026
0027 from pyspark.sql import Row
0028
0029
0030
def generic_file_source_options_example(spark):
    """Show generic file data source options.

    Demonstrates options that apply to all file-based sources:
    ``ignoreCorruptFiles``, ``recursiveFileLookup`` and ``pathGlobFilter``.
    """
    # Skip corrupt files instead of failing the whole read.
    spark.sql("set spark.sql.files.ignoreCorruptFiles=true")

    # With ignoreCorruptFiles enabled, unreadable files in these paths are
    # silently dropped from the result.
    corrupt_tolerant_df = spark.read.parquet(
        "examples/src/main/resources/dir1/",
        "examples/src/main/resources/dir1/dir2/")
    corrupt_tolerant_df.show()

    # recursiveFileLookup picks up files in nested directories.
    nested_df = (spark.read.format("parquet")
                 .option("recursiveFileLookup", "true")
                 .load("examples/src/main/resources/dir1"))
    nested_df.show()

    # Restore the default strict behavior for subsequent reads.
    spark.sql("set spark.sql.files.ignoreCorruptFiles=false")

    # pathGlobFilter loads only files whose names match the glob pattern.
    globbed_df = (spark.read.format("parquet")
                  .option("pathGlobFilter", "*.parquet")
                  .load("examples/src/main/resources/dir1"))
    globbed_df.show()
0072
def basic_datasource_example(spark):
    """Walk through the generic load/save API for several file formats."""
    # Default format (no format given) load; save a projection back out.
    users_df = spark.read.load("examples/src/main/resources/users.parquet")
    users_df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")

    # Partition the output on disk by a column value.
    (users_df.write
     .partitionBy("favorite_color")
     .format("parquet")
     .save("namesPartByColor.parquet"))

    # Combine on-disk partitioning with bucketing into a persisted table.
    users_df = spark.read.parquet("examples/src/main/resources/users.parquet")
    (users_df.write
     .partitionBy("favorite_color")
     .bucketBy(42, "name")
     .saveAsTable("people_partitioned_bucketed"))

    # Explicit source format on both the read and the write.
    people_df = spark.read.load("examples/src/main/resources/people.json", format="json")
    people_df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")

    # CSV with a custom separator, a header row, and schema inference.
    people_df = spark.read.load("examples/src/main/resources/people.csv",
                                format="csv", sep=":", inferSchema="true", header="true")

    # ORC read, then write with ORC-specific writer options.
    orc_df = spark.read.orc("examples/src/main/resources/users.orc")
    (orc_df.write.format("orc")
     .option("orc.bloom.filter.columns", "favorite_color")
     .option("orc.dictionary.key.threshold", "1.0")
     .option("orc.column.encoding.direct", "name")
     .save("users_with_options.orc"))

    # Bucket and sort into a persisted table.
    orc_df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")

    # Query a file directly with SQL, without registering it first.
    sql_df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")

    # Clean up the tables created above.
    spark.sql("DROP TABLE IF EXISTS people_bucketed")
    spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed")
0122
def parquet_example(spark):
    """Round-trip a DataFrame through Parquet and query it with SQL."""
    people_df = spark.read.json("examples/src/main/resources/people.json")

    # DataFrames can be saved as Parquet files, preserving schema information.
    people_df.write.parquet("people.parquet")

    # Parquet is self-describing, so the schema travels with the data.
    parquet_df = spark.read.parquet("people.parquet")

    # Parquet files can also back a temporary view for SQL queries.
    parquet_df.createOrReplaceTempView("parquetFile")
    teen_names = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
    teen_names.show()
0146
def parquet_schema_merging_example(spark):
    """Write two Parquet partitions with differing schemas, then merge them."""
    sc = spark.sparkContext

    # First partition: columns (single, double).
    squares_df = spark.createDataFrame(
        sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i ** 2)))
    squares_df.write.parquet("data/test_table/key=1")

    # Second partition: same "single" column, but "double" replaced by "triple".
    cubes_df = spark.createDataFrame(
        sc.parallelize(range(6, 11)).map(lambda i: Row(single=i, triple=i ** 3)))
    cubes_df.write.parquet("data/test_table/key=2")

    # mergeSchema reconciles the two partition schemas into one.
    merged_df = spark.read.option("mergeSchema", "true").parquet("data/test_table")
    merged_df.printSchema()
0176
def json_dataset_example(spark):
    """Load JSON from a file and from an in-memory RDD of JSON strings."""
    sc = spark.sparkContext

    # A JSON dataset is pointed to by path.
    source_path = "examples/src/main/resources/people.json"
    people_df = spark.read.json(source_path)

    # The schema is inferred from the JSON documents.
    people_df.printSchema()

    # Register the DataFrame as a view so it can be queried with SQL.
    people_df.createOrReplaceTempView("people")

    teen_names_df = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19")
    teen_names_df.show()

    # A DataFrame can also be created from an RDD of JSON strings, one
    # JSON object per string.
    json_strings = ['{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}']
    other_people_df = spark.read.json(sc.parallelize(json_strings))
    other_people_df.show()
0218
def jdbc_dataset_example(spark):
    """Read from and write to a JDBC source via both available APIs."""
    # Shared connection properties for the jdbc() shortcut calls.
    connection_properties = {"user": "username", "password": "password"}

    # Loading via the generic format/options API.
    jdbcDF = (spark.read.format("jdbc")
              .options(url="jdbc:postgresql:dbserver",
                       dbtable="schema.tablename",
                       user="username",
                       password="password")
              .load())

    # Loading via the jdbc() shortcut with a properties dict.
    jdbcDF2 = spark.read.jdbc(
        "jdbc:postgresql:dbserver", "schema.tablename",
        properties=connection_properties)

    # customSchema overrides the inferred types of the listed columns.
    jdbcDF3 = (spark.read.format("jdbc")
               .options(url="jdbc:postgresql:dbserver",
                        dbtable="schema.tablename",
                        user="username",
                        password="password",
                        customSchema="id DECIMAL(38, 0), name STRING")
               .load())

    # Saving via the generic format/options API.
    (jdbcDF.write.format("jdbc")
     .options(url="jdbc:postgresql:dbserver",
              dbtable="schema.tablename",
              user="username",
              password="password")
     .save())

    # Saving via the jdbc() shortcut.
    jdbcDF2.write.jdbc(
        "jdbc:postgresql:dbserver", "schema.tablename",
        properties=connection_properties)

    # createTableColumnTypes customizes column types when the table is created.
    (jdbcDF.write
     .option("createTableColumnTypes", "name CHAR(64), comments VARCHAR(1024)")
     .jdbc("jdbc:postgresql:dbserver", "schema.tablename",
           properties=connection_properties))
0264
0265
if __name__ == "__main__":
    # Entry point: build one session and run every example against it.
    spark = (SparkSession.builder
             .appName("Python Spark SQL data source example")
             .getOrCreate())

    basic_datasource_example(spark)
    generic_file_source_options_example(spark)
    parquet_example(spark)
    parquet_schema_merging_example(spark)
    json_dataset_example(spark)
    jdbc_dataset_example(spark)

    spark.stop()