0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 from __future__ import print_function
0019
0020
0021 from pyspark.ml.feature import IndexToString, StringIndexer
0022
0023 from pyspark.sql import SparkSession
0024
if __name__ == "__main__":
    # Example script: encode a string column as numeric indices with
    # StringIndexer, then map those indices back to strings with
    # IndexToString.
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # Run the example inside try/finally so the session is always stopped,
    # including when one of the steps below raises.
    try:
        # Small in-memory dataset: an integer id and a string category.
        df = spark.createDataFrame(
            [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
            ["id", "category"])

        # Fit the indexer on the data, then add the "categoryIndex" column.
        indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
        model = indexer.fit(df)
        indexed = model.transform(df)

        print("Transformed string column '%s' to indexed column '%s'"
              % (indexer.getInputCol(), indexer.getOutputCol()))
        indexed.show()

        print("StringIndexer will store labels in output column metadata\n")

        # No labels are passed to IndexToString here; per the messages printed
        # by this script, it relies on the labels kept in the column metadata.
        converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
        converted = converter.transform(indexed)

        print("Transformed indexed column '%s' back to original string column '%s' using "
              "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
        converted.select("id", "categoryIndex", "originalCategory").show()
    finally:
        spark.stop()