0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 An example demonstrating MinHashLSH.
0020 Run with:
0021 bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py
0022 """
0023 from __future__ import print_function
0024
0025
0026 from pyspark.ml.feature import MinHashLSH
0027 from pyspark.ml.linalg import Vectors
0028 from pyspark.sql.functions import col
0029
0030 from pyspark.sql import SparkSession
0031
if __name__ == "__main__":
    # Script entry point: build a tiny MinHashLSH model and exercise its
    # transform, approximate similarity join and approximate nearest-neighbor
    # search, printing each result to stdout.
    spark = SparkSession \
        .builder \
        .appName("MinHashLSHExample") \
        .getOrCreate()

    # Everything after session creation runs under try/finally so the session
    # is stopped even when one of the Spark calls below raises.
    try:
        # Two small datasets of (id, features) rows. Each feature vector is a
        # sparse vector of size 6 with three entries set to 1.0.
        dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0])),
                 (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0])),
                 (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]))]
        dfA = spark.createDataFrame(dataA, ["id", "features"])

        dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0])),
                 (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0])),
                 (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]))]
        dfB = spark.createDataFrame(dataB, ["id", "features"])

        # Query vector used for the nearest-neighbor search below.
        key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

        # Fit on dfA only; the same fitted model is then applied to both dfA
        # and dfB in the join.
        mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
        model = mh.fit(dfA)

        # Feature transformation: adds the 'hashes' output column to dfA.
        print("The hashed dataset where hashed values are stored in the column 'hashes':")
        model.transform(dfA).show()

        # Approximate similarity join with threshold 0.6. The distance is
        # written to the 'JaccardDistance' column, and the id of each side is
        # exposed as idA / idB.
        print("Approximately joining dfA and dfB on distance smaller than 0.6:")
        model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance")\
            .select(col("datasetA.id").alias("idA"),
                    col("datasetB.id").alias("idB"),
                    col("JaccardDistance")).show()

        # Approximate nearest-neighbor search: request 2 neighbors of `key`
        # among the rows of dfA.
        # NOTE(review): being approximate, this presumably can return fewer
        # than 2 rows -- confirm against the MinHashLSHModel documentation.
        print("Approximately searching dfA for 2 nearest neighbors of the key:")
        model.approxNearestNeighbors(dfA, key, 2).show()
    finally:
        spark.stop()