1. 程式人生 > Spark MLlib(四)用 Spark 計算 TF-IDF 值

Spark MLlib(四)用 Spark 計算 TF-IDF 值

TF-IDF 演算法是用統計的手法衡量一個元素在一個集合中的重要程度。在自然語言處理中,該演算法可以衡量一個詞在語料中的重要程度。其基本思想很簡單:字詞的重要性隨著它在檔案中出現的次數成正比增加,但同時會隨著它在語料庫中出現的頻率成反比下降。下面是 Spark 官網(http://spark.apache.org/docs/latest/ml-features.html#tf-idf)給出的例子:

package alg
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.SparkSession

/**
 * TF-IDF demo: tokenizes three labelled sentences, turns the tokens into
 * hashed term-frequency vectors, rescales them with a fitted IDF model and
 * prints the resulting rows.
 */
object tfidf {

  /**
   * Builds a local Spark session, runs the Tokenizer -> HashingTF -> IDF
   * pipeline on an in-memory data set, prints every output row on its own
   * line, and stops the session when done (even if a stage fails).
   *
   * @param args command-line arguments; not read by this program
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder
      .appName("My")
      .master("local[*]")
      .getOrCreate()

    // try/finally so the session is released even when a stage throws.
    try {
      val sentenceData = spark.createDataFrame(Seq(
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat")
      )).toDF("label", "sentence")

      // Step 1: "sentence" column -> "words" column.
      val tokenizer = new Tokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
      val wordsData = tokenizer.transform(sentenceData)

      // Step 2: "words" -> "rawFeatures" term-frequency vectors (numFeatures = 20).
      // alternatively, CountVectorizer can also be used to get term frequency vectors
      val hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(20)
      val featurizedData = hashingTF.transform(wordsData)

      // Step 3: fit IDF on the term frequencies, then rescale them into "features".
      val idf = new IDF()
        .setInputCol("rawFeatures")
        .setOutputCol("features")
      val idfModel = idf.fit(featurizedData)
      val rescaledData = idfModel.transform(featurizedData)

      // println (not print) so each row lands on its own line.
      rescaledData.collect().foreach(row => println(row))
      // rescaledData.select("label", "features").show()
    } finally {
      spark.stop()
    }
  }
}