1. 程式人生 > Spark MLlib(五)用 Spark 實現 N 元模型(N-Gram)

Spark MLlib(五)用 Spark 實現 N 元模型(N-Gram)

在 NLP 中,人們通常基於一定的語料庫,利用 N-Gram 來預測或評估一個句子是否合理。此外,N-Gram 還可以用來評估兩個字串之間的差異程度。下面是 Spark 官網(http://spark.apache.org/docs/latest/ml-features.html#n-gram)給出的例子:

/**
 * Entry point for the N-gram demo.
 *
 * Builds a local SparkSession, creates a small DataFrame of pre-tokenized
 * sentences, runs the `NGram` transformer with n = 2 over the "words" column,
 * and prints the resulting "ngrams" column.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  val spark: SparkSession = SparkSession.builder
    .appName("My")
    .master("local[*]")
    .getOrCreate()

  try {
    // Each row is (id, tokens); the token array is what NGram consumes.
    val wordDataFrame = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("id", "words")

    // n = 2: read from "words", write the generated n-grams to "ngrams".
    val ngram = new NGram()
      .setN(2)
      .setInputCol("words")
      .setOutputCol("ngrams")

    val ngramDataFrame = ngram.transform(wordDataFrame)

    // Passing `false` disables truncation so the full n-gram arrays are shown.
    ngramDataFrame.select("ngrams").show(false)
  } finally {
    // Release the session even if the transformation or the show fails.
    spark.stop()
  }
}