1. 程式人生 > 特徵變換--->標籤和索引的轉換(StringIndexer)

特徵變換--->標籤和索引的轉換(StringIndexer)

def input data rgs del exe pack .get apache

package Spark_MLlib

import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.SparkSession


/**
 * Example of converting string labels to numeric indices with Spark ML's
 * `StringIndexer`.
 *
 * Builds a small DataFrame with an `id` column and a string `type` column,
 * fits a `StringIndexer` on `type`, prints the learned labels, and shows the
 * DataFrame with the added `type_index` column.
 */
object 特征變換_StringIndexer {
  /** Local Spark session (two worker threads) used by the example. */
  val spark: SparkSession =
    SparkSession.builder().master("local[2]").appName("標簽和索引的轉換").getOrCreate()
  import spark.implicits._

  /**
   * Runs the example end to end and stops the Spark session afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      val df = spark.createDataFrame(Seq(
        (0, "log"),
        (1, "text"),
        (2, "text"),
        (3, "soyo"),
        (4, "text"),
        (5, "log"),
        (6, "log"),
        (7, "log")
      )).toDF("id", "type")

      val indexer = new StringIndexer()
        .setInputCol("type")
        .setOutputCol("type_index")

      val model = indexer.fit(df)

      // Labels are listed in frequency order, most frequent first.
      model.labels.foreach(println)

      // The most frequent label gets the lowest index, so "log" maps to 0.0.
      val index = model.transform(df)
      index.show(false)
    } finally {
      // Release the local Spark resources even if the example fails part-way.
      spark.stop()
    }
  }
}

結果:

log
text
soyo
+---+----+----------+
|id |type|type_index|
+---+----+----------+
|0  |log |0.0       |
|1  |text|1.0       |
|2  |text|1.0       |
|3  |soyo|2.0       |
|4  |text|1.0       |
|5  |log |0.0       |
|6  |log |0.0       |
|7  |log |0.0       |
+---+----+----------+

特徵變換--->標籤和索引的轉換(StringIndexer)