
【Spark+NLP】Feature Extraction and Preprocessing

Commonly used Spark NLP methods: tokenization, stop-word removal, CountVectorizer, TF-IDF, Word2Vec, and n-grams.
package com.bbw5.ml.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.IDF

/**
 * feature extract for text
 */
object FeatureExtractandPreprocess {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("LinearRegression4Wine")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
  }

  def nlpPipeline(sc: SparkContext, sqlContext: SQLContext) {
    import sqlContext.implicits._
    val documents = sc.textFile("G:/temp/data/documents.txt")

    val df = documents.toDF("text")
    df.show()

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("tokens")
    val remover = new StopWordsRemover().setInputCol("tokens").setOutputCol("words")
    //val countVector = new CountVectorizer().setInputCol("words").setOutputCol("count_words").setVocabSize(20).setMinDF(1)
    val hashingTF = new HashingTF().setNumFeatures(50).setInputCol("words").setOutputCol("rawFeatures")
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, hashingTF, idf))

    val model = pipeline.fit(df)
    val tf = model.transform(df)
    tf.show()
    tf.select("words", "count_words").show(2, false)
    tf.select("features").show(2, false)
  }

  /**
   * fruits.txt:
   * 蘋果, 香蕉, 梨子
   * 蘋果, 草莓, 芒果, 西瓜
   * 草莓, 葡萄, 香瓜
   * 榴蓮, 橘子, 橙子
   */
  def countVectorizer(sc: SparkContext, sqlContext: SQLContext) {
    import sqlContext.implicits._

    val fruits = sc.textFile("G:/temp/data/fruits.txt")
    val df = fruits.map { x => (0, x.split(",").map { x => x.trim().toLowerCase() }) }.toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(7)
      .setMinDF(1)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c")).setInputCol("words").setOutputCol("features")

    println(cvModel.vocabulary.toList)
    cvModel.transform(df).show()

    cvModel.transform(df).select("features").foreach { r => println(r.get(0).getClass()) }
  }

  def tokenizer(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }

    val sentenceDataFrame = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat"))).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("words", "label").take(3).foreach(println)
    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("words", "label").take(3).foreach(println)
  }

  /**
   * english stop word:
   * http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
   */
  def stopWords(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.StopWordsRemover

    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = sqlContext.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "baloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb")))).toDF("id", "raw")

    remover.transform(dataSet).show()
  }

  def stem(sqlContext: SQLContext) {
    // placeholder: Spark ML ships no built-in stemmer; see the sketch below
  }

  def lemmatization(sqlContext: SQLContext) {
    // placeholder: lemmatization likewise requires an external NLP library (e.g. Stanford CoreNLP)
  }
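
  /**
   * Sketch (not in the original post): a minimal, hand-rolled stemming pass implemented
   * as a DataFrame UDF. It only strips a few common English suffixes and is meant purely
   * to illustrate where a real stemmer (e.g. a Snowball implementation) would be plugged in.
   */
  def stemSketch(sqlContext: SQLContext) {
    import org.apache.spark.sql.functions.udf

    val df = sqlContext.createDataFrame(Seq(
      (0, Seq("cats", "running", "jumped", "classes")))).toDF("id", "words")

    // toy rule: drop a trailing "ing", "ed" or "s"; a real stemmer handles far more cases
    val stemUDF = udf { (words: Seq[String]) => words.map(_.replaceAll("(ing|ed|s)$", "")) }

    df.withColumn("stemmed", stemUDF(df("words"))).show(false)
  }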

  def tfidf(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.{ HashingTF, IDF, Tokenizer }

    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat"))).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)
    wordsData.select("words").show(2, false)

    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(10)
    val featurizedData = hashingTF.transform(wordsData)
    featurizedData.select("rawFeatures").show(2, false)

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("features", "label").take(3).foreach(println)
  }

  def word2vec(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.Word2Vec

    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)
  }

  def n_gram(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.NGram

    val wordDataFrame = sqlContext.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat")))).toDF("label", "words")

    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
    val ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
  }

}
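
To try the demos, package the object into a jar and submit it with spark-submit. The jar name and master below are placeholders, and the input paths (G:/temp/data/...) must exist on the machine running the driver:

spark-submit --class com.bbw5.ml.spark.FeatureExtractandPreprocess --master local[2] feature-extract-demo.jar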
