
【Spark+NLP】Feature Extraction and Preprocessing

Commonly used Spark NLP methods: tokenization, stop-word removal, CountVectorizer, TF-IDF, Word2Vec, and n-grams.
package com.bbw5.ml.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.IDF

/**
 * feature extract for text
 */
object FeatureExtractandPreprocess {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("LinearRegression4Wine")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
  }

  def nlpPipeline(sc: SparkContext, sqlContext: SQLContext) {
    import sqlContext.implicits._
    val documents = sc.textFile("G:/temp/data/documents.txt")

    val df = documents.toDF("text")
    df.show()

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("tokens")
    val remover = new StopWordsRemover().setInputCol("tokens").setOutputCol("words")
    //val countVector = new CountVectorizer().setInputCol("words").setOutputCol("count_words").setVocabSize(20).setMinDF(1)
    val hashingTF = new HashingTF().setNumFeatures(50).setInputCol("words").setOutputCol("rawFeatures")
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, hashingTF, idf))

    val model = pipeline.fit(df)
    val tf = model.transform(df)
    tf.show()
    tf.select("words", "count_words").show(2, false)
    tf.select("features").show(2, false)
  }

  /**
   * fruits.txt:
   * 蘋果, 香蕉, 梨子
   * 蘋果, 草莓, 芒果, 西瓜
   * 草莓, 葡萄, 香瓜
   * 榴蓮, 橘子, 橙子
   */
  def countVectorizer(sc: SparkContext, sqlContext: SQLContext) {
    import sqlContext.implicits._

    val fruits = sc.textFile("G:/temp/data/fruits.txt")
    val df = fruits.map { x => (0, x.split(",").map { x => x.trim().toLowerCase() }) }.toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(7)
      .setMinDF(1)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c")).setInputCol("words").setOutputCol("features")

    println(cvModel.vocabulary.toList)
    cvModel.transform(df).show()

    cvModel.transform(df).select("features").foreach { r => println(r.get(0).getClass()) }
  }

  def tokenizer(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }

    val sentenceDataFrame = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat"))).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("words", "label").take(3).foreach(println)
    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("words", "label").take(3).foreach(println)
  }

  /**
   * english stop word:
   * http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
   */
  def stopWords(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.StopWordsRemover

    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = sqlContext.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "baloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb")))).toDF("id", "raw")

    remover.transform(dataSet).show()
  }

  def stem(sqlContext: SQLContext) {
    // placeholder: Spark ML ships no built-in stemmer; see the sketch below
  }

  def lemmatization(sqlContext: SQLContext) {
    // placeholder: lemmatization likewise requires an external NLP library (e.g. Stanford CoreNLP)
  }
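
  /**
   * Sketch (not in the original post): a minimal, hand-rolled stemming pass implemented
   * as a DataFrame UDF. It only strips a few common English suffixes and is meant purely
   * to illustrate where a real stemmer (e.g. a Snowball implementation) would be plugged in.
   */
  def stemSketch(sqlContext: SQLContext) {
    import org.apache.spark.sql.functions.udf

    val df = sqlContext.createDataFrame(Seq(
      (0, Seq("cats", "running", "jumped", "classes")))).toDF("id", "words")

    // toy rule: drop a trailing "ing", "ed" or "s"; a real stemmer handles far more cases
    val stemUDF = udf { (words: Seq[String]) => words.map(_.replaceAll("(ing|ed|s)$", "")) }

    df.withColumn("stemmed", stemUDF(df("words"))).show(false)
  }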

  def tfidf(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.{ HashingTF, IDF, Tokenizer }

    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat"))).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)
    wordsData.select("words").show(2, false)

    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(10)
    val featurizedData = hashingTF.transform(wordsData)
    featurizedData.select("rawFeatures").show(2, false)

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("features", "label").take(3).foreach(println)
  }

  def word2vec(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.Word2Vec

    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)
  }

  def n_gram(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.NGram

    val wordDataFrame = sqlContext.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat")))).toDF("label", "words")

    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
    val ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
  }

}
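
To try the demos, package the object into a jar and submit it with spark-submit. The jar name and master below are placeholders, and the input paths (G:/temp/data/...) must exist on the machine running the driver:

spark-submit --class com.bbw5.ml.spark.FeatureExtractandPreprocess --master local[2] feature-extract-demo.jar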
