程式人生 » 轉載:Spark 使用ansj進行中文分詞

轉載:Spark 使用ansj進行中文分詞

轉載:https://www.cnblogs.com/JustIsQiGe/p/8006734.html

在Spark中使用ansj分詞先要將ansj_seg-5.1.1.jar和nlp-lang-1.7.2.jar加入工程

ansj原始碼github:https://github.com/NLPchina/ansj_seg

ansj下載連結:https://oss.sonatype.org/content/repositories/releases/org/ansj/ansj_seg/

nlp-lang下載連結:https://oss.sonatype.org/content/repositories/releases/org/nlpcn/nlp-lang/

package com.spark.test

import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

import scala.io.Source
import org.ansj.splitWord.analysis.DicAnalysis
import org.ansj.library.DicLibrary
import org.ansj.recognition.impl.StopRecognition
import org.nlpcn.commons.lang.tire.library
import java.util.Arrays

object Participle {
  /** One row of the movie-review TSV file: 8 tab-separated fields per line. */
  case class Movies(productId: String, userId: String, profileName: String,
                    helpfulness: String, score: String, time: String, summary: String, text: String)

  def main(args: Array[String]): Unit = {
    // Silence noisy framework logging so the segmentation output stays readable.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Build the stop filter applied after ansj segmentation.
    val stop = new StopRecognition()
    stop.insertStopNatures("w")      // drop punctuation (nature "w")
    stop.insertStopNatures("m")      // drop numerals (nature "m")
    stop.insertStopNatures("null")   // drop tokens with no recognized nature
    stop.insertStopNatures("<br />") // drop HTML line breaks leaking from the raw text
    stop.insertStopNatures(":")
    stop.insertStopNatures("'")

    val spark = SparkSession.builder().master("local[4]").appName("prepare").getOrCreate()
    // NOTE(review): hard-coded local paths; consider reading input/output paths from args.
    val data = spark.sparkContext.textFile("/Users/yangyang/Desktop/b.txt")
    import spark.implicits._

    // Binarize the score field (index 4): score <= 3.0 -> "0" (negative), 5.0 -> "1" (positive);
    // neutral 4.0 reviews are dropped.
    // Fix: the original used !line.contains("4.0"), which also discarded rows where "4.0"
    // appeared in any OTHER field (e.g. inside the review text); filter on the score field itself.
    // Rows with fewer than 8 fields or a non-numeric score are skipped instead of throwing.
    val splits = data.map(_.split("\t"))
      .filter { fields =>
        fields.length >= 8 &&
          scala.util.Try(fields(4).toDouble).toOption.exists(_ != 4.0)
      }
      .map { fields =>
        // Compare numerically rather than lexicographically (original used string <=).
        val score = fields(4).toDouble
        if (score <= 3.0) fields(4) = "0"
        else if (score == 5.0) fields(4) = "1"
        fields.take(8).mkString("\t")
      }

    // Training DataFrame with named columns (split already yields Strings; no .toString needed).
    val trains = splits.map(_.split("\t"))
      .map(x => Movies(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7)))
      .toDF()
    trains.createOrReplaceTempView("train")
    val doc = spark.sql("select text from train").rdd

    // Segment each review with the ansj dictionary analyzer, apply the stop filter, and join
    // the surviving tokens with "|". saveAsTextFile returns Unit, so it is not bound to a val
    // (the original assigned it to `splited` and then tried to .show() the Unit).
    doc.map { row =>
      DicAnalysis.parse(row.toString()).recognition(stop).toStringWithOutNature("|")
    }.saveAsTextFile("/Users/XXXXX/Desktop/c")

    spark.close()
  }
}
部分參考:http://m.blog.csdn.net/ozinco/article/details/70184347