1. 程式人生 > >Java過濾停用詞原始碼

Java過濾停用詞原始碼

package SimilarityCompution; 
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;

import ICTCLAS.I3S.AC.ICTCLAS50;
public class FileExcludeStopWord { 
//停用詞詞表 
public static final String stopWordTable = "." + File.separator + "srcFile" + File.separator + "StopWordTable.txt"; 
public static void main(String[] args) { 
//原始檔和目的檔案 
String srcFile = "." + File.separator + "srcFile" + File.separator + "如何正確的使用化妝品效.txt"; 
String destFile = "." + File.separator + "destFile" + File.separator + "如何正確的使用化妝品效.txt"; 
new FileExcludeStopWord().fileExcludeStopWord(srcFile, destFile); 

public void fileExcludeStopWord(String srcFile,StringdestFile){ 
try { 
//讀取原檔案和停用詞表 
BufferedReadersrcFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFile)))); 
BufferedReaderStopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(stopWordTable)))); 
//將去除停用詞的文字資訊存入輸出檔案 
BufferedWriterdestFileBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(destFile)))); 
//用來存放停用詞的集合 Set stopWordSet = new HashSet<String>(); //初如化停用詞集 String stopWord = null; for(; (stopWord = StopWordFileBr.readLine()) != null;){ stopWordSet.add(stopWord); } //分詞工具 ICTCLAS50 ICTCLAS = new ICTCLAS50(); 
// 初始化分詞所用庫的路徑 
String argu = "."; 
if (ICTCLAS.ICTCLAS_Init(argu.getBytes("gb2312")) == false) { 
System.out.println("分詞所用庫初始化失敗。"); 
return; 

String paragraph = null; 
for(; (paragraph = srcFileBr.readLine()) != null;){ 
//對讀入的文字進行分詞 
byte[] spiltResult = ICTCLAS.ICTCLAS_ParagraphProcess(paragraph.getBytes("gb2312"), 2, 0); 
String spiltResultStr = new String(spiltResult,0,spiltResult.length,"gb2312"); 
//得到分詞後的詞彙陣列,以便後續比較 
String[] resultArray = spiltResultStr.split(" "); 
//過濾停用詞 
for(int i = 0; i<resultArray.length; i++){ 
if(stopWordSet.contains(resultArray[i])){ 
resultArray[i] = null; 


//把過濾後的字串陣列存入到一個字串中 
StringBufferfinalStr = new StringBuffer(); 
for(int i = 0; i<resultArray.length; i++){ 
if(resultArray[i] != null){ 
finalStr = finalStr.append(resultArray[i]).append(" "); 


} } } 

//將過濾後的文字資訊寫入到指定檔案中

 destFileBw.write(finalStr.toString()); 

destFileBw.newLine(); 

//關閉輸入流

 destFileBw.close(); 

StopWordFileBr.close(); srcFileBr.close(); } 

catch (FileNotFoundException e) { 

e.printStackTrace(); 

} catch(Exception e){ 

e.printStackTrace();