1. 程式人生 > >To be or not to be ...

To be or not to be ...

/*
需求:為檔案建立倒排索引
step1:
    |--為所有的檔案建立索引號 DocID_<序號>
      |--首先查詢到目錄下的所有檔案 File.listFiles()
      |--將所有檔案的索引號與路徑寫到一個檔案索引檔案中 docIndex.txt
step2:
    |--根據檔案的路徑將檔案載入到程式中,並將其中的單詞分詞統計
      |--統計每個單詞在各個檔案中出現的頻率,並將統計資訊寫到結果檔案wordIndex.txt中

*/

import java.io.*;
import java.util.*;

/**
 * Builds a simple inverted index for the files found directly inside a directory.
 *
 * <p>Step 1 ({@link #getFileIndex}) gives every regular file an id of the form
 * {@code DocID_n} and writes one {@code id<TAB>absolute path} line per file to a
 * document index file. Step 2 ({@link #getWordsFrequency}) reads that index,
 * tokenizes every listed document and writes, for each lower-cased word, a
 * {@code word<TAB>{docId=count, ...}} line to a word index file.
 */
class InvertedEngine
{
    /**
     * Delimiter used to tokenize a line: every single non-word character.
     * Compiled once here instead of once per line by {@code String.split}.
     */
    private static final java.util.regex.Pattern NON_WORD =
            java.util.regex.Pattern.compile("\\W");

    /**
     * Indexes the {@code documents} directory into {@code docIndex.txt} and
     * {@code wordIndex.txt} (all relative to the working directory).
     *
     * @param args ignored
     * @throws IOException if a document or one of the index files cannot be read or written
     */
    public static void main(String[] args) throws IOException
    {
        String filePath = "documents";
        String docIndex = "docIndex.txt";
        String wordIndex = "wordIndex.txt";

        getFileIndex(filePath, docIndex);
        getWordsFrequency(docIndex, wordIndex);
        System.out.println("Work Done!");
    }

    /**
     * Writes one {@code DocID_n<TAB>absolute path} line to {@code docIndex} for every
     * regular file located directly inside {@code filePath}.
     *
     * <p>Subdirectories are skipped because they cannot be read as documents later on.
     * If {@code filePath} is not a readable directory an empty index is written.
     * I/O failures are reported on standard output rather than thrown.
     *
     * @param filePath directory whose files are to be indexed
     * @param docIndex path of the document index file to create or overwrite
     */
    public static void getFileIndex(String filePath, String docIndex)
    {
        File[] fileList = new File(filePath).listFiles();
        if (fileList == null)
        {
            // listFiles() returns null when the path is not a directory or cannot be listed.
            fileList = new File[0];
        }

        BufferedWriter bufw = null;
        try
        {
            bufw = new BufferedWriter(new FileWriter(docIndex));
            int docNumber = 0; // separate counter keeps ids contiguous when entries are skipped
            for (File doc : fileList)
            {
                if (!doc.isFile())
                {
                    continue;
                }
                bufw.write("DocID_" + docNumber + "\t" + doc.getAbsolutePath());
                bufw.newLine();
                docNumber++;
            }
        }
        catch (IOException e)
        {
            System.out.println("開啟檔案失敗" + e);
        }
        finally
        {
            try
            {
                if (bufw != null)
                {
                    bufw.close(); // close() also flushes the buffered lines
                }
            }
            catch (IOException ex)
            {
                System.out.println("關閉檔案失敗" + ex);
            }
        }
    }

    /**
     * Reads the document index, counts the words of every listed document and writes
     * the resulting inverted index to {@code wordIndex}.
     *
     * <p>Each output line has the form {@code word<TAB>{DocID_a=n, DocID_b=m}}; words
     * and document ids appear in their natural (sorted) order. Index lines that do not
     * contain a tab separator are ignored. The word index file is only opened once all
     * documents have been read.
     *
     * @param docIndex  path of the document index produced by {@link #getFileIndex}
     * @param wordIndex path of the word index file to create or overwrite
     * @throws IOException if the document index, a document or the word index cannot be
     *                     read or written
     */
    public static void getWordsFrequency(String docIndex, String wordIndex) throws IOException
    {
        TreeMap<String, TreeMap<String, Integer>> tmp =
                new TreeMap<String, TreeMap<String, Integer>>();

        try (BufferedReader bufr = new BufferedReader(new FileReader(docIndex)))
        {
            String docIDandPath;
            while ((docIDandPath = bufr.readLine()) != null)
            {
                // Limit of 2 keeps a path intact even if it contains a tab itself.
                String[] docInfo = docIDandPath.split("\t", 2);
                if (docInfo.length < 2)
                {
                    continue; // blank or malformed line
                }
                countWords(docInfo[0], docInfo[1], tmp);
            }
        }

        try (BufferedWriter bufw = new BufferedWriter(new FileWriter(wordIndex)))
        {
            for (Map.Entry<String, TreeMap<String, Integer>> em : tmp.entrySet())
            {
                bufw.write(em.getKey() + "\t" + em.getValue());
                bufw.newLine();
            }
        }
    }

    /**
     * Tokenizes one document and records every word in {@code tmp}; the document's
     * reader is always closed, even when reading fails.
     */
    private static void countWords(String docID, String docPath,
                                   TreeMap<String, TreeMap<String, Integer>> tmp) throws IOException
    {
        try (BufferedReader bufrDoc = new BufferedReader(new FileReader(docPath)))
        {
            String wordLine;
            while ((wordLine = bufrDoc.readLine()) != null)
            {
                for (String wordOfDoc : NON_WORD.split(wordLine))
                {
                    // Adjacent delimiters produce empty tokens; they are not words.
                    if (!wordOfDoc.equals(""))
                    {
                        wordDeal(wordOfDoc, docID, tmp);
                    }
                }
            }
        }
    }

    /**
     * Records one occurrence of {@code wordOfDoc} in document {@code docID}.
     *
     * <p>The word is lower-cased (locale-independently) before it is used as a key, so
     * counting is case-insensitive.
     *
     * @param wordOfDoc the word that was found; must not be {@code null}
     * @param docID     id of the document the word was found in
     * @param tmp       inverted index to update in place: word -> (document id -> count)
     */
    public static void wordDeal(String wordOfDoc, String docID,
                                TreeMap<String, TreeMap<String, Integer>> tmp)
    {
        wordOfDoc = wordOfDoc.toLowerCase(Locale.ROOT);

        TreeMap<String, Integer> tmpST = tmp.get(wordOfDoc);
        if (tmpST == null)
        {
            // First time this word is seen in any document.
            tmpST = new TreeMap<String, Integer>();
            tmp.put(wordOfDoc, tmpST);
        }

        Integer count = tmpST.get(docID);
        // count + 1, not count++: the post-increment would store the old value back.
        tmpST.put(docID, (count == null) ? 1 : count + 1);
    }
}