1. 程式人生 > >檔案編碼檢測.測試程式碼

檔案編碼檢測.測試程式碼

ZC:這個是在 G轉SVG的C++專案中要用到這個功能的,然後逐步查資料查到 jchardet的,相關的文章為“檔案編碼檢測.ZC一些資料(包含java的) - CppSkill - 部落格園.html(https://www.cnblogs.com/cppskill/p/9906599.html)”

ZC:相關檔案位於:“..\ZC_IDE\Java_3rd\ZC_檔案編碼自動檢測\jchardet-1.1.zip”,jar 位於 “...\ZC_IDE\Java_3rd\ZC_檔案編碼自動檢測\jchardet-1.1\dist\lib\chardet.jar

 

1、測試程式碼:

 (1)、字符集編碼的自動識別jchardet - 雲守護的專欄 - CSDN部落格.html(

https://blog.csdn.net/earbao/article/details/38709701

  (1.1)、

package com.AAA;

import java.io.BufferedInputStream;
import java.net.URL;
 
import org.mozilla.intl.chardet.HtmlCharsetDetector;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import
org.mozilla.intl.chardet.nsPSMDetector; public class FFF { public static void main(String[] args) throws Exception { String strUrl = "C:\\Users\\33\\Desktop\\zzz.g"; int lang = nsPSMDetector.ALL; // if (args.length < 1) { // System.out.println("usage:Main url <int>lang");
// return; // } // int lang = (args.length == 2) ? Integer.parseInt(args[1]) // : nsPSMDetector.ALL; // 實現nsICharsetDetectionObserver介面,這個介面只有一個Notify()方法. // 當jchardet引擎自己認為已經識別出字串的字符集後(不論識別的對錯),都會呼叫這個Notify方法。 nsICharsetDetectionObserver cdo = new nsICharsetDetectionObserver() { public void Notify(String charset) { HtmlCharsetDetector.found = true; System.out.println("CHARSET = " + charset); } }; /** * 初始化nsDetector() lang為一個整數,用以提示語言線索,可以提供的語言線索有以下幾個: Japanese Chinese * Simplified Chinese Traditional Chinese Korean Dont know (預設) */ nsDetector det = new nsDetector(lang); // 設定一個Oberver det.Init(cdo); //URL url = new URL(args[0]); URL url = new URL(strUrl); BufferedInputStream imp = new BufferedInputStream(url.openStream()); byte[] buf = new byte[1024]; boolean done = false; // 是否已經確定某種字符集 boolean isAscii = true;// 假定當前的串是ASCII編碼 int len; boolean found = false; while ((len = imp.read(buf, 0, buf.length)) != -1) { // 檢查是不是全是ascii字元,當有一個字元不是ASC編碼時,則所有的資料即不是ASCII編碼了。 if (isAscii) isAscii = det.isAscii(buf, len); // 如果不是ascii字元,則呼叫DoIt方法. if (!isAscii && !done) done = det.DoIt(buf, len, false);// 如果不是ASCII,又還沒確定編碼集,則繼續檢測。 } det.DataEnd();// 最後要呼叫此方法,此時,Notify被呼叫。 if (isAscii) { System.out.println("CHARSET = ASCII"); found = true; } if (!found) {// 如果沒找到,則找到最可能的那些字符集 String prob[] = det.getProbableCharsets(); for (int i = 0; i < prob.length; i++) { System.out.println("Probable Charset = " + prob[i]); } } } }

 

 (2)、jChardet探測檔案字元編碼-部落格-雲棲社群-阿里雲.html(https://yq.aliyun.com/articles/59514

      ZC:這裡的程式碼,在檢測 D:\DRGIS\BIN\Graphics裡面的圖形時,若 檔案是 UTF-8編碼的,則它顯示是 ASCII;若檔案是 UTF-8 + BOM編碼的,則它顯示是 UTF8.看起來 不太準...

  (2.1)、

package com.AAA;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
// jChardet 火狐所用字元編碼檢測演算法
public class FileCharsetDetector {
    private boolean found = false;
    private String encoding = null;

    public static void main(String[] argv) throws Exception
    {
        //File file1 = new File("C:\\Users\\Administrator\\Desktop\\VideoViewDemo\\VideoViewDemo\\src\\org\\apache\\android\\media\\AudioPlayer.java");
        //File file1 = new File("C:\\Users\\33\\Desktop\\zzz.g");
//        File file1 = new File("D:\\DRGIS\\BIN\\Graphics\\35kV婁宮變.fac.svg");
//        File file1 = new File("C:\\Users\\33\\Desktop\\MXB.fac.svg");
        File file1 = new File("C:\\Users\\33\\Desktop\\220kVXJB.fac.svg");
        
        
        System.out.println("檔案編碼:" + new FileCharsetDetector().guessFileEncoding(file1));
    }

    /**
     * 傳入一個檔案(File)物件,檢查檔案編碼
     * 
     * @param file
     *            File物件例項
     * @return 檔案編碼,若無,則返回null
     * @throws FileNotFoundException
     * @throws IOException
     */
    public String guessFileEncoding(File file) throws FileNotFoundException, IOException {
        return guessFileEncoding(file, new nsDetector());
    }

    /**
     * <pre>
     * 獲取檔案的編碼
     * @param file
     *            File物件例項
     * @param languageHint
     *            語言提示區域程式碼 @see #nsPSMDetector ,取值如下:
     *             1 : Japanese
     *             2 : Chinese
     *             3 : Simplified Chinese
     *             4 : Traditional Chinese
     *             5 : Korean
     *             6 : Dont know(default)
     * </pre>
     * 
     * @return 檔案編碼,eg:UTF-8,GBK,GB2312形式(不確定的時候,返回可能的字元編碼序列);若無,則返回null
     * @throws FileNotFoundException
     * @throws IOException
     */
    public String guessFileEncoding(File file, int languageHint) throws FileNotFoundException, IOException {
        return guessFileEncoding(file, new nsDetector(languageHint));
    }

    /**
     * 獲取檔案的編碼
     * 
     * @param file
     * @param det
     * @return
     * @throws FileNotFoundException
     * @throws IOException
     */
    private String guessFileEncoding(File file, nsDetector det) throws FileNotFoundException, IOException {
        // Set an observer...
        // The Notify() will be called when a matching charset is found.
        det.Init(new nsICharsetDetectionObserver() {
            public void Notify(String charset) {
                encoding = charset;
                found = true;
            }
        });

        BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file));
        byte[] buf = new byte[1024];
        int len;
        boolean done = false;
        boolean isAscii = false;

        while ((len = imp.read(buf, 0, buf.length)) != -1) {
            // Check if the stream is only ascii.
            isAscii = det.isAscii(buf, len);
            if (isAscii) {
                break;
            }
            // DoIt if non-ascii and not done yet.
            done = det.DoIt(buf, len, false);
            if (done) {
                break;
            }
        }
        imp.close();
        det.DataEnd();

        if (isAscii) {
            encoding = "ASCII";
            found = true;
        }

        if (!found) {
            String[] prob = det.getProbableCharsets();
            //這裡將可能的字符集組合起來返回
            for (int i = 0; i < prob.length; i++) {
                if (i == 0) {
                    encoding = prob[i];
                } else {
                    encoding += "," + prob[i];
                }
            }

            if (prob.length > 0) {
                // 在沒有發現情況下,也可以只取第一個可能的編碼,這裡返回的是一個可能的序列
                return encoding;
            } else {
                return null;
            }
        }
        return encoding;
    }
}

 

 (3)、藉助JCharDet獲取檔案字符集 - robin·張 - 部落格園.html(https://www.cnblogs.com/amunote/p/4178472.html

      ZC:這個文章裡面的程式碼,在檢測 D:\DRGIS\BIN\Graphics裡面的圖形時 能檢測出 UTF-8編碼的檔案是UTF8編碼

     ZC:看來 都是使用的 chardet.jar,編寫的程式碼不一樣 效果也是不同的。原始工具一樣 使用者 水平很關鍵

  (3.1)、

package com.AAA;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;

/**
 * 藉助JCharDet獲取檔案字符集
 * 
 * @author robin
 * 
 */
public class FileCharsetDetector01
{

    /**
     * 字符集名稱
     */
    private static String encoding;
    
    /**
     * 字符集是否已檢測到
     */
    private static boolean found;
    
    private static nsDetector detector;
    
    private static nsICharsetDetectionObserver observer;

    /**
     * 適應語言列舉
     * @author robin
     *
     */
    enum Language{
        Japanese(1),
        Chinese(2),
        SimplifiedChinese(3),
        TraditionalChinese(4), 
        Korean(5), 
        DontKnow(6);
        
        private int hint;
        
        Language(int hint){
            this.hint = hint;
        }
        
        public int getHint(){
            return this.hint;
        }
    }
    
    /**
     * 傳入一個檔案(File)物件,檢查檔案編碼
     * 
     * @param file
     *            File物件例項
     * @return 檔案編碼,若無,則返回null
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static String checkEncoding(File file) throws FileNotFoundException,
            IOException {
        return checkEncoding(file, getNsdetector());
    }

    /**
     * 獲取檔案的編碼
     * 
     * @param file
     *            File物件例項
     * @param language
     *            語言
     * @return 檔案編碼
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static String checkEncoding(File file, Language lang)
            throws FileNotFoundException, IOException {
        return checkEncoding(file, new nsDetector(lang.getHint()));
    }

    /**
     * 獲取檔案的編碼
     * 
     * @param path
     *            檔案路徑
     * @return 檔案編碼,eg:UTF-8,GBK,GB2312形式,若無,則返回null
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static String checkEncoding(String path) throws FileNotFoundException,
            IOException {
        return checkEncoding(new File(path));
    }

    /**
     * 獲取檔案的編碼
     * 
     * @param path
     *            檔案路徑
     * @param language
     *                 語言
     * @return
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static String checkEncoding(String path, Language lang)
            throws FileNotFoundException, IOException {
        return checkEncoding(new File(path), lang);
    }

    /**
     * 獲取檔案的編碼
     * 
     * @param file
     * @param det
     * @return
     * @throws FileNotFoundException
     * @throws IOException
     */
    private static String checkEncoding(File file, nsDetector detector)
            throws FileNotFoundException, IOException {
        
        detector.Init(getCharsetDetectionObserver());
        
        if (isAscii(file, detector)) {
            encoding = "ASCII";
            found = true;
        }

        if (!found) {
            String prob[] = detector.getProbableCharsets();
            if (prob.length > 0) {
                encoding = prob[0];
            } else {
                return null;
            }
        }
        
        return encoding;
    }
    
    /**
     * 檢查檔案編碼型別是否是ASCII型
     * @param file
     *             要檢查編碼的檔案
     * @param detector
     * @return
     * @throws IOException
     */
    private static boolean isAscii(File file, nsDetector detector) throws IOException{
        BufferedInputStream input = null;
        try{
            input = new BufferedInputStream(new FileInputStream(file));
            
            byte[] buffer = new byte[1024];
            int hasRead;
            boolean done = false;
            boolean isAscii = true;

            while ((hasRead=input.read(buffer)) != -1) {
                if (isAscii)
                    isAscii = detector.isAscii(buffer, hasRead);
                if (!isAscii && !done)
                    done = detector.DoIt(buffer, hasRead, false);
            }
            
            return isAscii;
        }finally{
            detector.DataEnd();
            if(null!=input)input.close();
        }
    }
    
    /**
     * nsDetector單例建立
     * @return
     */
    private static nsDetector getNsdetector(){
        if(null == detector){
            detector = new nsDetector();
        }
        return detector;
    }
    
    /**
     * nsICharsetDetectionObserver 單例建立
     * @return
     */
    private static nsICharsetDetectionObserver getCharsetDetectionObserver(){
        if(null==observer){
            observer = new nsICharsetDetectionObserver() {
                public void Notify(String charset) {
                    found = true;
                    encoding = charset;
                }
            };
        }
        return observer;
    }
    
    public static void main(String[] argv) throws Exception
    {
        //File file1 = new File("C:\\Users\\Administrator\\Desktop\\VideoViewDemo\\VideoViewDemo\\src\\org\\apache\\android\\media\\AudioPlayer.java");
        //File file1 = new File("C:\\Users\\33\\Desktop\\zzz.g");
//        File file1 = new File("D:\\DRGIS\\BIN\\Graphics\\35kV婁宮變.fac.svg");
//        File file1 = new File("C:\\Users\\33\\Desktop\\MXB.fac.svg");
//        File file1 = new File("C:\\Users\\33\\Desktop\\220kVXJB.fac.svg");

        String str = FileCharsetDetector01.checkEncoding("C:\\Users\\33\\Desktop\\220kVXJB.fac.svg");
        System.out.println("檔案編碼:" + str);
    }
    
}

 

2、

3、

4、

5、