1. 程式人生 > java判斷txt檔案的編碼格式

java判斷txt檔案的編碼格式

 /**
     * Matches URLs (with or without a scheme) in plain text. Compiled once
     * because {@link Pattern} compilation is comparatively expensive and the
     * compiled pattern is immutable.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'\".,<>?«»“”‘’]))");

    /**
     * Reads a text file and converts its content to an HTML fragment.
     *
     * <p>The file encoding is guessed with {@code getFilecharset}. Every line is
     * HTML-escaped ({@code <}, {@code >}, {@code &} and {@code "} become
     * entities), every second space of a run of spaces becomes {@code &nbsp;},
     * tabs become a fixed run of non-breaking spaces, each line is terminated
     * with {@code <br>}, and URLs are wrapped in {@code <a href="...">} links.
     * A byte order mark at the start of the file is dropped.
     *
     * @param s path of the text file to convert
     * @return the HTML fragment, or {@code null} when {@code s} does not denote
     *         an existing regular file or when reading fails (the failure is
     *         logged)
     */
    public static String txtToHtml(String s) {
        try {
            File file = new File(s);
            // isFile() already implies that the file exists.
            if (!file.isFile()) {
                logger.error("找不到指定的檔案");
                return null;
            }
            String encoding = getFilecharset(file);
            StringBuilder builder = new StringBuilder();
            // try-with-resources closes the reader (and the underlying stream)
            // even when decoding or reading fails half-way through.
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), encoding))) {
                boolean firstLine = true;
                String lineTxt;
                while ((lineTxt = bufferedReader.readLine()) != null) {
                    if (firstLine) {
                        firstLine = false;
                        // The explicit UTF-8 / UTF-16LE / UTF-16BE decoders keep
                        // the BOM as U+FEFF; it must not leak into the HTML.
                        if (!lineTxt.isEmpty() && lineTxt.charAt(0) == '\uFEFF') {
                            lineTxt = lineTxt.substring(1);
                        }
                    }
                    appendLineAsHtml(builder, lineTxt);
                }
            }
            return builder.toString();
        } catch (Exception e) {
            // Broad catch kept on purpose: callers rely on a null result for any
            // failure, including a null path.
            logger.error("讀取檔案內容出錯", e);
            return null;
        }
    }

    /**
     * Appends one line of plain text as HTML, turning URLs into links and
     * finishing the line with {@code <br>}.
     *
     * <p>URLs are located in the raw text, before escaping, so that entities
     * produced by the escaping (for example {@code &gt;} after
     * {@code <http://example.com>}) can never become part of a link target.
     */
    private static void appendLineAsHtml(StringBuilder out, String line) {
        Matcher matcher = URL_PATTERN.matcher(line);
        int pos = 0;
        while (matcher.find()) {
            appendEscaped(out, line, pos, matcher.start(1));
            out.append("<a href=\"");
            appendEscaped(out, line, matcher.start(1), matcher.end(1));
            out.append("\">");
            appendEscaped(out, line, matcher.start(1), matcher.end(1));
            out.append("</a>");
            pos = matcher.end(1);
        }
        appendEscaped(out, line, pos, line.length());
        out.append("<br>");
    }

    /**
     * Appends {@code text[from, to)} to {@code out} with HTML special characters
     * replaced by entities and whitespace made visible in HTML.
     */
    private static void appendEscaped(StringBuilder out, String text, int from, int to) {
        boolean previousWasASpace = false;
        for (int i = from; i < to; i++) {
            char c = text.charAt(i);
            if (c == ' ') {
                if (previousWasASpace) {
                    // Alternate plain and non-breaking spaces so that runs of
                    // spaces survive HTML whitespace collapsing but still wrap.
                    out.append("&nbsp;");
                    previousWasASpace = false;
                    continue;
                }
                previousWasASpace = true;
            } else {
                previousWasASpace = false;
            }
            switch (c) {
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '&':
                    out.append("&amp;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                // We need Tab support here, because we print StackTraces as HTML
                case '\t':
                    out.append("&nbsp; &nbsp; &nbsp;");
                    break;
                default:
                    out.append(c);
            }
        }
    }

    /**
     * Guesses the character encoding of a text file.
     *
     * <p>A byte order mark at the start of the file selects {@code UTF-16LE},
     * {@code UTF-16BE} or {@code UTF-8}. Without a BOM the bytes are scanned
     * from the beginning: the first well-formed three-byte UTF-8 sequence
     * (lead byte {@code 0xE0-0xEF} followed by two bytes in {@code 0x80-0xBF})
     * selects {@code UTF-8}. Well-formed two-byte sequences are skipped because
     * the same byte pairs can also occur in GB encodings. Any other non-ASCII
     * byte ends the scan. In every other case, including an empty or unreadable
     * file, the result is {@code GBK}.
     *
     * <p>This is a heuristic; it can misclassify files.
     *
     * @param sourceFile the file to inspect
     * @return the name of the guessed charset: {@code "GBK"}, {@code "UTF-8"},
     *         {@code "UTF-16LE"} or {@code "UTF-16BE"}
     */
    private static String getFilecharset(File sourceFile) {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        // try-with-resources guarantees the stream is closed on every exit path,
        // including the early returns below and any exception.
        try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
            // The read limit must cover the bytes consumed before reset().
            bis.mark(first3Bytes.length);
            int read = bis.read(first3Bytes, 0, first3Bytes.length);
            if (read == -1) {
                return charset; // empty file: fall back to ANSI (GBK)
            }
            if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
                return "UTF-16LE"; // "Unicode" (little endian) BOM
            }
            if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
                return "UTF-16BE"; // "Unicode big endian" BOM
            }
            if (first3Bytes[0] == (byte) 0xEF
                    && first3Bytes[1] == (byte) 0xBB
                    && first3Bytes[2] == (byte) 0xBF) {
                return "UTF-8"; // UTF-8 BOM
            }

            // No BOM: rescan from the first byte looking for UTF-8 sequences.
            bis.reset();
            while ((read = bis.read()) != -1) {
                if (read >= 0xF0) {
                    break;
                }
                if (isUtf8ContinuationByte(read)) {
                    // A lone continuation byte is not UTF-8; treat the file as GBK.
                    break;
                }
                if (0xC0 <= read && read <= 0xDF) {
                    // Two-byte sequence (0xC0-0xDF)(0x80-0xBF): also possible in
                    // GB encodings, so it is not conclusive; keep scanning.
                    if (isUtf8ContinuationByte(bis.read())) {
                        continue;
                    }
                    break;
                }
                if (0xE0 <= read && read <= 0xEF) {
                    // Three-byte sequence: may still be wrong, but the
                    // probability is small.
                    if (isUtf8ContinuationByte(bis.read())
                            && isUtf8ContinuationByte(bis.read())) {
                        charset = "UTF-8";
                    }
                    break;
                }
            }
        } catch (Exception e) {
            // Best effort: detection failures fall back to the default charset.
            e.printStackTrace();
        }
        return charset;
    }

    /** Returns whether {@code b} lies in the UTF-8 continuation range 0x80-0xBF. */
    private static boolean isUtf8ContinuationByte(int b) {
        return 0x80 <= b && b <= 0xBF;
    }

轉自:http://blog.163.com/wf_shunqiziran/blog/static/176307209201258102217810/