1. 程式人生 > >JAVA中通過poi和pdfbox讀取office檔案和pdf檔案內容

JAVA中通過poi和pdfbox讀取office檔案和pdf檔案內容

xlsx
* @param file
* @return
* @throws IOException
*/
public static String readEXCEL2007(String file) throws IOException {
StringBuilder content = new StringBuilder();
XSSFWorkbook workbook = new XSSFWorkbook(file);
for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
if (null != workbook.getSheetAt(numSheets)) {
XSSFSheet aSheet = workbook.getSheetAt(numSheets);// 獲得一個sheet
for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet
.getLastRowNum(); rowNumOfSheet++) {
if (null != aSheet.getRow(rowNumOfSheet)) {
XSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 獲得一個行
for (short cellNumOfRow = 0; cellNumOfRow <= aRow
.getLastCellNum(); cellNumOfRow++) {
if (null != aRow.getCell(cellNumOfRow)) {
XSSFCell aCell = aRow.getCell(cellNumOfRow);// 獲得列值
if (convertCell(aCell).length() > 0) {
content.append(convertCell(aCell));
}
}
content.append("\n");
}
}
}
}
}
return content.toString();
}

private static String convertCell(Cell cell) {
NumberFormat formater = NumberFormat.getInstance();
formater.setGroupingUsed(false);
String cellValue = "";
if (cell == null) {
return cellValue;
}
switch (cell.getCellType()) {
case HSSFCell.CELL_TYPE_NUMERIC:
cellValue = formater.format(cell.getNumericCellValue());
break;
case HSSFCell.CELL_TYPE_STRING:
cellValue = cell.getStringCellValue();
break;
case HSSFCell.CELL_TYPE_BLANK:
cellValue = cell.getStringCellValue();
break;
case HSSFCell.CELL_TYPE_BOOLEAN:
cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
break;
case HSSFCell.CELL_TYPE_ERROR:
cellValue = String.valueOf(cell.getErrorCellValue());
break;
default:
cellValue = "";
}
return cellValue.trim();
}

/**
* 讀取doc檔案

* @param file
* @return
* @throws Exception
*/
public static String readWORD(String file) throws Exception {
String returnStr = "";
try {
WordExtractor wordExtractor = new WordExtractor(new FileInputStream(new File(file)));
returnStr = wordExtractor.getText();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return returnStr;
}

/**
* 讀取docx檔案

* @param file
* @return
* @throws Exception
*/
public static String readWORD2007(String file) throws Exception {
     return new XWPFWordExtractor(POIXMLDocument.openPackage(file)).getText();   
}

/**
* 讀取txt檔案
* @param file
* @return
* @throws IOException
*/
public  static String readTXT(String file) throws IOException {
String encoding = get_charset(new File(file));
if (encoding.equalsIgnoreCase("GBK")) {
return FileUtils.readFileToString(new File(file), "gbk");
} else {
return FileUtils.readFileToString(new File(file), "utf8");
}
}

private static String get_charset(File file) throws IOException {
String charset = "GBK";
byte[] first3Bytes = new byte[3];
BufferedInputStream bis = null;
try {
boolean checked = false;
bis = new BufferedInputStream(new FileInputStream(file));
bis.mark(0);
int read = bis.read(first3Bytes, 0, 3);
if (read == -1)
return charset;
if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
charset = "UTF-16LE";
checked = true;
} else if (first3Bytes[0] == (byte) 0xFE
&& first3Bytes[1] == (byte) 0xFF) {
charset = "UTF-16BE";
checked = true;
} else if (first3Bytes[0] == (byte) 0xEF
&& first3Bytes[1] == (byte) 0xBB
&& first3Bytes[2] == (byte) 0xBF) {
charset = "UTF-8";
checked = true;
}
bis.reset();
if (!checked) {
// int len = 0;
int loc = 0;


while ((read = bis.read()) != -1) {
loc++;
if (read >= 0xF0)
break;
if (0x80 <= read && read <= 0xBF) // 單獨出現BF以下的,也算是GBK
break;
if (0xC0 <= read && read <= 0xDF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF) // 雙位元組 (0xC0 - 0xDF)
// (0x80
// - 0xBF),也可能在GB編碼內
continue;
else
break;
} else if (0xE0 <= read && read <= 0xEF) {// 也有可能出錯,但是機率較小
read = bis.read();
if (0x80 <= read && read <= 0xBF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF) {
charset = "UTF-8";
break;
} else
break;
} else
break;
}
}
// System.out.println( loc + " " + Integer.toHexString( read )
// );
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (bis != null) {
bis.close();
}
}
return charset;
}

/**
* 讀取pdf檔案內容

* @param file
* @return
* @throws IOException
*/
public static String readPDF(String file) throws IOException {
String result = null;
FileInputStream is = null;
PDDocument document = null;
try {
is = new FileInputStream(file);
PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
parser.parse();
document = parser.getPDDocument();
PDFTextStripper stripper = new PDFTextStripper();
result = stripper.getText(document);
} finally {
if (is != null) {
is.close();
}
if (document != null) {
document.close();
}
}
return result;
}