使用 POI + PDFBox 實現 Office 檔案內容提取
阿新 • • 發佈:2018-12-27
引入 Maven 依賴
<dependencies> <!-- poi --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.16</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.16</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.16</version> </dependency> <!-- pdf --> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.4</version> </dependency> </dependencies>
提取內容
/**
 * Extracts the plain-text content of a file, picking the parser from the file-name extension.
 *
 * <p>Supported extensions (matched case-insensitively): {@code .txt}, {@code .doc}/{@code .docx},
 * {@code .xls}/{@code .xlsx}, {@code .ppt}/{@code .pptx} and {@code .pdf}. For Office files the
 * legacy binary parser is tried first and the OOXML parser second, so a file is still read when
 * its extension does not match its real format.
 *
 * <p>This is a best-effort operation: I/O and parse failures are printed to standard error and
 * yield an empty string rather than an exception.
 *
 * @param file the file to read; must not be {@code null}
 * @return the extracted text, or an empty string when the extension is not supported or the
 *     file could not be parsed
 */
private static String read(File file) {
    // Lower-case once so that "REPORT.DOCX" is treated like "report.docx".
    String name = file.getName().toLowerCase(java.util.Locale.ROOT);

    if (name.endsWith(".txt")) {
        return readPlainText(file);
    }
    if (name.endsWith(".doc") || name.endsWith(".docx")) {
        return readWord(file);
    }
    if (name.endsWith(".xls") || name.endsWith(".xlsx")) {
        return readExcel(file);
    }
    if (name.endsWith(".ppt") || name.endsWith(".pptx")) {
        return readSlides(file);
    }
    if (name.endsWith(".pdf")) {
        return readPdf(file);
    }
    return "";
}

/** Reads a text file and decodes it with the platform default charset. */
private static String readPlainText(File file) {
    try {
        // Decode the whole file in one step: decoding fixed-size chunks independently corrupts
        // any multi-byte character that happens to straddle a chunk boundary.
        byte[] content = java.nio.file.Files.readAllBytes(file.toPath());
        return new String(content, java.nio.charset.Charset.defaultCharset());
    } catch (IOException e) {
        e.printStackTrace();
        return "";
    }
}

/** Extracts text from a Word document, trying the binary (.doc) format before OOXML (.docx). */
private static String readWord(File file) {
    POITextExtractor extractor;
    // The stream is closed as soon as the document object exists; POI's stream-based
    // constructors buffer the document in memory, so the extractor no longer needs it.
    try (FileInputStream in = new FileInputStream(file)) {
        extractor = new WordExtractor(new HWPFDocument(in));
    } catch (Exception binaryFailure) {
        try (FileInputStream in = new FileInputStream(file)) {
            extractor = new XWPFWordExtractor(new XWPFDocument(in));
        } catch (Exception ooxmlFailure) {
            reportFailure(binaryFailure, ooxmlFailure);
            return "";
        }
    }
    return textAndClose(extractor);
}

/** Extracts text from a spreadsheet, trying the binary (.xls) format before OOXML (.xlsx). */
private static String readExcel(File file) {
    // The file-backed POIFS is opened here, so it is closed here on every path instead of
    // relying on the extractor to release it.
    POIFSFileSystem fileSystem = null;
    POITextExtractor extractor;
    try {
        fileSystem = new POIFSFileSystem(file);
        extractor = new ExcelExtractor(new HSSFWorkbook(fileSystem));
    } catch (Exception binaryFailure) {
        closeQuietly(fileSystem);
        fileSystem = null;
        try {
            extractor = new XSSFExcelExtractor(new XSSFWorkbook(file));
        } catch (Exception ooxmlFailure) {
            reportFailure(binaryFailure, ooxmlFailure);
            return "";
        }
    }
    try {
        return textAndClose(extractor);
    } finally {
        closeQuietly(fileSystem);
    }
}

/** Extracts text from a presentation, trying the binary (.ppt) format before OOXML (.pptx). */
private static String readSlides(File file) {
    POITextExtractor extractor;
    try (FileInputStream in = new FileInputStream(file)) {
        extractor = new PowerPointExtractor(in);
    } catch (Exception binaryFailure) {
        OPCPackage pkg = null;
        try {
            pkg = OPCPackage.open(file);
            extractor = new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
        } catch (Exception ooxmlFailure) {
            if (pkg != null) {
                // Release the file handle without writing anything back to the file.
                pkg.revert();
            }
            reportFailure(binaryFailure, ooxmlFailure);
            return "";
        }
    }
    return textAndClose(extractor);
}

/** Extracts text from a PDF document. */
private static String readPdf(File file) {
    String text = "";
    try (PDDocument document = PDDocument.load(file)) {
        text = new PDFTextStripper().getText(document);
    } catch (IOException e) {
        // Text already extracted is kept even if only closing the document failed.
        e.printStackTrace();
    }
    return text;
}

/** Returns the extractor's text and closes it, even when text extraction itself fails. */
private static String textAndClose(POITextExtractor extractor) {
    try {
        return extractor.getText();
    } finally {
        try {
            extractor.close();
        } catch (IOException ignored) {
            // Best effort: the text has been read (or extraction already failed).
        }
    }
}

/** Closes a resource if present, ignoring any failure to do so. */
private static void closeQuietly(java.io.Closeable resource) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (IOException ignored) {
        // Best effort: nothing useful can be done if releasing the handle fails.
    }
}

/** Prints the fallback parser's failure, with the first parser's failure attached to it. */
private static void reportFailure(Exception primaryFailure, Exception fallbackFailure) {
    fallbackFailure.addSuppressed(primaryFailure);
    fallbackFailure.printStackTrace();
}