Java使用PDFBox操作PDF檔案獲取頁碼、文章內容、縮圖
阿新 • • 發佈:2018-11-24
一、依賴
<!-- PDFBox is used to count the total pages and to render the thumbnail -->
<!-- NOTE(review): com.sleepycat:je is not referenced by the code shown in this article; confirm whether it is needed -->
<!-- https://mvnrepository.com/artifact/com.sleepycat/je -->
<dependency>
    <groupId>com.sleepycat</groupId>
    <artifactId>je</artifactId>
    <version>5.0.73</version>
</dependency>

<!-- pdf -->
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.8</version>
</dependency>
二、實現程式碼
import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; import javax.imageio.IIOImage; import javax.imageio.ImageIO; import javax.imageio.ImageWriter; import javax.imageio.stream.ImageOutputStream; import java.awt.image.BufferedImage; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.Iterator; @Slf4j public class PdfUtil { /** * 通過PDFbox獲取文章總頁數 * * @param filePath:檔案路徑 * @return * @throws IOException */ public static int getNumberOfPages(String filePath) throws IOException, InterruptedException { File file = new File(filePath); PDDocument pdDocument = PDDocument.load(new File(filePath)); int pages = pdDocument.getNumberOfPages(); pdDocument.close(); return pages; } } /** * 通過PDFbox獲取文章內容 * * @param filePath * @return */ public static String getContent(String filePath) throws IOException { PDFParser pdfParser = new PDFParser(new org.apache.pdfbox.io.RandomAccessFile(new File(filePath), "rw")); pdfParser.parse(); PDDocument pdDocument = pdfParser.getPDDocument(); String text = new PDFTextStripper().getText(pdDocument); pdDocument.close(); return text; } /** * 通過PDFbox生成檔案的縮圖 * * @param filePath:檔案路徑 * @param outPath:輸出圖片路徑 * @throws IOException */ public static void getThumbnails(String filePath, String outPath) throws IOException { // 利用PdfBox生成影象 PDDocument pdDocument = PDDocument.load(new File(filePath)); PDFRenderer renderer = new PDFRenderer(pdDocument); // 構造圖片 BufferedImage img_temp = renderer.renderImageWithDPI(0, 30, ImageType.RGB); // 設定圖片格式 Iterator<ImageWriter> it = ImageIO.getImageWritersBySuffix("png"); // 將檔案寫出 ImageWriter writer = (ImageWriter) it.next(); ImageOutputStream imageout = ImageIO.createImageOutputStream(new FileOutputStream(outPath)); writer.setOutput(imageout); writer.write(new 
IIOImage(img_temp, null, null)); img_temp.flush(); imageout.flush(); imageout.close(); //Warning: You did not close a PDF Document pdDocument.close(); } }
三、測試類--Main
import java.io.IOException;

/**
 * Manual smoke test for {@link PdfUtil}: prints the page count and the text
 * of a sample PDF, then writes a PNG thumbnail next to it.
 *
 * @author Mr.lu
 */
public class Main {

    public static void main(String[] args) throws IOException, InterruptedException {
        // One path for all three calls, so the sample file name cannot drift
        // between them.
        String pdfPath = "D:\\Desktop\\DocCloud\\testDir\\hadoopClientCode.pdf";

        int numberOfPages = PdfUtil.getNumberOfPages(pdfPath);
        System.out.println(numberOfPages);

        String content = PdfUtil.getContent(pdfPath);
        System.out.println(content);

        PdfUtil.getThumbnails(pdfPath, pdfPath + ".png");
    }
}
1>首先測試獲取PDF檔案的總頁數,在控制檯可以看到輸出的頁數
2>測試獲取PDF檔案的內容,在控制檯可以看到--你自己PDF檔案中的內容
3>測試生成PDF縮圖
縮圖的大小取決於渲染時的DPI,可以修改程式碼中 renderImageWithDPI 的第二個參數(範例中為 30)來調整