1. 程式人生 > pdf轉圖片、提取pdf文字、提取pdf圖片

pdf轉圖片、提取pdf文字、提取pdf圖片

 
  package com.midevip.common.util;
  import com.itextpdf.text.pdf.PdfReader;
  import net.coobird.thumbnailator.Thumbnails;
  import org.apache.pdfbox.cos.COSName;
  import org.apache.pdfbox.pdmodel.*;
  import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
  import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
  import org.apache.pdfbox.rendering.PDFRenderer;
  import org.apache.pdfbox.text.PDFTextStripper;
   
  import javax.imageio.IIOImage;
  import javax.imageio.ImageIO;
  import javax.imageio.ImageWriteParam;
  import javax.imageio.ImageWriter;
  import java.awt.image.BufferedImage;
  import java.io.File;
  import java.io.IOException;
  import java.io.InputStream;
  import java.text.SimpleDateFormat;
  import java.util.Calendar;
  import java.util.Iterator;
   
  /**
   * Extracts text and image content from PDF files using Apache PDFBox.
   * PDFBox home page: https://pdfbox.apache.org/
   *
   * <p>Required Maven dependencies:
   * <pre>{@code
   * <dependency>
   *   <groupId>org.apache.pdfbox</groupId>
   *   <artifactId>fontbox</artifactId>
   *   <version>2.0.1</version>
   * </dependency>
   * <dependency>
   *   <groupId>org.apache.pdfbox</groupId>
   *   <artifactId>pdfbox</artifactId>
   *   <version>2.0.1</version>
   * </dependency>
   * <dependency>
   *   <groupId>com.itextpdf</groupId>
   *   <artifactId>itextpdf</artifactId>
   *   <version>5.5.13</version>
   * </dependency>
   * <dependency>
   *   <groupId>net.coobird</groupId>
   *   <artifactId>thumbnailator</artifactId>
   *   <version>0.4.8</version>
   * </dependency>
   * }</pre>
   */
  public class PdfTest {
  /** Date/time pattern handed to {@link SimpleDateFormat} by {@code dateFormat}. */
  public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
   
  /**
  *
  *
  * @param pdfFilePath
  * @throws Exception
  */
  public static void extractText(String pdfFilePath) throws Exception{
  try (PDDocument document = PDDocument.load(new File(pdfFilePath)))
  {
  AccessPermission ap = document.getCurrentAccessPermission();
  if (!ap.canExtractContent())
  {
  throw new IOException("You do not have permission to extract text");
  }
  PDFTextStripper stripper = new PDFTextStripper();
  stripper.setSortByPosition(true);
   
  for (int p = 1; p <= document.getNumberOfPages(); ++p)
  {
  // 這裡分為一頁一頁的提取,如果不設定,預設會把所有頁的內容一次性提取出來,根據需要選擇
  stripper.setStartPage(p);
  stripper.setEndPage(p);
   
  //提取內容就這一行程式碼
  //提取內容很徹底,包括了頁首頁尾的內容也都會被提出來
  String text = stripper.getText(document);
   
  String pageStr = String.format("page %d:", p);
  System.out.println(pageStr);
  //為了打印出來更美觀
  for (int i = 0; i < pageStr.length(); ++i)
  {
  System.out.print("-");
  }
  System.out.println();
  System.out.println(text.trim());
  System.out.println();
  }
  }
  }
   
  public static void pdfParse(String pdfPath) throws Exception {
  InputStream input = null;
  PDDocument document = null;
  try {
  document = PDDocument.load(new File(pdfPath));
   
  /** 文件屬性資訊 **/
  PDDocumentInformation info = document.getDocumentInformation();
  System.out.println("標題:" + info.getTitle());
  System.out.println("主題:" + info.getSubject());
  System.out.println("作者:" + info.getAuthor());
  System.out.println("關鍵字:" + info.getKeywords());
   
  System.out.println("應用程式:" + info.getCreator());
  System.out.println("pdf 製作程式:" + info.getProducer());
   
  System.out.println("作者:" + info.getTrapped());
   
  System.out.println("建立時間:" + dateFormat(info.getCreationDate()));
  System.out.println("修改時間:" + dateFormat(info.getModificationDate()));
   
   
  //獲取內容資訊
  PDFTextStripper pts = new PDFTextStripper();
  String content = pts.getText(document);
  System.out.println("內容:" + content);
   
  /** 文件頁面資訊 **/
  PDDocumentCatalog cata = document.getDocumentCatalog();
  int count = 1;
  for (int i = 0; i < document.getNumberOfPages(); i++) {
  PDPage page = document.getPage(i);
  if (null != page) {
  //獲取到所有rescourse資訊
  PDResources res = page.getResources();
  Iterable<COSName> xit = res.getXObjectNames();
  Iterator<COSName> iterator = xit.iterator();
  while (iterator.hasNext()){
  COSName cosName = iterator.next();
  System.out.println(cosName.getName());
  //判斷是否圖片資源,這個提取圖片也很徹底,包括頁首頁尾的圖片也會被獲取到
  if(res.isImageXObject(cosName)){
  PDImageXObject pdImageXObject = (PDImageXObject)res.getXObject(cosName);
  //這裡儲存圖片我用了谷歌的thumbnailator框架,也可以用自己的方法去儲存BufferedImage物件到本地圖片
  Thumbnails.of(pdImageXObject.getImage()).scale(0.9).toFile(new File("D:\\pdf\\"+System.currentTimeMillis()+".jpg"));
  }
  }
  }
  }
  } catch (Exception e) {
  throw e;
  } finally {
  if (null != input)
  input.close();
  if (null != document)
  document.close();
  }
  }
   
  /***
  * PDF檔案轉PNG圖片,全部頁數
  *
  * @param PdfFilePath pdf完整路徑
  * @param dpi dpi越大轉換後越清晰,相對轉換速度越慢
  * @return
  */
  private static boolean pdf2Image(String PdfFilePath, String dstImgFolder, int dpi) {
  File file = new File(PdfFilePath);
  PDDocument pdDocument;
  try {
  String imgPDFPath = file.getParent();
  int dot = file.getName().lastIndexOf('.');
  String imagePDFName = file.getName().substring(0, dot); // 獲取圖片檔名
  String imgFolderPath = null;
  if (dstImgFolder.equals("")) {
  imgFolderPath = imgPDFPath + File.separator + imagePDFName;// 獲取圖片存放的資料夾路徑
  } else {
  imgFolderPath = dstImgFolder + File.separator + imagePDFName;
  }
   
  if (createDirectory(imgFolderPath)) {
   
  pdDocument = PDDocument.load(file);
  PDFRenderer renderer = new PDFRenderer(pdDocument);
  /* dpi越大轉換後越清晰,相對轉換速度越慢 */
  PdfReader reader = new PdfReader(PdfFilePath);
  int pages = reader.getNumberOfPages();
  StringBuffer imgFilePath = null;
  for (int i = 0; i < pages; i++) {
  String imgFilePathPrefix = imgFolderPath + File.separator + imagePDFName;
  imgFilePath = new StringBuffer();
  imgFilePath.append(imgFilePathPrefix);
  imgFilePath.append("_");
  imgFilePath.append(String.valueOf(formatNumber(i+1)));
  imgFilePath.append(".jpg");
  File dstFile = new File(imgFilePath.toString());
  BufferedImage image = renderer.renderImageWithDPI(i, dpi);
   
   
  ImageWriter writer = ImageIO.getImageWritersByFormatName("jpg").next();
  writer.setOutput(ImageIO.createImageOutputStream(dstFile));
  ImageWriteParam param = writer.getDefaultWriteParam();
  param.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
  param.setCompressionQuality(0.3f);
  writer.write(null, new IIOImage(image, null, null), param);
   
  // ImageIO.write(image, "jpg", dstFile);
  }
  System.out.println("PDF文件轉圖片成功!"+dstImgFolder);
  return true;
  } else {
  System.out.println("PDF文件轉圖片失敗:" + "建立" + imgFolderPath + "失敗");
  }
   
  } catch (IOException e) {
  e.printStackTrace();
  }
  return false;
  }
   
  private static String formatNumber(int i){
  if(i<10){
  return "00"+i;
  }else if(i<100){
  return "0"+i;
  }else{
  return i+"";
  }
  }
   
  private static boolean createDirectory(String folder) {
  File dir = new File(folder);
  if (dir.exists()) {
  return true;
  } else {
  return dir.mkdirs();
  }
  }
   
  public static String dateFormat(Calendar calendar) throws Exception {
  if (null == calendar)
  return null;
  String date = null;
  try {
  String pattern = DATE_FORMAT;
  SimpleDateFormat format = new SimpleDateFormat(pattern);
  date = format.format(calendar.getTime());
  } catch (Exception e) {