1. 程式人生 > >java讀取pdf檔案的圖片和文字內容

java讀取pdf檔案的圖片和文字內容

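引用的jar包(以下兩個為 PDFBox 讀取文字所需;下方 extractImage 方法使用的 PdfReader、PdfDictionary、PRStream 等類別來自 iText,需另外引入 iText 的 jar 包):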

pdfbox-1.8.13.jar

fontbox-1.8.13.jar

/**
 * Demo entry point: extracts the text of a PDF file and prints it to standard output.
 *
 * @param args optional command-line arguments; {@code args[0]}, when present, is the
 *             path of the PDF to read. Without it the original sample path is used.
 */
public static void main(String[] args) {
    // Accept the path from the command line; fall back to the sample path so that
    // running without arguments behaves as before.
    String path = (args != null && args.length > 0)
            ? args[0]
            : "C:/Users/Administrator/Desktop/2017_PDF.pdf";
    PDFReader pdfReader1 = new PDFReader();
    // Print the extracted text; previously the return value was dropped and the
    // program produced no visible result.
    System.out.println(pdfReader1.pdfReader(path));
}


/**
 * Exports the image XObjects of every page of a PDF into {@code d:/pdf/}.
 *
 * <p>For each page, the page's resource dictionary is inspected for XObjects whose
 * subtype is {@code Image}; the bytes of each such stream are written to a file.
 * The first image of page {@code p} is written to {@code d:/pdf/output<p>.jpg}
 * (the original naming); any further image on the same page is written to
 * {@code d:/pdf/output<p>_<k>.jpg}, where {@code k} is its 1-based position among
 * the page's images, so that it no longer overwrites the previous one.
 *
 * <p>NOTE(review): the stream bytes are written as returned by the reader and always
 * get a {@code .jpg} extension; presumably only images stored as JPEG data yield a
 * usable file (the original author marked this method as "has problems") — confirm
 * before relying on it.
 *
 * <p>I/O failures are printed to standard error and end the export; they are not
 * rethrown.
 *
 * @param filename path of the PDF file to read
 */
public static void extractImage(String filename) {
    PdfReader reader = null;
    try {
        // Open the PDF file.
        reader = new PdfReader(filename);
        int sumPage = reader.getNumberOfPages();
        // Page numbers are 1-based.
        for (int i = 1; i <= sumPage; i++) {
            PdfDictionary dictionary = reader.getPageN(i);
            PdfDictionary res = (PdfDictionary) PdfReader
                    .getPdfObject(dictionary.get(PdfName.RESOURCES));
            if (res == null) {
                // Page declares no resources, hence no images.
                continue;
            }
            PdfDictionary xobj = (PdfDictionary) PdfReader
                    .getPdfObject(res.get(PdfName.XOBJECT));
            if (xobj == null) {
                continue;
            }
            // Counts the images written for this page so each gets a distinct file name.
            int imageIndex = 0;
            for (Object key : xobj.getKeys()) {
                PdfObject obj = xobj.get((PdfName) key);
                if (obj == null || !obj.isIndirect()) {
                    continue;
                }
                // Resolve the indirect reference once and reuse the result.
                PdfObject resolved = PdfReader.getPdfObject(obj);
                if (resolved == null || !resolved.isStream()) {
                    continue;
                }
                PRStream prstream = (PRStream) resolved;
                PdfObject subtype = PdfReader.getPdfObject(prstream.get(PdfName.SUBTYPE));
                if (!PdfName.IMAGE.equals(subtype)) {
                    continue;
                }
                byte[] b;
                try {
                    b = PdfReader.getStreamBytes(prstream);
                } catch (UnsupportedPdfException e) {
                    // The stream could not be decoded; fall back to the raw stored bytes.
                    b = PdfReader.getStreamBytesRaw(prstream);
                }
                imageIndex++;
                String target = (imageIndex == 1)
                        ? String.format("d:/pdf/output%d.jpg", i)
                        : String.format("d:/pdf/output%d_%d.jpg", i, imageIndex);
                // try-with-resources closes the file even when write() fails.
                try (FileOutputStream output = new FileOutputStream(target)) {
                    output.write(b);
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the reader's underlying file handle on every path.
        if (reader != null) {
            reader.close();
        }
    }
}


/**
 * Parses a PDF file and returns its text content.
 *
 * <p>The text is extracted page by page, in page order, and concatenated. Any
 * exception raised while opening or parsing the file is printed to standard error
 * and an empty string is returned instead.
 *
 * <p>Image extraction (via {@code page.findResources().getImages()} and
 * {@code PDXObjectImage.write2file("D:/pdf/" + n)}) was present only as
 * commented-out code in the original and remains disabled here.
 *
 * @param fileName path of the PDF file
 * @return the concatenated text of all pages, or {@code ""} if reading fails
 */
public String pdfReader(String fileName) {
    // try-with-resources guarantees the file stream is closed on every path.
    try (InputStream inputStream = new BufferedInputStream(
            new FileInputStream(new File(fileName)))) {
        PDDocument pdfDocument = PDDocument.load(inputStream);
        try {
            /** Document page information **/
            PDDocumentCatalog cata = pdfDocument.getDocumentCatalog();
            List<?> pages = cata.getAllPages();
            // All pages are written into one writer, which avoids repeated
            // String concatenation while producing the same combined text.
            StringWriter content = new StringWriter();
            for (int i = 0; i < pages.size(); i++) {
                PDPage page = (PDPage) pages.get(i);
                if (null != page) {
                    // Restrict the stripper to the current page (page numbers are 1-based).
                    PDFTextStripper pst = new PDFTextStripper();
                    pst.setStartPage(i + 1);
                    pst.setEndPage(i + 1);
                    pst.writeText(pdfDocument, content);
                }
            }
            return content.toString();
        } finally {
            // Close the document explicitly; a failure to close must not discard
            // text that was already extracted, so it is only reported.
            try {
                pdfDocument.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        return "";
    }
}