iText實現URL頁面轉PDF
阿新 • • 發佈:2019-01-30
概述
軟體要求
實現過程
[一]、概述
前面已經介紹了如何實現對HTML中文字元的轉換,以及由HTML檔案生成PDF檔案的基本方法。本文主要演示如何把URL地址對應的內容直接轉換生成PDF檔案。這個需求有很多應用場景,最簡單的例子是:把自己blog中的文章轉成PDF,一方面方便自己閱讀,另一方面也可作為一種備份。
[二]、軟體要求
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.1</version>
    <type>jar</type>
    <scope>compile</scope>
</dependency>
[三]、實現過程
Java實現程式碼:Demo4URL2Pdf.java
package com.bigdata.ai.util.pdf;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import com.itextpdf.text.BaseColor;
import com.itextpdf.text.Chapter;
import com.itextpdf.text.Chunk;
import com.itextpdf.text.Document;
import com.itextpdf.text.Element;
import com.itextpdf.text.Font;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.Section;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.text.pdf.draw.LineSeparator;
import com.itextpdf.tool.xml.ElementHandler;
import com.itextpdf.tool.xml.Writable;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.pipeline.WritableElement;
import org.jsoup.Jsoup;
/**
 * Demo: fetch a blog post by URL with jsoup and export it to PDF with iText / XMLWorker.
 *
 * <p>Two variants are provided: {@link #parseURL2PDFFile} feeds the post HTML straight into
 * the PDF document, while {@link #parseURL2PDFElement} first converts the HTML into iText
 * elements so that more content can be appended to the same chapter afterwards.
 *
 * @file Demo4URL2Pdf.java
 * @package com.bigdata.ai.util.pdf
 * @author dxh
 * @createTime 2017-08-23 08:05:31
 */
public class Demo4URL2Pdf {

    /** TrueType font used for all Chinese text; note the path is Windows-specific. */
    private static final String CHINESE_FONT_PATH = "C:/WINDOWS/Fonts/SIMYOU.TTF";

    /** Site prefix stripped from the category link so only the category path remains. */
    private static final String SITE_PREFIX = "http://www.micmiu.com/";

    /** Label that precedes the date inside the post-info element. */
    private static final String DATE_LABEL = "日期";

    /** Charset used whenever the post HTML is turned into bytes and back. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Runs both conversion variants against a sample blog post.
     *
     * @param args unused
     * @throws Exception if the first (direct) conversion fails
     */
    public static void main(String[] args) throws Exception {
        String blogURL = "http://www.micmiu.com/os/linux/shell-dev-null/";
        // Variant 1: convert the page content directly into a PDF file.
        String pdfFile = "d:/test/itext/demo-URL.pdf";
        Demo4URL2Pdf.parseURL2PDFFile(pdfFile, blogURL);
        // Variant 2: convert the page content into PDF elements first.
        String pdfFile2 = "d:/test/itext/demo-URL2.pdf";
        Demo4URL2Pdf.parseURL2PDFElement(pdfFile2, blogURL);
    }

    /**
     * Downloads the blog page and extracts its basic information.
     *
     * @param blogURL address of the blog post
     * @return a four-element array: {@code [title, category, date, content HTML]}; the content
     *         is an empty string when the content container is missing or cannot be cleaned up
     * @throws IllegalStateException if the title, category or date element is not on the page
     * @throws Exception if the page cannot be fetched
     */
    public static String[] extractBlogInfo(String blogURL) throws Exception {
        org.jsoup.nodes.Document doc = Jsoup.connect(blogURL).get();

        String[] info = new String[4];
        info[0] = selectRequired(doc, "h2.title", blogURL).text();
        info[1] = selectRequired(doc, "a[rel=category tag]", blogURL)
                .attr("href").replace(SITE_PREFIX, "");
        info[2] = extractDate(selectRequired(doc, "span.post-info-date", blogURL).text());
        // The content stays best-effort: a missing container yields "" instead of a failure.
        info[3] = formatContentTag(doc.select("div.entry").first());
        return info;
    }

    /**
     * Returns the first element matching the query, failing with a descriptive message
     * (instead of a bare NullPointerException later on) when the page has no such element.
     */
    private static org.jsoup.nodes.Element selectRequired(org.jsoup.nodes.Document doc,
            String cssQuery, String blogURL) {
        org.jsoup.nodes.Element found = doc.select(cssQuery).first();
        if (found == null) {
            throw new IllegalStateException(
                    "No element matches '" + cssQuery + "' in page " + blogURL);
        }
        return found;
    }

    /**
     * Extracts the date that follows the date label; when the label is absent the whole
     * text is returned rather than failing with an index error.
     */
    private static String extractDate(String postInfoText) {
        String[] parts = postInfoText.split(DATE_LABEL);
        String date = parts.length > 1 ? parts[1] : postInfoText;
        return date.trim();
    }

    /**
     * Cleans up the post content: removes nested {@code div}s and unwraps linked images,
     * i.e. replaces {@code <a href="*.jpg"><img src="*.jpg"/></a>} with the bare
     * {@code <img src="*.jpg"/>}.
     *
     * @param entry the content container; may be {@code null}
     * @return the cleaned inner HTML, or an empty string if {@code entry} is {@code null}
     *         or the clean-up fails
     */
    private static String formatContentTag(org.jsoup.nodes.Element entry) {
        if (entry == null) {
            return "";
        }
        try {
            entry.select("div").remove();
            for (org.jsoup.nodes.Element anchor : entry
                    .select("a[href~=(?i)\\.(png|jpe?g)]")) {
                org.jsoup.nodes.Element img = anchor.select("img").first();
                // A plain text link to an image file has no <img> child; leave it untouched
                // instead of failing and thereby discarding the whole post content.
                if (img != null) {
                    anchor.replaceWith(img);
                }
            }
            return entry.html();
        } catch (RuntimeException e) {
            // Deliberate best-effort: an unexpected DOM problem results in empty content.
            return "";
        }
    }

    /**
     * Converts a string into a UTF-8 encoded input stream.
     *
     * @param content the text to expose as a stream
     * @return a stream over the UTF-8 bytes of {@code content}, or {@code null} when
     *         {@code content} is {@code null}
     */
    public static InputStream parse2Stream(String content) {
        if (content == null) {
            return null;
        }
        return new ByteArrayInputStream(content.getBytes(UTF_8));
    }

    /**
     * Converts the blog page directly into a PDF file.
     *
     * @param pdfFile path of the PDF file to write
     * @param blogURL address of the blog post
     * @throws Exception if fetching the page, loading the font or writing the PDF fails
     */
    public static void parseURL2PDFFile(String pdfFile, String blogURL)
            throws Exception {
        // Do everything that can fail without the output file first, so that a network or
        // font problem does not leave an empty or truncated PDF behind.
        String[] blogInfo = extractBlogInfo(blogURL);
        BaseFont bfCN = loadChineseBaseFont();
        Font chFont = new Font(bfCN, 14, Font.NORMAL, BaseColor.BLUE);
        Font secFont = new Font(bfCN, 12, Font.NORMAL, new BaseColor(0, 204, 255));
        Font textFont = new Font(bfCN, 12, Font.NORMAL, BaseColor.BLACK);

        Document document = new Document();
        FileOutputStream outputStream = new FileOutputStream(pdfFile);
        try {
            PdfWriter pdfwriter = PdfWriter.getInstance(document, outputStream);
            pdfwriter.setViewerPreferences(PdfWriter.HideToolbar);
            document.open();

            Chapter chapter = new Chapter(new Paragraph("URL轉PDF測試", chFont), 1);
            Section section = addStyledSection(chapter, blogInfo[0], secFont);
            section.add(new Chunk("分類:" + blogInfo[1] + " 日期:" + blogInfo[2],
                    textFont));
            addSeparator(section, new Paragraph(" "));
            document.add(chapter);

            // Append the post HTML itself.
            XMLWorkerHelper.getInstance().parseXHtml(pdfwriter, document,
                    parse2Stream(blogInfo[3]));
            document.close();
        } finally {
            // Releases the file handle on every path; closing an already closed
            // FileOutputStream is harmless.
            closeQuietly(outputStream);
        }
    }

    /**
     * Converts the blog page into PDF elements, adds them to a chapter and then appends a
     * further section, showing that extra content can follow the converted HTML.
     *
     * <p>Failures are not propagated: the stack trace is printed and the method returns.
     *
     * @param pdfFile path of the PDF file to write
     * @param blogURL address of the blog post
     */
    public static void parseURL2PDFElement(String pdfFile, String blogURL) {
        FileOutputStream outputStream = null;
        try {
            // Fetch and load fonts before creating the output file (see parseURL2PDFFile).
            String[] blogInfo = extractBlogInfo(blogURL);
            BaseFont bfCN = loadChineseBaseFont();
            Font chFont = new Font(bfCN, 14, Font.NORMAL, BaseColor.BLUE);
            Font secFont = new Font(bfCN, 12, Font.NORMAL, new BaseColor(0, 204, 255));
            Font textFont = new Font(bfCN, 12, Font.NORMAL, BaseColor.BLACK);

            Document document = new Document(PageSize.A4);
            outputStream = new FileOutputStream(pdfFile);
            PdfWriter.getInstance(document, outputStream);
            document.open();

            Chapter chapter = new Chapter(
                    new Paragraph("URL轉PDF元素,便於追加其他內容", chFont), 1);
            Section section = addStyledSection(chapter, blogInfo[0], secFont);
            section.add(new Chunk("分類:" + blogInfo[1] + " 發表日期:" + blogInfo[2],
                    textFont));
            addSeparator(section, new Paragraph());
            section.addAll(htmlToElements(blogInfo[3]));

            // A second section, appended after the converted HTML.
            section = addStyledSection(chapter, "繼續新增章節", secFont);
            section.add(new Chunk("測試URL轉為PDF元素,方便追加其他內容", textFont));

            document.add(chapter);
            document.close();
        } catch (Exception e) {
            // The signature declares no checked exceptions, so failures are only reported.
            e.printStackTrace();
        } finally {
            closeQuietly(outputStream);
        }
    }

    /** Loads the base font used to render Chinese text. */
    private static BaseFont loadChineseBaseFont() throws Exception {
        return BaseFont.createFont(CHINESE_FONT_PATH, BaseFont.IDENTITY_H,
                BaseFont.NOT_EMBEDDED);
    }

    /** Adds a section with the given title to the chapter and applies the shared layout. */
    private static Section addStyledSection(Chapter chapter, String title, Font titleFont) {
        Section section = chapter.addSection(new Paragraph(title, titleFont));
        section.setIndentation(10);
        section.setIndentationLeft(10);
        section.setBookmarkOpen(false);
        section.setNumberStyle(Section.NUMBERSTYLE_DOTTED_WITHOUT_FINAL_DOT);
        return section;
    }

    /** Appends a grey horizontal rule (placed in {@code holder}) and a line break. */
    private static void addSeparator(Section section, Paragraph holder) {
        holder.add(new LineSeparator(1, 100, new BaseColor(204, 204, 204),
                Element.ALIGN_CENTER, -2));
        section.add(holder);
        section.add(Chunk.NEWLINE);
    }

    /** Parses the HTML into iText elements, dropping separator elements. */
    private static List<Element> htmlToElements(String html) throws Exception {
        final List<Element> parsed = new ArrayList<Element>();
        ElementHandler collector = new ElementHandler() {
            @Override
            public void add(final Writable w) {
                if (w instanceof WritableElement) {
                    parsed.addAll(((WritableElement) w).elements());
                }
            }
        };
        XMLWorkerHelper.getInstance().parseXHtml(collector,
                new InputStreamReader(parse2Stream(html), UTF_8));

        List<Element> kept = new ArrayList<Element>();
        for (Element ele : parsed) {
            // NOTE(review): the WritableElement check is kept from the original filter;
            // confirm whether such an instance can ever appear among the parsed elements.
            if (ele instanceof LineSeparator || ele instanceof WritableElement) {
                continue;
            }
            kept.add(ele);
        }
        return kept;
    }

    /** Closes the stream if it was opened, ignoring a failure of the close itself. */
    private static void closeQuietly(FileOutputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Best-effort cleanup: a close failure must not mask the primary outcome.
        }
    }
}