1. 程式人生 > >在線查看PDF文件,pdf.js使用方法

在線查看PDF文件,pdf.js使用方法

mda mon () str 去掉 gen asc eve led

PDF.js可以實現在html下直接瀏覽pdf文檔,是一款開源的pdf文檔讀取解析插件,非常強大,能將PDF文件渲染成Canvas。PDF.js主要包含兩個庫文件,一個pdf.js和一個pdf.worker.js,一個負責API解析,一個負責核心解析。

首先引入pdf.js文件:<script type="text/javascript" src="pdf.js"></script>
PDF.js大部分用法都是基於Promise的,PDFJS.getDocument(url)方法返回的就是一個Promise:

PDFJS.getDocument('helloworld.pdf').then(function(pdf) {
});

PDF的解析工作需要通過pdf.getPage(page)去執行,這個方法返回的也是一個Promise,因此可以去逐頁解析PDF:

pdf.getPage(1).then(function(page) { 
});

官網地址:http://mozilla.github.io/pdf.js/

渲染頁面

每個PDF頁面都有自己的視口(viewport),它定義了以像素為單位的尺寸(按72 DPI計算)和初始旋轉角度。默認情況下,視口會縮放到PDF的原始大小,但可以通過修改視口來改變這一行為。創建視口時,還會創建一個初始變換矩陣,它考慮了期望的縮放比例和旋轉,並負責轉換坐標系:PDF文檔的(0,0)點位於左下角,而canvas的(0,0)點位於左上角。

var scale = 1.5;
var viewport = page.getViewport(scale);
var canvas = document.getElementById('the-canvas');
var context = canvas.getContext('2d');
canvas.height = viewport.height;
canvas.width = viewport.width;
var renderContext = {
  canvasContext: context,
  viewport: viewport
};
page.render(renderContext);

還可以自定義canvas大小:
var desiredWidth = 100;
var viewport = page.getViewport(1);
var scale = desiredWidth / viewport.width;
var scaledViewport = page.getViewport(scale);

官方給出的示例:

var url = '//cdn.mozilla.net/pdfjs/helloworld.pdf';
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

var loadingTask = PDFJS.getDocument(url);
loadingTask.promise.then(function(pdf) {
  console.log('PDF loaded');

  var pageNumber = 1;
  pdf.getPage(pageNumber).then(function(page) {
    console.log('Page loaded');

    var scale = 1.5;
    var viewport = page.getViewport(scale);

    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    renderTask.then(function () {
      console.log('Page rendered');
    });
  });
}, function (reason) {
  console.error(reason);
});

另外較大的PDF文件可以用Base64編碼方式加載(先用atob()把Base64字符串解碼成二進制數據,再通過data參數傳入),例如:

var pdfData = atob(
  'JVBERi0xLjcKCjEgMCBvYmogICUgZW50cnkgcG9pbnQKPDwKICAvVHlwZSAvQ2F0YWxvZwog' +
  'IC9QYWdlcyAyIDAgUgo+PgplbmRvYmoKCjIgMCBvYmoKPDwKICAvVHlwZSAvUGFnZXMKICAv' +
  'TWVkaWFCb3ggWyAwIDAgMjAwIDIwMCBdCiAgL0NvdW50IDEKICAvS2lkcyBbIDMgMCBSIF0K' +
  'Pj4KZW5kb2JqCgozIDAgb2JqCjw8CiAgL1R5cGUgL1BhZ2UKICAvUGFyZW50IDIgMCBSCiAg' +
  'L1Jlc291cmNlcyA8PAogICAgL0ZvbnQgPDwKICAgICAgL0YxIDQgMCBSIAogICAgPj4KICA+' +
  'PgogIC9Db250ZW50cyA1IDAgUgo+PgplbmRvYmoKCjQgMCBvYmoKPDwKICAvVHlwZSAvRm9u' +
  'dAogIC9TdWJ0eXBlIC9UeXBlMQogIC9CYXNlRm9udCAvVGltZXMtUm9tYW4KPj4KZW5kb2Jq' +
  'Cgo1IDAgb2JqICAlIHBhZ2UgY29udGVudAo8PAogIC9MZW5ndGggNDQKPj4Kc3RyZWFtCkJU' +
  'CjcwIDUwIFRECi9GMSAxMiBUZgooSGVsbG8sIHdvcmxkISkgVGoKRVQKZW5kc3RyZWFtCmVu' +
  'ZG9iagoKeHJlZgowIDYKMDAwMDAwMDAwMCA2NTUzNSBmIAowMDAwMDAwMDEwIDAwMDAwIG4g' +
  'CjAwMDAwMDAwNzkgMDAwMDAgbiAKMDAwMDAwMDE3MyAwMDAwMCBuIAowMDAwMDAwMzAxIDAw' +
  'MDAwIG4gCjAwMDAwMDAzODAgMDAwMDAgbiAKdHJhaWxlcgo8PAogIC9TaXplIDYKICAvUm9v' +
  'dCAxIDAgUgo+PgpzdGFydHhyZWYKNDkyCiUlRU9G');

PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

var loadingTask = PDFJS.getDocument({data: pdfData});
loadingTask.promise.then(function(pdf) {
  console.log('PDF loaded');

  var pageNumber = 1;
  pdf.getPage(pageNumber).then(function(page) {
    console.log('Page loaded');

    var scale = 1.5;
    var viewport = page.getViewport(scale);

    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    renderTask.then(function () {
      console.log('Page rendered');
    });
  });
}, function (reason) {
  console.error(reason);
});

pdf翻頁處理:

// If absolute URL from the remote server is provided, configure the CORS
// header on that server.
var url = '//cdn.mozilla.net/pdfjs/tracemonkey.pdf';

// The workerSrc property shall be specified.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

var pdfDoc = null,
    pageNum = 1,
    pageRendering = false,
    pageNumPending = null,
    scale = 0.8,
    canvas = document.getElementById('the-canvas'),
    ctx = canvas.getContext('2d');

/**
 * Get page info from document, resize canvas accordingly, and render page.
 * @param num Page number.
 */
function renderPage(num) {
  pageRendering = true;
  pdfDoc.getPage(num).then(function(page) {
    var viewport = page.getViewport(scale);
    canvas.height = viewport.height;
    canvas.width = viewport.width;
    var renderContext = {
      canvasContext: ctx,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    renderTask.promise.then(function() {
      pageRendering = false;
      if (pageNumPending !== null) {
        renderPage(pageNumPending);
        pageNumPending = null;
      }
    });
  });
  document.getElementById('page_num').textContent = num;
}

function queueRenderPage(num) {
  if (pageRendering) {
    pageNumPending = num;
  } else {
    renderPage(num);
  }
}

function onPrevPage() {
  if (pageNum <= 1) {
    return;
  }
  pageNum--;
  queueRenderPage(pageNum);
}
document.getElementById('prev').addEventListener('click', onPrevPage);

function onNextPage() {
  if (pageNum >= pdfDoc.numPages) {
    return;
  }
  pageNum++;
  queueRenderPage(pageNum);
}
document.getElementById('next').addEventListener('click', onNextPage);

PDFJS.getDocument(url).then(function(pdfDoc_) {
  pdfDoc = pdfDoc_;
  document.getElementById('page_count').textContent = pdfDoc.numPages;
  renderPage(pageNum);
});

關於page對象方法的使用:

解析結果,我們可以看下這個對象提供的方法:

方法 返回
getAnnotations A promise that is resolved with an {Array} of the annotation objects.
getTextContent A promise that is resolved with a TextContent object that represents the page text content.
getViewport Contains ‘width’ and ‘height’ properties along with transforms required for rendering.
render An object that contains the promise, which is resolved when the page finishes rendering.

我們可以試試調用getTextContent方法,並將其結果打印出來:

pdf.getPage(1).then(function(page) {
    page.getTextContent().then(function(textContent) {
        console.log(textContent);
    });
});

輸出格式大致如下:

{
    "items": [
        {
            "str": "xxx",
            "dir": "xxx",
            "width": xxx,
            "height": xxx,
            "transform": [
                48,
                0,
                0,
                48,
                45.32495,
                679.04
            ],
            "fontName": "g_d0_f1"
        },
        {
            "str": " ",
            "dir": "ltr",
            "width": 9.600000000000001,
            "height": 2304,
            "transform": [
                48,
                0,
                0,
                48,
                285.325,
                679.04
            ],
            "fontName": "g_d0_f2"
        }
      ],
    "styles": {
        "g_d0_f1": {
            "fontFamily": "monospace",
            "ascent": 1.05810546875,
            "descent": -0.26171875,
            "vertical": false
        },
        "g_d0_f2": {
            "fontFamily": "sans-serif",
            "ascent": 0.74365234375,
            "descent": -0.25634765625
        }
    }
 }

PDF.js能將每頁文本的字符串、位置、字體都解析出來。

官網用的viewer.js:http://mozilla.github.io/pdf.js/web/viewer.html,首先底圖是一個Canvas,內容和PDF一樣(通過上面介紹的page.render方法可以得到),底圖之上是一個textLayer,這一層就是通過page.getTextContent()得到了文字的位置和樣式,再覆蓋在Canvas上。

我們可以直接使用官網viewer.html的demo,然後修改樣式、去掉用不到的功能,簡單粗暴。只需要在跳轉鏈接後面加上file參數就行,例:http://xxxx/viewer.html?file=xxxx.pdf

在線查看PDF文件,pdf.js使用方法