1. 程式人生 > >Excel、PDF文檔解析

Excel、PDF文檔解析

style 內容 exce out sys tin broker 是否 方便

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed, PDFPage
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.converter import PDFPageAggregator
import datetime
import sys
import xlrd

# NOTE: under Python 2 this file needs a "# -*- coding: utf-8 -*-" line at the
# very top, because the log messages below are non-ASCII literals.

# Python 2 only: make UTF-8 the implicit str <-> unicode conversion codec.
# `reload` and `sys.setdefaultencoding` do not exist on Python 3, so the hack
# is skipped there instead of failing at import time.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Timestamp layout appended to the progress messages.
_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

# Spreadsheet column index -> key stored in the record passed to
# saveOrUpdateNew3CompanyBaseInfo.  Column 10 ("listed with simultaneous
# issuance, yes/no") is intentionally not stored.
# NOTE(review): "laywyer" looks like a typo of "lawyer", but it is a runtime
# key read by code outside this snippet, so it is kept unchanged.
_COMPANY_COLUMNS = (
    ("company_name", 1),
    ("province", 2),
    ("industry", 3),
    ("broker_company", 4),
    ("broker_person", 5),
    ("law_firm", 6),
    ("laywyer", 7),
    ("accounting_firm", 8),
    ("accountant", 9),
    ("progress", 11),
)


def _timestamp():
    """Return the current local time formatted for the progress messages."""
    return datetime.datetime.now().strftime(_TIMESTAMP_FORMAT)


def pdf_transform_text():
    """Extract the text of ``tempPdfFile_new3.pdf`` and return it as unicode.

    Only horizontal text boxes (``LTTextBoxHorizontal``) are collected, each
    followed by a newline, so PDFs without a text layer yield an empty
    string.

    Returns:
        The concatenated text of every horizontal text box, in page order.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    print("開始解析pdf" + _timestamp())
    pieces = []
    # The parser reads from the file lazily, so all processing stays inside
    # the ``with`` block; the handle is closed on success and on error.
    with open("tempPdfFile_new3.pdf", "rb") as fp:
        # Parser that reads the raw PDF stream.
        parser = PDFParser(fp)
        # Document object that stores the document structure.
        document = PDFDocument(parser)
        # Check whether the file allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters (library defaults).
        laparams = LAParams()
        # Device that aggregates each page into a layout tree.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that renders pages onto the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the LTPage layout object for the page just processed.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    pieces.append(element.get_text())
                    pieces.append(u"\n")
    print("解析pdf成功" + _timestamp())
    # Joining once replaces the original per-item encode / final decode
    # round trip and the repeated string concatenation.
    return u"".join(pieces)


def parse_excel(url, filename):
    """Download an ``.xls`` workbook and persist one record per data row.

    The workbook is fetched from ``url``, written to
    ``tempExcelFile_new3.xls`` in the working directory, and then every sheet
    is scanned.  Rows from index 2 onward that have at least 12 columns are
    converted to a dict and handed to ``saveOrUpdateNew3CompanyBaseInfo``.

    Args:
        url: Address the workbook is downloaded from.
        filename: Name of the document; only used in the progress message.
    """
    print("開始解析excel文檔 %s %s" % (filename, url))
    # WebRequests is defined outside this snippet; presumably a thin HTTP
    # client whose response exposes ``.content`` -- verify against its source.
    req = WebRequests()
    input_stream = req.get(url, timeout=180)
    # Write to a local temporary file so xlrd can parse it.
    with open("tempExcelFile_new3.xls", "wb") as xls:
        xls.write(input_stream.content)
    data = xlrd.open_workbook("tempExcelFile_new3.xls")
    # A workbook may contain several sheets; each one is processed.
    for table in data.sheets():
        print(table.name)
        print(table.nrows)
        # Rows 0 and 1 are skipped (presumably title/header rows -- confirm
        # against the actual spreadsheet layout).
        for row_index in range(2, table.nrows):
            row = table.row_values(row_index)
            if len(row) < 12:
                continue
            result = {}
            for key, column in _COMPANY_COLUMNS:
                result[key] = row[column]
            # NOTE(review): the receive date is stored only when the row has
            # exactly 13 columns, as in the original; wider rows omit it.
            if len(row) == 13:
                result["receive_date"] = row[12]
            saveOrUpdateNew3CompanyBaseInfo(result)

注意:解析 Excel 時要先確認是否有多個 sheet。此 PDF 解析方式只能解析內容為文本的 PDF。

Excel、PDF文檔解析