Excel、PDF文檔解析
阿新 • • 發佈:2017-10-15
style 內容 exce out sys tin broker 是否 方便
import datetime
import sys

import xlrd
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFTextExtractionNotAllowed, PDFPage
from pdfminer.pdfparser import PDFParser

if sys.version_info[0] == 2:
    # Python 2 only: force the implicit str<->unicode codec to UTF-8.
    # `reload` is a builtin there; on Python 3 this branch never runs.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

# (column index, result key) pairs copied from each spreadsheet row.
# Column 10 is deliberately skipped (the original code had it commented out).
# NOTE(review): 'laywyer' looks like a typo for 'lawyer', but it is a runtime
# key passed to saveOrUpdateNew3CompanyBaseInfo, so it is kept unchanged.
_ROW_FIELDS = (
    (1, 'company_name'),
    (2, 'province'),
    (3, 'industry'),
    (4, 'broker_company'),
    (5, 'broker_person'),
    (6, 'law_firm'),
    (7, 'laywyer'),
    (8, 'accounting_firm'),
    (9, 'accountant'),
    (11, 'progress'),
)


def _timestamp():
    """Return the current local time formatted as YYYY-mm-dd HH:MM:SS."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def pdf_transform_text():
    """Extract the text of ``tempPdfFile_new3.pdf`` in the working directory.

    Every horizontal text box on every page contributes its text followed by
    a newline, in the order pdfminer yields them.

    Returns:
        The concatenated text as a unicode string (empty if no text boxes).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    print("開始解析pdf" + _timestamp())
    pieces = []
    # The context manager closes the file even when extraction is refused
    # or pdfminer raises part-way through.
    with open('tempPdfFile_new3.pdf', 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resource manager, layout parameters, and the
        # aggregator/interpreter pair that turns pages into layout objects.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The aggregator exposes the page just processed as a layout.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    pieces.append(element.get_text())
    print("解析pdf成功" + _timestamp())
    # Stay in text throughout instead of encoding per item and decoding at
    # the end; the result matches the original (each box followed by '\n').
    return u"".join(text + u"\n" for text in pieces)


def _row_to_record(row):
    """Map one spreadsheet row (a list of cell values) to a result dict.

    Returns None when the row has fewer than 12 columns. ``receive_date`` is
    set only when the row has exactly 13 columns.
    """
    if len(row) < 12:
        return None
    record = dict((key, row[index]) for index, key in _ROW_FIELDS)
    if len(row) == 13:
        record['receive_date'] = row[12]
    return record


def parse_excel(url, filename):
    """Download the workbook at *url* and store one record per data row.

    The response body is written to ``tempExcelFile_new3.xls`` and opened
    with xlrd. For every sheet, rows from index 2 onward that have at least
    12 columns are converted to a dict and passed to
    ``saveOrUpdateNew3CompanyBaseInfo``.

    Args:
        url: address the workbook is fetched from.
        filename: only used in the progress message.
    """
    print('開始解析excel文檔 %s %s' % (filename, url))
    req = WebRequests()
    response = req.get(url, timeout=180)
    # Stage the download locally so xlrd can open it from disk.
    with open("tempExcelFile_new3.xls", "wb") as xls:
        xls.write(response.content)
    workbook = xlrd.open_workbook("tempExcelFile_new3.xls")
    for table in workbook.sheets():
        print(table.name)
        print(table.nrows)
        # Rows 0 and 1 are skipped, presumably header rows (TODO confirm).
        for row_index in range(2, table.nrows):
            record = _row_to_record(table.row_values(row_index))
            if record is not None:
                saveOrUpdateNew3CompanyBaseInfo(record)
註意:Excel 在解析時要先看一下有沒有多個 sheet。該 pdf 解析只能解析文本內容的 pdf。
Excel、PDF文檔解析