1. 程式人生 > python3用PyPDF2解析pdf檔案,用正則匹配資料

python3用PyPDF2解析pdf檔案,用正則匹配資料

    import PyPDF2
    import re

    # Read the whole PDF and collect its text as one flat list of
    # whitespace-separated tokens (line_list).  The file is opened in a
    # `with` block so the handle is closed even if PyPDF2 raises while
    # parsing the document or extracting text from a page.
    # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are the
    # legacy camelCase PyPDF2 names; confirm they still exist in the
    # installed PyPDF2 release before upgrading the library.
    line_list = []
    with open('xxx.pdf', mode='rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        # Total number of pages in the PDF.
        number_of_pages = read_pdf.getNumPages()
        # Walk every page in order; extraction stays inside the `with`
        # block because the reader still needs the open file handle.
        for i in range(number_of_pages):
            page = read_pdf.getPage(i)
            page_content = page.extractText()
            # Split this page's text on whitespace and append the tokens
            # to the running list for the whole document.
            line_list.extend(page_content.split())
    # Rebuild one searchable string from the tokens.  Every token is
    # preceded by a single space, so a non-empty result starts with a
    # space exactly as the previous concatenation loop produced.
    line_buf = ''.join(' ' + token for token in line_list)
    # Match the first and second columns together, e.g. "000069.sz" and
    # "100".  One pattern with two groups is used so each code is captured
    # with the quantity that follows it; searching for codes and quantities
    # separately could yield lists of different lengths and pair codes
    # with the wrong quantities.
    #   group 1: six or more digits, a literal dot (escaped so it no longer
    #            matches any character), then two or more lowercase letters
    #   .      : the single separator character between the two columns
    #   group 2: the quantity, digits with optional thousands separators
    pairs = re.findall(r'([0-9]{6,}\.[a-z]{2,}).([0-9,]+)', line_buf)
    # Upper-case the codes and turn quantities such as "1,000" into ints.
    a = [code.upper() for code, _quantity in pairs]
    b = [int(quantity.replace(',', '')) for _code, quantity in pairs]
    # Build the code -> quantity dictionary (a later duplicate code wins).
    results = dict(zip(a, b))

正則的其他用法:

# `time` is needed for the date conversion below and is imported here so
# the snippet runs on its own.
import time

# Pull three labelled fields out of line_buf and write them to `filename`
# as comma-terminated values on a single line.
#
# All fields are extracted BEFORE the file is opened: re.search() returns
# None when a label is missing, so .group() raises AttributeError, and in
# that case the output file is left untouched instead of being truncated
# or half written.

# Text after "StockDescription:" of the form <letters>-<letters>.
stock_description = re.search(
    r'(StockDescription:)([a-zA-Z]+-[a-zA-Z]+)', line_buf).group(2)

# Text after "TradeDate:" of the form <digits><letters><digits>, parsed as
# day + full month name + 4-digit year and re-emitted as YYYYMMDD.
raw_trade_date = re.search(
    r'(TradeDate:)([0-9]+[a-zA-Z]+[0-9]+)', line_buf).group(2)
trade_date = time.strftime('%Y%m%d', time.strptime(raw_trade_date, '%d%B%Y'))

# Number after "Price:" plus an uppercase prefix (presumably a currency
# code); thousands separators are removed.
price = re.search(
    r'(Price:[A-Z]+)([0-9.,]+)', line_buf).group(2).replace(',', '')

# The `with` block guarantees the file is closed even if a write fails.
with open(filename, "w") as fp:
    fp.write(stock_description + ',')
    fp.write(trade_date + ',')
    fp.write(price + ',')