程式人生 > Python列表處理網頁表格資料

Python列表處理網頁表格資料

涉及知識點

  1. 正則表示式re
  2. 列表處理

程式碼如下

import requests
import sys, io
import re

# Page whose HTML table is scraped below.
url = "http://www.nifdc.org.cn/CL0903/11390.html"
# Request headers sent with the fetch: a desktop Firefox User-Agent and an
# empty Referer.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
    'Referer': '',
}

# Fetch the page as raw bytes; decoding is done later by the script.
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()
html_doc = response.content

# Work around console encoding problems: re-wrap stdout as UTF-8 so that the
# Chinese characters in the scraped cells can be printed.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# The response body is raw bytes; decode it as GB18030 text.
html_doc = html_doc.decode("gb18030")

# Every table cell of interest is the inner text of a <span> carrying exactly
# this style attribute. `lst` is the flat list of all such cell texts in
# document order.
# NOTE(review): the font-name literal must match the decoded page markup
# character for character -- verify it against the live HTML.
pat = re.compile(r'<span style="font-family:宋體;color:black;font-size:12px">(.+?)</span>')
lst = pat.findall(html_doc)

# Number of data rows expected in the table (serial numbers 1..ROW_COUNT).
ROW_COUNT = 150

# For each serial number, record the index of the FIRST cell whose text equals
# that number. list.index raises ValueError if a serial number is missing.
serial_no_lst = [lst.index(str(number)) for number in range(1, ROW_COUNT + 1)]

# Two of the first-match positions are overridden with an estimate derived
# from their neighbours: entry 4 becomes the midpoint of entries 3 and 5, and
# entry 95 becomes the midpoint of entries 94 and 96 plus one.
# NOTE(review): presumably the first match for those serial numbers lands on a
# different cell with the same text, leaving the positions out of order; the
# offsets are specific to this page -- confirm if the page changes.
serial_no_update_lst = serial_no_lst[:]
serial_no_update_lst[4] = (serial_no_update_lst[3] + serial_no_update_lst[5]) // 2
serial_no_update_lst[95] = (serial_no_update_lst[94] + serial_no_update_lst[96]) // 2 + 1

print('*' * 120)

# Split the flat cell list into rows using the serial-number positions as row
# starts: everything before the first serial number is the title row, each
# consecutive pair of positions bounds one data row, and the last row runs to
# the end of the list.
title_row = lst[:serial_no_update_lst[0]]
last_row = lst[serial_no_update_lst[-1]:]

table_row_lst = [title_row]
table_row_lst.extend(
    lst[start:end]
    for start, end in zip(serial_no_update_lst, serial_no_update_lst[1:])
)
table_row_lst.append(last_row)
print('table_row_lst:\n', table_row_lst)