[Python] [爬蟲] 6.批量政府網站的招投標、中標資訊爬取和推送的自動化爬蟲——網頁解析器
阿新 • • 發佈:2018-11-09
目錄
1.Intro
檔名:pageResolver.py
模組名:網頁解析器
引用庫:
re | lxml | datetime | sys |
retry | random | urllib2 |
自定義引用檔案:configManager
功能:解析網頁原始碼,獲得相應的資料,以字典形式儲存行記錄,最後返回包含字典物件的列表。
2.Source
#!/usr/bin/env Python # -*- coding: utf-8 -*- ''' # Author : YSW # Time : 2018/6/6 14:04 # File : pageResolver.py # Version : 1.1 # Describe: 網頁解析器 # Update : 1.增加了中標網頁的解析方法 ''' import re from lxml import etree import datetime import sys from retry import retry import configManager import random import urllib2 # 設定預設編碼,防止出現中文字元亂碼 defaultencoding = 'utf-8' if sys.getdefaultencoding() != defaultencoding: reload(sys) sys.setdefaultencoding(defaultencoding) HEADERS = { "User-Agent": random.choice(configManager.headers) } class Resolver(object): def time_parse(self, currentTime): ''' 獲取系統當前時間,返回規約後的時間資訊 :param currentTime: 當前時間(字串型別) :return:當前時間(時間型別) ''' date = datetime.datetime.strptime(currentTime, '%Y-%m-%d') return date #### 招投標資料 #### @retry(tries=3, delay=2) def resovler_ynsggzxxt(self, html, page_num): ''' 雲南省公共資源交易中心電子服務系統解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) # 獲取招標資訊 xpathPattern = "//div/table[@id='data_tab']/tbody/tr" # 通過 xpath 返回符合匹配的結果列表 node_list = text.xpath(xpathPattern) # 正則規約欄位 strParse = re.compile("\s") # 遍歷結果列表 for node in node_list: # 篩除標題的空值標籤 if len(node.xpath("./td")) > 0: # 專案編號 projectNumber = node.xpath("./td")[1].text # 公告標題(正則規約) title = strParse.sub("", node.xpath("./td/a")[0].text) # 釋出時間 startTime = node.xpath("./td")[3].text start_time = self.time_parse(startTime) # 截止時間 endTime = node.xpath("./td")[4].text end_time = self.time_parse(endTime) # 狀態(正則規約) status = strParse.sub("", node.xpath("./td")[5].text) # 判斷狀態是否為空,如果為空,則跳轉到下一級標籤 i if status is "": status = strParse.sub("", node.xpath("./td/i")[0].text) # href 連結地址 href = "https://www.ynggzyxx.gov.cn" + str(node.xpath("./td/a/@href")[0]) # 儲存到字典 resolveMessage = { "專案編號": projectNumber, "公告標題": title, "釋出時間": start_time, "截止時間": end_time, "狀態": status, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) def resovler_ynsggzzw(self, html, page_num): ''' 雲南省公共資源交易中心網解析器 :param html: :param page_num: :return: ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 獲取招標資訊 xpathPattern = "//table[@id='data_tab']/tbody/tr" # 解析原始碼並返回 XML 物件 text = etree.HTML(html) # 通過 xpath 返回符合匹配的結果列表 node_list = text.xpath(xpathPattern) # 正則規約欄位 strParse = re.compile("\s") # 遍歷結果列表 for node in node_list: # 篩除標題的空值標籤 if len(node.xpath("./td")) > 0: # 序號 serialNumber = node.xpath("./td")[0].text # 專案編號 projectNumber = node.xpath("./td")[1].text # href 連結地址 href = "https://www.ynggzyxx.gov.cn" + str(node.xpath("./td/a/@href")[0]) # 釋出時間 startTime = node.xpath("./td")[3].text start_time = self.time_parse(startTime) # 公告標題(正則規約) title = strParse.sub("", node.xpath("./td/a")[0].text) # 儲存到字典 resolveMessage = { "專案編號": projectNumber, "公告標題": title, "釋出時間": start_time, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) def resovler_kmsgg(self, html, page_num): ''' 昆明市公共資源交易中心網解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr") for i in range(1, 16): # 編號 num = node_list[i].xpath("./td")[1].text # 工程名稱 project_name = (node_list[i].xpath("./@field_bdmcggbt")[0]).encode('utf8') # 連結 href = "https://www.kmggzy.com/Jyweb/" + str(node_list[i].xpath("./td/a/@href")[0]) start_time = None # 起始時間 startTime = node_list[i].xpath("./td")[3].text if startTime is not None: start_time = self.time_parse(startTime) end_time = None # 結束時間 endTime = node_list[i].xpath("./td")[4].text if endTime is not None: end_time = self.time_parse(endTime) status = None # 狀態 if node_list[i].xpath("./td")[5].text is not None: status = (node_list[i].xpath("./td")[5].text).encode('utf8') # 儲存到字典 if num and project_name and start_time and end_time and status is not None: resolveMessage = { "編號": num, "工程名稱": project_name, "釋出時間": start_time, "結束時間": end_time, "狀態": status, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) def resovler_kmsgg_gc(self, html, page_num): ''' 昆明市公共資源交易中心網解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr") for i in range(1, 16): # 編號 num = node_list[i].xpath("./td")[1].text # 工程名稱 project_name = (node_list[i].xpath("./@field_bdmcggbt")[0]).encode('utf8') # 連結 href = "https://www.kmggzy.com/Jyweb/" + str(node_list[i].xpath("./td/a/@href")[0]) start_time = None # 起始時間 startTime = node_list[i].xpath("./td")[3].text if startTime is not None: start_time = self.time_parse(startTime) end_time = None # 結束時間 endTime = node_list[i].xpath("./td")[4].text if endTime is not None: end_time = self.time_parse(endTime) status = None # 狀態 if node_list[i].xpath("./td")[5].text is not None: status = (node_list[i].xpath("./td")[5].text).encode('utf8') # 儲存到字典 if num and project_name and start_time and end_time and status is not None: resolveMessage = { "編號": num, "工程名稱": project_name, "釋出時間": start_time, "結束時間": end_time, "狀態": status, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) def resovler_ynsggzxxt_zf(self, html, page_num): ''' 雲南省公共資源交易中心電子服務系統解析器 政府採購 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) # 獲取招標資訊 xpathPattern = "//div/table[@id='data_tab']/tbody/tr" # 通過 xpath 返回符合匹配的結果列表 node_list = text.xpath(xpathPattern) # 正則規約欄位 strParse = re.compile("\s") # 遍歷結果列表 for node in node_list: # 篩除標題的空值標籤 if len(node.xpath("./td")) > 0: # 專案編號 projectNumber = node.xpath("./td")[1].text # 公告標題(正則規約) title = strParse.sub("", node.xpath("./td/a")[0].text) # 釋出時間 startTime = node.xpath("./td")[3].text start_time = self.time_parse(startTime) # 截止時間 endTime = node.xpath("./td")[4].text end_time = self.time_parse(endTime) # 狀態(正則規約) status = strParse.sub("", node.xpath("./td")[5].text) # 判斷狀態是否為空,如果為空,則跳轉到下一級標籤 i if status is "": status = strParse.sub("", node.xpath("./td/i")[0].text) # href 連結地址 href = "https://www.ynggzyxx.gov.cn" + str(node.xpath("./td/a/@href")[0]) # 儲存到字典 resolveMessage = { "專案編號": projectNumber, "公告標題": title, "釋出時間": start_time, "截止時間": end_time, "狀態": status, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) def resovler_ynszfcgw(self, html, page_num): ''' 雲南省政府採購網 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) for i in range(0, 10): node_list = text.xpath("//tr[@data-row-id='{0}']".format(i)) for node in node_list: text_total = node.xpath('./td')[0].xpath('./a')[0].text # 編號 num = text_total[:text_total.find(':')] # 工程名稱 project_name = text_total[text_total.find(':') + 1:] # 區劃 area = node.xpath('./td')[2].text time_push = None # 釋出時間 timePush = node.xpath('./td')[3].text if timePush is not None: time_push = self.time_parse(timePush) # 連結 cursor = node.xpath('./td')[0].xpath('./a/@data-bulletin_id')[0] href = "http://www.yngp.com/newbulletin_zz.do?method=preinsertgomodify&operator_state=1&flag=view&bulletin_id={0}".format( cursor) # 儲存到字典 if num and project_name and area and href and time_push is not None: resolveMessage = { "編號": num, "工程名稱": project_name, "釋出時間": time_push, "區劃": area, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult #### 中標資料 #### @retry(tries=3, delay=2) def get_url(self, url, proxy_dict): proxyIP = proxy_dict['ip'] proxyPort = proxy_dict['port'] proxyProtocol = proxy_dict['protocol'] proxy_handler = urllib2.ProxyHandler({proxyProtocol: "{0}:{1}".format(proxyIP, proxyPort)}) opener_proxy = urllib2.build_opener(proxy_handler) urllib2.install_opener(opener_proxy) request = urllib2.Request(url=url, headers=HEADERS) response = urllib2.urlopen(request) html = response.read() return html @retry(tries=3, delay=2) # 70% def resovler_ynsggzxxt_gc_zb(self, html, page_num, proxy_dict): ''' 雲南省公共資源交易資訊網_工程建設_中標公告解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' def resolve_pp_0(html): try: people = '' price = 0.0 text = etree.HTML(html) node_second_list = text.xpath("//div[@class='con']//tr") for node_second in node_second_list: if "中標人:" == node_second.xpath("./td")[0].text: people = node_second.xpath("./td")[1].xpath('./b//span')[0].text if "中標價" in node_second.xpath("./td")[0].text: totalCount = node_second.xpath("./td")[1].xpath('./b//span')[0].text price = float(re.sub("\D", "", totalCount)) return people, price except: return None, 0.0 def resolve_pp_1(html): ''' 子網頁解析器_1 eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=7befec50-6cf1-49b1-a5ec-b3b1cf6d3ab2&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: people = '' price = 0.0 text = etree.HTML(html) xpathPattern = "//div[@class='w1200s']//table" node_list = text.xpath(xpathPattern)[0] for index, node in enumerate(node_list): if index == 7: people = node.xpath('./td//tr')[1].xpath('./td')[1].text price_tmp = node.xpath('./td//tr')[1].xpath('./td')[6].text if price_tmp == 0 or price_tmp == '/': price = float(0.0) # print("中標人: {0},中標價:{1}".format(people, price)) return people, price except: return None, 0.0 def resolve_pp_2(html): ''' 子網頁解析器_2 eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=2ab5a6f5-30e2-4599-846b-22597815e3dd&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: people = '' price = 0.0 text = etree.HTML(html) xpathPattern = "//div[@class='w1200s']//div[@class='detail_contect']//p" node_list = text.xpath(xpathPattern) for node in node_list: if "第一中標候選人" in node.text: people_tmp = str(node.text).strip() people = people_tmp[people_tmp.find(':') + 3:] elif "投標報價" in node.text: price_tmp = node.xpath('./span')[0].text price = float(price_tmp) # print("中標人: {0},中標價:{1}".format(people, price)) return people, price except: return None, 0.0 def resolve_pp_3(html): ''' 子網頁解析器_3 eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=e145f187-b9d9-4573-b4b0-f5c4c66ddbdb&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: people = '' price = 0.0 text = etree.HTML(html) xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//tr" node_list = text.xpath(xpathPattern) for node in node_list: ## 中標人 tmp = node.xpath('./td//span')[0].text if "第一中標候選人" == tmp: people = node.xpath('./td//span')[1].text ## 中標價格 node_td = node.xpath('./td') if len(node_td) > 3: for no in node_td: if len(no.xpath('./span')) > 0 and "中標價(萬元)" == no.xpath('./span')[0].text: price = float(node_td[3].xpath('./span')[0].text) # print("中標人: {0},中標價:{1}".format(people, price)) return people, price except: return None, 0.0 def resolve_pp_4(html): ''' 子網頁解析器_4 eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=562df3b5-207a-4f2e-b3f7-3b29736ae191&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: text = etree.HTML(html) xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//tr" node_list = text.xpath(xpathPattern) node = node_list[12] people_td = node.xpath('./td')[1] people = people_td.xpath('./p/span')[0].text price_td = node.xpath('./td')[2] price_tmp = price_td.xpath('./p/span')[0].text price = float(price_tmp) return people, price except: return None, 0.0 def resolve_pp_5(html): ''' 子網頁解析器_5 eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=61a3019b-33cb-44ba-a193-20c5d7f38543&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: text = etree.HTML(html) xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//table" node_list = text.xpath(xpathPattern) tr_list = node_list[0].xpath('./tbody//tr') td_list = tr_list[1] people_td = td_list[2] people = people_td.xpath('./p/b/span')[0].text price_td = td_list[4] price_tmp = price_td.xpath('./p/b/span')[0].text price = float(price_tmp) return people, price except: return None, 0.0 def resolve_pp_6(html): ''' 子網頁解析器_6 eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=e8cc5564-4664-4d45-aabd-2690a3366e2b&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: text = etree.HTML(html) xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//table//td[@colspan='4']//tr" node_list = text.xpath(xpathPattern) people = node_list[1].xpath('./td')[1].text price_tmp = node_list[1].xpath('./td')[4].text price = float(price_tmp) return people, price except: return None, 0.0 def resolve_pp_7(html): ''' 子網頁解析器_7 eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=2a7c021d-db9d-4dc5-8294-39083501dd9f&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: text = etree.HTML(html) xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//table//tr" node_list = text.xpath(xpathPattern) people = node_list[9].xpath('./td')[1].xpath('./p/span')[0].text return people, 0.0 except: return None, 0.0 print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) xpathPattern = "//div/table[@id='data_tab']/tbody/tr" node_list = text.xpath(xpathPattern) for node in node_list: if len(node.xpath("./td")) > 0: project_name = node.xpath("./td//a")[0].text project_name_parse = project_name.replace('\n', '').replace(u'\t', '').replace(u' ', '') startTime = node.xpath("./td")[2].text start_time = self.time_parse(startTime) href = "https://www.ynggzyxx.gov.cn" + node.xpath('./td//a//@href')[0] html_second = self.get_url(href, proxy_dict) people, price = resolve_pp_0(html_second) if people == '': people, price = resolve_pp_2(html_second) if people == '': people, price = resolve_pp_1(html_second) if people == '': people, price = resolve_pp_3(html_second) if people == None: people, price = resolve_pp_4(html_second) if people == None: people, price = resolve_pp_5(html_second) if people == None: people, price = resolve_pp_6(html_second) if people == None: people, price = resolve_pp_7(html_second) # 儲存到字典 resolveMessage = { "公告名稱": project_name_parse, "釋出時間": start_time, "連結": href, "中標公司": people, "中標價格": price, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) # Done def resovler_ynsggzxxt_zf_zb(self, html, page_num): ''' 雲南省公共資源交易資訊網_政府採購_中標結果解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) xpathPattern = "//div/table[@id='data_tab']/tbody/tr" node_list = text.xpath(xpathPattern) for node in node_list: if len(node.xpath("./td")) > 0: project_name = node.xpath("./td//a")[0].text project_name_parse = project_name.replace('\n', '').replace(u'\t', '').replace(u' ', '') startTime = node.xpath("./td")[2].text start_time = self.time_parse(startTime) href = "https://www.ynggzyxx.gov.cn" + node.xpath('./td//a//@href')[0] # 儲存到字典 resolveMessage = { "公告名稱": project_name_parse, "釋出時間": start_time, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) # Done def resovler_ynsggzzw_gc_zb(self, html, page_num, proxy_dict): ''' 雲南省公共資源交易中心_工程建設_中標結果解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' def resolve_pp_1(html): ''' 子網頁解析器_1 eg: https://www.ynggzy.com/jyxx/jsgcZbjggsDetail?guid=fbd514af-5716-4e30-bc1d-b42892986f85&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: people = '' price = '' text = etree.HTML(html) node_second_list = text.xpath("//div[@class='con']//tr") for node_second in node_second_list: if "中標人:" == node_second.xpath("./td")[0].text: people = node_second.xpath("./td")[1].xpath('./b//span')[0].text if "中標價" in node_second.xpath("./td")[0].text: totalCount = node_second.xpath("./td")[1].xpath('./b//span')[0].text price = totalCount return people, price except: return None, '' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 儲存的列表 text = etree.HTML(html) xpathPattern = "//div/table[@id='data_tab']/tbody/tr" node_list = text.xpath(xpathPattern) # 正則規約欄位 strParse = re.compile("\s") for node in node_list: if len(node.xpath("./td")) > 0: # 公告標題(正則規約) title = strParse.sub("", node.xpath("./td")[1].xpath("./a")[0].text) # 釋出時間 startTime = node.xpath("./td")[2].text start_time = self.time_parse(startTime) # href 連結地址 href = "https://www.ynggzy.com" + str(node.xpath("./td/a/@href")[0]) html_second = self.get_url(href, proxy_dict) people, price = resolve_pp_1(html_second) # 儲存到字典 resolveMessage = { "公告標題": title, "釋出時間": start_time, "連結": href, "中標公司": people, "中標價格": price, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) # Done def resovler_ynsggzzw_zf_zb(self, html, page_num): ''' 雲南省公共資源交易中心_政府採購_結果公示解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 儲存的列表 text = etree.HTML(html) xpathPattern = "//div/table[@id='data_tab']/tbody/tr" node_list = text.xpath(xpathPattern) # 正則規約欄位 strParse = re.compile("\s") for node in node_list: if len(node.xpath("./td")) > 0: # 公告標題(正則規約) title = strParse.sub("", node.xpath("./td")[1].xpath("./a")[0].text) # 釋出時間 startTime = node.xpath("./td")[2].text start_time = self.time_parse(startTime) # href 連結地址 href = "https://www.ynggzy.com" + str(node.xpath("./td/a/@href")[0]) # 儲存到字典 resolveMessage = { "公告標題": title, "釋出時間": start_time, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) # Done def resolver_kmsgg_gc_zb(self, html, page_num): ''' 昆明市公共資源交易平臺公共服務系統_工程建設_中標結果公示解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr") for i in range(1, 16): # 編號 num = node_list[i].xpath("./td")[1].text # 工程名稱 project_name = (node_list[i].xpath("./@field_bdmcggbt")[0]).encode('utf8') # 連結 href = "https://www.kmggzy.com/Jyweb/" + str(node_list[i].xpath("./td/a/@href")[0]) start_time = None # 釋出時間 startTime = node_list[i].xpath("./td")[3].text if startTime is not None: start_time = self.time_parse(startTime) # 儲存到字典 if num and project_name and start_time is not None: resolveMessage = { "編號": num, "工程名稱": project_name, "釋出時間": start_time, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) # Done def resolver_kmsgg_zf_zb(self, html, page_num): ''' 昆明市公共資源交易平臺公共服務系統_政府採購_結果公示解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr") for i in range(1, 16): # 編號 num = node_list[i].xpath("./td")[1].text # 工程名稱 project_name = (node_list[i].xpath("./@field_bdmcggbt")[0]).encode('utf8') # 連結 href = "https://www.kmggzy.com/Jyweb/" + str(node_list[i].xpath("./td/a/@href")[0]) start_time = None # 釋出時間 startTime = node_list[i].xpath("./td")[3].text if startTime is not None: start_time = self.time_parse(startTime) # 儲存到字典 if num and project_name and start_time is not None: resolveMessage = { "編號": num, "工程名稱": project_name, "釋出時間": start_time, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) # Done def resolver_kmsgg_gc_by(self, html, page_num): ''' 昆明市公共資源交易平臺公共服務系統_工程建設_補遺通知解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr") for i in range(1, 16): # 編號 num = node_list[i].xpath("./td")[1].text # 工程名稱 project_name = (node_list[i].xpath("./@field_bdmcggbt")[0]).encode('utf8') # 連結 href = "https://www.kmggzy.com/Jyweb/" + str(node_list[i].xpath("./td/a/@href")[0]) start_time = None # 釋出時間 startTime = node_list[i].xpath("./td")[3].text if startTime is not None: start_time = self.time_parse(startTime) # 儲存到字典 if num and project_name and start_time is not None: resolveMessage = { "編號": num, "工程名稱": project_name, "釋出時間": start_time, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) # Done def resolver_kmsgg_zf_by(self, html, page_num): ''' 昆明市公共資源交易平臺公共服務系統_政府採購_補遺通知解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] # 解析原始碼並返回 XML 物件 text = etree.HTML(html) node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr") for i in range(1, 16): # 編號 num = node_list[i].xpath("./td")[1].text # 工程名稱 project_name = (node_list[i].xpath("./@field_bdmcggbt")[0]).encode('utf8') # 連結 href = "https://www.kmggzy.com/Jyweb/" + str(node_list[i].xpath("./td/a/@href")[0]) start_time = None # 釋出時間 startTime = node_list[i].xpath("./td")[3].text if startTime is not None: start_time = self.time_parse(startTime) # 儲存到字典 if num and project_name and start_time is not None: resolveMessage = { "編號": num, "工程名稱": project_name, "釋出時間": start_time, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult @retry(tries=3, delay=2) # Done def resolver_ynszfcgw_cg(self, html, page_num, driver_second): ''' 雲南省政府採購網_採購結果解析器 :param html: 網頁原始碼 :param page_num: 網頁頁數 :return: 返回包含資料字典的列表 ''' def resolver_pp_1(url_second): ''' 子網頁解析器_1 eg: https://www.ynggzy.com/jyxx/jsgcZbjggsDetail?guid=fbd514af-5716-4e30-bc1d-b42892986f85&isOther=false :param html:網頁原始碼 :return:中標公司和中標價格 ''' try: driver_second.get(url_second) people = driver_second.find_element_by_id('winSupply').get_attribute('value') price_tmp = driver_second.find_element_by_id('winMoney').get_attribute('value') price = price_tmp + "萬元" return people, price except: return None, '' if page_num != 0: print("[+] 正在解析第{0}頁資訊".format(page_num)) # 儲存的列表 resolveResult = [] text = etree.HTML(html) for i in range(0, 10): node_list = text.xpath("//tr[@data-row-id='{0}']".format(i)) for node in node_list: text_total = node.xpath('./td')[0].xpath('./a')[0].text # 編號 num = text_total[:text_total.find(':')] # 工程名稱 project_name = text_total[text_total.find(':') + 1:] # 區劃 area = node.xpath('./td')[2].text time_push = None # 釋出時間 timePush = node.xpath('./td')[3].text if timePush is not None: time_push = self.time_parse(timePush) # 連結 cursor = node.xpath('./td')[0].xpath('./a/@data-bulletin_id')[0] href = "http://www.yngp.com/newbulletin_zz.do?method=preinsertgomodify&operator_state=1&flag=view&bulletin_id={0}".format( cursor) people, price = resolver_pp_1(href) # 儲存到字典 resolveMessage = { "編號": num, "工程名稱": project_name, "區劃": area, "釋出時間": time_push, "中標公司": people, "中標價格": price, "連結": href, "推送": False } resolveResult.append(resolveMessage) return resolveResult