爬蟲(進階),爬取網頁資訊並寫入json檔案
阿新 • 發佈:2019-02-17
# Crawler (advanced): scrape article links and pages, then dump them to record.json.
import requests  # HTTP client library used to fetch the listing and article pages
import re
import json
from bs4 import BeautifulSoup
import copy  # kept from the original import block (no longer used in this script)

# Article links embedded in each listing page.
LINK_RE = re.compile(r'http://usagz\.bailitop\.com/public/\d*/\d*\.html')
# Article title inside the CLM_one block.
TITLE_RE = re.compile(r'<div id="CLM_one">.*<h1>(.*)</h1>.*</div>', re.S)
# First YYYY-MM-DD date on the page.
DATE_RE = re.compile(r'\d{4}-\d\d-\d\d')
# Article body between the center_main div and the pager.
CONTENT_RE = re.compile(r'<div class="center_main">(.*)</div>.*<div class="text-c" id="pages"', re.S)


def _fetch(url):
    """GET *url* and return its body decoded as UTF-8; raises on HTTP errors."""
    resp = requests.get(url)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    return resp.text


def _collect_links(page_count=8):
    """Walk the paginated listing and return every article URL found.

    Page 1 lives at /public/, page N (N >= 2) at /public/N.html. The first
    four matches on each listing page are navigation noise and are skipped
    (the original comment claimed three, but the code always skipped four —
    behaviour kept). Each page then yields 15 article links.
    """
    links = []
    for page in range(page_count):
        if page == 0:
            url = 'http://usagz.bailitop.com/public/'
        else:
            url = 'http://usagz.bailitop.com/public/' + str(page + 1) + '.html'
        print('-----------正在爬取第' + str(page + 1) + '頁------')
        text = _fetch(url)
        try:
            # Round-trip through BeautifulSoup to normalise the markup
            # before matching, exactly as the original crawler did.
            page_html = str(BeautifulSoup(text, 'html.parser'))
            links.extend(re.findall(LINK_RE, page_html)[4:])
        except Exception as e:
            print(e)
    return links


def _parse_article(page_html):
    """Extract one article record from normalised page HTML.

    Returns a fresh {'title', 'content', 'time'} dict per call (the original
    mutated one shared dict and detached it with a copy.copy trick). Raises
    IndexError when a pattern matches nothing; the caller treats that as
    "skip this page".
    """
    title = re.findall(TITLE_RE, page_html)[0]
    if title.count('img'):
        # The <h1> wraps an <img>: drop the leading tag, then the trailing one.
        title = title.split('>', 1)[1]
        title = title.split('<', 1)[0]
    date = re.findall(DATE_RE, page_html)[0]
    content = re.findall(CONTENT_RE, page_html)[0]
    content = content.replace('百利天下', '智課')  # re-brand the body text
    return {'title': title, 'content': content, 'time': date}


def _scrape_all():
    """Collect all article links, scrape each page, and return the record list."""
    print('正在爬取網頁連結……')
    links = _collect_links()
    print(links)
    records = []
    for web_site in links:
        print('\n')
        text = _fetch(web_site)
        try:
            records.append(_parse_article(str(BeautifulSoup(text, 'html.parser'))))
        except Exception as e:
            print(e)
    return records


if __name__ == '__main__':
    # Guarded like the companion case crawler so importing this module is safe.
    json_list = json.dumps(_scrape_all(), ensure_ascii=False)
    print(json_list)
    with open("record.json", "w", encoding='utf-8') as f:
        f.write(json_list)
    print("載入入檔案完成...")
# Crawler for admission-case pages: POSTs an AJAX query to collect case URLs,
# scrapes every case page, and writes the structured records to CaseRecord.json.
import requests  # HTTP client library used to fetch the case pages
import re
import json
from bs4 import BeautifulSoup
import copy
import urllib.request
import urllib.parse


def downloadPostPage(url, dictdata, headers, charset='utf-8', reqnum=5):
    """POST *dictdata* (form-encoded) to *url* and return the decoded body.

    Retries up to *reqnum* times on any error (the original accepted this
    "request count" parameter but never used it — fixed). Returns None when
    every attempt fails.
    """
    data = bytes(urllib.parse.urlencode(dictdata), encoding=charset)
    req = urllib.request.Request(url, data, headers=headers, method='POST')
    info = None
    for _ in range(max(1, reqnum)):
        try:
            response = urllib.request.urlopen(req)
            info = response.read().decode(charset)
            break
        except Exception as e:
            # Server/network error: report and retry if attempts remain.
            print(e)
    return info


def _split_sections(article_div):
    """Split a case article into up to five 【heading】/content sections.

    *article_div* is the BeautifulSoup tag for the article body; its first
    <p> holds the first heading, and each later sibling either starts a new
    section (it contains '【') or extends the current section's text.

    Returns (titles, contents): two 5-element lists. titles[i] is the
    re.findall result for heading i (empty when the section is missing,
    which makes the caller's [0] raise IndexError and skip the page — the
    original behaved the same way).
    """
    heading_re = re.compile('【(.*)】', re.S)
    titles = [re.findall(heading_re, str(article_div.p)), [], [], [], []]
    contents = ['', '', '', '', '']
    section = 0  # index of the section currently being accumulated
    for sibling in article_div.p.next_siblings:
        text = str(sibling).replace('\r', '').replace('\n', '')
        if text.count('【'):
            section += 1
            if 1 <= section <= 4:
                titles[section] = re.findall(heading_re, text)
        elif section <= 4:
            if contents[section] == '':
                contents[section] = text
            else:
                contents[section] = contents[section] + '\n' + text
    return titles, contents


def _parse_grades(grade):
    """Normalise a raw language-score string and split it into records.

    Returns (normalised_string, [{'type': ..., 'value': ...}, ...]).
    The original reused one dict and detached it with copy.copy in one
    branch and copy.deepcopy in the other; building a fresh dict per pair
    removes that inconsistency without changing the appended values.
    """
    # NOTE(review): both replace(' ', ' ') calls are no-ops as transcribed;
    # they presumably replaced non-breaking/full-width spaces originally —
    # confirm against the live pages before changing them.
    grade = grade.replace(' ', ' ')
    grade = grade.replace(';', '')
    grade = grade.replace(' ', ' ')
    parts = re.split(re.compile(r' '), grade)
    records = []
    # (len - 1) // 2 assumes a trailing separator after the last pair,
    # exactly as the original computed it.
    for n in range((len(parts) - 1) // 2):
        records.append({'type': parts[n * 2], 'value': parts[n * 2 + 1]})
    return grade, records


if __name__ == '__main__':
    # Template record; deep-copied after every page so stored records stay detached.
    dic = {
        'title': '標題',
        'abstract': '摘要',
        'studentInfo': {
            'study_exp': '最高教育經歷',
            'school_type': '院校型別',
            # Score categories: TOEFL / TOEFL Junior / SSAT / SLEP
            'grade': []
        },
        'offerInfo': {
            'school': 'value1',
            'degree': 'value2',
            'date': 'value2'
        },
        'paragraphs': [
            {'title': '標題1', 'content': 'content1'},
            {'title': '標題2', 'content': 'content2'},
            {'title': '標題3', 'content': 'content3'},
            {'title': '標題4', 'content': 'content4'},
            {'title': '標題5', 'content': 'content5'}
        ]
    }
    dicList = []
    urlList = []
    url = 'http://case.bailitop.com/cases/yuanxiaoajax.shtml'
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    }
    for jzgd in range(1):
        dictdata = {
            'jzgd': 4,
            'type': 2,
            'mbcountry': '美國',
            'mbdegree': '高中',
            'univ': '',
            'major': '',
            'gpa': '',
            'toefl': '',
            'ielts': ''
        }
        # Request url / POST params dictdata / headers / attempt count reqnum.
        info = downloadPostPage(url, dictdata, headers=headers, reqnum=1)
        jsonLoads = json.loads(info)[0]
        reg = re.compile(r'http://case\.bailitop\.com/yuanxiao/\d*\.shtml')
        urlList = urlList + re.findall(reg, jsonLoads)
    print(urlList)
    for webSite in urlList:
        print('\n')
        html = requests.get(webSite)
        html.raise_for_status()
        html.encoding = 'utf-8'
        try:
            soup = BeautifulSoup(html.text, 'html.parser')
            # Article body: the first <p> and its siblings carry the sections.
            soupContent = soup.find("div", class_="anli_wenzhang")
            titles, contents = _split_sections(soupContent)
            # Strip paragraph tags from every section.
            contents = [c.replace('</p>', '').replace('<p>', '') for c in contents]
            for i in (2, 3, 4):
                # NOTE(review): the original re-branded only sections 3-5,
                # not 1-2 — confirm that is intentional before changing it.
                contents[i] = contents[i].replace('百利天下', '智課')
            contents = [c.replace('\u3000', '') for c in contents]
            # The last section may still contain a trailing <p...> block.
            tail = contents[4].replace('\n', '').replace('<br/>', '')
            if tail.count('<p'):
                # Bug fix: the original stored re.findall's *list* here;
                # keep only the text before the first leftover tag.
                tail = re.findall(re.compile('(.*?)<p', re.S), tail)[0]
            contents[4] = tail
            for i in range(5):
                dic['paragraphs'][i]['title'] = titles[i][0]
                dic['paragraphs'][i]['content'] = contents[i]
            soup = str(soup)
            # Abstract (a stray \r\n may remain; original note: remove by hand).
            reg = re.compile('<p><strong>摘要</strong>:(.*)</p>.*<div.*class="zixun">', re.S)
            abstract = re.findall(reg, soup)[0]
            abstract = abstract.replace('百利天下', '智課')
            dic['abstract'] = abstract
            print(abstract)
            # Title (some <strong> wrappers unresolved; original note: remove by hand).
            reg = re.compile('<h2>(.*)</h2>', re.S)
            title = re.findall(reg, soup)[0]
            if title.count('<strong') == 1:
                reg = re.compile('<strong.*?>(.*)', re.S)
                title = re.findall(reg, title)[0]
                title = title.replace('</strong>', '')
            dic['title'] = title
            print(title)
            # Offer details: school / degree / enrolment date.
            reg = re.compile(
                '<p>錄取院校:(.*)</p>\n<p></p>\n<p>授予學位:(.*)</p.*<p>入學時間:(.*?)</p>\n</div>\n<div class="g_btns">', re.S)
            offerInfo = re.findall(reg, soup)[0]
            dic['offerInfo']['school'] = offerInfo[0]
            dic['offerInfo']['degree'] = offerInfo[1]
            dic['offerInfo']['date'] = offerInfo[2]
            print(offerInfo)
            # Student profile: try the full pattern first, then two fallbacks.
            reg = re.compile(
                '<p>最高教育經歷:(.*)</p>\n<p>院校型別:(.*)</p>\n<p></p>\n<p>語言成績:(.*?)</p>', re.S)
            studentInfo = re.findall(reg, soup)
            if len(studentInfo) == 0:
                # Only education history + school type.
                reg = re.compile(
                    '<p>最高教育經歷:(.*)</p>\n<p>院校型別:(.*?)</p>\n<p></p>', re.S)
                studentInfo = re.findall(reg, soup)
                if len(studentInfo) == 0:
                    # Only school type + language scores.
                    reg = re.compile(
                        '<p>院校型別:(.*?)</p>\n<p></p>\n<p>語言成績:(.*?)</p>', re.S)
                    studentInfo = re.findall(reg, soup)
                    studentInfo = studentInfo[0]
                    grade, records = _parse_grades(studentInfo[1])
                    dic['studentInfo']['study_exp'] = ''
                    dic['studentInfo']['school_type'] = studentInfo[0]
                    print('院校型別:', studentInfo[0], '||語言成績:', grade)
                    dic['studentInfo']['grade'].extend(records)
                else:
                    studentInfo = studentInfo[0]
                    dic['studentInfo']['study_exp'] = studentInfo[0]
                    dic['studentInfo']['school_type'] = studentInfo[1]
                    print('最高教育經歷:', studentInfo[0], '||院校型別:', studentInfo[1])
            else:
                studentInfo = studentInfo[0]
                grade, records = _parse_grades(studentInfo[2])
                dic['studentInfo']['study_exp'] = studentInfo[0]
                dic['studentInfo']['school_type'] = studentInfo[1]
                print('最高教育經歷:', studentInfo[0], '||院校型別:', studentInfo[1], '||語言成績:', grade)
                dic['studentInfo']['grade'].extend(records)
            dicList.append(dic)
            # Detach the stored record, then reset the template's grade list.
            dic = copy.deepcopy(dic)
            dic['studentInfo']['grade'].clear()
        except Exception as e:
            print(e)
    # Serialise with ensure_ascii=False so CJK text stays readable in the file.
    jsonList = json.dumps(dicList, ensure_ascii=False)
    print(jsonList)
    with open("CaseRecord.json", "w", encoding='utf-8') as f:
        f.write(jsonList)
    print("載入入檔案完成...")