利用Python搜索51CTO推薦博客並保存至Excel
阿新 • • 發佈:2018-01-31
標籤:html_ 名稱 pri bin def text 網頁 .com contex
一、背景
近期在學習爬蟲,利用Requests模塊獲取頁面,BeautifulSoup來獲取需要的內容,最後利用xlsxwriter模塊將內容保存至excel,在此記錄一下,後續可舉一反三,利用其抓取其他內容並持久化存儲到文件或數據庫等。
二、代碼
編寫了兩個模塊,geturl3和getexcel3,最後在main內調用
geturl3.py
代碼內容如下:
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# @Author  : kaliarch
"""Collect blog titles and links from 51CTO search result pages."""

import sys
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup


class get_urldic:
    """Build 51CTO search URLs, download the pages and extract blog links."""

    # Fixed parts of the search URL; the keyword goes between them and the
    # page number is appended after ``after_url``.
    first_url = 'http://blog.51cto.com/search/result?q='
    after_url = '&type=&page='
    # Seconds to wait for each page so a stalled server cannot hang the run.
    request_timeout = 10

    # Read the search keyword and page count
    def get_url(self):
        """Prompt for a keyword and a page count and build the search URLs.

        Returns:
            A ``(urlList, search)`` tuple: one URL for every page from 1 up
            to the requested page count, and the keyword exactly as typed
            (un-encoded, so callers can reuse it as a label).

        Exits the process with status 1 when the page count is not an
        integer or the input stream ends early.
        """
        try:
            search = input("Please input search name:")
            page = int(input("Please input page:"))
        except (ValueError, EOFError) as e:
            print('Input error:', e)
            sys.exit(1)
        # Percent-encode the keyword: spaces, '&', '#' or non-ASCII text
        # would otherwise corrupt the query string.
        keyword = quote(search, safe='')
        urlList = [
            self.first_url + keyword + self.after_url + str(num)
            for num in range(1, page + 1)
        ]
        print("Please wait....")
        return urlList, search

    # Download the pages
    def get_html(self, urlList):
        """Fetch every URL in ``urlList`` and return the raw bodies.

        Args:
            urlList: iterable of URL strings.

        Returns:
            A list with the ``content`` (bytes) of each response, in the
            same order as ``urlList``.

        Raises:
            requests.exceptions.RequestException: when a request fails or
                exceeds ``request_timeout`` seconds.
        """
        response_list = []
        for url in urlList:
            response = requests.get(url, timeout=self.request_timeout)
            response_list.append(response.content)
        return response_list

    # Extract blog_name and blog_url
    def get_soup(self, html_doc):
        """Parse the downloaded pages into a ``{title: url}`` mapping.

        Args:
            html_doc: iterable of HTML documents (as returned by
                ``get_html``).

        Returns:
            A dict mapping the stripped text of each ``<a>`` element selected
            with ``class_='m-1-4 fl'`` to its ``href``.  Anchors without an
            ``href`` are skipped.  When the same title occurs more than once
            the last link wins, because titles are used as dict keys.
        """
        result = {}
        for page in html_doc:
            soup = BeautifulSoup(page, 'html.parser')
            for anchor in soup.find_all('a', class_='m-1-4 fl'):
                href = anchor.get('href')
                if href is None:
                    # Nothing to link to; indexing would raise KeyError.
                    continue
                result[anchor.get_text().strip()] = href
        return result


if __name__ == '__main__':
    blog = get_urldic()
    urllist, search = blog.get_url()
    html_doc = blog.get_html(urllist)
    result = blog.get_soup(html_doc)
    for k, v in result.items():
        print('search blog_name is:%s,blog_url is:%s' % (k, v))
getexcel3.py
代碼內容如下:
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# @Author  : kaliarch
"""Helpers that write a ``{blog_name: blog_url}`` mapping to an .xlsx file."""

import xlsxwriter


class create_excle:
    """Thin wrapper around xlsxwriter for the two-column result sheet."""

    def __init__(self):
        # Column headers written on the second row of the sheet.
        self.tag_list = ["blog_name", "blog_url"]

    @staticmethod
    def _sheet_name(search):
        """Turn ``search`` into a name Excel accepts for a worksheet.

        Excel rejects sheet names longer than 31 characters, names that
        contain any of ``[ ] : * ? / \\`` and names that start or end with an
        apostrophe.  Returns ``None`` (xlsxwriter then picks a default name)
        when nothing usable is left.
        """
        cleaned = ''.join(ch for ch in search if ch not in '[]:*?/\\')
        cleaned = cleaned[:31].strip("'")
        return cleaned or None

    def create_workbook(self, search=" "):
        """Create ``<search>.xlsx`` with a single worksheet.

        Args:
            search: keyword used for the file name and, after sanitising,
                for the worksheet name.

        Returns:
            A ``(workbook, worksheet)`` tuple.
        """
        excle_name = search + '.xlsx'
        # Name of the workbook file
        workbook = xlsxwriter.Workbook(excle_name)
        worksheet_M = workbook.add_worksheet(self._sheet_name(search))
        print('create %s....' % excle_name)
        return workbook, worksheet_M

    def col_row(self, worksheet):
        """Set the height of the title row and the width of columns A and B."""
        worksheet.set_row(0, 17)
        worksheet.set_column('A:B', 58)

    def shell_format(self, workbook):
        """Register the three cell formats used by the sheet.

        Returns:
            A ``(merge_format, name_format, normal_format)`` tuple for the
            merged title cell, the column-header cells and the data cells.
        """
        # Title (merged header) format
        merge_format = workbook.add_format({
            'bold': 1,
            'border': 1,
            'align': 'center',
            'valign': 'vcenter',
            'fg_color': '#FAEBD7'
        })
        # Column-name format
        name_format = workbook.add_format({
            'bold': 1,
            'border': 1,
            'align': 'center',
            'valign': 'vcenter',
            'fg_color': '#E0FFFF'
        })
        # Body format
        normal_format = workbook.add_format({
            'align': 'center',
        })
        return merge_format, name_format, normal_format

    # Write the title and the column names
    def write_title(self, worksheet, search, merge_format):
        """Write the merged title cell ``A1:B1`` for keyword ``search``."""
        title = search + "搜索結果"
        worksheet.merge_range('A1:B1', title, merge_format)
        print('write title success')

    def write_tag(self, worksheet, name_format):
        """Write the entries of ``self.tag_list`` across row index 1."""
        tag_row = 1
        for tag_col, tag in enumerate(self.tag_list):
            worksheet.write(tag_row, tag_col, tag, name_format)
        print('write tag success')

    # Write the content rows
    def write_context(self, worksheet, con_dic, normal_format):
        """Write every ``name -> url`` pair of ``con_dic`` below the headers.

        Rows start at index 2 (title is row 0, column names are row 1); the
        key goes to column 0 and the value to column 1.  Every entry is
        written: the row index must not be compared against
        ``len(con_dic)``, since the two-row offset would cut off the tail of
        the data.
        """
        for row, (name, url) in enumerate(con_dic.items(), start=2):
            worksheet.write(row, 0, name, normal_format)
            worksheet.write(row, 1, url, normal_format)
        print('write context success')

    # Close the workbook
    def workbook_close(self, workbook):
        """Close ``workbook``."""
        workbook.close()


if __name__ == '__main__':
    print('This is create excel mode')
main.py
代碼內容如下:
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# @Author  : kaliarch
"""Entry point: search 51CTO blogs and save the results to an Excel file."""

import geturl3
import getexcel3


# Build the {blog_name: blog_url} dictionary
def get_dic():
    """Run the interactive search and return ``(result, search)``.

    ``result`` is the mapping produced by ``get_urldic.get_soup`` and
    ``search`` is the keyword returned by ``get_urldic.get_url``.
    """
    blog = geturl3.get_urldic()
    urllist, search = blog.get_url()
    html_doc = blog.get_html(urllist)
    result = blog.get_soup(html_doc)
    return result, search


# Write the workbook
def write_excle(urldic, search):
    """Write ``urldic`` into a workbook named after ``search``.

    Args:
        urldic: mapping of blog name to blog URL.
        search: keyword used for the file name and the sheet title.
    """
    excle = getexcel3.create_excle()
    workbook, worksheet = excle.create_workbook(search)
    try:
        excle.col_row(worksheet)
        merge_format, name_format, normal_format = excle.shell_format(workbook)
        excle.write_title(worksheet, search, merge_format)
        excle.write_tag(worksheet, name_format)
        excle.write_context(worksheet, urldic, normal_format)
    finally:
        # Always close the workbook, even if one of the write steps raises.
        excle.workbook_close(workbook)


def main():
    """Search, then export the results."""
    url_dic, search_name = get_dic()
    write_excle(url_dic, search_name)


if __name__ == '__main__':
    main()
三、效果展示
運行代碼,填寫搜索的關鍵字,及搜索多少頁
運行後會生成一個以搜索關鍵字命名的excel文件,打開即可查看寫入的內容
利用其就可以搜索並保存自己需要的51CTO推薦博客,可以多搜索幾個
利用Python搜索51CTO推薦博客並保存至Excel