python 爬蟲採集網路資訊

python爬蟲採集網路資訊

from bs4 import BeautifulSoup
import re
import urllib.parse
import urllib.request
import os
import datetime
import json

# Request parameters: CategoryId=808, CategoryType=SiteHome, ItemListActionName=PostList,
# PageIndex=3, ParentCategoryId=0, TotalPostCount=4000
def getHtml(url, values):
    """GET *url* with *values* URL-encoded as the query string.

    Returns the response body decoded as UTF-8.
    """
    user_agent = ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36')
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values)
    # BUG FIX: the original built `headers` but never attached it, so the
    # custom User-Agent was silently dropped. Send it via a Request object.
    request = urllib.request.Request(url + '?' + data, headers=headers)
    # `with` guarantees the connection is released even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


# Fetch one page of the post list.
def requestCnblogs(index):
    """Request page *index* of the site's post list and return raw HTML."""
    print('請求資料')
    url = 'https://www.csdn.net/'
    value = {
        'CategoryId': 808,
        'CategoryType': 'SiteHome',
        'ItemListActionName': 'PostList',
        'PageIndex': index,
        'ParentCategoryId': 0,
        'TotalPostCount': 4000,
    }
    return getHtml(url, value)


# Parse the outermost layer of the page.
def blogParser(index):
    """Download page *index* and parse up to 20 `div.list_con` entries.

    Returns a list of dicts, one per post (see analyzeBlog for keys).
    """
    html = requestCnblogs(index)
    soup = BeautifulSoup(html, 'html.parser')
    all_div = soup.find_all('div', attrs={'class': 'list_con'}, limit=20)
    # One dict per post <div>.
    return [analyzeBlog(item) for item in all_div]


# Parse a single post entry.
def analyzeBlog(item):
    """Extract title, link, summary, author, time and counters from one post <div>.

    Keys produced: title, href, summary (optional), author, author_url,
    create_time, comment_num, view_num.
    """
    result = {}
    a_title = find_all(item, 'div', 'title')[0].find_all('a')[0]
    if a_title is not None:
        # Post title and permalink.
        result["title"] = a_title.string.replace("\n", "").strip()
        result["href"] = a_title['href']
    p_summary = find_all(item, 'div', 'summary oneline')
    # BUG FIX: find_all returns a list, which is never None — the original
    # `is not None` test always passed. Truth-test catches the empty list.
    if p_summary:
        # Short description (single replace — the original chained two
        # identical .replace("\n","") calls).
        result["summary"] = p_summary[0].text.replace("\n", "").strip()
    footers = find_all(item, 'dl', 'list_userbar')[0]
    author = find_all(footers, 'dd', 'name')[0]
    author_a = author.find_all('a')[0]
    # Author display name and profile URL.
    result["author"] = author_a.string.replace("\n", "").strip()
    result["author_url"] = author_a['href']
    time = find_all(footers, 'dd', 'time')[0].text
    result["create_time"] = time.replace("\n", "").strip()
    # Counter fields are kept as strings, exactly as the page renders them.
    result["comment_num"] = find_all(footers, 'dd', 'read_num')[0].find_all('span')[0].text
    # NOTE(review): class name 'common_num ' keeps the original trailing
    # space — presumably matching the site's markup; verify before "fixing".
    result["view_num"] = find_all(footers, 'dd', 'common_num ')[0].find_all('span')[0].string
    return result


def find_all(item, attr, c):
    """Shorthand: matches of tag *attr* with CSS class *c*, limited to one."""
    return item.find_all(attr, attrs={'class': c}, limit=1)


def writeToTxt(list_name, file_path):
    """Serialize *list_name* to *file_path* as a JSON array.

    BUG FIX: the original wrote `str(item)` (Python dict reprs with single
    quotes), producing a file that is not valid JSON — which its own comment
    acknowledged. json.dump emits correct JSON in one call.
    """
    try:
        # `with` closes the file even if serialization raises (the original
        # leaked the handle in that case).
        with open(file_path, "w+", encoding='utf-8') as fp:
            print(file_path)
            json.dump(list_name, fp, ensure_ascii=False)
    except IOError:
        print("fail to open file")


def saveBlogs():
    """Crawl the configured page range and save each page to its own JSON file."""
    for i in range(1, 2):
        print('request for ' + str(i) + '...')
        # BUG FIX: the original called blogParser(1) inside the loop, so
        # every iteration re-fetched page 1 instead of page i.
        blogs = blogParser(i)
        # Save to file.
        path = createFile()
        writeToTxt(blogs, path + '/blog_' + str(i) + '.json')
        print('第' + str(i) + '頁已經完成')
    return 'success'


def createFile():
    """Ensure today's output directory exists and return its path."""
    date = datetime.datetime.now().strftime('%Y-%m-%d')
    path = 'F:/Blog/' + date
    # BUG FIX: os.makedirs with exist_ok=True also creates the missing
    # 'F:/Blog' parent, where the original os.mkdir would raise.
    os.makedirs(path, exist_ok=True)
    return path


if __name__ == '__main__':
    result = blogParser(1)
    print(result)
    # print("python爬取CSDN:")
    # info = saveBlogs()
    # print(info)