程式人生 > requests+xpath+map爬取百度貼吧

requests+xpath+map爬取百度貼吧

name ads int strip 獲取 app open http col

 1 # requests+xpath+map爬取百度貼吧
 2 # 目標內容:跟帖用戶名,跟帖內容,跟帖時間
 3 # 分解:
 4 # requests獲取網頁
 5 # xpath提取內容
 6 # map實現多線程爬蟲
 7 import requests
 8 from requests.exceptions import RequestException
 9 from lxml import etree
10 import json
11 from multiprocessing.dummy import Pool as ThreadPool
12 
def get_html(url):
    """Fetch *url* and return the response body as text, or None on failure.

    Returns None both for non-200 status codes and for any requests-level
    error (connection refused, timeout, ...), so callers only need a
    truthiness check.
    """
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_html(html):
    """Yield one dict per reply post parsed from a tieba thread page.

    Each yielded dict has the keys 'author', 'content' and 'date'.
    Per-post metadata is embedded in each post div's ``data-field``
    attribute as JSON; the post id taken from it selects the matching
    content div for that post.
    """
    selector = etree.HTML(html)
    # NOTE: the trailing space inside the class value is intentional --
    # the page markup really contains 'l_post_bright ' with a space.
    posts = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for post in posts:
        field = json.loads(post.xpath('@data-field')[0])
        author = field.get('author').get('user_name')
        post_id = field.get('content').get('post_id')
        content = post.xpath(
            'div/div/cc/div[@id="post_content_%s"]/text()' % post_id
        )[0].strip()
        date = field.get('content').get('date')
        yield {
            'author': author,
            'content': content,
            'date': date,
        }


def save_to_txt(result):
    """Append one parsed post (author/content/date dict) to tieba.txt."""
    print('正在存儲:', result)
    # Append mode so concurrent page workers accumulate into one file.
    with open('tieba.txt', 'a', encoding='utf-8') as f:
        f.write('回帖作者:' + result['author'] + '\n')
        f.write('回帖內容:' + result['content'] + '\n')
        f.write('回帖時間:' + result['date'] + '\n')
        f.write('\n')


def main(url):
    """Download one thread page and persist every post found on it."""
    html = get_html(url)
    if html:
        for result in parse_html(html):
            save_to_txt(result)


if __name__ == '__main__':
    # Crawl pages 1-20 of the thread concurrently with a 4-thread pool
    # (multiprocessing.dummy.Pool is thread-backed, fine for I/O-bound work).
    pool = ThreadPool(4)
    base_url = 'http://tieba.baidu.com/p/3522395718?pn='
    urls = [base_url + str(page_num) for page_num in range(1, 21)]
    pool.map(main, urls)
    pool.close()
    pool.join()

requests+xpath+map爬取百度貼吧