大規模數據爬取 -- Python
阿新 • 發佈:2017-09-03
標籤:code、data、format、find()、__name__
Python書寫爬蟲,目的是爬取所有的個人商家商品信息及詳情,並進行數據歸類分析
整個工作流程圖:
第一步:采用自動化的方式從前臺頁面獲取所有的頻道
# channel_extract.py -- Step 1: collect the link of every channel from the
# left sidebar of the 58.com second-hand-goods front page.
from bs4 import BeautifulSoup
import requests

start_url = 'http://hz.58.com/sale.shtml'
url_host = 'http://hz.58.com'


def get_channel_urls(url):
    """Fetch *url* and print the absolute URL of every sidebar channel link.

    Bug fix: the original fetched the module-level ``start_url`` and ignored
    the ``url`` parameter; it now fetches the page it was asked for.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Bug fix: attribute selector is a[href], not a["href"] (invalid CSS
    # that soupsieve rejects / matches nothing).
    links = soup.select('ul.ym-mainmnu > li > span > a[href]')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)


get_channel_urls(start_url)

# Channel URLs harvested by the run above, frozen as a whitespace-separated
# literal so the other scripts can use channel_list.split() without
# re-crawling the sidebar.
channel_list = '''
http://hz.58.com/shouji/
http://hz.58.com/tongxunyw/
http://hz.58.com/danche/
http://hz.58.com/diandongche/
http://hz.58.com/diannao/
http://hz.58.com/shuma/
http://hz.58.com/jiadian/
http://hz.58.com/ershoujiaju/
http://hz.58.com/yingyou/
http://hz.58.com/fushi/
http://hz.58.com/meirong/
http://hz.58.com/yishu/
http://hz.58.com/tushu/
http://hz.58.com/wenti/
http://hz.58.com/bangong/
http://hz.58.com/shebei.shtml
http://hz.58.com/chengren/'''
第二步:通過第一步獲取的所有頻道去獲取所有的列表詳情,並存入URL_list表中,同時獲取商品詳情信息
# page_parsing.py -- Step 2: walk a channel's listing pages, store each item
# URL in MongoDB, and scrape title/price/area from item detail pages.
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list']    # collection of item detail-page URLs
item_info = ceshi['item_info']  # collection of scraped item details


def get_links_from(channel, pages, who_sells=0):
    """Collect item links from one listing page and insert them into url_list.

    Builds a URL of the form http://hz.58.com/shouji/0/pn7/ from:
      channel   -- channel base URL ending with '/'
      pages     -- listing page number to fetch
      who_sells -- 0 for personal sellers (58.com URL convention)
    """
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)  # throttle so we do not hammer the site
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # A real listing page has <td class="t"> cells; their absence means we
    # paged past the last page, so there is nothing to do.
    if soup.find('td', 't'):
        for link in soup.select('td.t > a[onclick]'):
            item_link = link.get('href').split('?')[0]  # drop tracking query string
            url_list.insert_one({'url': item_link})
            print(item_link)


def get_item_info(url):
    """Scrape title, price and area from an item detail page.

    Delisted items (pages containing '商品已下架') are skipped silently.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Bug fix: membership must be tested against the page text --
    # "'商品已下架' in soup" tested containment on the soup object itself.
    if '商品已下架' in soup.get_text():
        return
    title = soup.title.text
    price = soup.select('span.price_now > i')[0].text
    area = soup.select('div.palce_li > span > i')[0].text
    # item_info.insert_one({'title': title, 'price': price, 'area': area})
    print({'title': title, 'price': price, 'area': area})

# get_links_from('http://hz.58.com/pbdn/', 7)
# get_item_info('http://zhuanzhuan.58.com/detail/840577950118920199z.shtml')
第三步:采用多進程的方式的main主函數入口
# main.py -- Step 3: fan the channel list out over a process pool
# (one worker per CPU core by default).
from multiprocessing import Pool

from channel_extract import channel_list
from page_parsing import get_links_from


def get_all_links_from(channel, last_page=30):
    """Crawl listing pages 1..last_page of *channel*.

    Generalized: the page count is now a parameter (default 30 keeps the
    original behavior, and pool.map still passes only *channel*).
    """
    for page in range(1, last_page + 1):
        get_links_from(channel, page)


if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())
第四步:實時對獲取到的數據進行監控
# count.py -- Step 4: live monitor; print the size of the url_list
# collection every 5 seconds so crawl progress is visible.
from time import sleep

from page_parsing import url_list

while True:
    # count_documents({}) replaces Cursor.count(), which is deprecated
    # since pymongo 3.7 and removed in pymongo 4.x.
    print(url_list.count_documents({}))
    sleep(5)
具體運行效果:
大規模數據爬取 -- Python