
Large-Scale Data Scraping -- Python


A scraper written in Python, aimed at crawling every individual and merchant product listing on 58.com along with its details, and then categorizing and analyzing the data.

The overall workflow:

[Workflow diagram: channel extraction → listing-link collection → item-detail parsing → monitoring]
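Judging from the imports in Steps 3 and 4, the code is split across separate modules. channel_extract.py and page_parsing.py are named in the imports; the filenames of the entry point and the monitor are not given in the post, so main.py and counts.py below are assumptions:

channel_extract.py   # Step 1: get_channel_urls() and the channel_list string
page_parsing.py      # Step 2: get_links_from(), get_item_info(), MongoDB collections
main.py              # Step 3: multiprocessing entry point (assumed filename)
counts.py            # Step 4: monitoring loop (assumed filename)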

Step 1: Automatically collect all channel links from the front page

from bs4 import BeautifulSoup
import requests

# 1. Find the links to all channels in the left sidebar
start_url = 'http://hz.58.com/sale.shtml'
url_host = 'http://hz.58.com'

def get_channel_urls(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-mainmnu > li > span > a[href]')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_channel_urls(start_url)

# The channel URLs printed above, pasted into one string for the later steps
channel_list = '''
    http://hz.58.com/shouji/
    http://hz.58.com/tongxunyw/
    http://hz.58.com/danche/
    http://hz.58.com/diandongche/
    http://hz.58.com/diannao/
    http://hz.58.com/shuma/
    http://hz.58.com/jiadian/
    http://hz.58.com/ershoujiaju/
    http://hz.58.com/yingyou/
    http://hz.58.com/fushi/
    http://hz.58.com/meirong/
    http://hz.58.com/yishu/
    http://hz.58.com/tushu/
    http://hz.58.com/wenti/
    http://hz.58.com/bangong/
    http://hz.58.com/shebei.shtml
    http://hz.58.com/chengren/
'''
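Since channel_list is just a whitespace-separated string, split() turns it into a Python list when Step 3 consumes it. A quick sanity check (a sketch, using nothing beyond the string above):

channels = channel_list.split()
print(len(channels))    # 17 channels
print(channels[0])      # http://hz.58.com/shouji/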

Step 2: For every channel from Step 1, collect all the listing links into the url_list collection, and scrape the detail info of each item

from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list']
item_info = ceshi['item_info']


def get_links_from(channel, pages, who_sells=0):
    # List pages look like http://hz.58.com/shouji/0/pn7/
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('td.t > a[onclick]')
    if soup.find('td', 't'):  # the page still has a listing table
        for link in links:
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
            print(item_link)
    else:
        pass  # past the last page, nothing to collect


def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    no_longer_exist = '商品已下架' in soup.text  # '商品已下架' = item taken down
    if no_longer_exist:
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price_now > i')[0].text
        area = soup.select('div.palce_li > span > i')[0].text
        # item_info.insert_one({'title': title, 'price': price, 'area': area})
        print({'title': title, 'price': price, 'area': area})

# get_links_from('http://hz.58.com/pbdn/', 7)
# get_item_info('http://zhuanzhuan.58.com/detail/840577950118920199z.shtml')
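The list_view template stitches together the channel URL (which already ends with a slash), the who_sells segment, and the page number. Expanding the format string with the values from the comment in get_links_from:

print('{}{}/pn{}/'.format('http://hz.58.com/shouji/', 0, 7))
# -> http://hz.58.com/shouji/0/pn7/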

Step 3: The main entry point, fanning the channels out across a pool of worker processes

from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from

def get_all_links_from(channel):
    # Walk pages 1..30 of one channel
    for num in range(1, 31):
        get_links_from(channel, num)

if __name__ == '__main__':
    pool = Pool()  # defaults to one worker per CPU core
    pool.map(get_all_links_from, channel_list.split())
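The same entry point can also be written with the pool as a context manager (supported since Python 3.3), so the workers are shut down explicitly; a minor variation with the same behavior:

if __name__ == '__main__':
    with Pool() as pool:
        pool.map(get_all_links_from, channel_list.split())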

Step 4: Monitor the incoming data in real time

from time import sleep
from page_parsing import url_list

while True:
    # count_documents replaces the cursor count() removed in newer pymongo
    print(url_list.count_documents({}))
    sleep(5)
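A slightly richer monitor can also show the crawl rate; the delta logic below is my addition, not in the original post:

from time import sleep
from page_parsing import url_list

last = 0
while True:
    now = url_list.count_documents({})
    print(now, '(+{} in the last 5s)'.format(now - last))
    last = now
    sleep(5)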

Sample run output:

[Screenshot of the run output]
