
Some Notes on Writing a Douban Crawler

Key points used in the implementation

1. Create a process pool (to speed up crawling) and a Manager queue (for communication between processes) to carry out the crawl; a minimal sketch of the two working together follows.
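For reference, here is a minimal sketch, independent of the crawler below, of how a Pool and a Manager queue cooperate; the worker function and task names are illustrative only:

from multiprocessing import Pool, Manager

def worker(task, q):
    # do some work, then report the finished task back through the shared queue
    q.put(task)

if __name__ == "__main__":
    q = Manager().Queue()   # a queue proxy that can be shared with pool workers
    pool = Pool()           # defaults to one worker process per CPU core
    for task in ["a", "b", "c"]:
        pool.apply_async(func=worker, args=(task, q))
    pool.close()
    pool.join()
    while not q.empty():
        print(q.get())      # everything the workers reported back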

2. Use queues (plain lists are used here instead) to deduplicate URLs

Deduplication has two parts. First, before crawling a URL, check whether it is already in the crawled-URL queue. Second, when it is not in the crawled queue and is about to be added to the to-crawl queue, check whether it is already in the to-crawl queue; if it is, discard it. A small sketch of this check is shown below.
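A minimal sketch of the two-step check, assuming both queues are plain lists (the function name is illustrative):

def enqueue_if_new(url, crawl_queue, crawled_queue):
    # step 1: skip urls that have already been crawled
    if url in crawled_queue:
        return
    # step 2: skip urls that are already waiting in the to-crawl queue
    if url in crawl_queue:
        return
    crawl_queue.append(url)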

 

3. Use a queue to simulate breadth-first traversal when crawling URLs; a rough sketch follows.
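Breadth-first order falls out of FIFO behaviour: take a URL from the front of the to-crawl list, crawl it, and append newly discovered URLs to the back. A rough sketch, where seed_url, crawl_one_page and extract_links are placeholders:

crawl_queue = [seed_url]          # to-crawl queue, seeded with the start page
crawled_queue = []                # already-crawled queue
while crawl_queue:
    url = crawl_queue.pop(0)      # FIFO: take from the front -> breadth-first order
    html = crawl_one_page(url)    # placeholder for the download/parse step
    crawled_queue.append(url)
    for link in extract_links(html):   # placeholder for link extraction
        if link not in crawled_queue and link not in crawl_queue:
            crawl_queue.append(link)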

4. The process pool is used to crawl the URLs, the Manager queue is used for communication between processes, and the dedup queues are used for deduplication

Each URL to be crawled is submitted to the process pool together with the Manager queue (which passes finished URLs back); all of these URLs belong to the same batch, as the pattern below shows.
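This is the submission pattern used in the main block of the full script further down (CrawlMovieInfo, q and lock are defined there):

while crawl_queue:
    url = crawl_queue.pop(0)
    pool.apply_async(func=CrawlMovieInfo, args=(url, q, lock))  # a url plus the shared queue go to a worker
    crawled_queue.append(q.get())   # block until a worker reports a finished url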

 

# -*- coding: utf-8 -*-
"""
Created on Tue May 29 10:33:56 2018

@author: Administrator
"""
from bs4 import BeautifulSoup
import re
import basicSpider  # local helper module that provides downloadHtml()
from multiprocessing import Pool,Manager

def get_html(url):
    """
    Fetch the raw HTML of one page.
    """
    headers = [("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")]
    #proxy = {"http":"182.129.243.84:9000"}
    html = basicSpider.downloadHtml(url, headers=headers)
    return html

def get_movie_all(html):
    """
    Get the list of all movie entries on the current page.
    """
    soup = BeautifulSoup(html, "html.parser")
    movie_list = soup.find_all('div', class_='bd doulist-subject')
    #print(movie_list)
    return movie_list

def get_movie_one(movie):
    """
    Extract the details of one movie and join them into one big string.
    """
    result = ""
    soup = BeautifulSoup(str(movie),"html.parser")
    title = soup.find_all('div', class_="title")
    soup_title = BeautifulSoup(str(title[0]), "html.parser")
    for line in soup_title.stripped_strings:
        result += line
    
    try:
        score = soup.find_all('span', class_='rating_nums')
        score_ = BeautifulSoup(str(score[0]), "html.parser")
        for line in score_.stripped_strings:
            result += "|| 評分:"
            result += line
    except IndexError:
        # no rating found on the page, fall back to a default value
        result += "|| 評分:5.0"
         
    abstract = soup.find_all('div', class_='abstract')
    abstract_info = BeautifulSoup(str(abstract[0]), "html.parser")
    for line in abstract_info.stripped_strings:
        result += "|| "
        result += line    
    
    result += '\n'
    #print(result)
    return result

def save_file(movieInfo, lock):
    """
    Write to the file; append mode is used here.
    """
    with open("doubanMovie.txt","ab") as f:
        #lock.acquire()
        f.write(movieInfo.encode("utf-8"))
        #lock.release()

def CrawlMovieInfo(url, q, lock):
    """
    Crawl one page of movie data and write it to the file.
    """
    html = get_html(url)
    movie_list = get_movie_all(html)
    for it in movie_list:
        save_file(get_movie_one(it), lock)
        
    q.put(url)  # report this url back as completed


if __name__ == "__main__":
    # create the process pool and the Manager queue used for crawling
    pool = Pool()
    q = Manager().Queue()
    lock = Manager().Lock()
    
    url = "https://www.douban.com/doulist/3516235/?start=225&sort=seq&sub_type="    
    CrawlMovieInfo(url)
    
    html = get_html(url)
    # note the quoting in the regular expression: the closing double quote marks the end of the href
    pattern = re.compile(r'(https://www\.douban\.com/doulist/3516235/\?start=.*?)"')
    itemUrls = re.findall(pattern, html)
#    for i in itemUrls:
#        print(i)
        
    # two-step deduplication
    crawl_queue = []    # to-crawl queue
    for item in itemUrls:
        if item not in crawled_queue:
            # first dedup step: keep only urls that are not in the crawled queue
            crawl_queue.append(item)
    # second dedup step: remove duplicates within the to-crawl queue itself
    crawl_queue = list(set(crawl_queue))
    
    # simulate breadth-first traversal
    while crawl_queue:  # keep taking urls until the to-crawl queue is empty
        url = crawl_queue.pop(0)  # take the first url in the to-crawl queue
        #CrawlMovieInfo(url)
        pool.apply_async(func=CrawlMovieInfo, args=(url, q, lock))
        # move the url that has just been completed into the crawled queue
        urlCompeted = q.get()
        crawled_queue.append(urlCompeted)
    
    
    pool.close()
    pool.join()
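basicSpider is a local helper module that is not shown in this post; all the script needs from it is a downloadHtml(url, headers=...) function. A hypothetical minimal stand-in, assuming it simply performs a GET with the given header tuples and returns the decoded page, could look like this:

# basicSpider.py -- hypothetical minimal stand-in, not the original module
from urllib import request

def downloadHtml(url, headers=None, retries=3):
    req = request.Request(url)
    for name, value in (headers or []):
        req.add_header(name, value)   # headers are passed as (name, value) tuples
    try:
        return request.urlopen(req, timeout=10).read().decode("utf-8")
    except Exception:
        if retries > 0:
            # simple retry on network errors
            return downloadHtml(url, headers=headers, retries=retries - 1)
        return ""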