
Web scraping with Python: crawling Taobao images and Zhihu content

What this post covers:

Goal: crawl Taobao images with Python, and use pyspider, a very handy open-source Python framework written by a Chinese developer, to crawl Zhihu questions together with every comment under each question.

There are two implementation approaches:
1. Use the open-source pyspider framework. Install and start it (pip install pyspider, then run pyspider), and its web UI comes up on localhost port 5000 by default. Create a new project there, paste in the Python code below to crawl Zhihu questions and comments, and use MySQL-python (MySQLdb) to save the results into a database of your own, so the data is yours to analyze.
2. Build a simple crawler yourself with libraries such as urllib, PyQuery, requests, and BeautifulSoup; it can download images to disk, save data to a database, or scrape plain text.


This post has four parts:

  • Write the simplest possible crawler
  • Crawl model images from Taobao
  • Crawl Zhihu content and store it in your own database via MySQLdb
  • Crawl all discussion topics on the https://www.v2ex.com community

The simplest crawler, in a few lines of Python:

import requests
from bs4 import BeautifulSoup


def most_simple_crawl():
    # The simplest possible crawler: fetch one page and print the text of every post
    content = requests.get('http://www.qiushibaike.com').content
    soup = BeautifulSoup(content, 'html.parser')
    for div in soup.find_all('div', {'class': 'content'}):
        print(div.text.strip())


if __name__ == '__main__':
    most_simple_crawl()
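
Many sites refuse requests that arrive with the default python-requests User-Agent; the Taobao and Zhihu examples below both spoof one for exactly that reason. A minimal variant of the same crawler that sends a browser-like header, in case the target starts rejecting the default (the header string is borrowed from the Taobao example):

import requests
from bs4 import BeautifulSoup


def crawl_with_headers():
    # A browser-like User-Agent; swap in any current browser string
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) '
                             'Gecko/20100101 Firefox/49.0'}
    content = requests.get('http://www.qiushibaike.com',
                           headers=headers, timeout=10).content
    soup = BeautifulSoup(content, 'html.parser')
    for div in soup.find_all('div', {'class': 'content'}):
        print(div.text.strip())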

Crawling model images from Taobao

# coding=utf-8
import os
import re
import urllib.request
from urllib.parse import quote


def crawl_taobao():
    # Keyword to search for on Taobao ("bikini")
    key = quote("比基尼")
    # Send a browser User-Agent so the request is not rejected
    headers = ("User-Agent",
               "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    os.makedirs('D:/pic', exist_ok=True)
    # Crawl page by page; Taobao's search paginates with the `s` offset
    # parameter, 44 results per page
    for i in range(0, 4):
        url = "https://s.taobao.com/search?q=" + key + "&s=" + str(i * 44)
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        # Image URLs appear in the page's embedded JSON as pic_url":"//..."
        pat = 'pic_url":"//(.*?)"'
        imagelist = re.compile(pat).findall(data)
        # Download every image found on this page
        for j, thisimg in enumerate(imagelist):
            thisimgurl = "http://" + thisimg
            # Save to the D: drive, named by page and position
            savefile = 'D:/pic/' + str(i) + str(j) + '.jpg'
            urllib.request.urlretrieve(thisimgurl, filename=savefile)


if __name__ == '__main__':
    crawl_taobao()
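
The s offset above is an assumption based on how Taobao's search results paginated when this was written; if the page layout has changed, adjust the URL and the pic_url regex to match.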

To crawl Zhihu you need to masquerade as GoogleBot, otherwise your IP will get banned. The crawled data is stored, via MySQLdb, in a database you create yourself.
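
The handler below inserts into question and comment tables in a database named onlineq, whose schema the post never shows. A minimal sketch reconstructed from the INSERT statements (the column names come from the code; the types and sizes are assumptions):

import MySQLdb

SCHEMA = [
    """CREATE TABLE IF NOT EXISTS question (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        content TEXT,
        user_id INT,
        created_date DATETIME,
        comment_count INT
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS comment (
        id INT AUTO_INCREMENT PRIMARY KEY,
        content TEXT,
        entity_type INT,
        entity_id INT,
        user_id INT,
        created_date DATETIME
    ) DEFAULT CHARSET=utf8""",
]


def create_tables():
    # Assumes the onlineq database itself already exists
    db = MySQLdb.connect('localhost', 'root', '123456', 'onlineq', charset='utf8')
    cursor = db.cursor()
    for ddl in SCHEMA:
        cursor.execute(ddl)
    db.commit()
    db.close()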

from pyspider.libs.base_handler import *
import MySQLdb
import random


class Handler(BaseHandler):
    # Masquerade as GoogleBot so Zhihu does not ban the crawler's IP
    crawl_config = {
        'headers': {
            'User-Agent': 'GoogleBot',
            'Host': 'www.zhihu.com',
        }
    }

    def __init__(self):
        self.db = MySQLdb.connect('localhost', 'root', '123456', 'onlineq', charset='utf8')

    # Store a crawled Zhihu question in our own database
    def add_question(self, title, content, comment_count):
        try:
            cursor = self.db.cursor()
            # Parameterized query: the driver escapes the values, so the raw
            # HTML content can be passed through unchanged
            sql = ('insert into question(title, content, user_id, created_date, comment_count) '
                   'values (%s, %s, %s, now(), %s)')
            cursor.execute(sql, (title, content, random.randint(20, 26), comment_count))
            qid = cursor.lastrowid
            print(qid)
            self.db.commit()
            return qid
        except Exception as e:
            print(e)
            self.db.rollback()
        return 0

    # Store a crawled answer/comment under its question
    def add_comment(self, qid, comment):
        try:
            cursor = self.db.cursor()
            sql = ('insert into comment(content, entity_type, entity_id, user_id, created_date) '
                   'values (%s, 100, %s, %s, now())')
            cursor.execute(sql, (comment, qid, random.randint(20, 26)))
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.zhihu.com/topic/19552330/top-answers',
                   callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow each question link on the topic's top-answers page
        for each in response.doc('a[data-za-detail-view-element_name="Title"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        title = response.doc('h1.QuestionHeader-title').text()
        content = response.doc('span.RichText.ztext').html()
        answers = list(response.doc('span.RichText.ztext.CopyrightRichText-richText').items())
        if content is None:
            content = ''
        qid = self.add_question(title, content, len(answers))
        for each in answers:
            self.add_comment(qid, each.html())
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
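
A design note: rather than calling the database helpers inside detail_page, pyspider also passes each callback's return value to the handler's on_result method, so persistence can be centralized there. A minimal sketch (the print stands in for the INSERT helpers above):

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    def on_result(self, result):
        # pyspider invokes this with each callback's return value;
        # result is None for callbacks that return nothing
        if not result:
            return
        print(result['url'], result['title'])  # replace with a database write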

Crawling all discussion topics on the https://www.v2ex.com community


from pyspider.libs.base_handler import *
import random
import MySQLdb


class Handler(BaseHandler):
    crawl_config = {}

    def __init__(self):
        self.db = MySQLdb.connect('localhost', 'root', '123456', 'onlineq', charset='utf8')

    # Store a crawled V2EX topic as a question with no comments
    def add_question(self, title, content):
        try:
            cursor = self.db.cursor()
            sql = ('insert into question(title, content, user_id, created_date, comment_count) '
                   'values (%s, %s, %s, now(), 0)')
            cursor.execute(sql, (title, content, random.randint(20, 22)))
            print(cursor.lastrowid)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.v2ex.com/', callback=self.index_page, validate_cert=False)

    # Follow each tab on the front page
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)

    # From a tab, follow each node ("go") board
    @config(priority=2)
    def tab_page(self, response):
        for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    # From a board, follow each topic plus the board's pagination links
    @config(priority=2)
    def board_page(self, response):
        for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
            url = each.attr.href
            # Strip the "#reply123" fragment so the same topic is not
            # re-crawled every time its reply count changes
            if url.find('#reply') > 0:
                url = url[0:url.find('#')]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        title = response.doc('h1').text()
        # html() returns None when a topic has no body text
        content = response.doc('div.topic_content').html() or ''
        self.add_question(title, content)
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }