
Crawler 1.1: scraping Doutula images (on the open and urlretrieve functions)

This article is just a tiny, tiny bit of progress from my studies as a NewBird ٩꒰▽ ꒱۶⁼³₌₃

Please don't laugh at me ⁄(⁄ ⁄•⁄ω⁄•⁄ ⁄)⁄

I'll just paste the code directly and skip the heavily technical talk.

1. The command to create a project:

scrapy startproject <project_name>

Example:

scrapy startproject myproject

After a successful creation, the project's folder layout looks like this:

myproject/
    scrapy.cfg          ------- the project's configuration file
    myproject/          ------- the project's Python module; you will add your code here later
        __init__.py
        items.py        ------- the project's item definitions
        pipelines.py    ------- the project's pipelines
        settings.py     ------- the project's settings file
        spiders/        ------- the directory holding the spider code
            __init__.py
            ...

The tree above is copied from someone else's post: https://www.cnblogs.com/pachongshou/p/6125858.html
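With the project scaffolding in place, the spider file itself can also be generated instead of written from scratch: scrapy genspider takes the spider name and the allowed domain (these match the name and allowed_domains used in step 3). Run it from the project root:

scrapy genspider doutu doutula.com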

2. Define the target: items.py

import scrapy

class Doutu2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    img_url = scrapy.Field()
    name = scrapy.Field()
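An Item behaves like a dict: the two fields declared above are read and written by key. A quick sketch of how the spider will use it (the values here are made up for illustration):

item = Doutu2Item()
item['img_url'] = 'http://example.com/a.gif'  # made-up URL, illustration only
item['name'] = 'some meme'
print(item['name'])  # -> some meme
print(dict(item))    # the pipeline turns it into a plain dict like this before inserting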


3. Build the spider (crawl first, then extract): spider.py

# -*- coding: utf-8 -*-
import scrapy
from ..items import Doutu2Item
from pyquery import PyQuery as pq
import os
import requests
from urllib import request
import re
class DoutuSpider(scrapy.Spider):
    name = 'doutu'
    allowed_domains = ['doutula.com']
    start_urls = ['http://doutula.com/photo/list/?page={}'.format(i)
                  for i in range(1, 3)]

    def parse(self, response):
        jpy = pq(response.text)  # I used PyQuery here
        Zurl = jpy('#pic-detail > div > div.col-sm-9 > div.random_picture > ul > li > div > div>a').items()
        i = 0
        for it in Zurl:  # loop over Zurl
            print(it.text())
            # instantiate an item object to hold the data
            item = Doutu2Item()
            # attr() reads an attribute in PyQuery;
            # the lines below fetch the URL for both GIFs and JPGs
            item['img_url'] = it('img').attr('data-original')
            item['name'] = it('p').text()
            if not item['img_url']:
                item['img_url'] = it('img').eq(1).attr('data-original')
            print(item['img_url'])
            i += 1
            # if os.path.exists('鬥圖'):
            #     print('the folder already exists')
            # else:
            #     os.makedirs('鬥圖')
            #     print('the folder has been created')
            if not os.path.exists('doutu'):
                print('Creating folder: {}'.format('doutu'))
                os.mkdir('doutu')
            if not os.path.exists('pic'):
                print('Creating folder: {}'.format('pic'))
                os.mkdir('pic')
            # use a regex to replace characters that are illegal in file names
            rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
            new_title = re.sub(rstr, "_", item['name'])  # replace them with underscores
            # First way to save: I wasn't sure how to handle the file path when
            # opening the file, so I got it wrong a few times; a relative path
            # is the easier choice.
            with open('pic/%s.jpg' % new_title, 'wb') as f:
                f.write(requests.get(item['img_url']).content)
            # Second way to save
            try:
                request.urlretrieve(item['img_url'], 'doutu/%s.gif' % new_title)
            except:
                pass
            print(i)
            print('__________________________________________________')
            yield item
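Since the title promises something about open and urlretrieve, here are the two saving approaches from parse pulled out side by side. Both end up writing the same bytes to disk; they differ only in who does the downloading. The URL and file names below are placeholders:

from urllib import request

import requests

url = 'http://example.com/a.gif'  # placeholder URL

# Way 1: download the bytes yourself with requests, then write them out
# with the built-in open(). 'wb' is required: image data is binary, not text.
with open('a.jpg', 'wb') as f:
    f.write(requests.get(url).content)

# Way 2: urlretrieve downloads and writes in a single call; it returns
# the local file name and the response headers.
filename, headers = request.urlretrieve(url, 'a.gif')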

4. Process the items the spider extracts: pipelines.py

from scrapy.exceptions import DropItem
from scrapy import log
import json
from pymongo import MongoClient
from scrapy.conf import settings


class Doutu2Pipeline(object):
    # this is the initializer: do the database setup here
    def __init__(self):
        connection=MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db=connection[settings['MONGODB_DB']]
        self.collection=db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        print("the item has been inserted into the database")
        valid = True
        for data in item:
            if not item[data]:
                valid = False
                raise DropItem('Missing {}'.format(data))
        if valid:
            log.msg('stored in the database', level=log.DEBUG, spider=spider)
        return item
The __init__ method runs as soon as an object of the class is created, so it is the place to do whatever initialization you want for your object. Note that the name begins and ends with a double underscore.
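One thing worth noticing: process_item above inserts into Mongo first and validates afterwards, so a dropped item has already been written. Here is a sketch of the same method with the order swapped, using pymongo's insert_one (the pymongo 3+ replacement for insert). This is my variant, not the original code:

    def process_item(self, item, spider):
        # validate first: drop the item before it ever reaches the database
        for field in item:
            if not item[field]:
                raise DropItem('Missing {}'.format(field))
        self.collection.insert_one(dict(item))
        spider.logger.debug('stored in the database')
        return item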


5. The configuration file: settings.py

# Scrapy settings for doutu2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'doutu2'
SPIDER_MODULES = ['doutu2.spiders']
NEWSPIDER_MODULE = 'doutu2.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
   'doutu2.middlewares.Doutu2SpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'doutu2.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'doutu2.pipelines.Doutu2Pipeline': 300,
}
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "test"
MONGODB_COLLECTION = "doutu2"
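These four MONGODB_* keys are what the pipeline's __init__ reads through scrapy.conf.settings. scrapy.conf was removed in later Scrapy versions; the supported way to reach the same values is a from_crawler classmethod. A sketch of that variant (untested against this project):

from pymongo import MongoClient


class Doutu2Pipeline(object):
    def __init__(self, server, port, db, collection):
        client = MongoClient(server, port)
        self.collection = client[db][collection]

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes everything defined in settings.py
        s = crawler.settings
        return cls(s['MONGODB_SERVER'], s.getint('MONGODB_PORT'),
                   s['MONGODB_DB'], s['MONGODB_COLLECTION'])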


6. Middleware for tweaking requests and responses: middlewares.py

I haven't really used this part yet.
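Finally, to run everything, use the spider's name attribute (doutu, not the file name) from anywhere inside the project:

scrapy crawl doutu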