1. 程式人生 > >python中scrapy框架爬取攜程景點資料

python中scrapy框架爬取攜程景點資料

---------------------------------------------------------------------------------------------
[版權聲明:本文係作者原創,轉載請註明出處] 
文章出處:https://blog.csdn.net/sdksdk0/article/details/82381198

作者:朱培      ID:sdksdk0     
--------------------------------------------------------------------------------------------

本文使用 Scrapy 框架與 Python 3.6 進行爬取,主要獲取攜程網上河南省各景點的名稱、地址、省市縣、描述及圖片地址等資訊。首先通過搜尋可以得到河南的頁面地址:http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/ ,然後以這個頁面為起始位置逐頁爬取,並將爬取到的資料儲存到 MySQL 資料庫中。

1、建立scrapy專案

scrapy startproject ctrip

2、首先進入ctrip資料夾,然後建立 spider

scrapy genspider scenic "ctrip.com"

3、settings.py檔案中:

# Name of this Scrapy project / bot.
BOT_NAME = "ctrip"

# Package that holds the spiders; new spiders are generated into the same one.
NEWSPIDER_MODULE = "ctrip.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# robots.txt is not consulted by this crawler.
ROBOTSTXT_OBEY = False

# Headers attached to every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
}

# Enables the random User-Agent middleware from middlewares.py.
DOWNLOADER_MIDDLEWARES = {"ctrip.middlewares.UserAgentDownloadMiddleware": 543}

# Enables the MySQL pipeline from pipelines.py.
ITEM_PIPELINES = {"ctrip.pipelines.DBPipeline": 300}

4、middlewares.py中

import random


class UserAgentDownloadMiddleware(object):
    """Downloader middleware that gives every outgoing request a random User-Agent."""

    # Pool of browser identification strings; one is drawn per request.
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    ]

    def process_request(self, request, spider):
        """Overwrite the ``User-Agent`` header of *request* with a random pool entry.

        Nothing is returned (implicit ``None``); the request is modified in place.
        *spider* is accepted for the middleware signature but not used.
        """
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)

5、items.py

import scrapy


class ScenicItem(scrapy.Item):
    """Container for one scenic spot scraped from a Ctrip listing page."""

    # --- identification ---
    code = scrapy.Field()        # sequential code assigned by the spider ("A" + counter)
    name = scrapy.Field()        # display name of the scenic spot

    # --- administrative region, derived from the address text ---
    province = scrapy.Field()
    city = scrapy.Field()
    county = scrapy.Field()

    # --- free-text details ---
    address = scrapy.Field()
    descript = scrapy.Field()    # feature / highlight blurb shown in the listing

    # --- links ---
    scenic_url = scrapy.Field()  # stored in the ctrip_url column by the DB pipeline
    image_url = scrapy.Field()   # thumbnail image address

6、scenic.py

# -*- coding: utf-8 -*-
import scrapy
import re
from ctrip.items import ScenicItem

class ScenicSpider(scrapy.Spider):
    """Crawl Ctrip's ticket listing for Henan and yield one ``ScenicItem`` per result.

    Each result row provides name, address, feature text, thumbnail and detail
    link; province / city / county are derived from the address text. The
    spider follows the "next page" link until none is found.
    """

    name = 'scenic'
    allowed_domains = ['ctrip.com']
    start_urls = ['http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/']
    count = 0

    # Province reported whenever the address does not yield a full
    # province + city + county match.
    _DEFAULT_PROVINCE = "河南省"

    # Label prefixes stripped from the scraped text. Both the ASCII and the
    # full-width colon are accepted.
    # NOTE(review): the live page presumably uses the full-width colon - confirm.
    _RE_ADDRESS_LABEL = re.compile(r"地址[:：]")
    _RE_FEATURE_LABEL = re.compile(r"特色[:：]")

    # Address patterns. County/district suffixes are accepted in both
    # traditional (縣/區) and simplified (县/区) form and are captured so the
    # result carries the character that actually matched.
    _RE_PROV_CITY_COUNTY = re.compile(r'(.*)[?省](.+?)市(.+?)([縣县區区])')
    _RE_PROV_CITY_CITY = re.compile(r'(.*)[?省](.+?)市(.+?)市')
    _RE_PROV_CITY = re.compile(r'(.*)[?省](.+?)市')
    _RE_PROV_COUNTY = re.compile(r'(.*)[?省](.+?)([縣县區区])')
    _RE_CITY_COUNTY = re.compile(r'(.+?)市(.+?)([縣县區区])')
    _RE_CITY = re.compile(r'(.+?)市')
    _RE_COUNTY = re.compile(r'(.+?)([縣县])')

    @classmethod
    def _split_region(cls, address):
        """Return ``(province, city, county)`` parsed from *address*.

        Parts that cannot be recognised are returned as ``""``; the province
        falls back to ``_DEFAULT_PROVINCE`` unless a complete
        province/city/county chain is matched.
        """
        province, city, county = cls._DEFAULT_PROVINCE, "", ""

        if "省" in address:
            m = cls._RE_PROV_CITY_COUNTY.match(address)
            if m:
                # Use the suffix that was actually matched instead of
                # guessing from the whole address.
                return m.group(1) + "省", m.group(2) + "市", m.group(3) + m.group(4)
            # Prefecture-level city followed by a county-level city.
            m = cls._RE_PROV_CITY_CITY.match(address)
            if m:
                return province, m.group(2) + "市", m.group(3) + "市"
            m = cls._RE_PROV_CITY.match(address)
            if m:
                return province, m.group(2) + "市", county
            m = cls._RE_PROV_COUNTY.match(address)
            if m:
                county = m.group(2) + m.group(3)
            return province, city, county

        m = cls._RE_CITY_COUNTY.match(address)
        if m:
            return province, m.group(1) + "市", m.group(2) + m.group(3)
        m = cls._RE_CITY.match(address)
        if m:
            return province, m.group(1) + "市", county
        m = cls._RE_COUNTY.match(address)
        if m:
            county = m.group(1) + m.group(2)
        return province, city, county

    @staticmethod
    def _first_text(node, query):
        """Return the stripped first XPath result of *query*, or ``""`` if absent."""
        return (node.xpath(query).get() or "").strip()

    def parse(self, response):
        """Yield a ``ScenicItem`` for every result row, then request the next page."""
        rows = response.xpath(
            "//div[@id='searchResultContainer']//div[@class='searchresult_product04']")

        for row in rows:
            href = row.xpath(".//div[1]/a/@href").get()
            if not href:
                # Without a detail link the row cannot be identified or
                # de-duplicated; skip it instead of aborting the whole page.
                self.logger.warning("Skipping result without detail link on %s", response.url)
                continue

            # The absolute detail URL is what the DB pipeline de-duplicates on,
            # so it must be unique per scenic spot (it used to be always "").
            scenic_url = response.urljoin(href)
            image_url = row.xpath(".//div[1]/a/img/@src").get()

            # 'adress' (sic) is the class name used in the page markup.
            address = self._RE_ADDRESS_LABEL.sub(
                "", self._first_text(row, ".//div[1]/div[@class='adress']//text()"))
            descript = self._RE_FEATURE_LABEL.sub(
                "", self._first_text(row, ".//div[1]/div[@class='exercise']//text()"))
            name = self._first_text(row, ".//div[1]//h2/a/text()")

            province, city, county = self._split_region(address)

            # Running counter provides a simple sequential code: A1, A2, ...
            self.count += 1
            code = "A" + str(self.count)

            yield ScenicItem(name=name, province=province, city=city, county=county,
                             address=address, descript=descript,
                             scenic_url=scenic_url, image_url=image_url, code=code)

        # NOTE(review): purely positional selector for the "next page" link;
        # verify it against the current page layout.
        next_url = response.xpath('//*[@id="searchResultContainer"]/div[11]/a[11]/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)

7、pipelines.py,將資料儲存到mysql資料庫中

import pymysql


# 用於資料庫儲存
class DBPipeline(object):
    """Item pipeline that stores scraped scenic spots in the MySQL table ``a_scenic``.

    An item whose ``scenic_url`` already exists in the ``ctrip_url`` column is
    treated as a duplicate and not inserted again.
    """

    _SELECT_SQL = "select 1 from a_scenic where ctrip_url = %s limit 1"
    _INSERT_SQL = (
        "insert into a_scenic"
        "(code, province, city, county, name, description, ctrip_url, image_url, address, type) "
        "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )

    def __init__(self):
        # Open the database connection.
        # NOTE(review): credentials are hard-coded; consider moving them to
        # the project settings.
        self.connect = pymysql.connect(
            host='localhost',
            port=3306,
            db='edu_demo',
            user='root',
            passwd='123456',
            charset='utf8',
            use_unicode=True)

        # All statements are executed through this cursor.
        self.cursor = self.connect.cursor()

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.connect.close()

    def _is_duplicate(self, scenic_url):
        """Return True if a row with this URL is already stored.

        An empty URL cannot identify a scenic spot, so it is never considered
        a duplicate; otherwise every URL-less item after the first would be
        silently dropped.
        """
        if not scenic_url:
            return False
        self.cursor.execute(self._SELECT_SQL, (scenic_url,))
        return self.cursor.fetchone() is not None

    def process_item(self, item, spider):
        """Insert *item* unless it is a duplicate; always return *item*.

        Database errors are logged through the spider's logger and the
        transaction is rolled back; they do not stop the crawl.
        """
        try:
            if not self._is_duplicate(item['scenic_url']):
                self.cursor.execute(
                    self._INSERT_SQL,
                    (item['code'],
                     item['province'],
                     item['city'],
                     item['county'],
                     item['name'],
                     item['descript'],
                     item['scenic_url'],
                     item['image_url'],
                     item['address'],
                     '1'))  # constant value for the `type` column
                # Commit only when something was actually written.
                self.connect.commit()
        except Exception:
            # Best-effort storage: record the failure and undo the partial
            # transaction instead of silently printing the error.
            spider.logger.exception("Failed to store item %r", item.get('name'))
            self.connect.rollback()
        return item

8、start.py

from scrapy import cmdline

# Launch the "scenic" spider exactly as `scrapy crawl scenic` would.
# Guarded so that merely importing this module does not start a crawl.
if __name__ == "__main__":
    cmdline.execute(["scrapy", "crawl", "scenic"])

9、執行start.py即可