1. 程式人生 > >用scrapy獲取代理ip地址

用scrapy獲取代理ip地址

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GetproxyItem(scrapy.Item):
    """Container for one scraped proxy entry.

    Each attribute below declares a key that can be set on the item with
    ``item['<key>'] = value``.
    """

    # One declaration per line: every field must be an *instance* created by
    # calling scrapy.Field().
    ip = scrapy.Field()
    port = scrapy.Field()
    type = scrapy.Field()
    location = scrapy.Field()
    protocol = scrapy.Field()
    source = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class GetproxyPipeline(object):
    """Item pipeline that appends every processed item to ``proxy.txt``.

    Each item becomes one line of tab-separated, UTF-8 encoded columns in the
    order given by ``_COLUMNS``.
    """

    # Column order of the output line.
    _COLUMNS = ('ip', 'port', 'protocol', 'type', 'location', 'source')

    def process_item(self, item, spider):
        """Append *item* to ``proxy.txt`` and return it unchanged.

        Args:
            item: mapping providing every key listed in ``_COLUMNS``; values
                are text (or already-encoded bytes).
            spider: the spider that produced the item (unused).

        Returns:
            The same *item* object that was passed in.

        Raises:
            KeyError: if *item* lacks one of the expected keys. The record is
                assembled before the file is opened, so nothing is written
                in that case.
        """
        fileName = 'proxy.txt'
        # Build the whole record first so a missing key cannot leave a
        # half-written line behind.
        record = b'\t'.join(self._as_utf8(item[key]) for key in self._COLUMNS)
        # Binary append: the record is bytes, and mixing encoded bytes with
        # str separators raises TypeError on Python 3.
        with open(fileName, 'ab') as fp:
            fp.write(record + b'\n')
        return item

    @staticmethod
    def _as_utf8(value):
        """Return *value* as UTF-8 bytes with surrounding whitespace removed."""
        if not isinstance(value, bytes):
            value = value.encode('utf8')
        return value.strip()

proxy360pider.py

# -*- coding: utf-8 -*-
import scrapy
from getProxy.items import GetproxyItem

class Proxy360piderSpider(scrapy.Spider):
    """Spider that scrapes proxy entries from proxy360.cn region pages.

    One start URL is generated per entry in ``nations``; every matching row
    on a page is turned into a ``GetproxyItem``.
    """

    name = "proxy360pider"
    allowed_domains = ["proxy360.cn"]

    # Region names appended to the listing URL, one start URL per region.
    nations = ['Brazil','China','Taiwan','Japan','Thailand','Vietnam','bahrenin']
    start_urls = ['http://www.proxy360.cn/Region/' + nation for nation in nations]

    # Item key filled from the n-th ``span`` of a row (1-based, in this order).
    _SPAN_FIELDS = ('ip', 'port', 'type', 'location')

    def parse(self, response):
        """Build one item per complete proxy row found in *response*.

        A row for which one of the span lookups yields no text is logged and
        skipped, so a single malformed row no longer raises IndexError and
        discards the items already collected from the page.

        Args:
            response: the downloaded page to extract rows from.

        Returns:
            A list of ``GetproxyItem`` objects, one per complete row.
        """
        rows = response.xpath('//div[@class="proxylistitem" and @name="list_proxy_ip"]')
        items = []
        for row in rows:
            item = GetproxyItem()
            for position, field in enumerate(self._SPAN_FIELDS, 1):
                texts = row.xpath('.//span[%d]/text()' % position).extract()
                if not texts:
                    self.log('Skipping proxy row without %s on %s'
                             % (field, response.url))
                    break
                item[field] = texts[0]
            else:
                # Only reached when every span-based field was found.
                item['protocol'] = 'http'
                item['source'] = 'proxy360'
                items.append(item)

        return items
部分代理ip
210.246.192.149 80  http    高匿  泰國  proxy360
118.175.255.10  80  http    高匿  泰國  proxy360
203.158.167.152 8080    http    高匿  泰國  proxy360
58.147.80.194   3128    http    高匿  泰國  proxy360
122.155.0.244   3128    http    透明  泰國  proxy360
203.151.233.143 80  http    高匿  泰國  proxy360