在scrapy框架Header中使用Content-Length欄位使爬蟲返回400錯誤的問題
阿新 • 發佈:2018-11-02
之前在抓一個網站Danaos的時候,發現能用requests和postman傳送同樣的header和payload的時候都能得到正確的結果,但是scrapy就會返回400錯誤,後來發現這是Twisted本身存在的問題,看了官網也沒找到解決方法
spider檔案在這裡:
class DanspiderSpider(scrapy.Spider):
    """POST spider for the Danaos ContentAssetService JSON endpoint.

    Fix: the original header dict hard-coded ``"Content-Length": "179"``.
    Twisted (scrapy's HTTP client) computes Content-Length from the request
    body itself, and a manually supplied value makes the server answer 400
    (the behavior this article documents).  The header is therefore omitted
    so Twisted can set it correctly.  The original also had a stray
    ``,Host: www.danaos.com`` fused into the Cookie value — ``Host`` is a
    separate header that scrapy derives from the URL, so it is removed from
    the cookie string.
    """

    name = 'danspider'
    allowed_domains = ['www.danaos.com']

    def start_requests(self):
        # JSON body expected by GetContentAssetList; sent verbatim as a string.
        payload = '{"serviceDto":{"ViewType":"2","ViewDate":"","RevisionNumber":"1","LanguageId":"1","ItemCount":-1,"StartIndex":0,"Signature":"","TagList":[]},"year":-1,"assetType":"Fleet_Details"}'
        url = "https://www.danaos.com/Services/ContentAssetService.svc/GetContentAssetList"
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
            # NOTE: intentionally no "Content-Length" here — Twisted derives
            # it from the body; supplying it manually triggers the 400.
            "Content-Type": "application/json; charset=UTF-8",
            "Cookie": "_ga=GA1.2.757680490.1537640028; _gid=GA1.2.1595345749.1537640028; _gat=1",
            "Origin": "https://www.danaos.com",
            "Referer": "https://www.danaos.com/fleet/fleet-details/default.aspx",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
            "X-NewRelic-ID": "VQYBUlRVChABXFNXBAcCXw==",
            "X-Requested-With": "XMLHttpRequest",
        }
        yield scrapy.Request(url=url, body=payload, method="POST",
                             headers=headers, callback=self.parse)

    def parse(self, response):
        # Dump the raw JSON response for inspection.
        print(response.text)
用twisted改寫這個爬蟲,去掉了content-length欄位,只留下了Content-Type欄位,是可以獲取返回結果的
twisted_danaos.py:
# twisted_danaos.py — reproduce the Danaos POST with Twisted's Agent directly.
# The Content-Length header is deliberately absent: Twisted computes it from
# the IBodyProducer's ``length``; only Content-Type is sent explicitly.
from __future__ import print_function

from pprint import pformat

from twisted.internet import reactor
from twisted.internet.defer import Deferred
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent
from twisted.web.http_headers import Headers
from twisted.web.iweb import UNKNOWN_LENGTH

from bytesprod import BytesProducer


class BeginningPrinter(Protocol):
    """Body-consumer protocol that prints up to the first 10 KiB received."""

    def __init__(self, finished):
        # Deferred fired (with None) once the connection closes.
        self.finished = finished
        # Remaining byte budget to display; decremented as data arrives.
        self.remaining = 1024 * 10

    def dataReceived(self, bytes):
        # Print chunks only while the display budget is positive.
        if self.remaining:
            display = bytes[:self.remaining]
            print('Some data received:')
            print(display)
            self.remaining -= len(display)

    def connectionLost(self, reason):
        # ``reason`` is a Failure; getErrorMessage() says why the body ended
        # (normally ResponseDone).  Signal completion to the caller.
        print('Finished receiving body:', reason.getErrorMessage())
        self.finished.callback(None)


agent = Agent(reactor)
# Raw JSON payload, wrapped in an IBodyProducer so Agent can stream it and
# derive Content-Length from its ``length`` attribute.
body = BytesProducer(b'{"serviceDto":{"ViewType":"2","ViewDate":"","RevisionNumber":"1","LanguageId":"1","ItemCount":-1,"StartIndex":0,"Signature":"","TagList":[]},"year":-1,"assetType":"Fleet_Details"}')
# Only Content-Type is set by hand; adding Content-Length here is what breaks
# the request (see the surrounding article).  Header values are lists, as
# twisted.web.http_headers.Headers requires.
headers = {
    "Content-Type": ["application/json; charset=UTF-8"],
}
d = agent.request(
    b'POST',
    b'https://www.danaos.com/Services/ContentAssetService.svc/GetContentAssetList',
    Headers(headers), body)


def cbRequest(response):
    # Fired with the response object once headers arrive; the body is
    # delivered separately through the protocol below.
    print('Response version:', response.version)
    print('Response code:', response.code)
    print('Response phrase:', response.phrase)
    print('Response headers:')
    print(pformat(list(response.headers.getAllRawHeaders())))
    finished = Deferred()
    response.deliverBody(BeginningPrinter(finished))
    # Returning the Deferred keeps the callback chain open until the body
    # is fully received.
    return finished


d.addCallback(cbRequest)


def cbShutdown(ignored):
    # Stop the reactor on success or failure so the script terminates.
    reactor.stop()


d.addBoth(cbShutdown)
reactor.run()
# bytesprod.py — in-memory body producer used by twisted_danaos.py.
from zope.interface import implementer

from twisted.internet.defer import succeed
from twisted.web.iweb import IBodyProducer


@implementer(IBodyProducer)
class BytesProducer(object):
    """Minimal ``IBodyProducer`` that hands over a fixed bytes payload.

    Twisted's Agent reads ``length`` to emit the Content-Length header and
    calls ``startProducing`` to push the body to the transport.
    """

    def __init__(self, body):
        # Keep the payload and advertise its exact size up front.
        self.length = len(body)
        self.body = body

    def startProducing(self, consumer):
        # The entire payload is already in memory: deliver it with a single
        # write and report immediate completion.
        consumer.write(self.body)
        return succeed(None)

    def pauseProducing(self):
        # No-op: production completes synchronously in startProducing.
        pass

    def stopProducing(self):
        # No-op: there is nothing in flight to cancel.
        pass
但是header中如果加上content-length欄位,爬蟲就會失敗,但是加上其他欄位不會影響結果獲取