Python Scrapy 煎蛋網妹子圖例項
阿新 • • 發佈:2018-11-29
前面介紹了爬蟲框架的一個例項,那個比較簡單,這裡在介紹一個例項
爬取煎蛋網妹子圖。遺憾的是,上週煎蛋網還有妹子圖欄目,但是這週妹子圖已經變成了隨手拍,
不過沒關係,我們爬圖的目的是為了加強實戰應用,管他什麼圖了先爬下來再說。
言歸正傳
這個例項,主要是將圖片爬了下來,儲存到本地,這個在上週已經實現了,這週又進一步將圖片資訊儲存到了資料庫中。
程式碼結構
如上圖
common 用來存放公共程式碼,如上篇對資料庫的操作進行了封裝,這裡直接就放到了 common 資料夾下,方便呼叫。
緊接著下面兩個資料夾是用來 存放爬取的圖片的,第一個是 這周爬的 隨手拍的圖片,第二個是 妹子圖的圖片。
然後下面就是 爬蟲檔案了。
以下貼出對應檔案的程式碼
JdwSpider.py
# -*- coding: utf-8 -*- import base64 import scrapy import JianDanW.items as items from bs4 import BeautifulSoup class JdwspiderSpider(scrapy.Spider): name = 'JdwSpider' allowed_domains = ['jandan.net'] start_urls = ['http://jandan.net/ooxx/'] # start_urls = ['http://jandan.net/ooxx/page-1#comments'] def parse(self, response): item = items.JiandanwItem() # 通過 response.text 獲取 html 原始檔 html = response.text # 使用 lxml 解析器解析 html 此時 soup 為 html 樣式檔案。 soup = BeautifulSoup(html, 'lxml') # 查詢 html 中的 img-hash 返回 包含 class="img-hash" 的 列表 tags = soup.select('.img-hash') imgUrlList = [] for tag in tags: # tag 為 <span class="img-hash">Ly93eDQuc2luYWltZy5jbi9tdzYwMC82NmIzZGUxN2d5MWZ4bzZqaXM0aWVqMjFtYTB1MHg2ci5qcGc=</span> # img_hash = Ly93eDQuc2luYWltZy5jbi9tdzYwMC82NmIzZGUxN2d5MWZ4bzZqaXM0aWVqMjFtYTB1MHg2ci5qcGc= img_hash = tag.text # img_hash 進行解密 為 //wx4.sinaimg.cn/mw600/66b3de17gy1fxo6jis4iej21ma0u0x6r.jpg img_url = base64.b64decode(img_hash).decode('utf-8') # 將結果載入到列表 imgUrlList.append(img_url) # print(img_urls) # 將列表複製給item 對應的 image_urls item['image_urls'] = imgUrlList yield item # 獲取翻頁 下一頁 連線 p_url = response.xpath('//a[@class="previous-comment-page"]//@href').extract_first() # 如果存在下一頁 if p_url: p_url = str(p_url) hurl = 'http:' page_url = hurl + p_url # 如果存在下一頁,回撥 parse 函式 yield scrapy.Request(page_url,callback=self.parse)
items.py
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy class JiandanwItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() image_urls = scrapy.Field()#圖片的連結
pipelines.py
# -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html import os import urllib import common.DBHandle as DBHandle class JiandanwPipeline(object): def process_item(self, item, spider): # 資料庫連線 host = '127.0.0.1' username = 'adil' password = 'helloyyj' database = 'AdilTest' port = 3306 DbHandle = DBHandle.DataBaseHandle(host, username, password, database, port) for image_url in item['image_urls']: # 截圖圖片連結 list_name = image_url.split('/') # 獲取圖片名稱 file_name = list_name[len(list_name) - 1] # 圖片名稱 # 獲取當前路徑 currentPath = os.getcwd() # 拼接圖片存放路徑 file_path = os.path.join(currentPath,spider.name) # 如果圖片路徑不存在,建立該檔案路徑 if not os.path.exists(file_path): os.makedirs(file_path) # 補全圖片路徑 path_name = os.path.join(file_path,file_name) # 獲取有效的url 因為 image_url = //wx4.sinaimg.cn/mw600/66b3de17gy1fxo6jis4iej21ma0u0x6r.jpg image_url = 'http:' + image_url # 此處執行 資料庫插入,將 圖片名稱、url 插入到資料庫 注意 這裡的 values('佔位符 一定要用 引號引起來,要不然執行不成功,血的教訓') sql = "insert into JdwSpider(image_name,image_url) values ('%s','%s')" % (file_name,image_url) # 如果不執行插入,可以註釋改該行程式碼 DbHandle.insertDB(sql) # 圖片儲存 with open(path_name, 'wb') as file_writer: conn = urllib.request.urlopen(image_url) # 下載圖片 # 儲存圖片 file_writer.write(conn.read()) file_writer.close() # 關閉資料庫 DbHandle.closeDb() return item
配置 setting.py
# -*- coding: utf-8 -*-

# Scrapy settings for the JianDanW project.
#
# Only the commonly-tuned settings are set here; for everything else see
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'JianDanW'

SPIDER_MODULES = ['JianDanW.spiders']
NEWSPIDER_MODULE = 'JianDanW.spiders'

# Crawl responsibly: honour the target site's robots.txt rules.
ROBOTSTXT_OBEY = True

# Delay (seconds) between requests to the same website, to avoid hammering
# the server. See autothrottle settings for an adaptive alternative.
DOWNLOAD_DELAY = 3

# Route scraped items through the image-saving / database pipeline.
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'JianDanW.pipelines.JiandanwPipeline': 300,
}
遇到的問題
因為煎蛋網 使用了反爬蟲策略,所以導致 常規的爬蟲 失敗
需要增加 url 解密操作,具體 爬蟲程式碼裡有介紹
關於資料庫操作的封裝請看上篇介紹。
如果還有什麼問題,可以評論區內提問,或是QQ 聯絡。
如果覺得有用,歡迎打賞哦,哈哈哈~