1. 程式人生 > >記錄一下xpath提取不到iframe多層巢狀的問題

記錄一下xpath提取不到iframe多層巢狀的問題

今天爬取中彩網福彩3D(http://www.zhcw.com/3d/)的時候,碰到 iframe 巢狀,xpath 始終取不到值,如下圖:

[圖片:在這裡插入圖片描述]

無論怎麼取值,結果都為 null。後來發現 iframe 裡有個這樣的東西(一個 url):

[圖片:在這裡插入圖片描述]

然後直接進入到這個 url 裡面,就可以取到值了:

[圖片:在這裡插入圖片描述]

好了,問題解決。查閱網上資料,聽說也可以正面攻克(直接處理巢狀的 iframe),但比較麻煩,不推薦花時間去做這東西。

最後附上本人程式碼,爬蟲框架用的是 scrapy,儲存用的是 MySQL 資料庫。

items

import scrapy


class Lottery3DItem(scrapy.Item):
    """Fields of a single 3D lottery draw record."""

    # Draw date
    date = scrapy.Field()
    # Issue number
    issue = scrapy.Field()
    # First blue-ball number
    blue1 = scrapy.Field()
    # Second blue-ball number
    blue2 = scrapy.Field()
    # Third blue-ball number
    blue3 = scrapy.Field()

spider

# -*- coding: utf-8 -*-
import scrapy
from ..items import Lottery3DItem


class LotterySpider(scrapy.Spider):
    """Crawl the paginated 3D lottery result lists and yield one item per row."""

    name = 'lottery'
    allowed_domains = ['zhcw.com']
    start_urls = ['http://kaijiang.zhcw.com/zhcw/html/3d/list_1.html']
    # Number of the list page being parsed; incremented after each page.
    index = 1
    # Not referenced by this spider's own code; kept so that any external
    # reference to the attribute still resolves.
    items = []

    def parse(self, response):
        """Extract every draw row from a list page, then request the next page.

        Yields:
            ``Lottery3DItem`` objects for each data row, followed by a
            ``scrapy.Request`` for the next list page. Nothing is yielded
            (and pagination stops) when the page contains no data rows.
        """
        # The first two <tr> rows and the last one are skipped as non-data
        # rows (presumably table headers and the pager row -- confirm against
        # the page markup). Slicing, unlike repeated pop(), does not raise
        # IndexError when a page has fewer than three rows.
        node_list = response.xpath("//tr")[2:-1]
        if not node_list:
            # No draw rows on this page: treat it as the end of the listing
            # instead of requesting further pages indefinitely.
            return

        for node in node_list:
            item = Lottery3DItem()
            item["date"] = node.xpath("./td[1]/text()").extract_first()
            item["issue"] = node.xpath("./td[2]/text()").extract_first()
            item["blue1"] = node.xpath("./td[3]/em[1]/text()").extract_first()
            item["blue2"] = node.xpath("./td[3]/em[2]/text()").extract_first()
            item["blue3"] = node.xpath("./td[3]/em[3]/text()").extract_first()
            yield item

        self.index += 1
        next_url = "http://kaijiang.zhcw.com/zhcw/html/3d/list_{}.html".format(self.index)
        yield scrapy.Request(url=next_url, callback=self.parse)

pipeline

import pymysql


class Lottery3DPipeline(object):
    """Item pipeline that inserts each scraped item into the ``lottery_3d`` MySQL table."""

    def __init__(self):
        """Open the MySQL connection and the cursor used for all inserts.

        Connection settings can be overridden with the environment variables
        ``LOTTERY_DB_HOST``, ``LOTTERY_DB_USER``, ``LOTTERY_DB_PASSWORD`` and
        ``LOTTERY_DB_NAME``. The previously hard-coded values remain as
        fallbacks so existing deployments keep working.
        """
        import os

        # NOTE(review): the fallback credentials below are committed in source,
        # which is a security risk. Rotate them and supply the real values via
        # the environment (or project settings), then drop these literals.
        self.conn = pymysql.connect(
            host=os.environ.get('LOTTERY_DB_HOST', '103.27.5.156'),
            user=os.environ.get('LOTTERY_DB_USER', 'developer'),
            passwd=os.environ.get('LOTTERY_DB_PASSWORD', 'Developer!123'),
            db=os.environ.get('LOTTERY_DB_NAME', 'spider'),
            charset='utf8',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item as a row and commit.

        Args:
            item: Mapping with the keys ``date``, ``issue``, ``blue1``,
                ``blue2`` and ``blue3``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            KeyError: If a required key is missing from ``item``.
            Exception: Any error from the insert or commit is re-raised after
                the transaction has been rolled back.
        """
        sql = "insert into lottery_3d(date, issue, blue1, blue2, blue3) VALUES(%s, %s, %s, %s, %s)"
        params = (
            item['date'],
            item['issue'],
            item['blue1'],
            item['blue2'],
            item['blue3'],
        )
        try:
            # Values are passed separately so the driver handles escaping.
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception:
            # Discard the failed transaction so the shared connection stays
            # usable for the next item, then let the error propagate.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor fails.
            self.conn.close()