解析庫使用(XPath)《Python3網路爬蟲開發實戰》
阿新 • • 發佈:2018-11-20
僅做記錄
XPath對網頁進行解析的過程:
from lxml import etree

# Sample markup used by the first demo. The last <li> is left unclosed in
# the source string. The literal is assembled from adjacent pieces so that
# its value is exactly the single-line string of the original.
text = (
    ' <div> <ul> '
    '<li class="item-0"><a href="link1.html">first item</a></li> '
    '<li class="item-1"><a href="link2.html">first item</a></li> '
    '<li class="item-inactive"><a href="link3.html">first item</a></li> '
    '<li class="item-1"><a href="link4.html">first item</a></li> '
    '<li class="item-0"><a href="link5.html">fifth item</a> '
    '</ul> </div> '
)

# Build an element tree from the string and serialise it back out.
html = etree.HTML(text)
result = etree.tostring(html)

print(result.decode('utf-8'))  # decoded first, so this prints a str
print(result)  # printed as-is, i.e. as bytes
# Alternatively, parse an external file instead of an in-memory string.
html = etree.parse('./test.html', parser=etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))
# All nodes: '//*' selects every element in the document.
html = etree.parse('./test.html', parser=etree.HTMLParser())
result = html.xpath('//*')
print(result)
# Specific nodes: select every <li> element.
html = etree.parse('./test.html', parser=etree.HTMLParser())
result = html.xpath('//li')
print(result)  # the result is a list
print(result[0])
# Child nodes: '/' steps to direct children, '//' to any descendant.
html = etree.parse('./test.html', parser=etree.HTMLParser())

# All <a> nodes that are direct children of an <li>.
result = html.xpath('//li/a')

# All <a> nodes anywhere beneath a <ul> (descendants).
result2 = html.xpath('//ul//a')

# <a> nodes that are direct children of a <ul>; per the original note this
# yields no matches for the sample file.
result3 = html.xpath('//ul/a')
# Parent node: read the class attribute of the parent of a given <a>.
html = etree.parse('./test.html', parser=etree.HTMLParser())

# '..' steps up to the parent.
result = html.xpath('//a[@href="link4.html"]/../@class')

# 'parent::*' is another way to reach the parent.
result2 = html.xpath('//a[@href="link3.html"]/parent::*/@class')
# Attribute matching: keep only <li> elements whose class is "item-1"
# (the original note says the sample file has two of them).
html = etree.parse('./test.html', parser=etree.HTMLParser())
result = html.xpath('//li[@class="item-1"]')
# Text extraction.

# Direct-node form: the text of a specific child node.
html = etree.parse('./test.html', parser=etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/a/text()')

# Descendant form: all text beneath the node. Per the original note, the
# newline on the last <li> is returned as well.
html = etree.parse('./test.html', parser=etree.HTMLParser())
result2 = html.xpath('//li[@class="item-0"]//text()')
# Attribute retrieval: '@href' as the final step returns the attribute
# values themselves. Note the difference from attribute matching, where a
# predicate such as [@href="link4.html"] only filters nodes.
html = etree.parse('./test.html', parser=etree.HTMLParser())
result = html.xpath('//li/a/@href')
# Multi-valued attribute matching: the class attribute carries several
# values, so contains() is used instead of an exact '=' comparison.
text = ' <li class="li li-first"><a href="link1.html">first item</a></li> '

html = etree.HTML(text)  # parse the in-memory "text" version of the HTML
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
# Matching on several attributes at once: combine conditions with 'and'.
text = (
    ' <li class="myli li-first" name="item">'
    '<a href="link1.html">first item</a></li> '
)

html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "myli") and @name="item"]/a/text()')
print(result)
# Positional selection (positions start at 1, not 0).
#
# The queries below need a list with several <li> items. Previously this
# section parsed whatever `text` held at this point, which was a fragment
# with a single <li>, so 'position()<3' and 'last()-2' could not show what
# their comments describe. The document is therefore defined here, with
# distinct labels so each result is recognisable.
text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''
html = etree.HTML(text)

# Text of the first <li>.
result = html.xpath('//li[1]/a/text()')
print(result)

# Text of the last <li>.
result = html.xpath('//li[last()]/a/text()')
print(result)

# Text of the <li> items whose position is below 3 (the 1st and 2nd).
result = html.xpath('//li[position()<3]/a/text()')
print(result)

# Text of the third <li> from the end.
result = html.xpath('//li[last()-2]/a/text()')
print(result)
# Axis selection.
#
# These queries need a <div>/<ul> list document. Previously this section
# parsed whatever `text` held at this point, which was a fragment with a
# single <li>: no <div> ancestor, no <span> descendant and no following
# nodes, so most queries returned an empty list. The document is defined
# here, with a <span> in the first item so 'descendant::span' matches.
text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''
html = etree.HTML(text)

# Every ancestor of li[1].
result = html.xpath('//li[1]/ancestor::*')
print(result)

# Only the <div> ancestor of li[1].
result = html.xpath('//li[1]/ancestor::div')
print(result)

# All attribute values of li[1].
result = html.xpath('//li[1]/attribute::*')
print(result)

# Direct <a> children of li[1] whose href is link1.html.
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)

# Descendants of li[1] that are <span> nodes (the <a> is not included).
result = html.xpath('//li[1]/descendant::span')
print(result)

# Second node after li[1] in document order: the <a> inside the next <li>
# (the <li> itself counts as the first).
result = html.xpath('//li[1]/following::*[2]')
print(result)

# First node after li[1] in document order: the next <li>.
result = html.xpath('//li[1]/following::*[1]')
print(result)

# All later siblings of li[1].
result = html.xpath('//li[1]/following-sibling::*')
print(result)