1. 程式人生 > 實戰 利用Xpath爬取網頁資料

實戰 利用Xpath爬取網頁資料

# coding=utf-8
# Crawl the table of contents of a liaoxuefeng.com wiki page and print the
# text of every <p> found directly under each chapter's "x-wiki-content"
# element.

# Step 1: import modules
import re

import requests
from lxml import etree

# Landing page of the wiki; its HTML contains the chapter list.
url = 'http://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000'
header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block indefinitely on an unresponsive host.
timeout = 10

# Fetch the page source.
htmll = requests.get(url, headers=header, timeout=timeout)
html = htmll.text

# Capture the href of the first link that follows every `<li id=...>` tag.
# The captured values are used below as the trailing part of chapter URLs.
page = re.findall('<li id=.*?>.*?<a href="(.*?)">.*?</a>', html, re.S)

for each in page:
    # Site root + captured path = full chapter URL.
    page1 = 'http://www.liaoxuefeng.com' + each
    html2 = requests.get(page1, headers=header, timeout=timeout).text

    # Parse each chapter exactly once. The original wrapped the lines below
    # in `for each2 in page1:`, which iterates over the *characters* of the
    # URL string and therefore re-parsed and re-printed the same chapter once
    # per character.
    Selector = etree.HTML(html2)
    # xpath() returns a list of the matching <p> elements.
    content = Selector.xpath('//*[@class="x-wiki-content"]/p')
    for each2 in content:
        # NOTE(review): per lxml, `.text` is only the text before the
        # element's first child and may be None - confirm this is the
        # intended output for paragraphs that start with inline markup.
        # Single-argument print(...) behaves the same on Python 2 (which the
        # original targeted) and Python 3.
        print(each2.text)