數據獲取的幾種方法
阿新 • • 發佈:2018-11-10
str string lar tags use int 簽名 params proxy
方法一:正則表達式
# Method 1: regular expressions.
# Fetch a page (optionally through an HTTP proxy) and extract data with `re`.
# NOTE: this is Python 2 code (urllib2/urlparse); in Python 3 use urllib.request/urllib.parse.
import re
import urllib2
import urlparse  # was missing, but urlparse.urlparse() is called below

# Without a proxy:
res = urllib2.urlopen(url).read()

# With a proxy, use this instead:
user_agent = 'user'  # fixed: `ueser-agent` is not a valid identifier
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
# Map the URL's scheme (http/https) to the proxy address.
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))  # fixed typo: add_heandler
res = opener.open(request).read()
print(res)

# `Regular` is the regex pattern to match the wanted data against the page.
print(re.findall(Regular, res))
方法二:Beautiful Soup
安裝:
pip install beautifulsoup4
# Method 2: Beautiful Soup.
# Fetch a page (optionally through an HTTP proxy) and query it with bs4.
# NOTE: this is Python 2 code (urllib2/urlparse); in Python 3 use urllib.request/urllib.parse.
import re
import urllib2
import urlparse  # was missing, but urlparse.urlparse() is called below
from bs4 import BeautifulSoup

# Without a proxy:
res = urllib2.urlopen(url).read()

# With a proxy, use this instead:
user_agent = 'user'  # fixed: `ueser-agent` is not a valid identifier
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
# Map the URL's scheme (http/https) to the proxy address.
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))  # fixed typo: add_heandler
res = opener.open(request).read()
print(res)

# Parse with the stdlib parser; fixed: 'html-parser' is not a valid parser name.
soup = BeautifulSoup(res, 'html.parser')
fixed_html = soup.prettify()  # fixed typo: prettity
print(fixed_html)

# Look up data by tag.
# `tag` is a tag name such as 'ul'; `attr` is a dict of attribute/value
# pairs to match, e.g. {'class': 'country'}.
# Find a single tag:
tagData = soup.find(tag, attrs=attr)
# Find all tags with that name:
tagsData = soup.find_all(tag)
方法三:Lxml
安裝
pip install lxml
# Method 3: lxml.
# Fetch a page (optionally through an HTTP proxy) and query it with CSS selectors.
# NOTE: this is Python 2 code (urllib2/urlparse); in Python 3 use urllib.request/urllib.parse.
import re
import urllib2
import urlparse    # was missing, but urlparse.urlparse() is called below
import lxml.html   # fixed: `import lxml` alone does not expose lxml.html

# Without a proxy:
res = urllib2.urlopen(url).read()

# With a proxy, use this instead:
user_agent = 'user'  # fixed: `ueser-agent` is not a valid identifier
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
# Map the URL's scheme (http/https) to the proxy address.
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))  # fixed typo: add_heandler
res = opener.open(request).read()
print(res)

# Parse the HTML and select the first <td class="area"> inside <tr id="place">.
tree = lxml.html.fromstring(res)
td = tree.cssselect('tr#place>td.area')[0]
area = td.text_content()
print(area)
數據獲取的幾種方法