
Several Methods of Data Extraction


Method 1: Regular expressions

import re
import urllib2
import urlparse

# url, user, proxy and Regular are assumed to be defined beforehand

# without a proxy
res = urllib2.urlopen(url).read()

# use this when a proxy is needed
user_agent = user
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
res = opener.open(request).read()
print(res)

# Regular is the regular expression that matches the target data
print(re.findall(Regular, res))
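As a minimal sketch of what Regular could look like, assuming the target page contains a cell such as <td class="area">...</td> (placeholder markup, not taken from the post):

# Sketch only: the markup and pattern below are placeholder examples
Regular = '<td class="area">(.*?)</td>'
print(re.findall(Regular, res))  # prints the captured cell contents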

Method 2: Beautiful Soup

Installation:

  pip install beautifulsoup4

import urllib2
import urlparse
from bs4 import BeautifulSoup

# url, user and proxy are assumed to be defined beforehand

# without a proxy
res = urllib2.urlopen(url).read()

# use this when a proxy is needed
user_agent = user
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
res = opener.open(request).read()
print(res)

soup = BeautifulSoup(res, 'html.parser')
fixed_html = soup.prettify()
print(fixed_html)

# Look up the data for a tag
# tag is the tag name, e.g. 'ul'; attr is a dict of attribute/value pairs, e.g. {'class': 'country'}
# find a single tag
tagData = soup.find(tag, attrs=attr)
# find all tags with the same name
tagsData = soup.find_all(tag)
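For example, a concrete find()/find_all() call might look like the sketch below; the tag name and attribute dict are placeholder values, not ones required by Beautiful Soup:

# Sketch only: 'td' and {'class': 'area'} stand in for tag and attr
td = soup.find('td', attrs={'class': 'area'})
print(td.text)             # text of the first matching cell
tds = soup.find_all('td')  # list of every <td> element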

Method 3: Lxml

Installation:

  pip install lxml

import urllib2
import urlparse
import lxml.html

# url, user and proxy are assumed to be defined beforehand

# without a proxy
res = urllib2.urlopen(url).read()

# use this when a proxy is needed
user_agent = user
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
res = opener.open(request).read()
print(res)

# Note: cssselect() may also require the cssselect package (pip install cssselect)
tree = lxml.html.fromstring(res)
td = tree.cssselect('tr#place > td.area')[0]
area = td.text_content()
print(area)
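For reference, the same lookup can also be written with lxml's xpath() method; this is a sketch using the same placeholder id and class as above:

# Sketch only: XPath equivalent of the CSS selector 'tr#place > td.area'
td = tree.xpath('//tr[@id="place"]/td[@class="area"]')[0]
print(td.text_content())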

  
