Python 爬蟲問題:TypeError: cannot use a string pattern on a bytes-like object
阿新 • • 發佈:2018-12-09
- 在 Python3.x 中學習爬蟲時需要注意:不同於 Python2.x,下載得到的 html 是 bytes 型別,必須先進行解碼才能用字串模式做正則匹配:
import re
import urllib
import urllib.error
import urllib.request
def download(url, user_agent='XD', num_retries=2):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the resource to download.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more attempts are allowed after a 5xx
            HTTP error.

    Returns:
        The response body as ``bytes``, or ``None`` when the download
        failed. Callers must decode the bytes before applying string
        regular expressions to them.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # Open the prepared request (not the bare URL) so the custom
        # User-agent header is actually sent; the ``with`` block closes
        # the response once the body has been read.
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # Only errors carrying an HTTP status code in the 5xx range
            # are retried; anything else falls through and yields None.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html
def crawl_sitemap(url):
    """Download the sitemap at *url* and then download every page it lists.

    Args:
        url: Address of the sitemap file.

    Returns:
        None. If the sitemap itself cannot be downloaded, nothing is
        crawled.
    """
    # download the sitemap file
    sitemap = download(url)
    if sitemap is None:
        # download() signals failure with None; there is nothing to decode
        # or crawl in that case.
        return
    # download() returns bytes; decode so the str regex below can be applied
    # (a str pattern on bytes raises "cannot use a string pattern on a
    # bytes-like object").
    sitemap = sitemap.decode('utf-8')
    # extract the sitemap links
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...
if __name__ == '__main__':
    # Script entry point: crawl the pages listed in this sitemap.
    sitemap_url = 'http://www.baidu.com/sitemap.xml'
    crawl_sitemap(sitemap_url)
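解決方法:在 Python3.x 中,urlopen(...).read() 回傳的是 bytes 而不是 str,直接用字串模式呼叫 re.findall 就會拋出上述 TypeError。
因此需要在 def crawl_sitemap(url): 中、呼叫 re.findall 之前加入 sitemap = sitemap.decode('utf-8'),
先將 bytes 解碼為 str,再進行正則匹配。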