Python爬蟲實戰一之使用Beautiful Soup抓取百度招聘資訊並存儲excel檔案
阿新 • 發佈:2019-01-02
#encoding:utf-8
'''
Created on 2017年7月25日
@author: ********
'''
import urllib2
from bs4 import BeautifulSoup
import xlrd,os
from xlutils.copy import copy
from StatisticsReport import StatisticsReport
def GetJobinfo(n=1):
    """Scrape Baidu Zhaopin (testing jobs, Jinan) and return the listings.

    Args:
        n: number of result pages to fetch (20 listings per page).
           Defaults to 1, preserving the original hard-coded behavior.

    Returns:
        dict mapping 'test<i>' (1-based insertion index) to a row
        [job name, area, company, pay, date, description text].
    """
    jobs = {}
    # Pretend to be Firefox so the server returns the normal page.
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    urlhead = 'http://zhaopin.baidu.com/quanzhi?query=%E6%B5%8B%E8%AF%95&sort_type=1&city=%E6%B5%8E%E5%8D%97&detailmode=close&rn=20&pn='
    # BUG FIX: the counter must live outside the page loop; the original
    # reset j to 0 on every page, so pages after the first overwrote the
    # keys ('test1', 'test2', ...) of the previous page.
    j = 0
    for i in range(n):
        # Build the full listing-page URL for page i.
        get_url = urlhead + str(2 * i * 10)
        # Fetch the listing page with the browser-like headers.
        request = urllib2.Request(url=get_url, headers=head)
        zhaop_html = urllib2.urlopen(request).read().decode('utf-8')
        soup = BeautifulSoup(zhaop_html, 'lxml')
        for alink in soup.find_all('a'):
            href = alink.get('href')
            # BUG FIX: <a> tags with no href attribute yield None, which
            # crashed the original's href.find() with AttributeError.
            # Guard clause also skips links that are not job-detail links.
            if not href or '/szzw?detailidx=' not in href:
                continue
            get_joburl = 'http://zhaopin.baidu.com' + href
            # Parse the <p> children once instead of four times per link.
            paragraphs = alink.find_all('p')
            # First <p> holds "area|company"; split once and reuse.
            area_company = paragraphs[0].get_text().split('|')
            jobname = alink.span.get_text()
            jobarea = area_company[0]
            jobcompany = area_company[1]
            jobpay = paragraphs[1].get_text()
            jobdate = paragraphs[2].get_text()
            # Fetch the detail page for the full job description.
            detail_request = urllib2.Request(url=get_joburl, headers=head)
            detail_html = urllib2.urlopen(detail_request).read().decode('utf-8')
            detail_soup = BeautifulSoup(detail_html, 'lxml')
            # The description lives in the first <div class="abs">.
            abs_divs = detail_soup.find_all(name='div', attrs={'class': 'abs'})
            job_decinfo = ''
            for job_dec in abs_divs[0].find_all('p'):
                job_decinfo += job_dec.get_text() + '\n'
            j += 1
            # Plain assignment replaces the original's redundant
            # dict.fromkeys([key], ...) + dict.update() pair.
            jobs['test' + str(j)] = [jobname, jobarea, jobcompany,
                                     jobpay, jobdate, job_decinfo]
    return jobs
def GenerateReport(report, job_dict):
    """Write scraped job rows into the report's Excel workbook.

    Args:
        report: StatisticsReport instance whose __getreportname__() gives
            the workbook file name (located in the parent directory).
        job_dict: mapping of key -> row (list of cell values), as
            returned by GetJobinfo().
    """
    # Build the path portably instead of hard-coding the Windows '\\'
    # separator; the file itself lives in the parent directory.
    reportpath = os.path.join(os.path.abspath('..'), report.__getreportname__())
    # xlrd workbooks are read-only; xlutils.copy produces a writable copy.
    bk = xlrd.open_workbook(reportpath)
    wb = copy(bk)
    wa = wb.get_sheet(0)
    # Iterate the values once. The original rebuilt job_dict.values()
    # in both loop headers AND on every cell write (quadratic work, and
    # values() is not indexable on Python 3).
    for i, row in enumerate(job_dict.values()):
        for j, cell in enumerate(row):
            # Data starts at row i+1; row 0 presumably holds the header
            # created by StatisticsReport — confirm against that class.
            wa.write(i + 1, j, cell)
    wb.save(reportpath)
if __name__ == '__main__':
    # Create the (empty) statistics workbook, scrape the job listings,
    # then write them into the workbook.
    stats = StatisticsReport()
    stats.__createStatisticsReport__()
    scraped_jobs = GetJobinfo()
    GenerateReport(stats, scraped_jobs)
'''
Created on 2017年7月25日
@author: ********
'''
import urllib2
from bs4 import BeautifulSoup
import xlrd,os
from xlutils.copy import copy
from StatisticsReport import StatisticsReport
def GetJobinfo(n=1):
    """Scrape Baidu Zhaopin (testing jobs, Jinan) and return the listings.

    Args:
        n: number of result pages to fetch (20 listings per page).
           Defaults to 1, preserving the original hard-coded behavior.

    Returns:
        dict mapping 'test<i>' (1-based insertion index) to a row
        [job name, area, company, pay, date, description text].
    """
    jobs = {}
    # Pretend to be Firefox so the server returns the normal page.
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    urlhead = 'http://zhaopin.baidu.com/quanzhi?query=%E6%B5%8B%E8%AF%95&sort_type=1&city=%E6%B5%8E%E5%8D%97&detailmode=close&rn=20&pn='
    # BUG FIX: the counter must live outside the page loop; the original
    # reset j to 0 on every page, so pages after the first overwrote the
    # keys ('test1', 'test2', ...) of the previous page.
    j = 0
    for i in range(n):
        # Build the full listing-page URL for page i.
        get_url = urlhead + str(2 * i * 10)
        # Fetch the listing page with the browser-like headers.
        request = urllib2.Request(url=get_url, headers=head)
        zhaop_html = urllib2.urlopen(request).read().decode('utf-8')
        soup = BeautifulSoup(zhaop_html, 'lxml')
        for alink in soup.find_all('a'):
            href = alink.get('href')
            # BUG FIX: <a> tags with no href attribute yield None, which
            # crashed the original's href.find() with AttributeError.
            # Guard clause also skips links that are not job-detail links.
            if not href or '/szzw?detailidx=' not in href:
                continue
            get_joburl = 'http://zhaopin.baidu.com' + href
            # Parse the <p> children once instead of four times per link.
            paragraphs = alink.find_all('p')
            # First <p> holds "area|company"; split once and reuse.
            area_company = paragraphs[0].get_text().split('|')
            jobname = alink.span.get_text()
            jobarea = area_company[0]
            jobcompany = area_company[1]
            jobpay = paragraphs[1].get_text()
            jobdate = paragraphs[2].get_text()
            # Fetch the detail page for the full job description.
            detail_request = urllib2.Request(url=get_joburl, headers=head)
            detail_html = urllib2.urlopen(detail_request).read().decode('utf-8')
            detail_soup = BeautifulSoup(detail_html, 'lxml')
            # The description lives in the first <div class="abs">.
            abs_divs = detail_soup.find_all(name='div', attrs={'class': 'abs'})
            job_decinfo = ''
            for job_dec in abs_divs[0].find_all('p'):
                job_decinfo += job_dec.get_text() + '\n'
            j += 1
            # Plain assignment replaces the original's redundant
            # dict.fromkeys([key], ...) + dict.update() pair.
            jobs['test' + str(j)] = [jobname, jobarea, jobcompany,
                                     jobpay, jobdate, job_decinfo]
    return jobs
def GenerateReport(report, job_dict):
    """Write scraped job rows into the report's Excel workbook.

    Args:
        report: StatisticsReport instance whose __getreportname__() gives
            the workbook file name (located in the parent directory).
        job_dict: mapping of key -> row (list of cell values), as
            returned by GetJobinfo().
    """
    # Build the path portably instead of hard-coding the Windows '\\'
    # separator; the file itself lives in the parent directory.
    reportpath = os.path.join(os.path.abspath('..'), report.__getreportname__())
    # xlrd workbooks are read-only; xlutils.copy produces a writable copy.
    bk = xlrd.open_workbook(reportpath)
    wb = copy(bk)
    wa = wb.get_sheet(0)
    # Iterate the values once. The original rebuilt job_dict.values()
    # in both loop headers AND on every cell write (quadratic work, and
    # values() is not indexable on Python 3).
    for i, row in enumerate(job_dict.values()):
        for j, cell in enumerate(row):
            # Data starts at row i+1; row 0 presumably holds the header
            # created by StatisticsReport — confirm against that class.
            wa.write(i + 1, j, cell)
    wb.save(reportpath)
if __name__ == '__main__':
    # Create the (empty) statistics workbook, scrape the job listings,
    # then write them into the workbook.
    stats = StatisticsReport()
    stats.__createStatisticsReport__()
    scraped_jobs = GetJobinfo()
    GenerateReport(stats, scraped_jobs)