1. 程式人生 > 爬取拉勾網資訊,翻頁爬取

爬取拉勾網資訊,翻頁爬取

import requests #這個庫等價於 urllib 和urllib2
import bs4 #作用是用來解析網頁的
import json#主要是一種資料交換格式
import time
def main(max_pages=2, output_path='C:/Users/dell/Desktop/python1.txt', delay=3):
    """Crawl Lagou "python" job listings page by page and save them as JSON.

    Each page is fetched with a POST to Lagou's ``positionAjax.json``
    endpoint; the ``content.positionResult.result`` list of every page is
    accumulated and the combined list is written to ``output_path`` as a
    single JSON document (UTF-8, non-ASCII characters kept as-is).

    Args:
        max_pages: Number of result pages to request, starting at page 1.
            Defaults to 2. The original author noted that the site starts
            answering "too many requests" after roughly 7 pages in one run.
        output_path: File the JSON text is written to (overwritten).
        delay: Seconds to wait between two consecutive page requests.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_python%20?labelWords=&fromSearch=true&suginput=",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest",
    }
    url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false"

    positions = []
    for page in range(1, max_pages + 1):
        # Throttle only between requests; no need to wait after the last one.
        if page > 1 and delay:
            time.sleep(delay)

        data = {
            # NOTE(review): the endpoint presumably expects the literal
            # string "true" here (the form previously sent "ture") — confirm.
            "first": "true",
            "pn": page,
            "kd": "python",
        }
        # A timeout keeps a stalled connection from hanging the script forever.
        result = requests.post(url, headers=header, data=data, timeout=10)
        result.raise_for_status()
        json_result = result.json()

        try:
            page_positions = json_result["content"]["positionResult"]["result"]
        except (KeyError, TypeError):
            # The payload has no result list (e.g. a rate-limit reply). Stop
            # paging but still save what was collected so far.
            print("Unexpected response on page %d, stopping: %s" % (page, json_result))
            break
        positions.extend(page_positions)

    line = json.dumps(positions, ensure_ascii=False)
    with open(output_path, 'w', encoding="utf-8") as f:
        f.write(line)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()