「docker實戰篇」python的docker爬蟲技術-python指令碼app抓取(13)
上次已經分析出來具體的app的請求連線了,本次主要說說python的開發,抓取APP裡面的資訊。原始碼:https://github.com/limingios/dockerpython.git
分析app資料包
檢視分析
解析出來的header
夜神配置
python程式碼,爬取分類
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/9 11:06
# @Author: lm
# @Url: idig8.com
# @Site:
# @File: spider_douguomeishi.py
# @Software: PyCharm
"""Step 1: fetch the recipe category tree from the Douguo app API and print it."""
import requests

# Headers captured with Fiddler from the app. The capture contains many
# fields because every vendor designs its API differently; some of them are
# probably optional, which can only be found out by commenting entries out
# and retrying.
APP_HEADERS = {
    "client": "4",
    "version": "6916.2",
    "device": "SM-G955N",
    "sdk": "22,5.1.1",
    "imei": "354730010002552",
    "channel": "zhuzhan",
    "mac": "00:FF:E2:A2:7B:58",
    "resolution": "1440*900",
    "dpi": "2.0",
    "android-id": "bcdaf527105cc26f",
    "pseudo-id": "354730010002552",
    "brand": "samsung",
    "scale": "2.0",
    "timezone": "28800",
    "language": "zh",
    "cns": "3",
    "carrier": "Android",
    # "imsi": "310260000000000",
    "user-agent": (
        "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/39.0.0.0 Mobile Safari/537.36"
    ),
    "lon": "105.566938",
    "lat": "29.99831",
    "cid": "512000",
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "Keep-Alive",
    # "Cookie": "duid=58349118",
    "Host": "api.douguo.net",
    # "Content-Length": "65"
}

CATALOG_URL = "http://api.douguo.net/recipe/flatcatalogs"


def handle_request(url, data):
    """POST the form fields in ``data`` to ``url`` using the captured headers.

    Returns the ``requests.Response`` object unchanged.
    """
    return requests.post(url=url, headers=APP_HEADERS, data=data)


def handle_index():
    """Request the category catalogue and print the raw response body."""
    # Raw captured body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    form_fields = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    catalog_response = handle_request(CATALOG_URL, form_fields)
    print(catalog_response.text)


handle_index()
爬取詳情:通過分類找到其中的詳情資訊
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/9 11:06
# @Author: lm
# @Url: idig8.com
# @Site:
# @File: spider_douguomeishi.py
# @Software: PyCharm
"""Step 2: read the category tree and queue one search payload per keyword."""
import json
from queue import Queue

import requests

# Search payloads waiting to be processed. A plain thread-safe queue is
# enough here (nothing crosses a process boundary), and unlike
# multiprocessing.Queue its qsize() works on every platform.
queue_list = Queue()

# Headers captured with Fiddler from the app. Some fields are probably
# optional; the only way to find out is to comment them out and retry.
HEADERS = {
    "client": "4",
    "version": "6916.2",
    "device": "SM-G955N",
    "sdk": "22,5.1.1",
    "imei": "354730010002552",
    "channel": "zhuzhan",
    "mac": "00:FF:E2:A2:7B:58",
    "resolution": "1440*900",
    "dpi": "2.0",
    "android-id": "bcdaf527105cc26f",
    "pseudo-id": "354730010002552",
    "brand": "samsung",
    "scale": "2.0",
    "timezone": "28800",
    "language": "zh",
    "cns": "3",
    "carrier": "Android",
    # "imsi": "310260000000000",
    "user-agent": (
        "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/39.0.0.0 Mobile Safari/537.36"
    ),
    "lon": "105.566938",
    "lat": "29.99831",
    "cid": "512000",
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "Keep-Alive",
    # "Cookie": "duid=58349118",
    "Host": "api.douguo.net",
    # "Content-Length": "65"
}


def handle_request(url, data, timeout=10):
    """POST the form fields in ``data`` to ``url`` using the captured headers.

    Args:
        url: Endpoint to call.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The ``requests.Response`` object.
    """
    return requests.post(url=url, headers=HEADERS, data=data, timeout=timeout)


def handle_index():
    """Fetch the category catalogue and enqueue a search payload per keyword.

    The response nests three levels of ``cs`` lists under ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of one payload
    placed on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Raw captured body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for top_level in index_response_dic["result"]["cs"]:
        for second_level in top_level["cs"]:
            for item in second_level["cs"]:
                queue_list.put({
                    "client": "4",
                    "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key is "_vs"; the earlier spelling "_vs " (with a
                    # trailing space) sent a differently named form field.
                    "_vs": "400",
                })


if __name__ == "__main__":
    handle_index()
    print(queue_list.qsize())
分類菜譜內部的詳情資訊
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/9 11:06
# @Author: lm
# @Url: idig8.com
# @Site:
# @File: spider_douguomeishi.py
# @Software: PyCharm
"""Step 3: queue the category keywords, then list the recipes for one of them."""
import json
from queue import Queue

import requests

# Search payloads waiting to be processed; a thread-safe in-process queue.
queue_list = Queue()

# Headers captured with Fiddler from the app. Some fields are probably
# optional; the only way to find out is to comment them out and retry.
HEADERS = {
    "client": "4",
    "version": "6916.2",
    "device": "SM-G955N",
    "sdk": "22,5.1.1",
    "imei": "354730010002552",
    "channel": "zhuzhan",
    "mac": "00:FF:E2:A2:7B:58",
    "resolution": "1440*900",
    "dpi": "2.0",
    "android-id": "bcdaf527105cc26f",
    "pseudo-id": "354730010002552",
    "brand": "samsung",
    "scale": "2.0",
    "timezone": "28800",
    "language": "zh",
    "cns": "3",
    "carrier": "Android",
    # "imsi": "310260000000000",
    "user-agent": (
        "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/39.0.0.0 Mobile Safari/537.36"
    ),
    "lon": "105.566938",
    "lat": "29.99831",
    "cid": "512000",
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "Keep-Alive",
    # "Cookie": "duid=58349118",
    "Host": "api.douguo.net",
    # "Content-Length": "65"
}


def handle_request(url, data, timeout=10):
    """POST the form fields in ``data`` to ``url`` using the captured headers.

    Args:
        url: Endpoint to call.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The ``requests.Response`` object.
    """
    return requests.post(url=url, headers=HEADERS, data=data, timeout=timeout)


def handle_index():
    """Fetch the category catalogue and enqueue a search payload per keyword.

    The response nests three levels of ``cs`` lists under ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of one payload
    placed on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Raw captured body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for top_level in index_response_dic["result"]["cs"]:
        for second_level in top_level["cs"]:
            for item in second_level["cs"]:
                queue_list.put({
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key is "_vs"; the earlier spelling "_vs " (with a
                    # trailing space) sent a differently named form field.
                    "_vs": "400",
                    "order": "0",
                })


def handle_caipu_list(data):
    """Search recipes for one keyword payload and print a summary of each.

    Args:
        data: One payload produced by ``handle_index``; its ``keyword`` is
            the ingredient being searched.

    Only list entries whose ``type`` is 13 are used; all other entries are
    skipped.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Flatten the story text onto one line without spaces.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        print(caipu_info)


if __name__ == "__main__":
    handle_index()
    handle_caipu_list(queue_list.get())
菜品內部的詳情資訊
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/9 11:06
# @Author: lm
# @Url: idig8.com
# @Site:
# @File: spider_douguomeishi.py
# @Software: PyCharm
"""Step 4: for one keyword, list its recipes and fetch each recipe's details."""
import json
from queue import Queue

import requests

# Search payloads waiting to be processed; a thread-safe in-process queue.
queue_list = Queue()

# Headers captured with Fiddler from the app. Some fields are probably
# optional; the only way to find out is to comment them out and retry.
HEADERS = {
    "client": "4",
    "version": "6916.2",
    "device": "SM-G955N",
    "sdk": "22,5.1.1",
    "imei": "354730010002552",
    "channel": "zhuzhan",
    "mac": "00:FF:E2:A2:7B:58",
    "resolution": "1440*900",
    "dpi": "2.0",
    "android-id": "bcdaf527105cc26f",
    "pseudo-id": "354730010002552",
    "brand": "samsung",
    "scale": "2.0",
    "timezone": "28800",
    "language": "zh",
    "cns": "3",
    "carrier": "Android",
    # "imsi": "310260000000000",
    "user-agent": (
        "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/39.0.0.0 Mobile Safari/537.36"
    ),
    "lon": "105.566938",
    "lat": "29.99831",
    "cid": "512000",
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "Keep-Alive",
    # "Cookie": "duid=58349118",
    "Host": "api.douguo.net",
    # "Content-Length": "65"
}


def handle_request(url, data, timeout=10):
    """POST the form fields in ``data`` to ``url`` using the captured headers.

    Args:
        url: Endpoint to call.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The ``requests.Response`` object.
    """
    return requests.post(url=url, headers=HEADERS, data=data, timeout=timeout)


def handle_index():
    """Fetch the category catalogue and enqueue a search payload per keyword.

    The response nests three levels of ``cs`` lists under ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of one payload
    placed on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Raw captured body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for top_level in index_response_dic["result"]["cs"]:
        for second_level in top_level["cs"]:
            for item in second_level["cs"]:
                queue_list.put({
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key is "_vs"; the earlier spelling "_vs " (with a
                    # trailing space) sent a differently named form field.
                    "_vs": "400",
                    "order": "0",
                })


def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each recipe's detail, print it.

    Args:
        data: One payload produced by ``handle_index``; its ``keyword`` is
            the ingredient being searched.

    Only list entries whose ``type`` is 13 are used; all other entries are
    skipped. Each printed record is a JSON object combining the list summary
    with the ``tips`` and ``cookstep`` fields of the detail response.
    """
    keyword = data["keyword"]
    print("當前的食材:", keyword)
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": keyword,
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Flatten the story text onto one line without spaces.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # Serialise "ext" with json.dumps so quotes or backslashes in the
        # keyword are escaped; ensure_ascii=False keeps the keyword as raw
        # UTF-8 text, as the concatenated string did.
        # NOTE(review): assumes the recipe id is an integer in the list
        # response, so it is emitted unquoted as before - confirm.
        ext = json.dumps(
            {
                "query": {
                    "kw": keyword,
                    "src": "2803",
                    "idx": "1",
                    "type": "13",
                    "id": caipu_info["shicai_id"],
                }
            },
            ensure_ascii=False,
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        print(json.dumps(caipu_info))


if __name__ == "__main__":
    handle_index()
    handle_caipu_list(queue_list.get())
將資料儲存在mongodb中
- 通過vagrant 安裝虛擬機器
vagrant up
-
進入虛擬機器
>ip 192.168.66.100
```bash
su -
# 密碼:vagrant
docker
```

* 拉取mongodb的映象

>https://hub.docker.com/r/bitnami/mongodb
>預設埠:27017

```bash
docker pull bitnami/mongodb:latest
```
- 建立mongodb的容器
# Create a local bitnami/mongodb directory tree.
mkdir bitnami
cd bitnami
mkdir mongodb

# Start MongoDB in the background and publish the default port 27017.
# NOTE(review): /path/to/mongodb-persistence looks like a placeholder for the
# host directory (possibly the one created above), and /root/bitnami may not
# be the directory the image stores its data in - confirm both before relying
# on this volume for persistence.
docker run -d -v /path/to/mongodb-persistence:/root/bitnami -p 27017:27017 bitnami/mongodb:latest

# Turn off the firewall.
systemctl stop firewalld
用第三方工具連線
連線mongodb的工具
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/11 0:53
# @Author:liming
# @Site:
# @File: handle_mongo.py
# @url: idig8.com
# @Software: PyCharm
"""MongoDB helper used by the spider to store scraped recipes.

NOTE: the spider scripts import this module with
``from handle_mongo import mongo_info``, so the file must be saved as
``handle_mongo.py``.
"""
import pymongo
from pymongo.collection import Collection


class Connect_mongo(object):
    """Thin wrapper around the ``dou_guo_mei_shi`` MongoDB database."""

    def __init__(self):
        """Connect to the MongoDB server at 192.168.66.100:27017."""
        self.client = pymongo.MongoClient(host="192.168.66.100", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]
        # Resolve the target collection once instead of on every insert.
        self.collection = Collection(self.db_data, 'dou_guo_mei_shi_item')

    def insert_item(self, item):
        """Insert one recipe document into ``dou_guo_mei_shi_item``.

        Args:
            item: Dict describing one recipe.
        """
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        self.collection.insert_one(item)


# Shared instance exposed for other modules to import.
mongo_info = Connect_mongo()
python爬取的資料通過mongo的工具儲存到centos7上以docker容器執行的mongodb中
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/9 11:06
# @Author: lm
# @Url: idig8.com
# @Site:
# @File: spider_douguomeishi.py
# @Software: PyCharm
"""Step 5: fetch recipe details for one keyword and store them in MongoDB."""
import json
from queue import Queue

import requests

from handle_mongo import mongo_info

# Search payloads waiting to be processed; a thread-safe in-process queue.
queue_list = Queue()

# Headers captured with Fiddler from the app. Some fields are probably
# optional; the only way to find out is to comment them out and retry.
HEADERS = {
    "client": "4",
    "version": "6916.2",
    "device": "SM-G955N",
    "sdk": "22,5.1.1",
    "imei": "354730010002552",
    "channel": "zhuzhan",
    "mac": "00:FF:E2:A2:7B:58",
    "resolution": "1440*900",
    "dpi": "2.0",
    "android-id": "bcdaf527105cc26f",
    "pseudo-id": "354730010002552",
    "brand": "samsung",
    "scale": "2.0",
    "timezone": "28800",
    "language": "zh",
    "cns": "3",
    "carrier": "Android",
    # "imsi": "310260000000000",
    "user-agent": (
        "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/39.0.0.0 Mobile Safari/537.36"
    ),
    "lon": "105.566938",
    "lat": "29.99831",
    "cid": "512000",
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "Keep-Alive",
    # "Cookie": "duid=58349118",
    "Host": "api.douguo.net",
    # "Content-Length": "65"
}


def handle_request(url, data, timeout=10):
    """POST the form fields in ``data`` to ``url`` using the captured headers.

    Args:
        url: Endpoint to call.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The ``requests.Response`` object.
    """
    return requests.post(url=url, headers=HEADERS, data=data, timeout=timeout)


def handle_index():
    """Fetch the category catalogue and enqueue a search payload per keyword.

    The response nests three levels of ``cs`` lists under ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of one payload
    placed on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Raw captured body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for top_level in index_response_dic["result"]["cs"]:
        for second_level in top_level["cs"]:
            for item in second_level["cs"]:
                queue_list.put({
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key is "_vs"; the earlier spelling "_vs " (with a
                    # trailing space) sent a differently named form field.
                    "_vs": "400",
                    "order": "0",
                })


def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each detail, store it in MongoDB.

    Args:
        data: One payload produced by ``handle_index``; its ``keyword`` is
            the ingredient being searched.

    Only list entries whose ``type`` is 13 are used; all other entries are
    skipped. Each stored record combines the list summary with the ``tips``
    and ``cookstep`` fields of the detail response.
    """
    keyword = data["keyword"]
    print("當前的食材:", keyword)
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": keyword,
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Flatten the story text onto one line without spaces.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # Serialise "ext" with json.dumps so quotes or backslashes in the
        # keyword are escaped; ensure_ascii=False keeps the keyword as raw
        # UTF-8 text, as the concatenated string did.
        # NOTE(review): assumes the recipe id is an integer in the list
        # response, so it is emitted unquoted as before - confirm.
        ext = json.dumps(
            {
                "query": {
                    "kw": keyword,
                    "src": "2803",
                    "idx": "1",
                    "type": "13",
                    "id": caipu_info["shicai_id"],
                }
            },
            ensure_ascii=False,
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        mongo_info.insert_item(caipu_info)


if __name__ == "__main__":
    handle_index()
    handle_caipu_list(queue_list.get())
通過python多執行緒-執行緒池抓取
-
python3通過 `from concurrent.futures import ThreadPoolExecutor` 使用執行緒池
>引用執行緒池
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/9 11:06
# @Author: lm
# @Url: idig8.com
# @Site:
# @File: spider_douguomeishi.py
# @Software: PyCharm
"""Step 6: process every queued keyword concurrently with a thread pool."""
import json
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

import requests

from handle_mongo import mongo_info

# Search payloads waiting to be processed. A plain thread-safe queue is
# enough for a thread pool, and unlike multiprocessing.Queue its size
# checks work on every platform.
queue_list = Queue()

# Headers captured with Fiddler from the app. Some fields are probably
# optional; the only way to find out is to comment them out and retry.
HEADERS = {
    "client": "4",
    "version": "6916.2",
    "device": "SM-G955N",
    "sdk": "22,5.1.1",
    "imei": "354730010002552",
    "channel": "zhuzhan",
    "mac": "00:FF:E2:A2:7B:58",
    "resolution": "1440*900",
    "dpi": "2.0",
    "android-id": "bcdaf527105cc26f",
    "pseudo-id": "354730010002552",
    "brand": "samsung",
    "scale": "2.0",
    "timezone": "28800",
    "language": "zh",
    "cns": "3",
    "carrier": "Android",
    # "imsi": "310260000000000",
    "user-agent": (
        "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/39.0.0.0 Mobile Safari/537.36"
    ),
    "lon": "105.566938",
    "lat": "29.99831",
    "cid": "512000",
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "Keep-Alive",
    # "Cookie": "duid=58349118",
    "Host": "api.douguo.net",
    # "Content-Length": "65"
}


def handle_request(url, data, timeout=10):
    """POST the form fields in ``data`` to ``url`` using the captured headers.

    Args:
        url: Endpoint to call.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would tie up a pool worker forever.

    Returns:
        The ``requests.Response`` object.
    """
    return requests.post(url=url, headers=HEADERS, data=data, timeout=timeout)


def handle_index():
    """Fetch the category catalogue and enqueue a search payload per keyword.

    The response nests three levels of ``cs`` lists under ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of one payload
    placed on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Raw captured body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for top_level in index_response_dic["result"]["cs"]:
        for second_level in top_level["cs"]:
            for item in second_level["cs"]:
                queue_list.put({
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key is "_vs"; the earlier spelling "_vs " (with a
                    # trailing space) sent a differently named form field.
                    "_vs": "400",
                    "order": "0",
                })


def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each detail, store it in MongoDB.

    Args:
        data: One payload produced by ``handle_index``; its ``keyword`` is
            the ingredient being searched.

    Only list entries whose ``type`` is 13 are used; all other entries are
    skipped. Each stored record combines the list summary with the ``tips``
    and ``cookstep`` fields of the detail response.
    """
    keyword = data["keyword"]
    print("當前的食材:", keyword)
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": keyword,
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Flatten the story text onto one line without spaces.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # Serialise "ext" with json.dumps so quotes or backslashes in the
        # keyword are escaped; ensure_ascii=False keeps the keyword as raw
        # UTF-8 text, as the concatenated string did.
        # NOTE(review): assumes the recipe id is an integer in the list
        # response, so it is emitted unquoted as before - confirm.
        ext = json.dumps(
            {
                "query": {
                    "kw": keyword,
                    "src": "2803",
                    "idx": "1",
                    "type": "13",
                    "id": caipu_info["shicai_id"],
                }
            },
            ensure_ascii=False,
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        mongo_info.insert_item(caipu_info)


def main():
    """Queue every keyword, then process the queue with 20 worker threads."""
    handle_index()
    # The with-block waits for all submitted tasks and shuts the pool down.
    with ThreadPoolExecutor(max_workers=20) as pool:
        tasks = {}
        while not queue_list.empty():
            payload = queue_list.get()
            tasks[pool.submit(handle_caipu_list, payload)] = payload["keyword"]
    # An exception raised inside a worker is stored on its future and would
    # otherwise vanish silently, so report every failed keyword.
    for future, keyword in tasks.items():
        error = future.exception()
        if error is not None:
            print("task failed:", keyword, repr(error))


if __name__ == "__main__":
    main()
通過使用代理ip隱藏爬蟲
當app運維人員發現我們一直在請求他們的伺服器時,很可能就把咱們的ip給封了。這時可以通過代理ip的方式隱藏自己。
-
註冊申請 abuyun.com
>一個小時1元,我申請了一個小時咱們一起使用下
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/11 2:40
# @Author: Aries
# @Site:
# @File: handle_proxy.py
# @Software: PyCharm
"""Check which public IP address a request has when sent through the proxy.

The proxy URL, including the account credentials, is read from the
``ABUYUN_PROXY_URL`` environment variable in the form
``http://<user>:<password>@<proxy-host>:<port>``. The URL that used to be
hard-coded here had been mangled by e-mail obfuscation and exposed the
account's credentials.
"""
import os

import requests

# 60.17.177.187 was the IP reported when the request went through the proxy.
url = 'http://ip.hahado.cn/ip'
proxy_url = os.environ.get('ABUYUN_PROXY_URL')
if not proxy_url:
    raise SystemExit('Set ABUYUN_PROXY_URL to http://<user>:<password>@<proxy-host>:<port>')
proxy = {'http': proxy_url}
# The timeout keeps a dead proxy from hanging the check forever.
response = requests.get(url=url, proxies=proxy, timeout=10)
print(response.text)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/1/9 11:06
# @Author: lm
# @Url: idig8.com
# @Site:
# @File: spider_douguomeishi.py
# @Software: PyCharm
"""Step 7: thread-pool spider that sends every request through an HTTP proxy.

The proxy URL, including the account credentials, is read from the
``ABUYUN_PROXY_URL`` environment variable in the form
``http://<user>:<password>@<proxy-host>:<port>``. The URL that used to be
hard-coded here had been mangled by e-mail obfuscation and exposed the
account's credentials.
"""
import json
import os
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

import requests

from handle_mongo import mongo_info

# Search payloads waiting to be processed. A plain thread-safe queue is
# enough for a thread pool, and unlike multiprocessing.Queue its size
# checks work on every platform.
queue_list = Queue()

PROXY_ENV_VAR = "ABUYUN_PROXY_URL"

# Headers captured with Fiddler from the app. Some fields are probably
# optional; the only way to find out is to comment them out and retry.
HEADERS = {
    "client": "4",
    "version": "6916.2",
    "device": "SM-G955N",
    "sdk": "22,5.1.1",
    "imei": "354730010002552",
    "channel": "zhuzhan",
    "mac": "00:FF:E2:A2:7B:58",
    "resolution": "1440*900",
    "dpi": "2.0",
    "android-id": "bcdaf527105cc26f",
    "pseudo-id": "354730010002552",
    "brand": "samsung",
    "scale": "2.0",
    "timezone": "28800",
    "language": "zh",
    "cns": "3",
    "carrier": "Android",
    # "imsi": "310260000000000",
    "user-agent": (
        "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/39.0.0.0 Mobile Safari/537.36"
    ),
    "lon": "105.566938",
    "lat": "29.99831",
    "cid": "512000",
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "Keep-Alive",
    # "Cookie": "duid=58349118",
    "Host": "api.douguo.net",
    # "Content-Length": "65"
}


def _proxy_settings():
    """Return the ``proxies`` mapping for requests, built from the environment.

    Raises:
        RuntimeError: If the proxy URL is not configured. Failing loudly is
            deliberate: falling back to a direct connection would expose the
            real IP address, which is what the proxy is meant to hide.
    """
    proxy_url = os.environ.get(PROXY_ENV_VAR)
    if not proxy_url:
        raise RuntimeError(
            "Set %s to http://<user>:<password>@<proxy-host>:<port>" % PROXY_ENV_VAR
        )
    return {"http": proxy_url}


def handle_request(url, data, timeout=10):
    """POST ``data`` to ``url`` through the proxy using the captured headers.

    Args:
        url: Endpoint to call.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would tie up a pool worker forever.

    Returns:
        The ``requests.Response`` object.

    Raises:
        RuntimeError: If the proxy URL is not configured.
    """
    return requests.post(
        url=url,
        headers=HEADERS,
        data=data,
        proxies=_proxy_settings(),
        timeout=timeout,
    )


def handle_index():
    """Fetch the category catalogue and enqueue a search payload per keyword.

    The response nests three levels of ``cs`` lists under ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of one payload
    placed on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Raw captured body:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for top_level in index_response_dic["result"]["cs"]:
        for second_level in top_level["cs"]:
            for item in second_level["cs"]:
                queue_list.put({
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key is "_vs"; the earlier spelling "_vs " (with a
                    # trailing space) sent a differently named form field.
                    "_vs": "400",
                    "order": "0",
                })


def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each detail, store it in MongoDB.

    Args:
        data: One payload produced by ``handle_index``; its ``keyword`` is
            the ingredient being searched.

    Only list entries whose ``type`` is 13 are used; all other entries are
    skipped. Each stored record combines the list summary with the ``tips``
    and ``cookstep`` fields of the detail response.
    """
    keyword = data["keyword"]
    print("當前的食材:", keyword)
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": keyword,
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Flatten the story text onto one line without spaces.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # Serialise "ext" with json.dumps so quotes or backslashes in the
        # keyword are escaped; ensure_ascii=False keeps the keyword as raw
        # UTF-8 text, as the concatenated string did.
        # NOTE(review): assumes the recipe id is an integer in the list
        # response, so it is emitted unquoted as before - confirm.
        ext = json.dumps(
            {
                "query": {
                    "kw": keyword,
                    "src": "2803",
                    "idx": "1",
                    "type": "13",
                    "id": caipu_info["shicai_id"],
                }
            },
            ensure_ascii=False,
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        mongo_info.insert_item(caipu_info)


def main():
    """Queue every keyword, then process the queue with 2 worker threads."""
    handle_index()
    # The with-block waits for all submitted tasks and shuts the pool down.
    with ThreadPoolExecutor(max_workers=2) as pool:
        tasks = {}
        while not queue_list.empty():
            payload = queue_list.get()
            tasks[pool.submit(handle_caipu_list, payload)] = payload["keyword"]
    # An exception raised inside a worker is stored on its future and would
    # otherwise vanish silently, so report every failed keyword.
    for future, keyword in tasks.items():
        error = future.exception()
        if error is not None:
            print("task failed:", keyword, repr(error))


if __name__ == "__main__":
    main()
PS:本次是app資料抓取的入門。首先通過模擬器的代理設定,把流量導到本地的電腦(安裝了fiddler),這樣fiddler就可以抓取資料包了。分析資料這塊要憑藉自己的經驗找到對應的url,如果能分析出url,爬蟲基本就寫完一半了。接著封裝請求頭,請求頭是通過fiddler獲取的,裡面header內容比較多,可以嘗試逐步刪減到最簡;這也是應對反爬蟲的一種策略,因為有的資料放進去反而容易被發現是爬蟲,例如cookies等等,但是有的爬蟲爬取資料又需要cookies。通過代理的方式設定代理ip,防止爬取過程中同一個ip一直請求同一個介面而被發現是爬蟲。引入佇列的目的是為了使用執行緒池的時候方便提取任務,抓取到的資料最後放入mongodb中。這樣使用多執行緒的app資料抓取就完成了。
>>原創文章,歡迎轉載。轉載請註明:轉載自IT人故事會,謝謝!
>>原文連結地址:上一篇:已是最新文章