程式人生 > 爬蟲——BeautifulSoup 淘寶模特資訊爬取

爬蟲——BeautifulSoup 淘寶模特資訊爬取

                                            (僅供參考)

import os
import requests
from bs4 import BeautifulSoup

from selenium import webdriver

# Directory where scraped data is stored.
Path_Dir = "D:\\Pachong\\shuju"
# Create the directory if needed. exist_ok=True makes this idempotent and
# race-free, and makedirs also creates missing parent directories — the
# original if/exists/else + os.mkdir would crash if "D:\Pachong" was absent.
os.makedirs(Path_Dir, exist_ok=True)

def Get_Pictures(MM_Name, MM_Page_Url):
    """Open one model's detail page in headless PhantomJS and print the
    URL of her personal album page, if the page exposes one.

    :param MM_Name: model display name (unused here; kept so the caller's
        interface is unchanged)
    :param MM_Page_Url: protocol-relative href ("//mm.taobao.com/...")
        taken from the listing page
    """
    Driver = webdriver.PhantomJS(
        executable_path='D:\\Pachong\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
    try:
        Driver.get("https:" + MM_Page_Url)
        Soup_Html = BeautifulSoup(Driver.page_source, "html.parser")
        # The personal-page link sits inside the "domain info" box.
        MM_Pics_Url = Soup_Html.find(
            "div", attrs={"class": "mm-p-info mm-p-domain-info"})
        if MM_Pics_Url:
            MM_Pics_Url_Text = "https:" + MM_Pics_Url.find("span").get_text()
            print(MM_Pics_Url_Text)
    finally:
        # BUG FIX: the original never closed the driver, leaking one
        # PhantomJS process per model scraped.
        Driver.quit()


if __name__ == '__main__':
    MM_Name_List = []
    Head_Links_List = []
    MM_Age_List = []
    MM_Address_List = []
    MM_Page_List = []
    # BUG FIX: the original URL literal ended with a trailing space, which
    # requests percent-encodes into the query string.
    Url_Base = 'http://mm.taobao.com/json/request_top_list.htm?page=1'
    Html_Response = requests.get(Url_Base).text
    Soup_Resp = BeautifulSoup(Html_Response, "html.parser")

    # Model names and the links to their detail pages.
    for item in Soup_Resp.find_all("a", attrs={"class": "lady-name"}):
        MM_Name_List.append(item.get_text().strip())
        MM_Page_List.append(item.get("href"))

    # Avatar image URLs.
    for item in Soup_Resp.find_all("a", attrs={"class": "lady-avatar"}):
        Head_Links_List.append(item.find("img").get("src"))

    # Age (<em>) and home town (<span>) share one <p class="top"> block,
    # so both lists are filled from the same loop and stay in lockstep.
    for item in Soup_Resp.find_all("p", attrs={"class": "top"}):
        MM_Age_List.append(item.find("em").get_text())
        MM_Address_List.append(item.find("span").get_text())

    # NOTE(review): the four lists are assumed to be the same length —
    # i.e. every listing entry has a name, avatar and info block. Verify
    # against the live page; a missing field would raise IndexError here.
    for i in range(len(MM_Name_List)):
        print("MM名稱:%s;\tMM年齡:%s;\tMM居住地:%s;\tMM頭像:%s" % (
            MM_Name_List[i], MM_Age_List[i],
            MM_Address_List[i], Head_Links_List[i]))
        Get_Pictures(MM_Name_List[i], MM_Page_List[i])