爬蟲——BeautifulSoup 淘寶模特資訊爬取
阿新 • • 發佈:2019-01-27
(僅供參考)
# Scrapes the first page of the Taobao model ("MM") top list with requests and
# BeautifulSoup, prints each model's name, age, location and avatar URL, then
# loads each model's personal page in PhantomJS and prints the picture-domain
# URL found there.
import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Working directory created at start-up. Nothing in this script writes to it.
Path_Dir = "D:\\Pachong\\shuju"
if not os.path.exists(Path_Dir):
    # makedirs (not mkdir) so a missing parent directory is created as well.
    os.makedirs(Path_Dir)

# Location of the PhantomJS binary used to render the model pages.
# NOTE(review): webdriver.PhantomJS was deprecated in Selenium 3.x and appears
# to be absent from Selenium 4 -- confirm the installed selenium version.
PhantomJS_Path = 'D:\\Pachong\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'

# First page of the top list. The literal must not end in whitespace, or the
# space becomes part of the "page" query value.
Url_Base = 'http://mm.taobao.com/json/request_top_list.htm?page=1'

# Seconds to wait for the top-list request; without it requests can block
# forever on an unresponsive server.
Request_Timeout = 10


def _Absolute_Url(Url):
    """Return Url with an explicit scheme.

    URLs that already start with http:// or https:// are returned unchanged
    (apart from surrounding whitespace); anything else is treated as
    protocol-relative and gets an "https:" prefix.
    """
    Url = Url.strip()
    if Url.startswith(("http://", "https://")):
        return Url
    return "https:" + Url


def _Child_Text(Parent_Tag, Child_Name):
    """Return the text of the first Child_Name tag under Parent_Tag, or ""."""
    Child_Tag = Parent_Tag.find(Child_Name)
    if Child_Tag is None:
        return ""
    return Child_Tag.get_text()


def Get_Pictures(MM_Name, MM_Page_Url):
    """Print the picture-domain URL shown on one model's personal page.

    Args:
        MM_Name: Name of the model. Kept for interface compatibility; the
            function does not use it.
        MM_Page_Url: Link to the model's page as taken from the top list
            (normally protocol-relative). Nothing happens when it is empty
            or None.

    Returns:
        None. The URL is printed when the page contains the domain-info box.
    """
    if not MM_Page_Url:
        return

    Driver = webdriver.PhantomJS(executable_path=PhantomJS_Path)
    try:
        Driver.get(_Absolute_Url(MM_Page_Url))
        Soup_Html = BeautifulSoup(Driver.page_source, "html.parser")
    finally:
        # Always terminate the PhantomJS process; one is started per call.
        Driver.quit()

    Domain_Box = Soup_Html.find("div", attrs={"class": "mm-p-info mm-p-domain-info"})
    if Domain_Box is None:
        return
    Domain_Span = Domain_Box.find("span")
    if Domain_Span is None:
        return
    print(_Absolute_Url(Domain_Span.get_text()))


def _Fetch_Top_List(Url):
    """Download the top list at Url and return one tuple per model.

    Each tuple is (name, page_url, age, address, avatar_url). The fields come
    from three separate tag queries; they are combined with zip, so when the
    queries return different numbers of tags the result is truncated to the
    shortest one instead of raising IndexError.
    """
    Html_Response = requests.get(Url, timeout=Request_Timeout).text
    Soup_Resp = BeautifulSoup(Html_Response, "html.parser")

    Name_Tags = Soup_Resp.find_all("a", attrs={"class": "lady-name"})
    MM_Name_List = [Tag.get_text().strip() for Tag in Name_Tags]
    MM_Page_List = [Tag.get("href") for Tag in Name_Tags]

    Head_Links_List = []
    for Tag in Soup_Resp.find_all("a", attrs={"class": "lady-avatar"}):
        Img_Tag = Tag.find("img")
        Head_Links_List.append(Img_Tag.get("src") if Img_Tag is not None else None)

    Info_Tags = Soup_Resp.find_all("p", attrs={"class": "top"})
    MM_Age_List = [_Child_Text(Tag, "em") for Tag in Info_Tags]
    MM_Address_List = [_Child_Text(Tag, "span") for Tag in Info_Tags]

    return list(zip(MM_Name_List, MM_Page_List, MM_Age_List,
                    MM_Address_List, Head_Links_List))


def main():
    """Print every model on the first top-list page, then her page's picture URL."""
    for Name, Page_Url, Age, Address, Head_Link in _Fetch_Top_List(Url_Base):
        print("MM名稱:%s;\tMM年齡:%s;\tMM居住地:%s;\tMM頭像:%s" % (Name, Age, Address, Head_Link))
        Get_Pictures(Name, Page_Url)


if __name__ == '__main__':
    main()