程式人生 > 萌新爬蟲瑟瑟發抖1(指令碼)

萌新爬蟲瑟瑟發抖1(指令碼)

from bs4 import BeautifulSoup
import requests

# Target pages: a personal "Saves" page and the New York attraction list.
url_saves = 'https://cn.tripadvisor.com/Saves#37685322'
url = 'https://www.tripadvisor.cn/Attractions-g60763-Activities-New_York_City_New_York.html'
# Paginated list pages encode the result offset as "-oa<N>-" in the path.
# The original template had no "{}" placeholder, so .format() was a no-op and
# all 30 generated URLs were identical; the fragment also contained a stray
# space ("ATTRACTION _LIST").  Both fixed below.
# NOTE(review): the "-oa{}-" position follows TripAdvisor's usual URL scheme —
# confirm against a live page.
urls = ['https://www.tripadvisor.cn/Attractions-g60763-Activities-oa{}-New_York_City_New_York.html#ATTRACTION_LIST'.format(i)
        for i in range(30, 930, 30)]

# Request headers: a desktop Chrome User-Agent plus the logged-in session
# cookie (required for the personal Saves page).
# Fixes vs. original: a comma was missing between the 'User-Agent' value and
# the 'Cookie' key (syntax error), and the Cookie value was one string literal
# broken across two lines without quotes (unterminated string).  The cookie is
# now two adjacent string literals joined by implicit concatenation; its bytes
# are unchanged.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Cookie': 'TAUnique=%1%enc%3AdibCuK98eOf%2FIbRWllITfZBr1qC4reTpbZRzaJW8tbc2jHwltRJPGQ%3D%3D; TASSK=enc%3AANgIRvpTIdCO%2FoPg9knSqh1eKOkT6rvwmIKbn0sozrNyLpNibUPy7wiOc%2FZYxt9R4DZAT%2B4FhaQh06nUuLbbEXod61MPl9xftla4pXiPR41%2F%2BiFgd%2Fw9Qzua5Cl3ldTi3A%3D%3D; TAPD=tripadvisor.cn; __gads=ID=2d5d89279523853f:T=1512888157:S=ALNI_MYpgmD_FsQfll6r9om0JEmKpro3aw; ki_t=1512478874932%3B1512998357506%3B1512998513381%3B3%3B4; ki_r=; ServerPool=B; TATravelInfo=V2*A.2*MG.-1*HP.2*FL.3*RVL.60763_346l105127_346*RS.1; TAReturnTo=%1%%2FAttraction_Review-g60763-d105127-Reviews-Central_Park-New_York_City_New_York.html; roybatty=TNI1625!AGJVA9rBu7V97HCM%2FYnOBXB3CcRSgplT0OKBLyaYV3QjKv4mt4bFLICfnUdWu5uvG3dz2TWp%2FF%2BgppHy5kTIAFChPFs4fsiggM2zo1L1%2FczHDOpirx4ZXxES%2FagOtHJEZr5B2hOdymscD4UG9j6Pi2NzCkrnbosnnNI0eTUHNEC%2F%2C1; _ga=GA1.2.1995199171.1512478869; _gid=GA1.2.120677595.1512998346; TASession=%1%V2ID.EEB4B71284DEBC505463D3A309A2DABA*SQ.8*LP.%2FAttractions-g60763-Activities-New_York_City_New_York%5C.html*LS.Attraction_Review*GR.21*TCPAR.38*TBR.34*EXEX.66*ABTR.87*PHTB.36*FS.42*CPU.18*HS.recommended*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*LF.zhCN*FA.1*DF.0*MS.-1*RMS.-1*FLO.60763*TRA.true*LD.105127; '
    'CM=%1%HanaPersist%2C%2C-1%7CPremiumMobSess%2C%2C-1%7Ct4b-pc%2C%2C-1%7CHanaSession%2C%2C-1%7CRestAds%2FRPers%2C%2C-1%7CRCPers%2C%2C-1%7CWShadeSeen%2C%2C-1%7CFtrPers%2C%2C-1%7CTheForkMCCPers%2C%2C-1%7CHomeASess%2C%2C-1%7CPremiumSURPers%2C%2C-1%7CPremiumMCSess%2C%2C-1%7CCpmPopunder_1%2C1%2C1513084733%7CRestPremRSess%2C%2C-1%7CCCSess%2C%2C-1%7CPremRetPers%2C%2C-1%7CViatorMCPers%2C%2C-1%7Csesssticker%2C%2C-1%7CPremiumORSess%2C%2C-1%7Ct4b-sc%2C%2C-1%7CRestAdsPers%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS2%2C%2C-1%7Cb2bmcpers%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS%2C%2C-1%7CPremMCBtmSess%2C%2C-1%7CPremiumSURSess%2C%2C-1%7CLaFourchette+Banners%2C%2C-1%7Csess_rev%2C%2C-1%7Csessamex%2C%2C-1%7CPremiumRRSess%2C%2C-1%7CSaveFtrPers%2C%2C-1%7CSPMCSess%2C%2C-1%7CTheForkORSess%2C%2C-1%7CTheForkRRSess%2C%2C-1%7Cpers_rev%2C%2C-1%7CMetaFtrSess%2C%2C-1%7CRBAPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_PERSISTANT%2C%2C-1%7CFtrSess%2C%2C-1%7CRestAds%2FRSess%2C%2C-1%7CHomeAPers%2C%2C-1%7CPremiumMobPers%2C%2C-1%7CRCSess%2C%2C-1%7CLaFourchette+MC+Banners%2C%2C-1%7CRestAdsCCSess%2C%2C-1%7CRestPremRPers%2C%2C-1%7Csh%2C%2C-1%7Cpssamex%2C%2C-1%7CTheForkMCCSess%2C%2C-1%7CCCPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_SESSION%2C%2C-1%7Cb2bmcsess%2C%2C-1%7CSPMCPers%2C%2C-1%7CPremRetSess%2C%2C-1%7CViatorMCSess%2C%2C-1%7CPremiumMCPers%2C%2C-1%7CPremiumRRPers%2C%2C-1%7CRestAdsCCPers%2C%2C-1%7CTheForkORPers%2C%2C-1%7CPremMCBtmPers%2C%2C-1%7CTheForkRRPers%2C%2C-1%7CSaveFtrSess%2C%2C-1%7CPremiumORPers%2C%2C-1%7CRestAdsSess%2C%2C-1%7CRBASess%2C%2C-1%7Cperssticker%2C%2C-1%7CMetaFtrPers%2C%2C-1%7C; TAUD=LA-1513083037070-1*RDD-1-2017_12_12*LG-366835-2.1.F.*LD-366836-.....'
}

def get_attractions(url, data=None):
    """Fetch one attraction-list page and print a dict per attraction.

    Parameters:
        url: attraction-list page URL to scrape.
        data: unused; kept only for interface compatibility.

    Fixes vs. original: ``wb_data.text.'lxml'`` (dot instead of comma — a
    syntax error), the loop body was not indented inside the ``for``, and
    ``titles.get_text()`` called ``get_text`` on the whole result list
    instead of the current ``title`` element.
    """
    wb_data = requests.get(url)
    # NOTE(review): 'lxml' needs the lxml package installed; 'html.parser'
    # is the stdlib fallback if it is not.
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('div.property_title > a[target="_blank"]')
    imgs = soup.select('img[width="200"]')
    cates = soup.select('div.p13n_reasoning_v2')
    for title, img, cate in zip(titles, imgs, cates):
        data = dict(title=title.get_text(),
                    img=img.get('src'),
                    meta=list(cate.stripped_strings))
        print(data)

def get_favs(url, data=None):
    """Fetch the logged-in Saves page and print a dict per saved place.

    Parameters:
        url: unused — the body always fetches the module-level ``url_saves``
             (kept as written; the parameter exists for interface parity with
             ``get_attractions``).
        data: when ``None`` (the default), scrape and print; any other value
              skips the printing loop.

    Fixes vs. original: ``wb_data.text.'lxml'`` (dot instead of comma — a
    syntax error), ``titles.get_text()`` called on the result list instead of
    the current element, and ``== None`` replaced with ``is None``.
    """
    # Uses the module-level `headers` (with the session Cookie) so the
    # personal Saves page is accessible.
    wb_data = requests.get(url_saves, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # NOTE(review): 'sizedThumd' looks like a typo for 'sizedThumb', and an
    # <img> with class 'location-name' is unusual — selectors kept verbatim;
    # verify against the live page markup.
    titles = soup.select('div.photo > div.sizedThumd > img.location-name')
    imgs = soup.select('img.photo_image')
    metas = soup.select('span.format_address')

    if data is None:
        for title, img, meta in zip(titles, imgs, metas):
            data = dict(title=title.get_text(),
                        img=img.get('src'),
                        meta=list(meta.stripped_strings))
            print(data)

# Trailing standalone script: print the paginated URL list, then fetch one
# page and print every 'photo_image' <img> src found on it.
print(urls)

# NOTE(review): the imports and `headers`/`url` below duplicate the ones at
# the top of the file; kept so this tail still runs as a standalone snippet.
from bs4 import BeautifulSoup
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}

url = 'https://www.tripadvisor.cn/Attractions-g60763-Activities-New_York_City_New_York.html'

# NOTE(review): this fetches `url_saves` (the personal Saves page) right after
# redefining `url`, and with headers that carry no Cookie — the Saves page
# likely needs the login session. Kept as written; confirm whether `url` was
# intended here.
wb_data = requests.get(url_saves, headers=headers)
# was: BeautifulSoup(wb_data.text.'lxml') — dot instead of comma (syntax error)
soup = BeautifulSoup(wb_data.text, 'lxml')
imgs = soup.select('img.photo_image')
for i in imgs:
    print(i.get('src'))