
Scraping the data we need from JD.com with a crawler

# -*- coding: utf-8 -*-
# __author__ = 'Administrator'


from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import urllib2
import time
from compiler.ast import flatten  # Python 2 only; flattens nested lists
import re
import xlwt
reload(sys)
sys.setdefaultencoding('utf-8')


driver = webdriver.Firefox()
driver.get('https://fresh.jd.com/')
# The first div index in this XPath has to be filled in manually for the target category.
driver.find_element_by_xpath('/html/body/div[4]/div/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div/a').click()
windows = driver.window_handles
driver.switch_to.window(windows[-1])  # the click opened the goods list in a new window
driver.switch_to.window(windows[0])
driver.close()  # close the original window, keep only the goods-list window
driver.switch_to.window(windows[-1])
time.sleep(2)
# Read the total number of goods in the category from the page header.
pages = driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div/span').text
pages = pages.encode('utf-8')
pages = int(pages)
page = pages / 60 + 1  # 60 goods per listing page
all_url_goods = []  # URLs of all goods
name = []           # product names, collected further below (must be initialised outside the page loop)
for aa in range(1, page + 1):  # +1 so the last listing page is not skipped
    # ***** The cat=12218,12221 value has to be filled in manually.
    a = 'https://list.jd.com/list.html?cat=12218,12221&page='
    b = '&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main'
    x = '%s%d%s' % (a, aa, b)
    jd = urllib2.urlopen(x)  # fetch one listing page of the fresh-food category
    html = jd.read()
    soup = BeautifulSoup(html, 'html.parser')
    p_names = soup.select('div[class="p-name"]')
    new_list1 = []
    for i in range(len(p_names)):
        try:
            # Extract the item URL; one listing page holds up to 60 of them.
            b = re.findall('">\n<a href="(.*) target=', str(p_names[i]))[0].decode()
            new_list1.append(b)
        except Exception:
            pass  # skip entries that do not match
    print('page %s done' % aa)
    all_url_goods.append(new_list1)
print('URL collection finished, fetching spec names')
all_url_goods = flatten(all_url_goods)
# all_url_goods now holds every item URL from every listing page
for url in all_url_goods:
    url = url.replace('"', '')
    xx = 'http:%s' % url
    goods = urllib2.urlopen(xx)  # fetch each item's page
    html = goods.read()
    soup = BeautifulSoup(html, 'lxml')
    list1 = soup.findAll(attrs={'data-sku': True})
    goods_url = []
    for j in range(len(list1)):
        sku = re.findall('data-sku="(.*)" data-value="', str(list1[j]))  # extract the SKU id
        goods_url.append(sku)
    goods_url = flatten(goods_url)
    for j in range(len(goods_url)):
        last_url = 'https://item.jd.com/%d.html' % int(goods_url[j])
        html = urllib2.urlopen(last_url)  # fetch the detail page of each specification
        soup = BeautifulSoup(html, 'lxml')
        last_list = soup.select('div[class="sku-name"]')  # the product name lives here
        for k in range(len(last_list)):
            name.append(last_list[k].string)
print(name)
print(len(name))
work_excel = xlwt.Workbook()
sheet1 = work_excel.add_sheet(u'sheet1', cell_overwrite_ok=True)
for i in range(len(name)):
    sheet1.write(i, 0, name[i])
work_excel.save('xinxianshuiguo.xls')
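As an aside, the regexes that match against str(...) above are fragile; BeautifulSoup can read the href and data-sku attributes directly. A minimal sketch, assuming html already holds the bytes of a fetched listing page (the variable names are illustrative, not from the script):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')

# Pull the item links and SKU ids straight from the tag attributes
# instead of regex-matching the serialized markup.
hrefs = [a['href'] for a in soup.select('div.p-name a') if a.has_attr('href')]
skus = [tag['data-sku'] for tag in soup.find_all(attrs={'data-sku': True})]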

Pointers from more experienced readers are welcome. One question: for data that loads asynchronously, is there a way to avoid a bare time.sleep()? Is there another approach, something like Selenium's explicit waits?
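For reference, a minimal sketch of the explicit wait mentioned above: WebDriverWait polls for a condition instead of pausing for a fixed interval. The XPath is reused from the script, and the 10-second timeout is an arbitrary choice.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get('https://fresh.jd.com/')

# Wait up to 10 seconds for the element to appear in the DOM, polling
# periodically, rather than always pausing with time.sleep(2).
wait = WebDriverWait(driver, 10)
span = wait.until(EC.presence_of_element_located(
    (By.XPATH, '/html/body/div[7]/div[1]/div[1]/div[1]/div/span')))
print(span.text)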