1. 程式人生 > >python使用requests庫和re庫寫的京東商品信息爬蟲

python使用requests庫和re庫寫的京東商品信息爬蟲

fin 搜索 goods tle 爬取 val timeout stat for

 1 import requests
 2 import re
 3  
 4 def getHTMLText(url):
 5     try:
 6         r = requests.get(url, timeout=30)
 7         r.raise_for_status()
 8         r.encoding = r.apparent_encoding
 9         return r.text
10     except:
11         return ""
12      
def parsePage(ilt, html, keyword="筆盒"):
    """Extract [price, title] pairs from a JD search-result page.

    Args:
        ilt: List that receives one ``[price, title]`` entry per product;
            it is extended in place.
        html: Page text to scan. Empty or ``None`` input adds nothing.
        keyword: Search term that the page wraps in
            ``<font class="skcolor_ljg">`` inside each product title.
            Defaults to the term the original script searched for.

    Prices and titles are paired in document order. When the two lists
    differ in length, only the pairs available in both are stored.
    """
    # NOTE(review): the default keyword is written in Traditional Chinese in
    # this copy of the script; confirm it matches the text the site returns.
    if not html:
        return

    # The capture group yields the number directly, so no second regex pass
    # over each price snippet is needed.
    prices = re.findall(
        r'data-done="1"><em>¥</em><i>(\d+\.\d+)</i></strong>', html
    )
    title_tags = re.findall(
        r'<em>.+<font class="skcolor_ljg">' + re.escape(keyword) + r'</font>.+</em>',
        html,
    )

    for price, title_tag in zip(prices, title_tags):
        # Keep only the CJK characters of the title; this drops the HTML
        # markup and any ASCII text in one step.
        title = "".join(re.findall(r'[\u4e00-\u9fa5]', title_tag))
        ilt.append([price, title])


def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table with a header row.

    Args:
        ilt: List of ``[price, title]`` entries.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序號", "價格", "商品名稱"))
    for count, g in enumerate(ilt, start=1):
        print(tplt.format(count, g[0], g[1]))


def main():
    """Search JD for a fixed keyword and print price/title for each hit."""
    goods = "筆盒"
    depth = 3
    start_url = "https://search.jd.com/Search?keyword=" + goods + "&enc=utf-8"
    infoList = []
    for i in range(1, depth):
        try:
            # Requests page numbers 1, 3, ... (presumably the site uses odd
            # values for successive result pages; confirm).
            url = start_url + "&page=" + str(2 * i - 1)
            html = getHTMLText(url)
            parsePage(infoList, html, goods)
        except Exception:
            # A failure on one page must not stop the remaining pages.
            continue
    printGoodsList(infoList)


if __name__ == "__main__":
    main()

技術分享圖片

下面附上參考源碼(來源:慕課)。原來的爬蟲是爬淘寶首頁商品,不過現在淘寶首頁要登錄驗證,不能直接爬取,但是仍具有參考價值。

import requests
import re
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or a single space ``" "`` when the request
        fails (connection error, timeout, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into an exception so they take the failure path.
        r.raise_for_status()
        # Decode with the encoding detected from the body rather than the
        # one announced in the headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        return " "
    
def parsePage(ilt, html):
    """Extract [price, title] pairs from a Taobao search-result page.

    The page embeds its data as JSON-like text; this scans for the
    ``"view_price":"..."`` and ``"raw_title":"..."`` fields.

    Args:
        ilt: List that receives one ``[price, title]`` entry per product;
            it is extended in place. Both values are strings.
        html: Page text to scan. Empty or ``None`` input adds nothing.

    Prices and titles are paired in document order. When the two lists
    differ in length, only the pairs available in both are stored.
    """
    import json

    if not html:
        return

    # Capture groups return the value between the quotes directly. The
    # original split each match on ':' and ran eval() on the remainder, which
    # executed scraped text as Python and truncated any title containing ':'.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    raw_titles = re.findall(r'"raw_title":"(.*?)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode escape sequences such as \uXXXX without evaluating code.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as found.
            title = raw_title
        ilt.append([price, title])

def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table.

    A header row is written first, then one numbered row per entry of
    *ilt*, where each entry is a ``[price, title]`` pair.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序號", "價格", "商品名稱")
    print(header)
    # Row numbers start at 1.
    for index, item in enumerate(ilt, start=1):
        print(row_format.format(index, item[0], item[1]))
        
def main():
    """Search Taobao for a fixed keyword and print price/title for each hit."""
    goods = "書包"
    depth = 2  # number of result pages to crawl
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []
    for i in range(depth):
        try:
            # "s" advances by 44 per page (presumably the number of items
            # the site lists per result page; confirm).
            url = start_url + "&s=" + str(44 * i)
            html = getHTMLText(url)  # download the page text
            parsePage(infoList, html)  # pull the wanted fields out of it
        except Exception:
            # A failure on one page must not stop the remaining pages.
            continue
    printGoodsList(infoList)  # print everything that was collected


if __name__ == "__main__":
    main()

python使用requests庫和re庫寫的京東商品信息爬蟲