1. 程式人生 > gui采集淘寶列表商品[後期有空更新采集各大電商平臺]

gui采集淘寶列表商品[後期有空更新采集各大電商平臺]

accept lan time object nec nal stringvar ble amp

import re
import urllib.parse

import requests
from pymysql import *
from tkinter import *

# Build the main window and its three input fields. The StringVar objects
# (name, to_page, w_page) are module-level so the button callback can read
# whatever the user typed.
window = Tk()
window.title("淘寶列表商品采集")
window.geometry('200x180')

# Search keyword.
Label(window, text='關鍵字采集').pack()
name = StringVar()
Entry(window, textvariable=name).pack()

# First page to scrape.
Label(window, text='采集起始頁').pack()
to_page = StringVar()
Entry(window, textvariable=to_page).pack()

# Last page to scrape (inclusive).
Label(window, text='采集結束頁').pack()
w_page = StringVar()
Entry(window, textvariable=w_page).pack()


class Taobao(object):
    """Scrape Taobao search-result pages for a keyword and store the items in MySQL.

    The constructor precomputes one search URL per requested page; ``run``
    then downloads, parses and saves each page in turn.
    """

    # Field-extraction patterns, compiled once at class-definition time.
    # Each captures the string value of one key embedded in the page source.
    _IMG_RE = re.compile(r'"pic_url":"(//.*?)"')
    _GOODS_RE = re.compile(r'"nid":"(.*?)"')
    _TITLE_RE = re.compile(r'"raw_title":"(.*?)"')
    _COMPANY_RE = re.compile(r'"nick":"(.*?)"')
    _PRICE_RE = re.compile(r'"view_price":"(.*?)"')
    _DEAL_RE = re.compile(r'"view_sales":"(.*?)"')
    _COMMENT_RE = re.compile(r'"comment_count":"(.*?)"')

    def __init__(self, name, to_page, w_page):
        """Prepare the request headers and the list of page URLs.

        Args:
            name: Search keyword (str).
            to_page: First page index to scrape (int).
            w_page: Last page index to scrape, inclusive (int).
        """
        # Percent-encode the keyword so characters such as '&', '#', '{' or
        # '}' can neither corrupt the query string nor be mistaken for
        # str.format placeholders below.
        keyword = urllib.parse.quote(name, safe='')
        self.url = 'https://s.taobao.com/search?q=' + keyword + '&s={}'
        # NOTE(review): 'br' is advertised here; presumably the response is
        # only decoded transparently when a brotli library is installed --
        # confirm against the deployed environment.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

        # The 's' parameter is an item offset in steps of 44.
        # NOTE(review): page index p maps to offset p * 44, so index 0 is the
        # first offset; confirm whether the UI is meant to be 0- or 1-based.
        self.url_list = [
            self.url.format(str(page * 44))
            for page in range(to_page, w_page + 1)
        ]

    def get_data(self, url):
        """Download ``url`` and return the response body decoded as text."""
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def parse_data(self, data):
        """Extract item records from the raw page source.

        Args:
            data: Page source as a string.

        Returns:
            A list of dicts with the keys ``name``, ``img_link``,
            ``goods_link``, ``price``, ``title``, ``company``,
            ``deal_count`` and ``comment_count``. An empty comment count is
            stored as the integer 0; all other values are strings.
        """
        columns = zip(
            self._IMG_RE.findall(data),
            self._GOODS_RE.findall(data),
            self._TITLE_RE.findall(data),
            self._COMPANY_RE.findall(data),
            self._PRICE_RE.findall(data),
            self._DEAL_RE.findall(data),
            self._COMMENT_RE.findall(data),
        )

        data_list = []
        # zip() stops at the shortest match list, so a page where one field
        # has fewer matches yields fewer records instead of an IndexError.
        for img, nid, title, company, price, deal_count, comment_count in columns:
            if comment_count == "":
                comment_count = 0
            data_list.append({
                'name': '淘寶',
                'img_link': "http:" + img,
                'goods_link': "https://detail.tmall.com/item.htm?id=" + nid,
                'price': price,
                'title': title,
                'company': company,
                'deal_count': deal_count,
                'comment_count': comment_count,
            })
        return data_list

    def save_data(self, data_list):
        """Insert every record of ``data_list`` into the ``data`` table.

        Any exception is appended to ``log.txt`` rather than propagated, so
        one failing page does not abort the whole run.

        Args:
            data_list: Records as returned by ``parse_data``.
        """
        # Placeholders are filled in by the driver, which escapes the scraped
        # text; interpolating it into the SQL string broke on any title
        # containing a double quote and allowed SQL injection.
        sql = (
            'insert into data(name,goods_link,img_link,title,price,company,'
            'deal_count,comment_count) values(%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        conn = None  # stays None when Connect() itself fails
        try:
            conn = Connect(host="127.0.0.1", user="root", password="root", database="data_list", port=3306,
                           charset="utf8")
            cursor = conn.cursor()
            try:
                for data in data_list:
                    # execute() returns the number of affected rows.
                    count = cursor.execute(sql, (
                        data['name'], data['goods_link'], data['img_link'], data['title'], data['price'],
                        data['company'], data['deal_count'], data['comment_count']))
                    print(count)
            finally:
                cursor.close()
            # Commit all inserts for this page at once.
            conn.commit()
        except Exception as e:
            # Write the error to the log file instead of raising.
            with open('log.txt', 'a', encoding='utf-8') as f:
                f.write(repr(e) + '\n')
        finally:
            # Only close a connection that was actually opened.
            if conn is not None:
                conn.close()

    def run(self):
        """Fetch, parse and save every page in ``self.url_list``."""
        for url in self.url_list:
            data = self.get_data(url)
            data_list = self.parse_data(data)
            self.save_data(data_list)


def main():
    """Button callback: read the form fields and run one scraping job.

    Reads the keyword and the two page numbers from the module-level
    StringVar objects, builds a ``Taobao`` scraper and runs it. If either
    page field is not an integer, an error dialog is shown and nothing is
    scraped.
    """
    # Imported locally so the callback does not rely on a file-level import.
    from tkinter import messagebox

    keyword = str(name.get())
    try:
        first_page = int(to_page.get())
        last_page = int(w_page.get())
    except ValueError:
        # Empty or non-numeric page fields would otherwise surface only as a
        # traceback on stderr, leaving the user with no feedback.
        messagebox.showerror("輸入錯誤", "起始頁和結束頁必須是整數")
        return

    scraper = Taobao(keyword, first_page, last_page)
    scraper.run()

if __name__ == '__main__':
    # Add the confirm button, which triggers main(), then hand control to
    # the Tk event loop.
    Button(window, text="確定", relief='groove', width=9, height=1, bd=4, command=main).pack()
    window.mainloop()


技術分享圖片





gui采集淘寶列表商品[後期有空更新采集各大電商平臺]