如何在python3中將網頁爬蟲資料儲存到mysql資料庫

如何在python3中將網頁爬蟲資料儲存到mysql資料庫

前兩篇文章都在說在py中用BeautfulSoup爬取本地網頁的事情,本來準備去真實網頁試一下的,但是老林說不如把你之前學的mysql資料庫溫習一下,順道學著把你現在爬到的網頁存取到mysql資料庫之中~
由此 本文的主題就出現了:

如何在python3中將網頁爬蟲資料儲存到mysql資料庫

先小小插播一下:為何標題強調python3!
因為py2與py3連線資料庫時用的不是一個庫!
PyMySQL 是在 Python3.x 版本中用於連線 MySQL 伺服器的一個庫,
Python2中則使用MySQLdb。

from bs4 import BeautifulSoup
import pymysql

# Scrape listing data from a local HTML page (the page used in the
# previous articles of this series).
def getData(path='D:/Study/Data Analysis/week1/1_2/1_2answer_of_homework/1_2_homework_required/index.html'):
  """Parse the local HTML file and return a list of listing dicts.

  Each dict holds: 'address' (image src), 'price', 'title',
  'amount' (first stripped string of the ratings count tag) and
  'stars' (number of filled star icons).

  The ``path`` parameter defaults to the original hard-coded file,
  so existing callers keep working unchanged.
  """
  datalist = []
  # Explicit encoding: without it, open() falls back to the locale
  # encoding (e.g. cp936/cp950 on a Chinese Windows box), which can
  # garble or fail on a UTF-8 page.  TODO confirm the file is UTF-8.
  with open(path, 'r', encoding='utf-8') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')
    # Plural names so the per-item loop variables below do not
    # shadow the selected result lists.
    addresses = soup.select('body > div > div > div.col-md-9 > div > div > div > img')
    prices = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
    titles = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
    amounts = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right')
    star_tags = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')
    for address, price, title, amount, stars in zip(addresses, prices, titles, amounts, star_tags):
      data = {
        'address': address.get('src'),
        'price': price.get_text(),
        'title': title.get_text(),
        'amount': list(amount.stripped_strings)[0],
        # Star rating = count of filled-star glyphicons in the tag.
        'stars': len(stars.find_all("span", class_='glyphicon glyphicon-star')),
      }
      datalist.append(data)
    return datalist


# Create (or recreate) the MySQL table that stores the scraped rows.
def mysql_create():
  """Drop and recreate the ``schoolsheet`` table in the ``school`` db.

  NOTE(review): the schema makes ``price`` the primary key and
  ``title`` unique, so duplicate prices/titles will be rejected on
  insert — confirm that is intended.
  """
  # charset='utf8' so Chinese text is stored/read without mojibake.
  db = pymysql.connect(host='localhost', port=3306, user='root',
                       password='123', db='school', charset='utf8')
  # Dropped the stray AUTO_INCREMENT=12 from the original DDL: the
  # table has no auto-increment column, so the clause was a leftover
  # from a copied table dump and had no effect.
  sql_create = """CREATE TABLE schoolsheet(
price VARCHAR(10),
title VARCHAR(50),
amount VARCHAR(265),
stars VARCHAR(265),
address VARCHAR(265),
PRIMARY KEY (`price`),
UNIQUE KEY `title`(`title`))ENGINE=InnoDB DEFAULT CHARSET=utf8"""
  try:
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS schoolsheet")
    cursor.execute(sql_create)
    db.commit()
  finally:
    db.close()  # release the connection even if a statement fails


# Insert the scraped rows into the MySQL table.
def IntoMysql(datalist):
  """Insert every dict in ``datalist`` into ``schoolsheet``.

  Uses a parameterized query instead of %-formatting the values into
  the SQL string: scraped text may contain quotes, which would break
  the hand-built statement (and is a SQL-injection vector).
  """
  # charset='utf8' so Chinese text survives the round trip.
  db = pymysql.connect(host='localhost', port=3306, user='root',
                       password='123', db='school', charset='utf8')
  sql = ("INSERT INTO schoolsheet(price,title,amount,stars,address)"
         "VALUES (%s,%s,%s,%s,%s)")
  try:
    cursor = db.cursor()
    for row in datalist:
      # pymysql escapes/quotes each bound value itself.
      cursor.execute(sql, (row['price'], row['title'], row['amount'],
                           row['stars'], row['address']))
    db.commit()  # one transaction for the whole batch
  finally:
    db.close()  # release the connection even if an insert fails


# Run the scrape → create table → insert pipeline only when this file
# is executed as a script, not when it is imported as a module.
if __name__ == '__main__':
  datalist = getData()
  mysql_create()
  IntoMysql(datalist)

一切領悟都在程式碼之中,還不太熟的我準備再多敲一敲領會一下,沒有太多講解,沒準等我領會完再來更,畢竟不懂就不瞎BB了!!!

萌星一枚,大神輕噴!!!