1. 程式人生 > 爬取關於LPL的微信文章並存入資料庫

爬取關於LPL的微信文章並存入資料庫

import requests
from bs4 import BeautifulSoup
import time
import pymysql


def get_HTML(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The decoded page text, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    # A browser-like User-Agent is sent instead of the requests default.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        # Report which URL failed and why; callers treat None as "no page".
        print('request failed for %s: %s' % (url, err))
        return None
    # Decode with the charset detected from the body rather than the one
    # announced in the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text


def get_url(html, list):
    """Append the article links found in a search-result page to *list*.

    Args:
        html: Markup of the result page. ``None`` or an empty string (a
            failed download) is accepted and leaves *list* untouched.
        list: Mutable list that receives the ``href`` of every anchor
            matching ``.txt-box h3 a``, in document order. The parameter
            name shadows the builtin; it is kept so existing callers that
            pass it by keyword keep working.

    Returns:
        None. The result is delivered by mutating *list*.
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'html.parser')
    # Requiring the href attribute in the selector skips anchors that
    # would otherwise raise KeyError on item['href'].
    list.extend(anchor['href'] for anchor in soup.select('.txt-box h3 a[href]'))


def _strip_layout(text):
    """Return *text* with every newline and space character removed."""
    return text.replace('\n', '').replace(' ', '')


def _parse_article(html):
    """Extract ``(title, author, content)`` from an article page.

    Returns ``None`` when *html* is empty or the page has no title or
    author element, so the caller can skip it instead of crashing.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    title_tags = soup.select('h2.rich_media_title')
    author_tags = soup.select('a#js_name')
    if not title_tags or not author_tags:
        return None
    # get_text() is used instead of .string because .string is None when
    # the tag holds more than a single text node.
    title = _strip_layout(title_tags[0].get_text())
    author = _strip_layout(author_tags[0].get_text())
    # Only paragraphs with a non-empty .string contribute to the content.
    paragraphs = soup.select('.rich_media_content p')
    content = ''.join(str(p.string) for p in paragraphs if p.string)
    return title, author, content


def get_Text(list, delay=3):
    """Download each article in *list* and yield its extracted fields.

    Args:
        list: Iterable of article URLs. The parameter name shadows the
            builtin; it is kept for compatibility with existing callers.
        delay: Seconds to pause after handling each URL.

    Yields:
        ``(title, author, content)`` tuples of strings. Title and author
        have all newlines and spaces removed. URLs that cannot be
        downloaded or parsed are reported on stdout and skipped.
    """
    for url in list:
        article = _parse_article(get_HTML(url))
        if article is None:
            print('skipping %s: page unavailable or missing title/author' % url)
        else:
            print(*article)
            yield article
        # Pause between requests, including after a skipped URL.
        time.sleep(delay)


def insertDATABASES(title, author, content):
    """Insert one article into the ``wechat_article`` table.

    A new connection to the local ``spiders`` database is opened for each
    call and closed before returning. The table is expected to exist
    already, e.g. created with::

        CREATE TABLE IF NOT EXISTS wechat_article(
            title VARCHAR(255), author VARCHAR(255), content text)

    Args:
        title: Article title.
        author: Name of the publishing account.
        content: Article body text.

    Returns:
        None. A failed INSERT is rolled back and reported on stdout rather
        than raised; errors while connecting still propagate.
    """
    # NOTE(review): credentials are hard-coded here; consider moving them
    # to configuration.
    db = pymysql.connect(host='localhost', user='root', password='yellowkk', port=3306, db='spiders', charset='utf8')
    sql = 'INSERT INTO wechat_article(title,author,content) values(%s,%s,%s)'
    try:
        # Values are passed separately so the driver escapes them.
        with db.cursor() as cursor:
            cursor.execute(sql, (title, author, content))
        db.commit()
    except pymysql.MySQLError as err:
        db.rollback()
        print('insert failed for %r: %s' % (title, err))
    finally:
        # Close the connection even if commit or rollback raised.
        db.close()


def main(query='lpl'):
    """Crawl Sogou's WeChat article search for *query* and store the hits.

    Downloads the first result page, collects the article links on it,
    then fetches every article and inserts it into the database.

    Args:
        query: Search term sent to the search page; URL-encoded before use.
    """
    from urllib.parse import urlencode

    params = [('type', 2), ('s_from', 'input'), ('query', query)]
    search_url = 'http://weixin.sogou.com/weixin?' + urlencode(params)

    html = get_HTML(search_url)
    if not html:
        # get_HTML returns None on failure; there is nothing to parse.
        print('could not download the search results; nothing to store')
        return

    article_urls = []
    get_url(html, article_urls)
    for title, author, content in get_Text(article_urls):
        insertDATABASES(title, author, content)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()