# Crawl WeChat articles about LPL and store them in a database
# (original post by 阿新, published 2018-12-10)
"""Crawl WeChat articles about LPL from Sogou search results and store
them in a local MySQL database (db ``spiders``, table ``wechat_article``)."""
import time

import requests
from bs4 import BeautifulSoup
import pymysql


def get_HTML(url):
    """Fetch *url* and return the decoded page text, or None on failure."""
    hd = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=hd)
        r.raise_for_status()
        # Sogou/WeChat pages are not always served as UTF-8; trust the
        # content-sniffed encoding before reading the text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as exc:
        # Was a bare `except:` that printed the meaningless '11111' and
        # fell through to an implicit None; report the real error instead.
        print(f'request failed for {url}: {exc}')
        return None


def get_url(html, url_list):
    """Parse a Sogou search-result page and append each article URL to *url_list*.

    Mutates *url_list* in place (matches the original call style in main()).
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.select('.txt-box h3 a'):
        url_list.append(item['href'])


def get_Text(url_list):
    """Yield (title, author, content) for every article URL in *url_list*.

    Articles that fail to download or lack the expected markup are skipped
    instead of crashing the whole run. Sleeps 3s between articles to be
    polite to the server.
    """
    for url in url_list:
        html = get_HTML(url)
        if html is None:
            # Download failed; previously BeautifulSoup(None) would raise.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.select('h2.rich_media_title')
        authors = soup.select('a#js_name')
        if not titles or not authors:
            continue  # unexpected page layout (e.g. deleted article); skip
        # Strip the newlines/spaces WeChat pads around heading text.
        # `.string` is None when the tag has nested markup; guard with ''.
        title = (titles[0].string or '').replace('\n', '').replace(' ', '')
        author = (authors[0].string or '').replace('\n', '').replace(' ', '')
        # str.join avoids the quadratic `content += ...` of the original.
        content = ''.join(
            str(p.string)
            for p in soup.select('.rich_media_content p')
            if p.string
        )
        print(title, author, content)
        yield title, author, content
        time.sleep(3)  # throttle between article fetches


def insertDATABASES(title, author, content):
    """Insert one article row into spiders.wechat_article.

    Commits on success, rolls back on any MySQL error, and always closes
    the connection (the original leaked it if execute/commit raised).
    """
    db = pymysql.connect(host='localhost', user='root', password='yellowkk',
                         port=3306, db='spiders', charset='utf8')
    try:
        cursor = db.cursor()
        # Parameterized query: pymysql escapes the values, so article text
        # containing quotes cannot break (or inject into) the statement.
        sql = 'INSERT INTO wechat_article(title,author,content) values(%s,%s,%s)'
        try:
            cursor.execute(sql, (title, author, content))
            db.commit()
        except pymysql.MySQLError:
            db.rollback()
    finally:
        db.close()


def main():
    """Search Sogou for 'lpl' WeChat articles, scrape each, store in MySQL."""
    text_url_list = []
    url = 'http://weixin.sogou.com/weixin?type=2&s_from=input&query=lpl'
    html = get_HTML(url)
    if html is None:
        return  # search page unreachable; nothing to do
    get_url(html, text_url_list)
    for title, author, content in get_Text(text_url_list):
        insertDATABASES(title, author, content)


if __name__ == '__main__':
    main()