
Scraping ithome news with a Python crawler and saving it to a local database

This script crawls the news links on the IT之家 (ithome.com) homepage, reads each article, and stores it in a local database.
It is not very efficient; pointers from more experienced readers are welcome.
from bs4 import BeautifulSoup
import urllib.request
import re
import pymysql

# Connect to the local MySQL database.
conn = pymysql.connect(host='localhost', user='root', passwd='', db='myblog', charset='utf8')
cur = conn.cursor()

# Fetch the homepage with a browser User-Agent. (The original snippet used
# `url` without defining it; the homepage is what the post describes crawling.)
url = 'http://www.ithome.com/'
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1)')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
file = opener.open(url).read().decode('gbk')

# Collect every article URL linked from the homepage.
title1 = r'http://www\.ithome\.com/html/[a-z]+/[0-9]*\.htm'
a = re.findall(title1, file)

rehtml = re.compile('<[^<]*>')  # matches HTML tags, used to strip markup below

for i in range(0, len(a)):
    strdata = urllib.request.urlopen(a[i]).read().decode('gbk')
    soup = BeautifulSoup(strdata, 'html.parser')

    # The article title is the page's <h1>.
    title = str(soup.h1.string).encode('utf8')

    # The article body is the element with class "post_content".
    content = None
    for string in soup.find_all(class_='post_content'):
        content = string
    content = str(content)

    # Remove the in-article ad blocks (class "yj_d") before stripping tags.
    for adstring in soup.find_all(class_='yj_d'):
        content = content.replace(str(adstring), '')

    # Strip the remaining HTML tags and collapse the resulting blank lines.
    content = rehtml.sub('\n', content)
    content = re.compile('\n+').sub(' ', content)
    content = content.encode('utf8')

    sql = ("INSERT INTO `myblog`.`article` (`id`, `title`, `content`, `author`, `time`) "
           "VALUES (NULL, %s, %s, 'admin', '2016-03-08');")
    cur.execute(sql, [title, content])
    conn.commit()

conn.close()
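For anyone trying to run this: the post never shows the `article` table, so here is a minimal setup sketch with column types inferred from the INSERT statement above. The types and lengths are assumptions, not the author's actual schema.

import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='', db='myblog', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS `article` (
        `id`      INT AUTO_INCREMENT PRIMARY KEY,  -- the INSERT passes NULL, so id is assumed auto-increment
        `title`   VARCHAR(255) NOT NULL,           -- assumed length
        `content` TEXT NOT NULL,
        `author`  VARCHAR(64) NOT NULL,            -- assumed length
        `time`    DATE NOT NULL
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()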
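On the efficiency question: almost all of the time goes into downloading the article pages one at a time. A minimal sketch of fetching them concurrently with the standard-library concurrent.futures module, assuming the same list `a` of article URLs collected above (fetch_page and crawl_pages are hypothetical helper names, not part of the original post):

import concurrent.futures
import urllib.request

def fetch_page(article_url):
    # Download one article page; decoding as gbk matches the crawler above.
    with urllib.request.urlopen(article_url) as resp:
        return article_url, resp.read().decode('gbk')

def crawl_pages(urls, workers=8):
    # Download pages in parallel; parsing and the INSERTs can stay in a single
    # thread afterwards, since network wait dominates the total runtime.
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        return dict(pool.map(fetch_page, urls))

Committing once after the loop (or batching rows with cur.executemany) instead of calling conn.commit() per article would also cut the round trips to MySQL.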