Python web spider: scraping Sina Tech articles (BeautifulSoup + MySQL)
阿新 • Published: 2018-12-22
The past few days of work have paid off: the scraper for Sina Tech articles is finally finished, and it will keep crawling Sina Tech articles until there is no new content left.
For more details you can follow my GitHub: https://github.com/libp/WebSpider
If you want the database table schema, just leave your email address.
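For reference, the tbl_peng_article table only needs the columns used by the INSERT statement in the script below. Here is a minimal sketch of such a table; the column types and lengths are my assumptions, not the original schema:

# Minimal sketch of the article table, inferred from the INSERT statement in the
# script below; column types and lengths are assumptions, not the original schema.
import MySQLdb

DDL = """
CREATE TABLE IF NOT EXISTS tbl_peng_article (
    id         INT AUTO_INCREMENT PRIMARY KEY,
    title      VARCHAR(255),
    author     VARCHAR(64),
    content    MEDIUMTEXT,
    createTime DATETIME,
    getTime    DATETIME,
    url        VARCHAR(255),
    webname    VARCHAR(32)
) DEFAULT CHARSET=utf8;
"""

conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                       passwd='root', db='nichuiniu', charset='utf8')
cur = conn.cursor()
cur.execute(DDL)
conn.commit()
cur.close()
conn.close()

The full spider script follows.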
# -*- coding: utf-8 -*-
__author__ = 'Peng'

from bs4 import BeautifulSoup, Comment
import urllib2
from urllib2 import urlopen, HTTPError
import MySQLdb
import json
import datetime
import logging
import sys
import re
import time

# Send log output to the console
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    stream=sys.stdout)


def spiderSinaTech(url, webname):
    """Fetch one article and insert it into the database."""
    conn = getConn()
    cur = conn.cursor()
    data = getSinaArticle(url, webname)
    if data is None:
        # The target page could not be parsed
        return -1
    result = 0
    try:
        sqlInsertArticle = ("insert into tbl_peng_article "
                            "(title,author,content,createTime,getTime,url,webname) "
                            "values (%s,%s,%s,%s,%s,%s,%s)")
        result = cur.execute(sqlInsertArticle,
                             (data['title'], data['author'], data['article'],
                              data['published_time'], data['getTime'],
                              data['url'], data['webname']))
    except MySQLdb.Error as e:
        print("MySQL Error %d: %s" % (e.args[0], e.args[1]))
    conn.commit()
    cur.close()
    conn.close()
    return result


def getSinaArticle(url, webname):
    """Parse a Sina Tech article page and return its fields as a dict."""
    # Dict used to hold the function's return values
    data = {'url': url, 'title': '', 'published_time': '', 'getTime': '',
            'author': '', 'article': '', 'webname': webname}
    # Request headers
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
               "Accept": "*/*"}
    # Open the page
    try:
        request = urllib2.Request(url, headers=headers)
        html = urlopen(request)
    except HTTPError as e:
        print(e)
        return None
    # Read the page and build the parse tree
    soup = BeautifulSoup(html.read(), "lxml")
    # Remove HTML comments
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # Strip JavaScript
    for s in soup('script'):
        s.extract()
    try:
        # Title
        data['title'] = soup.find(id="main_title").get_text()
    except AttributeError:
        # No main_title element, so this is not an article page
        return None
    # Publication time, e.g. 2017-06-03T11:31:53+08:00 (ISO 8601 with a fixed +08:00 offset)
    published_time = soup.find(property="article:published_time")['content']
    UTC_FORMAT = "%Y-%m-%dT%H:%M:%S+08:00"
    data['published_time'] = datetime.datetime.strptime(published_time, UTC_FORMAT)
    # Author
    data['author'] = soup.find(property="article:author")['content']
    # Article body: drop the image wrapper tags first
    content = soup.find(id="artibody")
    for del_img in content.find_all(class_="img_wrapper"):
        del_img.decompose()
    # Concatenate the remaining child nodes into the text that will be stored
    article = ""
    for child in soup.find(id="artibody").contents:
        article += str(child)
    data['article'] = article
    # Note: datetime objects are not JSON serializable, so returning
    # json.dumps(data) here would need a custom date encoder.
    # Time the article was scraped
    data['getTime'] = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    return data


def getConn():
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='nichuiniu',
        charset='utf8',
    )
    return conn


def GOSina(url, webname):
    """Scan one page for article links and scrape every link not yet stored."""
    # Request headers
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
               "Accept": "*/*"}
    # Open the page
    try:
        request = urllib2.Request(url, headers=headers)
        html = urlopen(request)
    except HTTPError as e:
        print(e)
        return 0
    # Read the page and build the parse tree
    soup = BeautifulSoup(html.read(), "lxml")
    conn = getConn()
    cur = conn.cursor()
    # List of article URLs that were inserted into the database
    L = []
    for link in soup.findAll("a", href=re.compile(r'(.*?)(tech)(.*?)(\d{4}-\d{2}-\d{2})(/doc-ify)')):
        if 'href' in link.attrs:
            # Extract the URL from href and strip the paging parameters
            xurl = re.compile(r'(.*?shtml)').search(link.attrs['href']).group(1)
            # Parameterized query avoids quoting problems in the URL
            result = cur.execute("select * from tbl_peng_article where url=%s", (xurl,))
            conn.commit()
            if result == 0:
                rs = spiderSinaTech(xurl, webname)
                if rs > 0:
                    logging.info("---- URL inserted into database: %s" % xurl)
                    L.append(xurl)
                    time.sleep(2)
                elif rs == -1:
                    logging.info("**** URL content could not be parsed: %s" % xurl)
            else:
                logging.info("&&&& URL already in database: %s" % xurl)
    cur.close()
    conn.close()
    # Return the last inserted URL, or 0 if nothing new was found
    if L:
        return L[-1]
    else:
        return 0


logging.info("begin spider sina tech")
url = "http://tech.sina.com.cn/it/2017-06-07/doc-ifyfuzny3756083.shtml"
webname = "sina"
x = GOSina(url, webname)
if x != 0:
    GOSina(x, webname)
logging.info("end spider sina tech")
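As written, the entry point above only follows one hop from the seed URL. To get the behaviour described at the top of the post (keep crawling until there is nothing new), the last few lines could be turned into a loop. A minimal sketch, assuming GOSina keeps returning the last newly stored URL and 0 once nothing new is found:

logging.info("begin spider sina tech")
# Re-seed GOSina with the last URL it stored until it returns 0 (nothing new)
seed = "http://tech.sina.com.cn/it/2017-06-07/doc-ifyfuzny3756083.shtml"
while seed != 0:
    seed = GOSina(seed, "sina")
logging.info("end spider sina tech")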