
Python web crawler: scraping Sina Tech articles (BeautifulSoup + MySQL)

The past few days of work have paid off: the crawler for Sina Tech is finished. It will keep crawling Sina Tech articles until there is nothing new left to fetch.

If you want to know more, check out my GitHub: https://github.com/libp/WebSpider

If you want the database table structure, leave your email address~
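If you just need something to run the code against, the sketch below is one possible definition of tbl_peng_article, inferred only from the columns used in the INSERT statement in spiderSinaTech further down; the column types and lengths are guesses, not the table I actually use, so adjust them to taste. createTime holds the article's published time and getTime the time it was scraped.

# A possible schema for tbl_peng_article, inferred from the INSERT in spiderSinaTech.
# Column types and lengths here are assumptions; adjust them to your own needs.
import MySQLdb

DDL = """
CREATE TABLE IF NOT EXISTS tbl_peng_article (
    id         INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    title      VARCHAR(255) NOT NULL,
    author     VARCHAR(128),
    content    MEDIUMTEXT,
    createTime DATETIME,
    getTime    DATETIME,
    url        VARCHAR(512) NOT NULL,
    webname    VARCHAR(64)
) DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                       passwd='root', db='nichuiniu', charset='utf8')
cur = conn.cursor()
cur.execute(DDL)
conn.commit()
cur.close()
conn.close()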

# -*- coding: utf-8 -*-


__author__ = 'Peng'
from bs4 import BeautifulSoup,Comment
import urllib2
from urllib2 import urlopen,HTTPError
import MySQLdb
import json
import datetime
import logging
import sys
import re
import time

# send log output to the console (stdout)
logging.basicConfig(level=logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S',
                stream=sys.stdout)


def spiderSinaTech(url, webname):
    data = getSinaArticle(url, webname)
    if data is None:
        # the target page could not be parsed
        return -1

    conn = getConn()
    cur = conn.cursor()
    result = 0
    try:
        sqlInsertArticle = ("insert into tbl_peng_article "
                            "(title,author,content,createTime,getTime,url,webname) "
                            "values (%s,%s,%s,%s,%s,%s,%s)")
        result = cur.execute(sqlInsertArticle, (data['title'], data['author'], data['article'],
                                                data['published_time'], data['getTime'],
                                                data['url'], data['webname']))
    except MySQLdb.Error as e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    conn.commit()
    cur.close()
    conn.close()
    return result


def getSinaArticle(url, webname):
    # dict holding the scraped values that will be returned for storage
    dict = {'url': url, 'title': '', 'published_time': '', 'getTime': '', 'author': '', 'article': '', 'webname': webname}

    # request headers
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
               "Accept": "*/*"}

    # fetch the page
    try:
        dict['url'] = url
        request = urllib2.Request(url, headers=headers)
        html = urlopen(request)
    except HTTPError as e:
        print(e)
        return None
    # read the page and parse it into a document tree
    soup = BeautifulSoup(html.read(), "lxml")

    # strip HTML comments
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()

    # strip <script> tags
    [s.extract() for s in soup('script')]

    try:
        # title
        title = soup.find(id="main_title").get_text()
        # print(title)
        dict['title'] = title
    except AttributeError:
        # no main_title element: not an article page this parser understands
        return None

    # published time
    published_time = soup.find(property="article:published_time")['content']
    # e.g. 2017-06-03T11:31:53+08:00 -- ISO 8601 with a fixed +08:00 offset;
    # Python 2's strptime has no %z, so the offset is matched literally
    # print(published_time)
    UTC_FORMAT = "%Y-%m-%dT%H:%M:%S+08:00"
    dict['published_time'] = datetime.datetime.strptime(published_time, UTC_FORMAT)

    # author
    author = soup.find(property="article:author")['content']
    # print(author)
    dict['author'] = author

    # article body
    content = soup.find(id="artibody")
    img = content.find_all(class_="img_wrapper")
    # remove image wrapper tags from the document tree
    for del_img in img:
        del_img.decompose()

    # child nodes (paragraphs) of the article body
    paragraph = content.contents

    # the article HTML that will be stored in the database
    article = ""
    for child in paragraph:
        article += str(child)
    # print(article)
    dict['article'] = article
    # print json.dumps(dict)
    # datetime objects are not JSON-serializable, so dumping the dict
    # directly would require a custom date encoder
    # return json.dumps(dict)

    # time the article was scraped
    dict['getTime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return dict

def getConn():
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='nichuiniu',
        charset='utf8',
    )
    return conn

def GOSina(url, webname):
    # request headers
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
               "Accept": "*/*"}

    # fetch the listing page
    try:
        request = urllib2.Request(url, headers=headers)
        html = urlopen(request)
    except HTTPError as e:
        print(e)
        return 0
    # read the page and parse it into a document tree
    soup = BeautifulSoup(html.read(), "lxml")
    conn = getConn()
    cur = conn.cursor()
    # list of article URLs inserted into the database during this pass
    L = []
    for link in soup.findAll("a", href=re.compile(r'(.*?)(tech)(.*?)(\d{4}-\d{2}-\d{2})(/doc-ify)')):
        if 'href' in link.attrs:
            # extract the article URL from href and strip any pagination parameters after .shtml
            xurl = re.compile(r'(.*?shtml)').search(link.attrs['href']).group(1)
            sqlQueryUrl = "select * from tbl_peng_article where url=%s"
            # print link.attrs['href']
            result = cur.execute(sqlQueryUrl, (xurl,))
            conn.commit()
            if result == 0:
                rs = spiderSinaTech(xurl, webname)
                if rs > 0:
                    logging.info("----URL has been inserted into database: %s" % xurl)
                    L.append(xurl)
                    time.sleep(2)
                elif rs == -1:
                    logging.info("****URL content cannot be parsed: %s" % xurl)
            else:
                logging.info("&&&&URL already in database: %s" % xurl)
    cur.close()
    conn.close()
    # return the last inserted URL so the next pass can start from it;
    # return 0 when nothing new was found, which stops the crawl
    if L:
        return L[-1]
    else:
        return 0

logging.info("begin spider sina tech")
url="http://tech.sina.com.cn/it/2017-06-07/doc-ifyfuzny3756083.shtml"
webname="sina"
x = GOSina(url,webname)
if x!= 0:
    GOSina(x,webname)

logging.info("end spider sina tech")