
Python Kindle daily blog push, part 2: crawling blog content with Python
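The script below is the crawler half of the pipeline: it fetches the OSChina blog index, follows each recommended-post link, extracts the post title and body text with BeautifulSoup, and saves every post as a .txt file under ./data/ for the later Kindle push step. Note that it is written for Python 2: urllib2, print statements, and the reload(sys) / setdefaultencoding hack do not exist in Python 3. A hedged Python 3 sketch follows the listing.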

#!/usr/bin/env python
#coding=utf-8
#
#   Copyright 2017 liuxinxing
#

from bs4 import BeautifulSoup
import urllib2

import datetime
import PyRSS2Gen
import re
import sys

# Python 2 hack: make UTF-8 the implicit codec so mixed str/unicode code works.
reload(sys)
sys.setdefaultencoding('utf-8')


class RssSpider(object):
    def __init__(self):
        # RSS feed metadata; this script only initialises it and never
        # actually writes oschina.xml.
        self.myrss = PyRSS2Gen.RSS2(title='OSChina',
                                    link='http://my.oschina.net',
                                    description=str(datetime.date.today()),
                                    pubDate=datetime.datetime.now(),
                                    lastBuildDate=datetime.datetime.now(),
                                    items=[])
        self.xmlpath = r'./oschina.xml'
        self.baseurl = "http://www.oschina.net/blog"
    def useragent(self, url):
        """Fetch a URL with a browser User-Agent so the site does not reject us."""
        i_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
                     "Referer": 'http://baidu.com/'}
        req = urllib2.Request(url, headers=i_headers)
        return urllib2.urlopen(req).read()

    def enterpage(self, url):
        """Fetch a single blog post; return its title and plain-text body."""
        rsp = self.useragent(url)
        soup = BeautifulSoup(rsp, "html.parser")
        ititle = soup.title.string
        print ititle
        div = soup.find('div', {'class': 'BlogContent'})
        doc = div.get_text()    # raises AttributeError if the post layout changed
        return ititle, doc

    def getcontent(self):
        """Walk the blog index page and save every linked post."""
        rsp = self.useragent(self.baseurl)
        soup = BeautifulSoup(rsp, "html.parser")
        ul = soup.find('div', {'id': 'topsOfRecommend'})
        for div in ul.findAll('div', {'class': 'box-aw'}):
            alink = div.find('a')
            if alink is not None:
                link = alink.get('href')
                print link
                if self.isbloglink(link):
                    title, doc = self.enterpage(link)
                    self.savefile(title, doc)

    def isbloglink(self, link):
        """True if the URL points at an individual blog post."""
        return re.search(r"/blog/", link) is not None

    def savefile(self, title, doc):
        """Write the post body to ./data/<title>.txt (the directory must exist)."""
        title = title.replace('/', '_')      # a slash in the title would break the path
        with open("./data/" + title + ".txt", 'w') as f:
            f.write(doc.encode('utf-8'))     # get_text() returns unicode; encode it for file I/O



if __name__=='__main__':
    rssSpider=RssSpider()
    rssSpider.getcontent()
    # rssSpider.enterpage("https://my.oschina.net/diluga/blog/1501203")
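
For reference, here is a minimal sketch of the same crawler ported to Python 3. It assumes the third-party requests and beautifulsoup4 packages, and it reuses the selectors from the script above (topsOfRecommend, box-aw, BlogContent), which were taken from OSChina's markup at the time of writing and may no longer match the live site.

#!/usr/bin/env python3
# Minimal Python 3 sketch: requests replaces urllib2, and the unicode
# hacks (reload(sys), setdefaultencoding) are no longer needed.
import os
import re

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
    "Referer": "http://baidu.com/",
}
BASE_URL = "http://www.oschina.net/blog"


def fetch(url):
    """Download a page; requests decodes the body to str for us."""
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return resp.text


def crawl():
    """Walk the blog index and save each linked post under ./data/."""
    soup = BeautifulSoup(fetch(BASE_URL), "html.parser")
    index = soup.find("div", {"id": "topsOfRecommend"})
    if index is None:               # page layout changed; nothing to do
        return
    os.makedirs("data", exist_ok=True)
    for box in index.find_all("div", {"class": "box-aw"}):
        alink = box.find("a")
        if alink is None:
            continue
        link = alink.get("href", "")
        if not re.search(r"/blog/", link):
            continue
        page = BeautifulSoup(fetch(link), "html.parser")
        title = page.title.string if page.title and page.title.string else "untitled"
        body = page.find("div", {"class": "BlogContent"})
        if body is None:
            continue
        with open("data/" + title.replace("/", "_") + ".txt",
                  "w", encoding="utf-8") as f:
            f.write(body.get_text())


if __name__ == "__main__":
    crawl()

Because Python 3 file objects opened in text mode with encoding="utf-8" accept str directly, the manual encode/decode juggling from the Python 2 version disappears entirely.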