Scraping Ctrip Flight Information with a Python Crawler and Sending Email Alerts

Background:
I needed to buy a plane ticket and kept searching for fares by hand, which is exactly the kind of problem a crawler can solve.

The reference I worked from explains everything in great detail, and working through the process gave me a basic understanding of the approach.

  • Query flights from Shanghai to Xi'an for 2017-04-29 through 2017-05-02:
#coding:utf-8

import urllib2
from lxml import etree
import json
import random
import sys
reload(sys)
sys.setdefaultencoding('utf8')

def get_json2(date,rk,CK,r):
    '''Fetch the flight data from the constructed search URL.'''
    url= "http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights?DCity1=SHA&ACity1=SIA&SearchType=S&DDate1=%s&IsNearAirportRecommond=0&rk=%s&CK=%s&r=%s"%(date,rk,CK,r)
    headers={'Host':"flights.ctrip.com",'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",'Referer':"http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=2017-04-29"}
    headers['Referer']="http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s"%date
    req=urllib2.Request(url,headers=headers)
    res=urllib2.urlopen(req)
    content=res.read()
    dict_content=json.loads(content,encoding="gb2312")
    length = len(dict_content['fis'])
    # print length
    for i in range(length):
        if ((dict_content['fis'][i][u'lp']) < 600 ):   # lp: lowest price
            print (dict_content['fis'][i][u'lp']),
            print (dict_content['fis'][i][u'dt']),     # dt: departure time
            print (dict_content['fis'][i][u'at'])      # at: arrival time
            #print (dict_content['fis'][i][u'dpbn'])

def get_parameter(date):
    '''Extract the parameters rk, CK and r from the booking page.
    date: departure date, e.g. 2016-05-13
    '''
    url='http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s'%date
    res=urllib2.urlopen(url).read()
    tree=etree.HTML(res)
    pp=tree.xpath('''//body/script[1]/text()''')[0].split()
    CK_original=pp[3][-34:-2]
    CK=CK_original[0:5]+CK_original[13]+CK_original[5:13]+CK_original[14:]
    rk=pp[-1][18:24]
    num=random.random()*10
    num_str="%.15f"%num
    rk=num_str+rk
    r=pp[-1][27:len(pp[-1])-3]
    return rk,CK,r

if __name__=='__main__':
    dates=['2017-04-29','2017-04-30','2017-05-01','2017-05-02']
    for date in dates:
        rk,CK,r=get_parameter(date)
        get_json2(date,rk,CK,r)
        print "-----"
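
Repeated automated requests like this can time out or get blocked, so it may be worth wrapping the fetch in a small retry loop. This is just a sketch; fetch_with_retry is a helper name I made up, not part of the original script:

# -*- coding: utf-8 -*-
import time
import urllib2

def fetch_with_retry(req, retries=3, delay=5):
    '''Try the request a few times before giving up.
    Returns the response body, or None if every attempt fails.'''
    for attempt in range(retries):
        try:
            return urllib2.urlopen(req, timeout=15).read()
        except (urllib2.URLError, IOError) as e:
            print "attempt %d failed: %s" % (attempt + 1, e)
            time.sleep(delay)
    return None

# Inside get_json2, the urlopen/read pair could then become:
#     content = fetch_with_retry(req)
#     if content is None:
#         return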

The email-sending program (I'll add a link to the original source once I track it down):

# -*- coding: utf-8 -*-

from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib

def _format_addr(s):
    '''Encode a "Name <addr>" string so a non-ASCII display name survives in the mail header.'''
    name, addr = parseaddr(s)
    return formataddr((
        Header(name, 'utf-8').encode(),
        addr.encode('utf-8') if isinstance(addr, unicode) else addr))

from_addr = raw_input('From: ')
password = raw_input('Password: ')
to_addr = raw_input('To: ')
smtp_server = raw_input('SMTP server: ')

msg = MIMEText('Not just fly fight...', 'plain', 'utf-8')
msg['From'] = _format_addr(u'Air <%s>' % from_addr)
msg['To'] = _format_addr(u'126.Air <%s>' % to_addr)
msg['Subject'] = Header(u'flight……', 'utf-8').encode()

server = smtplib.SMTP(smtp_server, 25)
server.set_debuglevel(1) # set this to 0 for real use
server.login(from_addr, password)
server.sendmail(from_addr, [to_addr], msg.as_string())
server.quit()
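
One note on the SMTP connection: depending on the provider and account settings, plain SMTP on port 25 may be refused in favour of SSL on port 465, and some providers (QQ mail, for instance) expect an authorization code rather than the account password. A minimal variant using smtplib.SMTP_SSL, assuming the server supports it:

import smtplib

# Same message construction as above; only the connection changes.
server = smtplib.SMTP_SSL(smtp_server, 465)   # SSL from the start instead of plain SMTP
server.set_debuglevel(1)
server.login(from_addr, password)             # may need to be an authorization code
server.sendmail(from_addr, [to_addr], msg.as_string())
server.quit()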

Putting the query code and the email-sending code together gives roughly the following:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib2
from lxml import etree
import json
import random
from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib
import sys
reload(sys)
sys.setdefaultencoding('utf8')

from_addr = "****@126.com" #raw_input('From: ')
password = "******" #raw_input('Password: ')
to_addr = "********@qq.com" #raw_input('To: ')
smtp_server = "smtp.126.com"#raw_input('SMTP server: ')

def _format_addr(s):
    name, addr = parseaddr(s)
    return formataddr(( \
        Header(name, 'utf-8').encode(), \
        addr.encode('utf-8') if isinstance(addr, unicode) else addr))

def get_json2(date,rk,CK,r):
    '''Fetch the flight data from the constructed search URL.'''
    url= "http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights?DCity1=SHA&ACity1=SIA&SearchType=S&DDate1=%s&IsNearAirportRecommond=0&rk=%s&CK=%s&r=%s"%(date,rk,CK,r)
    headers={'Host':"flights.ctrip.com",'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",'Referer':"http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=2017-04-29"}
    headers['Referer']="http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s"%date
    req=urllib2.Request(url,headers=headers)
    res=urllib2.urlopen(req)
    content=res.read()
    dict_content=json.loads(content,encoding="gb2312")
    length = len(dict_content['fis']) 
    # print length
    for i in range(length):
        if ((dict_content['fis'][i][u'lp']) < 600 ):          # lp: lowest price
            print (dict_content['fis'][i][u'lp']),
            print (dict_content['fis'][i][u'dt']),            # dt: departure time
            print (dict_content['fis'][i][u'at']),            # at: arrival time
            print (dict_content['fis'][i][u'dpbn'])           # dpbn: departure airport name
            if ((dict_content['fis'][i][u'lp']) <= 450 ):
                msg = MIMEText(('%r at %s in %s'% ((dict_content['fis'][i][u'lp']),(dict_content['fis'][i][u'dt']),(dict_content['fis'][i][u'dpbn']))),'plain', 'utf-8')
                msg['From'] = _format_addr(u'Air <%s>' % from_addr)
                msg['To'] = _format_addr(u'126.Air <%s>' % to_addr)
                msg['Subject'] = Header(u'flight…%r '%(dict_content['fis'][i][u'lp']), 'utf-8').encode()
                server = smtplib.SMTP(smtp_server, 25)
                server.set_debuglevel(0)
                server.login(from_addr, password)
                server.sendmail(from_addr, [to_addr], msg.as_string())
                server.quit()




def get_parameter(date):
    '''Extract the parameters rk, CK and r from the booking page.
    date: departure date, e.g. 2016-05-13
    '''
    url='http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s'%date
    res=urllib2.urlopen(url).read()
    tree=etree.HTML(res)
    pp=tree.xpath('''//body/script[1]/text()''')[0].split()
    CK_original=pp[3][-34:-2]
    CK=CK_original[0:5]+CK_original[13]+CK_original[5:13]+CK_original[14:]

    rk=pp[-1][18:24]
    num=random.random()*10
    num_str="%.15f"%num
    rk=num_str+rk
    r=pp[-1][27:len(pp[-1])-3]

    return rk,CK,r

if __name__=='__main__':
    dates=['2017-04-29','2017-04-30','2017-05-01']

    for date in dates:
        rk,CK,r=get_parameter(date)
        get_json2(date,rk,CK,r)
        print "-----"

Then use crontab to set up a scheduled job that runs the script every 20 minutes.
The basic format is:

*       *     *    *      *        command
minute  hour  day  month  weekday  command

so,

0,20,40 * * * * python ~/test.py
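
To keep a record of each run (and to see the traceback when something breaks), the output can be redirected to a log file; ~/flight_check.log is just a name I picked:

0,20,40 * * * * python ~/test.py >> ~/flight_check.log 2>&1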

There are still plenty of problems with this; I'm still learning.