
Web Scraping: A Few Simple Scripts

This was my first week working with web scraping. After a quick start with the tutorial at https://scrapy-chs.readthedocs.io/zh_CN/latest/intro/tutorial.html, I wrote a few simple crawlers under Master Ma's guidance.
Since I had never touched web development, HTML, CSS, or XPath before, reading page source was quite a struggle. At first I also had no idea how to locate elements and took plenty of detours, so I am writing this down as a record.
Before using the Scrapy library, I first watched Professor Song Tian's (Beijing Institute of Technology) videos on Bilibili, but those mainly cover the Requests library, the re library, and Beautiful Soup. Typing the code along with them, I only half understood it; after all, I had mostly forgotten regular expressions. I am posting part of that source code here for anyone who might need it.
1. Scraping the Chinese university rankings:

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return ""   
def fillUnivList(ulist,html):
    soup=BeautifulSoup(html,"html.parser")
    # walk the rows of the ranking table; skip the non-Tag children between <tr> tags
    for tr in soup.find("tbody").children:
        if isinstance(tr,bs4.element.Tag):
            tds=tr("td")
            ulist.append([tds[0].string,tds[1].string,tds[2].string])

def printUnivList(ulist,num):
    print("{:^10}\t{:^6}\t{:^10}".format("排名","學校名稱","總分"))
    for i in range(num):
        u=ulist[i]
        print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2]))

def main():
    uinfo=[]
    url="http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    html=getHTMLText(url)
    fillUnivList(uinfo,html)
    printUnivList(uinfo,20)

main()
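A side note on the traversal above: find("tbody").children yields every child node, including the whitespace strings between <tr> tags, which is exactly why the isinstance check is needed. Here is a minimal, runnable sketch on a hand-written table (the HTML is made up for illustration):

from bs4 import BeautifulSoup
import bs4

# a made-up table fragment, standing in for the ranking page
demo_html="""
<table><tbody>
<tr><td>1</td><td>清華大學</td><td>95.9</td></tr>
<tr><td>2</td><td>北京大學</td><td>82.6</td></tr>
</tbody></table>
"""
soup=BeautifulSoup(demo_html,"html.parser")
for tr in soup.find("tbody").children:
    if isinstance(tr,bs4.element.Tag):      # skip the newline NavigableStrings
        tds=tr("td")                        # tr("td") is shorthand for tr.find_all("td")
        print([td.string for td in tds])    # ['1', '清華大學', '95.9'] etc.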

2. Scraping stock market information:

import requests
import traceback
from bs4 import BeautifulSoup
import re

def getHTMLText(url,code='utf-8'):
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()
        r.encoding=code  # pass in the known encoding to skip the costly apparent_encoding guess
        return r.text
    except:
        return ""

def getStockList(lst,stockURL):
    html=getHTMLText(stockURL)
    soup=BeautifulSoup(html,'html.parser')
    a=soup.find_all('a')
    for i in a:
        try:
            href=i.attrs['href']
            # stock codes look like sh600000 / sz000001: 'sh' or 'sz' plus six digits
            lst.append(re.findall(r'[s][hz]\d{6}',href)[0])
        except:
            continue

def getStockInfo(lst,stockURL,fpath):
    for stock in lst:
        url=stockURL+stock+".html"
        html=getHTMLText(url)
        try:
            if html=="":
                continue
            infoDict={}
            soup=BeautifulSoup(html,'html.parser')
            stockInfo=soup.find('div',attrs={'class':'stock-bets'})

            name=stockInfo.find_all(attrs={'class':'bets-name'})[0]
            infoDict.update({'stockname':name.text.split()[0]})

            keyList=stockInfo.find_all('dt')
            valueList=stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key=keyList[i].text
                val=valueList[i].text
                infoDict[key]=val
            with open(fpath,'a',encoding='utf-8') as f:
                f.write(str(infoDict)+'\n')
        except:
            traceback.print_exc()
            continue
def main():
    stock_list_url='http://quote.eastmoney.com/stocklist.html'
    stock_info_url='https://gupiao.baidu.com/stock/'
    output_file='e://baidustockinfo.txt'
    slist=[]
    getStockList(slist,stock_list_url)
    getStockInfo(slist,stock_info_url,output_file)

main()
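The key step in getStockList is the regular expression r'[s][hz]\d{6}': the letter 's', then 'h' or 'z', then six digits, i.e. Shanghai/Shenzhen stock codes such as sh600000. A minimal sketch on made-up hrefs:

import re

hrefs=['http://quote.eastmoney.com/sh600000.html',   # made-up examples of listing-page links
       'http://quote.eastmoney.com/sz000001.html',
       'http://quote.eastmoney.com/about.html']
for href in hrefs:
    print(re.findall(r'[s][hz]\d{6}',href))   # ['sh600000'], ['sz000001'], []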

3. Scraping Taobao product prices

#!usr/bin/python
# -*- coding: utf-8 -*-
import requests
import re

def getHTMLText(url):
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return ""

def parsePage(ilt,html):
    try:
        # the search page embeds its data as JSON-like text, e.g. "view_price":"29.90"
        plt=re.findall(r'\"view_price\"\:\"[\d\.]*\"',html)
        tlt=re.findall(r'\"raw_title\"\:\".*?\"',html)
        for i in range(len(plt)):
            price=eval(plt[i].split(':')[1])   # eval strips the surrounding quotes
            title=eval(tlt[i].split(':')[1])
            ilt.append([price,title])
    except:
        print("")

def printGoodsList(ilt):
    tlpt="{:4}\t{:8}\t{:16}"
    print(tlpt.format("num","price","goodsname"))
    count=0
    for g in ilt:
        count=count+1
        print(tlpt.format(count,g[0],g[1]))

def main():
    goods='school bag'
    depth=2
    start_url='https://s.taobao.com/search?q='+goods
    infoList=[]
    for i in range(depth):
        try:
            url=start_url+'&s='+str(48*i)
            html=getHTMLText(url)
            parsePage(infoList,html)
        except:
            continue
    printGoodsList(infoList)

main()
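To see what parsePage extracts, here is a minimal sketch run against a hand-written fragment of the embedded JSON (the data is made up; parsePage is the function defined above):

demo_html='..."raw_title":"canvas school bag"..."view_price":"29.90"...'
result=[]
parsePage(result,demo_html)
print(result)   # [['29.90', 'canvas school bag']]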

Using the Scrapy library:
scrapy startproject name
scrapy genspider name domain
In the spider file:
class ...
def parse(self, response): ...
Here response is the downloaded page; response.xpath('...') returns a list-like SelectorList, and .extract() converts it into a plain list of strings. At first I could not use xpath() fluently and wasted quite a bit of time; in the beginning I could not even tell what xpath() returns versus what xpath().extract() returns. With shaky foundations, some pits simply have to be stepped in.
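A minimal sketch of the difference, using a standalone scrapy Selector on a hand-written snippet (inside a spider, response.xpath behaves the same way):

from scrapy import Selector

sel=Selector(text='<ul><li><div><a href="/a.html" title="Doc A">A</a></div><div>2017-12-27</div></li></ul>')
print(sel.xpath('//li/div/a/@href'))                 # a SelectorList of Selector objects
print(sel.xpath('//li/div/a/@href').extract())       # ['/a.html'] -- a plain list of strings
print(sel.xpath('//li/div[2]/text()').extract()[0])  # '2017-12-27'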
Since there are quite a few policy documents, I first scrape the URLs of those documents (in later study I expect to find a more efficient approach). Here is the code that collects the URLs:

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
class DmozSpider(scrapy.Spider):
    name = "extr"
    allowed_domains = ["szkj.gov.cn"]  # the dmoz.org value left over from the tutorial did not match the target site
    start_urls = [
        'http://www.szkj.gov.cn/news/20171227/tfpmwnpw-0jt7-zwta-wffv-5o1ar32e34.html',
    ]

#    def parse(self,response):
#        filename=response.url.split('/')[-2]
#        with open(filename,'wb')as f:
#            f.write(response.body)
#            
#/html/body/div[4]/div[1]/div/ul
#/html/body/div[4]/div[1]/div/ul/li[1]
#/html/body/div[4]/div[1]/div/ul/li[1]/div[1]
#/html/body/div[4]/div[1]/div/ul/li[1]/div[1]/a
#/html/body/div[4]/div[1]/div/ul/li[1]/div[2] time
#/html/body/div[4]/div[1]/div/ul/li[1]/div[2]
    def parse(self, response):
        i=0
        doc=[]
        file_object=open(r"E:\python\try\art\url.txt","a+",encoding='utf-8')
        for father in response.xpath('/html/body/div[4]/div[1]/div/ul/li'):
            if "2017" in father.xpath('./div[2]/text()').extract()[0]:
                item={}   # was `dict`, renamed to avoid shadowing the builtin
                print("====================%d============================="%i)
                art_url=father.xpath('./div[1]/a/@href').extract()[0]
                item['url']=art_url
                print(art_url)
                file_object.write(art_url)
                file_object.write('\t')
                title=father.xpath('./div[1]/a/@title').extract()[0]
                item['title']=title
                print(title)
                file_object.write(title)
                file_object.write('\t')
                time=father.xpath('./div[2]/text()').extract()[0]
                item['time']=time
                print(time)
                file_object.write(time)
                file_object.write('\n')
                doc.append(item)
                i=i+1
        file_object.close()

        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        for d in doc:
#            print(d["url"]+"     "+d["title"]+"        "+d["time"])
            print("\'"+"http://www.szkj.gov.cn"+d["url"]+"\',")



Then, following these links, scrape the content of each policy document and write it to the local disk. The code:

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage

import scrapy
class DmozSpider(scrapy.Spider):
    name = "art"
    allowed_domains = ["szkj.gov.cn"]  # again corrected from the tutorial's dmoz.org
    start_urls = [
        'http://www.szkj.gov.cn/news/20171227/tfpmwnpw-0jt7-zwta-wffv-5o1ar32e34.html',
        'http://www.szkj.gov.cn/news/20171117/hjojwmv5-yrwm-jaxe-izic-x9b151x2iz.html',
        'http://www.szkj.gov.cn/news/20171027/2rjrxx0w-nw76-ru14-b2zd-gegfavg5hn.html',
        'http://www.szkj.gov.cn/news/20171018/o0hknmkk-rcll-pffs-k5vw-4b5w65u3sw.html',
        'http://www.szkj.gov.cn/news/20170929/14ftvjse-90e4-7a2o-0s6w-gy3zm42v82.html',
        'http://www.szkj.gov.cn/news/20170927/j0z3mctd-0zaz-uhd7-lw66-bzdt6owoaa.html',
        'http://www.szkj.gov.cn/news/20170920/xe5v6qrk-40ip-i1ie-f19q-0afi5pbakf.html',
        'http://www.szkj.gov.cn/news/20170825/ltehbi1t-atph-esk3-8nvo-qjvo3edxfg.html',
        'http://www.szkj.gov.cn/news/20170722/zok9eaum-nbpm-fg7q-9dst-i8y1kfkbm3.html',
        'http://www.szkj.gov.cn/news/20170721/ltqa0c66-lu96-fvh5-pu6h-7byyzvifnn.html',
        'http://www.szkj.gov.cn/news/20170630/jhgxoro3-rwys-9hw7-dlhr-ssxizm6yg6.html',
        'http://www.szkj.gov.cn/news/20170414/9uso4cya-9clp-khwc-72u6-sszwklb6fp.html',
        'http://www.szkj.gov.cn/news/20170109/fac7j3kh-hzgz-yfwd-qjwd-pvxu3ubzsk.html'
    ]




#    def parse(self,response):
#        filename=response.url.split('/')[-2]
#        with open(filename,'wb')as f:
#            f.write(response.body)
#            
#/html/body/div[4]/div[1]/div/div[2]
#/html/body/div[4]/div[1]/div/div[2]/div[1]
#
#
#/html/body/div[4]/div[1]/div      main
#/html/body/div[4]/div[1]/div/h1    title
#/html/body/div[4]/div[1]/div/div[1]   time
#/html/body/div[4]/div[1]/div/div[2]  content
#
#
#/html/body/div[4]/div[1]/div/div[2]
#/html/body/div[4]/div[1]/div/div[2]/div[5]/text()
#/html/body/div[4]/div[1]/div[1]/div/span/a
    file_path=r'E:\python\try\art\content.txt'  # raw string, otherwise \t and \a are read as escape sequences
    def parse(self, response):
        i=0
        file_object=open(r"E:\python\try\art\content.txt","a+",encoding='utf-8')
        title=response.xpath('/html/body/div[4]/div[1]/div/h1/text()').extract()[0]
        file_object.write(title+'\n')
        time=response.xpath('/html/body/div[4]/div[1]/div/div[1]/text()').extract()[0]
        file_object.write(time+'\n')
        for rel in response.xpath('/html/body/div[4]/div[1]/div/div[2]/div'):
            print("=========================================%d==============================="%i)
            try:
                # string(.) flattens the whole <div> into plain text; \xa0 is a non-breaking space
                son=rel.xpath('string(.)').extract()[0].replace(u'\xa0',u' ')
                print(type(son))
                print(len(son))
                file_object.write(son)
                print(son)
                i=i+1
            except:
                continue


        file_object.write('\n\n\n\n\n\n\n\n\n\n')
        file_object.close()
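Likewise, scrapy crawl art runs this spider and appends each document's title, date, and body text to content.txt, with blank lines between documents.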









These are part of this week's results; I hope to study scraping in more depth next week, though I am not sure how far the crawling for the data mining course is actually expected to go.
Master Ma next to me has already set off down the advanced path. I hope he types a little slower and waits for me......