
Web Scraping: A Few Simple Scripts

This was my first week working with web scraping. After a quick start with the tutorial at https://scrapy-chs.readthedocs.io/zh_CN/latest/intro/tutorial.html, I wrote a few simple crawlers under Master Ma's guidance.
Since I had never touched web development, HTML, CSS, or XPath before, reading page source was quite a struggle. At first I also had no idea how to locate elements and took plenty of detours, so I am writing this down as a record.
Before using the Scrapy library, I first watched Professor Song Tian's (Beijing Institute of Technology) videos on Bilibili, but those mainly cover the Requests library, the re library, and Beautiful Soup. Typing the code along with them, I only half understood it; after all, I had mostly forgotten regular expressions. I am posting part of that source code here for anyone who might need it.
1. Scraping the Chinese university rankings:

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return ""   
def fillUnivList(ulist,html):
    soup=BeautifulSoup(html,"html.parser")
    # walk the rows of the ranking table; skip the non-Tag children between <tr> tags
    for tr in soup.find("tbody").children:
        if isinstance(tr,bs4.element.Tag):
            tds=tr("td")
            ulist.append([tds[0].string,tds[1].string,tds[2].string])

def printUnivList(ulist,num):
    print("{:^10}\t{:^6}\t{:^10}".format("排名","學校名稱","總分"))
    for i in range(num):
        u=ulist[i]
        print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2]))

def main():
    uinfo=[]
    url="http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    html=getHTMLText(url)
    fillUnivList(uinfo,html)
    printUnivList(uinfo,20)

main()
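A side note on the traversal above: find("tbody").children yields every child node, including the whitespace strings between <tr> tags, which is exactly why the isinstance check is needed. Here is a minimal, runnable sketch on a hand-written table (the HTML is made up for illustration):

from bs4 import BeautifulSoup
import bs4

# a made-up table fragment, standing in for the ranking page
demo_html="""
<table><tbody>
<tr><td>1</td><td>清華大學</td><td>95.9</td></tr>
<tr><td>2</td><td>北京大學</td><td>82.6</td></tr>
</tbody></table>
"""
soup=BeautifulSoup(demo_html,"html.parser")
for tr in soup.find("tbody").children:
    if isinstance(tr,bs4.element.Tag):      # skip the newline NavigableStrings
        tds=tr("td")                        # tr("td") is shorthand for tr.find_all("td")
        print([td.string for td in tds])    # ['1', '清華大學', '95.9'] etc.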

2. Scraping stock market information:

import requests
import traceback
from bs4 import BeautifulSoup
import re

def getHTMLText(url,code='utf-8'):
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()
        r.encoding=code  # pass in the known encoding to skip the costly apparent_encoding guess
        return r.text
    except:
        return ""

def getStockList(lst,stockURL):
    html=getHTMLText(stockURL)
    soup=BeautifulSoup(html,'html.parser')
    a=soup.find_all('a')
    for i in a:
        try:
            href=i.attrs['href']
            # stock codes look like sh600000 / sz000001: 'sh' or 'sz' plus six digits
            lst.append(re.findall(r'[s][hz]\d{6}',href)[0])
        except:
            continue

def getStockInfo(lst,stockURL,fpath):
    for stock in lst:
        url=stockURL+stock+".html"
        html=getHTMLText(url)
        try:
            if html=="":
                continue
            infoDict={}
            soup=BeautifulSoup(html,'html.parser')
            stockInfo=soup.find('div',attrs={'class':'stock-bets'})

            name=stockInfo.find_all(attrs={'class':'bets-name'})[0]
            infoDict.update({'stockname':name.text.split()[0]})

            keyList=stockInfo.find_all('dt')
            valueList=stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key=keyList[i].text
                val=valueList[i].text
                infoDict[key]=val
            with open(fpath,'a',encoding='utf-8') as f:
                f.write(str(infoDict)+'\n')
        except:
            traceback.print_exc()
            continue
def main():
    stock_list_url='http://quote.eastmoney.com/stocklist.html'
    stock_info_url='https://gupiao.baidu.com/stock/'
    output_file='e://baidustockinfo.txt'
    slist=[]
    getStockList(slist,stock_list_url)
    getStockInfo(slist,stock_info_url,output_file)

main()
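The key step in getStockList is the regular expression r'[s][hz]\d{6}': the letter 's', then 'h' or 'z', then six digits, i.e. Shanghai/Shenzhen stock codes such as sh600000. A minimal sketch on made-up hrefs:

import re

hrefs=['http://quote.eastmoney.com/sh600000.html',   # made-up examples of listing-page links
       'http://quote.eastmoney.com/sz000001.html',
       'http://quote.eastmoney.com/about.html']
for href in hrefs:
    print(re.findall(r'[s][hz]\d{6}',href))   # ['sh600000'], ['sz000001'], []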

3. Scraping Taobao product prices

#!usr/bin/python
# -*- coding: utf-8 -*-
import requests
import re

def getHTMLText(url):
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return ""

def parsePage(ilt,html):
    try:
        # the search page embeds its data as JSON-like text, e.g. "view_price":"29.90"
        plt=re.findall(r'\"view_price\"\:\"[\d\.]*\"',html)
        tlt=re.findall(r'\"raw_title\"\:\".*?\"',html)
        for i in range(len(plt)):
            price=eval(plt[i].split(':')[1])   # eval strips the surrounding quotes
            title=eval(tlt[i].split(':')[1])
            ilt.append([price,title])
    except:
        print("")

def printGoodsList(ilt):
    tlpt="{:4}\t{:8}\t{:16}"
    print(tlpt.format("num","price","goodsname"))
    count=0
    for g in ilt:
        count=count+1
        print(tlpt.format(count,g[0],g[1]))

def main():
    goods='school bag'
    depth=2
    start_url='https://s.taobao.com/search?q='+goods
    infoList=[]
    for i in range(depth):
        try:
            url=start_url+'&s='+str(48*i)
            html=getHTMLText(url)
            parsePage(infoList,html)
        except:
            continue
    printGoodsList(infoList)

main()
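To see what parsePage extracts, here is a minimal sketch run against a hand-written fragment of the embedded JSON (the data is made up; parsePage is the function defined above):

demo_html='..."raw_title":"canvas school bag"..."view_price":"29.90"...'
result=[]
parsePage(result,demo_html)
print(result)   # [['29.90', 'canvas school bag']]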

Using the Scrapy library:
scrapy startproject name
scrapy genspider name domain
In the spider file:
class ...
def parse(self, response): ...
Here response is the downloaded page; response.xpath('...') returns a list-like SelectorList, and .extract() converts it into a plain list of strings. At first I could not use xpath() fluently and wasted quite a bit of time; in the beginning I could not even tell what xpath() returns versus what xpath().extract() returns. With shaky foundations, some pits simply have to be stepped in.
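A minimal sketch of the difference, using a standalone scrapy Selector on a hand-written snippet (inside a spider, response.xpath behaves the same way):

from scrapy import Selector

sel=Selector(text='<ul><li><div><a href="/a.html" title="Doc A">A</a></div><div>2017-12-27</div></li></ul>')
print(sel.xpath('//li/div/a/@href'))                 # a SelectorList of Selector objects
print(sel.xpath('//li/div/a/@href').extract())       # ['/a.html'] -- a plain list of strings
print(sel.xpath('//li/div[2]/text()').extract()[0])  # '2017-12-27'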
Since there are quite a few policy documents, I first scrape the URLs of those documents (in later study I expect to find a more efficient approach). Here is the code that collects the URLs:

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
class DmozSpider(scrapy.Spider):
    name = "extr"
    allowed_domains = ["szkj.gov.cn"]  # the dmoz.org value left over from the tutorial did not match the target site
    start_urls = [
        'http://www.szkj.gov.cn/news/20171227/tfpmwnpw-0jt7-zwta-wffv-5o1ar32e34.html',
    ]

#    def parse(self,response):
#        filename=response.url.split('/')[-2]
#        with open(filename,'wb')as f:
#            f.write(response.body)
#            
#/html/body/div[4]/div[1]/div/ul
#/html/body/div[4]/div[1]/div/ul/li[1]
#/html/body/div[4]/div[1]/div/ul/li[1]/div[1]
#/html/body/div[4]/div[1]/div/ul/li[1]/div[1]/a
#/html/body/div[4]/div[1]/div/ul/li[1]/div[2] time
#/html/body/div[4]/div[1]/div/ul/li[1]/div[2]
    def parse(self, response):
        i=0
        doc=[]
        file_object=open(r"E:\python\try\art\url.txt","a+",encoding='utf-8')
        for father in response.xpath('/html/body/div[4]/div[1]/div/ul/li'):
            if "2017" in father.xpath('./div[2]/text()').extract()[0]:
                item={}   # was `dict`, renamed to avoid shadowing the builtin
                print("====================%d============================="%i)
                art_url=father.xpath('./div[1]/a/@href').extract()[0]
                item['url']=art_url
                print(art_url)
                file_object.write(art_url)
                file_object.write('\t')
                title=father.xpath('./div[1]/a/@title').extract()[0]
                item['title']=title
                print(title)
                file_object.write(title)
                file_object.write('\t')
                time=father.xpath('./div[2]/text()').extract()[0]
                item['time']=time
                print(time)
                file_object.write(time)
                file_object.write('\n')
                doc.append(item)
                i=i+1
        file_object.close()

        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        for d in doc:
#            print(d["url"]+"     "+d["title"]+"        "+d["time"])
            print("\'"+"http://www.szkj.gov.cn"+d["url"]+"\',")



Then, following these links, scrape the content of each policy document and write it to the local disk. The code:

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage

import scrapy
class DmozSpider(scrapy.Spider):
    name = "art"
    allowed_domains = ["szkj.gov.cn"]  # again corrected from the tutorial's dmoz.org
    start_urls = [
        'http://www.szkj.gov.cn/news/20171227/tfpmwnpw-0jt7-zwta-wffv-5o1ar32e34.html',
        'http://www.szkj.gov.cn/news/20171117/hjojwmv5-yrwm-jaxe-izic-x9b151x2iz.html',
        'http://www.szkj.gov.cn/news/20171027/2rjrxx0w-nw76-ru14-b2zd-gegfavg5hn.html',
        'http://www.szkj.gov.cn/news/20171018/o0hknmkk-rcll-pffs-k5vw-4b5w65u3sw.html',
        'http://www.szkj.gov.cn/news/20170929/14ftvjse-90e4-7a2o-0s6w-gy3zm42v82.html',
        'http://www.szkj.gov.cn/news/20170927/j0z3mctd-0zaz-uhd7-lw66-bzdt6owoaa.html',
        'http://www.szkj.gov.cn/news/20170920/xe5v6qrk-40ip-i1ie-f19q-0afi5pbakf.html',
        'http://www.szkj.gov.cn/news/20170825/ltehbi1t-atph-esk3-8nvo-qjvo3edxfg.html',
        'http://www.szkj.gov.cn/news/20170722/zok9eaum-nbpm-fg7q-9dst-i8y1kfkbm3.html',
        'http://www.szkj.gov.cn/news/20170721/ltqa0c66-lu96-fvh5-pu6h-7byyzvifnn.html',
        'http://www.szkj.gov.cn/news/20170630/jhgxoro3-rwys-9hw7-dlhr-ssxizm6yg6.html',
        'http://www.szkj.gov.cn/news/20170414/9uso4cya-9clp-khwc-72u6-sszwklb6fp.html',
        'http://www.szkj.gov.cn/news/20170109/fac7j3kh-hzgz-yfwd-qjwd-pvxu3ubzsk.html'
    ]




#    def parse(self,response):
#        filename=response.url.split('/')[-2]
#        with open(filename,'wb')as f:
#            f.write(response.body)
#            
#/html/body/div[4]/div[1]/div/div[2]
#/html/body/div[4]/div[1]/div/div[2]/div[1]
#
#
#/html/body/div[4]/div[1]/div      main
#/html/body/div[4]/div[1]/div/h1    title
#/html/body/div[4]/div[1]/div/div[1]   time
#/html/body/div[4]/div[1]/div/div[2]  content
#
#
#/html/body/div[4]/div[1]/div/div[2]
#/html/body/div[4]/div[1]/div/div[2]/div[5]/text()
#/html/body/div[4]/div[1]/div[1]/div/span/a
    file_path=r'E:\python\try\art\content.txt'  # raw string, otherwise \t and \a are read as escape sequences
    def parse(self, response):
        i=0
        file_object=open(r"E:\python\try\art\content.txt","a+",encoding='utf-8')
        title=response.xpath('/html/body/div[4]/div[1]/div/h1/text()').extract()[0]
        file_object.write(title+'\n')
        time=response.xpath('/html/body/div[4]/div[1]/div/div[1]/text()').extract()[0]
        file_object.write(time+'\n')
        for rel in response.xpath('/html/body/div[4]/div[1]/div/div[2]/div'):
            print("=========================================%d==============================="%i)
            try:
                # string(.) flattens the whole <div> into plain text; \xa0 is a non-breaking space
                son=rel.xpath('string(.)').extract()[0].replace(u'\xa0',u' ')
                print(type(son))
                print(len(son))
                file_object.write(son)
                print(son)
                i=i+1
            except:
                continue


        file_object.write('\n\n\n\n\n\n\n\n\n\n')
        file_object.close()
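Likewise, scrapy crawl art runs this spider and appends each document's title, date, and body text to content.txt, with blank lines between documents.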









These are part of this week's results; I hope to study scraping in more depth next week, though I am not sure how far the crawling for the data mining course is actually expected to go.
Master Ma next to me has already set off down the advanced path. I hope he types a little slower and waits for me......