1. 程式人生 > 爬蟲小練手-爬取慕課網首頁的圖片

爬蟲小練手-爬取慕課網首頁的圖片

#!/usr/bin/python
#-*- coding:utf-8 -*-
import os
import Queue
import re
import threading
import urllib
import urlparse

import requests
from bs4 import BeautifulSoup as BS
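# During testing, the page was first downloaded with a script and opened in
# Sublime Text; searching for "<img" found matches, and the downloaded HTML was
# essentially identical to the page source, which indicates that the site
# applies no anti-scraping measures.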


class create_thread(threading.Thread):
    """Worker thread that drains a shared queue of page URLs.

    Each worker repeatedly takes one URL from the queue and hands it to
    ``craw``; it returns as soon as the queue has nothing left to give out.
    """

    def __init__(self, queue):
        """Remember the shared work queue.

        Args:
            queue: a ``Queue.Queue`` holding the page URLs to crawl.
        """
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        """Crawl queued URLs one at a time until the queue is empty."""
        while True:
            try:
                # Non-blocking fetch: testing empty() and then calling a
                # blocking get() is racy -- another worker can take the last
                # item in between, leaving this thread blocked forever and
                # the caller's join() hanging.
                url = self._queue.get_nowait()
            except Queue.Empty:
                return
            craw(url)


def craw(url):
    """Download every image referenced by an ``<img>`` tag on the page at *url*.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. Each
    image is saved under ``img/`` using the last path segment of its URL as
    the file name, and a ``download ...`` line is printed per saved image.

    The site mixes absolute and relative ``src`` values, for example::

        <img src="http://szimg.mukewang.com/59897b1600011ec805400300-360-202.jpg" class="l">
        <img title="..." src="/static/img/index/logo_new.png">

    Relative (and protocol-relative) values are resolved against *url* before
    downloading; ``<img>`` tags without a usable ``src`` and non-HTTP(S)
    sources such as ``data:`` URIs are skipped.

    Args:
        url: address of the HTML page to scan for images.
    """
    response = requests.get(url=url)
    soup = BS(response.content, 'html.parser')

    # urlretrieve does not create missing directories, so make sure the
    # output directory exists before the first download.
    if not os.path.isdir('img'):
        try:
            os.makedirs('img')
        except OSError:
            # Another worker thread may have created it in the meantime.
            if not os.path.isdir('img'):
                raise

    for img in soup.find_all(name='img'):
        # Lazy-loaded images may have no src attribute at all; indexing with
        # img['src'] would raise KeyError and kill the worker thread.
        src = img.get('src')
        if not src:
            continue

        # Complete relative and protocol-relative sources against the page URL.
        full_src = urlparse.urljoin(url, src)
        parts = urlparse.urlsplit(full_src)
        if parts.scheme not in ('http', 'https'):
            continue

        # Take the name from the path only, so a query string or fragment
        # never ends up in the file name.
        filename = parts.path.split('/')[-1]
        if not filename:
            continue

        urllib.urlretrieve(url=full_src, filename='img/' + filename)
        print('download %s ' % full_src)
if __name__ == '__main__':
    # Seed the work queue with the single page to crawl.
    queue = Queue.Queue()
    threads_count = 120  # upper bound on the number of worker threads
    url = "http://www.imooc.com/"
    queue.put(url)

    # A worker is only useful if it can take a URL from the queue, so never
    # start more workers than there are queued URLs. Surplus workers do no
    # work and only add contention on the queue.
    worker_count = min(threads_count, queue.qsize())
    threads = [create_thread(queue) for _ in xrange(worker_count)]

    for worker in threads:
        worker.start()
    # Wait for every worker to finish before the script exits.
    for worker in threads:
        worker.join()