Python爬取百度貼吧圖片指令碼
阿新 • • 發佈:2018-12-30
新手一枚,以下是爬取百度貼吧指定帖子的圖片指令碼。因為指令碼主要是解析HTML程式碼,因此一旦百度修改頁面前端程式碼,指令碼就會失效,權當爬蟲入門練習吧,後續還會嘗試更多的爬蟲。
# coding=utf-8 # !/usr/bin/env python import urllib, string, os from bs4 import BeautifulSoup def getHtml(url): page = urllib.urlopen(url) html = page.read() return html def getImg(): imgPath = 'F:/craw_tieba/' if not os.path.exists(imgPath): os.makedirs(imgPath) baseUrl = 'http://tieba.baidu.com/p/4657665666' imgList = [] for pg in range(1, 114): url = baseUrl + '?pn=' + str(pg) print 'Craw: ',url html = getHtml(url) soup = BeautifulSoup(html) imgURLList = string.split(str(soup.find_all('img')), ',') for i in range(0, len(imgURLList)): if 'http://imgsrc.baidu.com/forum/w%3D580/sign=' in imgURLList[i]: start = string.find(imgURLList[i], 'http') end = string.find(imgURLList[i], '.jpg') + 4 imgList.append(imgURLList[i][start : end]) x = 1 for img in imgList: urllib.urlretrieve(img, 'F:/craw_tieba/%s.jpg' % x) x += 1 print 'Craw tieba finish!' if __name__ == '__main__': getImg()