
Python 3 crawler quick start: scraping images and titles


Let's go straight to the code. First up is one that scrapes images from Douban. The general idea is: send a request, get the response data, then save the data. For the underlying principles, you can read this first:

https://www.cnblogs.com/sss4/p/7809821.html
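
Before the full script, here is a minimal sketch of that request → response → save flow; it assumes nothing beyond the requests library, and the URL and output file name are placeholders rather than part of the original post:

import requests  # send the request and receive the response

resp = requests.get("https://example.com")  # 1. send the request
resp.raise_for_status()                     # stop early if the server reported an error
data = resp.content                         # 2. get the response data as bytes
with open("page.html", "wb") as f:          # 3. save the data to disk
    f.write(data)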

import os  # used to create the folder for the images
import requests  # used to send the request and receive the response
from bs4 import BeautifulSoup  # used to parse the response data

def GetHtmlText(url):  # fetch the response data
    try:
        r = requests.get(url)  # send the request to the url
        r.raise_for_status()  # raise an exception if the request failed
        r.encoding = 'utf-8'  # set the encoding
        return r.text  # return the response text
    except:
        return ''

def main(pages):
    filepath = os.getcwd() + '\\爬的圖片\\'  # folder that will hold the images
    if not os.path.exists(filepath):  # create it if it does not exist yet
        os.makedirs(filepath)
    fnum = 1
    for page in range(pages):  # pages = number of result pages to crawl
        # start=page*30 selects which page of results to fetch
        url = "https://movie.douban.com/celebrity/1048000/photos/?type=C&start=" + str(page * 30) + "&sortby=like&size=a&subtype=a"
        html = GetHtmlText(url)
        soup = BeautifulSoup(html, 'html.parser')  # html.parser is the parser
        uls = soup.find_all('ul', class_="poster-col3 clearfix")  # find every ul whose class is poster-col3 clearfix
        for ul in uls:
            imgs = ul.find_all('img')  # find the img tags inside it
            for img in imgs:
                imgurl = img['src']  # the image url
                imgcontent = requests.get(imgurl).content  # fetch the content behind that url, which is binary image data
                filename = str(fnum) + '.jpg'
                with open(filepath + filename, 'wb') as wf:  # write the data in binary mode
                    wf.write(imgcontent)
                fnum += 1

if __name__ == '__main__':
    main(9)
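
A note on usage: main(9) walks 9 result pages, with the start parameter growing by 30 per page, so roughly 9 × 30 images are saved as 1.jpg, 2.jpg, ... inside the 爬的圖片 folder under the current working directory.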

Next, one that scrapes titles.

import requests
from bs4 import BeautifulSoup

url="http://www.jianshu.com"
headers={User-Agent:SE 2.X MetaSr 1.0}#設置請求頭的User-Agent,理解的話可以認為是從哪個瀏覽器發出的,不然的話會被反爬蟲
page=requests.get(url=url,headers=headers)
page_info=page.text
page_bf=BeautifulSoup(page_info,html.parser)

#print(page_bf.prettify())
titles=page_bf.find_all(a,title)

for title in titles:
    print(title.string)
    print(http://www.jianshu.com+title.get(href))
with open(r"D:\untitled\爬蟲爬到的標題.txt","w",encoding=utf-8) as file:
    for title in titles:
        file.write(title.string+\n)
        file.write("http://www.jianshu.com"+title.get(href)+\n\n)

This one downloads a novel (someone else's code):

from bs4 import BeautifulSoup
import requests,sys
class downloader(object):
    def __init__(self):
        self.server="http://www.biqukan.com/"
        self.target="http://www.biqukan.com/1_1094"
        self.name=[]
        self.urls=[]
        self.nums=0

    def get_download_url(self):
        req = requests.get(url=self.target)
        html = req.text
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', class_='listmain')  # the chapter list lives in the div whose class is listmain
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        self.nums = len(a[15:])  # skip the first 15 links, which are not part of the ordered chapter list
        for each in a[15:]:
            self.name.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', class_='showtxt')  # the chapter text sits in the div whose class is showtxt
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')  # replace the 8 non-breaking spaces used as indentation with blank lines
        return texts

    def writer(self, name, path, text):
        with open(path, "a", encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')

dl=downloader()
dl.get_download_url()
print("開始下載")
for i in range(dl.nums):
    dl.writer(dl.name[i], 一念永恒.txt, dl.get_contents(dl.urls[i]))
    sys.stdout.write("  已下載:%.3f%%" %  float(i/dl.nums) + \r)
    sys.stdout.flush()
print(《一年永恒》下載完成)
