1. 程式人生 > >012 Python 爬蟲項目1

012 Python 爬蟲項目1

python 爬蟲 tor url post strong port pytho .com http

# Python 爬蟲項目1
  ● Python 網頁請求
    requests
      POST
      GET

    網頁狀態碼

1 # -*- coding: UTF-8 -*-
2 from bs4 import BeautifulSoup
3 import requests
4 
5 url = "http://www.baidu.com"
6 unknow = requests.get(url)
7 print(type(unknow))
8 print(unknow)

技術分享

    

    通過標簽匹配內容

 1 # -*- coding: UTF-8 -*-
 2 from bs4 import BeautifulSoup
 3 import requests
 4 
 5 url = "http://zz.ganji.com/fang1/"
 6 r = requests.get(url)
 7 soup = BeautifulSoup(r.text,'lxml')
 8 for item in soup.find_all('dd'):
 9     if item['class'] == ['dd-item','title']:
10         #print(item)
11         print(item.a.string)
12         print("----------------------------------------------------")

技術分享

    通過瀏覽器復制 copy selector

  技術分享

 1 # -*- coding: UTF-8 -*-
 2 from bs4 import BeautifulSoup
 3 import requests
 4 
 5 url = "http://zz.ganji.com/fang1/"
 6 r = requests.get(url)
 7 soup = BeautifulSoup(r.text,'lxml')
 8 
 9 #價格獲取
10 title = soup.select('dl > dd.dd-item.info > div.price > span.num')
11 print(title)
12 
13 title2 = soup.select('dl > dd.dd-item.size > span.first.js-huxing')
14 print(title2)

技術分享

1 title = soup.select('dl > dd.dd-item.info > div.price > span.num')
2 print(title)
3 print(type(title[0]))

  title 是 select 返回的列表，其中每個元素（例如 title[0]）的類型還是 標簽 Tag

技術分享

    soup.body.div.div.a 方式獲取

1 # -*- coding: UTF-8 -*-
2 from bs4 import BeautifulSoup
3 import requests
4 
5 url = "http://zz.ganji.com/fang1/"
6 r = requests.get(url)
7 soup = BeautifulSoup(r.text,'lxml')
8 print(soup.body.div.div.a)

技術分享

 1 from bs4 import BeautifulSoup
 2 import requests
 3 
 4 def isdiv(tag):
 5     return tag.name == 'div'
 6 
 7 url = "http://zz.ganji.com/fang1/"
 8 r = requests.get(url)
 9 soup = BeautifulSoup(r.text,'lxml')
10 
11 value = soup.find_all(isdiv)
12 print(value)

    python 使用代理發送網頁請求

1 import requests   
2 proxies = { "http": "http://10.10.1.10:3128", "https": "http://10.10.1.10:1080", }   
3 requests.get("http://example.org", proxies=proxies)  

012 Python 爬蟲項目1