Collecting Website Data in Python (Three Methods)
阿新 • Published: 2019-01-24
Method 1: crawl an entire site and collect its data
Goal:
Build a crawler and a data-collection program (that prints the data).
1: Code
# coding=utf-8
"""
@author: jiajiknag
Program function: crawl an entire site and collect its data
(build a crawler and a data-collection routine that prints the data)
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    # Global set of pages already visited
    global pages
    # urlopen opens and reads a remote object fetched over the network
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    # Build the BeautifulSoup object (parser specified explicitly)
    bs0bj = BeautifulSoup(html, "html.parser")
    try:
        print(bs0bj.h1.get_text())
        print(bs0bj.find(id="mw-content-text").findAll("p")[0])
        print(bs0bj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("This page is missing some attributes; nothing to worry about")
    # Walk every link whose href starts with /wiki/
    for link in bs0bj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            # Only follow pages we have not seen yet
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                # The dashed line separates the output of different pages
                print("-------------------------------\n" + newPage)
                # Record the new page, then crawl it recursively
                pages.add(newPage)
                getLinks(newPage)

# Start with an empty URL (the Wikipedia front page)
getLinks("")
2: Result
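One caveat about the code above: getLinks() calls itself once for every new page it finds, so on a site the size of Wikipedia it will eventually exhaust Python's default recursion limit (roughly 1000 frames). A minimal sketch of an iterative variant that replaces the recursion with an explicit to-visit list (same entry point; this rewrite is my own, and it drops the per-page printing for brevity):

# coding=utf-8
# Iterative variant: an explicit to-visit list replaces the recursion,
# so a deep crawl cannot overflow the call stack.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
toVisit = [""]  # start from the front page, as in the original

while toVisit:
    pageUrl = toVisit.pop()
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    for link in bs0bj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs and link.attrs['href'] not in pages:
            pages.add(link.attrs['href'])
            toVisit.append(link.attrs['href'])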
Method 2: crawl across the internet -> hop at random from one external link to another to obtain random external links
1: Code
# coding=utf-8
"""
@author: jiajiknag
Program function: crawl across the internet, hopping at random
from one external link to another
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
# Seed the random number generator with the current time.
# random.seed() seeds the RNG; identical seeds always produce
# the same sequence of "random" numbers.
random.seed(datetime.datetime.now())

# Get a list of all internal links found on a page
def getInternalLinks(bs0bj, includeUrl):
    internalLinks = []
    # Find all links that begin with "/" or contain the current URL
    for link in bs0bj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] not in internalLinks:
            internalLinks.append(link.attrs['href'])
    return internalLinks

# Get a list of all external links found on a page
def getExternalLinks(bs0bj, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www" and do NOT
    # contain the current URL
    for link in bs0bj.findAll("a", href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

# Split an address into its parts (element 0 is the domain)
def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts

# Get a random external link
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs0bj = BeautifulSoup(html, "html.parser")
    # splitAddress(...)[0] is the domain to exclude; collect the external links
    externalLinks = getExternalLinks(bs0bj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links here, so fall back to the internal links
        internalLinks = getInternalLinks(bs0bj, splitAddress(startingPage)[0])
        # Pick an internal link at random and retry from there
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("The random external link is:", externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")
2: Flowchart
3: Result
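The external-link pattern ^(http|www)((?!excludeUrl).)*$ used above works through a negative lookahead: the match succeeds only if excludeUrl never occurs in the remainder of the href. A small check against some made-up links (the URLs here are illustrative only):

import re

excludeUrl = "oreilly.com"
pattern = re.compile("^(http|www)((?!" + excludeUrl + ").)*$")

hrefs = [
    "http://example.com/page",       # external: matches
    "http://www.oreilly.com/about",  # internal: the lookahead rejects it
    "/catalog/index.html",           # relative: doesn't start with http/www
]
for href in hrefs:
    print(href, "->", bool(pattern.match(href)))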
Method 3: collect all of the external links on a site
1: Code
# coding=utf-8
"""
@author: jiajiknag
Program function: collect every external link on a site
"""
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

# Get a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme + "://" + urlparse(includeUrl).netloc
    internalLinks = []
    # Find all links that begin with "/" or contain the site's URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    # Relative link: prepend the domain
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

# Get a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www"
    # and do not contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links found")
        domain = urlparse(startingPage).scheme + "://" + urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("The random external link is: " + externalLink)
    followExternalOnly(externalLink)

# Collect all external links on a site, recording each one
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme + "://" + urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

# Note: followExternalOnly() recurses indefinitely, so calling it here
# would keep hopping between links and never reach the collection step
# below; run it separately if you want the random-walk behaviour.
# followExternalOnly("http://oreilly.com")

# Record the starting page as an internal link
allIntLinks.add("http://oreilly.com")
# Collect all external links, starting from the O'Reilly home page
getAllExternalLinks("http://oreilly.com")
2: Result
The blogger's requests here were probably too frequent, which is likely what produced the result below.
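If the failure really was caused by requesting too frequently, the usual fixes are to pause between fetches and to tolerate individual errors. A minimal sketch of a politer variant, meant to drop into the Method 3 script above and reuse its definitions (the function name and the one-second delay are my own choices, not from the original):

import time
from urllib.error import URLError

def getAllExternalLinksPolitely(siteUrl, delay=1.0):
    time.sleep(delay)  # pause between requests to avoid flooding the server
    try:
        html = urlopen(siteUrl)
    except URLError as e:
        print("Skipping", siteUrl, "-", e)  # tolerate individual fetch errors
        return
    domain = urlparse(siteUrl).scheme + "://" + urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    for link in getExternalLinks(bsObj, domain):
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in getInternalLinks(bsObj, domain):
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinksPolitely(link, delay)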
Note: readers who have made it this far are welcome to leave suggestions and feedback.