Python爬蟲小實踐：尋找失蹤人口，爬取失蹤兒童信息並寫成csv文件，方便存入數據庫

阿新 • • 發佈：2017-10-25

python tor enc mini 執行 gem view 獲取但是

前兩天有人私信我，讓我爬取 http://bbs.baobeihuijia.com/forum-191-1.html 這個版塊上的失蹤兒童信息，準備根據兒童失蹤時的地理位置來更好地尋找失蹤兒童。這種事情本就應該義不容辭，如果對網站服務器造成負荷，還請諒解。

這次依然是用第三方爬蟲包BeautifulSoup，還有Selenium+Chrome，Selenium+PhantomJS來爬取信息。

通過分析網站的框架，依然分三步來進行。

步驟一：獲取http://bbs.baobeihuijia.com/forum-191-1.html這個版塊上的所有分頁頁面鏈接

步驟二：獲取每一個分頁鏈接上所發的帖子的鏈接

步驟三：獲取每一個帖子鏈接上要爬取的信息，編號，姓名，性別，出生日期，失蹤時身高，失蹤時間，失蹤地點，以及是否報案

起先用的BeautifulSoup，但是被管理員設置了網站重定向，然後就采用selenium的方式，在這裏還是對網站管理員說一聲抱歉。

1、獲取http://bbs.baobeihuijia.com/forum-191-1.html這個版塊上的所有分頁頁面鏈接

技術分享

通過分析：發現分頁的頁面鏈接處於<div class="pg">下，所以寫了以下的代碼

BeautifulSoup形式：

[python] view plain copy

def GetALLPageUrl(siteUrl):
    """Return the URL of every page of the forum board ``siteUrl`` belongs to.

    The board page is downloaded with urllib, the pager
    (``<div class="pg">``) is scanned for ``forum-<board>-<page>.html``
    links, and one URL is built for each page number from 1 up to the
    largest number found.

    Args:
        siteUrl: URL of a page of the board, e.g.
            ``http://bbs.baobeihuijia.com/forum-191-1.html``.

    Returns:
        List of page URLs in ascending page order. When no pager links are
        found the list holds only page 1.
    """
    # Example proxy address; replace it with a live one. ProxyHandler keys
    # are URL schemes, so both schemes are mapped (the original mapped only
    # 'https', which never applied to an http:// board URL).
    proxy = '111.76.129.200:808'
    proxy_handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
    opener = urllib.request.build_opener(urllib.request.HTTPHandler, proxy_handler)
    urllib.request.install_opener(opener)

    # `headers1 or headers2 or headers3` always evaluated to headers1
    # (a non-empty dict is truthy), so it is passed directly.
    req = request.Request(siteUrl, headers=headers1)
    with urlopen(req) as html:
        bsObj = BeautifulSoup(html.read(), "html.parser")

    siteindex = siteUrl.rfind("/")
    tempsiteurl = siteUrl[:siteindex + 1]  # http://bbs.baobeihuijia.com/
    # Cut at the last "-" instead of dropping a fixed 6 characters, so a
    # multi-digit page number in siteUrl cannot corrupt the prefix.
    tempbianhaoqian = siteUrl[siteindex + 1:siteUrl.rfind("-") + 1]  # forum-191-

    page_link = re.compile(r"forum-([0-9]+)-([0-9]+)\.html")
    pager = bsObj.find("div", {"class": "pg"})
    # A board with a single page may have no pager at all.
    links = pager.findAll("a", href=page_link) if pager is not None else []
    bianhao = [int(page_link.search(a.attrs['href']).group(2)) for a in links]
    bianhaoMax = max(bianhao, default=1)  # highest page number seen

    return [tempsiteurl + tempbianhaoqian + str(i) + ".html"
            for i in range(1, bianhaoMax + 1)]

Selenium形式：

[python] view plain copy

def GetALLPageUrl(siteUrl):
    """Return the URL of every page of the forum board ``siteUrl`` belongs to.

    The board page is loaded in Chrome through Selenium, the pager
    (``<div class="pg">``) is scanned for ``forum-<board>-<page>.html``
    links, and one URL is built (and printed) for each page number from 1
    up to the largest number found.

    Args:
        siteUrl: URL of a page of the board, e.g.
            ``http://bbs.baobeihuijia.com/forum-191-1.html``.

    Returns:
        List of page URLs in ascending page order. When no pager links are
        found the list holds only page 1.

    Raises:
        RuntimeError: if the reload placeholder page is still served after
            10 reloads.
    """
    from selenium.common.exceptions import TimeoutException

    def on_reload_page(soup):
        # The site answers some requests with a stub titled like this.
        title = soup.find('title')
        return title is not None and title.get_text() == "頁面重載開啟"

    # NOTE: the urllib ProxyHandler the original installed here was removed.
    # Selenium drives Chrome directly, so a urllib opener never affected the
    # request, and its 'post' key is not a URL scheme. To proxy the browser,
    # pass --proxy-server through ChromeOptions.
    driver = webdriver.Chrome()
    try:
        driver.set_page_load_timeout(10)
        try:
            # Requested twice, as in the original; presumably the first
            # visit only yields the reload placeholder.
            driver.get(siteUrl)
            driver.get(siteUrl)
        except TimeoutException:
            # Selenium raises its own TimeoutException, not the builtin
            # TimeoutError the original tried to catch.
            driver.refresh()
        bsObj = BeautifulSoup(driver.page_source, "html.parser")

        reloads = 0
        while on_reload_page(bsObj):
            if reloads == 10:
                raise RuntimeError(
                    "reload placeholder still served after 10 reloads: " + siteUrl)
            print("當前頁面不是重加載後的頁面，程序會嘗試刷新一次到跳轉後的頁面\n")
            driver.get(siteUrl)
            bsObj = BeautifulSoup(driver.page_source, "html.parser")
            reloads += 1
    finally:
        # quit() closes every window and ends the session exactly once; the
        # original called close()/quit() again after its except branch.
        driver.quit()

    siteindex = siteUrl.rfind("/")
    tempsiteurl = siteUrl[:siteindex + 1]  # http://bbs.baobeihuijia.com/
    # Cut at the last "-" instead of dropping a fixed 6 characters, so a
    # multi-digit page number in siteUrl cannot corrupt the prefix.
    tempbianhaoqian = siteUrl[siteindex + 1:siteUrl.rfind("-") + 1]  # forum-191-

    page_link = re.compile(r"forum-([0-9]+)-([0-9]+)\.html")
    pager = bsObj.find("div", {"class": "pg"})
    # A board with a single page may have no pager at all.
    links = pager.findAll("a", href=page_link) if pager is not None else []
    bianhao = [int(page_link.search(a.attrs['href']).group(2)) for a in links]
    bianhaoMax = max(bianhao, default=1)  # highest page number seen

    pageUrl = []
    for i in range(1, bianhaoMax + 1):
        temppageUrl = tempsiteurl + tempbianhaoqian + str(i) + ".html"
        print(temppageUrl)
        pageUrl.append(temppageUrl)
    return pageUrl

2.獲取每一個分頁鏈接上所發的帖子的鏈接

技術分享

每個帖子的鏈接都位於 <tbody id="normalthread_編號"> 內、class 為 "s xst" 的 <a> 標簽的 href 屬性下

所以寫了以下的代碼：

BeautifulSoup形式：

[python] view plain copy

def GetCurrentPageTieziUrl(PageUrl):
    """Return the URLs of all threads listed on one board page.

    The page is downloaded with urllib; every ``<a class="s xst">`` inside a
    ``<tbody id="normalthread_<n>">`` is treated as a thread link. Each URL
    is printed as it is collected.

    Args:
        PageUrl: URL of the board page, e.g.
            ``http://bbs.baobeihuijia.com/forum-191-1.html``.

    Returns:
        List of absolute thread URLs in page order.
    """
    from urllib.parse import urljoin

    # Example proxy address; replace it with a live one. ProxyHandler keys
    # are URL schemes; the original key 'post' matched nothing, so the proxy
    # was never used.
    proxy = '121.22.252.85:8000'
    proxy_handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
    opener = urllib.request.build_opener(urllib.request.HTTPHandler, proxy_handler)
    urllib.request.install_opener(opener)

    # `headers1 or headers2 or headers3` always evaluated to headers1.
    req = request.Request(PageUrl, headers=headers1)
    with urlopen(req) as html:
        bsObj = BeautifulSoup(html.read(), "html.parser")

    TieziUrl = []
    for row in bsObj.findAll("tbody", id=re.compile("normalthread_([0-9]+)")):
        for link in row.findAll("a", {"class": "s xst"}):
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute hrefs intact (plain concatenation mangled those).
            tempteiziUrl = urljoin(PageUrl, link.attrs['href'])
            print(tempteiziUrl)
            TieziUrl.append(tempteiziUrl)
    return TieziUrl

Selenium形式：

[python] view plain copy

def GetCurrentPageTieziUrl(PageUrl):
    """Return the URLs of all threads listed on one board page.

    The page is loaded in Chrome through Selenium; every
    ``<a class="s xst">`` inside a ``<tbody id="normalthread_<n>">`` is
    treated as a thread link. Each URL is printed as it is collected.

    Args:
        PageUrl: URL of the board page, e.g.
            ``http://bbs.baobeihuijia.com/forum-191-1.html``.

    Returns:
        List of absolute thread URLs in page order, or the integer 1 when
        the page could not be obtained (browser error, or the reload
        placeholder is still served after 10 reloads).
    """
    from urllib.parse import urljoin
    from selenium.common.exceptions import TimeoutException, WebDriverException

    def on_reload_page(soup):
        # The site answers some requests with a stub titled like this.
        title = soup.find('title')
        return title is not None and title.get_text() == "頁面重載開啟"

    # NOTE: the urllib ProxyHandler the original installed here was removed.
    # Selenium drives Chrome directly, so a urllib opener never affected the
    # request, and its 'post' key is not a URL scheme. To proxy the browser,
    # pass --proxy-server through ChromeOptions.
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.set_page_load_timeout(10)
        try:
            # Requested twice, as in the original; presumably the first
            # visit only yields the reload placeholder.
            driver.get(PageUrl)
            driver.get(PageUrl)
        except TimeoutException:
            # Selenium raises its own TimeoutException, not the builtin
            # TimeoutError the original tried to catch.
            driver.refresh()
        bsObj = BeautifulSoup(driver.page_source, "html.parser")

        n = 0
        while on_reload_page(bsObj):
            # Give up only if the placeholder is still there after 10
            # reloads (the original gave up even when the 10th succeeded).
            if n == 10:
                return 1
            print("當前頁面不是重加載後的頁面，程序會嘗試刷新一次到跳轉後的頁面\n")
            driver.get(PageUrl)
            bsObj = BeautifulSoup(driver.page_source, "html.parser")
            n = n + 1
    except WebDriverException as e:
        # Report through the existing failure sentinel so the caller can
        # skip this page instead of aborting the whole crawl.
        print("-----WebDriverException url:", PageUrl, e)
        return 1
    finally:
        # quit() closes every window and ends the session exactly once.
        if driver is not None:
            driver.quit()
        time.sleep(1)

    TieziUrl = []
    for row in bsObj.findAll("tbody", id=re.compile("normalthread_([0-9]+)")):
        for link in row.findAll("a", {"class": "s xst"}):
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute hrefs intact (plain concatenation mangled those).
            tempteiziUrl = urljoin(PageUrl, link.attrs['href'])
            print(tempteiziUrl)
            TieziUrl.append(tempteiziUrl)
    return TieziUrl

3.獲取每一個帖子鏈接上要爬取的信息，編號，姓名，性別，出生日期，失蹤時身高，失蹤時間，失蹤地點，以及是否報案，並寫入CSV中

技術分享

通過查看每一個帖子的鏈接，發現其失蹤人口信息都在<ul>標簽下，所以編寫了以下的代碼

BeautifulSoup形式：

[python] view plain copy

def CurrentPageMissingPopulationInformation(tieziUrl):
    """Scrape one thread page and write its missing-person record to the CSV.

    The page is downloaded with urllib. The first ``<ul>`` inside
    ``<td class="t_f">`` is scanned for labelled ``<font>`` entries, and one
    row (ID, name, sex, birth date, height when missing, time missing, place
    missing, reported to police) is written through the module-level
    ``writer``. Fields that are not found are written as "NULL".

    Args:
        tieziUrl: URL of the thread page.

    Returns:
        None. Nothing is written when the page has no ``<ul>`` record.
    """
    # Example proxy address; replace it with a live one. ProxyHandler keys
    # are URL schemes; the original key 'post' matched nothing, so the proxy
    # was never used.
    proxy = '210.136.17.78:8080'
    proxy_handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
    opener = urllib.request.build_opener(urllib.request.HTTPHandler, proxy_handler)
    urllib.request.install_opener(opener)

    # `headers1 or headers2 or headers3` always evaluated to headers1.
    req = request.Request(tieziUrl, headers=headers1)
    with urlopen(req) as html:
        bsObj = BeautifulSoup(html.read(), "html.parser")

    # Guard the <td> lookup too: the original dereferenced .ul on None.
    post = bsObj.find("td", {"class": "t_f"})
    templist1 = post.ul if post is not None else None
    if templist1 is None:
        return

    # (label keyword, CSV column), checked in order against the first six
    # characters of each entry; the first match wins. The original chain of
    # independent `if`s re-tested the already-extracted value, so a value
    # containing another keyword could overwrite a different column.
    fields = (
        ("寶貝回家編號", 0), ("尋親編號", 0), ("登記編號", 0),
        ("姓", 1), ("性", 2), ("出生日期", 3), ("失蹤時身高", 4),
        ("失蹤時間", 5), ("失蹤日期", 5), ("失蹤地點", 6), ("是否報案", 7),
    )
    mycsv = ['NULL'] * 8
    for font in templist1.findAll("font", size=re.compile("^[0-9]*$")):
        if len(font) == 0:  # tag without any content
            continue
        tempText = font.get_text()
        label = tempText[0:6]
        for key, column in fields:
            if key not in label:
                continue
            print(tempText)
            # Value is everything after the full-width colon.
            value = tempText[tempText.find("：") + 1:]
            # Only the ID column replaces an empty value with "NULL".
            if column == 0 and len(value) == 0:
                value = "NULL"
            mycsv[column] = value
            break

    try:
        writer.writerow([str(v) for v in mycsv])
    finally:
        time.sleep(1)  # pause between requests, even if the write failed

Selenium形式：

[python] view plain copy

def CurrentPageMissingPopulationInformation(tieziUrl):
    """Scrape one thread page and write its missing-person record to the CSV.

    The page is loaded in Chrome through Selenium. The first ``<ul>`` inside
    ``<td class="t_f">`` is scanned for labelled ``<font>`` entries, and one
    row (ID, name, sex, birth date, height when missing, time missing, place
    missing, reported to police) is written and flushed through the
    module-level ``writer`` and ``csvfile``. Fields that are not found are
    written as "NULL".

    Args:
        tieziUrl: URL of the thread page.

    Returns:
        1 when the page could not be loaded or holds no ``<ul>`` record;
        None after a row has been written.
    """
    from selenium.common.exceptions import TimeoutException, WebDriverException

    def on_reload_page(soup):
        # The site answers some requests with a stub titled like this.
        title = soup.find('title')
        return title is not None and title.get_text() == "頁面重載開啟"

    # NOTE: the urllib ProxyHandler the original installed here was removed.
    # Selenium drives Chrome directly, so a urllib opener never affected the
    # request, and its 'post' key is not a URL scheme. To proxy the browser,
    # pass --proxy-server through ChromeOptions.
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.set_page_load_timeout(10)
        try:
            # Requested twice, as in the original; presumably the first
            # visit only yields the reload placeholder.
            driver.get(tieziUrl)
            driver.get(tieziUrl)
        except TimeoutException:
            # Selenium raises its own TimeoutException, not the builtin
            # TimeoutError the original tried to catch.
            driver.refresh()
        bsObj = BeautifulSoup(driver.page_source, "html.parser")

        reloads = 0
        while on_reload_page(bsObj):
            # The original loop had no upper bound and could spin forever.
            if reloads == 10:
                return 1
            print("當前頁面不是重加載後的頁面，程序會嘗試刷新一次到跳轉後的頁面\n")
            driver.get(tieziUrl)
            bsObj = BeautifulSoup(driver.page_source, "html.parser")
            reloads += 1
    except WebDriverException as e:
        # Report through the existing failure sentinel so the caller can
        # skip this thread instead of aborting the whole crawl.
        print("-----WebDriverException url:", tieziUrl, e)
        return 1
    finally:
        # quit() closes every window and ends the session exactly once.
        if driver is not None:
            driver.quit()
        time.sleep(0.5)

    # Guard the <td> lookup too: the original dereferenced .ul on None.
    post = bsObj.find("td", {"class": "t_f"})
    templist1 = post.ul if post is not None else None
    if templist1 is None:
        print("當前帖子頁面不包含ul字段")
        return 1

    # (label keyword, CSV column), checked in order against the first six
    # characters of each entry; the first match wins. The original chain of
    # independent `if`s re-tested the already-extracted value, so a value
    # containing another keyword could overwrite a different column.
    fields = (
        ("寶貝回家編號", 0), ("尋親編號", 0), ("登記編號", 0),
        ("姓", 1), ("性", 2), ("出生日期", 3), ("失蹤時身高", 4),
        ("失蹤時間", 5), ("失蹤日期", 5), ("失蹤地點", 6), ("是否報案", 7),
    )
    mycsv = ['NULL'] * 8
    for font in templist1.findAll("font", size=re.compile("^[0-9]*$")):
        tempText = font.get_text()
        label = tempText[0:6]
        for key, column in fields:
            if key not in label:
                continue
            print(tempText)
            # Value is everything after the full-width colon.
            value = tempText[tempText.find("：") + 1:]
            # Only the ID column replaces an empty value with "NULL".
            if column == 0 and len(value) == 0:
                value = "NULL"
            mycsv[column] = value
            break

    try:
        writer.writerow([str(v) for v in mycsv])
        csvfile.flush()  # push the row to disk immediately
        # Printed only after a successful write (the original printed it
        # from `finally`, i.e. also on failure).
        print("當前帖子信息寫入完成\n")
    finally:
        time.sleep(5)  # 5-second pause between threads, even on failure

現附上所有代碼，此代碼僅供參考，不能用於商業用途，網絡爬蟲易給網站服務器造成巨大負荷，任何人使用本代碼所引起的任何後果，本人不予承擔法律責任。貼出代碼的初衷是供大家學習爬蟲，大家只是研究下網絡框架即可，不要使用此代碼去加重網站負荷，本人由於不當使用，已被封IP，前車之鑒，爬取失蹤人口信息只是為了從空間上分析人口失蹤的規律，由此給網站造成的什麽不便，請見諒。

附上所有代碼：

[python] view plain copy

#__author__ = ‘Administrator‘
#coding=utf-8
import io
import os
import sys
import math
import urllib
from urllib.request import urlopen
from urllib.request import urlretrieve
from urllib import request
from bs4 import BeautifulSoup
import re
import time
import socket
import csv
from selenium import webdriver
# Default timeout, in seconds, for sockets created without an explicit one.
# NOTE(review): 5000 s is roughly 83 minutes; milliseconds may have been
# intended — confirm before changing.
socket.setdefaulttimeout(5000)
# Re-wrap stdout so the Chinese progress messages are emitted as GB18030
# (presumably for a Chinese-locale Windows console — confirm).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# For a UTF-8 terminal, use encoding='utf-8' in the line above instead.
# Request-header sets that imitate different browsers.
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
headers2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
headers3 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
headers4 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2372.400 QQBrowser/9.5.10548.400'}
headers5 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'bbs.baobeihuijia.com',
    'Referer': 'http://bbs.baobeihuijia.com/forum-191-1.html',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
}
headers6 = {
    'Host': 'bbs.baobeihuijia.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
    # Was 'textml,...': the "/ht" of the text/html media type had been lost.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
    # Was ' 1' with a stray leading space.
    'Upgrade-Insecure-Requests': '1',
}
def CurrentPageMissingPopulationInformation(tieziUrl):
    """Scrape one thread page and write its missing-person record to the CSV.

    The page is loaded in Chrome through Selenium. The first ``<ul>`` inside
    ``<td class="t_f">`` is scanned for labelled ``<font>`` entries, and one
    row (ID, name, sex, birth date, height when missing, time missing, place
    missing, reported to police) is written and flushed through the
    module-level ``writer`` and ``csvfile``. Fields that are not found are
    written as "NULL".

    Args:
        tieziUrl: URL of the thread page.

    Returns:
        1 when the page could not be loaded or holds no ``<ul>`` record;
        None after a row has been written.
    """
    from selenium.common.exceptions import TimeoutException, WebDriverException

    def on_reload_page(soup):
        # The site answers some requests with a stub titled like this.
        title = soup.find('title')
        return title is not None and title.get_text() == "頁面重載開啟"

    # NOTE: the urllib ProxyHandler the original installed here was removed.
    # Selenium drives Chrome directly, so a urllib opener never affected the
    # request, and its 'post' key is not a URL scheme. To proxy the browser,
    # pass --proxy-server through ChromeOptions.
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.set_page_load_timeout(10)
        try:
            # Requested twice, as in the original; presumably the first
            # visit only yields the reload placeholder.
            driver.get(tieziUrl)
            driver.get(tieziUrl)
        except TimeoutException:
            # Selenium raises its own TimeoutException, not the builtin
            # TimeoutError the original tried to catch.
            driver.refresh()
        bsObj = BeautifulSoup(driver.page_source, "html.parser")

        reloads = 0
        while on_reload_page(bsObj):
            # The original loop had no upper bound and could spin forever.
            if reloads == 10:
                return 1
            print("當前頁面不是重加載後的頁面，程序會嘗試刷新一次到跳轉後的頁面\n")
            driver.get(tieziUrl)
            bsObj = BeautifulSoup(driver.page_source, "html.parser")
            reloads += 1
    except WebDriverException as e:
        # Report through the existing failure sentinel so the caller can
        # skip this thread instead of aborting the whole crawl.
        print("-----WebDriverException url:", tieziUrl, e)
        return 1
    finally:
        # quit() closes every window and ends the session exactly once.
        if driver is not None:
            driver.quit()
        time.sleep(0.5)

    # Guard the <td> lookup too: the original dereferenced .ul on None.
    post = bsObj.find("td", {"class": "t_f"})
    templist1 = post.ul if post is not None else None
    if templist1 is None:
        print("當前帖子頁面不包含ul字段")
        return 1

    # (label keyword, CSV column), checked in order against the first six
    # characters of each entry; the first match wins. The original chain of
    # independent `if`s re-tested the already-extracted value, so a value
    # containing another keyword could overwrite a different column.
    fields = (
        ("寶貝回家編號", 0), ("尋親編號", 0), ("登記編號", 0),
        ("姓", 1), ("性", 2), ("出生日期", 3), ("失蹤時身高", 4),
        ("失蹤時間", 5), ("失蹤日期", 5), ("失蹤地點", 6), ("是否報案", 7),
    )
    mycsv = ['NULL'] * 8
    for font in templist1.findAll("font", size=re.compile("^[0-9]*$")):
        tempText = font.get_text()
        label = tempText[0:6]
        for key, column in fields:
            if key not in label:
                continue
            print(tempText)
            # Value is everything after the full-width colon.
            value = tempText[tempText.find("：") + 1:]
            # Only the ID column replaces an empty value with "NULL".
            if column == 0 and len(value) == 0:
                value = "NULL"
            mycsv[column] = value
            break

    try:
        writer.writerow([str(v) for v in mycsv])
        csvfile.flush()  # push the row to disk immediately
        # Printed only after a successful write (the original printed it
        # from `finally`, i.e. also on failure).
        print("當前帖子信息寫入完成\n")
    finally:
        time.sleep(5)  # 5-second pause between threads, even on failure
def GetALLPageUrl(siteUrl):
    """Return the URL of every page of the forum board ``siteUrl`` belongs to.

    The board page is loaded in Chrome through Selenium, the pager
    (``<div class="pg">``) is scanned for ``forum-<board>-<page>.html``
    links, and one URL is built (and printed) for each page number from 1
    up to the largest number found.

    Args:
        siteUrl: URL of a page of the board, e.g.
            ``http://bbs.baobeihuijia.com/forum-191-1.html``.

    Returns:
        List of page URLs in ascending page order. When no pager links are
        found the list holds only page 1.

    Raises:
        RuntimeError: if the reload placeholder page is still served after
            10 reloads.
    """
    from selenium.common.exceptions import TimeoutException

    def on_reload_page(soup):
        # The site answers some requests with a stub titled like this.
        title = soup.find('title')
        return title is not None and title.get_text() == "頁面重載開啟"

    # NOTE: the urllib ProxyHandler the original installed here was removed.
    # Selenium drives Chrome directly, so a urllib opener never affected the
    # request, and its 'post' key is not a URL scheme. To proxy the browser,
    # pass --proxy-server through ChromeOptions.
    driver = webdriver.Chrome()
    try:
        driver.set_page_load_timeout(10)
        try:
            # Requested twice, as in the original; presumably the first
            # visit only yields the reload placeholder.
            driver.get(siteUrl)
            driver.get(siteUrl)
        except TimeoutException:
            # Selenium raises its own TimeoutException, not the builtin
            # TimeoutError the original tried to catch.
            driver.refresh()
        bsObj = BeautifulSoup(driver.page_source, "html.parser")

        reloads = 0
        while on_reload_page(bsObj):
            if reloads == 10:
                raise RuntimeError(
                    "reload placeholder still served after 10 reloads: " + siteUrl)
            print("當前頁面不是重加載後的頁面，程序會嘗試刷新一次到跳轉後的頁面\n")
            driver.get(siteUrl)
            bsObj = BeautifulSoup(driver.page_source, "html.parser")
            reloads += 1
    finally:
        # quit() closes every window and ends the session exactly once; the
        # original called close()/quit() again after its except branch.
        driver.quit()

    siteindex = siteUrl.rfind("/")
    tempsiteurl = siteUrl[:siteindex + 1]  # https://bbs.baobeihuijia.com/
    # Cut at the last "-" instead of dropping a fixed 6 characters, so a
    # multi-digit page number in siteUrl cannot corrupt the prefix.
    tempbianhaoqian = siteUrl[siteindex + 1:siteUrl.rfind("-") + 1]  # forum-191-

    page_link = re.compile(r"forum-([0-9]+)-([0-9]+)\.html")
    pager = bsObj.find("div", {"class": "pg"})
    # A board with a single page may have no pager at all.
    links = pager.findAll("a", href=page_link) if pager is not None else []
    bianhao = [int(page_link.search(a.attrs['href']).group(2)) for a in links]
    bianhaoMax = max(bianhao, default=1)  # highest page number seen

    pageUrl = []
    for i in range(1, bianhaoMax + 1):
        temppageUrl = tempsiteurl + tempbianhaoqian + str(i) + ".html"
        print(temppageUrl)
        pageUrl.append(temppageUrl)
    return pageUrl
def GetCurrentPageTieziUrl(PageUrl):
    """Return the URLs of all threads listed on one board page.

    The page is loaded in Chrome through Selenium; every
    ``<a class="s xst">`` inside a ``<tbody id="normalthread_<n>">`` is
    treated as a thread link. Each URL is printed as it is collected.

    Args:
        PageUrl: URL of the board page, e.g.
            ``http://bbs.baobeihuijia.com/forum-191-1.html``.

    Returns:
        List of absolute thread URLs in page order, or the integer 1 when
        the page could not be obtained (browser error, or the reload
        placeholder is still served after 10 reloads).
    """
    from urllib.parse import urljoin
    from selenium.common.exceptions import TimeoutException, WebDriverException

    def on_reload_page(soup):
        # The site answers some requests with a stub titled like this.
        title = soup.find('title')
        return title is not None and title.get_text() == "頁面重載開啟"

    # NOTE: the urllib ProxyHandler the original installed here was removed.
    # Selenium drives Chrome directly, so a urllib opener never affected the
    # request, and its 'post' key is not a URL scheme. To proxy the browser,
    # pass --proxy-server through ChromeOptions.
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.set_page_load_timeout(10)
        try:
            # Requested twice, as in the original; presumably the first
            # visit only yields the reload placeholder.
            driver.get(PageUrl)
            driver.get(PageUrl)
        except TimeoutException:
            # Selenium raises its own TimeoutException, not the builtin
            # TimeoutError the original tried to catch.
            driver.refresh()
        bsObj = BeautifulSoup(driver.page_source, "html.parser")

        n = 0
        while on_reload_page(bsObj):
            # Give up only if the placeholder is still there after 10
            # reloads (the original gave up even when the 10th succeeded).
            if n == 10:
                return 1
            print("當前頁面不是重加載後的頁面，程序會嘗試刷新一次到跳轉後的頁面\n")
            driver.get(PageUrl)
            bsObj = BeautifulSoup(driver.page_source, "html.parser")
            n = n + 1
    except WebDriverException as e:
        # Report through the existing failure sentinel so the caller can
        # skip this page instead of aborting the whole crawl.
        print("-----WebDriverException url:", PageUrl, e)
        return 1
    finally:
        # quit() closes every window and ends the session exactly once.
        if driver is not None:
            driver.quit()
        time.sleep(1)

    TieziUrl = []
    for row in bsObj.findAll("tbody", id=re.compile("normalthread_([0-9]+)")):
        for link in row.findAll("a", {"class": "s xst"}):
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute hrefs intact (plain concatenation mangled those).
            tempteiziUrl = urljoin(PageUrl, link.attrs['href'])
            print(tempteiziUrl)
            TieziUrl.append(tempteiziUrl)
    return TieziUrl
#CurrentPageMissingPopulationInformation("http://bbs.baobeihuijia.com/thread-213126-1-1.html")
#GetALLPageUrl("http://bbs.baobeihuijia.com/forum-191-1.html")
#GetCurrentPageTieziUrl("http://bbs.baobeihuijia.com/forum-191-1.html")
if __name__ == '__main__':
    # csvfile and writer are deliberately module-level names:
    # CurrentPageMissingPopulationInformation writes through them.
    # The `with` block closes the file even if the crawl raises part-way.
    with open("E:/MissingPeople.csv", "w+", newline="", encoding='gb18030') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('寶貝回家編號', '姓名', '性別', '出生日期', '失蹤時身高', '失蹤時間', '失蹤地點', '是否報案'))

        # Board 191 is the "missing children" board; the commented-out
        # alternative in the original was forum-189-1.html.
        pageurl = GetALLPageUrl("https://bbs.baobeihuijia.com/forum-191-1.html")
        time.sleep(5)
        print("所有頁面鏈接獲取成功！\n")

        n = 0  # running count of threads visited
        for templist1 in pageurl:
            tieziurl = GetCurrentPageTieziUrl(templist1)
            time.sleep(5)
            # 1 is the failure sentinel; check it before announcing success
            # (the original printed the success line first).
            if tieziurl == 1:
                print("不能得到當前帖子頁面！\n")
                continue
            print("當前頁面" + str(templist1) + "所有帖子鏈接獲取成功！\n")

            for templist2 in tieziurl:
                n = n + 1
                print("\n正在收集第" + str(n) + "條信息！")
                time.sleep(5)
                tempzhi = CurrentPageMissingPopulationInformation(templist2)
                if tempzhi == 1:
                    print("\n第" + str(n) + "條信息為空！")
                    continue
                print('')

    # Printed after the file is closed, so the message is accurate.
    print("信息爬取完成！請放心的關閉程序！")

寫成的CSV文件截圖：

技術分享

Python爬蟲小實踐：尋找失蹤人口，爬取失蹤兒童信息並寫成csv文件，方便存入數據庫

python tor enc mini 執行 gem view 獲取但是前兩天有人私信我，讓我爬這個網站，http://bbs.baobeihuijia.com/forum-191-1.html上的失蹤兒童信息，準備根據失蹤兒童的失蹤時的地理位置來更好的尋找失蹤兒童，這

Python爬蟲小實踐：尋找失蹤人口，爬取失蹤兒童信息並寫成csv文件，方便存入數據庫

Python爬蟲小實踐：尋找失蹤人口，爬取失蹤兒童信息並寫成csv文件，方便存入數據庫

[python爬蟲小實戰2]根據使用者輸入關鍵詞爬取今日頭條圖集，並批量下載圖片

Python爬蟲新手教程：知乎文章圖片爬取器

爬蟲系列3：Requests+Xpath 爬取租房網站信息並保存本地

python json及mysql——讀取json文件存sql、數據庫日期類型轉換、終端操縱mysql及python codecs讀取大文件問題

Android開發系列（十七）：讀取assets文件夾下的數據庫文件

Python爬蟲小案例：豆瓣電影TOP250

Python爬蟲小偏方：突破登錄和訪問頻率限制，多研究對方不同終端產品

團隊-張文然-需求分析-python爬蟲分類爬取豆瓣電影信息

寫了小工具分享：按關鍵字文件在數據文件中搜索數據行

Python爬蟲入門 | 爬取豆瓣電影信息

自學python爬蟲（四）Requests+正則表示式爬取貓眼電影

python爬蟲系列(3.4-使用xpath和lxml爬取伯樂線上)

python爬蟲——記一次前所未有的經歷（爬取魔方格作文）

Python爬蟲入門教程 2-100 妹子圖網站爬取

python爬蟲之雲片網國內簡訊介面爬取

Python爬蟲入門教程 6-100 蜂鳥網圖片爬取之一

Python爬蟲入門教程 8-100 蜂鳥網圖片爬取之三

python爬蟲建立代理池，爬取5000個代理IP並進行驗證！

Python爬蟲(5) 借助搜狗搜索爬取微信文章

Python爬蟲小實踐：尋找失蹤人口，爬取失蹤兒童信息並寫成csv文件，方便存入數據庫

相關推薦