
Getting website information with Python

Python web-crawler learning notes

The script below is Python 2 (urllib2, print statements). It first scrapes the song list on the Kugou homepage with urllib2 and BeautifulSoup, then defines shoufu(), which collects the links on the Sohu homepage, sorts them into valid and invalid ones, and saves the pages behind the valid links to disk.

#coding:utf-8
import urllib2
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
from bs4 import BeautifulSoup

heads = {}
heads['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'

request = urllib2.Request("http://www.kugou.com", headers=heads)  # build a GET request for the Kugou homepage
result = urllib2.urlopen(request)  # send the request
soup = BeautifulSoup(result.read(), 'html.parser')  # build a parseable soup object

for i in soup.find_all("div"):  # walk every div tag
    if i.get("id") == "SongtabContent":  # keep only the div whose id is SongtabContent
        s = i.find_all("li")  # all li tags inside it
        with open(u"C://downloads//lw//a.txt", "w") as f:  # open the output file
            for li in s:  # walk the li tags
                f.write(u"歌曲名稱為: %s " % li.a.select(".songName")[0].text)  # text of the element with class songName
                f.write(u"歌曲播放連接為: %s " % li.a.get("href"))  # value of the href attribute
                f.write(u"歌曲播放時間為: %s" % li.a.select(".songTime")[0].text)  # text of the element with class songTime
                f.write(os.linesep)

def shoufu():
    import requests
    import re
    resq = requests.get("http://www.sohu.com")  # request the Sohu homepage
    print resq.text[:100]  # print the first 100 characters of the response
    links = re.findall(r'href="(.*?)"', resq.text)  # collect every href value
    print len(links)
    valid_link = []  # valid links
    invalid_link = []  # invalid links
    for link in links:
        if re.search(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.ico)|(\.png)|(\.js)|(\.css)$", link.strip()):  # filter out static-resource links
            print 6, link
            invalid_link.append(link.strip())
            continue  # nothing else to do for this link
        elif link.strip() == "" or link.strip() == "#" or link.strip() == "/":  # drop empty and placeholder links
            # print 1, link
            invalid_link.append(link)
            continue
        elif link.strip().startswith("//"):  # protocol-relative links: prepend http: and keep them
            # print 2, link
            valid_link.append("http:" + link.strip())
            continue
        elif link.strip().count("javascript") >= 1 or link.strip().count("mailto:") >= 1:  # drop javascript: and mailto: links
            # print 3, link
            invalid_link.append(link.strip())
            continue
        elif re.match(r"/\w+", link):  # root-relative links: join them with the site root
            # print 5, link
            if re.match(r"http://.*?/", resq.url.strip()):  # base URL has a / after the host: keep only up to it
                valid_link.append(re.match(r"http://.*?/", resq.url.strip()).group() + link.strip())
            else:  # otherwise keep the whole base URL
                valid_link.append(re.match(r"http://.*", resq.url.strip()).group() + link.strip())
            continue
        else:
            # print 7, link
            valid_link.append(link.strip())  # everything left is kept as a valid absolute link
    # for link in valid_link[:100]:
    #     print link
    print len(valid_link)
    # for link in invalid_link:
    #     print link
    print len(invalid_link)
    file_num = 1  # counter used to name the output files
    for link in list(set(valid_link)):
        # print link
        resq = requests.get(link, verify=True)  # fetch every saved valid link with certificate verification enabled
        if u"籃球" in resq.text:  # check whether the page mentions "籃球" (basketball)
            print link
        if u'meta charset="utf-8"' in resq.text:  # pages declared as utf-8
            with open("c:\\downloads\\lw\\" + str(file_num) + ".html", "w") as fp:
                fp.write(resq.text.strip().encode("utf-8"))  # save encoded as utf-8
        else:
            with open("c:\\downloads\\lw\\" + str(file_num) + ".html", "w") as fp:
                fp.write(resq.text.strip().encode("gbk"))  # save encoded as gbk
        file_num += 1
    print "Done!"
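For reference, here is a minimal Python 3 sketch of the same Kugou scrape using requests instead of urllib2. It assumes the page still exposes a div with id="SongtabContent" whose li elements carry .songName and .songTime children, exactly the selectors the script above relies on; the output file name songs.txt is chosen arbitrarily.

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}
resp = requests.get("http://www.kugou.com", headers=headers, timeout=10)
soup = BeautifulSoup(resp.text, "html.parser")

tab = soup.find("div", id="SongtabContent")  # the song-list container (assumed to still exist)
if tab is not None:
    with open("songs.txt", "w", encoding="utf-8") as f:  # hypothetical output path
        for li in tab.find_all("li"):
            name = li.a.select(".songName")[0].text    # song title
            href = li.a.get("href")                    # play-page link
            length = li.a.select(".songTime")[0].text  # track length
            f.write("%s\t%s\t%s\n" % (name, href, length))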

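A note on the link handling in shoufu(): the chain of regexes that rebuilds absolute URLs from "//host/..." and "/path" links can also be expressed with urllib.parse.urljoin, which resolves any of those forms against the response URL in one call. Below is a minimal Python 3 sketch of that alternative, keeping the original's filter rules; it is not the script above, just an illustration of the same classification logic.

import re
import requests
from urllib.parse import urljoin

resp = requests.get("http://www.sohu.com")
links = re.findall(r'href="(.*?)"', resp.text)  # same href extraction as shoufu()

valid_link, invalid_link = [], []
for link in (l.strip() for l in links):
    if not link or link in ("#", "/"):
        invalid_link.append(link)                   # empty or placeholder links
    elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$", link):
        invalid_link.append(link)                   # static resources
    elif "javascript" in link or "mailto:" in link:
        invalid_link.append(link)                   # script and mail links
    else:
        valid_link.append(urljoin(resp.url, link))  # resolves //host, /path and absolute URLs

print(len(valid_link), len(invalid_link))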
