程式人生 > python爬取知乎話題的精華問題下的使用者資訊

python爬取知乎話題的精華問題下的使用者資訊

今天試著用自己的爬蟲程式碼爬取了知乎【同性戀】話題下的所有精華問題的使用者位置資訊
程式碼:

__author__ = 'yang'
# -*- coding: utf-8 -*-

import configparser
import requests
import time
import re
import string


def curTime():
    """Return the current local time wrapped in an HTML comment.

    The result looks like '\\n<!--YYYY-MM-DD HH:MM:SS-->' and is used as a
    timestamp marker at the top of the scraped HTML dump files.
    """
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    return '\n<!--{}-->'.format(stamp)

def loginInfo():
    """Read the Zhihu account credentials and browser cookies from test.ini.

    Returns:
        (username, password, cookies): *cookies* is a plain dict built from
        the [COOKIES] section; username/password come from the [USER] section.
    """
    # test.ini holds the Zhihu account, password and browser cookies.
    filename = 'test.ini'
    config = configparser.ConfigParser()
    config.read(filename)
    cookies = dict(config.items('COOKIES'))
    username = config.get("USER", "username")
    password = config.get("USER", "password")
    return username, password, cookies


def create_session():
    """Log in to Zhihu and return (session, cookies).

    Tries an email/password login first; if Zhihu rejects it (json 'r' == 1),
    falls back to the browser cookies from the config file and saves the
    resulting page to login.html.

    Raises:
        ValueError: if the password login fails and no cookie is configured.
    """
    username, password, cookies = loginInfo()
    session = requests.session()
    login_data = {'email': username, 'password': password}
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com/',
    }
    r = session.post('http://www.zhihu.com/login/email',
                     data=login_data, headers=header)
    if r.json()['r'] == 1:
        print('Login Failed, reason is:')
        for m in r.json()['data']:
            print(r.json()['data'][m])
        print('Use cookies to login...')
        # Only fall back if at least one non-empty cookie value exists.
        has_cookies = any(key != '__name__' and cookies[key] != ''
                          for key in cookies)
        if has_cookies is False:
            raise ValueError('請填寫config.ini檔案中的cookies項')
        r = session.get('http://www.zhihu.com/login/email', cookies=cookies)
        # r.text (not r.content) so a text-mode file gets str, not bytes.
        with open('login.html', 'w') as fp:
            fp.write(r.text)
    return session, cookies


def writeFile(name, content):
    """Overwrite file *name* with *content*."""
    with open(name, 'w') as fp:
        fp.write(content)


if __name__ == '__main__':
    requests_session, requests_cookies = create_session()

    # Download the 49 "top answers" pages of the topic into tong.html.
    with open('tong.html', 'w') as fp:
        fp.write(curTime())
    for page in range(0, 49):
        url = 'https://www.zhihu.com/topic/19552984/top-answers?' + str(page)
        content = requests_session.get(url, cookies=requests_cookies).text
        # `with` replaces the leaky Py2 `f = file(..., 'a+')` pattern.
        with open('tong.html', 'a+') as fp:
            fp.write(content)

    # Extract question links.
    # NOTE(review): the original parsed 'url.html', but the write to
    # url.html was commented out, so that file never exists; the pages
    # were actually appended to 'tong.html' — parse that instead.
    # (Also renamed the pattern: the original bound it to `str`, shadowing
    # the builtin that is called as str(num) below.)
    questionRegex = re.compile(r'<a class="question_link.*?href="(.*?)">')
    with open('tong.html') as fp:
        content = fp.read()
    questionLinks = questionRegex.findall(content)
    print(questionLinks)
    with open('resultLink.html', 'w') as fp:
        fp.write('\n'.join(questionLinks))
    with open('resultLink.html') as fp:
        questionLinks = fp.readlines()

    # Collect answer-author profile links from every question page.
    usrRegex = re.compile(r'<a class="author-link.*?href="(.*?)">')
    for link in questionLinks:
        num = link.strip()
        url = 'https://www.zhihu.com' + str(num)
        # Fetch the question page and filter out the author links.
        page = requests_session.get(url, cookies=requests_cookies).text
        usrLinks = usrRegex.findall(page)
        with open('usrLinks.html', 'a+') as fp:
            fp.write('\n'.join(usrLinks))

    # Deduplicate the collected user links.
    with open('usrLinks.html') as fp:
        ls = fp.readlines()
    links = list(set(line.strip() for line in ls))

    # Scrape each user's profile page for the "location" field.
    locationRegex = re.compile(r'<span class="location item.*?title="(.*?)"')
    for link in links:
        url = 'https://www.zhihu.com' + str(link)
        page = requests_session.get(url, cookies=requests_cookies).text
        # findall returns a list; the original did `list + '\n'`, which is
        # a TypeError — join the matches and append the newline instead.
        location = locationRegex.findall(page)
        if location:
            with open('locations.html', 'a+') as fp:
                fp.write('\n'.join(location) + '\n')