1. 程式人生 > Python 爬蟲例項

Python 爬蟲例項

import re

import requests
from bs4 import BeautifulSoup


# First page of the listing index being crawled; page N (N > 1) is this URL
# followed by 'pg' + N.
_LISTING_URL = 'https://cc.lianjia.com/ershoufang/erdaoqu/'

# Seconds to wait on any single HTTP request, so that one stalled connection
# cannot hang the whole crawl.
_REQUEST_TIMEOUT = 10

# Compiled once here because they are reused for every listing item.
_NON_DIGIT = re.compile(r'\D')
_NUMBER = re.compile(r'-?\d+\.?\d*e?-?\d*?')


def _fetch_soup(url, headers):
    """Download *url* and return its body parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(res.text, 'html.parser')


def _digits(text):
    """Return *text* with every non-digit character removed."""
    return _NON_DIGIT.sub('', text)


def _parse_listing(li, headers):
    """Build the record dict for one ``<li>`` of the listing page.

    Reads the ``houseInfo``, ``positionInfo``, ``totalPrice``, ``unitPrice``,
    ``followInfo`` and ``title`` divs of *li*, then fetches the page linked
    from the title to read the first ``transaction`` entry.

    Returns:
        A dict with the keys ``housing_estate``, ``square_metre``,
        ``position``, ``total_price``, ``unit_price``, ``follow``,
        ``take_look``, ``url`` and ``pub_date`` (inserted in that order).

    Raises:
        AttributeError, IndexError, KeyError: when *li* or the detail page
            lacks one of the expected elements.
        requests.RequestException: when the detail page cannot be fetched.
    """
    house_param = {}

    # houseInfo is a '|'-separated string; field 0 and the first number found
    # in field 2 are kept.
    content = li.find('div', class_='houseInfo').text.split("|")
    house_param['housing_estate'] = content[0]
    house_param['square_metre'] = _NUMBER.findall(content[2])[0]

    house_param['position'] = li.find('div', class_='positionInfo').find('a').text

    house_param['total_price'] = _digits(li.find('div', class_='totalPrice').text)
    house_param['unit_price'] = _digits(li.find('div', class_='unitPrice').text)

    # followInfo is a '/'-separated string; only the digits of the first two
    # fields are kept.
    follow = li.find('div', class_='followInfo').text.split("/")
    house_param['follow'] = _digits(follow[0])
    house_param['take_look'] = _digits(follow[1])

    # NOTE: the 'url' key holds only the digits of the detail link, not the
    # link itself; the key name is kept for compatibility with the output.
    title_src = li.find('div', class_='title').find('a').attrs['href']
    house_param['url'] = _digits(title_src)

    # The publication date is only available on the detail page: second
    # <span> of the first <li> inside the 'transaction' div.
    detail = _fetch_soup(title_src, headers)
    first_entry = detail.find('div', class_='transaction').find_all('li')[0]
    house_param['pub_date'] = first_entry.find_all('span')[1].text

    return house_param


# Main entry point
def main(page_max=100):
    """Crawl the listing pages and print one record dict per listing.

    Args:
        page_max: number of listing pages to visit, starting from page 1.
            Anything accepted by ``int()`` is allowed.

    A page that cannot be fetched, or that has no ``sellListContent`` list,
    is reported and skipped. A listing that cannot be parsed is reported and
    skipped as well, so one bad entry never aborts the crawl.
    """
    # Send a Chrome User-Agent header so the requests look like they come
    # from a browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/54.0.2840.99 Safari/537.36'
    }

    for i in range(1, int(page_max) + 1):
        if i == 1:
            house = _LISTING_URL
        else:
            house = _LISTING_URL + 'pg' + str(i)

        try:
            soup = _fetch_soup(house, headers)
        except requests.RequestException as e:
            print(e)
            continue

        listing = soup.find('ul', class_='sellListContent')
        if listing is None:
            # No result list on this page (e.g. past the last page or an
            # error page was served): nothing to parse here.
            print('no listing found on ' + house)
            continue

        for li in listing.find_all('li'):
            # Deliberately broad: the crawl is best-effort per listing, and
            # any failure is reported before moving on to the next item.
            try:
                print(_parse_listing(li, headers))
            except Exception as e:
                print(e)


if __name__ == '__main__':
    main()