
Python 3 crawler: links + tables + images

Two scripts follow. The first reads second-level page URLs from 二級目錄網址.csv and scrapes each page's tables, text sections, and images; the second, appended after it, is the script that builds that CSV from the department list on hbzwfw.gov.cn.

# -*- coding: utf-8 -*-
import urllib.request
import http.cookiejar
from bs4 import BeautifulSoup
import requests
import csv
import time
import re
from urllib.parse import quote
import string

def get_url_2():
    # Read the second-level page URLs (one per row) produced by the second script below.
    with open('F:/python/二級目錄網址.csv') as f:
        f_csv = csv.reader(f)
        link_list = []
        for link1 in f_csv:
            link_list.append(link1)
        return link_list
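
# Usage sketch (each row is assumed to hold one URL in its first column):
#   links = get_url_2()
#   first_url = links[0][0]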

def get_url_weizhuang(head=None):
    # Build a urllib opener that keeps cookies and sends browser-like headers
    # ("weizhuang" = disguise). A None default avoids the mutable-default-argument trap.
    if head is None:
        head = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = list(head.items())  # addheaders expects a list of (key, value) tuples
    return opener
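
# Usage sketch ('some_url' is a placeholder; get_data4 below does exactly this):
#   opener = get_url_weizhuang()
#   html = opener.open(some_url, timeout=30).read().decode('utf-8')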

def get_html(link):
    # NOTE: the Cookie/Host/Referer values below look copied from an unrelated
    # sse.com.cn scraper and are probably not needed for the target site.
    Cookie = "PHPStat_First_Time_10000011=1480428327337; PHPStat_Cookie_Global_User_Id=_ck16112922052713449617789740328; PHPStat_Return_Time_10000011=1480428327337; PHPStat_Main_Website_10000011=_ck16112922052713449617789740328%7C10000011%7C%7C%7C; VISITED_COMPANY_CODE=%5B%22600064%22%5D; VISITED_STOCK_CODE=%5B%22600064%22%5D; seecookie=%5B600064%5D%3A%u5357%u4EAC%u9AD8%u79D1; _trs_uv=ke6m_532_iw3ksw7h; VISITED_MENU=%5B%228451%22%2C%229055%22%2C%229062%22%2C%229729%22%2C%228528%22%5D"
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Cookie': Cookie,
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'query.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'
    }
    r = requests.get(link, headers=headers, timeout=10)
    r.raise_for_status()  # fail loudly on non-200 instead of silently continuing
    return r.text

def get_data4(link1):
    # Fetch one detail page via the cookie-enabled opener and extract each content tab.
    uop = get_url_weizhuang().open(link1, timeout=1000)
    content = uop.read().decode("utf-8")
    # Basic information (table)
    pattern1 = re.compile(r'style="padding-left:20px;">([\s\S]*?)</td>')
    # Acceptance criteria (text)
    pattern2 = re.compile(r'style="font-size: 15px; line-height: 45px;text-indent: 2em; padding: 0 10px;">([\s\S]*?)</p>')
    # Basis of establishment (text)
    pattern3 = re.compile(r'<p style="font-size: 15px; line-height: 45px;">([\s\S]*?)</p>')
    # Fee standard and basis (text)
    pattern4 = re.compile(r'<p style="font-size: 15px; line-height: 45px;(.*?);float:left;">([\s\S]*?)</p>')
    # Handling procedure (table)
    pattern5 = re.compile(r'<div class="main_tab_item" id="con-one-5" style="display: none;">([\s\S]*?)</div>')
    # Procedure flowchart (image) -- compiled but never used below
    pattern6 = re.compile(r'<div class="main_tab_item" id="con-one-6" style="display: none;">([\s\S]*?)</div>')
    # Required materials (table)
    pattern7 = re.compile(r'<div class="main_tab_item" id="con-one-7" style="display: none;">([\s\S]*?)</div>')
    # Special steps (table)
    pattern8 = re.compile(r'<div class="main_tab_item" id="con-one-8" style="display: none;">([\s\S]*?)</div>')
    # Sample of the result (image)
    # pattern9 = re.compile(r'<img src="([\s\S]*?)" <alt>')
    items1 = re.findall(pattern1, content)
    items2 = re.findall(pattern2, content)
    items3 = re.findall(pattern3, content)
    items4 = re.findall(pattern4, content)
    items5 = re.findall(pattern5, content)
    items7 = re.findall(pattern7, content)
    items8 = re.findall(pattern8, content)
    # Group the results: [basic info + special steps], [the three text sections], [the two tables]
    item_sum1 = [[items1, items8], [items2, items3, items4], [items5, items7]]
    for p1 in item_sum1:
        jiben_xinxi = []
        for p in p1:
            for item11 in p:
                item1111 = qingxi_data(item11)
                jiben_xinxi.append(item1111)
        # Strip whitespace, quotes, brackets, and leftover inline-style fragments, then split into fields.
        ui_string2 = str(jiben_xinxi).replace('\n', '').replace('\r', '').replace('\\n', '').replace('\\r', '').\
            replace(' ', '').replace('\'', '').replace('>', '').replace('[', '').replace(']', '').replace('\\u3000', '').\
            replace('(text-indent:2em;padding:010px', '').replace('(padding:010px;margin-top:0px;', '').replace('--', '')\
            .replace('"', '').replace(')"', '').split(',')
        ui_string2 = [x for x in ui_string2 if x != '']
        for n in range(len(ui_string2)):
            # After the space-stripping above, inline JS like 'var d=...;if(...)' apparently
            # collapses to 'vard=...;if'; pull out just the value of d.
            pattern = 'vard=(.*);if'
            ui_string3 = re.findall(pattern, ui_string2[n])
            if ui_string3:
                ui_string2[n] = ui_string3[0]  # keep the captured string, not the whole list
        print(ui_string2)
        save_contents(ui_string2)

def qingxi_data(item11):
    # Replace every HTML tag with a comma, then drop newlines and spaces ("qingxi" = clean).
    dr = re.compile('<[^>]+>', re.S)
    item111 = dr.sub(',', str(item11))
    item1111 = item111.replace('\\r', '').replace('\\n', '').replace(' ', '').replace('\n', '').replace('\r', '')
    return item1111
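
# Example (hypothetical markup): qingxi_data('<td>A</td><td>B</td>') returns ',A,,B,'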

def dict_data5(jiben_xinxi):
    # Pair a flat [key, value, key, value, ...] list into a dict (currently unused).
    dict1 = {}
    if len(jiben_xinxi) % 2 == 0:
        for index, item in enumerate(jiben_xinxi):
            if index % 2 == 0:
                dict1[item] = jiben_xinxi[index + 1]
        print(dict1)
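
# Example (hypothetical field names): dict_data5(['name', 'X', 'dept', 'Y']) prints {'name': 'X', 'dept': 'Y'}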

def save_contents(shuju):
    # Append each extracted field as its own row of 詳細資料.csv ("detailed data").
    try:
        with open("詳細資料.csv", 'a+', newline='') as f:
            writer = csv.writer(f)
            for field in shuju:
                writer.writerow([field])
    except OSError:
        pass  # skip rows we cannot write rather than aborting the crawl
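
# Example: save_contents(['a', 'b']) appends two rows, 'a' and 'b'.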

def check_link(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print('Could not reach the server!!!')
        return None

def Schedule(a, b, c):
    '''
    Progress hook for urllib.request.urlretrieve:
    a: number of blocks downloaded so far
    b: size of one block
    c: total size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
        print('Done!')
    print('%.2f%%' % per)
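
# urlretrieve calls the hook after each block, e.g. Schedule(10, 8192, 1048576) prints '7.81%'.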

def get_contents(rurl):
    # Download every <img> on the page, naming each file after the page title
    # plus the first run of Chinese characters in the image URL.
    soup = BeautifulSoup(rurl, 'lxml')
    trs = soup.find_all('img')
    title_name = soup.find(attrs={'class': 'content_banner_list_up'}).string
    title_name2 = title_name.replace(' ', '').replace("\n", "").replace("\r", "")
    if trs:
        for src in trs:
            ui_string = str(src).replace('<img alt="" src="', '').replace('"/>', '')
            url = quote(ui_string, safe=string.printable)  # percent-encode non-ASCII path segments
            results2 = re.findall("[\u4e00-\u9fa5]+", ui_string)
            if not results2:
                continue  # no Chinese characters in the URL to name the file by
            filename = str(results2[0]) + '.jpg'
            try:
                urllib.request.urlretrieve(url, 'e:/test/%s_%s' % (title_name2, filename), Schedule)
            except OSError:
                pass
            time.sleep(1)
        print('Download finished!')

if __name__ == '__main__':
    links = get_url_2()  # read the CSV once instead of on every loop iteration
    for row in links:
        link1 = row[0]
        print(link1)
        get_data4(link1)
        rs = check_link(link1)
        if rs:
            get_contents(rs)
        time.sleep(3)
The second script below is the one that produces 二級目錄網址.csv in the first place: it walks the department list on hbzwfw.gov.cn, collects each department's 9-digit id, and saves every item's detail-page URL, so run it before the crawler above.

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
# Entry page: the department list on the Hubei government-services portal.
link1 = 'http://www.hbzwfw.gov.cn/hbzw/sxcx/itemList/xz_index.do?webId=31&deptid='
def get_html(link):
    # NOTE: as in the first script, the Cookie/Host/Referer values below look
    # copied from an unrelated sse.com.cn scraper and are probably not needed here.
    Cookie = "PHPStat_First_Time_10000011=1480428327337; PHPStat_Cookie_Global_User_Id=_ck16112922052713449617789740328; PHPStat_Return_Time_10000011=1480428327337; PHPStat_Main_Website_10000011=_ck16112922052713449617789740328%7C10000011%7C%7C%7C; VISITED_COMPANY_CODE=%5B%22600064%22%5D; VISITED_STOCK_CODE=%5B%22600064%22%5D; seecookie=%5B600064%5D%3A%u5357%u4EAC%u9AD8%u79D1; _trs_uv=ke6m_532_iw3ksw7h; VISITED_MENU=%5B%228451%22%2C%229055%22%2C%229062%22%2C%229729%22%2C%228528%22%5D"
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Cookie': Cookie,
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'query.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'
    }
    r = requests.get(link, headers=headers, timeout=10)
    r.raise_for_status()  # fail loudly on non-200 instead of silently continuing
    return r.text

def get_id(link11):
    # Collect the 9-digit department ids embedded in the changebm(...) links.
    movie_list = []
    soup = BeautifulSoup(get_html(link=link11), "lxml")
    div_list2 = re.findall(r'href="javascript:changebm(.*)" title=(.*)', soup.decode("utf8", "ignore"))
    for i in range(len(div_list2)):
        list1 = str(div_list2[i])
        list3 = re.findall(r"\d+", list1)
        if list3 and len(list3[0]) == 9:
            movie_list.append(list3[0])
    return movie_list
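
# Example (hypothetical ids): get_id(link1) might return ['420001001', '420001002', ...]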

def get_shuju_1():
    # For every department id, walk result pages 1-8 and save each item's detail URL.
    movie_list2 = get_id(link1)
    print(movie_list2)
    for url_id in movie_list2:
        for p in range(1, 9):
            url3 = "http://www.hbzwfw.gov.cn/hbzw/sxcx/itemList/xz_list.do?webId=31&deptid=%s&isone=&isonline=&type=&word=&page_num=%s" % (url_id, p)
            soup3 = BeautifulSoup(get_html(link=url3), "lxml")
            if soup3.select('a'):
                print("Page exists: " + url3)
                div_list = soup3.select('div > div.r3_tit > a')
                for m in range(len(div_list)):
                    div_list_2 = str(div_list[m]).replace('<a href="', '').replace('" target="_blank" title="', '').replace('\r\n\t\t\t\t\t\t\t\t\t</a>', '')
                    div_list_3 = re.sub(r'">[\u4e00-\u9fa5]+', '', str(div_list_2))
                    div_list_4 = re.findall(r'^http(.*)html', div_list_3)
                    time.sleep(1)
                    if div_list_4:  # guard against links that do not match the http...html shape
                        save_contents(['http' + str(div_list_4[0]) + 'html'])

def save_contents(shuju):
    # Append each collected URL as a row of 二級目錄網址.csv ("second-level directory URLs").
    try:
        with open("二級目錄網址.csv", 'a+', newline='') as f:
            writer = csv.writer(f)
            for item in shuju:
                writer.writerow([item])
    except OSError:
        pass

if __name__ == '__main__':
    get_shuju_1()  # get_shuju_1 already fetches the page and collects the ids itself