1. 程式人生 > >scrapy框架爬取前程無憂

scrapy框架爬取前程無憂

用了幾天時間爬取了前程無憂上的資料進行分析,完成了一個小專案,
截取了部分程式碼,不足之處,你也打不到我!
pi… pi…

網站:前程無憂
資料庫:MySQL
知識點:scrapy框架,mysql資料庫,xpath,echarts,jieba

直接看專案!

爬蟲程式碼》》

import scrapy
from qiancheng.items import QianchengItem
class QcwyspiderSpider(scrapy.Spider):
    """Spider that walks 51job search-result pages and scrapes each linked posting.

    ``parse`` follows every posting link found on a result page;
    ``parseContents`` turns one posting page into a ``QianchengItem``.
    """

    name = 'qcwyspider'
    # No allowed_domains restriction is configured for this spider.
    # One result-page URL per index in range(2000); "%%2B" is an escaped
    # percent sign, so each formatted URL contains the literal "%2B".
    start_urls = [
        'https://search.51job.com/list/030200,000000,0000,00,9,99,%%2B,2,%s.html' % page
        for page in range(2000)
    ]

    def parse(self, response):
        """Yield one request per posting link found on a search-result page."""
        for href in response.xpath('//div[@class="el"]/p/span/a/@href').extract():
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parseContents,
                dont_filter=True,  # duplicate-request filtering is switched off
            )

    def parseContents(self, response):
        """Build and yield a ``QianchengItem`` from a single posting page.

        Each field is the concatenation of every match of its XPath query,
        so a query with no match produces an empty string.
        """
        field_queries = (
            ('title', '//div[@class="in"]/div/h1/@title'),
            ('salary', '//div[@class="in"]/div/strong/text()'),
            ('company', '//div[@class="tHeader tHjob"]/div/div/p/a/@title'),
            ('style', '//div[@class="tCompany_sidebar"]/div/div[@class="com_tag"]/p[1]/@title'),
        )
        item = QianchengItem()
        for field, query in field_queries:
            item[field] = ''.join(response.xpath(query).extract())
        yield item

item設定》》

import scrapy

class QianchengItem(scrapy.Item):
    """Container for the fields the spider extracts from one posting page."""
    # define the fields for your item here like:
    title = scrapy.Field()
    salary = scrapy.Field()      # salary
    company = scrapy.Field()       # company
    style = scrapy.Field()      # originally annotated "address"; NOTE(review): the spider fills this from the company sidebar's com_tag title - confirm the intended meaning

pipelines(管道)設定》》

import pymysql
from scrapy.conf import settings as st
class QianchengPipeline(object):
    """Scrapy item pipeline that writes every scraped item into MySQL.

    Constructing the pipeline connects to the local MySQL server and drops
    and re-creates the ``fangyuan`` table, so each run starts with an empty
    table.
    """

    # Placeholders are filled in by the driver, which escapes the values.
    # Building the statement with "%" string formatting broke on any value
    # containing a quote and allowed SQL injection from scraped text.
    _INSERT_SQL = (
        "insert into fangyuan(title,salary,company,style) "
        "values(%s,%s,%s,%s)"
    )

    def __init__(self):
        """Open the database connection and (re)create the target table."""
        self.conn = pymysql.connect(
            host="127.0.0.1",
            port=3306,
            user='root',
            password='12345678',
            db='mysql',
        )
        self.cursor = self.conn.cursor()
        self.cursor.execute('drop table if exists fangyuan')
        sql = """create table fangyuan (
                title char(255)not null,
                company varchar(1000),
                salary varchar(1000),
                style varchar(1000)
                )"""
        self.cursor.execute(sql)

    def process_item(self, item, spider):
        """Insert one item as a row and commit immediately.

        Args:
            item: mapping with the keys ``title``, ``salary``, ``company``
                and ``style``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        print("開始儲存---")
        self.cursor.execute(
            self._INSERT_SQL,
            (item['title'], item['salary'], item['company'], item['style']),
        )
        self.conn.commit()
        print("儲存結束----")
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()

setting設定》》


# Project identity, and the package where spiders live and are generated.
BOT_NAME = 'qiancheng'
NEWSPIDER_MODULE = 'qiancheng.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl behaviour: honour robots.txt and pause between downloads.
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 1

# Header added to every request by default.
DEFAULT_REQUEST_HEADERS = dict(
    Referer='https://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=030200%2C00&funtype=0000&industrytype=00&keyword=',
)

# Enabled item pipelines mapped to their order value.
ITEM_PIPELINES = {'qiancheng.pipelines.QianchengPipeline': 300}

爬蟲部分結束。
開始視覺化:

餅狀圖:》》

from pyecharts import Pie
import pymysql
from collections import Counter

# Count how many stored rows fall under each company type ("style").
con = pymysql.connect(
    host="127.0.0.1",
    port=3306,
    user="root",
    password="12345678",
    db='mysql',
)
try:
    cursor = con.cursor(pymysql.cursors.DictCursor)
    cursor.execute('select style from fangyuan')
    # The query returns only the "style" column, so that is the key to read
    # (the previous code looked up 'salary' and raised KeyError).
    # Counting in Python replaces one string-formatted query per distinct
    # value and keeps the labels in first-seen order.
    style_counts = Counter(row['style'] for row in cursor.fetchall())
finally:
    con.close()

k = list(style_counts)           # distinct company types
o = list(style_counts.values())  # number of rows for each type, same order
print(o)
print(k)

# The chart plots a fixed snapshot of figures, not the values queried above.
pie = Pie('招聘公司的型別比例')
attr = ['上市公司', '民營公司', '外資(非歐美)', '國企', '合資', '創業公司', '外資(歐美)', '非營利組織', '事業單位', '外企代表處']
v1 = [2597, 42541, 1918, 2008, 2088, 954, 1183, 93, 95, 67]
pie.use_theme('dark')
pie.add('公司型別', attr, v1, is_label_show=True)
pie.render('pie.html')

在這裡插入圖片描述

條形圖:》》

from pyecharts import Bar

import pymysql
from collections import Counter

# Count how many stored rows carry each distinct salary string.
con = pymysql.connect(
    host="127.0.0.1",
    port=3306,
    user="root",
    password="12345678",
    db='mysql',
)
try:
    cursor = con.cursor(pymysql.cursors.DictCursor)
    cursor.execute('select salary from fangyuan')
    # Counting in Python replaces one string-formatted query per distinct
    # value (which broke on values containing a double quote) and keeps the
    # labels in first-seen order.
    salary_counts = Counter(row['salary'] for row in cursor.fetchall())
finally:
    con.close()

k = list(salary_counts)           # distinct salary strings
o = list(salary_counts.values())  # number of rows for each, same order

# Print a sample of the counts and labels.
for n in o[:20]:
    print(n)
for m in k[:20]:
    print(m)

# The chart plots a fixed snapshot of figures, not the values queried above.
v1 = [941, 270, 416, 6843, 4202, 122, 509, 1178, 817, 5979, 336, 256, 1706, 3084, 1287, 332]
attr = [ '0.8-1.5萬/月', '0.7-1萬/月', '3.5-4.5千/月', '6-8千/月', '3-4.5千/月', '4.5-6.5千/月', '3.5-5千/月', '0.6-1萬/月', '3-5千/月', '4.5-6千/月', '3.5-6千/月', '8-10萬/年', '4-6千/月', '1-1.5萬/月', '5-8千/月', '3-8千/月']
bar = Bar('各行業薪資水平')
bar.use_theme('dark')
# Pass the category labels and values; without them the series had no data.
bar.add('工資分佈', attr, v1, is_label_show=True)
bar.render('bar.html')

!選取了一部分資料進行分析,還有要改善的地方

最後再來張詞雲圖:》》

import jieba
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image
import numpy as np
# Read the whole text that the word cloud is built from.
with open(r'C:\Users\hc\Desktop\fangyuan.txt', 'r', encoding="UTF-8") as file1:
    content = file1.read()
# Join the jieba tokens with spaces so word boundaries survive; joining with
# an empty string would simply glue the text back together.
content_after = " ".join(jieba.cut(content, cut_all=True))

# Open the mask picture with PIL and convert it to a NumPy array.
images = Image.open("hello.jpg")
maskImages = np.array(images)

# The mask makes the cloud take the shape of the picture.
# NOTE(review): the wordcloud docs say width/height are ignored when a mask
# is given - confirm before relying on the 1500x1500 size.
wc = WordCloud(
    font_path=r"C:\Windows\Fonts\simsun.ttc",  # raw string: backslashes are literal
    background_color="black",
    max_words=1000,
    max_font_size=100,
    width=1500,
    height=1500,
    mask=maskImages,
).generate(content_after)  # feed the segmented text, not the raw file content
plt.imshow(wc)
wc.to_file('cyt.png')

在這裡插入圖片描述

啦啦啦啦阿拉啦啦啦啦。。。。。。。。。。。

就這樣,還有好多要完善的地方,希望各位大佬給點建議!
。。。。。。。。

順便再給點鼓勵,畢竟路還很長,你的鼓勵足以溫暖我心!。。。。。。。。。。。。。。