python實現數據爬取-清洗-持久化存儲-數據平臺可視化
基於 Python 對淘寶模特個人信息進行篩選爬取與數據清洗,並持久化寫入 MySQL 數據庫;再使用 Django 對數據庫中的數據進行篩選,生成可視化報表以供分析。
數據爬取,篩選,存庫:
# -*- coding:utf-8 -*-
"""Crawl Taobao model listings and store each profile in MySQL.

For every listing page in the configured range the script downloads the
ranking list, follows each model's detail page, extracts nine fields and
inserts them as one row into the ``model`` table of the ``xxnlove`` database.
"""
import re
import sys

import chardet
import MySQLdb
import requests
from bs4 import BeautifulSoup

if sys.version_info[0] == 2:
    # Python 2 only: switch the interpreter's default codec to UTF-8 (the
    # original script's workaround for handling the scraped Chinese text).
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Placeholders carried over from the article -- replace with real settings.
DB_PORT = 3306  # MySQL's default port; the article left this value blank
DB_PASSWORD = '數據庫密碼'

LIST_URL = 'http://mm.taobao.com/json/request_top_list.htm?page=%d'
INFO_URL = 'http://mm.taobao.com/self/info/model_info_show.htm?user_id=%d'
PAGE_ENCODING = 'gb2312'
REQUEST_TIMEOUT = 10  # seconds; without it a stalled server hangs the crawl

# "if not exists" lets the script be re-run without failing on the table.
# Note from the original article, an example of creating a database:
#   CREATE DATABASE gmtdb DEFAULT CHARACTER SET utf8mb4;
CREATE_TABLE_SQL = (
    "create table if not exists model(name text(225),age varchar(10),"
    "blood varchar(10),school text(225),height varchar(10),"
    "weight varchar(10),Measurements text(225),cup varchar(20),"
    "location text(225))ENGINE=InnoDB DEFAULT CHARSET=utf8;"
)
INSERT_SQL = "insert into model values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"


def _fetch_soup(url):
    """Download *url*, decode it as GB2312 and return the parsed document."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = PAGE_ENCODING
    return BeautifulSoup(response.text, 'lxml')


def _extract_row(model):
    """Return the nine-column row for one ``.list-item`` element.

    Name, age and location come from the listing entry itself; the other
    fields come from the model's detail page, which is fetched here.
    Raises whatever the HTTP request or the element lookups raise when the
    page does not have the expected structure.
    """
    follow = model.find('span', {'class': 'friend-follow J_FriendFollow'})
    detail = _fetch_soup(INFO_URL % int(follow['data-userid']))
    top = model.find('p', {'class': 'top'})

    name = model.find('a', {'class': 'lady-name'}).string
    age = top.em.strong.string
    location = top.span.string
    # May be None when the detail page leaves the field empty.
    blood = detail.find_all('li', {'class': 'mm-p-cell-right'})[1].span.string
    school = detail.find_all('li')[5].span.string
    height = detail.find('li', {'class': 'mm-p-small-cell mm-p-height'}).p.string
    weight = detail.find('li', {'class': 'mm-p-small-cell mm-p-weight'}).p.string
    measurements = detail.find('li', {'class': 'mm-p-small-cell mm-p-size'}).p.string
    cup = detail.find('li', {'class': 'mm-p-small-cell mm-p-bar'}).p.string

    # Other fields the original author printed while debugging (not stored):
    #   birthday:   detail.find('li', {'class': 'mm-p-cell-left'}).span.string
    #   shoe size:  detail.find('li', {'class': 'mm-p-small-cell mm-p-shose'}).p.string
    #   tags:       model.find_all('p')[1].em.string
    #   followers:  model.find_all('p')[1].strong.string
    #   rank:       first stripped string of
    #               model.find('div', {'class': 'popularity'}).dl.dt
    #   summary:    model.find('ul', {'class': 'info-detail'}).get_text(" ", strip=True)
    #   profile:    "http:" + model.find('a', {'class': 'lady-name'})['href']
    #   portfolio:  "http:" + model.find('a', {'class': 'lady-avatar'})['href']
    #   avatar:     "http:" + model.find('img')['src']
    return (name, age, blood, school, height, weight, measurements, cup, location)


conn = MySQLdb.connect(
    host='localhost',
    port=DB_PORT,
    user='root',
    passwd=DB_PASSWORD,
    db='xxnlove',
    charset='utf8',
)
cur = conn.cursor()
try:
    cur.execute(CREATE_TABLE_SQL)
    for num in range(521, 1314):
        try:
            listing = _fetch_soup(LIST_URL % num)
            for model in listing.select(".list-item"):
                try:
                    cur.execute(INSERT_SQL, _extract_row(model))
                except Exception as exc:
                    # One malformed profile must not abort the whole page.
                    print("error: %s" % exc)
            # Commit page by page so an interrupted crawl keeps its progress.
            conn.commit()
        except Exception as exc:
            # %-formatting: the original "num + str" raised TypeError here.
            print("%d page is error: %s" % (num, exc))
finally:
    cur.close()
    conn.close()
數據庫結構:
寫入數據庫中的模特記錄數量:
寫入數據庫中模特信息部分圖:
django 實現圖表展示:
#coding:utf-8
"""Django views backed by a raw MySQLdb connection.

The part of the module visible here stores contact-form submissions.
"""
# Create your views here.
from django.shortcuts import render,render_to_response
from django.http import HttpResponse,HttpResponseRedirect
import MySQLdb
import sys
import re
import json
import jieba
from operator import itemgetter
from pytagcloud import create_tag_image, make_tags
import random
import time
import smtplib
from email.mime.text import MIMEText

if sys.version_info[0] == 2:
    # Python 2 only: switch the interpreter's default codec to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Placeholders carried over from the article -- replace with real settings.
DB_PORT = 3306  # MySQL's default port; the article left this value blank

# One connection shared by every view in this module.
conn = MySQLdb.connect(
    host='localhost',
    port=DB_PORT,
    user='root',
    passwd='密碼',
    db='xxnlove',
    charset='utf8',
)


def receive_message(request):
    """Store a submitted contact form and render the index page.

    On POST, the ``name``, ``email``, ``subject`` and ``message`` form fields
    are inserted as one row into the ``message`` table. A missing field
    raises the KeyError from ``request.POST[...]``, as before.

    Returns the rendered ``index.html`` response for every request method.
    """
    if request.method == 'POST':
        name = request.POST['name']
        email = request.POST['email']
        subject = request.POST['subject']
        message = request.POST['message']
        cur = conn.cursor()
        try:
            sql = "insert into message values(%s,%s,%s,%s)"
            cur.execute(sql, (name, email, subject, message))
            conn.commit()
        except Exception:
            # Leave the shared connection in a clean state for later requests.
            conn.rollback()
            raise
        finally:
            cur.close()
        # The module-level connection is intentionally NOT closed here:
        # closing it made every request after the first POST fail.
    return render_to_response('index.html')


# NOTE: the source is truncated from this point on (the e-mail address was
# redacted by the hosting site and the rest of the module was lost). The
# surviving fragment is kept verbatim below:
#
# def send_email(request):
#     _user = "[email protected]
{% load staticfiles %}
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Charts demo</title>
    <script src="{% static "js/echarts.js" %}"></script>
    <script src="{% static "js/china.js" %}"></script>
    <script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
</head>
<body>
    {# ECharts China map; each region's value is filled in from the template context. #}
    <div id="main" style="height:600px;"></div>
    <script type="text/javascript">
        var myChart = echarts.init(document.getElementById('main'));
        // Declared with var; the original assigned an implicit global.
        var option = {
            title : {
                text: '淘寶模特所在省份分部情況',
                subtext: '',
                x:'center'
            },
            tooltip : {
                trigger: 'item'
            },
            legend: {
                orient: 'vertical',
                x:'left',
                data:['']
            },
            dataRange: {
                min: 0,
                max: 2500,
                x: 'left',
                y: 'bottom',
                text:['高','低'],           // label text; defaults to the numeric values
                calculable : true
            },
            toolbox: {
                show: true,
                orient : 'vertical',
                x: 'right',
                y: 'center',
                feature : {
                    mark : {show: true},
                    dataView : {show: true, readOnly: false},
                    restore : {show: true},
                    saveAsImage : {show: true}
                }
            },
            roamController: {
                show: true,
                x: 'right',
                mapTypeControl: {
                    'china': true
                }
            },
            series : [
                {
                    name: '人數',
                    type: 'map',
                    mapType: 'china',
                    roam: false,
                    itemStyle:{
                        normal:{label:{show:true}},
                        emphasis:{label:{show:true}}
                    },
                    // NOTE(review): region names presumably have to match the
                    // names defined in china.js exactly -- confirm.
                    data:[
                        {name: '北京',value: {{ beijin }} },
                        // NOTE(review): 江西 appears twice (jianxi here,
                        // jiangxi further down); confirm which context
                        // variable the view provides and drop the other.
                        {name: '江西',value: {{ jianxi }} },
                        {name: '廣東',value: {{ guangdong }} },
                        {name: '山東',value: {{ shandong }} },
                        {name: '江蘇',value: {{ jiangsu }} },
                        {name: '河南',value: {{ henan }} },
                        {name: '上海',value: {{ shanghai }} },
                        {name: '河北',value: {{ hebei }} },
                        {name: '浙江',value: {{ zhejiang }} },
                        {name: '陜西',value: {{ shanxi }} },
                        {name: '湖南',value: {{ hunan }} },
                        {name: '重慶',value: {{ chongqing }} },
                        {name: '福建',value: {{ fujian }} },
                        {name: '天津',value: {{ tianjin }} },
                        {name: '雲南',value: {{ yunnan }} },
                        {name: '四川',value: {{ sichuan }} },
                        {name: '廣西',value: {{ guangxi }} },
                        {name: '安徽',value: {{ anhui }} },
                        {name: '海南',value: {{ hainan }} },
                        {name: '江西',value: {{ jiangxi }} },
                        {name: '湖北',value: {{ hubei }} },
                        {name: '山西',value: {{ shanxi2 }} },
                        {name: '遼寧',value: {{ liaoning }} },
                        {name: '臺灣',value: {{ taiwan }} },
                        {name: '黑龍江',value: {{ heilongjiang }} },
                        {name: '貴州',value: {{ guizhou }} },
                        {name: '甘肅',value: {{ gansu }} },
                        {name: '青海',value: {{ qinghai }} },
                        {name: '新疆',value: {{ xinjiang }} },
                        {name: '西藏',value: {{ xizang }} },
                        {name: '吉林',value: {{ jiling }} },
                        {name: '寧夏',value: {{ ningxia }} },
                        {name: '內蒙古',value: {{ neimenggu }} }
                    ]
                }
            ]
        };
        myChart.setOption(option);
    </script>
</body>
</html>
{% load staticfiles %}
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>動態數據展示</title>
</head>
<body>
    <!-- Container with an explicit size (height) for ECharts to draw into -->
    <div id="main" style="height:400px"></div>
    <!-- Single-file ECharts include -->
    <script src="http://echarts.baidu.com/build/dist/echarts.js"></script>
    <script type="text/javascript">
        // Module path configuration
        require.config({
            paths: {
                echarts: 'http://echarts.baidu.com/build/dist'
            }
        });

        // Load the required modules, then draw the chart
        require(
            [
                'echarts',
                'echarts/chart/bar' // bar charts need the bar module; modules load on demand
            ],
            function (ec) {
                // Initialise the chart on the prepared DOM node
                var myChart = ec.init(document.getElementById('main'));

                // NOTE(review): category and agedata are written straight into
                // the script by the template engine. If the view passes
                // strings, Django autoescaping may mangle the quotes --
                // confirm against the view.
                var option = {
                    tooltip: {
                        show: true
                    },
                    legend: {
                        color:'#0000FF',
                        data:['模特年齡']
                    },
                    xAxis : [
                        {
                            type : 'category',
                            data : {{ category }}
                        }
                    ],
                    yAxis : [
                        {
                            type : 'value'
                        }
                    ],
                    series : [
                        {
                            "name":"模特年齡",
                            "type":"bar",
                            "data":{{ agedata }}
                        }
                    ]
                };

                // Hand the option (and its data) to the chart
                myChart.setOption(option);
            }
        );
    </script>
</body>
</html>
網站首頁:
提交的信息會寫入數據庫中:
模特年齡正態分布情況:
首先對信息進行分詞處理,然後排序,選取出現頻率最高的前100個詞。
這部分花了我很多時間。需要解決的問題是:echarts 地圖只精確到省或直轄市,而我爬取到的所在地可能是某個具體的市名。針對這個問題,我先整理出各省下屬的市有哪些,再在 SQL 語句中使用正則匹配想要獲取的信息。為此我創建了一個字典存放省名及其下屬的市名,另外創建一個字典存放省名和匹配到的人數。
簡單小結:這裏面涉及到的知識點還挺多的:
爬蟲:我使用的是 requests 和 BeautifulSoup(bs4)這兩個庫。
數據庫:使用的是mysql,涉及到數據庫編碼,sql查詢,模糊匹配,python對數據庫的操作,中文顯示亂碼的問題。
詞雲:jieba進行分詞,pytagcloud用來生成詞雲。
django:views、templates、static 、url,因為我用的MySQLdb,所以沒有使用django自身的ORM(models),這樣我覺得更靈活。
前端展示:bootstrap(主要用來做網站的布局)和echarts(進行圖表展示和數據分析用)。
本文出自 “付煒超” 博客,請務必保留此出處http://9399369.blog.51cto.com/9389369/1953469
python實現數據爬取-清洗-持久化存儲-數據平臺可視化