1. 程式人生 > >python隨筆(一)

python隨筆(一)

xlsx border album pytho 環境 win 自己 column cts

python爬蟲獲取QQ音樂和豆瓣的最新電影音樂名字

先上代碼開源大家一起學習,代碼如下:

#!python2
#coding:utf-8
# Module author tag; must be a string literal (a bare name raises NameError).
__author__ = 'OldHarry'

import urllib2
import os
import re
import json
import xlsxwriter
import sys
# Python 2 only: switch the interpreter-wide default codec to UTF-8.
# reload(sys) is needed because the site module removes
# sys.setdefaultencoding from the sys namespace during startup.
# NOTE(review): presumably here so implicit str <-> unicode conversions of the
# scraped Chinese text do not raise UnicodeDecodeError -- confirm before removing.
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    reload(sys)
    sys.setdefaultencoding(defaultencoding)

# Patterns are compiled once at import time and kept byte-identical to the
# original script.
_KUGOU_SONG_RE = re.compile(r'<span class="songName">(.*?) - (.*?)</span>')
# NOTE(review): the captured group is the "name" field of the "album" object,
# not a song field -- confirm this is the value wanted for the new-song columns.
_QQ_ALBUM_RE = re.compile(
    r'{"action":{"alert":[0-9]+,"icons":[0-9]+,"msgdown":[0-9]+,'
    r'"msgfav":[0-9]+,"msgid":[0-9]+,"msgpay":[0-9]+,"msgshare":[0-9]+,'
    r'"switch":[0-9]+},"album":{"id":[0-9]+,"mid":"[a-zA-Z0-9]+","name":"(.*?)"'
)
_DOUBAN_NOW_PLAYING_RE = re.compile(r'alt="(.*?)" rel="[a-z]+" class="" />')
_DOUBAN_TITLE_RE = re.compile(r'"title":"(.*?)"')


def _unique_in_order(items):
    """Return the items of *items* without duplicates, keeping first-seen order."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def _print_titles(titles):
    """Print *titles* as one JSON array without escaping non-ASCII characters."""
    # The ``encoding`` keyword exists only in Python 2's json module.
    print(json.dumps(titles, encoding="UTF-8", ensure_ascii=False))


def _qq_new_song_url(token, region_type):
    """Build the QQ Music ``GetNewSong`` request URL.

    token       -- value of the ``-`` query parameter, as captured in the
                   original requests (opaque to this script).
    region_type -- integer placed in the URL-encoded ``"type"`` field of the
                   ``data`` payload.
    """
    # Plain concatenation on purpose: the URL is full of literal '%' escapes,
    # so %-formatting would misinterpret it.
    return (
        "https://u.y.qq.com/cgi-bin/musicu.fcg?-=" + token +
        "&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8"
        "&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0"
        "&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B"
        "%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A"
        "%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A" + str(region_type) +
        "%7D%7D%7D"
    )


def getHtml(url):
    """Fetch *url* with browser-like headers and return the raw response body.

    A warning line is printed when the response status code is not 200.
    Exceptions raised by urllib2 (for example on connection failure) are not
    caught here and propagate to the caller.
    """
    send_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
    }
    request = urllib2.Request(url, headers=send_headers)
    response = urllib2.urlopen(request)
    try:
        status = response.getcode()
        if status != 200:
            print("訪問出現錯誤...錯誤代碼: %s" % status)
        return response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()


def kugoumusic(url):
    """Return the de-duplicated song titles found on the Kugou page at *url*.

    Each match is an ``(artist, song)`` pair; only the song part is kept and
    decoded from UTF-8. The resulting list is also printed as JSON.
    """
    page = getHtml(url)
    # De-duplicate on the decoded title itself. The original compared the
    # (artist, song) tuple against a list of titles, which never matched.
    titles = _unique_in_order(
        song.decode('utf8') for _artist, song in _KUGOU_SONG_RE.findall(page)
    )
    _print_titles(titles)
    return titles


def qqmusic(url):
    """Return the de-duplicated names captured from the QQ Music JSON at *url*.

    Names are stripped of surrounding whitespace before de-duplication, and
    the resulting list is also printed as JSON.
    """
    page = getHtml(url)
    # str.strip() returns a new string; the original discarded that result.
    names = _unique_in_order(name.strip() for name in _QQ_ALBUM_RE.findall(page))
    _print_titles(names)
    return names


def dbmovie(url):
    """Return (and print as JSON) the ``alt`` texts matched on the Douban page."""
    titles = _DOUBAN_NOW_PLAYING_RE.findall(getHtml(url))
    _print_titles(titles)
    return titles


def rmmovie(url):
    """Return (and print as JSON) every ``"title"`` value in the Douban JSON."""
    titles = _DOUBAN_TITLE_RE.findall(getHtml(url))
    _print_titles(titles)
    return titles


def rmdsj():
    """Return the TV titles from three consecutive 20-item result pages."""
    base = ("https://movie.douban.com/j/search_subjects?type=tv"
            "&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=")
    titles = []
    for page_start in (0, 20, 40):
        titles.extend(rmmovie(base + str(page_start)))
    return titles


def runtest():
    """Fetch every chart and write them side by side into one workbook.

    Each source becomes one column: its heading goes in row 1 and its titles
    follow from row 2. The workbook is written to ``TXT.xls`` in the current
    working directory.
    """
    # os.path.join replaces the hard-coded '\\' separator, which only worked
    # on Windows.
    # NOTE(review): xlsxwriter produces the XLSX format, so Excel warns about
    # the ".xls" extension. The name is kept so the output location does not
    # change; consider renaming to ".xlsx".
    out_path = os.path.join(os.path.abspath('.'), 'TXT.xls')

    douban_search = "https://movie.douban.com/j/search_subjects?type=movie&tag="
    # (column heading, fetch function, positional arguments), in column order.
    sources = [
        ("酷狗音樂--新歌榜", kugoumusic, ("http://www.kugou.com/",)),
        ("騰訊音樂--內地新歌榜", qqmusic,
         (_qq_new_song_url("recom2388477980207393", 1),)),
        ("騰訊音樂--港臺新歌榜", qqmusic,
         (_qq_new_song_url("recom6698628102261504", 2),)),
        ("騰訊音樂--歐美新歌榜", qqmusic,
         (_qq_new_song_url("recom08419989487702839", 3),)),
        ("騰訊音樂--日本新歌榜", qqmusic,
         (_qq_new_song_url("recom24411354608866187", 4),)),
        ("騰訊音樂--韓國新歌榜", qqmusic,
         (_qq_new_song_url("recom909302436024819", 5),)),
        ("豆瓣電影--正在熱映", dbmovie, ("https://movie.douban.com/",)),
        ("豆瓣電影--熱門電影", rmmovie,
         (douban_search + "%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0",)),
        ("豆瓣電影--最新電影", rmmovie,
         (douban_search + "%E6%9C%80%E6%96%B0&page_limit=20&page_start=0",)),
        ("豆瓣電影--經典電影", rmmovie,
         (douban_search + "%E7%BB%8F%E5%85%B8&sort=time&page_limit=20&page_start=0",)),
        ("豆瓣電影--可播放電影", rmmovie,
         (douban_search + "%E5%8F%AF%E6%92%AD%E6%94%BE&sort=time&page_limit=20&page_start=0",)),
        ("豆瓣電影--高分電影", rmmovie,
         (douban_search + "%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=time&page_limit=20&page_start=0",)),
        ("豆瓣電影--熱門電視劇", rmdsj, ()),
    ]

    headings = []
    columns = []
    for heading, fetch, args in sources:
        print(heading)
        headings.append(heading)
        columns.append(fetch(*args))

    # The workbook is created only after every fetch has succeeded.
    workbook = xlsxwriter.Workbook(out_path)
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format({'bold': 1, 'align': 'center', 'border': 1})
    cell_format = workbook.add_format({'align': 'center', 'border': 1})

    worksheet.write_row(0, 0, headings, header_format)
    # One call sizes every used column (A..M) to the same width.
    worksheet.set_column(0, len(headings) - 1, 30)
    for col, titles in enumerate(columns):
        worksheet.write_column(1, col, titles, cell_format)
    workbook.close()


if __name__ == '__main__':
    runtest()

主要思路是:第一步解析網站,第二步選擇自己想要的數據,第三步在當前文件夾生成一個 Excel 文件並寫入數據。

第一次寫博客,各路大神不喜勿噴,python萌新一枚。

開發環境:PyCharm,Python 2.7

2019-04-04 11:33:23

Study hard and make progress every day!

python隨筆(一)