1. 程式人生 > python 抓取電影天堂電影信息放入數據庫

python 抓取電影天堂電影信息放入數據庫

python mysql 電影

# coding:utf-8
"""Scrape movie detail pages linked from the dytt8.net home page into MySQL.

The script fetches the home page, follows every selected link whose href
contains ``dyzz``, extracts the labelled metadata fields from each detail
page and inserts one row per movie into the ``move_info`` table.

The ``urllib2`` import below exists only on Python 2, so the script is kept
compatible with that interpreter.
"""
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import urllib2
import re
import json
import chardet
import pymysql

# Earlier version of the home-page loop, left commented out by the author:
# url = "http://dytt8.net/"
# page = requests.get(url).content
# page_html = BeautifulSoup(page,'lxml')
# name = page_html.select("td.inddline > a:nth-of-type(2)")
# for n in name:
#     if 'dyzz' in n.encode('gbk'):
#         print n.encode('gbk')
#         file = open("move.txt","a+")
#         file.write(n.encode('utf-8')+'\n')
#         file.close()

# (result key, regex) pairs. Each regex captures the text between a field
# label and the next "<br/>" (or the download-section marker for the intro)
# in the serialized page markup.
# NOTE(review): the labels are reproduced exactly as found in this copy of
# the script (Traditional Chinese); confirm they match the text the target
# site actually serves before relying on these fields being populated.
_FIELD_PATTERNS = (
    ('yiming', r"譯  名(.*?)<br/>"),
    ('leibie', r"類  別(.*?)<br/>"),
    ('yuyan', r"語  言(.*?)<br/>"),
    ('zimu', r"字  幕(.*?)<br/>"),
    ('date', r"上映日期(.*?)<br/>"),
    ('douban', r"豆瓣評分(.*?)<br/>"),
    ('pianchang', r"片  長(.*?)<br/>"),
    ('daoyan', r"導  演(.*?)<br/>"),
    ('zhuyan', r"主  演(.*?)<br/>"),
    ('jianjie', r"簡  介(.*?)【下載地址】"),
)

# Column order of the INSERT statement; the keys match the dict returned by
# getmoveinfo().
_COLUMNS = (
    'title', 'yiming', 'leibie', 'yuyan', 'zimu', 'date',
    'douban', 'pianchang', 'daoyan', 'zhuyan', 'jianjie', 'addres',
)

# Placeholders instead of string formatting: scraped text is untrusted and
# may contain quotes, which would otherwise break or subvert the statement.
_INSERT_SQL = (
    "INSERT INTO move_info"
    "(title,yiming,leibie,yuyan,zimu,date,douban,pianchang,daoyan,zhuyan,jianjie,addres)"
    "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
)


def _first_match(pattern, text):
    """Return the first capture group of *pattern* in *text*, or '' if absent."""
    found = re.search(pattern, text)
    return found.group(1) if found else ''


def getmoveinfo(url):
    """Fetch one movie detail page and extract its metadata.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A dict with the keys ``title``, ``yiming``, ``leibie``, ``yuyan``,
        ``zimu``, ``date``, ``douban``, ``pianchang``, ``daoyan``,
        ``zhuyan``, ``jianjie`` and ``addres``. Any field that cannot be
        found on the page is the empty string.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    page = requests.get(url).content
    page_html = BeautifulSoup(page, 'lxml')
    # Serialize the document once; every field regex runs on this text.
    html_text = str(page_html)

    res = {}
    for key, pattern in _FIELD_PATTERNS:
        res[key] = _first_match(pattern, html_text)
    # The intro spans several visual lines; drop the line-break tags.
    res['jianjie'] = res['jianjie'].replace("<br/>", "")

    # The title is the first child of the first <font color="#07519a">.
    title = page_html.find_all("font", attrs={"color": "#07519a"})
    if title and title[0].contents:
        res['title'] = title[0].contents[0].encode("utf-8")
    else:
        res['title'] = ''

    # The download address is the href of the link inside the first
    # <td bgcolor="#fdfddf"> cell.
    res['addres'] = ''
    cell = page_html.find("td", attrs={"bgcolor": "#fdfddf"})
    if cell is not None:
        link = cell.find("a", href=True)
        if link is not None:
            res['addres'] = link["href"].encode('utf-8')

    return res


url = "http://dytt8.net/"
page = requests.get(url).content
page_html = BeautifulSoup(page, 'lxml')
name = page_html.select("td.inddline > a:nth-of-type(2)")

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='root', db='moves', charset='utf8')
try:
    cursor = conn.cursor()
    try:
        for n in name:
            href = n.get("href")
            # Only follow links that have an href containing "dyzz".
            if not href or 'dyzz' not in href:
                continue
            info = getmoveinfo("http://dytt8.net" + href)
            cursor.execute(_INSERT_SQL, [info[column] for column in _COLUMNS])
            # Commit per row so a failure on a later page does not discard
            # the rows that were already scraped.
            conn.commit()
    finally:
        cursor.close()
finally:
    # Always release the connection, even when a fetch or insert fails.
    conn.close()
print('ok')

python 抓取電影天堂電影信息放入數據庫