1. 程式人生 > Python3網路爬蟲:今日頭條新聞App的廣告資料抓取

Python3網路爬蟲:今日頭條新聞App的廣告資料抓取

咱們就不說廢話了,直接上完整的原始碼

def startGetData(self):
    """Fetch a random number of feed pages and parse each one.

    Between 2 and 10 requests are issued.  The first one is a "pull"
    refresh; every later one is a "load_more" request that carries the
    running ``self.list_count`` (set to 17 after the first request and
    increased by 8 after each later one).  The method sleeps 5 seconds
    before every request, prints the URL, and hands it to ``parse_url``.
    """
    # Query-string prefix that is identical for both request flavours.
    base_url = (
        "http://lf.snssdk.com/api/news/feed/v80/?fp=PSTqPzFSJ2HuFlG7LlU1FYmeLS4_&version_code=6.6.5&app_name=news_article"
        "&vid=07FEB726-62D1-442A-ADE2-67810CF8C421&device_id=51911855605&channel=App%20Store&resolution=750*1334&aid=13"
        "&ab_version=304488,346137,349052,271178,326588,326524,326532,338589,336927,295827,325048,345778,239096,348856,"
        "344345,170988,346540,332095,325197,338954,330633,297058,276204,286212,313219,338067,348326,347814,277771,310595,"
        "342074,334586,339207,323233,328671,346557,280773,338894,319962,344870,345191,348452,349020,348669,343444,214069,"
        "337616,348941,207251,266312,247847,280447,281298,328218,325618,328227,348992,288417,290193,326190,339904,344131"
        "&ab_feature=201617,z1&ab_group=z1,201617&openudid=0bf32dfcb91c3dc330eb92c492a9e9093fc44b51"
        "&idfv=07FEB726-62D1-442A-ADE2-67810CF8C421&ac=WIFI&os_version=9.3.1&ssmix=a&device_platform=iphone"
        "&iid=31813899088&ab_client=a1,f2,f7,e1&device_type=iPhone%206&idfa=0784D090-1DC1-4B24-BAA1-0C474ED94D52&detail=1"
    )

    page_total = random.randint(2, 10)
    for index in range(page_total):
        # One timestamp per request so every time-based parameter in the
        # same URL agrees, even when the clock ticks over mid-construction.
        now = str(int(time.time()))
        if index == 0:
            # NOTE(review): the URL hard-codes list_count=37 while the
            # counter is initialised to 17 — kept as in the original.
            url = (
                base_url
                + "&refresh_reason=1&last_refresh_sub_entrance_interval=" + now
                + "&tt_from=pull&count=20&list_count=37&support_rn=4&LBS_status=deny&cp=54AbF4Ad5aAE6q1&loc_mode=0&min_behot_time=" + now
                + "&image=1&session_refresh_idx=3&strict=1&refer=1&language=zh-Hans-CN&concern_id=6286225228934679042&as=a295754fa45e4a0a5a3192&ts=" + now
            )
            self.list_count = 17
        else:
            url = (
                base_url
                + "&last_refresh_sub_entrance_interval=" + now
                + "&tt_from=load_more&count=20&list_count=" + str(self.list_count)
                + "&support_rn=4&LBS_status=deny&cp=5bAfF1A75eB77q1&max_behot_time=" + now
                + "&loc_mode=0&image=1&strict=1&city=&refer=1&concern_id=6286225228934679042&language=zh-Hans-CN&as=a285d52fa5274adb8a3006&ts=" + now
            )
            self.list_count += 8
        # Pause before each request (including the first one).
        time.sleep(5)
        print(url)
        self.parse_url(url)

這個是啟動函式

def parse_url(self, url, timeout=10):
    """Download *url* and pass the decoded body to ``parse_json``.

    Args:
        url: Feed URL to request.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl loop.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(
        url,
        headers=self.getHeader(),
        verify=False,  # certificate verification is disabled, as in the original
        timeout=timeout,
    )
    # The body is decoded explicitly as UTF-8 rather than relying on the
    # encoding requests would guess from the response headers.
    self.parse_json(response.content.decode("utf-8"))
發送網路請求,並將返回的 JSON 字串交給 parse_json 解析
def getHeader(self):
    """Build the HTTP headers sent with every feed request.

    The ``tt-request-time`` header carries the current time in
    milliseconds; all other values are fixed strings.  The current
    millisecond timestamp is also printed before returning.

    Returns:
        dict: Header name -> header value.
    """
    # NOTE(review): "Host" names is.snssdk.com while the URLs built in
    # startGetData point at lf.snssdk.com — confirm this is intentional.
    cookie = "CNZZDATA1272189606=1385639719-1525687011-%7C1525692411;alert_coverage=76;install_id=31781370987;ttreq=1$b79c6e66ea460b1579579c027e8073593305644e;odin_tt = 4c07858cc8b75143c593d0a99a04aa8fcf10136c3dca9badd9c31a2aa9cc415022834c64d7f52952d9290e3028876735;UM_distinctid = 1633a13d9fd41b-0910970a30f79a8-12485712-3d10d-1633a13d9fe84a;_ga=GA1.2.555016291.1525687770;_gid=GA1.2.96631484.1525687770;qh[360] = 1;__tea_sdk__ssid=957b8ce1-d5b3-4010-bd9c-bfec73bdf526;__tea_sdk__user_unique_id=6552731409432937992;tt_webid=6552731409432937992"
    ss_cookie = "CNZZDATA1272189606=1385639719-1525687011-%7C1525692411;alert_coverage = 76;install_id=31781370987;ttreq=1$b79c6e66ea460b1579579c027e8073593305644e;odin_tt=4c07858cc8b75143c593d0a99a04aa8fcf10136c3dca9badd9c31a2aa9cc415022834c64d7f52952d9290e3028876735;UM_distinctid=1633a13d9fd41b-0910970a30f79a8-12485712-3d10d-1633a13d9fe84a;_ga=GA1.2.555016291.1525687770;_gid=GA1.2.96631484.1525687770;qh[360]=1;__tea_sdk__ssid=957b8ce1-d5b3-4010-bd9c-bfec73bdf526;__tea_sdk__user_unique_id=6552731409432937992;tt_webid=6552731409432937992"

    headers = {
        "Host": "is.snssdk.com",
        "Accept-Language": "zh-Hans;q=1",
        "tt-request-time": str(int(time.time() * 1000)),
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip,deflate",
        "Cookie": cookie,
        "X-SS-Cookie": ss_cookie,
        "User-Agent": "News/6.6.5(iPhone;iOS10.2;Scale/2.00)",
        "Accept": "*/*",
    }
    # Debug output: a freshly read millisecond timestamp.
    print(int(time.time() * 1000))
    return headers

頭部封裝

def parse_json(self, jsonStr):
    """Scan one feed response and save every game advertisement in it.

    The response is expected to be a JSON object whose ``"data"`` entry is
    a list; each element has a ``"content"`` field that is itself a JSON
    string.  An item is saved (via ``savaDataInfo``) when its ``label``
    marks it as an advertisement and at least one of its ``filter_words``
    names mentions games.

    Differences from the previous implementation:
      * a malformed item is reported and skipped instead of aborting the
        rest of the batch;
      * invalid JSON is reported instead of raising;
      * an advertisement is saved once, even when several filter words
        match.

    Args:
        jsonStr: Raw response body as text.
    """
    print(jsonStr)
    DataInfo.time = Util().getCurrTime()

    # The literals in this file are Traditional Chinese; the feed
    # presumably returns Simplified Chinese (TODO confirm against a real
    # response), so both spellings are accepted.
    ad_labels = ("廣告", "广告")
    game_words = ("遊戲", "游戏")

    try:
        json_list = json.loads(jsonStr)["data"]
    except (KeyError, ValueError) as x:
        # ValueError covers json.JSONDecodeError.
        print(x)
        return

    for json_str in json_list:
        try:
            content = json.loads(json_str["content"])
            if "label" not in content or content["label"] not in ad_labels:
                continue
            print("廣告")
            names = [filter_word["name"] for filter_word in content["filter_words"]]
            if any(word in name for name in names for word in game_words):
                print("遊戲" + str(content))
                self.savaDataInfo(content)
        except (KeyError, IndexError, ValueError) as x:
            # Skip only this item; keep processing the remaining ones.
            print(x)

解析json資料

def savaDataInfo(self, content):
    """Copy one advertisement into ``DataInfo`` and insert it into MySQL.

    Args:
        content: Decoded advertisement dict; ``title``, ``source`` and
            ``raw_ad_data`` are read here, the image/video fields are read
            by ``saveBitmapUrlOrPath``.

    Raises:
        KeyError: If a required field is missing from *content*.
    """
    DataInfo.title = content["title"]
    DataInfo.type = 1
    DataInfo.channel = "jinritoutiao"

    raw_ad = content["raw_ad_data"]
    # DataInfo is shared class-level state, so always assign a value;
    # otherwise an ad without a download URL would inherit the URL of the
    # previously saved ad.
    download_url = raw_ad["download_url"] if "download_url" in raw_ad else ""
    # insert_inspection_list reads ``DataInfo.app_download``; the old code
    # wrote ``DataInfo.appdownload``, so the URL never reached the database.
    DataInfo.app_download = download_url
    # Legacy spelling kept in case other code still reads it.
    DataInfo.appdownload = download_url

    self.saveBitmapUrlOrPath(content)
    DataInfo.device_type = "ios"
    DataInfo.app_name = content["source"]
    MySqlManager().insert_inspection_list(3)

儲存資料到mysql

def saveBitmapUrlOrPath(self, content):
    """Download the ad's cover image and record image/video info on ``DataInfo``.

    Sets ``DataInfo.pic_path``, ``DataInfo.source_type`` (3 when the ad has
    a ``display_url``, otherwise 1), ``DataInfo.pic_list`` and
    ``DataInfo.video``.

    Args:
        content: Decoded advertisement dict.

    Raises:
        KeyError / IndexError: If the expected image fields are missing.
    """
    # Microsecond timestamp in the file name keeps successive downloads apart.
    filename = "pic1_" + str(int(time.time() * 1000000)) + ".jpg"
    DataInfo.pic_path = {"pic_path1": self.path + filename}

    has_video = "display_url" in content
    DataInfo.source_type = 3 if has_video else 1
    video = {"video1": content["display_url"]} if has_video else {}

    # Video ads that carry detail info use the video's large cover image;
    # everything else falls back to the first entry of large_image_list.
    if has_video and "video_detail_info" in content:
        cover_url = content["video_detail_info"]["detail_video_large_image"]["url"]
    else:
        cover_url = content["large_image_list"][0]["url"]

    Util().save_img(cover_url, filename, self.path)
    DataInfo.pic_list = {"pic1": cover_url}
    DataInfo.video = video

下載圖片到伺服器,並儲存圖片路徑

# Insert the product detail into the database (or update the existing row).
def insert_product_detail(self, product_id, json_obj):
    """Upsert one row of ``product_detail``.

    The row is updated when ``isProductIdExits`` reports at least one
    existing row for *product_id*, otherwise a new row is inserted.

    The statement is executed with driver-side parameter binding instead
    of ``%`` string formatting, so values containing quotes can no longer
    break (or inject into) the SQL.  ``%s`` is the placeholder style used
    by the common MySQL drivers.

    Args:
        product_id: Product identifier used for the existence check and
            the UPDATE's WHERE clause.
        json_obj: Dict holding the product-detail fields read below.

    Raises:
        KeyError: If *json_obj* lacks one of the required fields.
    """
    table_name = "product_detail"
    # (column, value) pairs shared by UPDATE and INSERT.  int()/str()
    # mirror the %d / '%s' conversions of the old format string.
    fields = [
        ("company_num", int(json_obj["companyNum"])),
        ("days", int(json_obj["days"])),
        ("first_seen", str(json_obj["firstSeen"])),
        ("labels", str(json_obj["labels"])),
        ("last_seen", str(json_obj["lastSeen"])),
        ("logo_url", str(json_obj["logoURL"])),
        ("media_list", json.dumps(json_obj["mediaList"])),
        ("media_num", int(json_obj["mediaNum"])),
        ("product_id", int(json_obj["productId"])),
        ("product_name", str(json_obj["productName"])),
    ]
    now = self.getCurrentTime()

    # Truthiness instead of ``== 1``: when duplicates already exist the
    # row count is > 1 and the old check inserted yet another duplicate.
    if self.isProductIdExits(table_name, product_id):
        columns = [name for name, _ in fields] + ["updated_at"]
        sql = ("UPDATE " + table_name + " SET "
               + ",".join(name + "=%s" for name in columns)
               + " WHERE product_id=%s")
        params = [value for _, value in fields] + [now, int(product_id)]
    else:
        columns = [name for name, _ in fields] + ["created_at"]
        sql = ("INSERT INTO " + table_name + "(" + ",".join(columns) + ")"
               + " VALUES (" + ",".join(["%s"] * len(columns)) + ")")
        params = [value for _, value in fields] + [now]

    # Run on a cursor directly because bound parameters must be passed
    # alongside the statement.
    cursor = self.conn.cursor()
    try:
        cursor.execute(sql, params)
        self.conn.commit()
    finally:
        cursor.close()

# Insert the product detail page chart data into the database (or update it).
def insert_product_detail_table(self, product_id, json_obj):
    """Upsert one row of ``product_detail_table``.

    Fixes the old UPDATE statement, which assigned ``ad_creative_list``
    twice (the second time with the material list) and therefore never
    updated ``ad_material_list``.  Values are now bound by the driver
    (``%s`` placeholders) instead of being formatted into the SQL text.

    Args:
        product_id: Product identifier stored in the row and used for the
            existence check / WHERE clause.
        json_obj: Dict providing ``adCreativeList``, ``adMaterialList``,
            ``xlabel`` and ``adCountLastYear``.

    Raises:
        KeyError: If *json_obj* lacks one of the required fields.
    """
    table_name = "product_detail_table"
    fields = [
        ("ad_creative_list", json.dumps(json_obj["adCreativeList"])),
        ("ad_material_list", json.dumps(json_obj["adMaterialList"])),
        ("xlabel", json.dumps(json_obj["xlabel"])),
        ("ad_count_last_year", int(json_obj["adCountLastYear"])),
        ("product_id", int(product_id)),
    ]
    now = self.getCurrentTime()

    # Any existing row (not exactly one) means "update".
    if self.isProductIdExits(table_name, product_id):
        columns = [name for name, _ in fields] + ["updated_at"]
        sql = ("UPDATE " + table_name + " SET "
               + ",".join(name + "=%s" for name in columns)
               + " WHERE product_id=%s")
        params = [value for _, value in fields] + [now, int(product_id)]
    else:
        columns = [name for name, _ in fields] + ["created_at"]
        sql = ("INSERT INTO " + table_name + "(" + ",".join(columns) + ")"
               + " VALUES (" + ",".join(["%s"] * len(columns)) + ")")
        params = [value for _, value in fields] + [now]

    cursor = self.conn.cursor()
    try:
        cursor.execute(sql, params)
        self.conn.commit()
    finally:
        cursor.close()

# Insert the picture material into the database (or update the existing row).
def insert_product_detail_pic(self, product_id, json_obj):
    """Download the material's pictures and upsert ``product_pic_material_list``.

    Fixes the old UPDATE statement, whose final assignment was a second
    ``video='%s'`` that received the current timestamp (overwriting the
    video column); it now writes ``updated_at``, like the other tables.
    Values are bound by the driver (``%s`` placeholders) instead of being
    formatted into the SQL text.

    Args:
        product_id: Product identifier stored in the row.
        json_obj: Dict describing the material; ``materialId`` selects the
            row.

    Raises:
        KeyError: If *json_obj* lacks one of the required fields.
    """
    self.savePic(json_obj)
    table_name = "product_pic_material_list"
    material_id = json_obj["materialId"]
    # int()/str() mirror the %d / '%s' conversions of the old format string.
    fields = [
        ("company_num", int(json_obj["companyNum"])),
        ("creative_num", int(json_obj["creativeNum"])),
        ("first_seen", str(json_obj["firstSeen"])),
        ("h", int(json_obj["h"])),
        ("last_days", int(json_obj["lastDays"])),
        ("last_seen", str(json_obj["lastSeen"])),
        ("material_id", int(material_id)),
        ("material_type", int(json_obj["materialType"])),
        ("media_list", json.dumps(json_obj["mediaList"])),
        ("new", str(json_obj["new"])),
        ("pic1", str(json_obj["pic1"])),
        ("pic2", str(json_obj["pic2"])),
        ("pic3", str(json_obj["pic3"])),
        ("product_num", int(json_obj["productNum"])),
        ("video", str(json_obj["video"])),
        ("w", int(json_obj["w"])),
        ("product_id", int(product_id)),
    ]
    now = self.getCurrentTime()

    # Any existing row (not exactly one) means "update".
    if self.isMaterialIdExits(table_name, material_id):
        columns = [name for name, _ in fields] + ["updated_at"]
        sql = ("UPDATE " + table_name + " SET "
               + ",".join(name + "=%s" for name in columns)
               + " WHERE material_id=%s")
        params = [value for _, value in fields] + [now, int(material_id)]
    else:
        columns = ([name for name, _ in fields]
                   + ["created_at", "pic1_path", "pic2_path", "pic3_path"])
        sql = ("INSERT INTO " + table_name + "(" + ",".join(columns) + ")"
               + " VALUES (" + ",".join(["%s"] * len(columns)) + ")")
        # A path attribute may be missing when the matching picture was
        # blank; store an empty string instead of raising AttributeError.
        params = [value for _, value in fields] + [
            now,
            str(getattr(self, "pic1_path", "")),
            str(getattr(self, "pic2_path", "")),
            str(getattr(self, "pic3_path", "")),
        ]

    cursor = self.conn.cursor()
    try:
        cursor.execute(sql, params)
        self.conn.commit()
    finally:
        cursor.close()

def savePic(self, json_obj):
    """Download the material's up-to-three pictures and record their paths.

    For each of ``pic1``/``pic2``/``pic3`` the matching
    ``self.picN_path`` attribute is set to the local file path when the
    URL is non-blank, and to an empty string otherwise.  Previously a
    blank URL left the attribute untouched, so it was either undefined
    (AttributeError in the caller) or still held the previous material's
    path.

    Args:
        json_obj: Dict with string entries ``pic1``, ``pic2`` and ``pic3``.

    Raises:
        KeyError: If one of the three entries is missing.
    """
    # Read all three URLs up front so a missing key fails before any download.
    pictures = [(slot, json_obj[slot]) for slot in ("pic1", "pic2", "pic3")]
    for slot, url in pictures:
        path_attr = slot + "_path"
        if url.strip() == '':
            setattr(self, path_attr, "")
            continue
        # Microsecond timestamp keeps file names of successive downloads apart.
        filename = slot + "_" + str(int(time.time() * 1000000)) + ".jpg"
        setattr(self, path_attr, self.path + filename)
        Util().save_img(url, filename, self.path)

def isProductIdExits(self, table_name, product_id):
    """Return how many rows of *table_name* have the given ``product_id``.

    Args:
        table_name: Table to query.  It is concatenated into the SQL text
            (identifiers cannot be bound), so it must be a trusted,
            code-supplied name.
        product_id: Value to look up; bound as a query parameter.

    Returns:
        Whatever ``cursor.execute`` returns for the SELECT — callers treat
        it as the number of matching rows.
    """
    query_sql = "SELECT 1 FROM " + table_name + " WHERE product_id = %s"
    cursor = self.conn.cursor()
    try:
        result = cursor.execute(query_sql, (product_id,))
    finally:
        # The cursor was previously never closed.
        cursor.close()
    print(result)
    self.conn.commit()
    return result

def isMaterialIdExits(self, table_name, material_id):
    """Return how many rows of *table_name* have the given ``material_id``.

    Args:
        table_name: Table to query.  It is concatenated into the SQL text
            (identifiers cannot be bound), so it must be a trusted,
            code-supplied name.
        material_id: Value to look up; bound as a query parameter.

    Returns:
        Whatever ``cursor.execute`` returns for the SELECT — callers treat
        it as the number of matching rows.
    """
    query_sql = "SELECT 1 FROM " + table_name + " WHERE material_id = %s"
    cursor = self.conn.cursor()
    try:
        result = cursor.execute(query_sql, (material_id,))
    finally:
        # The cursor was previously never closed.
        cursor.close()
    print(result)
    self.conn.commit()
    return result

def insert_inspection_list(self, table_id):
    """Insert the advertisement currently held in ``DataInfo`` into its table.

    The target table is chosen by ``getTableName(table_id)``.  Values are
    bound by the driver (``%s`` placeholders) rather than formatted into
    the SQL text: the fields come from crawled content, and a title that
    contains a quote character used to break the statement.

    Args:
        table_id: Integer passed to ``getTableName`` to pick the table.
    """
    columns = ("title,app_download,time,channel,type,content,gif,video,source_type,"
               "pic_list,pic_path,device_type,material_size,app_name,created_at,updated_at")
    sql = ("INSERT INTO " + self.getTableName(table_id) + "(" + columns + ")"
           + " VALUES (" + ",".join(["%s"] * 16) + ")")
    # One timestamp for both columns so created_at == updated_at on insert.
    now = self.getCurrentTime()
    # str()/int() mirror the '%s' / %d conversions of the old format string.
    params = (
        str(DataInfo.title), str(DataInfo.app_download), str(DataInfo.time), str(DataInfo.channel),
        int(DataInfo.type),
        str(DataInfo.content), json.dumps(DataInfo.gif), json.dumps(DataInfo.video),
        int(DataInfo.source_type),
        json.dumps(DataInfo.pic_list), json.dumps(DataInfo.pic_path),
        str(DataInfo.device_type), str(DataInfo.material_size), str(DataInfo.app_name),
        now, now,
    )
    cursor = self.conn.cursor()
    try:
        cursor.execute(sql, params)
        self.conn.commit()
    finally:
        cursor.close()

def getCurrentTime(self):
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    # localtime() without an argument uses the current time.
    return time.strftime(timestamp_format, time.localtime())

def getTableName(self, table_id):
    """Return the material table name for *table_id*: ``material_<table_id % 10>``."""
    suffix = table_id % 10
    return f"material_{suffix}"

def execute(self, sql, params=None):
    """Run one SQL statement on ``self.conn`` and commit it.

    Args:
        sql: Statement text.
        params: Optional sequence/mapping of values for the statement's
            placeholders.  When omitted, *sql* is executed as-is, exactly
            as before, so existing callers are unaffected.
    """
    cursor = self.conn.cursor()
    try:
        if params is None:
            cursor.execute(sql)
        else:
            cursor.execute(sql, params)
        self.conn.commit()
    finally:
        # The cursor was previously never closed.
        cursor.close()

def close(self):
    """Close the database connection held in ``self.conn``."""
    connection = self.conn
    connection.close()
儲存資料庫操作