1. 程式人生 > 基於百度AI的自然語言處理文字分類

基於百度AI的自然語言處理文字分類

前言:

需要在百度AI平臺註冊登入並建立專案。

爬蟲程式碼

 1 import scrapy
 2 from BaiDuAi.items import  BaiduaiItem
 3 
 4 class AiSpider(scrapy.Spider):
 5     name = 'ai'
 6     # allowed_domains = ['www.xxx.com']
 7     #人民網url
 8     start_urls = ['http://politics.people.com.cn/n1/2018/1217/c1001-30470023.html']
 9 
10     def parse(self, response):
11 title=response.xpath('/html/body/div[4]/h1/text()').extract_first() 12 content=response.xpath('//*[@id="rwb_zw"]//text()').extract() 13 content=''.join(content).strip('\n \t') 14 item=BaiduaiItem() 15 item['title']=title 16 item['content']=content 17 18 yield
item
爬蟲程式碼

管道程式碼

 1 from aip import AipNlp
 2 
 3 """ 你的 APPID AK SK """
 4 APP_ID = '15198150'
 5 API_KEY = 'jaObSr6rmSmqsjWfKGGpmwxB'
 6 SECRET_KEY = '808Eiz4FPkfMwS2ajClXYhKrcFMN1YUN'
 7 
 8 client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
 9 
10 class BaiduaiPipeline(object):
11     keys=[]
12 def process_item(self, item, spider): 13 title=item['title'].replace('\xa0','') 14 content=item['content'].replace('\xa0','') 15 keys_dict=client.keyword(title,content) 16 for dic in keys_dict['items']: 17 self.keys.append(dic['tag']) 18 19 20 keys="/".join(self.keys) 21 typec_dic=client.topic(title,content) 22 news_type=typec_dic['item']['lv1_tag_list'][0]['tag'] 23 24 with open('./xinwen.html','w',encoding='utf-8')as fp: 25 fp.write(title+'\n\n'+content+'\n\n'+keys+'\n\n'+news_type) 26 return item
管道