
Importing Data into HBase with the Java API

This post uses HBase to store a celebrity dataset made up of text profiles and photos.
The text profiles are scraped from Wikipedia with the Scrapy framework and saved in CSV format.
The photos are scraped from Baidu Images, 30 per person, each set saved in a folder named after the celebrity.
The post therefore covers three parts:
- a spider for the text
- a spider for the images
- importing the data into HBase

Scraping Wikipedia with Scrapy

First, create a new Scrapy project and define the item fields in items.py.
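The post does not reproduce items.py; the following is a minimal sketch, assuming only the three fields the spider below actually populates:

import scrapy

class CelebrityItem(scrapy.Item):
    # one item per section of a celebrity's wiki page
    name = scrapy.Field()   # celebrity name, later used as the HBase rowkey
    title = scrapy.Field()  # section heading
    info = scrapy.Field()   # section body text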
Then add the following to settings.py:

FEED_URI = u'file:///F:/pySpace/celebrity/info1.csv'
FEED_FORMAT = 'csv'

These settings tell Scrapy to export the scraped items in CSV format and where to write the file.

Add the following to main.py:

import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 only: avoid encoding errors with the Chinese text
from scrapy import cmdline
cmdline.execute("scrapy crawl celebrity".split())
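main.py is a Python 2 launcher script: cmdline.execute starts the celebrity spider exactly as if scrapy crawl celebrity had been typed on the command line, which makes the project easy to run from an IDE. The spider itself: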
<python>
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from celebrity.items import CelebrityItem
from scrapy.http import Request

# read the list of celebrity names to crawl
with open(r'F:\pySpace\celebrity\name_lists1.txt', 'r') as f:
    url_list = f.read()
url_list = url_list.split('\n')

class Celebrity(CrawlSpider):
    len_url = len(url_list)
    num = 1
    name = "celebrity"
    front_url = 'https://zh.wikipedia.org/wiki/'
    start_urls = [front_url + url_list[num].encode('utf-8')]

    def parse(self, response):
        item = CelebrityItem()
        selector = Selector(response)
        body = selector.xpath('//*[@id="mw-content-text"]')[0]
        Title = body.xpath('//span[@class="mw-headline"]/text()').extract()
        # collect section headings, skipping references/notes/external links
        titles = ['簡介']
        for i in range(len(Title)):
            if Title[i] != '參考文獻' and Title[i] != '註釋' and Title[i] != '外部連結' and Title[i] != '參考資料':
                titles.append(Title[i])
        # collect paragraph and list text from the article body
        Passage = selector.xpath('//*[@id="mw-content-text"]/p')
        all_info = []
        for eachPassage in Passage:
            info = ''.join(eachPassage.xpath('.//text()').extract())
            if info != '':
                all_info.append(info.strip())
        Ul_list = selector.xpath('//*[@id="mw-content-text"]/ul')
        for eachul in Ul_list:
            info = ''.join(eachul.xpath('.//text()').extract())
            if info != '' and info != '\n' and info != ' ':
                all_info.append(info)
        # emit one item per section heading
        k = 0
        epoch = len(all_info) / len(titles)
        if epoch > 0:
            for i in range(len(titles)):
                if i == len(titles) - 1:
                    item['name'] = url_list[self.num].encode('utf-8')
                    item['title'] = titles[i]
                    item['info'] = ''.join(all_info[k:])
                else:
                    item['name'] = url_list[self.num].encode('utf-8')
                    item['title'] = titles[i]
                    item['info'] = ''.join(all_info[k:k + epoch])
                    k = k + epoch
                yield item
        else:
            for j in range(len(all_info)):
                item['name'] = url_list[self.num].encode('utf-8')
                item['title'] = titles[j]
                item['info'] = all_info[j]
                yield item
        # variant without section headings
        # for j in range(len(all_info)):
        #     item['name'] = url_list[self.num].encode('utf-8')
        #     item['info'] = all_info[j]
        #     yield item
        print item['name']
        self.num = self.num + 1
        print self.num
        if self.num < self.len_url:
            nextUrl = self.front_url + url_list[self.num].encode('utf-8')
            yield Request(nextUrl, callback=self.parse)
</python>
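Note the crawl order: instead of listing every page in start_urls, parse() increments self.num after each page and yields a Request for the next name, so a single start URL is enough to walk the entire name list.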

Scraping the images

import urllib2
import re
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def img_spider(name_file):

    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
    headers = {'User-Agent':user_agent}

    with open(name_file) as f:
        name_list = [name.rstrip().decode('utf-8') for name in f.readlines()]

    for name in name_list:
        if not os.path.exists('F:/pySpace/celebrity/img_data/' + name):
            os.makedirs('F:/pySpace/celebrity/img_data/' + name)
            try:
                url = "http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=" + name.replace(' ','%20') + "&cg=girl&rn=60&pn=60"
                req = urllib2.Request(url, headers=headers)
                res = urllib2.urlopen(req)
                page = res.read()
                #print page
                img_srcs = re.findall('"objURL":"(.*?)"', page, re.S)
                print name,len(img_srcs)
            except:
                print name," error:"
                continue
            j = 1
            src_txt = ''

            for src in img_srcs:
                with open('F:/pySpace/celebrity/img_data/' + name + '/' + str(j)+'.jpg','wb') as p:
                    try:
                        print "downloading No.%d"%j
                        req = urllib2.Request(src, headers=headers)
                        img = urllib2.urlopen(req, timeout=3)  # use req so the User-Agent header is sent
                        p.write(img.read())
                    except:
                        print "No.%d error:" % j
                        continue
                src_txt = src_txt + src + '\n'
                if j==30:
                    break
                j = j+1
            # save the image src URLs to a txt file
            with open('F:/pySpace/celebrity/img_data/' + name + '/' + name + '.txt','wb') as p2:
                p2.write(src_txt)
                print "save %s txt done" % name


if __name__ == '__main__':
    name_file = "name_lists1.txt"
    img_spider(name_file)
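Two details worth noting: the if j == 30 check caps downloads at 30 images per person, matching the dataset description above, and because each .jpg is opened for writing before the download is attempted, a failed request leaves an empty file behind.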

Importing the data into HBase via the Java API

Create two tables in HBase: celebrity for the image data and celebrity_info for the text data, both using the celebrity's name as the rowkey. Under the single column family cf1, each image file name (for celebrity) or section title (for celebrity_info) becomes a column qualifier.

<java>
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import com.csvreader.CsvReader;
import java.nio.charset.Charset;
import java.io.*;
/**
 * Created by mxy on 2016/10/31.
 */
public class CelebrityDataBase {

    /* create a table, dropping it first if it already exists */
    public void createTable(String tablename)throws Exception{
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum","node4,node5,node6");
        HBaseAdmin admin = new HBaseAdmin(config);
        String table = tablename;

        if(admin.isTableAvailable(table)){
            admin.disableTable(table);
            admin.deleteTable(table);
        }
        HTableDescriptor t = new HTableDescriptor(table.getBytes());
        HColumnDescriptor cf1 = new HColumnDescriptor("cf1".getBytes());
        cf1.setMaxVersions(10);  // keep up to 10 versions per cell
        t.addFamily(cf1);
        admin.createTable(t);
        admin.close();
    }
    // insert the CSV text data: rowkey = name, qualifier = section title, value = section text
    public void putInfo()throws Exception{
        CsvReader r = new CsvReader("F://pySpace//celebrity//info.csv",',', Charset.forName("utf-8"));
        r.readHeaders();
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum","node4,node5,node6");
        HTable table = new HTable(config,"celebrity_info");
        while(r.readRecord()){

            System.out.println(r.get("name"));
//          String rowkey = r.get("name");
            Put put = new Put(r.get("name").getBytes());
            put.add("cf1".getBytes(),r.get("title").getBytes(),r.get("info").getBytes());
            table.put(put);

        }
        r.close();
        table.close();

    }

    // read an image back out of HBase and write it to disk
    public void getImage(String celebrity_name,String img_num)throws Exception{
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum","node4,node5,node6");
        HTable table = new HTable(config,"celebrity");
        Get get = new Get(celebrity_name.getBytes());
        Result res = table.get(get);
        Cell c1 = res.getColumnLatestCell("cf1".getBytes(),img_num.getBytes());
        File file = new File("D://"+celebrity_name+img_num); // path the retrieved image is written to
        FileOutputStream fos = new FileOutputStream(file);
        fos.write(CellUtil.cloneValue(c1)); // copy the image bytes out of the cell
        fos.flush();
        System.out.println(file.length());
        fos.close();
        table.close();
    }

    // read the text data back out
    public void getInfo(String name) throws Exception{
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum","node4,node5,node6");
        HTable table = new HTable(config,"celebrity_info");

        Get get = new Get(name.getBytes());
        Result result = table.get(get);
        for(Cell cell : result.rawCells()){
            System.out.println("rowKey:" + new String(CellUtil.cloneRow(cell))
                    + " cfName:" + new String(CellUtil.cloneFamily(cell))
                    + " qualifierName:" + new String(CellUtil.cloneQualifier(cell))
                    + " value:" + new String(CellUtil.cloneValue(cell)));
        }
        table.close();
    }

    // insert one image: rowkey = name, qualifier = image file name, value = the raw bytes
    public void putImage(String each_celebrity,String each_img)throws Exception{

        String str = null;
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum","node4,node5,node6");
        HTable table = new HTable(config,"celebrity");
        str = String.format("F://pySpace//celebrity//img_data//%s//%s",each_celebrity,each_img);
        File file = new File(str);
        int size = 0;
        size = (int)file.length();
        System.out.println(size);
        byte[] bbb = new byte[size];
        try {
            InputStream a = new FileInputStream(file);
            a.read(bbb);  // assumes a single read fills the buffer, which holds for small local files
            a.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        String rowkey = each_celebrity;
        Put put = new Put(rowkey.getBytes());
        put.add("cf1".getBytes(),each_img.getBytes(),bbb);
        table.put(put);
        table.close();

    }

    public static void main(String args[]){
        CelebrityDataBase pt = new CelebrityDataBase();
        try {
            pt.createTable("celebrity");
            pt.createTable("celebrity_info");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("createTable error");
        }
        String root_path = "F://pySpace//celebrity//img_data";
        File file = new File(root_path);
        File[] files = file.listFiles();

        for(int i = 0;i < files.length;i++){
            String each_path = root_path +"//"+ files[i].getName();
            File celebrity_file = new File(each_path);
            File[] celebrity_files = celebrity_file.listFiles();
            System.out.println(each_path);
            // length - 1 skips the last directory entry, assumed to be the <name>.txt file of src URLs
            for(int j = 0;j<celebrity_files.length - 1;j++){
                try {
                    pt.putImage(files[i].getName(),celebrity_files[j].getName());
                } catch (Exception e) {
                    e.printStackTrace();
                    System.out.println("putImage error");
                }
            }

        }
        // import the text data
        try {
            pt.putInfo();
        } catch (Exception e) {
            e.printStackTrace();
        }

        // read an image back out
        try {
            pt.getImage("龔照勝","13.jpg");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("getImage error");
        }
        // read text back out
        try {
            pt.getInfo("成龍");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

</java>
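main() drives the whole pipeline: it (re)creates both tables, walks every per-celebrity folder under img_data to insert the images, loads the CSV of text profiles into celebrity_info, and finally spot-checks the result by reading back one image and one celebrity's text.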