1. 程式人生 > >hadoop mapreduce讀取orcfile的java程式碼示例

hadoop mapreduce讀取orcfile的java程式碼示例

orcfile自hive 0.11版本起提供支援。與rcfile相比,orcfile具有更高的資料壓縮比:即使不使用任何壓縮演算法、僅僅採用orcfile儲存格式,資料量大小也能縮小一半以上。

下面以hive 0.13版本為例,列舉了mapreduce讀取orcfile的java示例程式碼:

需要引入的包:hive-common-0.13.1.jar、hive-exec-0.13.1.jar

import java.io.IOException;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

main函式關鍵程式碼:

 public static void main(String[] args) throws IOException,
   URISyntaxException, InterruptedException, ClassNotFoundException {

             Configuration conf = new Configuration();

             Job job = new Job(conf);

              job.setInputFormatClass(OrcNewInputFormat.class);
             FileInputFormat.addInputPath(job, new Path(inputPath));
             job.setMapperClass(ExtractorMapper.class);
            System.exit(job.waitForCompletion(true) ? 0 : 1);

}

map實現函式關鍵程式碼:

 private static class ExtractorMapper extends
   Mapper {

  private static final String SCHEMA = "struct<column_name1:string,column_name2:string>"

  protected void map(
    NullWritable key,
    Writable value,
    Mapper.Context context)
    throws IOException, InterruptedException {
             OrcStruct struct = (OrcStruct)value;
             TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(SCHEMA);

             StructObjectInspector inspector = (StructObjectInspector)
                    OrcStruct.createObjectInspector(typeInfo);

             StringBuffer outputKey = new StringBuffer();
             outputKey.append(inspector.getStructFieldData(struct, inspector.getStructFieldRef("column_name1")).toString());
             outputKey.append(TAB);
             outputKey.append(inspector.getStructFieldData(struct, inspector.getStructFieldRef("column_name2")).toString());
             System.out.println(outputKey.toString());

}