1. 程式人生 > >java 讀寫Parquet格式的數據 Parquet example

java 讀寫Parquet格式的數據 Parquet example

readline byte ble print 三種 每一個 sta cep edr

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.GroupFactory;
import org.apache.parquet.example.data.simple.SimpleGroupFactory; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.ParquetReader.Builder; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; public class ReadParquet { static Logger logger=Logger.getLogger(ReadParquet.class); public static void main(String[] args) throws Exception { // parquetWriter("test\\parquet-out2","input.txt");
parquetReaderV2("test\\parquet-out2"); } static void parquetReaderV2(String inPath) throws Exception{ GroupReadSupport readSupport = new GroupReadSupport(); Builder<Group> reader= ParquetReader.builder(readSupport, new Path(inPath)); ParquetReader<Group> build=reader.build(); Group line=null; while((line=build.read())!=null){ System.out.println(line.toString()); } System.out.println("讀取結束"); } //新版本中new ParquetReader()所有構造方法好像都棄用了,用上面的builder去構造對象 static void parquetReader(String inPath) throws Exception{ GroupReadSupport readSupport = new GroupReadSupport(); ParquetReader<Group> reader = new ParquetReader<Group>(new Path(inPath),readSupport); Group line=null; while((line=reader.read())!=null){ System.out.println(line.toString()); } System.out.println("讀取結束"); } /** * * @param outPath  輸出Parquet格式 * @param inPath 輸入普通文本文件 * @throws IOException */ static void parquetWriter(String outPath,String inPath) throws IOException{ MessageType schema = MessageTypeParser.parseMessageType("message Pair {\n" + " required binary city (UTF8);\n" + " required binary ip (UTF8);\n" + " repeated group time {\n"+   " required int32 ttl;\n"+    " required binary ttl2;\n"+ "}\n"+ "}"); GroupFactory factory = new SimpleGroupFactory(schema); Path path = new Path(outPath); Configuration configuration = new Configuration(); GroupWriteSupport writeSupport = new GroupWriteSupport(); writeSupport.setSchema(schema,configuration); ParquetWriter<Group> writer = new ParquetWriter<Group>(path,configuration,writeSupport);
    //把本地文件讀取進去,用來生成parquet格式文件 BufferedReader br
=new BufferedReader(new FileReader(new File(inPath))); String line=""; Random r=new Random(); while((line=br.readLine())!=null){ String[] strs=line.split("\\s+"); if(strs.length==2) { Group group = factory.newGroup() .append("city",strs[0]) .append("ip",strs[1]); Group tmpG =group.addGroup("time"); tmpG.append("ttl", r.nextInt(9)+1); tmpG.append("ttl2", r.nextInt(9)+"_a"); writer.write(group); } } System.out.println("write end"); writer.close(); } }
說下schema(寫Parquet格式數據需要schema,讀取的話"自動識別"了schema)
/*
 * 每一個字段有三個屬性:重復數、數據類型和字段名,重復數可以是以下三種:
 *         required(出現1次)
 *         repeated(出現0次或多次)
 *         optional(出現0次或1次)
 * 每一個字段的數據類型可以分成兩種:
 *         group(復雜類型)
 *         primitive(基本類型)
* 數據類型有
* INT64, INT32, BOOLEAN, BINARY, FLOAT, DOUBLE, INT96, FIXED_LEN_BYTE_ARRAY
*/

maven依賴(我用的1.7)
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-hadoop</artifactId>
    <version>1.7.0</version>
</dependency>

java 讀寫Parquet格式的數據 Parquet example