1. 程式人生 > spark原始碼閱讀筆記Dataset(三)StructField、StructType、schema

spark原始碼閱讀筆記Dataset(三)StructField、StructType、schema

StructType(fields: Seq[StructField])
一個StructType物件可以包含多個StructField,並且可以用名字(name)來提取欄位,就相當於Map可以用key來提取value,只是StructType提取出來的是整個欄位的資訊(StructField)。在原始碼中StructType是一個case class,如下:
case class StructType(fields: Array[StructField]) extends DataType with Seq[StructField] {}
它繼承自Seq,也就是說Seq的操作它都擁有;只是從形式上來說,每個元素都是一個StructField。
package Dataset

import org.apache.spark.sql.types._


/**
  * Created by root on 9/21/16.
  *
  * Runnable walkthrough of `StructType` / `StructField`: building a schema,
  * extracting fields by name, and applying `Seq` operations to a `StructType`.
  * Every demo prints its results to stdout; the expected output recorded by the
  * original author is kept in the comments next to each call.
  */
object schemaAnalysis {
  //--------------------------------------------------StructType analysis---------------------------------------

  /**
    * Sample schema shared by the demos below.
    * Field `a` omits the nullable flag; `b` and `c` are declared non-nullable.
    */
  val struct: StructType = StructType(
    StructField("a", IntegerType) ::
      StructField("b", LongType, nullable = false) ::
      StructField("c", BooleanType, nullable = false) :: Nil)

  /**
    * Builds a schema incrementally with `add(name, typeName)` and prints each field.
    *
    * Expected output:
    * StructField(a,IntegerType,true)
    * StructField(b,StringType,true)
    */
  def schema_StructType(): Unit = {
    val schemaTyped = new StructType()
      .add("a", "int")
      .add("b", "string")
    schemaTyped.foreach(println)
  }

  /**
    * Demonstrates extracting fields from `struct` by name: a single field via
    * `struct(name)` and a sub-schema via `struct(Set(names))`.
    */
  def structType_extracted(): Unit = {

    // Extract a single StructField.
    val singleField_a = struct("a")
    println(singleField_a)
    // The nullable flag was omitted for "a", and it prints as nullable:
    //StructField(a,IntegerType,true)
    val singleField_b = struct("b")
    println(singleField_b)
    //StructField(b,LongType,false)

    // Looking up a name that is not part of the schema fails:
    //val nonExisting = struct("d")
    //println(nonExisting)
    //java.lang.IllegalArgumentException: Field "d" does not exist.

    // Extract multiple StructFields. Field names are provided in a set.
    // A StructType object will be returned.
    val twoFields = struct(Set("b", "c"))
    println(twoFields)
    //StructType(StructField(b,LongType,false), StructField(c,BooleanType,false))

    // The documentation the original author consulted said that names without a
    // matching field are ignored by the Set-based lookup, but in the author's run
    // on the initial Spark 2.0 release struct(Set("b", "c", "d")) failed with
    // "Field d does not exist". To get the "ignore unknown names" behaviour
    // without aborting the demo, keep only the names the schema actually has.
    val requested = Set("b", "c", "d")
    val ignoreNonExisting = struct(requested.intersect(struct.fieldNames.toSet))
    println(ignoreNonExisting)
    // "d" is dropped, so this is equivalent to struct(Set("b", "c")):
    //StructType(StructField(b,LongType,false), StructField(c,BooleanType,false))

  }

  /**
    * Demonstrates collection-style operations on `struct`.
    *
    * Source: case class StructType(fields: Array[StructField]) extends DataType with Seq[StructField]
    * Because it extends Seq, the Seq operations are available on a StructType.
    * Seq API: http://www.scala-lang.org/api/current/#scala.collection.Seq
    */
  def structType_opration(): Unit = {

    val tmpStruct = StructType(StructField("d", IntegerType) :: Nil)
    // Collection-with-collection operation.
    println(struct ++ tmpStruct)
    // println(struct++:tmpStruct)
    //List(StructField(a,IntegerType,true), StructField(b,LongType,false), StructField(c,BooleanType,false), StructField(d,IntegerType,true))

    // Collection-with-element operation.
    println(struct :+ StructField("d", IntegerType))

    // add(...) can be used to append a field as well.
    println(struct.add("e", IntegerType))
    //StructType(StructField(a,IntegerType,true), StructField(b,LongType,false), StructField(c,BooleanType,false), StructField(e,IntegerType,true))

    // First element.
    println(struct.head)
    //StructField(a,IntegerType,true)

    // Last element.
    println(struct.last)
    //StructField(c,BooleanType,false)

    println(struct.apply("a"))
    //StructField(a,IntegerType,true)

    println(struct.treeString)

    /**
      * root
      *  |-- a: integer (nullable = true)
      *  |-- b: long (nullable = false)
      *  |-- c: boolean (nullable = false)
      */

    println(struct.contains(StructField("f", IntegerType)))
    //false

    println(struct.mkString)
    //StructField(a,IntegerType,true)StructField(b,LongType,false)StructField(c,BooleanType,false)

    println(struct.prettyJson)

    /**
      * {
      *   "type" : "struct",
      *   "fields" : [ {
      *     "name" : "a",
      *     "type" : "integer",
      *     "nullable" : true,
      *     "metadata" : { }
      *   }, {
      *     "name" : "b",
      *     "type" : "long",
      *     "nullable" : false,
      *     "metadata" : { }
      *   }, {
      *     "name" : "c",
      *     "type" : "boolean",
      *     "nullable" : false,
      *     "metadata" : { }
      *   } ]
      * }
      */
    // More operations are listed in the API docs:
    // http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.types.StructType
  }

  /** Entry point: runs the Seq-operations demo; the other demos are left commented out. */
  def main(args: Array[String]): Unit = {
    //schema_StructType()
    //structType_extracted()
    structType_opration()
  }
}
3、Schema