1. 程式人生 > >SparkML之特徵提取(一)主成分分析(PCA)


主成分分析(Principal Component Analysis,PCA),是將多個變數通過線性變換以選出較少個數重要變數的一種多元統計分析方法。




Spark 原始碼(mllib包)



Spark 原始碼(mllib包)

/**
 * A feature transformer that projects vectors to a low-dimensional space using PCA.
 *
 * @param k number of principal components
 */
class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
  require(k > 0,
    s"Number of principal components must be positive but got ${k}")

  /**
   * Computes a [[PCAModel]] that contains the principal components of the input vectors.
   *
   * @param sources source vectors
   * @return a [[PCAModel]] built from the `k` principal components and the explained
   *         variance returned by `RowMatrix.computePrincipalComponentsAndExplainedVariance`
   * @throws IllegalArgumentException if the first source vector has fewer than `k` elements
   */
  @Since("1.4.0")
  def fit(sources: RDD[Vector]): PCAModel = {
    // `first()` is an RDD action, so evaluate it once and reuse the result for both
    // the check and the error message instead of triggering it twice.
    val numFeatures = sources.first().size
    require(k <= numFeatures,
      s"source vector size $numFeatures must be no less than k=$k")

    val mat = new RowMatrix(sources)
    val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)
    val densePC = pc match {
      case dm: DenseMatrix =>
        dm
      case sm: SparseMatrix =>
        /* Convert a sparse matrix to dense.
         *
         * RowMatrix.computePrincipalComponents always returns a dense matrix.
         * The following code is a safeguard.
         */
        sm.toDense
      case m =>
        throw new IllegalArgumentException("Unsupported matrix format. Expected " +
          s"SparseMatrix or DenseMatrix. Instead got: ${m.getClass}")
    }
    val denseExplainedVariance = explainedVariance match {
      case dv: DenseVector => dv
      case sv: SparseVector => sv.toDense
    }
    new PCAModel(k, densePC, denseExplainedVariance)
  }

  /**
   * Java-friendly version of [[fit()]]
   */
  @Since("1.4.0")
  def fit(sources: JavaRDD[Vector]): PCAModel = fit(sources.rdd)
}

/**
 * Model fitted by [[PCA]] that can project vectors to a low-dimensional space using PCA.
 *
 * @param k number of principal components.
 * @param pc a principal components Matrix. Each column is one principal component.
 * @param explainedVariance the explained-variance vector produced alongside `pc` when
 *                          the model was fitted.
 */
@Since("1.4.0")
class PCAModel private[spark] (
    @Since("1.4.0") val k: Int,
    @Since("1.4.0") val pc: DenseMatrix,
    @Since("1.6.0") val explainedVariance: DenseVector) extends VectorTransformer {

  /**
   * Transform a vector by computed Principal Components.
   *
   * @param vector vector to be transformed.
   *               Vector must be the same length as the source vectors given to [[PCA.fit()]].
   * @return transformed vector. Vector will be of length k.
   * @throws IllegalArgumentException if `vector` is neither a DenseVector nor a SparseVector
   */
  @Since("1.4.0")
  override def transform(vector: Vector): Vector = {
    vector match {
      case dv: DenseVector =>
        pc.transpose.multiply(dv)
      case SparseVector(size, indices, values) =>
        /* SparseVector -> single row SparseMatrix */
        val sm = Matrices.sparse(size, 1, Array(0, indices.length), indices, values).transpose
        val projection = sm.multiply(pc)
        Vectors.dense(projection.values)
      case _ =>
        throw new IllegalArgumentException("Unsupported vector format. Expected " +
          s"SparseVector or DenseVector. Instead got: ${vector.getClass}")
    }
  }
}



import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Minimal driver that fits a 3-component PCA model on vectors read from a text file.
 *
 * NOTE(review): the original listing was truncated (the `map` lambda and the enclosing
 * braces were never closed). The parsing expression and the transform/print step below
 * are reconstructed from the imported-but-unused `Vectors` and the sample output row;
 * confirm against the original program.
 */
object myPCA {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCA example").setMaster("local")
    val sc = new SparkContext(conf)

    try {
      val data = sc.textFile("/root/application/upload/pca2.data")

      // Each input line is split on single spaces.
      // NOTE(review): assumes every token is a numeric feature value - confirm.
      val parseData = data.map { line =>
        val part = line.split(' ')
        Vectors.dense(part.map(_.toDouble))
      }

      val model = new PCA(3).fit(parseData)

      // NOTE(review): reconstructed step - project the input with the fitted model
      // and print each projected row.
      model.transform(parseData).foreach(println)

      // Sample output row given in the original post:
      //   [-198.49935555431662,61.7455925014451,-33.61561582724634]
    } finally {
      // Stop the context even if reading, fitting, or printing fails.
      sc.stop()
    }
  }
}



連結:http://pan.baidu.com/s/1dELByj3 密碼:wsnb