Spark 多个特征做 one-hot 编码，怎么做效率高点？

在 Spark 中对多个特征做 one-hot 编码：我有 50 多个特征需要做 one-hot 处理，怎么做效率高一点？

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

import  sc.implicits._
    // Build the input DataFrame: the label column ("loss") followed by the raw
    // categorical feature columns.
    val vectorData = dataRDD
      // Convert the enumerated label value to a Double.
      .map(x => (enum2Double("是否已流失", x._1), x._2(0), x._2(1), x._2(2), x._2(3)))
      .toDF("loss", "gender", "age", "grade", "region")

    // Categorical columns to one-hot encode. Everything below is derived from
    // this array, so supporting more features only means extending it.
    val stringColumns = Array("gender", "age", "grade", "region")

    // Stage 1: one StringIndexer per column (raw value -> category index).
    val index_transformers: Array[org.apache.spark.ml.PipelineStage] = stringColumns.map(
      cname => new StringIndexer()
        .setInputCol(cname)
        .setOutputCol(s"${cname}_index")
    )

    // The indexed column names are known up front, so there is no need to fit
    // and run a separate indexing pipeline just to discover them (that cost an
    // extra pass over the data for every indexer).
    val indexColumns = stringColumns.map(cname => s"${cname}_index")

    // Stage 2: one OneHotEncoder per indexed column (category index -> vector).
    val one_hot_encoders: Array[org.apache.spark.ml.PipelineStage] = indexColumns.map(
      cname => new OneHotEncoder()
        .setInputCol(cname)
        .setOutputCol(s"${cname}_vec")
    )
    val vectorColumns = indexColumns.map(cname => s"${cname}_vec")

    // Stage 3: concatenate all encoded vectors into a single "features" column.
    // This replaces the hand-written per-column `toArray ++ toArray` chain and
    // avoids forcing every one-hot vector into a dense array.
    val assembler = new org.apache.spark.ml.feature.VectorAssembler()
      .setInputCols(vectorColumns)
      .setOutputCol("features")

    val stages: Array[org.apache.spark.ml.PipelineStage] =
      (index_transformers ++ one_hot_encoders) :+ assembler
    val pipeline = new Pipeline().setStages(stages)
    val model = pipeline.fit(vectorData)

    // Map each row to a LabeledPoint. The feature values and their order are the
    // same as the manual concatenation, but the vector may now be sparse rather
    // than always dense.
    model.transform(vectorData)
      .select("loss", "features")
      .map(row =>
        ml.feature.LabeledPoint(
          row.apply(0).toString().toDouble,
          row.getAs[ml.linalg.Vector]("features")
        )
      )
来源：
http://blog.csdn.net/pan_haufei/article/details/72903667
祝成功