时间: 未填写 | 分类: Spark 学习
import org.apache.spark.sql.SparkSession
object T {
  /**
   * Demo: split a list-valued column into one column per list index.
   *
   * Reads a small in-memory dataset of (group key, list of ints), explodes
   * each list into (key, index, value) rows, then pivots the index back into
   * columns, summing values where a key occurs more than once ("A" here).
   */
  def main(args: Array[String]): Unit = {
    // Local session for the demo; master is set via config for portability.
    val spark = SparkSession
      .builder()
      .appName("read File")
      .config("spark.master", "local")
      .getOrCreate()
    // Silence INFO/WARN noise so the show() tables are readable.
    spark.sparkContext.setLogLevel("ERROR")

    import spark.implicits._ // enables toDF and tuple encoders

    // Sample input: group key -> fixed-length list of values.
    // Note the duplicate key "A": its rows are summed by the pivot below.
    val dataList: List[(String, List[Int])] = List(
      ("A", List(10, 3, 18, 7, 4)),
      ("B", List(4, 4, 14, 6, 4)),
      ("C", List(15, 1, 12, 1, 4)),
      ("D", List(15, 5, 18, 6, 5)),
      ("E", List(1, 2, 17, 6, 3)),
      ("F", List(2, 2, 17, 5, 5)),
      ("A", List(2, 2, 17, 5, 5)))

    val data = dataList.toDF("gbNo", "fq")

    // Explode each list into (key, index, value) rows, then pivot the index
    // into columns. sum("_3") aggregates duplicate keys per index.
    val handle = data
      .flatMap { r =>
        val gbNo = r.getAs[String]("gbNo")
        val list = r.getAs[Seq[Int]]("fq") // no .toList needed: Seq supports indices/apply
        list.indices.map(i => (gbNo, i, list(i)))
      }
      .groupBy("_1")
      .pivot("_2")
      .sum("_3")

    data.show()
    handle.show()

    // Release the session's resources (was missing in the original).
    spark.stop()
  }
}
示例：将 list 类型的字段按下标拆分为多列（运行结果如下）
+----+-----------------+
|gbNo| fq|
+----+-----------------+
| A|[10, 3, 18, 7, 4]|
| B| [4, 4, 14, 6, 4]|
| C|[15, 1, 12, 1, 4]|
| D|[15, 5, 18, 6, 5]|
| E| [1, 2, 17, 6, 3]|
| F| [2, 2, 17, 5, 5]|
| A| [2, 2, 17, 5, 5]|
+----+-----------------+
+---+---+---+---+---+---+
| _1| 0| 1| 2| 3| 4|
+---+---+---+---+---+---+
| F| 2| 2| 17| 5| 5|
| E| 1| 2| 17| 6| 3|
| B| 4| 4| 14| 6| 4|
| D| 15| 5| 18| 6| 5|
| C| 15| 1| 12| 1| 4|
| A| 12| 5| 35| 12| 9|
+---+---+---+---+---+---+