Background: expanding a one-column Dataset to 25 columns raised the error shown above, i.e. the tuple exceeded the maximum number of elements it allows by default.
/*
 * Method 1: the straightforward conversion — add the Dataset's columns directly inside the map
 * function by returning a tuple with one element per column.
 *
 * Result: fails (tested on Scala 2.11; 2.12 and later were not tried)
 * */
def fun1(ds1: Dataset[String], spark: SparkSession): Unit = {
  import spark.implicits._
  ds1.select("value").map { value =>
    val values = value.toString().split("#")
    // Return a tuple: the result has as many columns as the tuple has elements, at most 22.
    // Experiments show that returning more than 22 columns fails at compile time with:
    //   too many elements for tuple: 25, allowed: 22
    (values(0), values(1), values(2), values(3), values(4), values(5), values(6), values(7), values(8),
      values(9), values(10), values(11), values(12), values(13), values(14), values(15), values(16),
      values(17), values(18), values(19), values(20), values(21),
      values(22), values(23), values(24)) // the 25-field case
  }.show()
}
// The compile error:
Error:(121, 9) too many elements for tuple: 25, allowed: 22
(values(0), values(1), values(2), values(3), values(4), values(5), values(6), values(7), values(8),
^
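For context, the limit comes from the Scala standard library itself: it only defines tuple classes up to Tuple22, so a tuple literal with more elements is rejected at compile time regardless of Spark. A standalone sketch (not Spark-specific, names made up) illustrating the cap:

object TupleLimitDemo extends App {
  // 22 elements is the largest tuple literal the compiler accepts (a Tuple22).
  val t22 = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)
  println(t22.productArity) // prints 22

  // Adding a 23rd element would not compile; the compiler reports:
  //   too many elements for tuple: 23, allowed: 22
}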
As shown above, the map used a tuple to expand the original columns but exceeded the default tuple size limit. The following two approaches work around it:
- have a case class extend Product and use it to add the columns to the Dataset, or
- pass the data as an array and read each column back by index.
The code for each approach follows:
1: Expanding the columns by extending Product
/*
 * Method 2: use a case class that extends Product to add the columns to the Dataset.
 * Result: fails with
 *   not enough arguments for method map: (implicit evidence...
 * */
def fun2(ds1: Dataset[String], spark: SparkSession): Unit = {
  val sqlContext = spark.sqlContext
  import spark.implicits._
  //import sqlContext.implicits._
  println("Method 2")
  ds1.map { value =>
    val values = value.split("#")
    val pre = new PreTreatBean(values(0), values(1), values(2), values(3), values(4), values(5), values(6), values(7), values(8),
      values(9), values(10), values(11), values(12), values(13), values(14), values(15), values(16),
      values(17), values(18), values(19), values(20), values(21), values(22), values(23), values(24))
    pre
  }.show()
}
// Define the case class
case class PreTreatBean(
  A: String, B: String, C: String, D: String, E: String,
  F: String, G: String, H: String, I: String, J: String,
  K: String, L: String, M: String, N: String, O: String,
  P: String, Q: String, R: String, S: String, T: String,
  U: String, V: String, W: String, X: String, Y: String
) extends Product {
  override def productArity = 25 // total number of fields
  override def canEqual(that: Any) = that.isInstanceOf[PreTreatBean]
  override def productElement(n: Int): Any = n match {
    case 0  => A
    case 1  => B
    case 2  => C
    case 3  => D
    case 4  => E
    case 5  => F
    case 6  => G
    case 7  => H
    case 8  => I
    case 9  => J
    case 10 => K
    case 11 => L
    case 12 => M
    case 13 => N
    case 14 => O
    case 15 => P
    case 16 => Q
    case 17 => R
    case 18 => S
    case 19 => T
    case 20 => U
    case 21 => V
    case 22 => W
    case 23 => X
    case 24 => Y
  }
}
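For comparison: since Scala 2.11 the 22-field limit no longer applies to case classes (only tuples remain capped at 22), so a plain case class without hand-written Product overrides is often enough for Spark to derive an Encoder, provided the class is defined at the top level (not inside a method) and spark.implicits._ is in scope at the map call; a missing-encoder error like the one reported above is commonly caused by violating one of those two conditions. A minimal sketch along those lines — the class and field names here are made up for illustration:

import org.apache.spark.sql.{Dataset, SparkSession}

// Hypothetical 25-field case class; no manual productArity/productElement overrides needed.
case class WideRecord(
  c0: String, c1: String, c2: String, c3: String, c4: String,
  c5: String, c6: String, c7: String, c8: String, c9: String,
  c10: String, c11: String, c12: String, c13: String, c14: String,
  c15: String, c16: String, c17: String, c18: String, c19: String,
  c20: String, c21: String, c22: String, c23: String, c24: String)

def toWideRecords(ds1: Dataset[String], spark: SparkSession): Dataset[WideRecord] = {
  import spark.implicits._ // brings the Encoder[WideRecord] derivation into scope
  ds1.map { line =>
    val v = line.split("#")
    WideRecord(v(0), v(1), v(2), v(3), v(4), v(5), v(6), v(7), v(8), v(9),
      v(10), v(11), v(12), v(13), v(14), v(15), v(16), v(17), v(18), v(19),
      v(20), v(21), v(22), v(23), v(24))
  }
}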
2: Expanding the columns with an array
/*
 * Method 3: pass the data as an array, then read each element back by index.
 * Result: produces the 25-column table as expected.
 * */
def fun3(ds1: Dataset[String], spark: SparkSession): Unit = {
  import spark.implicits._
  import org.apache.spark.sql.functions._
  val ds2 = ds1.select("value").map { value =>
    val values = value.toString().split("#")
    values
  }
  println("ds2+++++++++")
  ds2.show()
  println("============")
  ds2.select(
    col("value")(0) as "s0",
    col("value")(1) as "s1",
    col("value")(2) as "s2",
    col("value")(3) as "s3",
    col("value")(4) as "s4",
    col("value")(5) as "s5",
    col("value")(6) as "s6",
    col("value")(7) as "s7",
    col("value")(8) as "s8",
    col("value")(9) as "s9",
    col("value")(10) as "s10",
    col("value")(11) as "s11",
    col("value")(12) as "s12",
    col("value")(13) as "s13",
    col("value")(14) as "s14",
    col("value")(15) as "s15",
    col("value")(16) as "s16",
    col("value")(17) as "s17",
    col("value")(18) as "s18",
    col("value")(19) as "s19",
    col("value")(20) as "s20",
    col("value")(21) as "s21",
    col("value")(22) as "s22",
    col("value")(23) as "s23",
    col("value")(24) as "s24"
  ).select("s0")
    .show()
}
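If hand-writing 25 aliases feels error-prone, the same select can be built programmatically. A small sketch under the same assumptions (the mapped Dataset's single array column is named value; the helper name is made up):

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.col

// Hypothetical helper: turn the single array<string> column "value"
// into n string columns named s0 .. s(n-1).
def expandArrayColumn(ds2: Dataset[Array[String]], n: Int): DataFrame = {
  val cols = (0 until n).map(i => col("value")(i).as(s"s$i"))
  ds2.select(cols: _*)
}

// Usage: expandArrayColumn(ds2, 25).show()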