spark 逐层拆解json嵌套的array

Spark解析JSON嵌套数组的详细步骤

最新推荐文章于 2025-07-28 00:00:00 发布

原创 · 最新推荐文章于 2025-07-28 00:00:00 发布 · 2.6k 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#spark #json

spark 专栏收录该内容

1 篇文章

订阅专栏

本文介绍了如何使用Spark来逐层拆解包含嵌套数组的JSON数据。首先，通过创建SQLContext读取JSON文件。接着，获取数据模式，并针对遇到的数组进行拆解，同时删除第一层的重复字段名。此外，还分别讲解了创建处理数组和结构体下探的函数，以深入解析JSON数据。

1、创建SQLContext，读取JSON文件。
2、获取schema并遍历第一层字段：遇到array就先拆解（explode），再向下层下探；遇到struct则直接向下层下探。全部拆完后，把第一层中已被展开的struct原字段删掉，避免字段重复。

            SQLContext sqlContext = new SQLContext(sc);

        // Let Spark infer the (possibly nested) schema from the JSON file.
        DataFrame df = sqlContext.read().json(FileFullName);

        // Walk the first level of the schema. `li` holds the path of the field
        // currently being visited; `cols` collects the dotted path of every
        // field that has been visited.
        for (StructField sf : df.schema().fields()) {
            this.li.add(sf.name());
            String sname = StringUtils.join(li.toArray(), "_");
            String typeName = sf.dataType().typeName();

            if ("array".equals(typeName)) {
                ArrayType at = (ArrayType) sf.dataType();
                // A null or empty array is replaced by a one-element array holding
                // a typed null before exploding, so that the record still yields
                // one output row (explode emits nothing for null/empty input).
                df = df.withColumn(sname, functions.explode(
                        functions.when(df.col(sname).isNull(),
                                        functions.array(functions.lit(null).cast(at.elementType())))
                                .when(functions.size(df.col(sname)).equalTo(0),
                                        functions.array(functions.lit(null).cast(at.elementType())))
                                .otherwise(df.col(sname))));
                // Descend into the element type of the exploded array.
                df = array_loop(at, df);
            } else if ("struct".equals(typeName)) {
                df = struct_loop((StructType) sf.dataType(), df);
                // The nested fields were projected into their own columns by
                // struct_loop, so the original struct column is removed.
                df = df.drop(sname);
            }

            cols.add(StringUtils.join(li.toArray(), "."));
            // Pop the segment pushed above by position rather than by value, so a
            // repeated field name elsewhere in the path can never be removed instead.
            // NOTE(review): assumes `li` is declared as a java.util.List — confirm.
            this.li.remove(li.size() - 1);
        }

3、创建array下探的函数。

/**
 * Descends into the element type of an array column.
 *
 * <p>When the elements are structs, their fields are handed to
 * {@code struct_loop}; for any other element type there is nothing further
 * to unpack and the frame is returned untouched.
 *
 * @param arr the array type whose element type is inspected
 * @param df  the frame being flattened
 * @return the frame after processing the element type
 */
public DataFrame array_loop(ArrayType arr,DataFrame df){
    boolean elementIsStruct = "struct".equals(arr.elementType().typeName());
    if (!elementIsStruct) {
        return df;
    }
    return struct_loop((StructType) arr.elementType(), df);
}

4、创建struct下探的函数。

 /**
  * Projects every field of a struct into its own top-level column.
  *
  * <p>For each field the current path (kept in {@code li}) is extended by the
  * field name; the nested value at the dotted path is selected into a column
  * named with the underscore-joined path. Array fields are additionally
  * exploded and their element type is handed to {@code array_loop}; struct
  * fields are processed recursively. The dotted path of every visited field
  * is appended to {@code cols}.
  *
  * @param st the struct type whose fields are processed
  * @param df the frame being flattened
  * @return the frame with one additional column per visited field
  */
 public DataFrame struct_loop(StructType st,DataFrame df){
     for (StructField sf : st.fields()) {
         this.li.add(sf.name());
         String sname = StringUtils.join(li.toArray(), "_");   // flat column name
         String sname1 = StringUtils.join(li.toArray(), ".");  // nested access path
         String typeName = sf.dataType().typeName();

         if ("array".equals(typeName)) {
             ArrayType at = (ArrayType) sf.dataType();
             // The flat column must exist before it can be referenced through
             // df.col(sname), so the nested array is projected first and then
             // exploded in place.
             df = df.selectExpr(sname1 + " as " + sname, "*");
             // A null or empty array is replaced by a one-element array holding
             // a typed null, so that the record still yields one output row.
             df = df.withColumn(sname, functions.explode(
                     functions.when(df.col(sname).isNull(),
                                     functions.array(functions.lit(null).cast(at.elementType())))
                             .when(functions.size(df.col(sname)).equalTo(0),
                                     functions.array(functions.lit(null).cast(at.elementType())))
                             .otherwise(df.col(sname))));
             // NOTE(review): fields below this array are still addressed through
             // the original dotted path rather than the exploded column — confirm
             // the result for arrays of structs nested inside a struct.
             df = array_loop(at, df);
         } else {
             if ("struct".equals(typeName)) {
                 df = struct_loop((StructType) sf.dataType(), df);
             }
             df = df.selectExpr(sname1 + " as " + sname, "*");
         }

         cols.add(sname1);
         // Pop the segment pushed above by position. Removing by value would
         // delete the first equal name in the path (e.g. "a" in a.b.a) and
         // corrupt the path used for the remaining sibling fields.
         // NOTE(review): assumes `li` is declared as a java.util.List — confirm.
         this.li.remove(li.size() - 1);
     }
     return df;
 }