JindoSDK usage guide: https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/user/4.x/4.5.x/4.5.1/oss/flink/jindosdk_on_flink.md
Writing in Parquet format failed with the following error:
org.apache.avro.AvroRuntimeException: No field named json in: class
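For context, the Parquet attempt used an Avro reflect-based bulk writer over the JsonText wrapper class (the same approach as the commented-out PaulParquetAvroWriters sink further down, but shown here with Flink's stock ParquetAvroWriters). This is only a minimal sketch of that path, assuming the flink-parquet dependency and the JsonText POJO sketched at the end of section 2; the path and sample element are placeholders:

import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;

public class ParquetReflectSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Bulk Parquet writer built from the Avro schema reflected off JsonText.
        // The AvroRuntimeException above is typically thrown when the reflected
        // schema expects a field (here "json") that it cannot resolve on the class.
        StreamingFileSink<JsonText> sink = StreamingFileSink
                .forBulkFormat(
                        new Path("oss://BUCKET/warehouse/detail"),   // placeholder path
                        ParquetAvroWriters.forReflectRecord(JsonText.class))
                .build();
        env.fromElements(new JsonText("{\"k\":\"v\"}")).addSink(sink);
        env.execute("parquet-reflect-sketch");
    }
}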
So the job writes ORC instead; the steps follow.
1. Add the flink-orc_2.12 dependency
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-orc_2.12</artifactId>
    <version>${flink.version}</version>
</dependency>
2. Code
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.core.fs.Path;
import org.apache.flink.orc.writer.OrcBulkWriterFactory;
import org.apache.flink.runtime.state.hashmap.HashMapStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
/**
 * Created by me on 2022/10/11 15:59
 */
@Slf4j
public class Kafka2JindoOss {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Unified stream/batch execution mode:
        // let the runtime decide based on the boundedness of the sources
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        // Restart strategy on failure
        env.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(4, 1000));
        // Checkpointing mode: exactly-once
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.enableCheckpointing(30000);
        // Checkpoint timeout
        env.getCheckpointConfig().setCheckpointTimeout(1200000);
        // Externalized checkpoint cleanup policy
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        env.setStateBackend(new HashMapStateBackend());
        // Load external configuration
        ParameterTool customParam = ConfigUtil.createParameterTool(args);
        // Register it as global job parameters
        env.getConfig().setGlobalJobParameters(customParam);
        String hdfsPrefix = customParam.get(Constants.HDFS_PATH);
        String jobName = customParam.get("jobName", "kafka2hdfs");
        List<String> topics = new ArrayList<>();
        List<String> timeColumns = new ArrayList<>();
        List<OutputTag<String>> outputTags = new ArrayList<>();
        SingleOutputStreamOperator<String> processStream = getKafkaStream(env, customParam, topics, timeColumns, outputTags);
        // Add a random suffix so part file names do not collide after a job restart;
        // otherwise the in-progress files cannot be promoted to finished files
        double random = Math.random();
        long l = Math.round(random * 100);
        // Prefix and suffix of the output part files
        OutputFileConfig fileConfig = OutputFileConfig
                .builder()
                .withPartPrefix("detail")
                .withPartSuffix("-" + l + ".orc")
                .build();
        for (int i = 0; i < outputTags.size(); i++) {
            String timeColumn = timeColumns.get(i);
            DataStream<String> sideOutput = processStream.getSideOutput(outputTags.get(i)).rebalance();
            String hdfsPath = hdfsPrefix + "/" + topics.get(i);
            SingleOutputStreamOperator<JsonText> jsonTextStream = sideOutput.map(x -> new JsonText(x));
            // StreamingFileSink sink = StreamingFileSink.forRowFormat(new Path(hdfsPath), new SimpleStringEncoder())
            //         // Custom bucket assigner: saves files under the table directory,
            //         // naming them the way Hive partition directories are named
            //         .withBucketAssigner(new EventTimeBucketAssigner(timeColumn, "dt"))
            //         .withRollingPolicy(FileRollingPolicy.build())
            //         //.withBucketCheckInterval(1000L)
            //         .withOutputFileConfig(fileConfig).build();
            String schema = "struct<_col0:string>";
            Properties writerProperties = new Properties();
            writerProperties.setProperty("orc.compress", "SNAPPY");
            OrcBulkWriterFactory<JsonText> orcBulkWriterFactory = new OrcBulkWriterFactory<>(
                    new MsgVectorizer(schema), writerProperties, new Configuration());
            StreamingFileSink<JsonText> sink = StreamingFileSink.forBulkFormat(
                            new Path(hdfsPath), orcBulkWriterFactory)
                    .withBucketAssigner(new BucketAssignerParquet(timeColumn))
                    .withOutputFileConfig(fileConfig)
                    .withRollingPolicy(FileRollingPolicy.build()).build();
            // StreamingFileSink<JsonText> sink = StreamingFileSink.forBulkFormat(
            //                 new Path(hdfsPath), PaulParquetAvroWriters.forReflectRecord(JsonText.class, compressionCodecName))
            //         .withBucketAssigner(new BucketAssignerParquet(timeColumn))
            //         .withOutputFileConfig(fileConfig)
            //         .withRollingPolicy(FileRollingPolicy.build()).build();
            jsonTextStream.addSink(sink)
                    .setParallelism(customParam.getInt(Constants.SINK_PARALLELISM, env.getParallelism()))
                    .name("ossSink-" + topics.get(i)).uid("ossSink-" + topics.get(i));
        }
        env.execute(jobName);
    }
    static SingleOutputStreamOperator<String> getKafkaStream(StreamExecutionEnvironment env, ParameterTool customParam, List<String> topics, List<String> timeColumns, List<OutputTag<String>> outputTags) throws MyException {
        List<TopicProperties> listTopic = MysqlSource.getTopics(customParam);
        // Walk through the topic metadata and create one side-output tag per topic
        Properties kafkaProperties = KafkaUtils.getKafkaProperties(Boolean.valueOf(customParam.get(Constants.isAuthentication)), listTopic.get(0), customParam);
        for (TopicProperties topicInfo : listTopic) {
            String topicName = topicInfo.getTopicName();
            String timeColumn = topicInfo.getTimeColumn();
            timeColumns.add(timeColumn);
            topics.add(topicName);
            OutputTag<String> outputTag = new OutputTag<String>(topicName) {
            };
            outputTags.add(outputTag);
        }
        FlinkKafkaConsumer<Tuple2<String, String>> kafkaConsumer = new FlinkKafkaConsumer<>(topics, new KafkaDeserializationTopicSchema(), kafkaProperties);
        // Start-offset strategy
        String offset = customParam.get(Constants.KAFKA_OFFSET, "");
        log.info("jobName:{}, offset:{}", customParam.get("jobName"), offset);
        if ("earliest".equalsIgnoreCase(offset)) {
            kafkaConsumer.setStartFromEarliest();
        } else if ("latest".equalsIgnoreCase(offset)) {
            kafkaConsumer.setStartFromLatest();
        } else if (Pattern.matches("[1-9]\\d{12}", offset)) {
            long ts = Long.parseLong(offset);
            kafkaConsumer.setStartFromTimestamp(ts);
            log.info("job {} starts from timestamp offset: {}", customParam.get("jobName"), offset);
        }
        DataStream<Tuple2<String, String>> kafkaSource = env.addSource(kafkaConsumer).name("kafkaSource").uid("kafkaSource").rebalance();
        SingleOutputStreamOperator<String> processStream = kafkaSource.process(new ProcessFunction<Tuple2<String, String>, String>() {
            @Override
            public void processElement(Tuple2<String, String> value, Context context, Collector<String> collector) {
                // topic name
                String topicName = value.f0;
                // message payload
                String payload = value.f1;
                for (int i = 0; i < topics.size(); i++) {
                    if (topicName.equals(topics.get(i))) {
                        context.output(outputTags.get(i), payload);
                    }
                }
            }
        }).name("outPutTagAddValue").uid("outPutTagAddValue");
        return processStream;
    }
}
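The listing above references two helper classes that are not shown in the post: JsonText, a wrapper around the raw Kafka message, and MsgVectorizer, the Vectorizer handed to OrcBulkWriterFactory (the other helpers such as BucketAssignerParquet and FileRollingPolicy are likewise project-specific). Below is a minimal sketch of what JsonText and MsgVectorizer might look like for the struct<_col0:string> schema; these are assumed implementations, each class in its own file:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.flink.orc.vector.Vectorizer;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

/** Thin wrapper around one raw JSON message pulled from Kafka (assumed shape). */
public class JsonText {
    private final String json;

    public JsonText(String json) {
        this.json = json;
    }

    public String getJson() {
        return json;
    }
}

/** Writes each JsonText into the single string column of "struct<_col0:string>". */
public class MsgVectorizer extends Vectorizer<JsonText> {

    public MsgVectorizer(String schema) {
        super(schema);
    }

    @Override
    public void vectorize(JsonText element, VectorizedRowBatch batch) throws IOException {
        BytesColumnVector col = (BytesColumnVector) batch.cols[0];
        int row = batch.size++;
        col.setVal(row, element.getJson().getBytes(StandardCharsets.UTF_8));
    }
}

BytesColumnVector and VectorizedRowBatch come from hive-storage-api, and the ORC writer itself from orc-core, which is why section 3 below puts those two jars into the Flink lib directory.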
3. Add the required jars
Put the following two jars into Flink's lib directory, otherwise you will get class-not-found errors:
orc-core-1.5.6.jar, hive-storage-api-2.6.0.jar
4. Connection configuration
Add the following to flink-conf.yaml:
# Configure and grant access according to how your OSS bucket is actually set up
classloader.resolve-order: parent-first
##fs.defaultFS: oss://BUCKET
fs.oss.endpoint: oss-cn-shanghai-internal.aliyuncs.com
fs.oss.accessKeyId: ******
fs.oss.accessKeySecret: *****
fs.AbstractFileSystem.oss.impl: com.aliyun.jindodata.oss.OSS
fs.oss.impl: com.aliyun.jindodata.oss.JindoOssFileSystem
5. Submit the job with flink run