JindoSDK usage guide: https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/user/4.x/4.5.x/4.5.1/oss/flink/jindosdk_on_flink.md
Writing in Parquet format failed with the following error:
org.apache.avro.AvroRuntimeException: No field named json in: class
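For context, the Parquet attempt used an Avro reflect-based bulk writer over the JsonText wrapper class (the same approach as the commented-out PaulParquetAvroWriters sink further down, but shown here with Flink's stock ParquetAvroWriters). This is only a minimal sketch of that path, assuming the flink-parquet dependency and the JsonText POJO sketched at the end of section 2; the path and sample element are placeholders:

import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;

public class ParquetReflectSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Bulk Parquet writer built from the Avro schema reflected off JsonText.
        // The AvroRuntimeException above is typically thrown when the reflected
        // schema expects a field (here "json") that it cannot resolve on the class.
        StreamingFileSink<JsonText> sink = StreamingFileSink
                .forBulkFormat(
                        new Path("oss://BUCKET/warehouse/detail"),   // placeholder path
                        ParquetAvroWriters.forReflectRecord(JsonText.class))
                .build();
        env.fromElements(new JsonText("{\"k\":\"v\"}")).addSink(sink);
        env.execute("parquet-reflect-sketch");
    }
}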
So the job writes ORC instead; the steps follow.
1. Add the flink-orc_2.12 dependency
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-orc_2.12</artifactId>
    <version>${flink.version}</version>
</dependency>
2. Code
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.core.fs.Path;
import org.apache.flink.orc.writer.OrcBulkWriterFactory;
import org.apache.flink.runtime.state.hashmap.HashMapStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
/**
 * Created by me on 2022/10/11 15:59
 */
@Slf4j
public class Kafka2JindoOss {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Unified stream/batch execution mode:
        // let the runtime decide based on the boundedness of the sources
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        // Restart strategy on failure
        env.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(4, 1000));
        // Checkpointing mode: exactly-once
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.enableCheckpointing(30000);
        // Checkpoint timeout
        env.getCheckpointConfig().setCheckpointTimeout(1200000);
        // Externalized checkpoint cleanup policy
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        env.setStateBackend(new HashMapStateBackend());
        // Load external configuration
        ParameterTool customParam = ConfigUtil.createParameterTool(args);
        // Register it as global job parameters
        env.getConfig().setGlobalJobParameters(customParam);
        String hdfsPrefix = customParam.get(Constants.HDFS_PATH);
        String jobName = customParam.get("jobName", "kafka2hdfs");
        List<String> topics = new ArrayList<>();
        List<String> timeColumns = new ArrayList<>();
        List<OutputTag<String>> outputTags = new ArrayList<>();
        SingleOutputStreamOperator<String> processStream = getKafkaStream(env, customParam, topics, timeColumns, outputTags);
        // Add a random suffix so part file names do not collide after a job restart;
        // otherwise the in-progress files cannot be promoted to finished files
        double random = Math.random();
        long l = Math.round(random * 100);
        // Prefix and suffix of the output part files
        OutputFileConfig fileConfig = OutputFileConfig
                .builder()
                .withPartPrefix("detail")
                .withPartSuffix("-" + l + ".orc")
                .build();
        for (int i = 0; i < outputTags.size(); i++) {
            String timeColumn = timeColumns.get(i);
            DataStream<String> sideOutput = processStream.getSideOutput(outputTags.get(i)).rebalance();
            String hdfsPath = hdfsPrefix + "/" + topics.get(i);
            SingleOutputStreamOperator<JsonText> jsonTextStream = sideOutput.map(x -> new JsonText(x));
            // StreamingFileSink sink = StreamingFileSink.forRowFormat(new Path(hdfsPath), new SimpleStringEncoder())
            //         // Custom bucket assigner: saves files under the table directory,
            //         // naming them the way Hive partition directories are named
            //         .withBucketAssigner(new EventTimeBucketAssigner(timeColumn, "dt"))
            //         .withRollingPolicy(FileRollingPolicy.build())
            //         //.withBucketCheckInterval(1000L)
            //         .withOutputFileConfig(fileConfig).build();
            String schema = "struct<_col0:string>";
            Properties writerProperties = new Properties();
            writerProperties.setProperty("orc.compress", "SNAPPY");
            OrcBulkWriterFactory<JsonText> orcBulkWriterFactory = new OrcBulkWriterFactory<>(
                    new MsgVectorizer(schema), writerProperties, new Configuration());
            StreamingFileSink<JsonText> sink = StreamingFileSink.forBulkFormat(
                            new Path(hdfsPath), orcBulkWriterFactory)
                    .withBucketAssigner(new BucketAssignerParquet(timeColumn))
                    .withOutputFileConfig(fileConfig)
                    .withRollingPolicy(FileRollingPolicy.build()).build();
            // StreamingFileSink<JsonText> sink = StreamingFileSink.forBulkFormat(
            //                 new Path(hdfsPath), PaulParquetAvroWriters.forReflectRecord(JsonText.class, compressionCodecName))
            //         .withBucketAssigner(new BucketAssignerParquet(timeColumn))
            //         .withOutputFileConfig(fileConfig)
            //         .withRollingPolicy(FileRollingPolicy.build()).build();
            jsonTextStream.addSink(sink)
                    .setParallelism(customParam.getInt(Constants.SINK_PARALLELISM, env.getParallelism()))
                    .name("ossSink-" + topics.get(i)).uid("ossSink-" + topics.get(i));
        }
        env.execute(jobName);
    }
    static SingleOutputStreamOperator<String> getKafkaStream(StreamExecutionEnvironment env, ParameterTool customParam, List<String> topics, List<String> timeColumns, List<OutputTag<String>> outputTags) throws MyException {
        List<TopicProperties> listTopic = MysqlSource.getTopics(customParam);
        // Walk through the topic metadata and create one side-output tag per topic
        Properties kafkaProperties = KafkaUtils.getKafkaProperties(Boolean.valueOf(customParam.get(Constants.isAuthentication)), listTopic.get(0), customParam);
        for (TopicProperties topicInfo : listTopic) {
            String topicName = topicInfo.getTopicName();
            String timeColumn = topicInfo.getTimeColumn();
            timeColumns.add(timeColumn);
            topics.add(topicName);
            OutputTag<String> outputTag = new OutputTag<String>(topicName) {
            };
            outputTags.add(outputTag);
        }
        FlinkKafkaConsumer<Tuple2<String, String>> kafkaConsumer = new FlinkKafkaConsumer<>(topics, new KafkaDeserializationTopicSchema(), kafkaProperties);
        // Start-offset strategy
        String offset = customParam.get(Constants.KAFKA_OFFSET, "");
        log.info("jobName:{}, offset:{}", customParam.get("jobName"), offset);
        if ("earliest".equalsIgnoreCase(offset)) {
            kafkaConsumer.setStartFromEarliest();
        } else if ("latest".equalsIgnoreCase(offset)) {
            kafkaConsumer.setStartFromLatest();
        } else if (Pattern.matches("[1-9]\\d{12}", offset)) {
            long ts = Long.parseLong(offset);
            kafkaConsumer.setStartFromTimestamp(ts);
            log.info("job {} starts from timestamp offset: {}", customParam.get("jobName"), offset);
        }
        DataStream<Tuple2<String, String>> kafkaSource = env.addSource(kafkaConsumer).name("kafkaSource").uid("kafkaSource").rebalance();
        SingleOutputStreamOperator<String> processStream = kafkaSource.process(new ProcessFunction<Tuple2<String, String>, String>() {
            @Override
            public void processElement(Tuple2<String, String> value, Context context, Collector<String> collector) {
                // topic name
                String topicName = value.f0;
                // message payload
                String payload = value.f1;
                for (int i = 0; i < topics.size(); i++) {
                    if (topicName.equals(topics.get(i))) {
                        context.output(outputTags.get(i), payload);
                    }
                }
            }
        }).name("outPutTagAddValue").uid("outPutTagAddValue");
        return processStream;
    }
}
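The listing above references two helper classes that are not shown in the post: JsonText, a wrapper around the raw Kafka message, and MsgVectorizer, the Vectorizer handed to OrcBulkWriterFactory (the other helpers such as BucketAssignerParquet and FileRollingPolicy are likewise project-specific). Below is a minimal sketch of what JsonText and MsgVectorizer might look like for the struct<_col0:string> schema; these are assumed implementations, each class in its own file:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.flink.orc.vector.Vectorizer;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

/** Thin wrapper around one raw JSON message pulled from Kafka (assumed shape). */
public class JsonText {
    private final String json;

    public JsonText(String json) {
        this.json = json;
    }

    public String getJson() {
        return json;
    }
}

/** Writes each JsonText into the single string column of "struct<_col0:string>". */
public class MsgVectorizer extends Vectorizer<JsonText> {

    public MsgVectorizer(String schema) {
        super(schema);
    }

    @Override
    public void vectorize(JsonText element, VectorizedRowBatch batch) throws IOException {
        BytesColumnVector col = (BytesColumnVector) batch.cols[0];
        int row = batch.size++;
        col.setVal(row, element.getJson().getBytes(StandardCharsets.UTF_8));
    }
}

BytesColumnVector and VectorizedRowBatch come from hive-storage-api, and the ORC writer itself from orc-core, which is why section 3 below puts those two jars into the Flink lib directory.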
3. Add the required jars
Put the following two jars into Flink's lib directory, otherwise you will get class-not-found errors:
orc-core-1.5.6.jar, hive-storage-api-2.6.0.jar
4. Connection configuration
Add the following to flink-conf.yaml:
# Configure and grant access according to how your OSS bucket is actually set up
classloader.resolve-order: parent-first
##fs.defaultFS: oss://BUCKET
fs.oss.endpoint: oss-cn-shanghai-internal.aliyuncs.com
fs.oss.accessKeyId: ******
fs.oss.accessKeySecret: *****
fs.AbstractFileSystem.oss.impl: com.aliyun.jindodata.oss.OSS
fs.oss.impl: com.aliyun.jindodata.oss.JindoOssFileSystem
5. Submit the job with flink run