Flink Data Sinks and Custom Data Sinks

Data Sink

Data sinks consume the records of a DataStream and write them out to external systems, for example files, sockets, NoSQL stores, relational databases, or message queues. Flink ships with a number of predefined sinks, and users can also implement their own by extending SinkFunction or RichSinkFunction to meet specific needs.
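
As a minimal sketch of the custom option (nothing here beyond the standard DataStream API; the class name is just an example), a sink only has to override invoke(), which is called once per record:

import org.apache.flink.streaming.api.functions.sink.SinkFunction

// Simplest possible custom sink: print every record it receives
class PrintlnSinkFunction extends SinkFunction[String] {
  override def invoke(value: String): Unit = {
    println(s"sink received: $value")
  }
}

It is attached like any predefined sink: dataStream.addSink(new PrintlnSinkFunction). The RichSinkFunction variant shown later additionally gets open()/close() lifecycle hooks.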

File Based (for testing)

  • writeAsText() / writeAsCsv(…) / writeUsingOutputFormat() (at-least-once); a short writeAsText variant is sketched after the example below
import org.apache.flink.api.java.io.CsvOutputFormat
import org.apache.flink.api.java.tuple.Tuple2
import org.apache.flink.core.fs.Path
import org.apache.flink.streaming.api.scala._

//1. Create the StreamExecutionEnvironment
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

// CsvOutputFormat requires Flink's Java Tuple2, hence the extra map below
val output = new CsvOutputFormat[Tuple2[String, Int]](new Path("file:///D:/fink-results"))
//2. Create the DataStream (source)
val dataStream: DataStream[String] = fsEnv.socketTextStream("Spark", 9999)
//3. Transform the data
dataStream.flatMap(_.split("\\s+"))
    .map((_, 1))
    .keyBy(0)
    .sum(1)
    .map(t => new Tuple2(t._1, t._2))
    .writeUsingOutputFormat(output)

fsEnv.execute("FlinkWordCountsQuickStart")
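
For quick tests the same result can also be written with writeAsText(), which simply calls toString on each record; a minimal sketch, where the output path is only a placeholder:

import org.apache.flink.core.fs.FileSystem.WriteMode

dataStream.flatMap(_.split("\\s+"))
    .map((_, 1))
    .keyBy(0)
    .sum(1)
    // overwrite the target on restart; also at-least-once, for testing only
    .writeAsText("file:///D:/flink-text-results", WriteMode.OVERWRITE)
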
  • Bucketing File Sink (exactly-once)
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-filesystem_2.11</artifactId>
    <version>1.8.1</version>
</dependency>

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.9.2</version>
</dependency>
import java.time.ZoneId
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer}

//1. Create the StreamExecutionEnvironment
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

val bucketSink = new BucketingSink[String]("hdfs://Spark:9000/bucketSink")
bucketSink.setBucketer(new DateTimeBucketer("yyyy-MM-dd-HH", ZoneId.of("Asia/Shanghai")))

//2. Create the DataStream (source)
val dataStream: DataStream[String] = fsEnv.socketTextStream("Spark", 9999)
//3. Transform the data
dataStream.flatMap(_.split("\\s+"))
    .map((_, 1))
    .keyBy(0)
    .sum(1)
    .map(t => t._1 + "\t" + t._2)
    .addSink(bucketSink)

fsEnv.execute("FlinkWordCountsQuickStart")
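
BucketingSink can also be tuned for when part files are rolled and how records are written; a brief sketch of common settings (the size and interval below are arbitrary example values):

import org.apache.flink.streaming.connectors.fs.StringWriter

// roll a new part file after 1 MB or after 10 minutes, whichever comes first
bucketSink.setBatchSize(1024 * 1024)
bucketSink.setBatchRolloverInterval(10 * 60 * 1000)
// write each record as a line of text
bucketSink.setWriter(new StringWriter[String]())
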

print() / printToErr()

//1. Create the StreamExecutionEnvironment
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

//2. Create the DataStream (source)
val dataStream: DataStream[String] = fsEnv.socketTextStream("Spark", 9999)
//3. Transform the data
dataStream.flatMap(_.split("\\s+"))
    .map((_, 1))
    .keyBy(0)
    .sum(1)
    .print("error") // the string is only a prefix attached to every printed record

fsEnv.execute("FlinkWordCountsQuickStart")
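
printToErr() works the same way but writes to standard error instead of standard output:

dataStream.flatMap(_.split("\\s+"))
    .map((_, 1))
    .keyBy(0)
    .sum(1)
    .printToErr()
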

Custom Sink

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction

class UserDefineRichSinkFunction extends RichSinkFunction[(String, Int)] {
    // called once per parallel sink instance, e.g. to open a connection
    override def open(parameters: Configuration): Unit = {
        println("open")
    }
    // called once per record
    override def invoke(value: (String, Int)): Unit = {
        println("insert into xxx " + value)
    }
    // called when the task shuts down, e.g. to release resources
    override def close(): Unit = {
        println("close")
    }
}
//1. Create the StreamExecutionEnvironment
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

//2. Create the DataStream (source)
val dataStream: DataStream[String] = fsEnv.socketTextStream("Spark", 7788)
//3. Transform the data
dataStream.flatMap(_.split("\\s+"))
    .map((_, 1))
    .keyBy(0)
    .sum(1)
    .addSink(new UserDefineRichSinkFunction)

fsEnv.execute("FlinkWordCountsQuickStart")
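
To make the open/invoke/close lifecycle concrete, here is a hedged sketch of a sink that writes each word count to a relational table over JDBC. The JDBC URL, credentials and table name (t_word_count) are made-up placeholders, and the matching JDBC driver would have to be on the classpath:

import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction

class JdbcWordCountSink extends RichSinkFunction[(String, Int)] {
  // hypothetical connection settings, replace with your own
  private val url = "jdbc:mysql://Spark:3306/test"
  private var conn: Connection = _
  private var stmt: PreparedStatement = _

  override def open(parameters: Configuration): Unit = {
    // one connection per parallel sink instance
    conn = DriverManager.getConnection(url, "root", "root")
    stmt = conn.prepareStatement("REPLACE INTO t_word_count(word, cnt) VALUES (?, ?)")
  }

  override def invoke(value: (String, Int)): Unit = {
    stmt.setString(1, value._1)
    stmt.setInt(2, value._2)
    stmt.executeUpdate()
  }

  override def close(): Unit = {
    if (stmt != null) stmt.close()
    if (conn != null) conn.close()
  }
}

It plugs in exactly like the example above: .addSink(new JdbcWordCountSink).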

Redis Sink

Reference: https://bahir.apache.org/docs/flink/current/flink-streaming-redis/

<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_2.11</artifactId>
    <version>1.0</version>
</dependency>
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.redis.RedisSink
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig

//1. Create the StreamExecutionEnvironment
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

val conf = new FlinkJedisPoolConfig.Builder()
    .setHost("Spark")
    .setPort(6379)
    .build()

//2. Create the DataStream (source)
val dataStream: DataStream[String] = fsEnv.socketTextStream("Spark", 7788)
//3. Transform the data
dataStream.flatMap(_.split("\\s+"))
    .map((_, 1))
    .keyBy(0)
    .sum(1)
    .addSink(new RedisSink(conf, new UserDefineRedisMapper))

fsEnv.execute("FlinkWordCountsQuickStart")
import org.apache.flink.streaming.connectors.redis.common.mapper.{RedisCommand, RedisCommandDescription, RedisMapper}

class UserDefineRedisMapper extends RedisMapper[(String, Int)] {
  // use HSET: every record becomes a field in the Redis hash named "word-count"
  override def getCommandDescription: RedisCommandDescription = {
    new RedisCommandDescription(RedisCommand.HSET, "word-count")
  }

  // hash field: the word itself
  override def getKeyFromData(t: (String, Int)): String = {
    t._1
  }

  // hash value: the running count as a string
  override def getValueFromData(t: (String, Int)): String = {
    t._2.toString
  }
}

If the Redis server cannot be reached after installation, disable protected mode by setting protected-mode no in redis.conf (or bind Redis to a reachable interface).

Kafka Sink

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.8.1</version>
</dependency>
import java.util.Properties
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer
import org.apache.kafka.clients.producer.ProducerConfig
import org.apache.kafka.common.serialization.ByteArraySerializer

//1. Create the StreamExecutionEnvironment
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

val props = new Properties()
props.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "Spark:9092")
// Overriding the key/value serializers is not recommended; the Flink Kafka
// producer writes raw byte arrays and configures ByteArraySerializer itself
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer])
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer])

// reliability and batching settings for the underlying Kafka producer
props.put(ProducerConfig.RETRIES_CONFIG, "3")
props.put(ProducerConfig.ACKS_CONFIG, "-1")
props.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true")
props.put(ProducerConfig.BATCH_SIZE_CONFIG, "100")
props.put(ProducerConfig.LINGER_MS_CONFIG, "500")

//2. Create the DataStream (source)
val dataStream: DataStream[String] = fsEnv.socketTextStream("Spark", 7788)
//3. Transform the data
dataStream.flatMap(_.split("\\s+"))
    .map((_, 1))
    .keyBy(0)
    .sum(1)
    .addSink(new FlinkKafkaProducer[(String, Int)]("topicxx",
                                                   new UserDefineKeyedSerializationSchema,
                                                   props))

fsEnv.execute("FlinkWordCountsQuickStart")
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchema

class UserDefineKeyedSerializationSchema extends KeyedSerializationSchema[(String, Int)] {
  // Kafka record key: the word
  override def serializeKey(t: (String, Int)): Array[Byte] = {
    t._1.getBytes()
  }

  // Kafka record value: the count
  override def serializeValue(t: (String, Int)): Array[Byte] = {
    t._2.toString.getBytes()
  }

  // overrides the default topic ("topicxx") passed to the producer
  override def getTargetTopic(t: (String, Int)): String = "topic01"
}
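
The constructor used above gives at-least-once delivery. FlinkKafkaProducer also offers constructor variants that take a Semantic argument, so with checkpointing enabled the sink can use Kafka transactions for exactly-once delivery. A brief sketch (the checkpoint interval is an arbitrary example value, and it must be configured before execute()):

// checkpointing is required for EXACTLY_ONCE; 5 s interval is just an example
fsEnv.enableCheckpointing(5000)

val exactlyOnceProducer = new FlinkKafkaProducer[(String, Int)](
  "topicxx",
  new UserDefineKeyedSerializationSchema,
  props,
  FlinkKafkaProducer.Semantic.EXACTLY_ONCE)
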

Hive Sink

Below is an example of writing data into a Hive table with Flink:

```java
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.table.sinks.hive.HiveTableSink;

public class FlinkHiveSinkDemo {

    public static void main(String[] args) throws Exception {
        // Create the StreamExecutionEnvironment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Create the TableEnvironment
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);

        // Register a Hive catalog
        String catalogName = "myhive";
        String defaultDatabase = "default";
        String hiveConfDir = "/path/to/hive/conf";
        HiveCatalog hiveCatalog = new HiveCatalog(catalogName, defaultDatabase, hiveConfDir);
        tableEnv.registerCatalog(catalogName, hiveCatalog);

        // Create the Hive table
        String tableName = "mytable";
        String[] fieldNames = {"name", "age", "gender"};
        String[] fieldTypes = {"STRING", "INT", "STRING"};
        tableEnv.sqlUpdate(String.format("CREATE TABLE %s (%s) PARTITIONED BY (dt STRING)",
                tableName, getFields(fieldNames, fieldTypes)));

        // Convert a DataStream into a Table
        DataStream<Person> stream = env.fromElements(new Person("Alice", 18, "F"), new Person("Bob", 20, "M"));
        Table table = tableEnv.fromDataStream(stream, "name, age, gender");

        // Write the Table into the Hive table
        TableSink sink = new HiveTableSink(tableName, catalogName, getFields(fieldNames, fieldTypes), new String[]{"dt"});
        tableEnv.registerTableSink("hiveSink", sink);
        table.insertInto("hiveSink");

        // Execute the job
        env.execute("Flink Hive Sink Demo");
    }

    private static String getFields(String[] fieldNames, String[] fieldTypes) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < fieldNames.length; i++) {
            sb.append(fieldNames[i]).append(" ").append(fieldTypes[i]);
            if (i < fieldNames.length - 1) {
                sb.append(",");
            }
        }
        return sb.toString();
    }

    public static class Person {
        public String name;
        public int age;
        public String gender;

        public Person() {}

        public Person(String name, int age, String gender) {
            this.name = name;
            this.age = age;
            this.gender = gender;
        }
    }
}
```

In this example a Hive catalog is registered first and a Hive table is created; a DataStream is then converted into a Table and written to the Hive table through HiveTableSink. In practice, adjust the code to your own business scenario.