Flink WordCount Examples

This article walks through several Flink implementations of WordCount, covering both batch and stream processing with the flatMap, map, keyBy, and reduce operators. It also shows how chained calls and lambda expressions shorten the code, and how the same word count can be done with the Flink Table API and SQL.


Example 1: WordCount with a Flink program in batch execution mode

package day01;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @desc: Flink program implementing WordCount in batch execution mode.
 */
public class Demo01_WordCountBatch {
    public static void main(String[] args) throws Exception {
        //1. Build the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setRuntimeMode(RuntimeExecutionMode.BATCH);
        //2. Data input (source)
        DataStreamSource<String> source = env.readTextFile("D:\\coding\\workspace4\\gz_flinkbase\\data\\words.txt");
        //3. Data processing, using anonymous inner classes: new Interface(){...}
        //3.1 flatMap: split each comma-separated line into individual words
        SingleOutputStreamOperator<String> flatMapStream = source.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String value, Collector<String> out) throws Exception {
                String[] words = value.split(",");
                for (String word : words) {
                    out.collect(word);
                }
            }
        });
        //3.2 map: convert each word into a (word, 1) tuple; the int literal is boxed to Integer
        SingleOutputStreamOperator<Tuple2<String, Integer>> mapStream = flatMapStream.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String value) throws Exception {
                return Tuple2.of(value, 1);
            }
        });
        //3.3 keyBy: group the tuples by word, e.g. (hello,1)
        KeyedStream<Tuple2<String, Integer>, String> keyedStream = mapStream.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });
        //3.4 reduce (sum): add the counts per key, e.g. (hello,1),(hello,1) -> (hello,2)
        SingleOutputStreamOperator<Tuple2<String, Integer>> result = keyedStream.reduce(new ReduceFunction<Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> reduce(Tuple2<String, Integer> value1, Tuple2<String, Integer> value2) throws Exception {
                return Tuple2.of(value1.f0, value1.f1 + value2.f1);
            }
        });
        //4. Data output
        result.print();
        //5. Launch the job
        env.execute();
    }
}
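The input file words.txt is expected to hold comma-separated words. A minimal sample input (assumed here, since the original file is not shown):

hello,flink
hello,hive

Because the job runs in BATCH mode, the keyed reduce emits only the final count per key, so printing this sample would produce (order may vary):

(hello,2)
(flink,1)
(hive,1)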

Example 2: Flink streaming WordCount; the data comes from a socket source

package day01;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @desc: Flink streaming WordCount; the data comes from a socket source.
 */
public class Demo02_WordCountStream {
    public static void main(String[] args) throws Exception {
        //1. Build the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.STREAMING);
        env.setParallelism(1);
        //2. Data input (source)
        //Read from a socket; a socket address = hostname + port
        DataStreamSource<String> source = env.socketTextStream("node1", 9999);
        //3. Data processing
        //3.1 flatMap: split each space-separated line into individual words
        SingleOutputStreamOperator<String> flatMapStream = source.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String value, Collector<String> out) throws Exception {
                String[] words = value.split(" ");
                for (String word : words) {
                    out.collect(word);
                }
            }
        });
        //3.2 map: convert each word into a (word, 1) tuple
        SingleOutputStreamOperator<Tuple2<String, Integer>> mapStream = flatMapStream.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String value) throws Exception {
                return Tuple2.of(value, 1);
            }
        });
        //3.3 keyBy: group the tuples by word
        KeyedStream<Tuple2<String, Integer>, String> keyedStream = mapStream.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });
        //3.4 aggregate with sum(1): sums the tuple field at index 1 (the Integer count)
        SingleOutputStreamOperator<Tuple2<String, Integer>> result = keyedStream.sum(1);
        //4. Data output
        result.print();
        //5. Launch the job
        env.execute();
    }
}
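Before the job starts, a socket server must be listening on node1:9999; for a quick test, netcat works, e.g. nc -lk 9999 on that host. Unlike the batch example, sum runs on an unbounded stream and emits an updated count for every incoming record, so sending hello flink and then hello would print (illustrative):

(hello,1)
(flink,1)
(hello,2)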

Example 3: WordCount implemented with chained (fluent) operator calls

package day01;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @desc: WordCount implemented with chained (fluent) operator calls.
 */
public class Demo03_WordCountStream_02 {
    public static void main(String[] args) throws Exception {
        //1. Build the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        //2. Data input
        DataStreamSource<String> source = env.socketTextStream("node1", 9999);
        //3. Data processing
        SingleOutputStreamOperator<Tuple2<String, Integer>> result = source.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String value, Collector<String> out) throws Exception {
                String[] words = value.split(" ");
                for (String word : words) {
                    out.collect(word);
                }
            }
        }).map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String value) throws Exception {
                // word -> (word,1)
                return Tuple2.of(value, 1);
            }
        }).keyBy(0)//group by tuple field index 0 (the word)
                //(index-based keyBy is deprecated in newer Flink versions; a KeySelector such as keyBy(t -> t.f0) is preferred)
                .sum(1);
        //4. Data output
        result.print();
        //5. Launch the job
        env.execute();
    }
}

Example 4: WordCount written with lambda expressions

package day01;

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @desc: Extension 2: WordCount written with lambda expressions.
 */
public class Demo04_WordCountStream_03 {
    public static void main(String[] args) throws Exception {
        //1. Build the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        //2. Data input
        DataStreamSource<String> source = env.socketTextStream("node1", 9999);
        //3. Data processing
        SingleOutputStreamOperator<Tuple2<String, Integer>> result = source.flatMap((String value, Collector<String> out) -> {
            String[] words = value.split(" ");
            for (String word : words) {
                out.collect(word);
            }
        }).returns(Types.STRING).map(value -> Tuple2.of(value, 1))
                .returns(Types.TUPLE(Types.STRING,Types.INT))
                .keyBy(value -> value.f0)
                .sum(1);
        //4. Data output
        result.print();
        //5. Launch the job
        env.execute();
    }
}
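Both .returns(...) hints are needed because the Java compiler erases the generic types of lambdas: Flink cannot recover Collector<String> or Tuple2<String, Integer> from the compiled lambda and typically fails at pre-flight with an InvalidTypesException. A minimal sketch of the pattern in isolation (using the same source as above):

        source.map(value -> Tuple2.of(value, 1))
                // Without this hint Flink only sees the erased type Tuple2<Object, Object>
                // and cannot choose a serializer for the stream.
                .returns(Types.TUPLE(Types.STRING, Types.INT));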

Example 5: WordCount with lambda expressions and a method reference (Arrays.stream + out::collect)

package day01;

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Arrays;

/**
 * @desc: Extension 3: WordCount with lambda expressions and a method reference.
 */
public class Demo04_WordCountStream_04 {
    public static void main(String[] args) throws Exception {
        //1. Build the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        //2. Data input
        DataStreamSource<String> source = env.socketTextStream("node1", 9999);
        //3. Data processing
        SingleOutputStreamOperator<Tuple2<String, Integer>> result = source.flatMap((String value, Collector<String> out) -> {
            Arrays.stream(value.split(" ")).forEach(out::collect);
        }).returns(Types.STRING).map(value -> Tuple2.of(value, 1))
                .returns(Types.TUPLE(Types.STRING,Types.INT))
                .keyBy(value -> value.f0)
                .sum(1);
        //4. Data output
        result.print();
        //5. Launch the job
        env.execute();
    }
}
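The only change from Example 4 is the flatMap body: the explicit loop is replaced by a stream over the split words, where the method reference out::collect is shorthand for a lambda (names as in the flatMap above):

        // These two lines are equivalent:
        Arrays.stream(value.split(" ")).forEach(out::collect);
        Arrays.stream(value.split(" ")).forEach(word -> out.collect(word));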

Example 6: WordCount with the Flink Table API

package day01;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.Expressions;
import org.apache.flink.table.api.Schema;
import org.apache.flink.table.api.TableDescriptor;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * @desc: WordCount with the Flink Table API.
 */
public class Demo05_WordCountTable {
    public static void main(String[] args) throws Exception {
        //1. Build the execution environment
        //env is the DataStream API entry point; to submit Table API/SQL jobs, wrap it in a StreamTableEnvironment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default","1");
        //2. Data input (source table)
        /**
         * createTemporaryTable(String tableName, TableDescriptor tableDescriptor)
         * tableName: the table name
         * tableDescriptor: describes the table's schema, columns, and so on
         * connector: comparable to a JDBC driver class, except Flink calls it a connector;
         * a connector is what links Flink to an external data source.
         */

        /**
         *      |    word   |
         *      |   hello   |
         *      |   hive    |
         *      |   flink   |
         */
        t_env.createTemporaryTable("source", TableDescriptor.forConnector("datagen")
                .schema(Schema.newBuilder()
                        .column("word", DataTypes.STRING()).build())
                .option("rows-per-second","1")
                .option("fields.word.kind","random")
                .option("fields.word.length","1")
                .build());
        //3. Data output (sink table)
        /**
         *      |   word   |    counts    |
         *      |     a    |      2       |
         *      |     1    |      3       |
         */
        t_env.createTemporaryTable("sink",TableDescriptor.forConnector("print")
                .schema(Schema.newBuilder()
                        .column("word",DataTypes.STRING())
                        .column("counts",DataTypes.BIGINT()).build())
                .build());
        //4. Data processing (the word count over the source and sink tables)
        /**
         * Processing logic:
         * Read from the source table, group by word, then emit (word, count(*)) per group.
         * from: read from the source table
         * groupBy: group by the given field
         * select: pick the grouped fields; the selected columns & types must match the sink table
         * executeInsert: write the final result into the sink table
         * Equivalent SQL:
         * insert into sink
         * select word, count(*) from source group by word
         */
        t_env.from("source")
                .groupBy(Expressions.$("word"))
                .select(Expressions.$("word"),Expressions.lit(1).count())
                .executeInsert("sink")
                .await();

        //Note: executeInsert(...).await() above already submits and runs the Table API job;
        //env.execute() is not needed (there are no DataStream operators, so it would fail).
    }
}
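With static imports for the expression DSL, the processing step reads closer to the SQL it mirrors (a sketch using the same table names as above):

import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.lit;

        t_env.from("source")
                .groupBy($("word"))
                .select($("word"), lit(1).count())
                .executeInsert("sink")
                .await();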

Example 7: WordCount with Flink SQL

package day01;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * @desc: WordCount with Flink SQL.
 */
public class Demo06_WordCountSQL {
    public static void main(String[] args) throws Exception {
        //1. Build the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default","1");

        //2. Build the source table (data input)
        /**
         *      |     word    |
         *      |     hello   |
         *      |     hive    |
         *      |     spark   |
         *      |     flink   |
         */
        t_env.executeSql("create table source(" +
                "word varchar" +
                ") with (" +
                "'connector' = 'datagen'," +
                "'rows-per-second' = '1'," +
                "'fields.word.kind' = 'random'," +
                "'fields.word.length' = '1'" +
                ")");

        //3. Build the sink table (data output)
        /**   The table structure:
         *       |    word    |   counts    |
         *       |    hello   |     1       |
         *       |    hive    |     2       |
         *       |    flink   |     3       |
         */
        t_env.executeSql("create table sink(" +
                "word varchar," +
                "counts bigint" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        //4. Data processing
        /**
         * The processing logic as SQL:
         *  insert into sink select word,count(*) from source group by word
         */
        t_env.executeSql("insert into sink select word,count(*) from source group by word")
                .await();
        //Note: executeSql(...).await() above already submits and runs the SQL job;
        //env.execute() is not needed (there are no DataStream operators, so it would fail).
    }
}
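The continuous group-by aggregation produces an updating (changelog) stream, so the print connector prefixes every row with its change kind: +I for an insert, -U/+U for the retraction and new value of an updated count. The output typically looks like this (values illustrative, since datagen emits random single-character words):

+I[a, 1]
+I[7, 1]
-U[a, 1]
+U[a, 2]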
