[root@master pyflink]# cat test_max.py
from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
# Demo: rolling max() aggregation on a keyed PyFlink DataStream.
env = StreamExecutionEnvironment.get_execution_environment()
# Parallelism is left at the environment default. Call
# env.set_parallelism(n) here to pin it to a fixed value instead.

# Sample rows of (key, url, value): three rows for key 'a', three for key 'b'.
ds = env.from_collection(
    [
        ('a', 'id=1', 1),
        ('a', 'id=2', 5),
        ('a', 'id=3', 3),
        ('b', 'home=1', 1),
        ('b', 'home=2', 4),
        ('b', 'home=2', 2),
    ],
    type_info=Types.ROW_NAMED(
        ["key", "url", "value"],
        [Types.STRING(), Types.STRING(), Types.INT()],
    ),
)

# Partition the stream by its first field (the "key" column), then keep a
# rolling maximum of the "value" column for each key. As with sum(), the
# argument to max() may be a field name or a field position.
#
# NOTE: max() only replaces the aggregated field; the remaining fields of the
# emitted row (here "url") are carried over from the first row seen for that
# key. To emit the whole row that holds the maximum, use max_by("value")
# instead (per the Flink documentation -- verify on your Flink version).
result1 = ds.key_by(lambda x: x[0]).max("value")

# The string passed to print() is used as the prefix of every output line.
result1.print("max:")
env.execute()
[root@master pyflink]# python3 test_max.py
max::2> +I[a,id=1,1]
max::2> +I[a,id=1,5]
max::2> +I[a,id=1,5]
max::1> +I[b,home=1,1]
max::1> +I[b,home=1,4]
max::1> +I[b,home=1,4]
[root@master pyflink]#
[root@master pyflink]#
[root@master pyflink]#