mysql group by xx top N

This post shows how to handle a fairly involved business reporting request in SQL, in particular how to rank and pick suppliers by sales amount for each product.

Today the operations team came asking for data again:


1. For each product: brand, item number, specification, unit, unit price, total sales quantity, total sales amount, number of sales orders, number of purchasing labs, and suppliers (the 3 merchants with the highest sales amount); see the sketch after this list for the plain aggregation part.
2. The same brand and item number means the same product.
3. Include both online and offline data.
4. Report 2015 and 2016 separately.
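
For the non-supplier columns, a plain aggregation grouped by brand and item number is enough. The sketch below is only illustrative: the table order_item and its columns (brand, item_no, spec, unit, unit_price, qty, order_id, lab_id, order_date) are assumptions, not the real schema, and the query is run once per year.

-- Sketch of table A (per-product summary); all table and column names are assumptions.
-- Brand + item number identifies a product, so both go into GROUP BY.
SELECT o.brand,
       o.item_no,
       MAX(o.spec)                AS spec,        -- assumed constant within a product
       MAX(o.unit)                AS unit,
       MAX(o.unit_price)          AS unit_price,
       SUM(o.qty)                 AS total_qty,
       SUM(o.qty * o.unit_price)  AS total_amount,
       COUNT(DISTINCT o.order_id) AS order_count,
       COUNT(DISTINCT o.lab_id)   AS lab_count
FROM order_item o
WHERE YEAR(o.order_date) = 2015                   -- run again with 2016 for the second report
GROUP BY o.brand, o.item_no;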

The first few columns are easy enough; the tricky one is the suppliers (the 3 merchants with the highest sales amount).

My idea is to pull the first part out into table A (product id, ...)

and pull the supplier part out into table B (supplier id, total price, product id).
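
Table B (temp_suppler_key in the query below) can be built with a supplier-level aggregation. Again, the source tables order_item and supplier and their columns are assumptions, used only to show the shape of the data: fkey is the product key, name is the supplier name, and tprice is that supplier's sales total for the product.

-- Sketch of table B; source tables and columns are assumptions.
create table temp_suppler_key as
SELECT CONCAT(o.brand, '_', o.item_no)  AS fkey,
       s.name                           AS `name`,
       SUM(o.qty * o.unit_price)        AS tprice
FROM order_item o
JOIN supplier s ON s.id = o.supplier_id
GROUP BY CONCAT(o.brand, '_', o.item_no), s.name;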

create table temp_suppler_key1 as
SELECT t.fkey,
       substring_index( group_concat( IFNULL(t.`name`, 0) ORDER BY t.tprice DESC ), ",", 3 ) AS `name`
FROM temp_suppler_key t
GROUP BY t.fkey;

The trick: order the values inside group_concat first, then cut out the top 3 with substring_index. It works quite well.
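
To finish the report, table A is joined back to temp_suppler_key1 on the product key. A minimal sketch, assuming table A is called temp_product_summary and carries the same fkey column:

-- Sketch: temp_product_summary and its fkey column are assumptions.
SELECT a.*, b.`name` AS top3_suppliers
FROM temp_product_summary a
LEFT JOIN temp_suppler_key1 b ON b.fkey = a.fkey;

One caveat: group_concat output is truncated at group_concat_max_len (1024 bytes by default), so if a product has many suppliers, raise the limit first, e.g. SET SESSION group_concat_max_len = 1000000;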


Someone online solved it the same way, which I think is quite nice:

http://www.wfuyu.com/datay/21132.html

Group by a and b, order each group by the itime column in descending order, and take the top 3 of each group:

SELECT t.a, t.b,
       substring_index( group_concat( IFNULL(t.c, 0) ORDER BY t.itime DESC ), ",", 3 ) AS c,
       substring_index( group_concat( t.itime ORDER BY t.itime DESC ), ",", 3 ) AS time
FROM t t
GROUP BY t.a, t.b;
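
As a side note beyond the original approach, on MySQL 8.0 and later (not available back in 2015) the same top-3-per-group result can be obtained with window functions instead of string tricks; a sketch against the same table t:

-- MySQL 8.0+ alternative: rank rows within each (a, b) group and keep the top 3.
SELECT a, b, c, itime
FROM (
    SELECT t.a, t.b, t.c, t.itime,
           ROW_NUMBER() OVER (PARTITION BY t.a, t.b ORDER BY t.itime DESC) AS rn
    FROM t
) ranked
WHERE rn <= 3;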
