How to SUM and GROUP BY of JSON data?

This article shows how to apply an SQL-style query to JSON-formatted data: using JavaScript to group, sum, and sort JSON objects so that the result matches an equivalent SQL query, with a practical example.

Source: StackOverflow.com

Question

Some server-side code generates a JSON-formatted string, which is then converted to a JSON object:

    var dataString='[ 
      { "category" : "Search Engines", "hits" : 5, "bytes" : 50189 },
      { "category" : "Content Server", "hits" : 1, "bytes" : 17308 },
      { "category" : "Content Server", "hits" : 1, "bytes" : 47412 },
      { "category" : "Search Engines", "hits" : 1, "bytes" : 7601 },
      { "category" : "Business", "hits" : 1, "bytes" : 2847 },
      { "category" : "Content Server", "hits" : 1, "bytes" : 24210 },
      { "category" : "Internet Services", "hits" : 1, "bytes" : 3690 },
      { "category" : "Search Engines", "hits" : 6, "bytes" : 613036 },
      { "category" : "Search Engines", "hits" : 1, "bytes" : 2858 } 
    ]';

I need the equivalent of an SQL statement like this:

    SELECT category, sum(hits), sum(bytes) 
    FROM dataObject
    GROUP BY category
    ORDER BY sum(bytes) DESC

The desired output would be an object like this:

    var aggregatedObject='[ 
        { "category" : "Search Engines", "hits" : 13, "bytes" : 673684 },
        { "category" : "Content Server", "hits" : 3, "bytes" : 88930 },
        { "category" : "Internet Services", "hits" : 1, "bytes" : 3690 },
        { "category" : "Business", "hits" : 1, "bytes" : 2847 } 
    ]';

Answer

This is an example using the AlaSQL JavaScript SQL library, running the SQL query from your question directly against the array:

    var data = [ { "category" : "Search Engines", "hits" : 5, "bytes" : 50189 }, ... ];

    var res = alasql('SELECT category, sum(hits) AS hits, sum(bytes) as bytes \
       FROM ? \
       GROUP BY category \
       ORDER BY bytes DESC',[data]);
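
The snippet above assumes AlaSQL has already been loaded. A minimal setup sketch, assuming the alasql package from npm (or its bundled browser script), could look like this:

    // Node.js: assumes the library was installed with `npm install alasql`.
    var alasql = require('alasql');

    // Browser: load the bundled script instead and use the global `alasql`, e.g.
    // <script src="alasql.min.js"></script>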

Try this example at jsFiddle.
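
If you prefer not to pull in a library, the same GROUP BY / SUM / ORDER BY can be done in plain JavaScript. The following is a minimal sketch, assuming dataString holds the JSON string from the question:

    // Parse the JSON string and accumulate per-category totals.
    var data = JSON.parse(dataString);

    var totals = {};
    data.forEach(function (row) {
      if (!totals[row.category]) {
        totals[row.category] = { category: row.category, hits: 0, bytes: 0 };
      }
      totals[row.category].hits  += row.hits;
      totals[row.category].bytes += row.bytes;
    });

    // Turn the lookup object back into an array and apply ORDER BY bytes DESC.
    var aggregated = Object.keys(totals)
      .map(function (key) { return totals[key]; })
      .sort(function (a, b) { return b.bytes - a.bytes; });

    console.log(aggregated);
    // [ { category: "Search Engines",   hits: 13, bytes: 673684 },
    //   { category: "Content Server",   hits: 3,  bytes: 88930 },
    //   { category: "Internet Services", hits: 1, bytes: 3690 },
    //   { category: "Business",         hits: 1,  bytes: 2847 } ]

Keying the intermediate object by category keeps the grouping to a single pass over the rows; the final sort corresponds to the ORDER BY sum(bytes) DESC clause in the SQL statement.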
