Streaming JSON parsing with JsonParser, plus matrix transposition with a joinery DataFrame

This article walks through converting a large, awkwardly structured JSON file into CSV by streaming the parse with Jackson's JsonParser and transposing the data matrix with a joinery DataFrame.

Requirement: turn a large, poorly structured raw JSON file into a CSV file. The input contains both nested objects ({{}}) and nested arrays ([[ ]]); a sketch of the assumed layout appears after the approach list below.
Approach:
1. Plain List/Map structures are out of the question: the whole file would have to be held in memory.
2. Minimize the number of passes over each line.
3. Intermediate files may be used where appropriate.
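Judging from the field names the parser looks for (stdNames, times, dataMatrix) and the header loop that stops at index 16, the input presumably looks something like the sketch below. The concrete names and values are illustrative only, not taken from a real file:

{
  "stdNames": ["field_1", "field_2", ..., "field_17"],
  "times": [1628553600000, 1628553601000, ...],
  "dataMatrix": [[1.0, 2.0, ...], [3.0, 4.0, ...], ...]
}

stdNames carries the column names (the code prepends a ts column for the timestamps), times carries one timestamp per sample, and dataMatrix carries the nested value arrays that are later transposed so that each output row corresponds to one timestamp.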

package convert;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import joinery.DataFrame;


import java.io.*;
import java.math.BigDecimal;


/**
 * @author zijian Wang
 * @date 2021/8/10 16:04
 * @version 1.0
 */
public class convert2traindata {

    private static String filePath_origin;
    private static String outPutPath;

    // intermediate files (defaults; overwritten in main)
    private static String filePath_model = "E:\\change\\model_train.csv";
    private static String filePath_model_index = "E:\\change\\model_train_index.csv";
    private static String filePath_model_transpose = "E:\\change\\transpose.csv";
    private static String filePath_model_res;
    // Windows
    //private static String delimiter = "\\";
    // Linux
    private static String delimiter = "/";

    public static void main(String[] args) throws IOException {

        // argument 1: input path and file name
        // argument 2: output directory; the output file keeps the input file's name
        //linux
        filePath_origin = args[0];
        outPutPath = args[1];

        // Windows
/*        filePath_origin = "E:\\change\\origin.json";
        outPutPath = "E:\\change\\";*/
        String outPutFileName = filePath_origin.substring(filePath_origin.lastIndexOf(delimiter) + 1, filePath_origin.lastIndexOf("."));
        // build the output paths
        filePath_model = outPutPath + outPutFileName + "_model.csv";
        filePath_model_index = outPutPath + outPutFileName + "_index.csv";
        filePath_model_transpose = outPutPath + outPutFileName + "_transpose.csv";
        filePath_model_res = outPutPath + outPutFileName + ".csv";
        long startTime = System.currentTimeMillis();
        convert2traindata();
        mergeFile(filePath_model, filePath_model_index);
        transpose(filePath_model_index);
        printResFile(filePath_model_transpose, filePath_model_res);
        long endTime = System.currentTimeMillis();
        System.out.println("Total runtime: " + (endTime - startTime) + "ms");
    }

    /**
     * Extract the data with JsonParser and write it to intermediate files.
     */
    public static void convert2traindata() throws IOException {

        JsonFactory jsonFactory = new JsonFactory();
        JsonParser jsonParser = null;

        PrintWriter writer_model = new PrintWriter(new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(filePath_model)), "UTF-8"));
        PrintWriter writer_index = new PrintWriter(new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(filePath_model_index)), "UTF-8"));
        PrintWriter writer_res = new PrintWriter(new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(filePath_model_res)), "UTF-8"));
        jsonParser = jsonFactory.createParser(new File(filePath_origin));
        jsonParser.nextToken();
        // a blocking parser returns null at end of input (NOT_AVAILABLE only applies to non-blocking parsers)
        while (jsonParser.nextToken() != null) {
            String fieldname = jsonParser.getCurrentName();
            if (jsonParser.nextToken() == null || fieldname == null) {
                jsonParser.close();
                writer_model.close();
                break;
            }
            int fieldIndex = 0;
            // stdNames: the required column names, written straight into the result file
            if (fieldname != null && fieldname.equals("stdNames")) {
                writer_res.append("ts").append(",");
                jsonParser.nextToken(); // step past START_ARRAY (mirrors the times branch below)
                while (jsonParser.currentToken() != JsonToken.END_ARRAY) {
                    if (fieldIndex == 16) {
                        // index 16 is assumed to be the last column name: no trailing comma
                        writer_res.append(jsonParser.getText());
                    } else {
                        writer_res.append(jsonParser.getText()).append(",");
                    }
                    fieldIndex++;
                    jsonParser.nextToken();
                }
                writer_res.write("\n");
                writer_res.close();
            }
            // read the times array (the timestamps)
            int transposeIndex = 0;
            if (fieldname != null && fieldname.equals("times")) {
                jsonParser.nextToken();
                while (jsonParser.currentToken() != JsonToken.END_ARRAY) {
                    transposeIndex++;
                    writer_model.append(new BigDecimal(jsonParser.getText()).toPlainString()).append(",");
                    jsonParser.nextToken();
                }
                // write the index header row (consumed later by the transpose step)
                for (int i = 0; i < transposeIndex; i++) {
                    writer_index.append(String.valueOf(i)).append(",");
                }
                writer_index.append("\n");
                writer_index.close();
            }
            // read the dataMatrix values
            if (fieldname != null && fieldname.equals("dataMatrix")) {
                writer_model.append("\n");
                while (jsonParser.currentToken() != JsonToken.END_OBJECT) {
                    // getText() returns "[" / "]" for the array delimiter tokens;
                    // compare with equals() rather than == (reference identity)
                    if (!"[".equals(jsonParser.getText())) {
                        if ("]".equals(jsonParser.getText())) {
                            writer_model.append("\n");
                        } else {
                            writer_model.append(jsonParser.getText()).append(",");
                        }
                    }
                    jsonParser.nextToken();
                }
                writer_model.close();
            }
        }
        jsonParser.close();
    }
    /**
     * Merge the data file into the index file.
     *
     * @param file1 data file; deleted after merging
     * @param file2 index file; appended to in place
     * @throws IOException
     */
    public static void mergeFile(String file1, String file2) throws IOException {
        BufferedReader inputStream = null;
        BufferedWriter outputStream = null;
        inputStream = new BufferedReader(new FileReader(file1));
        FileWriter filewriter = new FileWriter(new File(file2), true);
        outputStream = new BufferedWriter(filewriter);
        String line;
        while ((line = inputStream.readLine()) != null) {
            // keep only substantive rows; blank lines and short fragments are noise
            if (line.length() > 17) {
                outputStream.write(line);
                outputStream.write("\n");
            }
        }
        outputStream.flush();
        outputStream.close();
        inputStream.close();
        new File(file1).delete();
    }

    /**
     * Transpose the matrix with a joinery DataFrame.
     *
     * @param filePath
     * @throws IOException
     */
    public static void transpose(String filePath) throws IOException {

        DataFrame<Object> df = DataFrame.readCsv(filePath, ",", DataFrame.NumberDefault.LONG_DEFAULT);
        DataFrame<Object> df3 = df.transpose();
        System.out.println(df3.length());
        // rewrite column 0 (the timestamps) in plain decimal notation so the
        // values do not come out in scientific notation after the numeric round-trip
        for (int i = 0; i < df3.length() - 1; i++) {
            String value = new BigDecimal(String.valueOf(df3.get(i, 0))).toPlainString();
            df3.set(i, 0, value);
        }
        df3.writeCsv(filePath_model_transpose);
        new File(filePath).delete();
    }

    /**
     * Write the final result file (the header row was already written while parsing).
     *
     * @param file1
     * @param file2
     * @throws IOException
     */
    public static void printResFile(String file1, String file2) throws IOException {

        BufferedReader inputStream = null;
        BufferedWriter outputStream = null;
        FileWriter filewriter = null;
        inputStream = new BufferedReader(new FileReader(file1));
        filewriter = new FileWriter(new File(file2), true);
        outputStream = new BufferedWriter(filewriter);
        String line;
        int lineCount = 0;
        while ((line = inputStream.readLine()) != null) {
            // skip the header row that writeCsv produced (lineCount == 0) and any short fragments
            if (line.length() > 17 && lineCount > 0) {
                outputStream.write(line);
                outputStream.write("\n");
            }
            lineCount++;
        }
        outputStream.flush();
        outputStream.close();
        inputStream.close();
        new File(file1).delete();
    }
}
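Two of the moving parts above are worth isolating. First, Jackson's streaming API walks the document token by token, so only the current token is ever held in memory. A minimal sketch of that pattern, using a hypothetical StreamReadDemo class that prints the times array of a file passed as args[0]:

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;

import java.io.File;
import java.io.IOException;

public class StreamReadDemo {
    public static void main(String[] args) throws IOException {
        JsonParser p = new JsonFactory().createParser(new File(args[0]));
        JsonToken token;
        while ((token = p.nextToken()) != null) {           // null marks end of input
            if (token == JsonToken.FIELD_NAME && "times".equals(p.getCurrentName())) {
                p.nextToken();                              // step past START_ARRAY
                while (p.nextToken() != JsonToken.END_ARRAY) {
                    System.out.println(p.getText());        // one element at a time
                }
            }
        }
        p.close();
    }
}

Second, the transpose itself is delegated to joinery. Again as a standalone sketch with made-up data (hypothetical TransposeDemo class):

import joinery.DataFrame;

import java.util.Arrays;

public class TransposeDemo {
    public static void main(String[] args) {
        // a 2x3 frame: two rows ("samples"), three named columns
        DataFrame<Object> df = new DataFrame<>("c0", "c1", "c2");
        df.append(Arrays.asList(1, 2, 3));
        df.append(Arrays.asList(4, 5, 6));

        // transpose() flips rows and columns: the result is 3x2
        DataFrame<Object> t = df.transpose();
        System.out.println(t.length()); // 3 rows after the transpose
        System.out.println(t);
    }
}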

In testing, a JSON file of about 3,000 lines converts in roughly 0.3 s, and one of about 30,000 lines in roughly 2.8 s. That throughput is more than adequate for the basic requirement.
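For reference, the converter takes the input file and the output directory as its two command-line arguments, so a Linux run looks roughly like this (the paths and jar names below are placeholders; Jackson, joinery, and their dependencies must be on the classpath):

java -cp .:jackson-core.jar:joinery-dataframe.jar convert2traindata /data/origin.json /data/out/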
