Llama.cpp Tools 实用工具深度分析

原创于 2025-12-29 15:30:50 发布 · 置顶 · 578 阅读

18 ·

CC 4.0 BY-SA版权

文章标签：

#llama #人工智能 #自动驾驶

AI 同时被 2 个专栏收录

27 篇文章

订阅专栏

Llama.cpp

3 篇文章

订阅专栏

文章目录

1. 模块概述
- 1.1 核心定位
- 1.2 设计目标
2. 整体架构设计
- 2.1 目录组织结构
- 2.2 构建系统集成
3. 核心工具深度分析
4. 使用场景和最佳实践
5. 技术特色和创新
6. 总结
- 6.1 技术优势
- 6.2 工程价值

团队博客: 汽车电子社区

1. 模块概述

tools/ 目录是 llama.cpp 项目的实用工具集，提供了一整套生产级别的命令行工具，涵盖了模型推理、性能评估、模型优化、部署服务等完整的工作流程。这些工具不仅为开发者提供了便利的操作接口，更是 llama.cpp 项目工程化成熟度的重要体现。

1.1 核心定位

- 生产工具集：提供可直接用于生产的命令行工具
- 工作流支撑：覆盖模型生命周期管理的各个环节
- 性能优化：包含多种性能分析和优化工具
- 标准化接口：统一的命令行设计和用户体验

1.2 设计目标

- 功能完整：覆盖从模型转换到生产部署的全流程
- 性能优化：充分利用硬件性能，提供多种优化选项
- 易于使用：统一的接口设计和丰富的帮助信息
- 可扩展性：模块化设计，便于添加新功能

2. 整体架构设计

2.1 目录组织结构

tools/
├── 核心推理工具 (Core Inference Tools)
│   ├── main/                    # 主命令行工具 (llama-cli)
│   │   ├── main.cpp            # 主入口 (1007行)
│   │   └── README.md           # 详细使用说明
│   ├── server/                  # HTTP API服务器 (llama-server)
│   │   ├── server.cpp          # 服务器主入口 (307行)
│   │   ├── server-context.cpp  # 上下文管理
│   │   ├── server-http.cpp     # HTTP处理模块
│   │   ├── server-models.cpp   # 模型管理
│   │   ├── server-queue.cpp    # 任务队列管理
│   │   └── README.md           # 服务器文档 (1743行)
│   └── run/                     # 简化运行工具 (llama-run)
│       ├── run.cpp             # 简化接口实现
│       └── README.md           # 使用说明
├── 性能评估工具 (Performance Evaluation Tools)
│   ├── llama-bench/             # 综合性能基准测试
│   │   ├── llama-bench.cpp     # 基准测试主程序 (2242行)
│   │   └── README.md           # 基准测试文档 (350行)
│   ├── batched-bench/           # 批处理性能测试
│   │   └── README.md           # 批处理测试说明 (61行)
│   └── perplexity/              # 困惑度计算评估
│       ├── perplexity.cpp      # 困惑度计算 (2071行)
│       └── README.md           # 困惑度文档 (194行)
├── 模型处理工具 (Model Processing Tools)
│   ├── quantize/               # 模型量化工具
│   │   ├── quantize.cpp        # 量化实现 (683行)
│   │   └── README.md           # 量化说明 (172行)
│   ├── gguf-split/              # GGUF文件分割/合并
│   │   ├── gguf-split.cpp       # 分割实现 (584行)
│   │   └── README.md           # 分割说明 (11行)
│   ├── imatrix/                # 重要性矩阵计算
│   │   ├── imatrix.cpp         # 矩阵计算 (1303行)
│   │   └── README.md           # 矩阵说明 (99行)
│   └── export-lora/            # LoRA适配器导出
│       └── README.md           # 导出说明 (34行)
├── 专业功能工具 (Specialized Function Tools)
│   ├── tokenize/               # 分词器工具
│   │   ├── tokenize.cpp        # 分词实现 (417行)
│   │   └── README.md           # 分词说明
│   ├── cvector-generator/       # 控制向量生成
│   │   └── README.md           # 控制向量说明 (46行)
│   ├── tts/                    # 文本转语音
│   │   └── README.md           # TTS说明 (118行)
│   ├── mtmd/                   # 多模态支持
│   │   └── README.md           # 多模态说明 (64行)
│   └── rpc/                    # 远程过程调用
│       └── README.md           # RPC说明 (105行)
└── CMakeLists.txt              # 构建配置 (40行)

2.2 构建系统集成

# tools/CMakeLists.txt: core build configuration.
# Each add_subdirectory() call adds one tool directory to the build.
# Main tools (added unconditionally)
add_subdirectory(main)
add_subdirectory(server)

# Performance evaluation tools (each one is gated by its own option)
if (BUILD_LLAMA_BENCH)
    add_subdirectory(llama-bench)
endif()

if (BUILD_PERPLEXITY)
    add_subdirectory(perplexity)
endif()

# Model processing tools
# NOTE(review): batched-bench is gated on BUILD_TINYBLAS and grouped here, while the
# directory overview lists it under the performance evaluation tools - confirm that
# both the option name and the grouping are intended.
if (BUILD_TINYBLAS)
    add_subdirectory(batched-bench)
endif()

add_subdirectory(quantize)
add_subdirectory(gguf-split)
add_subdirectory(imatrix)

# Specialized tools (added unconditionally)
add_subdirectory(export-lora)
add_subdirectory(tokenize)
add_subdirectory(cvector-generator)
add_subdirectory(tts)
add_subdirectory(mtmd)
add_subdirectory(rpc)
add_subdirectory(run)

3. 核心工具深度分析

3.1 主命令行工具 (main/)

3.1.1. 功能定位

main/ 是 llama.cpp 的主要交互入口，提供最完整的模型推理功能和最丰富的参数配置选项。

3.1.2. 核心特性

// Global state shared by the main loop and the SIGINT handler.
static llama_context * g_ctx;
static llama_model * g_model;
static common_sampler * g_smpl;
static std::vector<llama_token> g_input_tokens;
static std::vector<llama_token> g_output_tokens;
static std::string g_output;
static bool g_interactive = false;
static bool g_antiprompt = false;
// Written from sigint_handler(). An object modified inside a signal handler must be
// `volatile sig_atomic_t` (or a lock-free atomic); a plain `bool` is undefined
// behaviour there. Boolean reads and `= true/false` assignments keep working unchanged.
static volatile sig_atomic_t g_is_interacting = 0;

// Logging and signal handling
//
// Log callback: writes every message to stderr and flushes immediately.
// The log level and the user data pointer are ignored.
static void llama_log_callback([[maybe_unused]] enum ggml_log_level level,
                               const char * text,
                               [[maybe_unused]] void * user_data) {
    fprintf(stderr, "%s", text);
    fflush(stderr);
}

// SIGINT handler.
//   - while g_is_interacting is set: clear the flag and return;
//   - otherwise: terminate the process immediately with status 130.
// Signals other than SIGINT are ignored.
static void sigint_handler(int sig) {
    if (sig != SIGINT) {
        return;
    }

    if (g_is_interacting) {
        g_is_interacting = false;
        return;
    }

    // _exit() ends the process right away: no atexit handlers, no stdio flush.
    _exit(130);
}

3.1.3. 交互式对话实现

// 交互式对话主循环
static void interactive_loop(bool is_chat_mode) {
    bool is_antiprompt = false;
    int n_pred = 0;
    
    // 设置信号处理
    struct sigaction sigint_action;
    sigint_action.sa_handler = sigint_handler;
    sigemptyset(&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, nullptr);
    
    // 主循环
    while (n_pred < params.n_predict) {
        // 输入提示
        std::string input;
        if (is_chat_mode) {
            input = get_input_line(params.prompt_prefix);
        }
        
        // 分词输入
        auto input_tokens = common_tokenize(g_ctx, input, true);
        
        // 构建批处理
        llama_batch batch = llama_batch_init(input_tokens.size() + 1, 0, 1);
        for (size_t i = 0; i < input_tokens.size(); ++i) {
            llama_batch_add_seq(batch, input_tokens[i], i, false);
        }
        
        // 推理循环
        for (int i = 0; i < params.n_predict; ++i) {
            // 设置最后一个token为输出token
            if (i == 0) {
                llama_batch_add_seq(batch, llama_token_eos(g_model), 
                                 input_tokens.size(), true);
            }
            
            // 执行推理
            if (llama_decode(g_ctx, batch) != 0) {
                fprintf(stderr, "%s : failed to decode\n", __func__);
                break;
            }
            
            // 采样
            llama_token new_token_id = common_sampler_sample(g_smpl, g_ctx, -1);
            
            // 检查结束条件
            if (new_token_id == llama_token_eos(g_model)) {
                break;
            }
            
            // 输出token
            std::string piece = common_token_to_piece(g_ctx, new_token_id);
            printf("%s", piece.c_str());
            fflush(stdout);
            
            // 更新状态
            g_output_tokens.push_back(new_token_id);
            g_output += piece;
            
            // 检查反提示词
            for (const auto & antiprompt : params.antiprompt) {
                if (g_output.find(antiprompt) != std::string::npos) {
                    is_antiprompt = true;
                    break;
                }
            }
            
            if (is_antiprompt) {
                break;
            }
            
            // 准备下一个批处理
            llama_batch_clear(batch);
            llama_batch_add_seq(batch, new_token_id, g_output_tokens.size(), true);
        }
        
        n_pred += g_output_tokens.size() - input_tokens.size();
        
        // 重置输出
        g_output.clear();
        g_output_tokens = input_tokens;
    }
}

3.1.4. 参数系统设计

// Parameters of the main command-line tool, layered on top of common_params.
struct main_params : public common_params {
    // Interaction modes
    bool interactive{false};
    bool interactive_start{false};
    bool instruction{false};
    bool chatml{false};

    // Anti-prompts (strings that end a generation turn when they appear)
    std::vector<std::string> antiprompt;

    // Input / output decoration
    std::string prompt_prefix{"> "};
    std::string prompt_suffix;
    std::string input_prefix;
    std::string input_suffix;

    // Display options
    bool color{false};
    bool show_perplexity{false};
    int n_pp{0};         // number of perplexity evaluation steps
    int n_ctx_total{0};  // total context size

    // Control vectors
    std::vector<std::string> control_vectors;
    std::vector<std::pair<int, int>> control_vector_layer_ranges;

    // Chat template
    std::string chat_template;

    // Long-input handling
    int chunk_size{2048};
    std::string input_prefix_first;
};

3.1.5. 性能监控

// Prints end-of-run statistics: timings, optionally the final perplexity, and
// (when n_ctx_total > 0) how many tokens were processed in how many chunks.
//
// Fixes relative to the previous version:
//   - size_t values were printed with %d (undefined behaviour where the widths
//     differ); they are now printed with %zu;
//   - a non-positive chunk_size no longer causes a division by zero.
static void print_statistics() {
    llama_print_timings(g_ctx);

    if (g_params.show_perplexity) {
        printf("\n");
        printf("final perplexity: %.3f\n",
               llama_get_perplexity(g_ctx));
    }

    if (g_params.n_ctx_total > 0) {
        const size_t n_tokens = g_input_tokens.size();

        // Ceiling division; without a usable chunk size everything counts as one chunk.
        size_t n_chunks = n_tokens > 0 ? 1 : 0;
        if (g_params.chunk_size > 0) {
            const size_t chunk = static_cast<size_t>(g_params.chunk_size);
            n_chunks = (n_tokens + chunk - 1) / chunk;
        }

        printf("\n");
        printf("processed %zu tokens in %zu chunks\n", n_tokens, n_chunks);
    }
}

3.2 HTTP API 服务器 (server/)

3.2.1. 架构设计

// Core server object: owns the context, task queue, model manager and HTTP
// handler components together with the worker threads.
class llama_server {
public:
    bool initialize(const server_params & p);
    void start();
    void stop();

private:
    void worker_thread_func();
    void setup_routes();

    // Owned components
    std::unique_ptr<server_context> ctx;
    std::unique_ptr<server_queue> queue;
    std::unique_ptr<server_model> model_manager;
    std::unique_ptr<server_http> http_handler;

    // Configuration and run state
    server_params params;
    std::atomic<bool> running{false};

    // Worker threads
    std::vector<std::thread> worker_threads;
};

3.2.2. 核心模块分析

3.2.2.1. 上下文管理 (server-context.cpp)

// Inference context manager: holds the model, the llama context, the sampler and
// the batch/token buffers used by the server.
//
// Changes relative to the previous version:
//   - the raw pointers are initialised to nullptr instead of being left indeterminate;
//   - get_context() and get_model_name() are declared, because
//     server_http::handle_chat_completions() calls both through its server_context pointer.
class server_context {
private:
    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
    common_sampler * sampler = nullptr;

    // Synchronisation primitives
    std::mutex ctx_mutex;
    std::condition_variable cv;

    // Batch and token buffers
    llama_batch batch;
    std::vector<llama_token> input_tokens;
    std::vector<llama_token> output_tokens;

public:
    bool initialize(const common_params & params);

    // Accessors used by the HTTP layer
    llama_context * get_context() const { return ctx; }
    std::string get_model_name() const;

    // Batch operations
    bool add_sequence(const std::vector<llama_token> & tokens, int seq_id);
    bool generate_batch();

    // State queries
    std::vector<llama_token> get_last_output(int seq_id);
    float get_perplexity();

    // Reset operations
    void clear();
    void reset_sequence(int seq_id);
};

3.2.2.2. 任务队列 (server-queue.cpp)

// Task queue manager.
//
// Changes relative to the previous version:
//   - server_task is now public: it is the parameter/return type of the public
//     enqueue_task()/get_next_task()/task_completed() methods, and server_http
//     names server_task::INFERENCE, neither of which works with a private type;
//   - the scalar fields of server_task have default values.
class server_queue {
public:
    struct server_task {
        int id = 0;
        enum task_type {
            TOKENIZE,
            INFERENCE,
            EMBEDDING,
            RERANK
        } type;

        std::vector<llama_token> tokens;
        std::function<void(const server_result &)> callback;
        std::chrono::steady_clock::time_point created_at;

        // Scheduling metadata.
        // NOTE(review): task_queue below is a std::queue (FIFO); nothing in this
        // class declaration orders tasks by priority - confirm how it is used.
        int priority = 0;
        int retry_count = 0;
    };

private:
    std::queue<server_task> task_queue;
    std::mutex queue_mutex;
    std::condition_variable queue_cv;

    // Counters
    std::atomic<int> tasks_completed{0};
    std::atomic<int> tasks_failed{0};

public:
    int enqueue_task(server_task task);
    server_task get_next_task();

    void task_completed(const server_task & task, bool success);

    // Statistics
    int get_queue_size();
    double get_average_wait_time();
};

3.2.2.3. HTTP处理 (server-http.cpp)

// HTTP front end: owns the httplib server and declares the route setup and
// request handlers for the OpenAI-style, Anthropic-style and management endpoints.
//
// Change relative to the previous version: the non-owning queue/ctx pointers are
// initialised to nullptr. initialize() only receives host and port, so they would
// otherwise be indeterminate until assigned elsewhere.
class server_http {
private:
    httplib::Server http_server;
    server_queue * queue = nullptr;  // non-owning
    server_context * ctx = nullptr;  // non-owning

    // Route registration
    void setup_openai_routes();
    void setup_anthropic_routes();
    void setup_management_routes();

    // Response builders
    nlohmann::json create_openai_response(const server_result & result);
    nlohmann::json create_error_response(const std::string & message,
                                        const std::string & type = "invalid_request_error");

public:
    void initialize(const std::string & host, int port);
    void start();
    void stop();

private:
    // OpenAI-compatible endpoints
    void handle_completions(const httplib::Request & req, httplib::Response & res);
    void handle_chat_completions(const httplib::Request & req, httplib::Response & res);
    void handle_embeddings(const httplib::Request & req, httplib::Response & res);
    void handle_models(const httplib::Request & req, httplib::Response & res);

    // Anthropic-compatible endpoint
    void handle_messages(const httplib::Request & req, httplib::Response & res);

    // Management endpoints
    void handle_stats(const httplib::Request & req, httplib::Response & res);
    void handle_health(const httplib::Request & req, httplib::Response & res);
};

3.2.2.4. OpenAI API兼容性实现

// Chat completion endpoint.
//
// Parses the JSON request body, builds the prompt, enqueues one INFERENCE task and
// then either streams the result as server-sent events ("stream": true) or
// returns a single JSON document. Any exception is reported as HTTP 400 with an
// error body built by create_error_response().
//
// Fixes relative to the previous version:
//   - `finished ? "stop" : nullptr` has type const char* and assigned a null
//     pointer into the JSON value; finish_reason is now set to a JSON null or
//     "stop" explicitly, on the choice object;
//   - the streaming callback captured `res` by reference but the handler
//     returned right after enqueueing, leaving the callback with a dangling
//     reference; the handler now waits until the task reports completion;
//   - the non-streaming path polled a plain bool written by another thread
//     (a data race) with a sleep loop; it now waits on a condition variable;
//   - a missing "messages" field now raises (-> 400) instead of silently
//     inserting a null into the request;
//   - all chunks of one stream share a single id, and the token counts for the
//     usage block are computed once.
//
// NOTE(review): as before, the handler blocks until the task reports
// `finished`; if a task can be dropped without a final callback this never returns.
void server_http::handle_chat_completions(const httplib::Request & req,
                                         httplib::Response & res) {
    try {
        nlohmann::json request = nlohmann::json::parse(req.body);

        // Request parameters
        std::vector<chat_message> messages = parse_chat_messages(request.at("messages"));
        // NOTE(review): temperature and max_tokens are validated here but are not
        // forwarded to the task - confirm how sampling parameters reach the worker.
        float temperature = request.value("temperature", 0.7f);
        int max_tokens = request.value("max_tokens", 100);
        (void) temperature;
        (void) max_tokens;
        bool stream = request.value("stream", false);

        // Prompt
        std::string prompt = build_chat_prompt(messages, request);

        // Completion signalling between the task callback and this handler.
        std::mutex done_mutex;
        std::condition_variable done_cv;
        bool done = false;

        const auto signal_done = [&]() {
            // Notify while holding the lock so the waiter cannot destroy the
            // condition variable before notify_all() has returned.
            std::lock_guard<std::mutex> lock(done_mutex);
            done = true;
            done_cv.notify_all();
        };
        const auto wait_until_done = [&]() {
            std::unique_lock<std::mutex> lock(done_mutex);
            done_cv.wait(lock, [&]() { return done; });
        };

        if (stream) {
            // Streaming response (server-sent events)
            res.set_header("Content-Type", "text/event-stream");
            res.set_header("Cache-Control", "no-cache");

            const auto stream_id = generate_uuid();

            queue->enqueue_task({
                .id = generate_task_id(),
                .type = server_task::INFERENCE,
                .tokens = common_tokenize(ctx->get_context(), prompt, true),
                .callback = [&](const server_result & result) {
                    nlohmann::json delta;
                    delta["content"] = result.text;

                    nlohmann::json choice;
                    choice["index"] = 0;
                    choice["delta"] = delta;
                    if (result.finished) {
                        choice["finish_reason"] = "stop";
                    } else {
                        choice["finish_reason"] = nullptr;
                    }

                    nlohmann::json chunk_response;
                    chunk_response["id"] = stream_id;
                    chunk_response["object"] = "chat.completion.chunk";
                    chunk_response["created"] = std::time(nullptr);
                    chunk_response["choices"] = nlohmann::json::array();
                    chunk_response["choices"].push_back(choice);

                    std::string sse_data = "data: " + chunk_response.dump() + "\n\n";
                    res.write(sse_data);

                    if (result.finished) {
                        res.write("data: [DONE]\n\n");
                        signal_done();
                    }
                }
            });

            // Keep `res` and the locals above alive until the last chunk is written.
            wait_until_done();

        } else {
            // Single-document response
            std::string result_text;

            queue->enqueue_task({
                .id = generate_task_id(),
                .type = server_task::INFERENCE,
                .tokens = common_tokenize(ctx->get_context(), prompt, true),
                .callback = [&](const server_result & result) {
                    {
                        std::lock_guard<std::mutex> lock(done_mutex);
                        result_text += result.text;
                    }
                    if (result.finished) {
                        signal_done();
                    }
                }
            });

            wait_until_done();

            // Build the response document.
            nlohmann::json response;
            response["id"] = generate_uuid();
            response["object"] = "chat.completion";
            response["created"] = std::time(nullptr);
            response["model"] = ctx->get_model_name();
            response["choices"] = nlohmann::json::array();

            nlohmann::json choice;
            choice["index"] = 0;
            choice["message"] = {
                {"role", "assistant"},
                {"content", result_text}
            };
            choice["finish_reason"] = "stop";
            response["choices"].push_back(choice);

            const auto prompt_tokens = count_tokens(prompt);
            const auto completion_tokens = count_tokens(result_text);
            response["usage"] = {
                {"prompt_tokens", prompt_tokens},
                {"completion_tokens", completion_tokens},
                {"total_tokens", prompt_tokens + completion_tokens}
            };

            res.set_content(response.dump(), "application/json");
        }

    } catch (const std::exception & e) {
        nlohmann::json error = create_error_response(e.what());
        res.status = 400;
        res.set_content(error.dump(), "application/json");
    }
}

3.3 性能基准测试 (llama-bench/)

3.3.1. 测试类型和指标

// Benchmark configuration
struct bench_params {
    enum test_type {
        TEST_PP,      // prompt processing
        TEST_TG,      // text generation
        TEST_PG,      // mixed prompt + generation
        TEST_ALL
    } test_type{TEST_ALL};

    // Workload
    int32_t pp{512};          // prompt length
    int32_t tg{128};          // generation length
    int32_t pl{1};            // parallelism
    int32_t nr{10};           // repetitions

    // System configuration
    int32_t n_gpu_layers{-1};  // number of GPU layers
    int32_t n_threads{-1};     // number of threads
    std::string numa{"auto"};  // NUMA setting

    // Output
    enum output_format {
        FORMAT_MARKDOWN,
        FORMAT_CSV,
        FORMAT_JSON,
        FORMAT_JSONL,
        FORMAT_SQL
    } output_format{FORMAT_MARKDOWN};

    std::string output_file;
    bool verbose{false};
};

// Result of one benchmark run (or of an aggregation over several runs).
//
// Every numeric member has a zero default: the test functions fill in only some
// of the fields, while the printers read others (e.g. std_ms), which previously
// meant reading indeterminate values.
struct bench_result {
    // Timing
    double avg_ms = 0.0;
    double std_ms = 0.0;
    double min_ms = 0.0;
    double max_ms = 0.0;

    // Throughput
    double t_s = 0.0;      // tokens/sec
    double t_min_s = 0.0;  // minimum tokens/sec
    double t_max_s = 0.0;  // maximum tokens/sec

    // Test description
    int32_t pp = 0;        // prompt length
    int32_t tg = 0;        // generation length
    int32_t pl = 0;        // parallelism
    int32_t nr = 0;        // repetitions

    // System description
    std::string model_name;
    std::string gpu_name;
    std::string cpu_info;
};

3.3.2. 核心测试算法

// Prompt processing test (PP).
//
// Decodes a random prompt of params.pp tokens params.nr times, timing each
// llama_decode() call, and returns the aggregate of the per-run results.
//
// @param params  benchmark configuration (pp and nr are used)
// @param model   unused here
// @param ctx     context to run the decode on
//
// Fixes relative to the previous version:
//   - the KV cache is cleared before every repetition, so each run decodes the
//     prompt from an empty cache instead of on top of the previous run;
//   - a failed decode is reported and skipped instead of being timed as a success;
//   - per-run results are value-initialised, so fields that are not set here are
//     zero rather than indeterminate;
//   - throughput is not computed from a zero duration.
bench_result run_prompt_test(const bench_params & params,
                            llama_model * model, llama_context * ctx) {
    (void) model;

    std::vector<bench_result> results;
    if (params.nr > 0) {
        results.reserve(params.nr);
    }

    for (int i = 0; i < params.nr; ++i) {
        // Random input
        std::vector<llama_token> tokens = generate_random_tokens(params.pp);

        llama_kv_cache_clear(ctx);

        // Batch: only the last token is flagged
        llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
        for (size_t j = 0; j < tokens.size(); ++j) {
            llama_batch_add_seq(batch, tokens[j], j, j == tokens.size() - 1);
        }

        // Timed section
        auto start_time = std::chrono::high_resolution_clock::now();

        const auto status = llama_decode(ctx, batch);

        auto end_time = std::chrono::high_resolution_clock::now();

        llama_batch_free(batch);

        if (status != 0) {
            fprintf(stderr, "%s : failed to decode\n", __func__);
            continue;
        }

        auto duration = std::chrono::duration<double, std::milli>(end_time - start_time);
        const double elapsed_ms = duration.count();

        // Record the run
        bench_result result{};
        result.pp = params.pp;
        result.tg = 0;
        result.pl = 1;
        result.nr = 1;
        result.avg_ms = elapsed_ms;
        result.t_s = elapsed_ms > 0.0 ? params.pp / (elapsed_ms / 1000.0) : 0.0;

        results.push_back(result);
    }

    // Aggregate
    return aggregate_results(results);
}

// Text generation test (TG).
//
// For each of params.nr repetitions: decodes a fixed random prompt of params.pp
// tokens (untimed), then times params.tg greedy sample + single-token decode
// steps. Returns the aggregate of the per-run results.
//
// @param params  benchmark configuration (pp, tg and nr are used)
// @param model   unused here
// @param ctx     context to run on
//
// Fixes relative to the previous version:
//   - the KV cache is cleared at the start of every repetition (it was cleared
//     only once, so later repetitions decoded over the previous run's cache);
//   - the single-token batch is allocated once per repetition outside the timed
//     region, instead of being allocated and freed inside every timed step;
//   - decode failures are detected; a failed repetition is reported and not recorded;
//   - per-run results are value-initialised and throughput is not computed from
//     a zero duration;
//   - the unused vector of generated tokens is gone.
bench_result run_generation_test(const bench_params & params,
                               llama_model * model, llama_context * ctx) {
    (void) model;

    std::vector<bench_result> results;
    if (params.nr > 0) {
        results.reserve(params.nr);
    }

    // Prompt shared by all repetitions
    std::vector<llama_token> input_tokens = generate_random_tokens(params.pp);

    for (int i = 0; i < params.nr; ++i) {
        llama_kv_cache_clear(ctx);

        // Untimed prompt decode
        llama_batch batch = llama_batch_init(input_tokens.size(), 0, 1);
        for (size_t j = 0; j < input_tokens.size(); ++j) {
            llama_batch_add_seq(batch, input_tokens[j], j, j == input_tokens.size() - 1);
        }
        bool ok = llama_decode(ctx, batch) == 0;
        llama_batch_free(batch);

        // Reusable single-token batch for the generation steps
        llama_batch step = llama_batch_init(1, 0, 1);

        // Timed generation loop
        auto start_time = std::chrono::high_resolution_clock::now();

        for (int j = 0; ok && j < params.tg; ++j) {
            // Greedy sampling
            llama_token next_token = llama_sample_token_greedy(ctx, llama_get_logits(ctx));

            // Decode the sampled token at the next position
            llama_batch_clear(step);
            llama_batch_add_seq(step, next_token, input_tokens.size() + j, true);
            ok = llama_decode(ctx, step) == 0;
        }

        auto end_time = std::chrono::high_resolution_clock::now();

        llama_batch_free(step);

        if (!ok) {
            fprintf(stderr, "%s : failed to decode\n", __func__);
            continue;
        }

        auto duration = std::chrono::duration<double, std::milli>(end_time - start_time);
        const double elapsed_ms = duration.count();

        // Record the run
        bench_result result{};
        result.pp = params.pp;
        result.tg = params.tg;
        result.pl = 1;
        result.nr = 1;
        result.avg_ms = elapsed_ms;
        result.t_s = elapsed_ms > 0.0 ? params.tg / (elapsed_ms / 1000.0) : 0.0;

        results.push_back(result);
    }

    return aggregate_results(results);
}

3.3.3. 输出格式化

// Markdown格式输出
void print_markdown_results(const std::vector<bench_result> & results) {
    printf("| Model | PP | TG | PL | Avg(ms) | Std(ms) | T/s |\n");
    printf("|-------|----|----|----|---------|---------|------|\n");
    
    for (const auto & result : results) {
        printf("| %s | %d | %d | %d | %.1f | %.1f | %.1f |\n",
               result.model_name.c_str(),
               result.pp,
               result.tg,
               result.pl,
               result.avg_ms,
               result.std_ms,
               result.t_s);
    }
}

// CSV格式输出
void print_csv_results(const std::vector<bench_result> & results) {
    printf("model,pp,tg,pl,avg_ms,std_ms,t_s\n");
    
    for (const auto & result : results) {
        printf("%s,%d,%d,%d,%.3f,%.3f,%.3f\n",
               result.model_name.c_str(),
               result.pp,
               result.tg,
               result.pl,
               result.avg_ms,
               result.std_ms,
               result.t_s);
    }
}

// JSON格式输出
void print_json_results(const std::vector<bench_result> & results) {
    nlohmann::json json_results = nlohmann::json::array();
    
    for (const auto & result : results) {
        nlohmann::json json_result;
        json_result["model"] = result.model_name;
        json_result["pp"] = result.pp;
        json_result["tg"] = result.tg;
        json_result["pl"] = result.pl;
        json_result["avg_ms"] = result.avg_ms;
        json_result["std_ms"] = result.std_ms;
        json_result["t_s"] = result.t_s;
        json_results.push_back(json_result);
    }
    
    printf("%s\n", json_results.dump(2).c_str());
}

3.4 重要性矩阵计算 (imatrix/)

3.4.1. 算法原理

重要性矩阵 (Importance Matrix) 是量化优化中的关键技术：它在校准数据上统计各张量的激活值，并以这些统计量指导量化过程，从而在保持模型性能的同时实现更高的压缩比。

// 重要性矩阵计算器
class importance_matrix_calculator {
private:
    struct tensor_stats {
        std::string name;
        size_t size;
        double sum_sq = 0.0;      // Σ(Act²)
        double sum = 0.0;         // Σ(Act)
        double sum_sq_sq = 0.0;    // Σ(Act⁴)
        size_t count = 0;          // 活跃元素数量
        
        // 统计指标
        double mean() const { return sum / count; }
        double variance() const { return (sum_sq_sq - sum * sum / count) / count; }
        double std_dev() const { return sqrt(variance()); }
        double entropy() const;
        double cosine_similarity(const tensor_stats & other) const;
    };
    
    std::map<std::string, tensor_stats> tensor_statistics_;
    std::mutex stats_mutex_;
    
    // 处理参数
    size_t chunk_size_ = 1024;      // 块大小
    size_t output_frequency_ = 100;   // 输出频率
    size_t save_frequency_ = 1000;    // 保存频率
    bool parse_special_tokens_ = true;  // 是否解析特殊token
    
    // 文件处理
    std::ifstream input_file_;
    std::ofstream output_file_;
    std::string output_path_;
    
public:
    bool initialize(const std::string & model_path, const std::string & data_path);
    bool process_data();
    void save_importance_matrix(const std::string & output_path);
    void print_statistics();
    
private:
    bool process_chunk(const std::vector<std::string> & texts);
    void update_tensor_stats(const std::vector<llama_token> & tokens);
    void collect_activations(llama_context * ctx, const std::string & layer_name);
};

3.4.2. 核心计算流程

// 数据处理主循环
bool importance_matrix_calculator::process_data() {
    std::vector<std::string> chunk;
    std::string line;
    size_t lines_processed = 0;
    
    while (std::getline(input_file_, line)) {
        chunk.push_back(line);
        lines_processed++;
        
        // 按块处理
        if (chunk.size() >= chunk_size_) {
            if (!process_chunk(chunk)) {
                return false;
            }
            
            chunk.clear();
            
            // 定期输出进度
            if (lines_processed % output_frequency_ == 0) {
                printf("Processed %zu lines...\n", lines_processed);
                print_current_stats();
            }
            
            // 定期保存结果
            if (lines_processed % save_frequency_ == 0) {
                save_importance_matrix(output_path_ + ".temp");
            }
        }
    }
    
    // 处理剩余数据
    if (!chunk.empty()) {
        if (!process_chunk(chunk)) {
            return false;
        }
    }
    
    return true;
}

// Processes one chunk of text lines. Each non-empty line is tokenized, decoded
// from a cleared KV cache as a single batch, and then activations are collected
// for every layer. Lines that tokenize to nothing are skipped.
//
// @return false on the first decode failure, true otherwise.
bool importance_matrix_calculator::process_chunk(const std::vector<std::string> & texts) {
    for (size_t t = 0; t < texts.size(); ++t) {
        // Tokenize
        auto tokens = common_tokenize(ctx_, texts[t], parse_special_tokens_);
        if (tokens.empty()) {
            continue;
        }

        // Start from an empty KV cache
        llama_kv_cache_clear(ctx_);

        // Build the batch; only the final token is flagged
        const size_t n_tokens = tokens.size();
        llama_batch batch = llama_batch_init(n_tokens, 0, 1);
        for (size_t pos = 0; pos < n_tokens; ++pos) {
            const bool is_last = (pos + 1 == n_tokens);
            llama_batch_add_seq(batch, tokens[pos], pos, is_last);
        }

        // Forward pass
        const bool decoded = (llama_decode(ctx_, batch) == 0);
        if (decoded) {
            // Collect activations layer by layer
            for (int layer = 0; layer < llama_n_layer(model_); ++layer) {
                collect_activations(ctx_, "layer_" + std::to_string(layer));
            }
        } else {
            fprintf(stderr, "Failed to decode\n");
        }

        // The batch is released on both paths.
        llama_batch_free(batch);

        if (!decoded) {
            return false;
        }
    }

    return true;
}

// Accumulates statistics over one layer's activations into
// tensor_statistics_[layer_name], under stats_mutex_. Does nothing when no
// activation buffer is available for the layer.
//
// Fixes relative to the previous version:
//   - sum_sq_sq accumulated act^5; the field holds the sum of fourth powers, so
//     it now accumulates act^4;
//   - squares and fourth powers are computed in double rather than float, so
//     large activations do not lose precision or overflow before being added.
void importance_matrix_calculator::collect_activations(llama_context * ctx,
                                                   const std::string & layer_name) {

    // Fetch the intermediate activations
    const float * activations = llama_get_layer_output(ctx, layer_name);
    if (!activations) return;

    size_t layer_size = llama_get_layer_size(ctx, layer_name);

    std::lock_guard<std::mutex> lock(stats_mutex_);
    tensor_stats & stats = tensor_statistics_[layer_name];

    // First time this layer is seen: record its identity.
    if (stats.size == 0) {
        stats.name = layer_name;
        stats.size = layer_size;
    }

    // Update the running sums
    for (size_t i = 0; i < layer_size; ++i) {
        const double act = activations[i];
        const double act_sq = act * act;

        stats.sum += act;
        stats.sum_sq += act_sq;
        stats.sum_sq_sq += act_sq * act_sq;

        // Count elements whose magnitude exceeds the activity threshold.
        if (fabs(act) > 1e-6f) {
            stats.count++;
        }
    }
}

3.4.3. 统计分析和输出

// 计算熵
double tensor_stats::entropy() const {
    if (count == 0) return 0.0;
    
    double p = (double)count / size;
    if (p <= 0.0 || p >= 1.0) return 0.0;
    
    return -p * log2(p) - (1.0 - p) * log2(1.0 - p);
}

// 计算余弦相似度
double tensor_stats::cosine_similarity(const tensor_stats & other) const {
    if (count == 0 || other.count == 0) return 0.0;
    
    // 简化的相似度计算（基于统计特征）
    double mean1 = mean();
    double mean2 = other.mean();
    double std1 = std_dev();
    double std2 = other.std_dev();
    
    if (std1 == 0.0 || std2 == 0.0) return 0.0;
    
    // 相关系数作为相似度的近似
    return (mean1 * mean2) / (std1 * std2);
}

// Serialises all collected tensor statistics to a JSON file at output_path.
// Reports an error on stderr when the file cannot be opened or written.
//
// Fixes relative to the previous version:
//   - the stream state is checked after writing and closing, so a failed write
//     is reported instead of being announced as a successful save;
//   - importance_score is 0 for a tensor with size == 0 instead of dividing by zero.
void importance_matrix_calculator::save_importance_matrix(const std::string & output_path) {
    nlohmann::json imatrix;
    imatrix["version"] = "1.0";
    imatrix["created_at"] = std::time(nullptr);
    imatrix["tensor_stats"] = nlohmann::json::object();

    for (const auto & [name, stats] : tensor_statistics_) {
        nlohmann::json tensor_stats_json;
        tensor_stats_json["name"] = stats.name;
        tensor_stats_json["size"] = stats.size;
        tensor_stats_json["sum_sq"] = stats.sum_sq;
        tensor_stats_json["sum"] = stats.sum;
        tensor_stats_json["sum_sq_sq"] = stats.sum_sq_sq;
        tensor_stats_json["count"] = stats.count;
        tensor_stats_json["mean"] = stats.mean();
        tensor_stats_json["std_dev"] = stats.std_dev();
        tensor_stats_json["entropy"] = stats.entropy();
        tensor_stats_json["importance_score"] =
            stats.size > 0 ? stats.sum_sq / stats.size : 0.0;

        imatrix["tensor_stats"][name] = tensor_stats_json;
    }

    std::ofstream file(output_path);
    if (!file.is_open()) {
        fprintf(stderr, "Failed to open output file: %s\n", output_path.c_str());
        return;
    }

    file << imatrix.dump(2);
    file.close();

    // close() flushes; a failure in either the write or the flush sets failbit.
    if (file.fail()) {
        fprintf(stderr, "Failed to write output file: %s\n", output_path.c_str());
        return;
    }

    printf("Importance matrix saved to: %s\n", output_path.c_str());
}

// 打印统计信息
void importance_matrix_calculator::print_statistics() {
    printf("\n=== Importance Matrix Statistics ===\n");
    printf("%-30s %10s %10s %10s %10s %10s\n", 
           "Tensor", "Size", "SumSq", "Mean", "StdDev", "Entropy");
    printf("%-30s %10s %10s %10s %10s %10s\n", 
           "------", "----", "-----", "----", "------", "------");
    
    for (const auto & [name, stats] : tensor_statistics_) {
        printf("%-30s %10zu %10.2e %10.4f %10.4f %10.4f\n",
               name.substr(0, 30).c_str(),
               stats.size,
               stats.sum_sq,
               stats.mean(),
               stats.std_dev(),
               stats.entropy());
    }
    
    printf("\nTotal tensors: %zu\n", tensor_statistics_.size());
}

3.5 GGUF 文件分割 (gguf-split/)

3.5.1. 功能定位

GGUF Split 工具用于处理大型模型文件，提供分割和合并功能，解决存储、传输和部署中的实际问题。

// Configuration for the split / merge / info operations
struct split_params {
    enum operation_type {
        OP_SPLIT,
        OP_MERGE,
        OP_INFO
    } operation{OP_SPLIT};

    // How a file is split
    enum split_mode {
        MODE_TENSORS,      // split by tensor count
        MODE_SIZE          // split by file size
    } mode{MODE_TENSORS};

    // Split settings
    size_t n_split_tensors{128};     // tensors per output file
    size_t n_bytes_split{0};         // bytes per output file
    std::string input_path;          // input file path
    std::string output_prefix;       // output file prefix

    // Merge settings
    std::vector<std::string> input_files;  // input file list
    std::string output_path;               // output file path

    // Options
    bool dry_run{false};                // preview only
    bool no_tensor_first_split{false};  // first file carries no tensors
};

3.5.2. 核心算法实现

// 分割主函数
bool split_gguf_file(const split_params & params) {
    // 1. 验证输入文件
    gguf_context * ctx = gguf_init_from_file(params.input_path.c_str());
    if (!ctx) {
        fprintf(stderr, "Failed to load GGUF file: %s\n", params.input_path.c_str());
        return false;
    }
    
    // 2. 获取文件信息
    int n_tensors = gguf_get_n_tensors(ctx);
    int n_kv = gguf_get_n_kv(ctx);
    
    printf("GGUF file info:\n");
    printf("  Tensors: %d\n", n_tensors);
    printf("  KV pairs: %d\n", n_kv);
    printf("  Total size: %zu bytes\n", gguf_get_file_size(ctx));
    
    if (params.dry_run) {
        // 预览模式：只显示分割计划
        print_split_plan(ctx, params);
        gguf_free(ctx);
        return true;
    }
    
    // 3. 计算分割计划
    std::vector<split_plan> split_plans = calculate_split_plans(ctx, params);
    
    printf("Split plan: %zu files\n", split_plans.size());
    for (size_t i = 0; i < split_plans.size(); ++i) {
        printf("  File %zu: tensors %d-%d (%d tensors, %zu bytes)\n",
               i + 1,
               split_plans[i].start_tensor,
               split_plans[i].end_tensor,
               split_plans[i].tensor_count,
               split_plans[i].estimated_size);
    }
    
    // 4. 执行分割
    for (size_t i = 0; i < split_plans.size(); ++i) {
        std::string output_path = params.output_prefix + ".part" + std::to_string(i + 1) + ".gguf";
        
        if (!create_split_file(ctx, split_plans[i], output_path)) {
            fprintf(stderr, "Failed to create split file %zu\n", i + 1);
            gguf_free(ctx);
            return false;
        }
        
        printf("Created: %s\n", output_path.c_str());
    }
    
    gguf_free(ctx);
    return true;
}

// Plan for one output file of a split: the tensor range [start_tensor, end_tensor),
// the number of tensors in it, its estimated size in bytes, and the tensor indices.
//
// The scalar members now default to zero; previously a default-constructed plan
// held indeterminate values until every field was assigned.
struct split_plan {
    int start_tensor = 0;
    int end_tensor = 0;
    int tensor_count = 0;
    size_t estimated_size = 0;
    std::vector<int> tensor_indices;
};

// Computes the list of split plans (one per output file) for a GGUF file.
//
// Parameters:
//   ctx    - source GGUF context whose tensors are distributed over the files
//   params - split settings; params.mode selects the strategy:
//            MODE_TENSORS: at most params.n_split_tensors tensors per file
//            MODE_SIZE:    start a new file once adding the next tensor would
//                          exceed params.n_bytes_split bytes (a single tensor
//                          larger than the limit still gets its own file)
//
// Returns the plans in tensor order. The result is empty when there are no
// tensors, when the mode is neither of the two above, or when
// params.n_split_tensors is not positive (an error is printed in that case).
std::vector<split_plan> calculate_split_plans(gguf_context * ctx, 
                                            const split_params & params) {
    std::vector<split_plan> plans;
    const int n_tensors = gguf_get_n_tensors(ctx);

    // Size in bytes of the tensor at the given index.
    const auto tensor_bytes = [ctx](int index) -> size_t {
        const char * name = gguf_get_tensor_name(ctx, index);
        const ggml_tensor * tensor = gguf_get_tensor(ctx, name);
        return ggml_nbytes(tensor);
    };

    // Appends a plan covering the half-open tensor range [first, last).
    const auto append_plan = [&plans](int first, int last, size_t bytes) {
        plans.emplace_back();
        split_plan & plan = plans.back();
        plan.start_tensor = first;
        plan.end_tensor = last;
        plan.tensor_count = last - first;
        plan.estimated_size = bytes;
        plan.tensor_indices.reserve(static_cast<size_t>(last - first));
        for (int j = first; j < last; ++j) {
            plan.tensor_indices.push_back(j);
        }
    };

    if (params.mode == split_params::MODE_TENSORS) {
        // Split by tensor count.
        const int tensors_per_file = params.n_split_tensors;
        if (tensors_per_file <= 0) {
            // A non-positive count would otherwise divide by zero / never advance.
            fprintf(stderr, "Invalid number of tensors per split: %d\n", tensors_per_file);
            return plans;
        }

        // Advance by range instead of multiplying a file index, so that very
        // large tensors_per_file values cannot overflow int.
        for (int first = 0; first < n_tensors; ) {
            const int last = first + std::min(tensors_per_file, n_tensors - first);

            size_t bytes = 0;
            if (first == 0 && !params.no_tensor_first_split) {
                // The first file also carries the metadata.
                // NOTE(review): this adds the size reported for the whole
                // input file, which looks like an over-estimate of the
                // metadata size - confirm what gguf_get_file_size returns.
                bytes = gguf_get_file_size(ctx);
            }
            for (int j = first; j < last; ++j) {
                bytes += tensor_bytes(j);
            }

            append_plan(first, last, bytes);
            first = last;
        }

    } else if (params.mode == split_params::MODE_SIZE) {
        // Split by accumulated tensor size.
        const size_t target_size = params.n_bytes_split;
        size_t current_size = 0;
        int start_tensor = 0;

        for (int i = 0; i < n_tensors; ++i) {
            const size_t tensor_size = tensor_bytes(i);

            // Close the current file before it would exceed the limit, but
            // never emit a file without tensors (start_tensor < i).
            if (current_size + tensor_size > target_size && start_tensor < i) {
                append_plan(start_tensor, i, current_size);
                start_tensor = i;
                current_size = 0;
            }

            current_size += tensor_size;
        }

        // Whatever is left over forms the last file.
        if (start_tensor < n_tensors) {
            append_plan(start_tensor, n_tensors, current_size);
        }
    }

    return plans;
}

// Writes one split file.
//
// Parameters:
//   src_ctx     - source GGUF context to copy from
//   plan        - which tensors (plan.tensor_indices) belong to this file
//   output_path - path of the file to write
//
// Key/value metadata is copied only into the file whose plan starts at
// tensor 0. Tensor data is copied into buffers owned by this function; the
// buffers stay alive until the file has been written and are released
// automatically afterwards.
//
// Returns true if the file was written, false on any failure.
bool create_split_file(gguf_context * src_ctx, const split_plan & plan, 
                       const std::string & output_path) {
    // Create the destination GGUF context.
    gguf_context * dst_ctx = gguf_init_empty();
    if (!dst_ctx) {
        fprintf(stderr, "Failed to create GGUF context\n");
        return false;
    }

    // Copy the metadata (first file only).
    if (plan.start_tensor == 0) {
        const int n_kv = gguf_get_n_kv(src_ctx);
        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(src_ctx, i);
            enum gguf_type type = gguf_get_kv_type(src_ctx, i);

            switch (type) {
                case GGUF_TYPE_UINT8:
                    {
                        uint8_t value;
                        gguf_get_kv_uint8(src_ctx, key, &value);
                        gguf_set_kv_uint8(dst_ctx, key, value);
                    }
                    break;
                case GGUF_TYPE_INT32:
                    {
                        int32_t value;
                        gguf_get_kv_int32(src_ctx, key, &value);
                        gguf_set_kv_int32(dst_ctx, key, value);
                    }
                    break;
                case GGUF_TYPE_STRING:
                    {
                        const char * value;
                        gguf_get_kv_str(src_ctx, key, &value);
                        gguf_set_kv_str(dst_ctx, key, value);
                    }
                    break;
                // ... handling of the remaining value types is elided here
                default:
                    // NOTE(review): values of any other type are currently
                    // not copied - add the missing cases before relying on
                    // the output metadata being complete.
                    break;
            }
        }
    }

    // Backing storage for the copied tensor data. Moving an inner vector when
    // the outer one grows does not move its heap block, so the data pointers
    // handed to the tensors stay valid; reserve() avoids the moves anyway.
    std::vector<std::vector<char>> tensor_data;
    tensor_data.reserve(plan.tensor_indices.size());

    // Copy the tensors.
    for (int tensor_idx : plan.tensor_indices) {
        const char * name = gguf_get_tensor_name(src_ctx, tensor_idx);
        const ggml_tensor * src_tensor = gguf_get_tensor(src_ctx, name);
        if (!src_tensor) {
            fprintf(stderr, "Failed to look up tensor %d\n", tensor_idx);
            gguf_free(dst_ctx);
            return false;
        }

        // Create the destination tensor and give it its own copy of the data.
        ggml_tensor * dst_tensor = ggml_dup_tensor(src_tensor);
        if (!dst_tensor) {
            fprintf(stderr, "Failed to duplicate tensor %d\n", tensor_idx);
            gguf_free(dst_ctx);
            return false;
        }

        const size_t n_bytes = ggml_nbytes(src_tensor);
        const char * src_bytes = static_cast<const char *>(src_tensor->data);
        tensor_data.emplace_back(src_bytes, src_bytes + n_bytes);
        dst_tensor->data = tensor_data.back().data();

        gguf_add_tensor(dst_ctx, dst_tensor);
    }

    // Write the file while the copied tensor data is still alive.
    const bool success = gguf_write_to_file(dst_ctx, output_path.c_str());

    // Clean up; tensor_data is released when it goes out of scope.
    gguf_free(dst_ctx);

    return success;
}

3.6 困惑度计算 (perplexity/)

3.6.1. 困惑度评估原理

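困惑度 (Perplexity) 是评估语言模型质量的重要指标，反映模型对测试数据的预测能力。对长度为 $N$ 的 token 序列，其定义为 $\mathrm{PPL} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log p(x_i \mid x_{<i})\right)$，即平均负对数似然的指数；数值越低表示模型对测试数据的预测越准确。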

// Perplexity calculator: evaluates a model on a text dataset and accumulates
// log-probability statistics, optionally recording the logits of every
// evaluated position.
class perplexity_calculator {
private:
    // Running statistics. Every sample added by update_stats() is the mean
    // log-probability of one sequence, so all averages are per sequence.
    struct perplexity_stats {
        double log_prob_sum = 0.0;     // sum of the per-sequence log-probabilities
        double log_prob_sum_sq = 0.0;  // sum of their squares
        size_t token_count = 0;        // (approximate) number of tokens seen
        size_t seq_count = 0;          // number of sequences seen

        // Perplexity = exp(-mean log-probability). The mean is taken over
        // sequences, consistent with std_deviation(). Returns 0.0 when no
        // sequence has been recorded yet.
        double perplexity() const {
            if (seq_count == 0) return 0.0;
            return std::exp(-log_prob_sum / seq_count);
        }

        // Sample standard deviation of the per-sequence log-probabilities.
        // Returns 0.0 with fewer than two samples.
        double std_deviation() const {
            if (seq_count <= 1) return 0.0;
            const double mean = log_prob_sum / seq_count;
            const double variance =
                (log_prob_sum_sq - 2.0 * mean * log_prob_sum + seq_count * mean * mean) / (seq_count - 1);
            // Rounding can push a tiny variance slightly below zero.
            return variance > 0.0 ? std::sqrt(variance) : 0.0;
        }
    };

    llama_model * model_ = nullptr;
    llama_context * ctx_ = nullptr;
    perplexity_stats stats_;

    // Configuration
    int n_ctx_ = 0;                   // context size (0 until configured)
    int n_batch_ = 512;               // batch size
    int n_gpu_layers_ = -1;           // number of GPU layers
    bool use_colors_ = false;         // colored progress output
    bool compute_pp_ = true;          // whether to compute perplexity
    bool logit_recording_ = false;    // whether to record logits

    // Recorded data: one logits vector and one target token per evaluated
    // position, stored in matching order.
    std::vector<std::vector<float>> recorded_logits_;
    std::vector<llama_token> recorded_tokens_;

public:
    bool initialize(const std::string & model_path, const common_params & params);
    bool compute_perplexity(const std::string & dataset_path);
    void print_results();
    void save_logits(const std::string & output_path);

    // Read-only access to the recorded data (empty unless logit recording
    // was enabled during evaluation).
    const std::vector<std::vector<float>> & get_recorded_logits() const { return recorded_logits_; }
    const std::vector<llama_token> & get_recorded_tokens() const { return recorded_tokens_; }

private:
    bool process_text_file(const std::string & file_path);
    double compute_sequence_logprob(const std::vector<llama_token> & tokens);
    void update_stats(double logprob);
};

3.6.2. 核心计算算法

// 计算单个序列的对数概率
double perplexity_calculator::compute_sequence_logprob(const std::vector<llama_token> & tokens) {
    if (tokens.size() < 2) return 0.0;
    
    double total_logprob = 0.0;
    size_t eval_count = 0;
    
    // 滑动窗口计算
    for (size_t i = 0; i < tokens.size() - 1; ++i) {
        // 构建输入序列（最多n_ctx_个token）
        size_t start_pos = (i >= n_ctx_) ? i - n_ctx_ + 1 : 0;
        size_t context_len = i - start_pos + 1;
        
        std::vector<llama_token> context(tokens.begin() + start_pos, 
                                      tokens.begin() + i + 1);
        
        // 批处理设置
        llama_batch batch = llama_batch_init(context_len, 0, 1);
        for (size_t j = 0; j < context_len; ++j) {
            llama_batch_add_seq(batch, context[j], j, j == context_len - 1);
        }
        
        // 前向传播
        if (llama_decode(ctx_, batch) != 0) {
            fprintf(stderr, "Failed to decode sequence\n");
            llama_batch_free(batch);
            return -INFINITY;
        }
        
        // 获取logits
        const float * logits = llama_get_logits(ctx_);
        int vocab_size = llama_n_vocab(model_);
        
        // 计算下一个token的概率
        int next_token = tokens[i + 1];
        float max_logit = logits[0];
        
        // 数值稳定性：减去最大值
        for (int k = 1; k < vocab_size; ++k) {
            max_logit = std::max(max_logit, logits[k]);
        }
        
        // 计算logits的指数和
        double sum_exp = 0.0;
        for (int k = 0; k < vocab_size; ++k) {
            sum_exp += exp(logits[k] - max_logit);
        }
        
        // 计算目标token的对数概率
        float target_logit = logits[next_token];
        double logprob = (target_logit - max_logit) - log(sum_exp);
        
        total_logprob += logprob;
        eval_count++;
        
        // 记录logits（如果需要）
        if (logit_recording_) {
            std::vector<float> token_logits(logits, logits + vocab_size);
            recorded_logits_.push_back(token_logits);
            recorded_tokens_.push_back(tokens[i + 1]);
        }
        
        llama_batch_free(batch);
    }
    
    return total_logprob / eval_count;  // 返回平均对数概率
}

// Processes a text file line by line: each line is tokenized, scored with
// compute_sequence_logprob(), and folded into the running statistics.
//
// Lines that tokenize to fewer than two tokens are skipped, because they
// contain nothing to predict and would otherwise be recorded as a sequence
// with log-probability 0. Lines whose evaluation fails are reported and
// skipped as well.
//
// Returns false only if the file cannot be opened.
bool perplexity_calculator::process_text_file(const std::string & file_path) {
    std::ifstream file(file_path);
    if (!file.is_open()) {
        fprintf(stderr, "Failed to open file: %s\n", file_path.c_str());
        return false;
    }

    std::string line;
    size_t line_count = 0;
    const size_t total_lines = count_lines(file_path);

    printf("Processing file: %s\n", file_path.c_str());

    while (std::getline(file, line)) {
        line_count++;

        // Tokenize.
        auto tokens = common_tokenize(ctx_, line, false);
        if (tokens.size() < 2) continue;

        // Score the sequence.
        const double logprob = compute_sequence_logprob(tokens);

        if (logprob == -INFINITY) {
            fprintf(stderr, "Failed to compute logprob for line %zu\n", line_count);
            continue;
        }

        // Update the running statistics.
        update_stats(logprob);

        // Show progress.
        if (use_colors_) {
            // Guard the percentage against an empty line count.
            const double percent =
                total_lines > 0 ? 100.0 * line_count / total_lines : 0.0;
            printf("\033[2K\r"); // clear the current terminal line
            printf("\033[36mProcessing:\033[0m %zu/%zu lines (%.1f%%) - ", 
                   line_count, total_lines, percent);
            printf("\033[32mCurrent PPL: %.2f\033[0m", stats_.perplexity());
            fflush(stdout);
        } else if (line_count % 100 == 0) {
            printf("Processed %zu lines, current PPL: %.2f\n", 
                   line_count, stats_.perplexity());
        }
    }

    if (use_colors_) {
        printf("\n"); // finish the in-place progress line
    }

    file.close();
    return true;
}

// Folds one sample into the running statistics: the sample and its square
// are added to the sums and the sequence counter is incremented.
void perplexity_calculator::update_stats(double logprob) {
    stats_.seq_count += 1;

    // Token counting is approximate: every sample is counted as n_ctx_
    // tokens, whatever the real length of the sequence was.
    stats_.token_count += n_ctx_;

    const double squared = logprob * logprob;
    stats_.log_prob_sum_sq += squared;
    stats_.log_prob_sum += logprob;
}

3.6.3. 高级分析功能

// Computes the mean KL divergence KL(P || Q) between the next-token
// distributions of two models on the same test data, where P comes from
// model 1 and Q from model 2.
//
// For each recorded position the divergence is the sum over the whole
// vocabulary, KL(P || Q) = sum_k P(k) * log(P(k) / Q(k)); the result is the
// average over all positions recorded by both models.
//
// Returns NaN (after printing an error) if a model cannot be initialized or
// evaluated, if no logits were recorded, or if the two models produce logit
// vectors of different sizes.
//
// NOTE(review): this relies on both calculators recording their logits
// during compute_perplexity(); no call enabling that is visible here -
// confirm that recording is switched on.
double compute_kl_divergence(const std::string & model1_path,
                             const std::string & model2_path,
                             const std::string & test_data) {
    // Floor for Q(k) so that a probability that underflowed to zero gives a
    // large finite penalty instead of an infinite one.
    const double min_prob = 1e-300;

    perplexity_calculator calc1, calc2;

    // Initialize both models.
    common_params params1, params2;
    if (!calc1.initialize(model1_path, params1) ||
        !calc2.initialize(model2_path, params2)) {
        fprintf(stderr, "Failed to initialize models for KL divergence\n");
        return NAN;
    }

    // Evaluate both models on the test data.
    if (!calc1.compute_perplexity(test_data) ||
        !calc2.compute_perplexity(test_data)) {
        fprintf(stderr, "Failed to evaluate models for KL divergence\n");
        return NAN;
    }

    // Fetch the recorded logits.
    const auto & logits1 = calc1.get_recorded_logits();
    const auto & logits2 = calc2.get_recorded_logits();

    const size_t n_positions = std::min(logits1.size(), logits2.size());
    if (n_positions == 0) {
        fprintf(stderr, "No recorded logits available for KL divergence\n");
        return NAN;
    }

    double kl_sum = 0.0;

    for (size_t i = 0; i < n_positions; ++i) {
        // Convert both logit vectors to probability distributions.
        const std::vector<double> prob1 = logits_to_probabilities(logits1[i]);
        const std::vector<double> prob2 = logits_to_probabilities(logits2[i]);

        if (prob1.size() != prob2.size()) {
            fprintf(stderr, "Vocabulary size mismatch at position %zu\n", i);
            return NAN;
        }

        // Sum over the full vocabulary. Terms with P(k) == 0 contribute
        // nothing (the limit of p * log p is 0 as p approaches 0).
        double kl_position = 0.0;
        for (size_t k = 0; k < prob1.size(); ++k) {
            const double p = prob1[k];
            if (p <= 0.0) continue;
            const double q = std::max(prob2[k], min_prob);
            kl_position += p * (std::log(p) - std::log(q));
        }

        kl_sum += kl_position;
    }

    return kl_sum / n_positions;
}

// Converts a vector of logits into a probability distribution (softmax).
//
// Parameters:
//   logits - raw scores, one per vocabulary entry
//
// Returns a vector of the same length whose entries sum to 1. An empty input
// yields an empty result.
std::vector<double> logits_to_probabilities(const std::vector<float> & logits) {
    std::vector<double> probs(logits.size());

    // Nothing to normalize; also keeps max_element from returning end().
    if (logits.empty()) {
        return probs;
    }

    // Numerical stability: shift by the maximum so the largest exponent is 0.
    const double max_logit = *std::max_element(logits.begin(), logits.end());
    double sum_exp = 0.0;

    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(static_cast<double>(logits[i]) - max_logit);
        sum_exp += probs[i];
    }

    // Normalize. sum_exp >= 1 because the maximum entry contributes exp(0).
    for (double & p : probs) {
        p /= sum_exp;
    }

    return probs;
}

// Prints a detailed report: the overall statistics first and then, if logits
// were recorded, order statistics of the per-token perplexity (1 / p(target)).
//
// NOTE(review): this reads stats_, recorded_logits_ and recorded_tokens_, so
// it is presumably meant to be a member of perplexity_calculator - confirm.
void print_detailed_analysis() {
    const double mean_logprob = stats_.log_prob_sum / stats_.seq_count;

    printf("\n=== Detailed Perplexity Analysis ===\n");
    printf("Total sequences: %zu\n", stats_.seq_count);
    printf("Total tokens: %zu\n", stats_.token_count);
    printf("Average log probability: %.6f\n", mean_logprob);
    printf("Log probability std dev: %.6f\n", stats_.std_deviation());
    printf("Perplexity: %.2f\n", stats_.perplexity());

    // The quantile analysis needs recorded logits.
    if (recorded_logits_.empty()) {
        return;
    }

    std::vector<double> token_ppl;
    const size_t n_recorded = recorded_tokens_.size();

    for (size_t idx = 0; idx != n_recorded; ++idx) {
        const int target = recorded_tokens_[idx];
        const std::vector<double> probs = logits_to_probabilities(recorded_logits_[idx]);
        const double p = probs[target];

        // Skip targets with (almost) zero probability.
        if (!(p > 1e-10)) {
            continue;
        }
        token_ppl.push_back(1.0 / p);
    }

    if (token_ppl.empty()) {
        return;
    }

    std::sort(token_ppl.begin(), token_ppl.end());
    const size_t count = token_ppl.size();

    printf("Token-level perplexity statistics:\n");
    printf("  Median: %.2f\n", token_ppl[count / 2]);
    printf("  25th percentile: %.2f\n", token_ppl[count / 4]);
    printf("  75th percentile: %.2f\n", token_ppl[3 * count / 4]);
    printf("  Min: %.2f\n", token_ppl.front());
    printf("  Max: %.2f\n", token_ppl.back());
}

4. 使用场景和最佳实践

4.1 工具链工作流

# Model-processing workflow
# 1. Convert the Hugging Face checkpoint to GGUF
python3 convert_hf_to_gguf.py model_repo --outfile models/model.gguf

# 2. Compute the importance matrix (used to guide quantization)
./llama-imatrix -m models/model.gguf -f training_data.txt -o imatrix.dat

# 3. Quantize the model (options must precede the positional arguments)
./llama-quantize --imatrix imatrix.dat \
    models/model.gguf models/model-q4_0.gguf q4_0

# 4. Split the large model file into shards of at most 2G each;
#    the shards are written as model-split-00001-of-0000N.gguf, ...
./llama-gguf-split --split --split-max-size 2G \
    models/model-q4_0.gguf model-split

# 5. Quality evaluation (perplexity): pass the first shard to -m,
#    the remaining shards are picked up automatically
./llama-perplexity -m model-split-00001-of-*.gguf \
    -f test_data.txt

# 6. Benchmark: 512 prompt tokens, 128 generated tokens, 10 repetitions
./llama-bench -m model-split-00001-of-*.gguf \
    -p 512 -n 128 -r 10

# 7. Deploy the server
./llama-server -m model-split-00001-of-*.gguf \
    --host 0.0.0.0 --port 8080 \
    --n-gpu-layers 99

4.2 性能优化策略

# 1. Choose a suitable quantization format
# Q4_K_M: balances quality and size
# Q5_K_M: higher quality
# Q8_0: highest quality

./llama-quantize model.gguf model-q4_k_m.gguf q4_k_m
./llama-quantize model.gguf model-q5_k_m.gguf q5_k_m

# NOTE(review): option names differ between llama.cpp releases; check each
# tool's --help output before copying the commands below.
# Comments are kept above the commands: a comment after a trailing "\" would
# end the command at that line.

# 2. GPU offload configuration
#    --n-gpu-layers 99      offload up to 99 layers to the GPU
#    --main-gpu 0           primary GPU device
#    --tensor-split 0.8,0.2 distribution across multiple GPUs
#    --gpu-layers-draft 32  GPU layers for the draft model
./llama-cli -m model.gguf \
    --n-gpu-layers 99 \
    --main-gpu 0 \
    --tensor-split 0.8,0.2 \
    --gpu-layers-draft 32

# 3. Memory and performance tuning
#    --ctx-size 4096        context size
#    --batch-size 512       logical batch size
#    --ubatch-size 512      physical (micro) batch size
#    --memory-f16           use half precision
#    --mlock                lock the model in memory
#    --no-mmap              disable memory mapping
#    --cache-type-k q4_0    quantize the K part of the KV cache
#    --parallel 2           number of parallel decoding slots
./llama-server -m model.gguf \
    --ctx-size 4096 \
    --batch-size 512 \
    --ubatch-size 512 \
    --memory-f16 \
    --mlock \
    --no-mmap \
    --cache-type-k q4_0 \
    --parallel 2

4.3 监控和调试

# NOTE(review): option names differ between llama.cpp releases; check each
# tool's --help output before copying the commands below.
# Comments are kept above the commands: a comment after a trailing "\" would
# end the command at that line.

# 1. Detailed performance monitoring
#    --log-disable          disable logging
#    --metrics              enable the performance metrics
#    --slots                slot monitoring
#    --pid-file ...         PID file
./llama-server -m model.gguf \
    --log-disable \
    --metrics \
    --slots \
    --pid-file /tmp/llama.pid

# 2. Performance analysis
#    --pp 1024 --tg 256     a larger test
#    --output-format json   JSON output
#    --output-file ...      save the results
#    --verbose              verbose information
./llama-bench -m model.gguf \
    --pp 1024 --tg 256 \
    --output-format json \
    --output-file benchmark.json \
    --verbose

# 3. Memory usage analysis
#    --verbose-prompt       detailed prompt information
#    --print-special        print special tokens
#    --color                colored output
#    --dump-kv-cache ...    export the KV cache
./llama-cli -m model.gguf \
    --verbose-prompt \
    --print-special \
    --color \
    --dump-kv-cache cache.bin

5. 技术特色和创新

5.1 统一的命令行接口设计

// Description of one command-line option, as consumed by print_help().
//
// Every member has a default, so a partially filled entry never carries an
// indeterminate pointer or flag.
struct common_cmd_arg {
    const char * short_arg = nullptr;  // short form
    const char * long_arg = nullptr;   // long form
    const char * help_text = nullptr;  // description shown in the help output
    const char * env_var = nullptr;    // environment variable name, or nullptr if none
    bool is_boolean = false;           // true for a flag that takes no value
};

// Prints the usage text generated from a list of option descriptions.
//
// Each option is printed as "  <short>, <long>[ <value>]    <help>" followed
// by " (env: <name>)" when an environment variable is attached. An option
// that has only one of the two forms is printed with that form alone, and an
// entry with neither form is skipped; a missing help text prints as empty.
void print_help(const std::vector<common_cmd_arg> & args) {
    printf("Usage: llama-cli [options]\n\n");
    printf("Options:\n");

    for (const auto & arg : args) {
        // A null pointer must never reach a "%s" conversion.
        const bool has_short = arg.short_arg != nullptr && arg.short_arg[0] != '\0';
        const bool has_long = arg.long_arg != nullptr && arg.long_arg[0] != '\0';

        if (!has_short && !has_long) {
            continue;  // nothing to name this option by
        }

        if (has_short && has_long) {
            printf("  %s, %s", arg.short_arg, arg.long_arg);
        } else {
            printf("  %s", has_short ? arg.short_arg : arg.long_arg);
        }

        if (!arg.is_boolean) {
            printf(" <value>");
        }

        printf("    %s", arg.help_text ? arg.help_text : "");

        if (arg.env_var) {
            printf(" (env: %s)", arg.env_var);
        }

        printf("\n");
    }
}

5.2 异常安全的资源管理

// RAII资源管理
class llama_resource_guard {
private:
    llama_model * model_ = nullptr;
    llama_context * ctx_ = nullptr;
    ggml_context * ggml_ctx_ = nullptr;
    
public:
    ~llama_resource_guard() {
        if (ctx_) llama_free(ctx_);
        if (model_) llama_free(model_);
        if (ggml_ctx_) ggml_free(ggml_ctx_);
    }
    
    void set_model(llama_model * model) { model_ = model; }
    void set_context(llama_context * ctx) { ctx_ = ctx; }
    void set_ggml_context(ggml_context * ctx) { ggml_ctx_ = ctx; }
    
    // 释放所有权
    llama_model* release_model() { 
        llama_model* tmp = model_; 
        model_ = nullptr; 
        return tmp; 
    }
};

5.3 高性能I/O处理

// Buffered line reader for large files: the file is read in fixed-size
// chunks and lines are cut out of the buffer at '\n' characters.
class fast_file_reader {
private:
    std::ifstream file_;
    size_t buffer_size_;
    std::unique_ptr<char[]> buffer_;
    size_t buffer_pos_ = 0;    // next unread byte in the buffer
    size_t buffer_valid_ = 0;  // number of valid bytes in the buffer
    
public:
    // buffer_size is the chunk size in bytes; a size of 0 is raised to 1,
    // since a zero-sized buffer could never read any data.
    fast_file_reader(size_t buffer_size = 64 * 1024) 
        : buffer_size_(buffer_size > 0 ? buffer_size : 1),
          buffer_(new char[buffer_size_]) {}
    
    // Opens the file at `path` in binary mode, closing any previously opened
    // file and discarding buffered data. Returns true on success.
    bool open(const std::string & path) {
        if (file_.is_open()) {
            file_.close();
        }
        file_.clear();  // drop EOF/fail flags left over from earlier reads
        buffer_pos_ = 0;
        buffer_valid_ = 0;

        file_.open(path, std::ios::binary);
        return file_.is_open();
    }
    
    // Reads the next line into `line`, without the terminating '\n'.
    // Returns false once the end of the file is reached and no more data is
    // available; a final line without a trailing '\n' is still returned.
    bool read_line(std::string & line) {
        line.clear();
        
        while (true) {
            // Refill the buffer once it has been consumed.
            if (buffer_pos_ >= buffer_valid_) {
                file_.read(buffer_.get(), static_cast<std::streamsize>(buffer_size_));
                buffer_valid_ = static_cast<size_t>(file_.gcount());
                buffer_pos_ = 0;
                
                if (buffer_valid_ == 0) {
                    return !line.empty(); // end of file
                }
            }
            
            // Look for the end of the line in the unread part of the buffer.
            char * const begin = buffer_.get() + buffer_pos_;
            char * const end = buffer_.get() + buffer_valid_;
            char * const newline = std::find(begin, end, '\n');
            
            if (newline != end) {
                // Found it: take everything before it and step past it.
                const size_t line_len = static_cast<size_t>(newline - begin);
                line.append(begin, line_len);
                buffer_pos_ += line_len + 1;
                return true;
            }

            // No newline yet: keep the rest of the buffer and read on.
            line.append(begin, static_cast<size_t>(end - begin));
            buffer_pos_ = buffer_valid_;
        }
    }
};