Task Description
An ecological science outreach platform needs to compress its articles intelligently. To improve transmission efficiency, the high-frequency ecological terms in a text are to be counted and then compressed and decompressed using Huffman coding. The task: count how often each high-frequency term occurs in a given text, build the Huffman tree and its code table, compute the compression ratio, and implement compression (encoding) and decompression (decoding) of the text.
There are 12 high-frequency ecological terms: biodiversity, ecosystems, conservation, animals, species, forests, habitats, pollution, climate, change, energy, renewable.
Design an algorithm that counts the occurrences of these terms in a passage of ecological text (case-insensitively, so "Biodiversity" counts as "biodiversity"), generates the corresponding Huffman code table, and computes the compression ratio.
Compression ratio = compressed file size / original file size × 100%, with both sizes measured in bits: each character of the original text occupies 8 bits, and each '0'/'1' symbol of the encoded stream occupies 1 bit.
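As a worked check against the test case below: the sample input contains 52 words (428 letters) plus 51 spaces, 479 characters in all, i.e. 479 × 8 = 3832 bits; the encoded stream is 176 bits, and 176 / 3832 × 100% ≈ 4.59%, which matches the expected output.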
Programming Requirements
Input
A passage of English text, with words separated by spaces or punctuation. The total number of letters and spaces in the input is guaranteed to be under 2,000,000.
Output
A total of 15 lines. The first 12 lines give each term's occurrence count and code, sorted by count from high to low. Line 13 is the compression ratio as a percentage with two decimal places, line 14 is the text compressed with the generated codes, and line 15 is the result of decompressing line 14.
Test Description
The platform will test the code you write with the case below:
Test input:
climate change pollution species biodiversity pollution energy animals pollution forests change habitats pollution biodiversity ecosystems pollution change conservation species pollution climate habitats animals energy pollution species forests change biodiversity ecosystems pollution animals habitats species conservation forests pollution energy change pollution biodiversity animals species pollution habitats forests ecosystems conservation energy change pollution renewable
Expected output:
pollution:12 01
change:6 001
species:5 000
biodiversity:4 1011
animals:4 1100
forests:4 1101
habitats:4 1110
energy:4 1111
ecosystems:3 1000
conservation:3 1001
climate:2 10101
renewable:1 10100
4.59%
10101001010001011011111110001110100111100110111000010011001000011010111101100111101000110100110111000011100111000010011101011111001011011110000001111011011000100111110010110100
climate change pollution species biodiversity pollution energy animals pollution forests change habitats pollution biodiversity ecosystems pollution change conservation species pollution climate habitats animals energy pollution species forests change biodiversity ecosystems pollution animals habitats species conservation forests pollution energy change pollution biodiversity animals species pollution habitats forests ecosystems conservation energy change pollution renewable

Reference Solution (C)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAX_TERMS 12
#define MAX_TEXT_LENGTH 2000000
#define MAX_CODE_LENGTH 100
#define MAX_COMPRESSED_LENGTH (MAX_TEXT_LENGTH * 10) // generous buffer for the encoded stream
// Huffman tree node
typedef struct {
char word[50]; // ecological term
int weight; // weight (occurrence count)
int parent, lch, rch; // parent and left/right child indices
char code[MAX_CODE_LENGTH]; // Huffman code
} HTNode, *HuffmanTree;
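// Layout: the tree lives in a 1-based array; leaves occupy slots 1..n in
// term order, merged internal nodes occupy n+1..2n-1, the root is HT[m]
// with m = 2n-1, and a parent/child index of 0 means "no link".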
// Globals
HuffmanTree HT; // the Huffman tree (1-based array)
int m; // total number of tree nodes
// Predefined ecological terms
const char *ecological_terms[MAX_TERMS] = {
"biodiversity", "ecosystems", "conservation", "animals",
"species", "forests", "habitats", "pollution",
"climate", "change", "energy", "renewable"
};
// Function declarations
void initializeTerms(HuffmanTree terms[]);
int findTermIndex(const char *word, HuffmanTree terms[]);
void countFrequencies(const char *text, HuffmanTree terms[]);
void buildHuffmanTree(int n, HuffmanTree terms[]);
void selectTwoMin(int range, int *s1, int *s2);
void generateHuffmanCodes(int n);
void sortTermsByFrequency(HuffmanTree terms[]);
double calculateCompressionRatio(const char *text, const char *compressed);
void compressText(const char *text, char *compressed, HuffmanTree terms[]);
void decompressText(const char *compressed, char *decompressed);
// Initialize the term array
void initializeTerms(HuffmanTree terms[]) {
for (int i = 0; i < MAX_TERMS; i++) {
terms[i] = (HTNode*)malloc(sizeof(HTNode));
strcpy(terms[i]->word, ecological_terms[i]);
terms[i]->weight = 0;
terms[i]->parent = terms[i]->lch = terms[i]->rch = 0;
terms[i]->code[0] = '\0';
}
}
// Find a term's index (case-insensitive)
int findTermIndex(const char *word, HuffmanTree terms[]) {
char lowerWord[50];
strcpy(lowerWord, word);
for (int i = 0; lowerWord[i]; i++) {
lowerWord[i] = tolower(lowerWord[i]);
}
for (int i = 0; i < MAX_TERMS; i++) {
if (strcmp(lowerWord, terms[i]->word) == 0) {
return i;
}
}
return -1;
}
// Count term frequencies (punctuation is stripped from each token)
void countFrequencies(const char *text, HuffmanTree terms[]) {
char *textCopy = strdup(text);
char *token = strtok(textCopy, " "); // split on spaces
while (token != NULL) {
// keep letters only, lowercased
char cleanWord[50] = {0};
int idx = 0;
for (int i = 0; token[i]; i++) {
if (isalpha((unsigned char)token[i]) && idx < 49) { // bounded: cleanWord holds 49 chars + NUL
cleanWord[idx++] = tolower((unsigned char)token[i]);
}
}
cleanWord[idx] = '\0';
if (idx > 0) { // count only non-empty words
int index = findTermIndex(cleanWord, terms);
if (index != -1) {
terms[index]->weight++;
}
}
token = strtok(NULL, " ");
}
free(textCopy);
}
// Select the two unmerged nodes with the smallest weights
void selectTwoMin(int range, int *s1, int *s2) {
int min1 = 0x7fffffff, min2 = 0x7fffffff;
*s1 = *s2 = -1;
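// Ties are broken toward the smaller index (the strict < keeps the first
// minimum found), so the generated code table is deterministic.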
for (int i = 1; i <= range; i++) {
if (HT[i].parent == 0) { // consider only nodes not yet merged
if (HT[i].weight < min1) {
min2 = min1;
*s2 = *s1;
min1 = HT[i].weight;
*s1 = i;
} else if (HT[i].weight < min2) {
min2 = HT[i].weight;
*s2 = i;
}
}
}
}
// Build the Huffman tree
void buildHuffmanTree(int n, HuffmanTree terms[]) {
if (n <= 1) return;
m = 2 * n - 1; // total node count
HT = (HuffmanTree)malloc((m + 1) * sizeof(HTNode)); // 1-based indexing
// initialize the leaves (1..n)
for (int i = 1; i <= n; i++) {
strcpy(HT[i].word, terms[i-1]->word);
HT[i].weight = terms[i-1]->weight;
HT[i].parent = HT[i].lch = HT[i].rch = 0;
HT[i].code[0] = '\0';
}
// initialize the internal nodes (n+1..m)
for (int i = n + 1; i <= m; i++) {
HT[i].weight = 0;
HT[i].parent = HT[i].lch = HT[i].rch = 0;
HT[i].word[0] = '\0'; // internal nodes carry no term
HT[i].code[0] = '\0';
}
// repeatedly merge the two lightest unmerged nodes
for (int i = n + 1; i <= m; i++) {
int s1, s2;
selectTwoMin(i - 1, &s1, &s2); // pick the two smallest among nodes 1..i-1
HT[s1].parent = i;
HT[s2].parent = i;
HT[i].lch = s1;
HT[i].rch = s2;
HT[i].weight = HT[s1].weight + HT[s2].weight;
}
}
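// With the 12 terms here, m = 2 * 12 - 1 = 23, so the root is HT[23];
// decompressText later starts its walk from this index (kept in m).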
// Generate each leaf's code by walking from the leaf up to the root
void generateHuffmanCodes(int n) {
for (int i = 1; i <= n; i++) {
int c = i;
int f = HT[i].parent;
int codeIdx = MAX_CODE_LENGTH - 2; // fill the code buffer from the back
HT[i].code[MAX_CODE_LENGTH - 1] = '\0';
while (f != 0) {
if (HT[f].lch == c) {
HT[i].code[codeIdx--] = '0'; // left edge contributes '0'
} else {
HT[i].code[codeIdx--] = '1'; // right edge contributes '1'
}
c = f;
f = HT[f].parent;
}
// shift the code to the start of the buffer; the regions overlap,
// so memmove is required (strcpy on overlapping buffers is undefined)
memmove(HT[i].code, &HT[i].code[codeIdx + 1], MAX_CODE_LENGTH - codeIdx - 1);
}
}
// Sort terms by frequency, from high to low
void sortTermsByFrequency(HuffmanTree terms[]) {
for (int i = 0; i < MAX_TERMS - 1; i++) {
for (int j = 0; j < MAX_TERMS - i - 1; j++) {
if (terms[j]->weight < terms[j + 1]->weight) {
HTNode *temp = terms[j];
terms[j] = terms[j + 1];
terms[j + 1] = temp;
}
}
}
}
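// Bubble sort with a strict < comparison is stable: terms with equal
// frequency keep their original list order, which matches the tie order
// (biodiversity, animals, forests, habitats, energy) in the expected output.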
// Compression ratio: compressed size / original size, both in bits
// (each original character is 8 bits; each '0'/'1' code symbol is 1 bit)
double calculateCompressionRatio(const char *text, const char *compressed) {
long originalBits = (long)strlen(text) * 8;
long compressedBits = (long)strlen(compressed);
// avoid division by zero
if (originalBits == 0) return 0.0;
return (double)compressedBits / originalBits * 100.0;
}
// Compress: append each term's code; non-term words are appended verbatim (no separators)
void compressText(const char *text, char *compressed, HuffmanTree terms[]) {
compressed[0] = '\0';
char *textCopy = strdup(text);
char *token = strtok(textCopy, " ");
while (token != NULL) {
// keep letters only, lowercased (for the term lookup)
char cleanWord[50] = {0};
int idx = 0;
for (int i = 0; token[i]; i++) {
if (isalpha((unsigned char)token[i]) && idx < 49) { // bounded, matching countFrequencies
cleanWord[idx++] = tolower((unsigned char)token[i]);
}
}
cleanWord[idx] = '\0';
int index = -1;
if (idx > 0) {
index = findTermIndex(cleanWord, terms);
}
if (index != -1) {
// term: append its Huffman code
strcat(compressed, terms[index]->code);
} else {
// non-term: append the original token unchanged
strcat(compressed, token);
}
token = strtok(NULL, " ");
}
free(textCopy);
}
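// Note: in the sample input every word is one of the 12 terms, so the
// compressed output is a pure 0/1 string with no raw words embedded in it.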
// Decompress: walk the Huffman tree bit by bit, emitting a term at each
// leaf; runs of non-code characters (non-term words) are copied verbatim
void decompressText(const char *compressed, char *decompressed) {
decompressed[0] = '\0';
int len = strlen(compressed);
int i = 0;
while (i < len) {
if (compressed[i] == '0' || compressed[i] == '1') {
// decode exactly one codeword: walk from the root down to a leaf
int current = m; // root index
while (i < len && HT[current].lch != 0) { // internal nodes always have two children
current = (compressed[i] == '0') ? HT[current].lch : HT[current].rch;
i++;
}
strcat(decompressed, HT[current].word);
strcat(decompressed, " "); // restore the word separator
} else {
// copy a run of non-code characters through unchanged
int j = i;
while (j < len && compressed[j] != '0' && compressed[j] != '1') {
j++;
}
strncat(decompressed, compressed + i, j - i);
strcat(decompressed, " ");
i = j;
}
}
// trim the trailing space
int decompLen = strlen(decompressed);
if (decompLen > 0 && decompressed[decompLen - 1] == ' ') {
decompressed[decompLen - 1] = '\0';
}
}

#include "1-基于哈夫曼树的生态科普文本压缩和解压.h"
// Main program
int main() {
// static storage: at these sizes the buffers would overflow the stack
static char text[MAX_TEXT_LENGTH];
static char compressed[MAX_COMPRESSED_LENGTH];
static char decompressed[MAX_TEXT_LENGTH];
HuffmanTree terms[MAX_TERMS];
// read the input text
if (fgets(text, sizeof(text), stdin) == NULL) {
return 1;
}
// strip the trailing newline
text[strcspn(text, "\n")] = 0;
// initialize the terms and count their frequencies
initializeTerms(terms);
countFrequencies(text, terms);
// build the Huffman tree and generate the codes
buildHuffmanTree(MAX_TERMS, terms);
generateHuffmanCodes(MAX_TERMS);
// copy each leaf's code back into the terms array (HT[i+1] is terms[i])
for (int i = 0; i < MAX_TERMS; i++) {
strcpy(terms[i]->code, HT[i+1].code);
}
// sort by frequency and print the frequency/code table
sortTermsByFrequency(terms);
for (int i = 0; i < MAX_TERMS; i++) {
printf("%s:%d %s\n", terms[i]->word, terms[i]->weight, terms[i]->code);
}
// compress first, then report the compression ratio it achieves
compressText(text, compressed, terms);
printf("%.2f%%\n", calculateCompressionRatio(text, compressed));
printf("%s\n", compressed);
// decompress and print the restored text
decompressText(compressed, decompressed);
printf("%s\n", decompressed);
// release memory
free(HT);
for (int i = 0; i < MAX_TERMS; i++) {
free(terms[i]);
}
return 0;
}
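To try the program outside the platform, a minimal sketch (assuming gcc; the file names huffman.c and input.txt are illustrative, and the platform-specific #include of the task header must be removed or the header provided locally):

gcc -o huffman huffman.c
./huffman < input.txt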