【问题描述】
编写一程序采用Huffman编码对一个正文文件进行压缩。具体压缩方法如下:
- 对正文文件中字符(换行字符'\n'除外,不统计)按出现次数(即频率)进行统计
- 依据字符频率生成相应的Huffman树(未出现的字符不生成)
- 依据Huffman树生成相应字符的Huffman编码
- 依据字符Huffman编码压缩文件(即按照Huffman编码依次输出源文件字符)。
说明: - 只对文件中出现的字符生成Huffman码,注意:一定不要处理\n,即不要为其生成Huffman码。
- 采用ASCII码值为0的字符作为压缩文件的结束符(即可将其出现次数设为1来参与编码).
- 在生成Huffman树时,初始在对字符频率权重进行(由小至大)排序时,频率相同的字符ASCII编码值小的在前;新生成的权重节点插入到有序权重序列中时,出现相同权重时,插入到其后(采用稳定排序)。
- 遍历Huffman树生成字符Huffman码时,左边为0右边为1。
- 源文件是文本文件,字符采用ASCII编码,每个字符占8位;而采用Huffman编码后,高频字符编码长度较短(小于8位),因此最后输出时需要使用C语言中的位运算将字符Huffman码依次输出到每个字节中。
【输入形式】对当前目录下文件input.txt进行压缩。
【输出形式】将压缩后结果输出到文件output.txt中,同时将压缩结果用十六进制形式(printf("%x",…))输出到屏幕上,以便检查和查看结果。
【样例输入1】
若当前目录下input.txt中内容如下:aaabbc
【样例输出1】15f0
同时程序将压缩结果输出到文件output.txt中。
【样例说明】输入文件中字符的频率为:a为3,b为2,c为1,此外,\0字符将作为压缩文件的结束标志,其出现次数设为1。因此,采用Huffman码生成方法,它们的Huffman编码分别为:
a : 0
b : 10
c : 111
\0 : 110
因此,最终文件压缩结果(按位)为:0001010111110000
将上述结果按字节按十六进制输出到屏幕上则为15f0(即00010101 11110000的十六进制表示)。
说明:采用Huffman码输出字符序列长度为:1+1+1+2+2+3+3=13(位),由于C语言中输出的最小单位为字节(8位),因此,最后补了三个位0,压缩后实际输出为2个字节。由于文本文件是按ASCII来解释的,因此,以文本方式打开压缩文件将显示乱码(最好用二进制文件查看器来看)。
【样例输入2】
若当前目录下input.txt中内容如下:do not spend all that you have.do not sleep as long as you want.
【样例输出2】ea3169146ce9eee6cff4b2a93fe1a5d462d21d9a87c0eb2f3eb2a9cfe6cae
同时程序将压缩结果输出到文件output.txt中。
代码说明
#define _CRT_SECURE_NO_WARNINGS
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
// One node type is used both as a cell of the frequency list (linked through
// `next`) and as a node of the Huffman tree (linked through `lchild`/`rchild`).
struct Node {
    int nth;                // occurrence count of the symbol (tree weight)
    char chn;               // the symbol; merged tree nodes are tagged with '~'
    // Huffman code as a NUL-terminated string of '0'/'1' characters.
    // A Huffman tree over byte-valued symbols can be up to 255 levels deep,
    // so the buffer holds 255 code bits plus the terminator. The previous
    // 10-byte buffer overflowed as soon as a code reached 10 bits.
    char code[256];
    struct Node* lchild;    // left child  (code bit 0); null in list cells/leaves
    struct Node* rchild;    // right child (code bit 1); null in list cells/leaves
    struct Node* next;      // successor in the singly linked list
}theNode;
// Restores ascending-count order after the count stored in `add` has grown.
// Makes one forward pass from `add` to the tail; whenever a cell holds a
// larger count than its successor, the two cells exchange their payload
// (symbol and count). The links themselves are never rewired.
// (Original author's note: this routine is simplistic and could be improved.)
void Ajustnth(Node* add) {
    for (Node* cur = add; cur->next != nullptr; cur = cur->next) {
        Node* succ = cur->next;
        if (cur->nth <= succ->nth)
            continue;

        // Out of order: swap symbol and count between the two cells.
        const char sym = succ->chn;
        const int cnt = succ->nth;
        succ->chn = cur->chn;
        succ->nth = cur->nth;
        cur->chn = sym;
        cur->nth = cnt;
    }
}
// Looks for `ch` in the list that hangs off the sentinel `head`.
// If a cell for `ch` exists, its count is incremented, the list order is
// re-adjusted starting from that cell, and 1 is returned.
// Returns 0 when the symbol is not in the list yet (the list is unchanged).
int chinList(Node* head, char ch) {
    for (Node* cell = head->next; cell != nullptr; cell = cell->next) {
        if (cell->chn != ch)
            continue;

        cell->nth += 1;
        // The count grew, so the cell may have to move towards the tail.
        Ajustnth(cell);
        return 1;
    }
    return 0;
}
// Records one occurrence of `ch` in the frequency list headed by the
// sentinel `head`.
// If the symbol is already present, chinList() has bumped its count and there
// is nothing more to do. Otherwise a new cell with count 1 is created and
// linked in directly after the sentinel (i.e. at the front of the list).
// Terminates the program if the cell cannot be allocated.
void ListAddch(Node* head, char ch) {
    if (chinList(head, ch))
        return;

    Node* cell = (Node*)malloc(sizeof(Node));
    if (cell == nullptr) {
        // Previously the null pointer was dereferenced right away.
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    cell->chn = ch;
    cell->nth = 1;
    cell->code[0] = '\0';
    cell->lchild = nullptr;
    cell->rchild = nullptr;
    cell->next = head->next;
    head->next = cell;
}
// Orders symbols that share the same count by ascending character value.
// `head` is the sentinel of the frequency list; real cells start at
// head->next. Adjacent cells with equal counts and out-of-order symbols swap
// their symbols, and passes repeat until a full pass makes no swap.
//
// The scan deliberately starts at the first real cell: the sentinel's count
// and symbol carry no data, and the earlier version compared (and could swap)
// them against the first cell.
void AjustchList(Node* head) {
    if (head == nullptr)
        return;

    int swapped;
    do {
        swapped = 0;
        for (Node* cur = head->next; cur != nullptr && cur->next != nullptr; cur = cur->next) {
            Node* succ = cur->next;
            if (cur->nth == succ->nth && cur->chn > succ->chn) {
                // Counts are equal, so exchanging the symbols is enough.
                const char sym = cur->chn;
                cur->chn = succ->chn;
                succ->chn = sym;
                swapped = 1;
            }
        }
    } while (swapped);
}
// Prints every cell after the sentinel `head` as "<symbol> <code>", one per line.
void printListPlus(Node* head) {
    for (Node* cell = head->next; cell != nullptr; cell = cell->next) {
        printf("%c %s\n", cell->chn, cell->code);
    }
}
// Re-inserts `current`, the first cell of a weight-ordered list, at the
// position that matches its weight and returns the new first cell.
//
// `current` is placed after every cell whose weight is less than or equal to
// its own, so among equal weights the moved node comes last (stable insert).
// If no heavier cell exists it becomes the tail.
//
// When `current` is already in place (it has no successor, or the successor
// is heavier) the list is left untouched and `current` itself is returned.
// The earlier version linked the node to itself or returned null there.
Node* moveNode(Node* current) {
    Node* rest = current->next;
    if (rest == nullptr || rest->nth > current->nth)
        return current;

    // Find the last cell that is not heavier than `current`.
    Node* prev = rest;
    while (prev->next != nullptr && prev->next->nth <= current->nth)
        prev = prev->next;

    current->next = prev->next;   // null when `current` becomes the tail
    prev->next = current;
    return rest;
}
// Builds the Huffman tree from a weight-ordered list of cells.
// `head` is the first real cell (not a sentinel). The two lightest cells are
// repeatedly merged under a new parent (left = lightest, right = second
// lightest, symbol '~', weight = sum), and the parent is put back into the
// list at the position matching its weight.
// Returns the root of the tree, or null when `head` is null.
// Terminates the program if a parent node cannot be allocated.
Node* buildTree(Node* head) {
    Node* first = head;
    while (first != nullptr && first->next != nullptr) {
        Node* second = first->next;

        Node* parent = (Node*)malloc(sizeof(Node));
        if (parent == nullptr) {
            perror("malloc");
            exit(EXIT_FAILURE);
        }
        parent->chn = '~';
        parent->nth = first->nth + second->nth;
        parent->code[0] = '\0';
        parent->lchild = first;
        parent->rchild = second;
        parent->next = second->next;

        first = parent;
        // A parent that is strictly lighter than its successor is already in
        // place; otherwise it has to move further down the list.
        if (parent->next != nullptr && parent->next->nth <= parent->nth)
            first = moveNode(parent);
    }
    return first;
}
// Dumps the tree in-order (left subtree, node, right subtree).
// Each node is printed as "<symbol> <count> <code>" on its own line.
void printTreeMPlus(Node* head) {
    if (head == nullptr)
        return;

    printTreeMPlus(head->lchild);

    printf("%c ", head->chn);
    printf("%d ", head->nth);
    printf("%s\n", head->code);

    printTreeMPlus(head->rchild);
}
// Walks the Huffman tree from `head` and assigns every descendant its code:
// a child's code is its parent's code followed by '0' (left) or '1' (right).
// The code already stored in `head` is used as the prefix, so the root is
// expected to carry an empty code. A null `head` is ignored.
//
// The write is bounds-checked against the size of the `code` field; a tree
// deeper than the field can represent stops the program with a message
// instead of overrunning the buffer as the strcpy/strcat version did.
void visitTree(Node* head) {
    if (head == nullptr)
        return;

    const size_t prefixLen = strlen(head->code);
    Node* children[2] = { head->lchild, head->rchild };

    for (int side = 0; side < 2; side++) {
        Node* child = children[side];
        if (child == nullptr)
            continue;

        // Needs room for the prefix, one more bit and the terminator.
        if (prefixLen + 2 > sizeof(child->code)) {
            fprintf(stderr, "visitTree: Huffman code longer than %lu bits\n",
                    (unsigned long)(sizeof(child->code) - 1));
            exit(EXIT_FAILURE);
        }
        memcpy(child->code, head->code, prefixLen);
        child->code[prefixLen] = (side == 0) ? '0' : '1';
        child->code[prefixLen + 1] = '\0';

        visitTree(child);
    }
}
// Threads the leaves of the tree rooted at `head` into a singly linked list,
// visiting the left subtree before the right one.
// `leaflist` is the current tail of that list; each leaf found is hooked onto
// the tail through `next` and its own `next` is reset to null.
// Returns the tail of the list after this subtree has been processed.
Node* getTreeLeaf(Node* head, Node* leaflist) {
    Node* left = head->lchild;
    Node* right = head->rchild;

    if (left == nullptr && right == nullptr) {
        // Leaf: append it and make it the new tail.
        leaflist->next = head;
        head->next = nullptr;
        return head;
    }

    Node* tail = leaflist;
    if (left != nullptr)
        tail = getTreeLeaf(left, tail);
    if (right != nullptr)
        tail = getTreeLeaf(right, tail);
    return tail;
}
// Linear search for the cell holding symbol `ch`, starting at `listhead`
// itself and following `next`. Returns the matching cell, or null when the
// end of the list is reached without a match.
Node* thechCode(char ch, Node* listhead) {
    Node* cell = listhead;
    while (cell != nullptr && cell->chn != ch)
        cell = cell->next;
    return cell;
}
void printTheCode(char* strcmp, Node* encodeList) {
char strtrans[1000] = "";
int max = strlen(strcmp);
//根据数组中的字符,找到其编码,并拼接到strtrans上,得到Huffman编码的01序列
for (int i = 0; i < max + 1; i++) {
Node* getnode = thechCode(strcmp[i], encodeList);
strcat(strtrans, getnode->code);
printf("%s %s\n", strtrans, getnode->code);
}
max = strlen(strtrans);
//根据8位序列进行补0
int n = 8 - max % 8;
if (n != 8) {
for (int i = 0; i < n + 1; i++)
strcat(strtrans, "0");
}
printf("%s\n", strtrans);
//将编码序列分4位进行打印,其中第奇数个0不需要打印
max = strlen(strtrans);
int num[200] = { 0 };
max = max / 4;
for (int i = 0; i < max; i++) {
num[i] = 8 * strtrans[4 * i] + 4 * strtrans[4 * i + 1] + 2 * strtrans[4 * i + 2] + strtrans[4 * i + 3] - 15 * int('0');
if (num[i] != 0)
printf("%x", num[i]);
else if (i % 2 == 1)
printf("%x", num[i]);
}
//将编码写入到文件中
FILE* fpout = fopen("output.txt", "wb");
for (int i = 0; i < max; i++) {
if (num[i] != 0)
fprintf(fpout, "%x", num[i]);
else if (i % 2 == 1)
fprintf(fpout, "%x", num[i]);
}
fclose(fpout);
}
int main() {
Node* nodehead = (Node*)malloc(sizeof(Node));
nodehead->next = nullptr;
nodehead->lchild = nullptr;
nodehead->rchild = nullptr;
FILE* fpin = fopen("article.txt", "r");
//初始化头节点
char strch[200];
int i = 0;
strch[0] = '\0';
char ch;
do {
ch = fgetc(fpin);
if (strch[i] != '\n') {
if (ch == EOF)
ch = '\0';
strch[i++] = ch;
//将字符对应到链表中
ListAddch(nodehead, ch);
}
} while (ch != '\0');
fclose(fpin);
AjustchList(nodehead);
nodehead = buildTree(nodehead->next);
visitTree(nodehead);
Node* leafhead = (Node*)malloc(sizeof(Node));
Node* currentleaf = leafhead;
getTreeLeaf(nodehead, currentleaf);
printListPlus(currentleaf);
printTheCode(strch, leafhead);
printf("\n");
}