Huffman编码文件压缩

最新推荐文章于 2022-07-24 15:32:47 发布

sheng0023

最新推荐文章于 2022-07-24 15:32:47 发布

阅读量591

点赞数

文章标签： c++ 数据结构 visual studio

本文链接：https://blog.youkuaiyun.com/Wang121201/article/details/115080620

版权

该程序实现了对文本文件的Huffman编码压缩，首先统计文件中字符的频率，然后生成Huffman树并为每个字符分配编码。在生成编码后，文件被压缩并以二进制形式写入output.txt，同时以十六进制形式在屏幕上输出压缩结果。程序考虑了ASCII码值为0的字符作为结束符，并确保在字符频率相同时，ASCII值较小的字符优先。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

【问题描述】
编写一程序采用Huffman编码对一个正文文件进行压缩。具体压缩方法如下：

对正文文件中字符(换行字符'\n'除外，不统计)按出现次数（即频率）进行统计
依据字符频率生成相应的Huffman树（未出现的字符不生成）
依据Huffman树生成相应字符的Huffman编码
依据字符Huffman编码压缩文件（即按照Huffman编码依次输出源文件字符）。
说明：
只对文件中出现的字符生成Huffman，注意：一定不要处理\n，即不要为其生成Huffman码。
采用ASCII码值为0的字符作为压缩文件的结束符（即可将其出现次数设为1来参与编码）.
在生成Huffman树时，初始在对字符频率权重进行（由小至大）排序时，频率相同的字符ASCII编码值小的在前；新生成的权重节点插入到有序权重序列中时，出现相同权重时，插入到其后（采用稳定排序）。
遍历Huffman树生成字符Huffman码时，左边为0右边为1。
源文件是文本文件，字符采用ASCII编码，每个字符占8位；而采用Huffman编码后，高频字符编码长度较短（小于8位），因此最后输出时需要使用C语言中的位运算将字符Huffman码依次输出到每个字节中。
【输入形式】对当前目录下文件input.txt进行压缩。
【输出形式】将压缩后结果输出到文件output.txt中，同时将压缩结果用十六进制形式（printf("%x",…)）输出到屏幕上，以便检查和查看结果。
【样例输出1】15f0
同时程序将压缩结果输出到文件output.txt中。
【样例说明】输入文件中字符的频率为：a为3，b为2，c为1，此外，\0字符将作为压缩文件的结束标志，其出现次数设为1。因此，采用Huffman码生成方法，它们的Huffman编码分别为：
a : 0
b : 10
c : 111
\0 : 110
因此，最终文件压缩结果（按位）为：0001010111110000
将上述结果按字节按十六进制输出到屏幕上则为15f0（即00010101 11110000的十六进制表示）。
说明：采用Huffman码输出字符序列长度为：1+1+1+2+2+3+3=13（位），由于C语言中输出的最小单位为字节（8位），因此，最后补了三个位0，压缩后实际输出为2个字节。由于文本文件是按ASCII来解释的，因此，以文本方式打开压缩文件将显示乱码（最好用二进制文件查看器来看）。
【样例输入2】
若当前目录下input.txt中内容如下：do not spend all that you have.do not sleep as long as you want.
【样例输出2】ea3169146ce9eee6cff4b2a93fe1a5d462d21d9a87c0eb2f3eb2a9cfe6cae
同时程序将压缩结果输出到文件output.txt中。

代码说明

#define _CRT_SECURE_NO_WARNINGS
#include<stdio.h>
#include<string.h>
#include<stdlib.h>

// One record serves two roles: an element of the singly linked frequency list
// (via `next`) and a node of the Huffman tree (via `lchild` / `rchild`).
struct Node {
	int nth;                // occurrence count of the character (node weight)
	char chn;               // the character this node stands for
	// Huffman code as a NUL-terminated string of '0'/'1'. A tree over k distinct
	// symbols can be k-1 levels deep and a byte has at most 256 distinct values,
	// so 255 digits plus the terminator is the worst case (assuming 8-bit char).
	char code[256];
	struct Node* lchild;    // left subtree (edge labelled 0)
	struct Node* rchild;    // right subtree (edge labelled 1)
	struct Node* next;      // successor in the linked list
}theNode;                   // global instance; not referenced in the visible code


// Marked by the author as "to be optimised": deliberately simple.
// Performs one forward sweep beginning at `add`. Whenever a node's count is
// larger than its successor's, the two nodes trade character and count (only
// the payload moves, the links stay put), so a count that has just grown
// drifts toward the tail of the list. `add` must not be null.
void Ajustnth(Node* add) {
	for (Node* left = add; left->next; left = left->next) {
		Node* right = left->next;
		if (left->nth <= right->nth)
			continue;
		// Exchange payloads of the two adjacent nodes
		const char heldChar = left->chn;
		const int heldCount = left->nth;
		left->chn = right->chn;
		left->nth = right->nth;
		right->chn = heldChar;
		right->nth = heldCount;
	}
}

// Looks for `ch` among the nodes that follow `head` (head itself is skipped).
// On a hit the node's count is incremented and Ajustnth() is run from that
// node so the grown count can move toward the tail.
// Returns 1 if the character was already present, 0 otherwise.
int chinList(Node* head, char ch) {
	for (Node* entry = head->next; entry != nullptr; entry = entry->next) {
		if (entry->chn != ch)
			continue;
		entry->nth++;
		Ajustnth(entry);
		return 1;
	}
	return 0;
}

// Records one occurrence of `ch`. If chinList() reports that the character is
// already in the list (it bumps the count itself) nothing more is done;
// otherwise a new node with count 1 and an empty code is linked in directly
// after `head`.
void ListAddch(Node* head, char ch) {
	if (chinList(head, ch))
		return;

	Node* fresh = (Node*)malloc(sizeof(Node));
	fresh->nth = 1;
	fresh->chn = ch;
	fresh->code[0] = '\0';
	fresh->lchild = nullptr;
	fresh->rchild = nullptr;
	// Splice in at the front, right behind the head node
	fresh->next = head->next;
	head->next = fresh;
}

// Orders characters that share the same count by ascending character value.
// Repeats adjacent-pair sweeps (bubble sort on the `chn` payload) until a
// sweep makes no exchange. Only pairs with equal `nth` are exchanged, so the
// ordering by count is left untouched.
//
// `head` is the list's sentinel node: its own nth/chn carry no data (main
// never initialises them), so the sweep starts at head->next rather than at
// head itself. A null or empty list is a no-op.
void AjustchList(Node* head) {
	if (!head || !head->next)
		return;

	int exchanged = 1;
	while (exchanged) {
		exchanged = 0;
		for (Node* cur = head->next; cur->next; cur = cur->next) {
			Node* nxt = cur->next;
			if (cur->nth == nxt->nth && cur->chn > nxt->chn) {
				char ch = cur->chn;
				cur->chn = nxt->chn;
				nxt->chn = ch;
				exchanged = 1;
			}
		}
	}
}

// Prints one line per node following `head` (head itself is skipped):
// the node's character, a fixed run of spaces, then its code string.
void printListPlus(Node* head) {
	for (Node* item = head->next; item != nullptr; item = item->next)
		printf("%c             %s\n", item->chn, item->code);
}

// Re-inserts the freshly merged node `current`, which sits at the front of the
// weight-ordered list, at its proper position further back.
// It is placed in front of the first node whose count is strictly greater than
// its own, i.e. behind every node of equal count (stable insertion); if no
// such node exists it becomes the new tail.
// Returns the new front of the list.
//
// When `current` is already correctly placed (no successor, or the successor
// is heavier) the list is left untouched and `current` is returned; without
// this guard the splice below would link `current` to itself.
Node* moveNode(Node* current) {
	if (!current)
		return nullptr;

	Node* rest = current->next;
	const int weight = current->nth;
	if (!rest || rest->nth > weight)
		return current;

	// Advance to the last node whose count does not exceed `weight`
	Node* prev = rest;
	while (prev->next && prev->next->nth <= weight)
		prev = prev->next;

	// Splice `current` in behind `prev` (prev->next is null at the tail)
	current->next = prev->next;
	prev->next = current;
	return rest;
}

// Builds the Huffman tree from a list ordered by ascending count.
// Each round takes the two front nodes, hangs them under a new parent
// (left = first, right = second, count = sum, placeholder character '~',
// empty code) and puts the parent at the front of the remaining list.
// If the parent is not lighter than its successor, moveNode() relocates it.
// The loop ends when a single node is left; that node is returned as root.
Node* buildTree(Node* head) {
	Node* front = head;
	for (;;) {
		Node* second = front->next;
		if (!second)
			break;

		Node* parent = (Node*)malloc(sizeof(Node));
		parent->chn = '~';
		parent->nth = front->nth + second->nth;
		parent->code[0] = '\0';
		parent->lchild = front;
		parent->rchild = second;
		parent->next = second->next;
		front = parent;

		// Parent is the only node left: the tree is complete
		if (!front->next)
			break;
		// A strictly lighter parent may stay at the front
		if (front->nth >= front->next->nth)
			front = moveNode(front);
	}
	return front;
}

// Prints the code table by walking the tree in-order (left subtree, node,
// right subtree). Each visited node yields one line: character, count, code.
void printTreeMPlus(Node* head) {
	if (!head)
		return;
	printTreeMPlus(head->lchild);
	printf("%c   %d   %s\n", head->chn, head->nth, head->code);
	printTreeMPlus(head->rchild);
}

//遍历Huffman树，生成叶节点的Huffman编码
void visitTree(Node* head) {
	Node* current = head;
	if (current->lchild) {
		//更新左节点编码
		strcpy(current->lchild->code, current->code);
		strcat(current->lchild->code, "0");
		visitTree(current->lchild);
	}
	if (current->rchild) {
		//更新右节点编码
		strcpy(current->rchild->code, current->code);
		strcat(current->rchild->code, "1");
		visitTree(current->rchild);
	}
}

// Collects the leaves of the tree rooted at `head` into a linked list that is
// later used as the lookup table when encoding.
// `leaflist` is the current tail of that list. Leaves are appended in
// left-to-right order by reusing each leaf's own `next` pointer.
// Returns the tail of the list after this subtree has been processed.
Node* getTreeLeaf(Node* head, Node* leaflist) {
	const bool isLeaf = !head->lchild && !head->rchild;
	if (isLeaf) {
		// Append this leaf and make it the new tail
		leaflist->next = head;
		head->next = nullptr;
		return head;
	}

	Node* tail = leaflist;
	if (head->lchild)
		tail = getTreeLeaf(head->lchild, tail);
	if (head->rchild)
		tail = getTreeLeaf(head->rchild, tail);
	return tail;
}

// Returns the first node, starting with `listhead` itself, whose character
// equals `ch`; returns null when the list holds no such node.
Node* thechCode(char ch, Node* listhead) {
	for (Node* probe = listhead; probe != nullptr; probe = probe->next) {
		if (probe->chn == ch)
			return probe;
	}
	return nullptr;
}


void printTheCode(char* strcmp, Node* encodeList) {
	char strtrans[1000] = "";
	int max = strlen(strcmp);
	//根据数组中的字符，找到其编码，并拼接到strtrans上，得到Huffman编码的01序列
	for (int i = 0; i < max + 1; i++) {
		Node* getnode = thechCode(strcmp[i], encodeList);
		strcat(strtrans, getnode->code);
		printf("%s   %s\n", strtrans, getnode->code);
	}
	max = strlen(strtrans);
	//根据8位序列进行补0
	int n = 8 - max % 8;
	if (n != 8) {
		for (int i = 0; i < n + 1; i++)
			strcat(strtrans, "0");
	}
	printf("%s\n", strtrans);

	//将编码序列分4位进行打印，其中第奇数个0不需要打印
	max = strlen(strtrans);
	int num[200] = { 0 };
	max = max / 4;
	for (int i = 0; i < max; i++) {
		num[i] = 8 * strtrans[4 * i] + 4 * strtrans[4 * i + 1] + 2 * strtrans[4 * i + 2] + strtrans[4 * i + 3] - 15 * int('0');
		if (num[i] != 0)
			printf("%x", num[i]);
		else if (i % 2 == 1)
			printf("%x", num[i]);
	}
	//将编码写入到文件中
	FILE* fpout = fopen("output.txt", "wb");
	for (int i = 0; i < max; i++) {
		if (num[i] != 0)
			fprintf(fpout, "%x", num[i]);
		else if (i % 2 == 1)
			fprintf(fpout, "%x", num[i]);

	}
	fclose(fpout);
}


int main() {
	Node* nodehead = (Node*)malloc(sizeof(Node));
	nodehead->next = nullptr;
	nodehead->lchild = nullptr;
	nodehead->rchild = nullptr;
	
	FILE* fpin = fopen("article.txt", "r");
	//初始化头节点
	char strch[200];
	int i = 0;
	strch[0] = '\0';
	char ch;
	do {
		ch = fgetc(fpin);
		if (strch[i] != '\n') {
			if (ch == EOF)
				ch = '\0';
			strch[i++] = ch;
			//将字符对应到链表中
			ListAddch(nodehead, ch);
		}
	} while (ch != '\0');
	fclose(fpin);
	AjustchList(nodehead);
	nodehead = buildTree(nodehead->next);
	visitTree(nodehead);
	Node* leafhead = (Node*)malloc(sizeof(Node));
	Node* currentleaf = leafhead;
	getTreeLeaf(nodehead, currentleaf);
	printListPlus(currentleaf);
	printTheCode(strch, leafhead);
	printf("\n");
}