7-9 Huffman Codes

最新推荐文章于 2021-11-18 21:16:37 发布

原创最新推荐文章于 2021-11-18 21:16:37 发布 · 619 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#Huffman Codes

PTA 同时被 2 个专栏收录

36 篇文章

订阅专栏

算法与数据结构

34 篇文章

订阅专栏

本文探讨哈夫曼编码的原理，展示如何构造哈夫曼树并计算最优编码长度。通过实例，讲解如何判断学生提交的编码是否满足哈夫曼编码的特性，即带权路径长度最小和无歧义编码。

In 1953, David A. Huffman published his paper "A Method for the Construction of Minimum-Redundancy Codes", and hence printed his name in the history of computer science. As a professor who gives the final exam problem on Huffman codes, I am encountering a big problem: the Huffman codes are NOT unique. For example, given a string "aaaxuaxz", we can observe that the frequencies of the characters 'a', 'x', 'u' and 'z' are 4, 2, 1 and 1, respectively. We may either encode the symbols as {'a'=0, 'x'=10, 'u'=110, 'z'=111}, or in another way as {'a'=1, 'x'=01, 'u'=001, 'z'=000}, both compress the string into 14 bits. Another set of code can be given as {'a'=0, 'x'=11, 'u'=100, 'z'=101}, but {'a'=0, 'x'=01, 'u'=011, 'z'=001} is NOT correct since "aaaxuaxz" and "aazuaxax" can both be decoded from the code 00001011001001. The students are submitting all kinds of codes, and I need a computer program to help me determine which ones are correct and which ones are not.

Input Specification:

Each input file contains one test case. For each case, the first line gives an integer N (2≤N≤63), then followed by a line that contains all the N distinct characters and their frequencies in the following format:

c[1] f[1] c[2] f[2] ... c[N] f[N]

where c[i] is a character chosen from {'0' - '9', 'a' - 'z', 'A' - 'Z', '_'}, and f[i] is the frequency of c[i] and is an integer no more than 1000. The next line gives a positive integer M (≤1000), then followed by M student submissions. Each student submission consists of N lines, each in the format:

c[i] code[i]

where c[i] is the i-th character and code[i] is a non-empty string of no more than 63 '0's and '1's.

Output Specification:

For each test case, print in each line either "Yes" if the student's submission is correct, or "No" if not.

Note: The optimal solution is not necessarily generated by Huffman algorithm. Any prefix code with code length being optimal is considered correct.

Sample Input:

7
A 1 B 1 C 1 D 3 E 3 F 6 G 6
4
A 00000
B 00001
C 0001
D 001
E 01
F 10
G 11
A 01010
B 01011
C 0100
D 011
E 10
F 11
G 00
A 000
B 001
C 010
D 011
E 100
F 101
G 110
A 00000
B 00001
C 0001
D 001
E 00
F 10
G 11

Sample Output:

Yes
Yes
No
No

题目要求判断学生给出的编码是否为最优编码，但可以不是哈夫曼编码。

这要求学生提交的编码满足：

1.带权路径长度最小（跟哈夫曼编码一样小）；

2.无歧义编码——是前缀码：数据仅存在于叶子结点中；

3.没有度为1的结点

因为满足 1、2 必然有 3（若编码树中存在度为 1 的结点，把它删去并让其唯一的子树上移一层，带权路径长度只会变小，与条件 1 矛盾），所以我们只要验证学生提交的编码满足条件 1 和 2 即可。

#include<stdbool.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>

/* Binary tree node used both for the Huffman tree and for the tree rebuilt
   from a student's codes; HuffmanTree is a pointer to such a node. */
typedef struct TreeNode *HuffmanTree;
typedef struct TreeNode{
	char ch;  // character being encoded ('\0' when the node carries no character)
	int Weight;  // weight (frequency)
	HuffmanTree Left;
	HuffmanTree Right; 
}HuffmanNode;

#define MinData -1  // sentinel weight; must be smaller than every weight stored in the heap 

/* Array-based min-heap of tree nodes, ordered by node weight. */
typedef struct HeapStruct *MinHeap;
struct HeapStruct{
	HuffmanTree *data;  // array of heap elements; real elements start at index 1, data[0] holds the sentinel 
	int Size;  // current number of elements in the heap
	int Capacity;  // maximum capacity of the heap 
};

#define MaxN 64  // size of the character/frequency tables and of the input token buffers

int N,w[MaxN];  // N: number of distinct characters; w[i]: frequency read for ch[i]
char ch[MaxN];  // ch[i]: the i-th character read in main
int code_length;  // weighted path length of the optimal encoding (set in main from the Huffman tree) 

/* Forward declarations of the functions defined after main. */
void PreOrderTraversal(HuffmanTree BST);
HuffmanTree CreateTree();
MinHeap CreateMinHeap(int MaxSize);
bool Insert(MinHeap H,HuffmanTree item); 
HuffmanTree DeleteMin(MinHeap H);
HuffmanTree Huffman(MinHeap H);
int WPL(HuffmanTree BST,int depth);
int Judge();

int main()
{
	int i,M;
	MinHeap h;
	HuffmanTree T,BT = NULL;
	
	scanf("%d",&N);
	h = CreateMinHeap(2*N);  //创建最小堆   //N个叶子节点最终形成的哈夫曼树最多有2N-1个树结点 
	for(i=0; i<N; i++){/*最小堆元素赋值*/ 
		T = CreateTree();
		getchar();//吸收换行符及空格
		scanf("%c %d",&ch[i],&w[i]);
		T->ch = ch[i];
		T->Weight = w[i];
		Insert(h,T);
	}
	BT = Huffman(h);  //构造哈夫曼树
	//PreOrderTraversal(BT);
	code_length = WPL(BT,0);
	scanf("%d",&M);
	while(M--){
		if(Judge())  printf("Yes\n");
		else  printf("No\n");
	}
	
	return 0;
}

/*
 * Pre-order traversal (debug helper): prints the weight of every node,
 * visiting the node itself first, then its left subtree, then its right one.
 */
void PreOrderTraversal(HuffmanTree BST)
{
	if( BST == NULL )  return;

	printf("%d ",BST->Weight);
	PreOrderTraversal(BST->Left);
	PreOrderTraversal(BST->Right);
}

HuffmanTree CreateTree()
{
	HuffmanTree BST = (HuffmanTree)malloc(sizeof(HuffmanNode));
	BST->ch = '\0';  //空字符
	BST->Weight = 0;
	BST->Left = BST->Right = NULL;
	
	return BST;
}

/*
 * Huffman tree construction: repeatedly removes the two lightest nodes from
 * min-heap H, joins them under a new parent whose weight is their sum, and
 * puts the parent back, until one node is left. Returns that node (the root).
 */
HuffmanTree Huffman(MinHeap H)
{
	/* Take the merge count up front: DeleteMin() and Insert() change H->Size. */
	int merges = H->Size - 1;

	while( merges-- > 0 ){
		HuffmanTree parent = CreateTree();
		parent->Left = DeleteMin(H);   // lightest node becomes the left child
		parent->Right = DeleteMin(H);  // next lightest becomes the right child
		parent->Weight = parent->Left->Weight + parent->Right->Weight;
		Insert(H,parent);
	}

	return DeleteMin(H);
}

/*
 * Weighted path length of the tree rooted at BST, where depth is the depth
 * of BST itself: the sum over all leaves of (leaf depth * leaf weight).
 * An empty tree contributes 0.
 */
int WPL(HuffmanTree BST,int depth)
{
	if( BST == NULL )  return 0;

	if( BST->Left == NULL && BST->Right == NULL )  // leaf
		return depth*BST->Weight;

	return WPL(BST->Left,depth+1) + WPL(BST->Right,depth+1);
}

/* Releases every node of the tree rooted at node. */
static void FreeCodeTree(HuffmanTree node)
{
	if( node == NULL )  return;
	FreeCodeTree(node->Left);
	FreeCodeTree(node->Right);
	free(node);
}

/*
 * Reads one student submission (N lines of "character code") from stdin and
 * decides whether it is an optimal prefix code for the frequencies in w[].
 *
 * The codes are inserted into a binary tree ('0' = left, '1' = right). The
 * submission is rejected if a code passes through or ends on another code's
 * leaf, ends on an internal node, names an unknown character, contains a
 * symbol other than '0'/'1', or is longer than N-1. Otherwise it is accepted
 * when the tree's weighted path length equals code_length.
 *
 * All N lines are always consumed, even after the submission is known to be
 * wrong, so the next call starts at the next submission.
 *
 * Returns 1 if the submission is correct, 0 otherwise.
 */
int Judge()
{
	int i,j,k,weight,result;
	int flag = 1;  // 1 while the submission is still a valid prefix code
	char s1[MaxN],s2[MaxN];

	HuffmanTree T = CreateTree();
	HuffmanTree pt = NULL;
	for(i=0; i<N; i++){
		/* Field width 63 = MaxN-1 keeps both tokens inside their buffers. */
		if(scanf("%63s%63s",s1,s2) != 2){
			flag = 0;
			break;  // input exhausted; nothing left to consume
		}
		if( !flag )  continue;  // already rejected: just skip the remaining lines

		/* With N leaves, no codeword of an optimal code is longer than N-1. */
		if(strlen(s2) > (size_t)(N-1)){
			flag = 0;
			continue;
		}
		for(k=0; k<N && s1[0] != ch[k]; k++);
		if(k == N){  // character not in the frequency table
			flag = 0;
			continue;
		}
		weight = w[k];

		pt = T;  // start every codeword at the root
		for(j=0; s2[j] != '\0'; j++){
			if(s2[j] == '0'){
				if(!pt->Left)  pt->Left = CreateTree();
				pt = pt->Left;
			}else if(s2[j] == '1'){
				if(!pt->Right)  pt->Right = CreateTree();
				pt = pt->Right;
			}else{
				flag = 0;  // not a binary digit
				break;
			}
			/* A node with a character is the end of an earlier codeword:
			   reaching it means that codeword is a prefix of (or equal to)
			   this one. ch is used as the marker so that a zero frequency
			   cannot hide a leaf. */
			if(pt->ch != '\0')  flag = 0;
			if(s2[j+1] == '\0'){  // last digit: this node must be a leaf
				if(pt->Left || pt->Right)  flag = 0;  // this code is a prefix of an earlier one
				pt->ch = s1[0];
				pt->Weight = weight;
			}
		}
	}

	result = (flag && code_length == WPL(T,0)) ? 1 : 0;
	FreeCodeTree(T);
	return result;
}

MinHeap CreateMinHeap(int MaxSize)
{  /*创建容量为MaxSize的最小堆*/
	MinHeap H = (MinHeap)malloc(sizeof(struct HeapStruct));
	H->data = (HuffmanTree *)malloc((MaxSize+1) * sizeof(HuffmanTree));
	H->Size = 0;
	H->Capacity = MaxSize;
	HuffmanTree T = CreateTree();
	T->Weight = MinData;  /*定义哨兵-为小于堆中所有可能元素权值的值，便于以后更快操作*/
	H->data[0] = T;
	
	return H;
}

/* Returns true when the heap already holds Capacity elements. */
bool  IsFull(MinHeap H)
{
	return H->Capacity == H->Size;
}

/* Returns true when the heap holds no elements. */
bool IsEmpty(MinHeap H)
{
	return 0 == H->Size;
}

/*
 * Inserts item into min-heap H by percolating up: a hole is opened at the new
 * last position and parents heavier than item are moved down into it until
 * item fits. H->data[0] is the sentinel, so the walk stops at the root without
 * an explicit index check.
 *
 * Returns true on success; prints a message and returns false if H is full.
 */
bool Insert(MinHeap H,HuffmanTree item)
{
	int hole;

	if( IsFull(H) ){
		printf("最小堆已满\n");
		return false;
	}

	hole = ++H->Size;  // position of the new last element
	while( H->data[hole/2]->Weight > item->Weight ){
		H->data[hole] = H->data[hole/2];  // move the heavier parent down
		hole /= 2;
	}
	H->data[hole] = item;

	return true;
}
 
/*
 * Removes and returns the node with the smallest weight from min-heap H.
 * The last element is taken out and percolated down from the root: at each
 * level the lighter child moves up while it is lighter than that element.
 *
 * Prints a message and returns NULL if H is empty.
 */
HuffmanTree DeleteMin(MinHeap H)
{
	int hole,smaller;
	HuffmanTree top,last;

	if( IsEmpty(H) ){
		printf("最小堆为空\n");
		return NULL;
	}

	top = H->data[1];          // the minimum sits at the root
	last = H->data[H->Size];   // element that has to find a new place
	H->Size--;

	hole = 1;
	while( hole*2 <= H->Size ){
		smaller = hole*2;
		/* If a right child exists and is lighter, follow it instead. */
		if( smaller < H->Size && H->data[smaller+1]->Weight < H->data[smaller]->Weight )
			smaller++;
		if( last->Weight <= H->data[smaller]->Weight )
			break;  // last fits at the current hole
		H->data[hole] = H->data[smaller];  // lighter child moves up
		hole = smaller;
	}
	H->data[hole] = last;

	return top;
}