Huffman树,又称最优二叉树,是带权路径长度(WPL)最短的二叉树。Huffman树的构建用到了贪心算法。
【贪心算法】
贪心算法是指在问题求解时,总是做出当前看来最好的选择。也就是说,贪心算法每一步得到的是某种意义上的局部最优解,因此不一定能够求得整体的最优解;但在构建Huffman树时,每次合并权值最小的两个节点恰好能得到全局最优的结果。
使用贪心算法构建哈夫曼树:
/*
主要原理:将每个字符与哈夫曼编码相对应,
压缩:统计字符出现的次数-》构建哈夫曼树-》生成哈夫曼编码-》压缩到新的文件中
解压:读压缩后的文件-》将哈夫曼编码与字符相对应-》将字符写入新的文件中
—次数多的字符路径短,次数少的字符路径长
*/
--heap.h
#pragma once
//堆
#include <vector>
#include <assert.h>
//添加仿函数
/// Comparator that orders two values ascending (`l < r`).
///
/// Used as a heap ordering policy: a heap parameterised with Less keeps the
/// smallest element on top. The operands are taken by const reference and
/// compared by value; the previous signature took `const T*` and compared the
/// addresses, which cannot be called with the `T` elements a heap stores.
///
/// @tparam T element type; must provide `operator<`.
template<class T>
struct Less
{
    /// @return true when `l` orders strictly before `r`.
    bool operator()(const T& l, const T& r) const
    {
        return l < r;
    }
};
/// Comparator that orders two values descending (`l > r`).
///
/// Used as a heap ordering policy: a heap parameterised with Greate keeps the
/// largest element on top. The operands are taken by const reference and
/// compared by value; the previous signature took `const T*` and compared the
/// addresses, which cannot be called with the `T` elements a heap stores.
///
/// @tparam T element type; must provide `operator>`.
template<class T>
struct Greate
{
    /// @return true when `l` orders strictly after `r`.
    bool operator()(const T& l, const T& r) const
    {
        return l > r;
    }
};
// The default ordering policy is defined earlier in this header; it is
// re-declared here so the default template argument below can name it.
template <class T>
struct Less;

/// Binary heap stored in a std::vector.
///
/// The element for which `compare` returns true against every other element
/// sits at the top, so `Less<T>` (the default) gives a min-heap.
///
/// @tparam T       element type (copied in and out by value).
/// @tparam compare default-constructible functor; `compare()(a, b)` returns
///                 true when `a` must be closer to the top than `b`.
template <class T, class compare = Less<T> >
class Heap
{
public:
    /// Creates an empty heap.
    Heap()
        : _size(0)
    {}

    /// Builds a heap from the contents of `a`.
    /// NOTE: the elements are taken with swap(), so `a` is left holding the
    /// heap's previous (empty) contents after this call.
    Heap(std::vector<T>& a)
        : _size(0)
    {
        _a.swap(a);
        _size = _a.size();
        Heapify();
    }

    /// Builds a heap from a copy of the `size` elements starting at `a`.
    Heap(const T* a, size_t size)
        : _size(0)
    {
        assert(a != nullptr || size == 0);
        _a.reserve(size); // one allocation for all elements
        for (size_t i = 0; i < size; ++i)
        {
            _a.push_back(a[i]);
        }
        _size = _a.size();
        Heapify();
    }

    /// Inserts `x` and restores the heap order.
    void Push(const T& x)
    {
        _a.push_back(x);
        AdjustUp(_a.size() - 1);
        ++_size;
    }

    /// Removes the top element. The heap must not be empty.
    void Pop()
    {
        const size_t size = _a.size();
        assert(size > 0);
        // Move the top to the back, drop it, then sift the new top down.
        using std::swap;
        swap(_a[0], _a[size - 1]);
        _a.pop_back();
        AdjustDown(0);
        --_size;
    }

    /// Returns a copy of the top element. The heap must not be empty.
    T Top() const
    {
        assert(!_a.empty());
        return _a[0];
    }

    /// Returns the number of stored elements.
    size_t Size() const
    {
        return _a.size();
    }

public:
    /// Sifts the element at index `parent` down until neither child should
    /// be above it.
    void AdjustDown(size_t parent)
    {
        compare com;
        const size_t size = _a.size();
        size_t child = parent * 2 + 1; // left child
        while (child < size)
        {
            // Pick whichever child belongs closer to the top.
            if (child + 1 < size && com(_a[child + 1], _a[child]))
            {
                ++child;
            }
            if (!com(_a[child], _a[parent]))
            {
                break; // order already holds
            }
            using std::swap;
            swap(_a[parent], _a[child]);
            parent = child;
            child = parent * 2 + 1;
        }
    }

    /// Sifts the element at index `child` up until its parent should stay
    /// above it.
    void AdjustUp(size_t child)
    {
        compare com;
        while (child > 0)
        {
            // Computed inside the loop so index 0 never evaluates (0 - 1).
            const size_t parent = (child - 1) / 2;
            if (!com(_a[child], _a[parent]))
            {
                break;
            }
            using std::swap;
            swap(_a[child], _a[parent]);
            child = parent;
        }
    }

protected:
    /// Establishes the heap order over the whole array by sifting down every
    /// internal node, last parent first. The loop counts i from size/2 down
    /// to 1 and uses i - 1 as the index so it is safe for sizes 0 and 1
    /// without relying on unsigned wrap-around.
    void Heapify()
    {
        for (size_t i = _a.size() / 2; i > 0; --i)
        {
            AdjustDown(i - 1);
        }
    }

    std::vector<T> _a; // heap storage in array (level-order) layout
    size_t _size;      // element count, kept equal to _a.size()
};
--Huffman.h
#pragma once
#include "Heap.h"
//实现哈夫曼树
/// One node of a Huffman tree: a weight plus links to two children.
/// A freshly constructed node has no children.
template <class T>
struct HuffmanNode
{
    HuffmanNode<T>* _left = nullptr;  // left child, null when absent
    HuffmanNode<T>* _right = nullptr; // right child, null when absent
    T _weight;                        // weight carried by this node

    /// Creates a child-less node whose weight is a copy of `x`.
    HuffmanNode(const T& x)
        : _weight(x)
    {}
};
/// Huffman tree built greedily from an array of weights.
///
/// The tree owns every node it allocates and frees them in its destructor;
/// copying is disabled because a copy would delete the same nodes twice.
///
/// @tparam T weight type. Needs `operator!=`, `operator<`, `operator+`, copy
///           construction, and (for the empty-input case only) construction
///           from the literal 0.
template <class T>
class HuffmanTree
{
    typedef HuffmanNode<T> Node;
public:
    /// Creates a tree with no nodes.
    HuffmanTree()
        : _root(nullptr)
    {}

    /// Frees every node reachable from the root.
    ~HuffmanTree()
    {
        _clear(_root);
    }

    /// Builds the tree from the `size` weights starting at `a`, skipping every
    /// weight that compares equal to `invalid`.
    HuffmanTree(T* a, size_t size, const T& invalid)
        : _root(nullptr)
    {
        _root = CreateTree(a, size, invalid);
    }

    // The tree owns raw nodes, so copies are not allowed.
    HuffmanTree(const HuffmanTree&) = delete;
    HuffmanTree& operator=(const HuffmanTree&) = delete;

    /// Returns the root node; ownership stays with the tree.
    Node* GetRootNode()
    {
        return _root;
    }

protected:
    /// Builds the tree and returns its root.
    ///
    /// - No valid weight: returns a single zero-weight node. Callers in this
    ///   file dereference the root unconditionally, so the root is never null.
    /// - Exactly one valid weight: the leaf is hung as the left child of a
    ///   root carrying the same weight, so the lone symbol still gets the
    ///   one-bit path "left" instead of an empty path that a bit-by-bit
    ///   decoder cannot follow.
    /// - Otherwise: repeatedly merges the two lightest nodes under a new
    ///   parent whose weight is their sum, until one node remains.
    Node* CreateTree(T* a, size_t size, const T& invalid)
    {
        // Orders node pointers by the weight they point at (lightest on top).
        struct compare
        {
            bool operator()(const Node* dt, const Node* st) const
            {
                return dt->_weight < st->_weight;
            }
        };

        Heap<Node*, compare> minHeap;
        for (size_t i = 0; i < size; ++i)
        {
            if (a[i] != invalid)
            {
                minHeap.Push(new Node(a[i]));
            }
        }

        if (minHeap.Size() == 0)
        {
            return new Node(0);
        }

        if (minHeap.Size() == 1)
        {
            Node* only = minHeap.Top();
            minHeap.Pop();
            Node* root = new Node(only->_weight);
            root->_left = only;
            return root;
        }

        while (minHeap.Size() > 1)
        {
            // Take the two lightest nodes and join them under a new parent.
            Node* left = minHeap.Top();
            minHeap.Pop();
            Node* right = minHeap.Top();
            minHeap.Pop();

            Node* parent = new Node(left->_weight + right->_weight);
            parent->_left = left;
            parent->_right = right;
            minHeap.Push(parent);
        }
        return minHeap.Top(); // the single remaining node is the root
    }

    /// Deletes the subtree rooted at `root` (children first).
    void _clear(Node* root)
    {
        if (root)
        {
            _clear(root->_left);
            _clear(root->_right);
            delete root;
        }
    }

protected:
    Node* _root; // owned root of the tree
};
--FileCompress.h
#pragma once
//利用哈夫曼树实现文件压缩
/*
主要原理:将每个字符与哈夫曼编码相对应,
压缩:统计字符出现的次数-》构建哈夫曼树-》生成哈夫曼编码-》压缩到新的文件中
解压:读压缩后的文件-》将哈夫曼编码与字符相对应-》将字符写入新的文件中
次数多的字符路径短,次数少的字符路径长
*/
#include "Huffman.h"
#include <math.h>
#include <stdio.h>
#include <string>
typedef long long LongType;

/// Per-byte record used by the compressor: the byte value, how often it
/// occurs, and the Huffman code assigned to it.
///
/// All comparison and arithmetic operators look at `_count` only, which lets
/// a charInfo be used directly as a Huffman tree weight.
struct charInfo
{
    unsigned char _ch;  // the byte value
    LongType _count;    // number of occurrences
    std::string _code;  // Huffman code as a string of '0'/'1' characters

    /// Creates a record for byte 0 with a zero count and an empty code.
    charInfo()
        : _ch(0)
        , _count(0)
    {}

    /// Creates a record with the given count (byte 0, empty code).
    /// Intentionally not `explicit`: the tree code constructs weights from a
    /// plain integer.
    charInfo(const LongType& count)
        : _ch(0)
        , _count(count)
    {}

    /// @return true when the two counts differ.
    bool operator!=(const charInfo& info)const
    {
        return _count != info._count;
    }

    /// @return a record whose count is the sum of both counts; the byte value
    ///         and code of the result are left at their defaults.
    charInfo operator+(const charInfo& info)const
    {
        return charInfo(_count + info._count);
    }

    /// @return true when this count is smaller than `info`'s.
    bool operator<(const charInfo& info)const
    {
        return _count < info._count;
    }
};
class FileCompress
{
public:
FileCompress() //无参构造
{
for (int i = 0; i < 256; ++i)
{
_infos[i]._count = 0;
_infos[i]._ch = i;
}
}
void compress(const char* Filename) //压缩
{
assert(Filename);
FILE* FOut = fopen(Filename, "rb"); //打开源文件
assert(FOut); //判断打开文件是否失败
//使用直接定址法统计每个字符出现的次数
char ch = fgetc(FOut); //取一个字符
while (!feof(FOut))
{
_infos[(unsigned char)ch]._count++;
ch = fgetc(FOut);
}
//构建哈夫曼树
charInfo invalid(0);
HuffmanTree<charInfo> tree(_infos, 256, invalid);
//生成哈夫曼编码
string code;
GenerateHuffmanCode(tree.GetRootNode(),*this, code);
/*HuffmanNode<charInfo>* root = tree.GetRootNode();
GenerateHuffmanCode(root, *this, code);*/
//写配置文件,方便解压缩时重建HuffmanTree。
/*新建配置文件,给中间放入读取的字符种类,不需要存每个字符读取的次数,
可以直接利用哈夫曼树的根节点的权值,就可以确定文件字符的总个数*/
string configFile = Filename;
string compressFileName = Filename; //新建压缩文件
size_t last_ = configFile.find_last_of('.');
if (last_ < configFile.size())
{
configFile.erase(last_);
compressFileName.erase(last_);
}
configFile += ".config";
FILE* FInconfig = fopen(configFile.c_str(), "wb");
assert(FInconfig);
string str; //使用str来保存出现的字符
char buffer[20] = {0};
for (size_t i = 0; i < 256; ++i)
{
if (_infos[i]._count != 0)
{
str += _infos[i]._ch;
str += ':';
str += (string)_itoa(_infos[i]._count, buffer, 10);
//str += buffer;
str += '\n';
fputs(str.c_str(), FInconfig);
str.clear(); //每次对str进行清除,就能够保存下一个出现的字符
}
}
//将文件进行压缩
compressFileName += ".compress";
FILE* FIn = fopen(compressFileName.c_str(), "wb");
assert(FIn);
fseek(FOut, 0, SEEK_SET); //将fout文件指针移动到0的位置
ch = fgetc(FOut);
unsigned char value = 0;
int pos = 0;
while (!feof(FOut)) //将每个字符的编码写入文件
{
str = _infos[(unsigned char)ch]._code;
for (size_t i = 0; i < str.size(); ++i)
{
value <<= 1;
value |= (str[i] - '0');
if (++pos == 8)
{
fputc(value, FIn);
pos = 0;
value = 0;
}
}
ch = fgetc(FOut);
}
//如果编码最后一个写入时,一个字符的空间没有占满时,采用的方式是进行补0操作
if (pos > 0)
{
value <<= (8 - pos);
fputc(value, FIn);
}
fclose(FIn);
fclose(FOut);
fclose(FInconfig);
}
void unCompress(const char* Filename) //解压
{
assert(Filename);
FILE* FOut = fopen(Filename, "rb");
assert(FOut);
//配置文件
string configFile = (string)Filename;
string FileInName = (string)Filename;
size_t last_ = configFile.find_last_of('.'); //查找字符串中出现的最后一个‘.’
if (last_ < configFile.size())
{
configFile.erase(last_); //将后面的字符进行删除
FileInName.erase(last_);
}
configFile += ".config";
FILE* FConfig = fopen(configFile.c_str(), "rb");
assert(FConfig);
//解压后的文件
FileInName += "_com.txt";
FILE* FIn = fopen(FileInName.c_str(), "wb");
assert(FIn);
//修改_count,注意\n,有可能代表字符,有可能是行结束标志
char buff[20] = { 0 };
unsigned char ch = fgetc(FConfig);
while (!feof(FConfig))
{
fgetc(FConfig);
fgets(buff, 20, FConfig);
this->_infos[ch]._count = (LongType)atoi(buff);
ch = fgetc(FConfig);
}
//重建哈夫曼树
charInfo invalid(0); //定义非法值
HuffmanTree<charInfo> tree(_infos, 256, invalid);
HuffmanNode<charInfo>* root = tree.GetRootNode();
HuffmanNode<charInfo>* cur = root;
ch = fgetc(FOut);
int count = root->_weight._count; //记录字符的总个数控制结束,根节点的权值表示字符的总个数
int pos = 7;
while (count > 0) //读取文件的编码
{
while (pos >= 0)
{
if (ch & (1 << pos))
{
cur = cur->_right;
}
else
{
cur = cur->_left;
}
if (cur->_left == NULL && cur->_right == NULL)
{
fputc(cur->_weight._ch, FIn);
if (--count == 0) //将剩余没有写入的字符总次数减1
{
break;
}
cur = root;
}
--pos;
}
pos = 7;
ch = fgetc(FOut);
}
fclose(FOut);
fclose(FIn);
fclose(FConfig);
}
/*void PrintCode()const
{
for (int i = 0; i < 256; ++i)
{
if (this->_infos[i]._count != 0)
{
cout << this->_infos[i]._ch << ":>" << this->_infos[i]._code << endl;
}
}
}*/
protected:
//后序遍历生成哈夫曼编码(使用递归)
void GenerateHuffmanCode(HuffmanNode<charInfo>* root, FileCompress& file, string code)
{
if (root == NULL)
{
return;
}
if (root->_left == NULL && root->_right == NULL) //叶子节点
{
file._infos[root->_weight._ch]._code = code;
return;
}
GenerateHuffmanCode(root->_left, file, code + '0'); //string类型的可以直接进行+追加字符
GenerateHuffmanCode(root->_right, file, code + '1'); //左加0,右加1
}
protected:
charInfo _infos[256];
};
方法:
//处理末尾补位的两种方法:
//1. 在压缩文件中记录补位的位数;
//2. 利用解压出的字符总数与压缩前相同这一点:源文件的字符总数等于哈夫曼树根节点的权值,
//   每解码出一个字符就将该值减1,减到0时停止解码(上面的代码采用的是这种方法)。
--testRunningTime.h
#pragma once
#ifndef __TIME_CHECK_H__
#define __TIME_CHECK_H__
#include <windows.h>
/// Stopwatch built on the Win32 high-resolution performance counter.
///
/// Start() and Stop() each record EN_NUMER consecutive counter readings;
/// showTime() averages the EN_NUMER begin/end differences and prints the
/// result in s, ms or us depending on its size.
class MyTimer
{
public:
    /// Queries the counter frequency and zeroes all recorded readings, so
    /// showTime() prints 0 instead of reading indeterminate values when it
    /// is called before Start()/Stop().
    MyTimer()
    {
        QueryPerformanceFrequency(&_freq);
        costTime = 0.0;
        for (int i = 0; i < EN_NUMER; ++i)
        {
            _array[i]._begin.QuadPart = 0;
            _array[i]._end.QuadPart = 0;
        }
    }

    /// Records the begin readings.
    void Start()
    {
        for (int i = 0; i < EN_NUMER; ++i)
        {
            QueryPerformanceCounter(&_array[i]._begin);
        }
    }

    /// Records the end readings.
    void Stop()
    {
        for (int i = 0; i < EN_NUMER; ++i)
        {
            QueryPerformanceCounter(&_array[i]._end);
        }
    }

    /// Clears the last computed duration (the recorded readings are kept).
    void Reset()
    {
        costTime = 0.0;
    }

    /// Computes the average elapsed time in microseconds, stores it in
    /// costTime and prints it with the largest unit that yields a value >= 1.
    void showTime()
    {
        double allTime = 0.0;
        for (int i = 0; i < EN_NUMER; ++i)
        {
            allTime += ((static_cast<double>(_array[i]._end.QuadPart)
                         - static_cast<double>(_array[i]._begin.QuadPart))
                        / static_cast<double>(_freq.QuadPart));
        }
        costTime = allTime / EN_NUMER;
        costTime *= 1000000; // seconds -> microseconds

        // Compare as double: converting to int first overflows once the
        // interval exceeds INT_MAX microseconds.
        if (costTime >= 1000000.0)
        {
            std::cout << costTime / 1000000 << " s" << std::endl;
        }
        else if (costTime >= 1000.0)
        {
            std::cout << costTime / 1000 << " ms" << std::endl;
        }
        else
        {
            std::cout << costTime << " us" << std::endl;
        }
    }

private:
    /// One begin/end pair of counter readings.
    class Array
    {
    public:
        LARGE_INTEGER _begin;
        LARGE_INTEGER _end;
    };
    enum{ EN_NUMER = 5 };   // number of readings taken per Start()/Stop()
    LARGE_INTEGER _freq;    // counter ticks per second
    double costTime;        // last computed average, in microseconds
    Array _array[EN_NUMER];
};
#endif
--test.cpp
#define _CRT_SECURE_NO_WARNINGS 1
#include <iostream>
using namespace std;
#include "FileCompress.h"
#include "testRunningTime.h"
#include "testRunningTime.h"
void Test()
{
string filename = "1.txt";
FileCompress ht;
ht.compress(filename.c_str());
//ht.PrintCode();
string filename1 = "1.compress";
ht.unCompress(filename1.c_str());
}
void Test1() //测试压缩时间
{
//string filename = "Input.BIG";
cout << "压缩时间";
MyTimer timer;
timer.Start();
FileCompress ht;
ht.compress("Input.BIG");
timer.Stop();
timer.showTime();
}
void Test2() //测试解压时间
{
//string filename = "compressFileName";
cout << "解压时间";
MyTimer timer;
timer.Start();
FileCompress ht;
ht.unCompress("Input.compress");
timer.Stop();
timer.showTime();
}
// Runs the compression benchmark followed by the decompression benchmark,
// then waits for a key press so the console window stays open.
int main()
{
    typedef void (*Step)();
    // Test is the round-trip smoke test; it is left out of the run.
    const Step steps[] = { Test1, Test2 };

    for (int i = 0; i < 2; ++i)
    {
        steps[i]();
    }

    system("pause");
    return 0;
}