利用Huffman编码的思想对文件进行压缩,主要原理是通过Huffman编码来重新表示字符,使得出现频率高的字符编码短,出现频率低的字符编码长。这样整体下来,所需的总bit数是减少的。但是要注意,当大部分字符出现的频率都差不多时,Huffman压缩的压缩效率会很低。
对于用Huffman编码进行文件压缩,大致可以分为两个部分:压缩和解压缩。
说明一下:文中只是每个模块的代码;完整版代码见《基于Huffman编码的文件压缩》;仅供参考,还望各位不吝赐教;
压缩
1、统计文件中每个字符出现的次数
因为文件在底层都是由字节构成的,而一个字节只有256种取值,所以我们需要统计每种字节(字符)出现的次数,构建Huffman树时以此作为权值来决定各自的编码长度,从而达到压缩文件的目的;
while (1)
{
size_t ret = fread(FileComBuff, 1, 1024, fIn);
if (0 == ret)
break;
for (size_t idx = 0; idx < ret; ++idx)
{
unsigned char ch = FileComBuff[idx]; //注意要将ch设置为unsigned char型的,因为有可能有特殊字符的FileComBuff[idx]为负数
_Info[ch]._count++;
}
}
2、构建HuffmanTree
// Builds a Huffman tree from the first `size` weights of `array`.
//
// Entries equal to `invalid` are skipped; every other entry becomes a leaf node.
// The two nodes at the top of the heap are then repeatedly merged under a new
// parent whose weight is the sum of both, until a single node (the root) remains.
// NOTE(review): this relies on Heap<Node*, Last<Node*>> yielding the lowest-weight
// node from Top(); the comparator is defined elsewhere — confirm.
//
// Returns the root of the tree, or NULL when `array` holds no valid weight
// (e.g. an empty input file). Nodes are allocated with `new` and are not freed here.
Node* _CreatTree(const T array[], size_t size, T invalid)
{
    Heap<Node*, Last<Node*>> _ht;
    for (size_t idx = 0; idx < size; ++idx)
    {
        if (array[idx] != invalid)
        {
            _ht.Push(new Node(array[idx])); // one leaf per valid weight
        }
    }

    // Nothing to build from: do not call Top() on an empty heap.
    if (0 == _ht.Size())
        return NULL;

    while (_ht.Size() > 1) // a single remaining node is the finished Huffman tree
    {
        Node* left = _ht.Top();
        _ht.Pop();
        Node* right = _ht.Top();
        _ht.Pop();

        Node* NewParent = new Node(left->_weight + right->_weight);
        NewParent->_pLeft = left;
        left->_pParent = NewParent;
        NewParent->_pRight = right;
        right->_pParent = NewParent;
        _ht.Push(NewParent);
    }
    return _ht.Top();
}
3、得到Huffman的编码
// Walks the Huffman tree rooted at `root` and appends, for every leaf, its bit
// string ('0' = left branch, '1' = right branch) to _Info[<leaf symbol>]._code.
void _GenerateHuffmanTreeCode(Node* root)
{
    if (NULL == root)
        return;

    const bool isLeaf = (NULL == root->_pLeft) && (NULL == root->_pRight);
    if (!isLeaf)
    {
        // Only leaves carry a code; inner nodes just forward to their children.
        _GenerateHuffmanTreeCode(root->_pLeft);
        _GenerateHuffmanTreeCode(root->_pRight);
        return;
    }

    string& bits = _Info[root->_weight._ch]._code;
    Node* child = root;
    // Climb from the leaf to the root, recording which side we came from.
    for (Node* up = root->_pParent; up != NULL; up = up->_pParent)
    {
        bits += (up->_pLeft == child) ? '0' : '1';
        child = up;
    }
    // The bits were collected leaf-to-root, so flip them into root-to-leaf order.
    reverse(bits.begin(), bits.end());
}
4、保存配置信息(后缀、行数、每个字符出现的次数),这些是为解压缩的时候准备的
// Write the header the decompressor needs:
//   <file suffix>\n<number of records>\n<ch>,<count>\n ... (one record per byte value that occurs)
size_t lineCount = 0;
char str[128] = { 0 }; // scratch buffer for the _itoa conversions

// First header line: the suffix of the original file.
string strFileHead = GetFilePostFix(filename);

// One "<ch>,<count>" record for every byte value with a non-zero count.
string chInfo;
for (size_t i = 0; i < 256; ++i)
{
    if (!(_Info[i]._count > 0))
        continue;

    chInfo.push_back(_Info[i]._ch);
    chInfo.push_back(',');
    chInfo.append(_itoa(_Info[i]._count, str, 10));
    chInfo.push_back('\n');
    ++lineCount;
}

// Second header line: how many records follow.
_itoa(lineCount, str, 10);
strFileHead.append(1, '\n').append(str).append(1, '\n').append(chInfo);

fwrite(strFileHead.data(), 1, strFileHead.size(), fOut);
5、Huffman编码的存储
// Second pass over the input: replace every byte by its Huffman code and pack
// the resulting bits, most significant bit first, into the output file.
fseek(fIn, 0, SEEK_SET); // rewind: the first pass consumed the file while counting
// NOTE(review): both buffers are allocated with new[]; no delete[] is visible in
// this excerpt — confirm they are released after the loop.
char* ReadBuff = new char[1024];
char* WriteBuff = new char[1024];
long long j = ht.GreatRoot()->_weight._count; // root weight, used as the total for the progress bar
long long arv = j / 100;                      // print one '*' per `arv` output bytes
long long k = 0;                              // output bytes since the last '*'
unsigned char inch = 0;                       // byte currently being assembled
size_t pos = 0;                               // number of bits already placed in `inch`
size_t writepos = 0;                          // fill level of WriteBuff
while (1)
{
    const size_t ReadSize = fread(ReadBuff, 1, 1024, fIn);
    if (0 == ReadSize)
        break;
    for (size_t idx = 0; idx < ReadSize; idx++)
    {
        const unsigned char ch = ReadBuff[idx];
        // Bind by reference: copying the code string for every input byte is wasted work.
        const string& code = _Info[ch]._code;
        for (size_t i = 0; i < code.length(); ++i)
        {
            inch <<= 1;
            if (code[i] == '1')
                inch |= 1;
            if (++pos == 8)
            {
                // Crude progress bar shown while compressing.
                k++;
                if (k == arv)
                {
                    cout << "*";
                    k = 0;
                }
                WriteBuff[writepos++] = inch;
                if (1024 == writepos)
                {
                    fwrite(WriteBuff, 1, 1024, fOut);
                    writepos = 0;
                }
                inch = 0;
                pos = 0;
            }
        }
    }
}
// Left-align the bits of a trailing partial byte; the low bits are zero padding.
if (pos)
{
    inch <<= (8 - pos);
    WriteBuff[writepos++] = inch;
}
// Flush whatever is still buffered.
if (writepos)
    fwrite(WriteBuff, 1, writepos, fOut);
解压缩
1、读取配置文件以及文件的编码信息
FILE* fIn = fopen(filename.c_str(), "rb");
assert(fIn);
string uncomfile = Getfilename(filename);
int linecount = 0;
//读取后缀
string post;
post = _ReadLine(fIn);
uncomfile += post;
//读取行号
post = _ReadLine(fIn);
linecount = atoi(post.c_str());
//读取编码的信息
for (size_t i = 0; i < linecount; i++)
{
string huffmanNode = _ReadLine(fIn);
_Info[i]._ch = huffmanNode[0];
_Info[i]._count = atoi(huffmanNode.c_str() + 2);
}
2、重新构建Huffman树,用来解压缩
3、文件解压缩
FILE* fp = fopen(uncomfile.c_str(), "wb");
char* pReadBuff = new char[1024];
char* pWriteBuff = new char[1024];
size_t pWriteSize = 0;
Node* pCur = ht.GreatRoot();
size_t filesize = ht.GreatRoot()->_weight._count;
int pos = 7;
while (true)
{
size_t ReadSize = fread(pReadBuff, 1, 1024, fIn);
if (0 == ReadSize)
break;
for (size_t idx = 0; idx < ReadSize;)
{
if (pReadBuff[idx] & (1 << pos))
pCur = pCur->_pRight;
else
pCur = pCur->_pLeft;
if (NULL == pCur->_pLeft && NULL == pCur->_pRight)
{
pWriteBuff[pWriteSize++] = pCur->_weight._ch;
pCur = ht.GreatRoot();
if (1024 == pWriteSize)
{
fwrite(pWriteBuff, 1, 1024, fp);
pWriteSize = 0;
}
filesize--;
if (filesize == 0)
break;
}
pos--;
if (pos < 0)
{
pos = 7;
idx++;
}
}
}
if (pWriteSize)
{
fwrite(pWriteBuff, 1, pWriteSize, fp);
}
上述就是Huffman编码文件压缩与解压缩的基本步骤和模块代码。
其中,我用的是fread()和fwrite()函数按块读写,速度比较快;也可以用别的读写函数,但效率和函数的使用方式有很大关系。