基于Huffman树的文件压缩

本文详细介绍了利用Huffman编码对文件进行压缩的方法,包括统计字符频次、构建Huffman树、编码过程及文件的压缩与解压缩实现。适用于对文件压缩原理感兴趣的读者。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

利用huffman编码的思想对文件进行压缩,主要原理是通过huffman编码来重新表示字符,使得出现频率高的字符编码短,出现少的字符编码长。整体下来的话,所需的总的bit位是减少的。但是要注意当大部分字符出现的频率都差不多时,huffman压缩的压缩效率会很低。
利用Huffman编码进行文件压缩,大致可以分为两个部分:压缩和解压缩。
说明一下:文中只是每个模块的代码;完整版代码见《基于Huffman编码的文件压缩》;仅供各位大神参考,望不吝赐教;

压缩

1、统计文件中每个字符出现的次数
因为文件在底层都是由字节构成的,而一个字节只有256种取值(0~255),所以我们需要先统计每种字节值出现的次数,构建Huffman树时再以这些次数作为权值来决定各自的编码长度,从而达到压缩文件的目的;

        while (1)
        {
            size_t ret = fread(FileComBuff, 1, 1024, fIn);
            if (0 == ret)
                break;
            for (size_t idx = 0; idx < ret; ++idx)
            {
                unsigned char ch = FileComBuff[idx]; //注意要将ch设置为unsigned char型的,因为有可能有特殊字符的FileComBuff[idx]为负数
                _Info[ch]._count++;
            }
        }

2、构建HuffmanTree

    // Builds a Huffman tree from the entries of `array` that differ from
    // `invalid`.
    //
    // array   - candidate weights; entries equal to `invalid` are skipped.
    // size    - number of elements in `array`.
    // invalid - sentinel value marking entries that must not become leaves.
    //
    // Returns the root of the tree, or NULL when no entry is valid (for
    // example when the input is empty). The nodes are allocated with `new`;
    // releasing them is left to the caller / owning class.
    Node* _CreatTree(const T array[], size_t size, T invalid)
    {
        // NOTE(review): Last<Node*> presumably orders the heap so that Top()
        // is the lightest node - confirm against the Heap/Last definitions.
        Heap<Node*, Last<Node*>> _ht;
        for (size_t idx = 0; idx < size; ++idx)
        {
            if (array[idx] != invalid)
            {
                _ht.Push(new Node(array[idx])); // one leaf per valid entry
            }
        }

        // Nothing to build: calling Top() on an empty heap would be invalid.
        if (_ht.Size() == 0)
            return NULL;

        // Repeatedly merge the two nodes on top of the heap under a new
        // parent whose weight is their sum; the last remaining node is the
        // root of the finished tree.
        while (_ht.Size() > 1)
        {
            Node* left = _ht.Top();
            _ht.Pop();
            Node* right = _ht.Top();
            _ht.Pop();
            Node* NewParent = new Node(left->_weight + right->_weight);
            NewParent->_pLeft = left;
            left->_pParent = NewParent;
            NewParent->_pRight = right;
            right->_pParent = NewParent;
            _ht.Push(NewParent);
        }
        return _ht.Top();
    }

3、得到Huffman的编码

    // Walks the subtree rooted at `root` and, for every leaf, appends that
    // leaf's bit string ('0' = left branch, '1' = right branch) to the
    // `_code` member of the matching `_Info` entry.
    void _GenerateHuffmanTreeCode(Node* root)
    {
        if (NULL == root)
            return;

        const bool isLeaf = (NULL == root->_pLeft) && (NULL == root->_pRight);
        if (!isLeaf)
        {
            // Interior nodes carry no code of their own; just descend.
            _GenerateHuffmanTreeCode(root->_pLeft);
            _GenerateHuffmanTreeCode(root->_pRight);
            return;
        }

        // Climb from the leaf to the root, recording at each step which side
        // of its parent the current node hangs on.
        string& bits = _Info[root->_weight._ch]._code;
        Node* child = root;
        Node* up = root->_pParent;
        while (up)
        {
            bits.push_back(up->_pLeft == child ? '0' : '1');
            child = up;
            up = up->_pParent;
        }
        // The bits were gathered leaf-to-root; flip them into root-to-leaf order.
        reverse(bits.begin(), bits.end());
    }

4、保存配置信息(后缀、行数、每个字符出现的次数),这些是为解压缩的时候准备的

        // Write the header that decompression needs. Layout:
        //   <original file suffix>\n
        //   <number of symbol records>\n
        //   <byte>,<count>\n        (one record per byte value that occurs)
        size_t lineCount = 0;
        // No longer used below (std::to_string replaced _itoa); kept so that
        // any code outside this fragment that relies on `str` still compiles.
        char str[128] = { 0 };
        // Suffix of the original file name, as returned by GetFilePostFix().
        string strFileHead = GetFilePostFix(filename);

        // One record per byte value that appears at least once.
        string chInfo;
        for (size_t i = 0; i < 256; ++i)
        {
            if (_Info[i]._count > 0)
            {
                chInfo += _Info[i]._ch;
                chInfo += ',';
                // std::to_string is standard C++11 and picks the overload for
                // the actual integer type; the previous _itoa call was
                // MSVC-specific and narrowed the count to int.
                chInfo += std::to_string(_Info[i]._count);
                chInfo += '\n';
                lineCount++;
            }
        }

        // Assemble suffix line, record-count line, then the records.
        strFileHead += '\n';
        strFileHead += std::to_string(lineCount);
        strFileHead += '\n';
        strFileHead += chInfo;
        // Write by explicit length so an embedded '\0' record is not cut off.
        fwrite(strFileHead.c_str(), 1, strFileHead.length(), fOut);

5、Huffman编码的存储

        // Second pass over the input: replace every byte with its Huffman
        // code and pack the resulting bits, most significant bit first, into
        // output bytes.
        fseek(fIn, 0, SEEK_SET); // rewind to the start of the input
        char* ReadBuff = new char[1024];
        char* WriteBuff = new char[1024];
        // Total symbol count is taken from the root weight; guard against a
        // missing tree (e.g. empty input) instead of dereferencing NULL.
        long long j = ht.GreatRoot() ? ht.GreatRoot()->_weight._count : 0;
        long long arv = j / 100; // output bytes per progress mark
        long long k = 0;         // output bytes since the last progress mark
        unsigned char inch = 0;  // byte currently being assembled
        size_t pos = 0;          // number of bits already placed in `inch`
        size_t writepos = 0;     // number of bytes pending in WriteBuff

        while (1)
        {
            // fread returns size_t; keep that type to avoid narrowing.
            size_t ReadSize = fread(ReadBuff, 1, 1024, fIn);
            if (0 == ReadSize)
                break;
            for (size_t idx = 0; idx < ReadSize; idx++)
            {
                unsigned char ch = ReadBuff[idx];
                // Bind by reference: copying the code string for every input
                // byte is pure overhead.
                const string& code = _Info[ch]._code;
                for (size_t i = 0; i < code.length(); ++i)
                {
                    inch <<= 1;
                    if (code[i] == '1')
                        inch |= 1;
                    if (++pos == 8)
                    {
                        // Crude progress bar: one '*' every `arv` output bytes.
                        k++;
                        if (k == arv)
                        {
                            cout << "*";
                            k = 0;
                        }
                        WriteBuff[writepos++] = inch;
                        if (1024 == writepos)
                        {
                            fwrite(WriteBuff, 1, 1024, fOut);
                            writepos = 0;
                        }
                        inch = 0;
                        pos = 0;
                    }
                }
            }
        }
        // Flush a partially filled last byte, left-aligned and zero-padded.
        if (pos)
        {
            inch <<= (8 - pos);
            WriteBuff[writepos++] = inch;
        }
        if (writepos)
            fwrite(WriteBuff, 1, writepos, fOut);

解压缩

1、读取配置文件以及文件的编码信息

        FILE* fIn = fopen(filename.c_str(), "rb");
        assert(fIn);

        string uncomfile = Getfilename(filename);
        int linecount = 0;
        //读取后缀
        string post;
        post = _ReadLine(fIn);
        uncomfile += post;
        //读取行号
        post = _ReadLine(fIn);
        linecount = atoi(post.c_str());
        //读取编码的信息
        for (size_t i = 0; i < linecount; i++)
        {
            string huffmanNode = _ReadLine(fIn);
            _Info[i]._ch = huffmanNode[0];
            _Info[i]._count = atoi(huffmanNode.c_str() + 2);
        }

2、重新构建Huffman树,用来解压缩
3、文件解压缩

        FILE* fp = fopen(uncomfile.c_str(), "wb");
        char* pReadBuff = new char[1024];
        char* pWriteBuff = new char[1024];
        size_t pWriteSize = 0;
        Node* pCur = ht.GreatRoot();
        size_t filesize = ht.GreatRoot()->_weight._count;

        int pos = 7;
        while (true)
        {
            size_t ReadSize = fread(pReadBuff, 1, 1024, fIn);
            if (0 == ReadSize)
                break;
            for (size_t idx = 0; idx < ReadSize;)
            {
                if (pReadBuff[idx] & (1 << pos))
                    pCur = pCur->_pRight;
                else
                    pCur = pCur->_pLeft;

                if (NULL == pCur->_pLeft && NULL == pCur->_pRight)
                {
                    pWriteBuff[pWriteSize++] = pCur->_weight._ch;
                    pCur = ht.GreatRoot();
                    if (1024 == pWriteSize)
                    {
                        fwrite(pWriteBuff, 1, 1024, fp);
                        pWriteSize = 0;
                    }
                    filesize--;
                    if (filesize == 0)
                        break;
                }
                pos--;
                if (pos < 0)
                {
                    pos = 7;
                    idx++;
                }
            }
        }
        if (pWriteSize)
        {
            fwrite(pWriteBuff, 1, pWriteSize, fp);
        }

上述;
就是Huffman编码文件压缩与解压缩的基本步骤和模块代码
其中,我用的是fread()和fwrite()函数,速度比较快,也可以用别的,但是效率和函数的使用有很大关系;

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值