2024年6月20日总结 文本文件压缩

今天写完了课程设计,将文末字节的问题搞定了

但是不知道为什么解压后的文件会变小,不过字符数是相等的。下面说一下我的全部思路。

我的文本文件压缩是由哈夫曼编码实现的。由于要构建哈夫曼树,我手写了一个小顶堆的数据结构来实现优先队列;定义了一个 256×256 的二维字符数组来储存每个字符对应的哈夫曼编码,起到哈希表的作用;还定义了一个long long数组记录频率。先遍历文件得到频率表,再由频率表生成节点放入小顶堆中;放入完毕后,不断从小顶堆中取出两个节点合并为一个再放回,直到只剩一个节点时返回,该节点就是哈夫曼树的根节点。然后通过一个类似序列化的方法把哈夫曼树写入dat文件保存,再遍历哈夫曼树,把每个字符的哈夫曼编码写入txt文件。

压缩:遍历文件并对字节进行处理,就能将哈夫曼编码一位一位地写入压缩的dat文件中。对字节的处理是:先定义一个uint8_t类型的变量currentByte,并在压缩文件的第一个字节处填入一个全零的字节,用来记录末尾字节的有效位;从压缩文件的第二个字节开始,遍历每个字符对应的编码字符串,每遇到1就左移一位并添1,遇到0就左移一位并添0,满八位就写入压缩文件;末尾字节不足八位时补零,并把有效位的记录回写到第一个字节。

解压缩:先反序列化得到哈夫曼树,再读取压缩文件的第一个字节得到末字节的有效位,并求得文件长度。从第二个字节开始每次读取一个字节,通过位操作依次得到每一位:为1则让指向哈夫曼树的指针走向右子节点,为0则走向左子节点。遇到字符节点(叶子)就往复原文件写入该字符,并让指针重新指向根节点。循环操作直到最后一个字节,最后一个字节只处理有效位以内的部分。

附上源代码

#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
// Running node counter: setHufTree resets it to 0 and increments it for every
// leaf and merged node it creates, storing the value in HufTree::number.
int num;
// Node of the Huffman tree; also the element type that MinHeap orders.
struct HufTree {
    char val = 0;              // byte carried by the node; meaningful only when `yes` is true
    long long frequency = 0;   // occurrence count of a leaf, or the sum of both children
    bool yes = false;          // true for a leaf (a node that carries a byte)
    int number = 0;            // sequence number assigned while the tree is built
    HufTree* left = nullptr;   // child followed for a '0' bit
    HufTree* right = nullptr;  // child followed for a '1' bit

    // Nodes compare by frequency only. Both operators are const so that
    // const nodes and const references can be compared as well.
    bool operator < (const HufTree& A) const {
        return frequency < A.frequency;
    }
    bool operator > (const HufTree& A) const {
        return frequency > A.frequency;
    }
};
// Builds the parent of two subtrees: a non-leaf node whose frequency is the
// sum of both children, with `a` on the left and `b` on the right.
HufTree* nodesum(HufTree *a,HufTree *b) {
    HufTree* merged = new HufTree();
    merged->left = a;
    merged->right = b;
    merged->yes = false;
    merged->frequency = a->frequency + b->frequency;
    return merged;
}
class MinHeap {
private:
    vector<HufTree*> heap;

    // 返回父节点索引
    int parent(int i) { return (i - 1) / 2; }

    // 返回左子节点索引
    int left(int i) { return 2 * i + 1; }

    // 返回右子节点索引
    int right(int i) { return 2 * i + 2; }

    // 交换两个元素
    void swap(int a, int b) {
        HufTree* temp = heap[a];
        heap[a] = heap[b];
        heap[b] = temp;
    }

    // 自底向上调整堆
    void heapifyUp(int i) {
        while (i > 0 && *heap[parent(i)] > *heap[i]) {
            swap(i, parent(i));
            i = parent(i);
        }
    }

    // 自顶向下调整堆
    void heapifyDown(int i) {
        int smallest = i;
        int l = left(i);
        int r = right(i);

        if (l < heap.size() && *heap[l] < *heap[smallest]) {
            smallest = l;
        }

        if (r < heap.size() && *heap[r] < *heap[smallest]) {
            smallest = r;
        }

        if (smallest != i) {
            swap(i, smallest);
            heapifyDown(smallest);
        }
    }

public:
    // 插入元素
    void insert(HufTree* a) {
        heap.push_back(a);
        int index = heap.size() - 1;
        heapifyUp(index);
    }

    // 弹出堆顶元素
    HufTree* extractMax() {
        HufTree* root = heap[0];
        heap[0] = heap.back();
        heap.pop_back();
        heapifyDown(0);

        return root;
    }

    // 获取堆顶元素
    HufTree* getMax() {
        if (heap.empty()) {
            throw runtime_error("Heap is empty");
        }
        return heap[0];
    }

    // 判断堆是否为空
    bool isEmpty() {
        return heap.empty();
    }
};
// Returns a copy of the path `s` in which every backslash is replaced by a
// forward slash; all other characters are kept unchanged.
string swapload(string s) {
    for (char& ch : s) {
        if (ch == '\\') {
            ch = '/';
        }
    }
    return s;
}
// Counts how often each byte occurs in the file `s` and builds the Huffman
// tree for those counts.
//
// s   path of the file to analyse
// an  frequency table indexed by byte value (0..255); counts are ADDED to the
//     existing contents, so the caller passes a zeroed array
//
// Returns the root of the tree, or nullptr when the file cannot be opened or
// contains no bytes. Resets and advances the global `num`, which numbers the
// created nodes.
HufTree* setHufTree(string s,long long an[]) {
    // Binary mode: the compressor re-reads this file in binary mode, so the
    // frequencies must be taken from the same raw bytes. In text mode some
    // platforms translate line endings, which would leave bytes such as '\r'
    // without a code.
    std::ifstream file(s, std::ios::binary);

    if (!file.is_open()) {
        std::cerr << "无法打开文件: " << s << std::endl;
        return nullptr;
    }

    // char may be signed; index by the unsigned byte value so bytes >= 0x80
    // land in an[128..255] instead of before the start of the array.
    char ch;
    while (file.get(ch)) {
        an[static_cast<unsigned char>(ch)]++;
    }
    file.close();

    // One leaf per byte value that actually occurs.
    MinHeap pqueue;
    num = 0;
    for (int i = 0; i < 256; ++i) {
        if (an[i]) {
            num++;
            HufTree* c = new HufTree();
            c->frequency = an[i];
            c->val = static_cast<char>(i);
            c->number = num;
            c->left = nullptr; c->right = nullptr; c->yes = true;
            pqueue.insert(c);
        }
    }

    // Repeatedly merge the two least frequent nodes; the last node left is the root.
    while (!pqueue.isEmpty()) {
        HufTree* a = pqueue.extractMax();
        if (pqueue.isEmpty()) return a;
        HufTree* b = pqueue.extractMax();
        HufTree* c = nodesum(a, b);
        c->number = ++num;
        pqueue.insert(c);
    }

    // No byte was counted (empty file): there is no tree to build.
    return nullptr;
}

// Prints the byte of every leaf (node with `yes` set) in pre-order, each
// followed by a single space. A null tree prints nothing.
void pretraversal(HufTree* a){
    // Explicit stack instead of recursion; the right child is pushed first so
    // that the left subtree is still visited before the right one.
    vector<HufTree*> pending;
    pending.push_back(a);
    while (!pending.empty()) {
        HufTree* node = pending.back();
        pending.pop_back();
        if (node == nullptr) {
            continue;
        }
        if (node->yes) {
            cout << node->val << " ";
        }
        pending.push_back(node->right);
        pending.push_back(node->left);
    }
}

// Writes the tree to outFile in pre-order using the in-memory representation
// of each field. A missing child is stored as the int -1; every real node is
// stored as number (int), val (char), yes (bool), frequency (long long),
// followed by its left and then its right subtree.
void serializeHuffmanTree(HufTree* root, ofstream& outFile) {
    auto putRaw = [&outFile](const void* field, std::streamsize bytes) {
        outFile.write(static_cast<const char*>(field), bytes);
    };

    if (root == nullptr) {
        const int nullMarker = -1;
        putRaw(&nullMarker, sizeof(int));
        return;
    }

    putRaw(&root->number, sizeof(int));
    putRaw(&root->val, sizeof(char));
    putRaw(&root->yes, sizeof(bool));
    putRaw(&root->frequency, sizeof(long long));

    serializeHuffmanTree(root->left, outFile);
    serializeHuffmanTree(root->right, outFile);
}
// Rebuilds a tree written by serializeHuffmanTree: pre-order records of
// number (int), val (char), yes (bool), frequency (long long), with the int
// -1 standing for a missing child.
//
// Returns the root of the subtree read from inFile, or nullptr when the next
// record is the -1 marker or when the stream ends or fails before a complete
// record could be read. The caller owns the returned nodes.
HufTree* deserializeHuffmanTree(ifstream& inFile) {
    // A failed read leaves the destination untouched, so start from the
    // "missing node" marker and check the stream state explicitly.
    int number = -1;
    if (!inFile.read(reinterpret_cast<char*>(&number), sizeof(int))) {
        return nullptr;
    }

    if (number == -1) {
        return nullptr;
    }

    HufTree* root = new HufTree();
    root->number = number;
    inFile.read(reinterpret_cast<char*>(&root->val), sizeof(char));
    inFile.read(reinterpret_cast<char*>(&root->yes), sizeof(bool));
    inFile.read(reinterpret_cast<char*>(&root->frequency), sizeof(long long));
    if (!inFile) {
        // Truncated record: do not hand out a half-initialised node.
        delete root;
        return nullptr;
    }

    root->left = deserializeHuffmanTree(inFile);   // left subtree comes first
    root->right = deserializeHuffmanTree(inFile);  // then the right subtree
    return root;
}
// Walks the tree and, for every leaf, writes "<byte>  <code>" as one line to
// outFile and stores the code as a NUL-terminated string of '0'/'1'
// characters in the row of `temporarily` that belongs to that byte.
//
// a            current node (nullptr ends the recursion)
// outFile      text stream receiving the human-readable code listing
// huffload     code accumulated on the path from the root to `a`
// temporarily  code table with one 256-character row per byte value
void setHufTreeTXTFile(HufTree* a, ofstream& outFile,string huffload,char temporarily[][256]) {
    if (a == nullptr) return;
    if (a->yes) {
        outFile << a->val << "  " << huffload << endl;
        // char may be signed; index by the unsigned byte value so bytes
        // >= 0x80 use rows 128..255 instead of memory before the table.
        char* slot = temporarily[static_cast<unsigned char>(a->val)];
        // A row holds at most 255 code characters plus the terminator.
        const size_t length = huffload.size() < 255 ? huffload.size() : 255;
        huffload.copy(slot, length);
        slot[length] = '\0';
    }
    setHufTreeTXTFile(a->left, outFile, huffload + '0', temporarily);   // left edge = '0'
    setHufTreeTXTFile(a->right, outFile, huffload + '1', temporarily);  // right edge = '1'
}

// Reads one byte from `file` and returns it. When the read fails the function
// prints "jieshu" and returns 0; callers that need to tell a real 0 byte from
// a failure must inspect the stream state afterwards.
uint8_t readByteFromDatFile(std::ifstream& file) {
    uint8_t byte = 0;
    if (file.read(reinterpret_cast<char*>(&byte), sizeof(byte))) {
        return byte;
    }
    cout << "jieshu";
    return 0;
}

//D:\c\ahuff\acca.txt
//D:\c\ahuff
//D:\c\ahuff\target.txt
int main()
{
    cout << "欢迎来到文本文件压缩助手\n";
    cout << "输入a为压缩功能\n输入b为解压缩功能\n输入c为退出程序\n";
    while (true) {
        char a;
        string thisfile_address, thiscompressfile_address;
        a = getchar();
        getchar();
        long long an[257] = { 0 };
        HufTree* aaa = nullptr;
        string HufCodecompressfile_address = "";
        string HufTreecompressfile_address = "";
        switch (a-'a')
        {
            case 0:
            {
                cout << "请输入将要压缩的文件地址\n";
                getline(cin, thisfile_address);
                thisfile_address = swapload(thisfile_address);
                cout << "请输入压缩文件放置的位置\n";
                getline(cin, thiscompressfile_address);
                thiscompressfile_address = swapload(thiscompressfile_address);
                char temporarily[256][256] = {0};
                aaa=setHufTree(thisfile_address, an);
                cout << aaa->frequency<<endl;
                pretraversal(aaa);
                cout << endl;
                // 序列化哈夫曼树到文件  
                HufTreecompressfile_address = thiscompressfile_address + "/HufTree.dat";
                ofstream outFile(HufTreecompressfile_address, ios::binary);
                if (outFile.is_open()) {
                    serializeHuffmanTree(aaa, outFile);
                    outFile.close();
                    cout << "哈夫曼树已保存到 " << HufTreecompressfile_address << endl;
                }
                else {
                    cerr << "无法打开文件以保存哈夫曼树: " << HufTreecompressfile_address << endl;
                }
                HufCodecompressfile_address = thiscompressfile_address + "/HufCode.txt";
                ofstream codeFile(HufCodecompressfile_address);
                if (codeFile.is_open()) {
                    string huffload="";
                    setHufTreeTXTFile(aaa,codeFile,huffload,temporarily);
                    codeFile.close();
                    cout << "哈夫曼编码已保存到 " << HufCodecompressfile_address << endl;
                }
                else {
                    cerr << "无法打开文件以保存哈夫曼编码: " << HufCodecompressfile_address << endl;
                }
                string CodeFilecompressfile_address = thiscompressfile_address + "/CodeFile.dat";
                ofstream dataFile(CodeFilecompressfile_address, ios::binary);
                ifstream OriFile(thisfile_address, ios::binary);
                if (dataFile.is_open()&& OriFile.is_open()) {
                    char zhi;
                    uint8_t currentByte = 0;  // 当前字节,用于累积位
                    int bitCount = 0;// 当前字节中已有的位数
                    dataFile.write(reinterpret_cast<char*>(&currentByte), sizeof(currentByte));
                    while (OriFile.get(zhi)) {
                        for (int i = 0; i < strlen(temporarily[zhi]); ++i) {
                            if (temporarily[zhi][i] == '0') {
                                currentByte = (currentByte << 1) | 0;  // 将0加入当前字节的最低位
                                bitCount++;  // 写入二进制位0
                            }
                            else if (temporarily[zhi][i] == '1') {
                                currentByte = (currentByte << 1) | 1;  // 将1加入当前字节的最低位
                                bitCount++;
                            }
                            else {
                                // 可以添加适当的错误处理,处理不合法的输入字符
                                cerr << "遇到非法字符 '" << zhi << "',无法转换为二进制位." << endl;
                            }
                            if (bitCount == 8) {
                                //cout << currentByte << endl;
                                dataFile.write(reinterpret_cast<char*>(&currentByte), sizeof(currentByte));
                                currentByte = 0;
                                bitCount = 0;

                            }
                        }
                    }
                    if (bitCount > 0) {
                        currentByte <<= (8 - bitCount); // 左移补零
                        uint8_t cac = 0;
                        for (int i = 0; i < bitCount; ++i) {
                            cac = (cac << 1) | 1;
                        }
                        cac <<= (8 - bitCount);
                        dataFile.write(reinterpret_cast<char*>(&currentByte), sizeof(currentByte));
                        dataFile.seekp(0, std::ios::beg);
                        dataFile.write(reinterpret_cast<char*>(&cac), sizeof(cac));
                        /*for (int i = 7; i >= 0; --i) {
                            if ((cac >> i) & 1) cout << 1;
                            else cout << 0;
                            cout << endl;
                        }*/
                    }
                    else {
                        uint8_t cac = 0;
                        for (int i = 0; i < 8; ++i) {
                            cac = (cac << 1) | 1;
                        }
                        dataFile.seekp(0, std::ios::beg);
                        dataFile.write(reinterpret_cast<char*>(&cac), sizeof(cac));
                    }

                    dataFile.close();
                    cout << "压缩数据已保存到 " << thisfile_address << endl;
                }
                else {
                    cerr << "无法打开文件以保存压缩数据: " << thisfile_address << endl;
                }
            }
                break;
            case 1:
            {
                cout << "请输入压缩文件地址\n";
                getline(cin, thiscompressfile_address);
                thiscompressfile_address = swapload(thiscompressfile_address);
                HufTreecompressfile_address = thiscompressfile_address + "/HufTree.dat";
                ifstream inFile(HufTreecompressfile_address, ios::binary);
                HufTree* deserializedTree = nullptr;
                if (inFile.is_open()) {
                    deserializedTree = deserializeHuffmanTree(inFile);
                    //pretraversal(deserializedTree);
                    //cout << endl;
                    inFile.close();
                }
                else {
                    cerr << "无法打开文件以反序列化哈夫曼树: " << HufTreecompressfile_address << endl;
                    break;
                }
                cout << "请输入解压文件放置的位置并在后面加入文件的名字\n";
                getline(cin, thisfile_address);
                thisfile_address = swapload(thisfile_address);
                string oriFileload = thiscompressfile_address + "/CodeFile.dat";
                ofstream targetFile(thisfile_address, ios::binary);
                ifstream oriFile(oriFileload, ios::binary);
                uint8_t lastByteBits;
                if (targetFile.is_open()&& oriFile.is_open()) {
                    //uint8_t lastByteBits;
                    oriFile.seekg(0, 2);
                    long long len = static_cast<long long>(oriFile.tellg()) - 2;
                    oriFile.seekg(0, 0);

                    oriFile.read(reinterpret_cast<char*>(&lastByteBits), sizeof(lastByteBits));
                    
                    //HufTree* deserializedTree = deserializeHuffmanTree(inFile);
                    HufTree* traverse = deserializedTree;
                    //pretraversal(traverse);
                    while (oriFile.good() && len--) {
                        uint8_t byte = readByteFromDatFile(oriFile);
                        for (int i = 7; i >= 0; --i) {
                            if ((byte >> i) & 1) traverse = traverse->right;
                            else  traverse = traverse->left;
                            //pretraversal(traverse);
                            if (traverse->yes) {
                                targetFile << (traverse->val);
                                //cout << traverse->val<<endl;
                                traverse = deserializedTree;
                            }
                        }
                    }
                    for (int i = 7; i >= 0; --i) {
                        if ((lastByteBits >> i) & 1) cout << 1;
                        else cout << 0;
                        
                    }
                    uint8_t byte = readByteFromDatFile(oriFile);
                    for (int i = 7; i >= 0; --i) {
                        if (!((lastByteBits >> i) & 1)) break;
                        if ((byte >> i) & 1) traverse = traverse->right;
                        else traverse = traverse->left;
                        if (traverse->yes) {
                            targetFile << (traverse->val);
                            cout << traverse->val;
                            traverse = deserializedTree;
                        }
                    }
                    targetFile.close();
                    oriFile.close();
                    //inFile.close();
                    /*delete traverse;
                    delete deserializedTree;*/
                }
                else {
                    cerr << "No " << HufTreecompressfile_address << endl;
                }
            }
                break;
            case 2:
                return 0;
            default:
                cout << "输入错误,已退回主菜单";
        }
    }
}
//D:\c\ahuff\acca.txt
//D:\c\ahuff
//D:\c\ahuff\target.txt
//D:\c\ahuff\zwz.txt

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值