文件压缩（小项目）

最新推荐文章于 2023-12-02 19:56:54 发布

原创最新推荐文章于 2023-12-02 19:56:54 发布 · 置顶 · 592 阅读

3 ·

CC 4.0 BY-SA版权

文章标签：

#文件压缩 #解压 #哈夫曼树 #贪心算法

数据结构专栏收录该内容

17 篇文章

订阅专栏

Huffman树，又称为最优二叉树，是带权路径长度（WPL）最短的二叉树。Huffman树的构建用到了贪心算法。

【贪心算法】

贪心算法是指在求解问题时，每一步总是做出当前看来最好的选择。也就是说，贪心算法得到的是某种意义上的局部最优解，不一定是全局的最优解；不过对于哈夫曼树的构建，每次合并权值最小的两个节点这一贪心策略恰好能得到最优解。

使用贪心算法构建哈夫曼树：

/*
主要原理：将每个字符与哈夫曼编码相对应，
压缩：统计字符出现的次数-》构建哈夫曼树-》生成哈夫曼编码-》压缩到新的文件中
解压：读压缩后的文件-》将哈夫曼编码与字符相对应-》将字符写入新的文件中

——出现次数多的字符编码路径短，出现次数少的字符编码路径长
*/

--Heap.h

#pragma once
//堆
#include <vector>
#include <assert.h>

//添加仿函数
// Comparator that orders elements ascending. Used as Heap's default, which
// makes the heap a min-heap (the smallest element is on top).
template<class T>
struct Less
{
	// Compares the element values themselves (not their addresses).
	bool operator()(const T& l, const T& r) const
	{
		return l < r;
	}
};

// Comparator that orders elements descending; Heap<T, Greate<T> > is a max-heap.
template<class T>
struct Greate
{
	bool operator()(const T& l, const T& r) const
	{
		return l > r;
	}
};

// Binary heap stored in a std::vector.
//
// T       - element type.
// compare - functor type; compare()(a, b) returning true means that a must be
//           closer to the top than b. It must be default constructible because
//           a fresh instance is created wherever a comparison is needed.
template <class T, class compare = Less<T> >
class Heap
{
public:
	// Creates an empty heap.
	Heap()
	{}

	// Takes over the contents of a (a is left holding the heap's previous,
	// empty storage) and arranges them into heap order.
	Heap(std::vector<T>& a)
	{
		_a.swap(a);
		BuildHeap();
	}

	// Copies the size elements starting at a and arranges them into heap order.
	Heap(const T* a, size_t size)
	{
		_a.reserve(size);   // one allocation instead of repeated growth
		for (size_t i = 0; i < size; ++i)
		{
			_a.push_back(a[i]);
		}
		BuildHeap();
	}

	// Inserts x and restores heap order by sifting it up from the last slot.
	void Push(const T& x)
	{
		_a.push_back(x);
		AdjustUp(_a.size() - 1);
	}

	// Removes the top element. The heap must not be empty.
	void Pop()
	{
		assert(!_a.empty());
		// Move the last element to the root, drop the old root, then sift
		// the new root down to its place.
		if (_a.size() > 1)
		{
			std::swap(_a.front(), _a.back());
		}
		_a.pop_back();
		AdjustDown(0);
	}

	// Returns a copy of the top element. The heap must not be empty.
	T Top() const
	{
		assert(!_a.empty());
		return _a[0];
	}

	// Returns the number of elements currently stored.
	size_t Size() const
	{
		return _a.size();
	}

public:
	// Sifts the element at index parent down until neither child has to be
	// above it. Does nothing when parent has no children.
	void AdjustDown(size_t parent)
	{
		compare com;
		const size_t size = _a.size();
		size_t child = parent * 2 + 1;   // left child
		while (child < size)
		{
			// Pick whichever child the comparator ranks first.
			if (child + 1 < size && com(_a[child + 1], _a[child]))
			{
				++child;
			}
			if (!com(_a[child], _a[parent]))
			{
				break;
			}
			std::swap(_a[parent], _a[child]);
			parent = child;
			child = parent * 2 + 1;
		}
	}

	// Sifts the element at index child up while it has to be above its parent.
	void AdjustUp(size_t child)
	{
		compare com;
		while (child > 0)
		{
			const size_t parent = (child - 1) / 2;
			if (!com(_a[child], _a[parent]))
			{
				break;
			}
			std::swap(_a[child], _a[parent]);
			child = parent;
		}
	}

protected:
	// Establishes heap order over the whole vector by sifting down every
	// node that has a child, from the last such node back to the root.
	// The loop counts down on size_t, so it is also safe for 0 or 1 elements.
	void BuildHeap()
	{
		for (size_t i = _a.size() / 2; i-- > 0; )
		{
			AdjustDown(i);
		}
	}

protected:
	std::vector<T> _a;   // heap storage; the children of index i are 2i+1 and 2i+2
};

--Huffman.h

#pragma once
#include "Heap.h"
//实现哈夫曼树

// A single node of a Huffman tree: a weight plus links to two children.
// A node whose two links are both NULL is a leaf.
template <class T>
struct HuffmanNode
{
	// Creates a childless node that stores a copy of x as its weight.
	HuffmanNode(const T& x)
		: _left(NULL), _right(NULL), _weight(x)
	{
	}

	HuffmanNode<T>* _left;    // left child, NULL when absent
	HuffmanNode<T>* _right;   // right child, NULL when absent
	T _weight;                // weight carried by this node
};

// Huffman tree built from an array of weights with a min-heap.
// The tree owns every node it creates and frees them in its destructor.
//
// T must support operator!=, operator<, operator+ and construction from the
// literal 0 (used for the weight of an empty tree).
template <class T>
class HuffmanTree
{
	typedef HuffmanNode<T> Node;
public:
	// Creates an empty tree; GetRootNode() returns NULL.
	HuffmanTree()
		:_root(NULL)
	{}

	// Builds the tree from a[0..size); entries equal to invalid are skipped.
	HuffmanTree(T* a, size_t size, const T& invalid)
		:_root(NULL)
	{
		_root = CreateTree(a, size, invalid);
	}

	// Releases every node of the tree.
	~HuffmanTree()
	{
		_clear(_root);
	}

	// Returns the root node. The tree keeps ownership of it.
	Node* GetRootNode()
	{
		return _root;
	}

protected:
	// Builds the tree and returns its root.
	//
	// Shape of the result by number of usable (non-invalid) weights:
	//   0  - one placeholder leaf of weight 0, so callers that dereference
	//        the root keep working;
	//   1  - a synthetic root whose left child is the only leaf, so that the
	//        leaf still sits one edge below the root; the right child is NULL;
	//   2+ - the usual Huffman tree: the two lightest nodes are merged under
	//        a parent weighing their sum until one node remains.
	Node* CreateTree(T* a, size_t size, const T& invalid)
	{
		// Orders node pointers by the weight they point at, lightest first.
		struct compare
		{
			bool operator()(const Node* dt, const Node* st) const
			{
				return dt->_weight < st->_weight;
			}
		};

		Heap<Node*, compare> minHeap;

		for (size_t i = 0; i < size; ++i)
		{
			if (a[i] != invalid)
			{
				minHeap.Push(new Node(a[i]));
			}
		}

		if (minHeap.Size() == 0)
		{
			return new Node(0);
		}

		if (minHeap.Size() == 1)
		{
			Node* only = minHeap.Top();
			Node* parent = new Node(only->_weight);
			parent->_left = only;
			return parent;
		}

		while (minHeap.Size() > 1)
		{
			// Take out the two lightest nodes ...
			Node* left = minHeap.Top();
			minHeap.Pop();
			Node* right = minHeap.Top();
			minHeap.Pop();

			// ... and put back a parent that weighs as much as both.
			Node* parent = new Node(left->_weight + right->_weight);
			parent->_left = left;
			parent->_right = right;
			minHeap.Push(parent);
		}
		return minHeap.Top();   // the last remaining node is the root
	}

	// Deletes the subtree rooted at root (children first). NULL is allowed.
	void _clear(Node* root)
	{
		if (root)
		{
			_clear(root->_left);
			_clear(root->_right);
			delete root;
		}
	}

private:
	// Not copyable: a copy would share nodes and delete them twice.
	// Declared but intentionally left undefined.
	HuffmanTree(const HuffmanTree&);
	HuffmanTree& operator=(const HuffmanTree&);

protected:
	Node* _root;   // owned; NULL only for a default-constructed tree
};

--FileCompress.h

#pragma once

//利用哈夫曼树实现文件压缩
/*
主要原理：将每个字符与哈夫曼编码相对应，
压缩：统计字符出现的次数-》构建哈夫曼树-》生成哈夫曼编码-》压缩到新的文件中
解压：读压缩后的文件-》将哈夫曼编码与字符相对应-》将字符写入新的文件中

次数多的字符路径短，次数少的字符路径长
*/
#include "Huffman.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string>

typedef long long LongType;

// Bookkeeping for one byte value: the byte itself, how many times it
// occurs, and the Huffman code assigned to it.
// All comparisons and the sum look at _count only.
struct charInfo
{
	unsigned char _ch;      // the byte value this record describes
	LongType _count;        // number of occurrences
	std::string _code;      // Huffman code as a string of '0'/'1' characters

	// Record for byte 0 with no occurrences and an empty code.
	charInfo()
		:_ch(0)
		, _count(0)
	{}

	// Record with the given occurrence count; _ch is 0 and _code is empty.
	// Deliberately not explicit: the Huffman tree code builds weights from
	// plain integer literals.
	charInfo(const LongType& count)
		:_ch(0)
		, _count(count)
	{ }

	// True when the two records have different counts.
	bool operator!=(const charInfo& info)const
	{
		return _count != info._count;
	}

	// Returns a record whose count is the sum of both counts
	// (its _ch is 0 and its _code is empty).
	charInfo operator+(const charInfo& info)const
	{
		return charInfo(_count + info._count);
	}

	// True when this record has the smaller count.
	bool operator<(const charInfo& info)const
	{
		return _count < info._count;
	}
};
 

class FileCompress
{
public:
	FileCompress()     //无参构造
	{
		for (int i = 0; i < 256; ++i)
		{
			_infos[i]._count = 0;
			_infos[i]._ch = i;
		}
	}

	void compress(const char* Filename)    //压缩
	{
		assert(Filename);
		FILE* FOut = fopen(Filename, "rb");     //打开源文件
		assert(FOut);      //判断打开文件是否失败

		//使用直接定址法统计每个字符出现的次数
		char ch = fgetc(FOut);    //取一个字符
		while (!feof(FOut))
		{
			_infos[(unsigned char)ch]._count++;
			ch = fgetc(FOut);
		}

		//构建哈夫曼树
		charInfo invalid(0);
		HuffmanTree<charInfo> tree(_infos, 256, invalid);    

		//生成哈夫曼编码
		string code;
		GenerateHuffmanCode(tree.GetRootNode(),*this, code);
		/*HuffmanNode<charInfo>* root = tree.GetRootNode();
		GenerateHuffmanCode(root, *this, code);*/


		//写配置文件，方便解压缩时重建HuffmanTree。
		/*新建配置文件，给中间放入读取的字符种类，不需要存每个字符读取的次数，
		可以直接利用哈夫曼树的根节点的权值，就可以确定文件字符的总个数*/
		string configFile = Filename;
		string compressFileName = Filename;   //新建压缩文件
		size_t last_ = configFile.find_last_of('.');
		if (last_ < configFile.size())
		{
			configFile.erase(last_);
			compressFileName.erase(last_);
		}

		configFile += ".config";
		FILE* FInconfig = fopen(configFile.c_str(), "wb");
		assert(FInconfig);

		string str;     //使用str来保存出现的字符
		char buffer[20] = {0};
		for (size_t i = 0; i < 256; ++i)
		{
			if (_infos[i]._count != 0)
			{
				str += _infos[i]._ch;
				str += ':';
				str += (string)_itoa(_infos[i]._count, buffer, 10);
				//str += buffer;
				str += '\n';
				fputs(str.c_str(), FInconfig);
				str.clear();      //每次对str进行清除，就能够保存下一个出现的字符
			}
			
		}


		//将文件进行压缩
		
		compressFileName += ".compress";
		FILE* FIn = fopen(compressFileName.c_str(), "wb");
		assert(FIn);

		fseek(FOut, 0, SEEK_SET);       //将fout文件指针移动到0的位置
		ch = fgetc(FOut);
		unsigned char value = 0;
		int pos = 0;
		while (!feof(FOut))    //将每个字符的编码写入文件
		{
			str = _infos[(unsigned char)ch]._code;
			for (size_t i = 0; i < str.size(); ++i)  
			{
				value <<= 1;
				value |= (str[i] - '0');
				if (++pos == 8)
				{
					fputc(value, FIn);
					pos = 0;
					value = 0;
				}
			}
			ch = fgetc(FOut);
		}
		//如果编码最后一个写入时，一个字符的空间没有占满时，采用的方式是进行补0操作
		if (pos > 0)     
		{
			value <<= (8 - pos);
			fputc(value, FIn);
		}
		
		fclose(FIn);
		fclose(FOut);
		fclose(FInconfig);
	}


	void unCompress(const char* Filename)    //解压
	{
		assert(Filename);
		FILE* FOut = fopen(Filename, "rb");
		assert(FOut);

		//配置文件
		string configFile = (string)Filename;
		string FileInName = (string)Filename;

		size_t last_ = configFile.find_last_of('.');   //查找字符串中出现的最后一个‘.’
		if (last_ < configFile.size())
		{
			configFile.erase(last_);    //将后面的字符进行删除
			FileInName.erase(last_);
		}
		configFile += ".config";
		FILE* FConfig = fopen(configFile.c_str(), "rb");
		assert(FConfig);

		//解压后的文件
		FileInName += "_com.txt";
		FILE* FIn = fopen(FileInName.c_str(), "wb");
		assert(FIn);

		//修改_count,注意\n，有可能代表字符，有可能是行结束标志
		char buff[20] = { 0 };
		unsigned char ch = fgetc(FConfig);
		while (!feof(FConfig))
		{
			fgetc(FConfig);
			fgets(buff, 20, FConfig);
			this->_infos[ch]._count = (LongType)atoi(buff);
			ch = fgetc(FConfig);
		}
		
		//重建哈夫曼树
		charInfo invalid(0);    //定义非法值
		HuffmanTree<charInfo> tree(_infos, 256, invalid);
		HuffmanNode<charInfo>* root = tree.GetRootNode();  
		HuffmanNode<charInfo>* cur = root;
		ch = fgetc(FOut);
		int count = root->_weight._count;    //记录字符的总个数控制结束,根节点的权值表示字符的总个数
		int pos = 7;
		while (count > 0)     //读取文件的编码
		{
			while (pos >= 0)
			{
				if (ch & (1 << pos))
				{
					cur = cur->_right;
				}
				else
				{
					cur = cur->_left;
				}
				if (cur->_left == NULL && cur->_right == NULL)
				{
					fputc(cur->_weight._ch, FIn);
					if (--count == 0)       //将剩余没有写入的字符总次数减1 
					{
						break;
					}
					cur = root;
				}
				--pos;
			}
			pos = 7;
			ch = fgetc(FOut);
		}
		fclose(FOut);
		fclose(FIn);
		fclose(FConfig);
	}

	/*void PrintCode()const
    {
		for (int i = 0; i < 256; ++i)
		{
			if (this->_infos[i]._count != 0)
			{
				cout << this->_infos[i]._ch << ":>" << this->_infos[i]._code << endl;
			}
		}
	}*/

protected:
	//后序遍历生成哈夫曼编码(使用递归)
	void GenerateHuffmanCode(HuffmanNode<charInfo>* root, FileCompress& file, string code)
	{
		if (root == NULL)
		{
			return;
		}
		if (root->_left == NULL && root->_right == NULL)    //叶子节点
		{
			file._infos[root->_weight._ch]._code = code;
			return;
		}
		GenerateHuffmanCode(root->_left, file, code + '0');    //string类型的可以直接进行+追加字符
		GenerateHuffmanCode(root->_right, file, code + '1');    //左加0，右加1	
	}


protected:
	charInfo _infos[256];
};

方法：
//记录补字符的位数，
//解压和压缩的字符是相同的，
//源文件中字符的总个数等于哈夫曼树根节点的权值；解压时每还原一个字符就将该值减 1，减到 0 时解压结束

--testRunningTime.h

#pragma once 
#ifndef __TIME_CHECK_H__
#define __TIME_CHECK_H__

#include <windows.h>

// Stopwatch built on the Windows performance counter.
//
// Start() and Stop() each record EN_NUMER counter readings back to back;
// showTime() averages the EN_NUMER (end - begin) differences and prints the
// result in seconds, milliseconds or microseconds.
class MyTimer
{
public:
	// Queries the counter frequency and clears all readings, so that
	// showTime() prints 0 when called before Start()/Stop().
	MyTimer()
	{
		QueryPerformanceFrequency(&_freq);
		costTime = 0.0;
		for (int i = 0; i < EN_NUMER; ++i)
		{
			_array[i]._begin.QuadPart = 0;
			_array[i]._end.QuadPart = 0;
		}
	}

	// Records the start reading of every sample.
	void Start()
	{
		for (int i = 0; i < EN_NUMER; ++i)
		{
			QueryPerformanceCounter(&_array[i]._begin);
		}
	}

	// Records the end reading of every sample.
	void Stop()
	{
		for (int i = 0; i < EN_NUMER; ++i)
		{
			QueryPerformanceCounter(&_array[i]._end);
		}
	}

	// Clears the stored result; the recorded readings are kept.
	void Reset()
	{
		costTime = 0.0;
	}

	// Prints the averaged elapsed time followed by its unit and a newline.
	void showTime()
	{
		double allTime = 0.0;
		for (int i = 0; i < EN_NUMER; ++i)
		{
			// Subtract the raw tick counts first, then convert to seconds.
			allTime += (double)(_array[i]._end.QuadPart - _array[i]._begin.QuadPart) / (double)_freq.QuadPart;
		}
		costTime = allTime / EN_NUMER;
		costTime *= 1000000;   // seconds -> microseconds

		// The unit is chosen by comparing the double directly; casting it
		// to int first would overflow for long measurements.
		if (costTime >= 1000000.0)
		{
			std::cout << costTime / 1000000 << " s" << std::endl;
		}
		else if (costTime >= 1000.0)
		{
			std::cout << costTime / 1000 << " ms" << std::endl;
		}
		else
		{
			std::cout << costTime << " us" << std::endl;
		}
	}

private:
	// One sample: a counter reading at Start() and one at Stop().
	class Array
	{
	public:
		LARGE_INTEGER _begin;
		LARGE_INTEGER _end;
	};
	enum{ EN_NUMER = 5 };      // number of samples that are averaged
	LARGE_INTEGER _freq;       // counter ticks per second
	double costTime;           // last result of showTime(), in microseconds
	Array _array[EN_NUMER];
};
#endif

--test.cpp

#define _CRT_SECURE_NO_WARNINGS 1

#include <iostream>
using namespace std;

#include "FileCompress.h"
#include "testRunningTime.h"
#include "testRunningTime.h"

void Test()
{
	string filename = "1.txt";
	FileCompress ht;
	ht.compress(filename.c_str());
	//ht.PrintCode();

	string filename1 = "1.compress";
	ht.unCompress(filename1.c_str());
}

void Test1()    //测试压缩时间
{

	//string filename = "Input.BIG";
	cout << "压缩时间";
	MyTimer timer;
	timer.Start();

	FileCompress ht;
	ht.compress("Input.BIG");

	timer.Stop();
	timer.showTime();

}

void Test2()     //测试解压时间
{
	//string filename = "compressFileName";
	cout << "解压时间";
	MyTimer timer;
	timer.Start();

	FileCompress ht;
	ht.unCompress("Input.compress");

	timer.Stop();
	timer.showTime();
}


// Entry point: runs the compression timing, then the decompression timing,
// and waits for a key press before exiting.
int main()
{
	//Test();      // small round-trip check, disabled

	Test1();       // time compression
	Test2();       // time decompression

	system("pause");
	return 0;
}