写个Huffman(哈夫曼)编码玩一玩。Huffman编码是很多压缩方法的基础,其中参考文献[1]非常详尽地介绍了gzip的算法,很有参考价值。
在Huffman树的实现过程中,有一个比较让人困惑的地方:建立Huffman树时需要向最小堆中添加新的元素,同时新添加元素的左右孩子指针也需要设置好;但最小堆在更新的过程中可能不断交换数组中的元素,导致Huffman树记录的元素位置失效。解决方法非常简单:最小堆中保存的不是元素本身而是元素的指针,这样最小堆Pop时返回的不是位置而是节点指针,无论元素如何移动都不会影响树的构建。文献[2]的代码展示了这个特点,而文献[3]的构建过程更加简洁优美。
#include <stdio.h>
#include <stdlib.h>

#include <functional>
#include <string>
#include <vector>
// Binary heap stored in a std::vector, ordered by a comparison functor.
//
// T       - element type; must be default-constructible and copyable.
// Compare - callable as compare(a, b); returns true when `a` must be popped
//           before `b`. With the default std::less<T> the smallest element is
//           popped first.
template<typename T, typename Compare = std::less<T> >
class MinHeap {
 public:
  MinHeap() {}
  MinHeap(Compare compare) : compare_(compare) {}

  // Removes and returns the top element.
  // Popping an empty heap returns a value-initialized T (0 / NULL for scalar
  // and pointer types) instead of an indeterminate value.
  T Pop() {
    T value = T();
    if (data_.empty()) {
      return value;
    }
    value = data_[0];
    // Move the last element to the root, shrink, then restore the heap order.
    if (data_.size() > 1) {
      data_[0] = data_[data_.size() - 1];
    }
    data_.pop_back();
    if (!data_.empty()) {
      Heapfy(0);
    }
    return value;
  }

  // Inserts a copy of `value` and sifts it up to its place.
  void Push(const T& value) {
    data_.push_back(value);
    size_t index = data_.size() - 1;
    while (index > 0) {
      const size_t parent = Parent(index);
      if (!compare_(data_[index], data_[parent])) {
        break;
      }
      Swap(data_[index], data_[parent]);
      index = parent;
    }
  }

  // Number of elements currently stored.
  size_t Size() const {
    return data_.size();
  }

 private:
  void Swap(T& left, T& right) {
    T tmp = left;
    left = right;
    right = tmp;
  }

  // Index of the parent of `child`; the root is reported as its own parent.
  size_t Parent(size_t child) const {
    return child != 0 ? (child - 1) / 2 : 0;
  }
  size_t LeftChild(size_t parent) const {
    return parent * 2 + 1;
  }
  size_t RightChild(size_t parent) const {
    return parent * 2 + 2;
  }

  // Sifts the element at `index` down until neither child has to precede it.
  void Heapfy(size_t index) {
    const size_t count = data_.size();
    for (;;) {
      size_t min = index;
      const size_t left = LeftChild(index);
      const size_t right = RightChild(index);
      if (left < count && compare_(data_[left], data_[min])) {
        min = left;
      }
      if (right < count && compare_(data_[right], data_[min])) {
        min = right;
      }
      if (min == index) {
        return;
      }
      Swap(data_[index], data_[min]);
      index = min;
    }
  }

  Compare compare_;
  std::vector<T> data_;
};
// A byte value together with the number of times it occurs.
class CharFrequence {
 public:
  // Default entry: the character '0' with a count of 1.
  CharFrequence()
      : char_('0'),
        frequence_(1) {}

  // Entry for `character` with an explicit count.
  CharFrequence(unsigned char character, unsigned int frequence)
      : char_(character),
        frequence_(frequence) {}

  unsigned char char_;      // the symbol
  unsigned int frequence_;  // its occurrence count
};
// Node of the Huffman tree: a symbol/count pair plus two child links.
// A node whose children are both NULL is a leaf.
class HaffmanTreeNode : public CharFrequence {
 public:
  // Leaf carrying the default CharFrequence values.
  HaffmanTreeNode()
      : left_(NULL),
        right_(NULL) {}

  // Leaf for `character` occurring `frequence` times.
  HaffmanTreeNode(unsigned char character, unsigned frequence)
      : CharFrequence(character, frequence),
        left_(NULL),
        right_(NULL) {}

  // Node linking two children. When both children are present its count is
  // the sum of theirs; otherwise the default count is kept.
  HaffmanTreeNode(HaffmanTreeNode* left_child, HaffmanTreeNode* right_child)
      : left_(left_child),
        right_(right_child) {
    if (left_child == NULL || right_child == NULL) {
      return;
    }
    frequence_ = left_child->frequence_ + right_child->frequence_;
  }

  // Nodes are ordered by count only, so a min-heap pops the rarest first.
  friend bool operator<(const HaffmanTreeNode& left, const HaffmanTreeNode& right) {
    return right.frequence_ > left.frequence_;
  }

  HaffmanTreeNode* left_;
  HaffmanTreeNode* right_;
};
// Bit-level view over an encoded byte stream.
//
// Bits are numbered from the most significant bit of byte 0. The stream is
// expected to end with a terminator: a single 1 bit followed by 0 bits up to
// the end of the last byte. Everything before that 1 bit is payload, and
// Begin()/End() iterate over exactly those payload bits.
class EncodeByte {
 public:
  // Placeholder: this function has no implementation and changes nothing.
  void SetBit(size_t offset) {
    (void)offset;
  }

  // Returns the bit at `offset` (MSB-first inside each byte).
  // Offsets past the end of the stored bytes read as false instead of
  // indexing outside the buffer.
  bool GetBit(size_t offset) const {
    const size_t byte_num = offset / 8;
    const size_t byte_offset = offset % 8;
    if (byte_num >= data_.size()) {
      return false;
    }
    // Bit 0 is the top bit of the byte, hence the shift by 7 - byte_offset.
    return ((data_[byte_num] >> (7 - byte_offset)) & 1) != 0;
  }

  // Forward/backward cursor over the bits of an EncodeByte.
  // Only the offsets are compared by operator!=, so iterators must come from
  // the same EncodeByte to be compared meaningfully.
  class BitsIterator {
   public:
    BitsIterator(size_t offset, EncodeByte* byte_manager)
        : offset_(offset), byte_manager_(byte_manager) {}
    BitsIterator(size_t offset, const EncodeByte* byte_manager)
        : offset_(offset), byte_manager_(byte_manager) {}

    bool operator*() const {
      return byte_manager_->GetBit(offset_);
    }
    BitsIterator& operator++() {
      ++offset_;
      return *this;
    }
    BitsIterator operator++(int) {
      BitsIterator tmp = *this;
      ++offset_;
      return tmp;
    }
    BitsIterator& operator--() {
      --offset_;
      return *this;
    }
    BitsIterator operator--(int) {
      BitsIterator tmp = *this;
      --offset_;
      return tmp;
    }
    friend bool operator!=(const BitsIterator& left, const BitsIterator& right) {
      return left.offset_ != right.offset_;
    }

   private:
    size_t offset_;
    const EncodeByte* byte_manager_;  // only read through, never modified
  };

  // Copies `encode` and works out how many payload bits precede the
  // terminator. An empty buffer holds zero payload bits.
  EncodeByte(const std::vector<unsigned char>& encode)
      : data_(encode), bits_length_(0) {
    if (data_.empty()) {
      return;
    }
    unsigned char last_byte = data_[data_.size() - 1];
    if (last_byte == 0) {
      // No terminator bit in the last byte. Keep the historical rule: the
      // terminator is taken to be the final bit of the previous byte. With no
      // previous byte there is no payload at all (the unguarded subtraction
      // would wrap around to a huge length).
      const size_t preceding_bits = (data_.size() - 1) * 8;
      bits_length_ = preceding_bits > 0 ? preceding_bits - 1 : 0;
      return;
    }
    // Count the zero padding bits that follow the terminating 1 bit.
    size_t filling_bits_num = 0;
    while ((last_byte & 1) == 0) {
      ++filling_bits_num;
      last_byte >>= 1;
    }
    // Total bits minus the padding minus the terminator bit itself.
    bits_length_ = data_.size() * 8 - filling_bits_num - 1;
  }

  // Iterator at the first payload bit.
  BitsIterator Begin() const {
    return BitsIterator(0, this);
  }
  // Iterator one past the last payload bit.
  BitsIterator End() const {
    return BitsIterator(bits_length_, this);
  }

 private:
  std::vector<unsigned char> data_;
  size_t bits_length_;  // number of payload bits, terminator excluded
};
class HaffmanTree {
public:
void Build(const std::vector<HaffmanTreeNode>& char_frequence) {
for (int i = 0; i < char_frequence.size(); ++i) {
min_heap_.Push(char_frequence[i]);
}
HaffmanTreeNode* left = NULL;
HaffmanTreeNode* right = NULL;
HaffmanTreeNode* parent = NULL;
while (min_heap_.Size() > 1) {
left = new HaffmanTreeNode(min_heap_.Pop());
right = new HaffmanTreeNode(min_heap_.Pop());
parent = new HaffmanTreeNode(left, right);
min_heap_.Push(*parent);
}
root_ = new HaffmanTreeNode(min_heap_.Pop());
std::string code;
Trival(root_, &code);
}
std::string GetHaffmanCode(char character) {
return haffman_code[character];
}
void Decode(const EncodeByte& bytes, std::vector<unsigned char>* orginal) {
EncodeByte::BitsIterator it = bytes.Begin();
HaffmanTreeNode* current = root_;
while (it != bytes.End()) {
if (current && current->left_ == NULL) {
orginal->push_back(current->char_);
current = root_;
}
if (*it) {
current = current->right_;
} else {
current = current->left_;
}
++it;
}
if (current && current->left_ == NULL) {
orginal->push_back(current->char_);
}
}
private:
void Trival(HaffmanTreeNode* node, std::string* code) {
if (node) {
if (node->left_ == NULL && node->right_ == NULL) {
haffman_code[node->char_] = *code;
}
(*code) += "0";
Trival(node->left_, code);
code->erase(code->size() - 1, 1);
(*code) += "1";
Trival(node->right_, code);
code->erase(code->size() - 1, 1);
}
}
HaffmanTreeNode* root_;
MinHeap<HaffmanTreeNode> min_heap_;
std::string haffman_code[256];
};
// Huffman compressor/decompressor for byte strings.
//
// Compress() builds the code table from the text it is given; Decompress()
// reuses that table, so it must be called on the same object after
// Compress().
class MyCompress {
 public:
  // Appends the Huffman-encoded form of `orginal` to `encode`.
  // Codes are packed MSB-first. The stream always ends with a terminator:
  // one 1 bit followed by 0 bits up to the end of the last byte.
  void Compress(const std::string& orginal, std::vector<unsigned char>* encode) {
    CharacterFrequenceCompute(orginal);
    int bits_count = 0;      // bits currently held in `code` (0..7)
    unsigned char code = 0;  // byte being assembled
    for (size_t i = 0; i < orginal.size(); ++i) {
      const std::string haffman_code = haffman_tree_.GetHaffmanCode(orginal[i]);
      for (size_t j = 0; j < haffman_code.size(); ++j) {
        code = static_cast<unsigned char>(code << 1);
        if (haffman_code[j] == '1') {
          code = static_cast<unsigned char>(code | 1);
        }
        ++bits_count;
        if (bits_count == 8) {
          encode->push_back(code);
          code = 0;
          bits_count = 0;
        }
      }
    }
    // Last byte: pending bits, then the terminating 1, then zero padding.
    // This covers every pending count, including 0 (byte 0x80) and 7 (the
    // terminator lands in the lowest bit and the shift below is by zero), so
    // the seven pending bits are kept rather than replaced by a zero byte.
    code = static_cast<unsigned char>((code << 1) | 1);
    code = static_cast<unsigned char>(code << (8 - bits_count - 1));
    encode->push_back(code);
  }

  // Decodes `encode` and appends the resulting bytes to `orginal`.
  void Decompress(const std::vector<unsigned char>& encode, std::string* orginal) {
    std::vector<unsigned char> decoded;
    Decompress(encode, &decoded);
    orginal->append(decoded.begin(), decoded.end());
  }

  // Decodes `encode` and appends the resulting bytes to `orginal`.
  void Decompress(const std::vector<unsigned char>& encode, std::vector<unsigned char>* orginal) {
    EncodeByte encoded_bytes(encode);
    haffman_tree_.Decode(encoded_bytes, orginal);
  }

  // Prints the code currently assigned to `c`.
  void PrintHaffmanCode(unsigned char c) {
    printf("code %c : %s\n", c, haffman_tree_.GetHaffmanCode(c).c_str());
  }

 private:
  // Counts the bytes of `text` and rebuilds the tree from the counts.
  // Every one of the 256 byte values takes part; each starts from the
  // default count of a HaffmanTreeNode and is incremented per occurrence.
  void CharacterFrequenceCompute(const std::string& text) {
    const size_t kSymbolCount = 256;
    HaffmanTreeNode tree_nodes[256];
    for (size_t i = 0; i < kSymbolCount; ++i) {
      tree_nodes[i].char_ = static_cast<unsigned char>(i);
    }
    for (size_t i = 0; i < text.size(); ++i) {
      // `char` may be signed: index with the unsigned byte value so that
      // bytes >= 0x80 stay inside the array.
      const unsigned char symbol = static_cast<unsigned char>(text[i]);
      tree_nodes[symbol].frequence_++;
    }
    haffman_tree_.Build(std::vector<HaffmanTreeNode>(tree_nodes, tree_nodes + kSymbolCount));
  }

  HaffmanTree haffman_tree_;
};
void MinHeapTest() {
MinHeap<HaffmanTreeNode> min_heap;
const size_t kElementSize = 10;
HaffmanTreeNode current;
int key = 0;
for (int i = 0; i < kElementSize; ++i) {
key = rand() % 500;
min_heap.Push(HaffmanTreeNode(static_cast<unsigned char>(i +'0'), key));
}
while (min_heap.Size() > 0) {
current = min_heap.Pop();
printf("pop: char=%c, frequence=%d\n", current.char_, current.frequence_);
}
}
void HaffmanTreeTest() {
HaffmanTree haffman_tree;
std::vector<HaffmanTreeNode> char_freq;
const int kCharSize = 256;
int freq = 0;
unsigned char character;
for (int i = 0; i < kCharSize; ++i) {
freq = rand() * rand() % 10000;
character = static_cast<unsigned char>(0);
printf("char:%c\n", character);
char_freq.push_back(HaffmanTreeNode(character, i));
}
haffman_tree.Build(char_freq);
}
// Manual check of HaffmanTree::Build on a four-symbol alphabet:
// a:1, b:10, c:2, d:8.
void HaffmanTreeTest1() {
  static const unsigned char kSymbols[] = {'a', 'b', 'c', 'd'};
  static const unsigned kCounts[] = {1, 10, 2, 8};
  const size_t kEntries = sizeof(kSymbols) / sizeof(kSymbols[0]);

  HaffmanTree haffman_tree;
  std::vector<HaffmanTreeNode> char_freq;
  for (size_t n = 0; n < kEntries; ++n) {
    char_freq.push_back(HaffmanTreeNode(kSymbols[n], kCounts[n]));
  }
  haffman_tree.Build(char_freq);
}
// Round-trip demo: compresses a sample string, decompresses it again, and
// prints the input, the decoded output and the size ratio.
void CompressTest() {
  const std::string text = "aaaaaaaaaaaaaadsfadsfasdfasaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaadfasb";
  MyCompress compresser;

  std::vector<unsigned char> encode;
  compresser.Compress(text, &encode);

  std::vector<unsigned char> orginal;
  compresser.Decompress(encode, &orginal);

  printf("orginal:%s\n", text.c_str());
  printf("decoded:");
  for (std::vector<unsigned char>::const_iterator it = orginal.begin();
       it != orginal.end(); ++it) {
    printf("%c", *it);
  }
  printf("\n");
  printf("orginal bytes:%zd, compressed bytes:%zd\ncompressed ratio:%f\n",
         text.size(),
         encode.size(),
         (float)encode.size() / text.size());
}
// Entry point: runs the compression round-trip demo.
int main(int argc, char** argv) {
  (void)argc;  // command-line arguments are not used
  (void)argv;
  CompressTest();
  // HaffmanTreeTest();
  return 0;
}
实现中的心得:
1)EncodeByte::GetBit()函数的移位操作一开始写错了:字节内的位是从最高位开始编号的,所以正确的右移位数应该是7 - byte_offset,而不是byte_offset,这个需要注意。
2)Compress函数的一些功能应该实现在HaffmanTree中更合适,不想改了
3)使用EncodeByte管理bits简化了很多工作,使得bits可以像容器中的元素一样通过iterator来遍历处理。
4)利用最小堆构建Huffman树的过程比较经典,其中涉及到的对象创建过程具有很好的参考价值。
5)还有一些困惑:模板类使用比较器参数时,仍有不太清楚的地方。例如std::less<T>的代码还要再看看,如何让模板同时支持函数对象和普通函数也需要再了解一下。
参考文献:
[1]http://www.360doc.com/content/11/0218/15/2150347_94086443.shtml