赫夫曼编码
简介
赫夫曼树的一个重要应用就是赫夫曼编码。赫夫曼编码是不定长编码,对于出现频率高的字母用较短的编码,对于出现频率低的使用较长的编码,从而进行数据的压缩。
举个例子
对于下面这个字符串,我们可以使用赫夫曼树进行编码
can you can a can as a can canner can a can.
第一步:我们可以统计出各个字符出现的频率,并将它们出现的频率作为权重。
n:8 空格:11 a:11 c:7 o:1 y:1 .:1 u:1 e:1 r:1 s:1
第二步:构造赫夫曼树
当然,根据排序算法的不同,树的节点可能有细小的差别,不过并不影响。编码规则为左子树表示0,右子树表示1
这样,我们我们就得到对应的赫夫曼编码表
n | 00 |
---|---|
空格 | 01 |
a | 10 |
c | 111 |
a | 10 |
s | 11000 |
o | 110010 |
y | 110011 |
. | 110100 |
u | 110101 |
e | 110110 |
r | 110111 |
原先的
0110001101100001011011100010000001111001011011110111010100100000011000110110000101101110001000000110000100100000011000110110000101101110001000000110000101110011001000000110000100100000011000110110000101101110001000000110001101100001011011100110111001100101011100100010000001100011011000010110111000100000011000010010000001100011011000010110111000101110
编码后:
11101001011010011011011001010111010010011011101001001110011100110111010010111010000110101110001011101001001101110100110100000011
这样我们得到了根据编码表对字符串进行重新编码,编码后我们会发现编码的长度短了70%。
代码实现赫夫曼编码和解码
package com.wuxudong.HuffmanCode;
public class Node implements Comparable<Node> {
Byte data;
int weight;
Node left;
Node right;
public Node(Byte data,int weight){
this.data=data;
this.weight=weight;
}
@Override
public String toString() {
return "Node{" +
"data=" + data +
", weight=" + weight +
'}';
}
public int compareTo(Node o) {
return o.weight-this.weight;
}
}
package com.wuxudong.HuffmanCode;
import jdk.management.resource.internal.inst.FileOutputStreamRMHooks;
import java.util.*;
public class TestHuffmanCode {
//用于临时存储路径值
static StringBuilder sb = new StringBuilder();
//用于存储赫夫曼编码
static Map<Byte, String> huffCodes = new HashMap<Byte, String>();
public static void main(String[] args) {
String msg = "can you can a can as a can canner can a can.";
byte[] bytes = msg.getBytes();
for (byte b2:bytes){
System.out.print(Integer.toBinaryString((b2 & 0xFF) + 0x100).substring(1));
}
//进行赫夫曼编码压缩
byte[] b = huffmanZip(bytes);
//使用赫夫曼解码
byte[] newBytes = decode(huffCodes, b);
//System.out.println(new String(newBytes));
}
private static byte[] decode(Map<Byte, String> huffCodes, byte[] bytes) {
StringBuilder sb=new StringBuilder();
for (int i = 0; i <bytes.length ; i++) {
byte b=bytes[i];
//是否是最后一个
boolean flag=(i==bytes.length-1);
sb.append(byteToBitStr(!flag,b));
}
//把字符串按照指定的赫夫曼编码进行编码
//把赫夫曼编码键值对进行调换
Map<String,Byte> map=new HashMap<String, Byte>();
for (Map.Entry<Byte,String> entry:huffCodes.entrySet()){
map.put(entry.getValue(),entry.getKey());
}
//创建一个集合用于存储byte
List<Byte> list=new ArrayList<Byte>();
//处理字符串
for (int i = 0; i < sb.length(); ) {
int count=1;
boolean flag=true;
Byte b=null;
while (flag){
String key=sb.substring(i,i+count);
b=map.get(key);
if (b==null){
count++;
}
else {
flag=false;
}
}
list.add(b);
i+=count;
}
byte [] b=new byte[list.size()];
//将集合转为数组
for (int i=0;i<list.size();i++){
b[i]=list.get(i);
}
return b;
}
private static String byteToBitStr(boolean flag,byte b){
int temp=b;
if (flag){
temp|=256;
}
String str=Integer.toBinaryString(temp);
if (flag){
return str.substring(str.length()-8);
}else {
return str;
}
}
private static byte[] huffmanZip(byte[] bytes) {
//先统计每一个byte出现的次数,并放入一个集合中
List<Node> nodes = getNodes(bytes);
//创建一个赫夫曼树
Node tree = huffmanTree(nodes);
//创建一个赫夫曼编码表
Map<Byte, String> huffCodes = getCodes(tree);
//编码
byte[] b = zip(bytes, huffCodes);
return b;
}
private static byte[] zip(byte[] bytes, Map<Byte, String> huffCodes) {
StringBuilder sb = new StringBuilder();
//把需要压缩的byte数组处理成一个二进制的字符串
for (byte b : bytes) {
sb.append(huffCodes.get(b));
}
//将这个字符串处理成byte返回去
//定义长度
int len;
if (sb.length() % 8 == 0) {
len = sb.length() / 8;
} else {
len = sb.length() / 8 + 1;
}
byte[] by = new byte[len];
//记录byte的位置
int index = 0;
for (int i = 0; i < sb.length(); i = i + 8) {
String str;
if (i + 8 > sb.length()) {
str = sb.substring(i);
} else {
str = sb.substring(i, i + 8);
}
//将字符串转为byte值,表示输出在二进制下的十进制的数
byte byt = (byte) Integer.parseInt(str, 2);
by[index] = byt;
index++;
}
return by;
}
private static Map<Byte, String> getCodes(Node tree) {
if (tree == null) {
return null;
}
getCodes(tree.left, "0", sb);
getCodes(tree.right, "1", sb);
return huffCodes;
}
private static void getCodes(Node node, String code, StringBuilder sb) {
StringBuilder sb2 = new StringBuilder(sb);
sb2.append(code);
if (node.data == null) {
getCodes(node.left, "0", sb2);
getCodes(node.right, "1", sb2);
} else {
huffCodes.put(node.data, sb2.toString());
}
}
private static Node huffmanTree(List<Node> nodes) {
while (nodes.size() > 1) {
//对节点进行排序
Collections.sort(nodes);
//去权重最小的两个节点
Node left = nodes.get(nodes.size() - 1);
Node right = nodes.get(nodes.size() - 2);
//创建一个新节点
Node parent = new Node(null, left.weight + right.weight);
parent.left = left;
parent.right = right;
//删除左右节点
nodes.remove(left);
nodes.remove(right);
nodes.add(parent);
}
return nodes.get(0);
}
private static List<Node> getNodes(byte[] bytes) {
//创建map集合存储byte出现的次数
Map<Byte, Integer> counts = new HashMap<Byte, Integer>();
//遍历数组
for (byte b : bytes) {
Integer count = counts.get(b);
if (count == null) {
//第一个遍历到这个byte
counts.put(b, 1);
} else {
counts.put(b, count + 1);
}
}
List<Node> nodes = new ArrayList<Node>();
//遍历map集合
for (Map.Entry<Byte, Integer> entry : counts.entrySet()) {
Node node = new Node(entry.getKey(), entry.getValue());
nodes.add(node);
}
return nodes;
}
}