《The Data Compression Book》一书中哈夫曼编码代码及笔记

最新推荐文章于 2025-03-31 14:40:19 发布

原创最新推荐文章于 2025-03-31 14:40:19 发布 · 2.1k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#file #output #compression #input #tree #extension

本文详细介绍了哈夫曼编码的基本原理与实现方法，包括哈夫曼树的构建过程及压缩解压流程。文中还提供了具体的源代码实现，帮助读者深入理解哈夫曼编码在数据压缩中的应用。

由于没能找到书中的源代码,且书中有许多印刷错误,所以我把通过编译的代码摘录如下

/******************Start of BITIO.c **********************/

/*
* This utility file contains all of routines needed to implement
* bit oriented routines under either ANSI or K&R C. It needs to
* be linked with every program used in the book
*/

#include <stdio.h>
#include <stdlib.h>
#include "bitio.h"
#include "errhand.h"

/*
 * Allocate a BIT_FILE and open `name` for binary writing.
 *
 * name: path of the file to create or truncate.
 *
 * Returns a handle whose rack is empty and whose mask selects the most
 * significant bit, or NULL when either the allocation or fopen() fails.
 * On an fopen() failure the freshly allocated handle is released, so
 * the caller never receives a handle with a NULL file member.
 */
BIT_FILE *OpenOutputBitFile ( name )
char *name;
{
    BIT_FILE *bit_file;

    bit_file = (BIT_FILE *) calloc( 1, sizeof(BIT_FILE) );
    if ( bit_file == NULL )
        return ( NULL );
    bit_file->file = fopen( name, "wb" );
    if ( bit_file->file == NULL )
    {
        free( bit_file );
        return ( NULL );
    }
    bit_file->rack = 0;
    bit_file->mask = 0x80;
    bit_file->pacifier_counter = 0;
    return ( bit_file );
}

/*
 * Allocate a BIT_FILE and open `name` for binary reading.
 *
 * name: path of the file to read.
 *
 * Returns a handle whose rack is empty and whose mask selects the most
 * significant bit, or NULL when either the allocation or fopen() fails.
 * On an fopen() failure the freshly allocated handle is released, so
 * the caller never receives a handle with a NULL file member.
 */
BIT_FILE *OpenInputBitFile( name )
char *name;
{
    BIT_FILE *bit_file;

    bit_file = (BIT_FILE *) calloc( 1, sizeof(BIT_FILE) );
    if ( bit_file == NULL )
        return ( NULL );
    bit_file->file = fopen( name, "rb" );
    if ( bit_file->file == NULL )
    {
        free( bit_file );
        return ( NULL );
    }
    bit_file->rack = 0;
    bit_file->mask = 0x80;
    bit_file->pacifier_counter = 0;
    return ( bit_file );
}

void CloseOutputBitFile( bit_file )
BIT_FILE *bit_file;
{
if ( bit_file->mask != 0x80 )
if ( putc( bit_file->rack, bit_file->file ) != bit_file->rack )
fatal_error( "Fatal error in CloseBitFile!/n" );
fclose( bit_file->file );
free( (char *)bit_file );
}

/*
 * Close the stream behind an input bit file and release the handle.
 *
 * bit_file: handle obtained from OpenInputBitFile().
 */
void CloseInputBitFile( bit_file )
BIT_FILE *bit_file;
{
    FILE *stream;

    stream = bit_file->file;
    fclose( stream );
    free( bit_file );
}

void OutputBit( bit_file, bit )
BIT_FILE *bit_file;
int bit;
{
if ( bit )
  bit_file->rack |= bit_file->mask;
bit_file->mask >>= 1;
if ( bit_file->mask == 0 )
{
  if ( putc( bit_file->rack, bit_file->file ) != bit_file->rack )
   fatal_error( "Fatal error in OutputBit!/n");
  else
   if ( (bit_file->pacifier_counter++ & 4095 ) == 0 ) /* 4095 eq 111111111111 */
    putc( '.', stdout);
  bit_file->rack = 0;
  bit_file->mask = 0x80;
}
}

void OutputBits( bit_file, code, count )
BIT_FILE *bit_file;
unsigned long code;
int count;
{
unsigned long mask;

mask = 1L << ( count - 1 );
while (mask != 0)
{
  if ( mask & code )
   bit_file->rack |= bit_file->mask;
  bit_file->mask >>= 1;
  if ( bit_file->mask == 0 )
  {
   if ( putc( bit_file->rack, bit_file->file ) != bit_file->rack )
    fatal_error( "Fatal error in OutputBits!/n");
   else if ((bit_file->pacifier_counter++ & 2047)==0)
    putc( '.', stdout );
   bit_file->rack = 0;
   bit_file->mask = 0x80;
  }
  mask >>= 1;
}
}

int InputBit( bit_file )
BIT_FILE *bit_file;
{
int value;

if ( bit_file->mask == 0x80 )
{
  bit_file->rack = getc( bit_file->file );
  if ( bit_file->rack == EOF )
   fatal_error( "Fatal error in InputBit!/n");
  if ( ( bit_file->pacifier_counter++ & 2047 ) == 0 )
   putc( '.', stdout );
}
value = bit_file->rack & bit_file->mask;
bit_file->mask >>= 1;
if ( bit_file->mask == 0 )
  bit_file->mask = 0x80;
return ( value ? 1 : 0 );
}

unsigned long InputBits( bit_file, bit_count )
BIT_FILE *bit_file;
int bit_count;
{
unsigned long mask;
unsigned long return_value;

mask = 1L << ( bit_count -1 );
return_value = 0;
while ( mask != 0)
{
  if ( bit_file->mask == 0x80 )
  {
   bit_file->rack = getc( bit_file->file );
   if ( bit_file->rack == EOF )
    fatal_error( "Fatal error in InputBit!/n" );
   if ( (bit_file->pacifier_counter++ & 2047 ) == 0 )
    putc( '.', stdout );
  }
  if ( bit_file->rack & bit_file->mask ) /* 如何相应位为1,则置1,恰好就是与mask或 */
   return_value |= mask;
  mask >>= 1;
  bit_file->mask >>=1;
  if ( bit_file->mask == 0 )
   bit_file->mask = 0x80;
}
return (return_value);
}

/*
 * Write the low `bits` bits of `code` to `file` as the characters
 * '0' and '1', most significant of those bits first.
 *
 * file: destination stream.
 * code: value to print.
 * bits: number of binary digits to print; nothing is printed when it
 *       is zero or negative (the original shift was undefined there).
 */
void FilePrintBinary( file, code, bits)
FILE *file;
unsigned int code;
int bits;
{
    unsigned int mask;

    if ( bits <= 0 )
        return;
    /* Unsigned constant avoids shifting into the sign bit of an int. */
    mask = 1U << ( bits - 1 );
    while ( mask != 0 )
    {
        fputc( ( code & mask ) ? '1' : '0', file );
        mask >>= 1;
    }
}

/***************************End of BITIO.C*********************/

/************************ Start of ERRHAND.C ***********************/

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include "errhand.h"

/*
 * Print "Fatal error: " followed by a printf-style message on stdout,
 * then terminate the program with exit status -1.
 *
 * fmt: printf-style format string.
 * ...: arguments consumed by fmt.
 *
 * This function does not return.
 *
 * Only the ANSI form is kept.  The body uses the two-argument
 * va_start() from <stdarg.h>, which requires a prototype ending in
 * "...", so the former K&R and __UNIX__ (va_dcl) variants of the
 * function header could not have compiled with this body.
 */
void fatal_error( char *fmt, ... )
{
    va_list argptr;

    va_start( argptr, fmt );
    printf( "Fatal error: " );
    vprintf( fmt, argptr );
    va_end( argptr );
    exit(-1);
}
/************************ End of ERRHAND.C ***********************/

/********************** Start of HUFF.C *************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "bitio.h"
#include "errhand.h"
#include "main.h"

/*
* The NODE structure is a node in the Huffman decoding tree. It has a
* count, which is its weight in the tree, and the node numbers of its
* two children. The saved_count member of the structure is only
* there for debugging purposes, and can be safely taken out at any
* time. It just holds the initial count for each of the symbols, since
* the count member is continually being modified as the tree grows.
*/

/* One entry of the Huffman tree array (leaves and internal nodes). */
typedef struct tree_node {
unsigned int count;        /* current weight; set to 0 once the node has been merged */
unsigned int saved_count;  /* copy of count taken just before it is zeroed (used by print_model) */
int child_0;               /* index of the node reached on a 0 bit */
int child_1;               /* index of the node reached on a 1 bit */
}NODE;

/*
* A Huffman tree is set up for decoding, not encoding. When encoding,
* I first walk through the tree and build up a table of codes for
* each symbol. The codes are stored in this CODE structure.
*/

/* Encoding table entry for one symbol, filled in by convert_tree_to_code(). */
typedef struct code {
unsigned int code;   /* the bit pattern, right-aligned */
int code_bits;       /* number of valid low-order bits in code */
}CODE;

/*
* The special EOS symbol is 256, the first available symbol after all
* of the possible bytes. When decoding, reading this symbol
* indicates that all of the data has been read in.
*/
#define END_OF_STREAM 256

/*
* Local function prototypes, defined with or without ANSI prototypes.
*/

#ifdef __STDC__

void count_bytes( FILE *input, unsigned long *long_counts );
void scale_counts( unsigned long *long_counts, NODE *nodes );
int build_tree( NODE *nodes );
void convert_tree_to_code( NODE *nodes,
         CODE *codes,
         unsigned int code_so_far,
         int bits,
         int node );
void output_counts( BIT_FILE *output, NODE *nodes );
void input_counts( BIT_FILE *input, NODE *nodes );
void print_model( NODE *nodes, CODE *codes );
void compress_data( FILE *input, BIT_FILE *output, CODE *codes );
void expand_data( BIT_FILE *input, FILE *output, NODE *nodes,
      int root_node );
void print_char( int c );
#else /* __STDC__ */

void count_bytes();
void scale_counts();
int build_tree();
void convert_tree_to_code();
void output_counts();
void input_counts();
void print_model();
void compress_data();
void expand_data();
void print_char();

#endif /* __STDC__ */

/*
* These two strings are used by MAIN-C.C and MAIN-E.C to print
* messages of importance to the use of the program.
*/

/* Human-readable name of the algorithm, printed by the driver programs. */
char *CompressionName = "static order 0 model with Huffman coding";
/* Argument summary printed after the program name by usage_exit(). */
char *Usage = "infile outfile [-d]\n\n Specifying -d will dump the modeling data\n";

/*
* CompressFile is the compression routine called by MAIN-C.C. It
* looks for a single additional argument to be passed to it from
* the command line: "-d". If a "-d" is present, it means the
* user wants to see the model data dumped out for debugging
* purposes.
*
* This routine works in a fairly straightforward manner. First,
* it has to allocate storage for three different arrays of data.
* Next, it counts all the bytes in the input file. The counts
* are all stored in long int, so the next step is to scale them down
* to single byte counts in the NODE array. After the counts are
* scaled, the Huffman decoding tree is built on top of the NODE
* array. Another routine walks through the tree to build a table
* of codes, one per symbol. Finally, when the codes are all ready,
* compressing the file is a simple matter. After the file is
* compressed, the storage is freed up, and the routine returns.
*
*/
void CompressFile( input, output, argc, argv )
FILE *input;
BIT_FILE *output;
int argc;
char *argv[];
{
unsigned long *counts;
NODE *nodes;
CODE *codes;
int root_node;

counts = ( unsigned long * )calloc(256, sizeof(unsigned long));
if ( counts==NULL )
  fatal_error( "Error allocating counts array/n" );
if ( ( nodes = (NODE *)calloc(514, sizeof( NODE ) ) ) == NULL )
  fatal_error( "Error allocating counts nodes/n" );
if ( ( codes = (CODE *)calloc(257, sizeof( CODE ) ) ) == NULL )
  fatal_error( "Error allocating counts codes/n" );
count_bytes( input, counts );
scale_counts( counts, nodes );
output_counts( output, nodes );
root_node = build_tree( nodes );
convert_tree_to_code( nodes, codes, 0, 0, root_node );
if ( argc>0 && strcmp( argv[0], "-d" ) == 0 )
  print_model( nodes, codes );
compress_data( input, output, codes );
free( (char *) counts );
free( (char *) nodes );
free( (char *) codes );
}

/*
* ExpandFile is the routine called by MAIN-E.C to expand a file that
* has been compressed with order 0 Huffman coding. This routine has
* a simpler job than that of the Compression routine. All it has to
* do is read in the counts that have been stored in the compressed
* file, then build the Huffman tree. The data can then be expanded
* by reading in a bit at a time from the compressed file. Finally,
* the node array is freed and the routine returns.
*
*/

void ExpandFile( input, output, argc, argv )
BIT_FILE *input;
FILE *output;
int argc;
char *argv[];
{
NODE *nodes;
int root_node;

if ( (nodes = (NODE *)calloc(514, sizeof( NODE ) ) ) == NULL )
  fatal_error( "Error allocating nodes array/n" );
input_counts( input, nodes );
root_node = build_tree( nodes );
if ( argc>0 && strcmp( argv[0], "-d" ) == 0 )
  print_model( nodes, 0 );
expand_data( input, output, nodes, root_node );
free( (char *)nodes );
}
/*
* In order for the compressor to build the same model, I have to
* store the symbol counts in the compressed file so the expander can
* read them in. In order to save space, I don't save all 256 symbols
* unconditionally. The format used to store counts looks like this:
*
* start, stop, counts, start, stop, counts, ... 0
*
* This means that I store runs of counts, until all the non-zero
* counts have been stored. At this time the list is terminated by
* storing a start value of 0. Note that at least 1 run of counts has
* to be stored, so even if the first start value is 0, I read it in.
* It also means that even in an empty file that has no counts, I have
* to pass at least one count, which will have a value of 0.
*
* In order to efficiently use this format, I have to identify runs of
* non-zero counts. Because of the format used, I don't want to stop a
* run because of just one or two zeros in the count stream. So I have
* to sit in a loop looking for strings of three or more zero values
* in a row.
*
* This is simple in concept, but it ends up being one of the most
* complicated routines in the whole program. A routine that just
* writes out 256 values without attempting to optimize would be much
* simpler, but would hurt compression quite a bit on small files.
*
*/
void output_counts( output, nodes )
BIT_FILE *output;
NODE *nodes;
{
int first;
int last;
int next;
int i;
first = 0;
while ( first<255 && nodes[first].count == 0 )
  first++;
/*
* Each time I hit the start of the loop, I assume that first is the
* start of a run of non-zero values. The rest of the loop is
* concerned with finding the value for last, which is the end of the
* run, and the value of next, which is the start of the next run.
* At the end of the loop, I assign next to first, so it starts in on
* the next run.
*/
for ( ; first<256; first = next)
{
  last = first + 1;
  for (;;)
  {
   for (; last<256; last++ )
    if ( nodes[last].count == 0 )
     break;
   last--;
   for ( next = last + 1; next<256; next++ )
    if ( nodes[next].count != 0 )
     break;
   if ( next > 255 )
    break;
   if ( (next-last)>3 )
    break;
   last = next;
  };
/*
* Here is where I output first, last, and all the counts in between.
*/
  if ( putc( first, output->file ) != first )
   fatal_error( "Error writing byte counts/n" );
  if ( putc( last, output->file ) != last )
   fatal_error( "Error writing byte counts/n" );
  for ( i=first; i<=last; i++ )
  {
   if ( putc( nodes[i].count, output->file ) != (int) nodes[i].count )
    fatal_error( "Error writing byte counts/n" );
  }
}
if ( putc(0, output->file ) != 0 )
  fatal_error( "Error writing byte counts/n" );
}

/*
* When expanding, I have to read in the same set of counts. This is
* quite a bit easier that the process of writing them out, since no
* decision making needs to be done. All I do is read in first, check
* to see if I am all done, and if not, read in last and a string of
* counts.
*/
void input_counts( input, nodes )
BIT_FILE *input;
NODE *nodes;
{
int first;
int last;
int i;
int c;

// for ( i=0; i<256; i++ ) //because use calloc ,here we may not operate
//  nodes[i].count = 0;

if ( (first = getc( input->file ) ) == EOF )
  fatal_error( "Error reading byte counts/n");
if ( (last = getc( input->file ) ) == EOF )
  fatal_error( "Error reading byte counts/n");
for (;;)
{
  for ( i=first; i<=last; i++)
   if ( (c=getc(input->file))==EOF )
    fatal_error( "Error reading byte counts/n");
   else
    nodes[i].count = (unsigned int)c;
  if ( (first = getc( input->file ) ) == EOF )
   fatal_error( "Error reading byte counts/n");
  if ( first == 0 )
   break;
  if ( (last = getc( input->file ) ) == EOF )
   fatal_error( "Error reading byte counts/n");
}
nodes[END_OF_STREAM].count = 1;
}

/*
* This routine counts the frequency of occurrence of every byte in
* the input file. It marks the place in the input stream where it
* started, counts up all the bytes, then returns to the place where
* it started. In most C implementations, the length of a file
* cannot exceed an unsigned long, so this routine should always
* work.
*/
#ifndef SEEK_SET
#define SEEK_SET 0
#endif

/*
 * Tally how often each byte value occurs in `input`, from the current
 * position to end of file, then seek back to the starting position.
 *
 * input:  stream to scan.
 * counts: array indexed by byte value; each entry is incremented once
 *         per occurrence (the caller supplies the initial values).
 */
void count_bytes( input, counts )
FILE *input;
unsigned long *counts;
{
    long start_pos;
    int byte;

    start_pos = ftell( input );
    for ( byte = getc( input ); byte != EOF; byte = getc( input ) )
        counts[byte] += 1;
    fseek( input, start_pos, SEEK_SET );
}

/*
* In order to limit the size of my Huffman codes to 16 bits, I scale
* my counts down so they fit in an unsigned char, and then store them
* all as initial weights in my NODE array. The only thing to be
* careful of is to make sure that a node with a non-zero count doesn't
* get scaled down to 0. Nodes with values of 0 don't get codes.
*/
/*
 * Scale the raw byte counts down and store them as the initial
 * weights of nodes 0..255.
 *
 * counts: 256 raw counts.  If all of them are zero, counts[0] is set
 *         to 1 so that at least one byte symbol has a weight.
 * nodes:  node array receiving the scaled weights; the END_OF_STREAM
 *         entry is always given a weight of 1.
 *
 * Every count is divided by (largest count / 255) + 1.  A symbol that
 * occurs at all never ends up with a weight of 0.
 */
void scale_counts( counts, nodes )
unsigned long *counts;
NODE *nodes;
{
    unsigned long largest = 0;
    unsigned long divisor;
    int sym;

    for ( sym = 0; sym < 256; sym++ )
    {
        if ( largest < counts[sym] )
            largest = counts[sym];
    }
    if ( largest == 0 )
    {
        counts[0] = 1;
        largest = 1;
    }
    divisor = largest / 255 + 1;
    for ( sym = 0; sym < 256; sym++ )
    {
        unsigned int scaled = (unsigned int)( counts[sym] / divisor );

        /* Keep symbols that do occur from vanishing out of the tree. */
        if ( scaled == 0 && counts[sym] != 0 )
            scaled = 1;
        nodes[sym].count = scaled;
    }
    nodes[ END_OF_STREAM ].count = 1;
}

/*
* Building the Huffman tree is fairly simple. All of the active nodes
* are scanned in order to locate the two nodes with the minimum
* weights. These two weights are added together and assigned to a new
* node. The new node makes the two minimum nodes into its 0 child
* and 1 child. The two minimum nodes are then marked as inactive.
* This process repeats until there is only one node left, which is
* the root node. The tree is done, and the root node is passed back
* to the calling routine.
*
* Node 513 is used here to arbitrarily provide a node with a guaranteed
* maximum value. It starts off being min_1 and min_2. After all
* active nodes have been scanned, I can tell if there is only one
* active node left by checking to see if min_2 is still 513.
*/
/*
 * Build the Huffman tree on top of the weights stored in `nodes`.
 *
 * nodes: node array; entries 0..END_OF_STREAM hold the leaf weights on
 *        entry.  Internal nodes are appended from END_OF_STREAM + 1
 *        upwards, and entry 513 is used as a sentinel with the weight
 *        0xffff.
 *
 * On each pass the two active (non-zero) nodes with the smallest
 * weights are merged into a new node; their weights are copied to
 * saved_count and then zeroed to mark them inactive.  The passes end
 * when fewer than two active nodes remain.
 *
 * Returns the index of the root node.
 */
int build_tree( nodes )
NODE *nodes;
{
    int next_free;
    int scan;
    int lowest;
    int second;

    nodes[513].count = 0xffff;
    next_free = END_OF_STREAM + 1;
    for ( ; ; )
    {
        /* Both candidates start out at the sentinel. */
        lowest = 513;
        second = 513;
        for ( scan = 0; scan < next_free; scan++ )
        {
            if ( nodes[scan].count == 0 )
                continue;
            if ( nodes[scan].count < nodes[lowest].count )
            {
                second = lowest;
                lowest = scan;
            }
            else if ( nodes[scan].count < nodes[second].count )
            {
                second = scan;
            }
        }
        /* No second candidate was found: the tree is complete. */
        if ( second == 513 )
            break;
        nodes[next_free].count = nodes[lowest].count + nodes[second].count;
        nodes[next_free].child_0 = lowest;
        nodes[next_free].child_1 = second;
        nodes[lowest].saved_count = nodes[lowest].count;
        nodes[lowest].count = 0;
        nodes[second].saved_count = nodes[second].count;
        nodes[second].count = 0;
        next_free++;
    }
    /* Step back to the last node that was created. */
    next_free--;
    nodes[next_free].saved_count = nodes[next_free].count;
    return ( next_free );
}

/*
* Since the Huffman tree is built as a decoding tree, there is
* no simple way to get the encoding values for each symbol out of
* it. This routine recursively walks through the tree, adding the
* child bits to each code until it gets to a leaf. When it gets
* to a leaf, it stores the code value in the CODE element, and
* returns.
*/

/*
 * Walk the tree below `node` and record the code of every leaf.
 *
 * nodes:       the Huffman tree.
 * codes:       table indexed by symbol that receives code and length.
 * code_so_far: bits accumulated on the path from the root to `node`.
 * bits:        number of bits in code_so_far.
 * node:        index of the node to process; indices up to and
 *              including END_OF_STREAM are leaves.
 *
 * Going to child_0 appends a 0 bit, going to child_1 appends a 1 bit.
 */
void convert_tree_to_code( nodes, codes, code_so_far, bits, node )
NODE *nodes;
CODE *codes;
unsigned int code_so_far;
int bits;
int node;
{
    if ( node > END_OF_STREAM )
    {
        unsigned int prefix = code_so_far << 1;
        int depth = bits + 1;

        convert_tree_to_code( nodes, codes, prefix, depth,
                              nodes[node].child_0 );
        convert_tree_to_code( nodes, codes, prefix | 1, depth,
                              nodes[node].child_1 );
        return;
    }
    /* Leaf: the accumulated path is this symbol's code. */
    codes[node].code = code_so_far;
    codes[node].code_bits = bits;
}

/*
* If the -d command line option is specified, this routine is called
* to print out some of the model information after the tree is built.
* Note that this is the only place that the saved_count NODE element
* is used for anything at all, and in this case it is just for
* diagnostic information. By the time I get here, and the tree has
* been built, every active element will have 0 in its count.
*/
/*
 * Dump the tree to stdout, one line per node whose saved_count is
 * non-zero.
 *
 * nodes: the Huffman tree after build_tree() has run.
 * codes: code table, or a null pointer when no codes are available;
 *        when given, the code of each leaf (index <= END_OF_STREAM)
 *        is printed in binary.
 */
void print_model( nodes, codes )
NODE *nodes;
CODE *codes;
{
    int i;

    for ( i = 0; i < 513; i++ )
    {
        if ( nodes[i].saved_count != 0 )
        {
            printf( "node=" );
            print_char( i );
            /* saved_count is unsigned, so it is printed with %u. */
            printf( " count=%3u", nodes[i].saved_count );
            printf( " child_0=" );
            print_char( nodes[i].child_0 );
            printf( " child_1=" );
            print_char( nodes[i].child_1 );
            if ( codes && i <= END_OF_STREAM )
            {
                printf( " Huffman code=" );
                FilePrintBinary( stdout, codes[i].code,
                                 codes[i].code_bits );
            }
            printf( "\n" );
        }
    }
}

/*
* The print_model routine uses this function to print out node numbers.
* The catch is if it is a printable character, it gets printed
* out as a character. This makes the debug output a little easier to
* read.
*/
/*
 * Print a node number on stdout: values from 0x20 up to 126 are shown
 * as the character itself, everything else as a decimal number padded
 * to three columns.
 */
void print_char( c )
int c;
{
    int printable;

    printable = ( c >= 0x20 && c < 127 );
    if ( !printable )
    {
        printf("%3d", c);
        return;
    }
    printf("%c", c);
}

/*
* Once the tree gets built, and the CODE table is built, compressing
* the data is a breeze. Each byte is read in, and its corresponding
* Huffman code is sent out.
*/
/*
 * Encode the rest of `input` with the given code table.
 *
 * input:  stream to read bytes from until end of file.
 * output: bit file that receives the codes.
 * codes:  table indexed by symbol (0..END_OF_STREAM).
 *
 * After the last byte the code for END_OF_STREAM is written so that
 * the expander knows where the data stops.
 */
void compress_data( input, output, codes )
FILE *input;
BIT_FILE *output;
CODE *codes;
{
    int c;
    CODE *entry;

    for ( ; ; )
    {
        c = getc( input );
        if ( c == EOF )
            break;
        entry = &codes[c];
        OutputBits( output, (unsigned long) entry->code, entry->code_bits );
    }
    entry = &codes[END_OF_STREAM];
    OutputBits( output, (unsigned long) entry->code, entry->code_bits );
}

/*
* Expanding compressed data is a little harder than the compression
* phase. As each new symbol is decoded, the tree is traversed,
* starting at the root node, reading a bit in, and taking either the
* child_0 or child_1 path. Eventually, the tree winds down to a
* leaf node, and the corresponding symbol is output. If the symbol
* is the END_OF_STREAM symbol, it doesn't get written out, and
* instead the whole process terminates.
*/
/*
 * Decode symbols from `input` until END_OF_STREAM is seen.
 *
 * input:     bit file positioned at the first coded bit.
 * output:    stream that receives the decoded bytes.
 * nodes:     the Huffman tree.
 * root_node: index of the root of the tree.
 *
 * For every symbol the tree is descended from the root, one input bit
 * per step (1 selects child_1, 0 selects child_0), until an index no
 * larger than END_OF_STREAM is reached.  A failed write ends the
 * program through fatal_error().
 */
void expand_data( input, output, nodes, root_node )
BIT_FILE *input;
FILE *output;
NODE *nodes;
int root_node;
{
    int node;

    for ( ; ; )
    {
        node = root_node;
        do
        {
            node = InputBit( input ) ? nodes[node].child_1
                                     : nodes[node].child_0;
        } while ( node > END_OF_STREAM );
        if ( node == END_OF_STREAM )
            return;
        if ( putc( node, output ) != node )
            fatal_error( "Error trying to write byte to output" );
    }
}
/******************************End of HUFF.C***************************/

/***********************Start of MAIN-C.C*****************************/
/*
* This is the driver program used when testing compression algorithms.
* In order to cut back on repetitive code, this version of main is
* used with all of the compression routines. In order to turn it into
* a real program, it needs to have another module that supplies one
* routine and two strings, namely:
*
* void CompressFile( FILE *input, BIT_FILE *output,
* int argc, char *argv[] );
* char *Usage;
* char *CompressionName;
*
* The main() routine supplied here has the job of checking for valid
* input and output files, opening them, and then calling the
* compression routine. If the files are not present, or no arguments
* are supplied, it prints out an error message, which includes the
* Usage string supplied by the compression module. All of the
* routines and strings needed by this routine are defined in the
* main.h header file.
*
* After this is built into a compression program of any sort, the
* program can be called like this:
*
* main-c infile outfile [ options ]
*
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bitio.h"
#include "errhand.h"
#include "main.h"
#ifdef __STDC__

void usage_exit( char *prog_name );
void print_ratios( char *input, char *output);
long file_size( char *name );

#else

void usage_exit();
void print_ratios();
long file_size();

#endif

int main( argc, argv )
int argc;
char *argv[];
{
BIT_FILE *output;
FILE *input;

setbuf( stdout, NULL );
if ( argc < 3 )
  usage_exit( argv[0] );
input = fopen((char*)argv[1], "rb");
if ( input == NULL )
  fatal_error( "Error opening %s for input/n", argv[1] );
output = OpenOutputBitFile( argv[2] );
if ( output == NULL )
  fatal_error( "Error opening %s for output/n", argv[2] );
printf( "/nCompressing %s to %s/n", argv[1], argv[2] );
printf( "Using %s/n", CompressionName );
// argc -= 3;
// argv += 3;
CompressFile( input, output, argc-3, argv+3 );
CloseOutputBitFile( output );
fclose( input );
print_ratios( argv[1], argv[2] );
return 0;
}

/*
* This routine just wants to print out the usage message that is
* called for when the program is run with no parameters. The first
* part of the Usage statement is supposed to be just the program
* name. argv[0] generally holds the fully qualified path name
* of the program being run. I make a half-hearted attempt to strip
* out that path info and file extension before printing it. It should
* get the general idea across.
*/

/*
 * Print the usage message and exit with status 0.
 *
 * prog_name: the program path as received in argv[0].  Everything up
 *            to the last '\\', '/' or ':' is skipped, and the text
 *            from the last '.' of the remaining name onwards is cut
 *            off in place before the name is printed.
 *
 * This function does not return.
 */
void usage_exit( prog_name )
char *prog_name;
{
    char *short_name;
    char *extension;

    short_name = strrchr( prog_name, '\\' );
    if ( short_name == NULL )
        short_name = strrchr( prog_name, '/' );
    if ( short_name == NULL )
        short_name = strrchr( prog_name, ':' );
    if ( short_name != NULL )
        short_name++;
    else
        short_name = prog_name;
    extension = strrchr( short_name, '.' );
    if ( extension != NULL )
        *extension = '\0';      /* terminate the name before the extension */
    printf( "\nUsage: %s %s\n", short_name, Usage );
    exit( 0 );
}

/*
* This routine is used by main to get the size of a file after it has
* been closed. It does all the work, and returns a long. The main
* program gets the file size for the plain text, and the size of the
* compressed file, and prints the ratio.
*/
#ifndef SEEK_END
#define SEEK_END 2
#endif

/*
 * Return the size in bytes of the file called `name`.
 *
 * name: path of the file to measure.
 *
 * The file is opened in binary mode so that the position reported at
 * the end of the file is a byte count.  Returns 0 when the file cannot
 * be opened or its end position cannot be determined.
 */
long file_size( name )
char *name;
{
    long eof_ftell;
    FILE *file;

    file = fopen( name, "rb" );
    if ( file == NULL )
        return ( 0L );
    if ( fseek( file, 0L, SEEK_END ) != 0 )
        eof_ftell = -1L;
    else
        eof_ftell = ftell( file );
    fclose( file );
    /* ftell() reports failure as -1; treat that like a missing file. */
    if ( eof_ftell < 0 )
        return ( 0L );
    return ( eof_ftell );
}

/*
* This routine prints out the compression ratios after the input and
* output files have been closed.
*/
/*
 * Print the sizes of the input and output files and the resulting
 * compression ratio in percent.
 *
 * input:  name of the uncompressed file.
 * output: name of the compressed file.
 *
 * A size of 0 (which is also what file_size() reports for a file it
 * cannot open) is replaced by 1, so the division is always defined.
 */
void print_ratios( input, output )
char *input;
char *output;
{
    long input_size;
    long output_size;
    int ratio;

    input_size = file_size( input );
    if ( input_size == 0 )
        input_size = 1;
    printf( "\nInput bytes:        %ld\n", input_size );
    output_size = file_size( output );
    if ( output_size == 0 )
        output_size = 1;
    printf( "Output bytes:      %ld\n", output_size );
    ratio = 100 - (int)( output_size * 100L / input_size );
    printf( "Compression ratio: %d%%\n", ratio );
}

/*********************** End of MAIN-C.C *************************/

/******************Start of BITIO.h**************************/
#ifndef _BITIO_H
#define _BITIO_H
#include <stdio.h>

/* rack:
   contains the current byte of data either read in from
   the file or waiting to be written out to the file
   mask:
   contains a single bit mask used either to set or clear
   the current output bit or to mask in the current input bit.
*/

/* State of a file that is read or written one bit at a time. */
typedef struct bit_file {
     FILE *file;             /* underlying byte stream */
     unsigned char mask;     /* single-bit mask for the current bit; starts at 0x80 */
     int rack;               /* byte being assembled for output, or last byte read */
     int pacifier_counter;   /* bytes processed so far; drives the progress dots */
} BIT_FILE;

#ifdef __STDC__
/* The routine open input files for bit I/O */
BIT_FILE *OpenInputBitFile( char *name );

/* The routine open output files for bit I/O */
BIT_FILE *OpenOutputBitFile( char *name );

void OutputBit( BIT_FILE *bit_file, int bit );
void OutputBits( BIT_FILE *bit_file, unsigned long code, int count);
int InputBit( BIT_FILE *bit_file );
unsigned long InputBits( BIT_FILE *bit_file, int bit_count );

void CloseInputBitFile( BIT_FILE *bit_file );
void CloseOutputBitFile( BIT_FILE *bit_file );

void FilePrintBinary( FILE *file, unsigned int code, int bits );

#else /* __STDC__ */

BIT_FILE* OpenInputBitFile();
BIT_FILE* OpenOutputBitFile();
void OutputBit();
void OutputBits();
int InputBit();
int InputBit();
unsigned long InputBits();
void CloseInputBitFile();
void CloseOutputBitFile();
void FilePrintBinary();
#endif /* __STDC__ */

#endif /* _BITIO_H */

/**********************End of BITIO.h**********************/

/********************** Start of ERRHAND.H **********************/
#ifndef _ERRHAND_H
#define _ERRHAND_H

#ifdef __STDC__

void fatal_error( char *fmt, ... );

#else /* __STDC__ */

void fatal_error();

#endif /* __STDC__ */

#endif /* _ERRHAND_H */

/********************** End of ERRHAND.H *************************/

/********************** Start of MAIN.H ***********************/

#ifndef _MAIN_H
#define _MAIN_H

/*
 * Entry points supplied by the compression module.
 *
 * The guard macro is __STDC__ (two underscores on each side); the
 * misspelt _STDC_ is never defined, so the full prototypes were never
 * seen by the compiler.  struct bit_file is forward-declared so that
 * the prototypes are valid whether or not bitio.h was included first;
 * BIT_FILE is the typedef name of that same structure.  <stdio.h> must
 * have been included before this header, for FILE.
 */
#ifdef __STDC__
struct bit_file;
void CompressFile( FILE *input, struct bit_file *output, int argc, char *argv[] );
void ExpandFile( struct bit_file *input, FILE *output, int argc, char *argv[] );

#else /* __STDC__ */

void CompressFile();
void ExpandFile();

#endif /* __STDC__ */

extern char *Usage;
extern char *CompressionName;
#endif /* _MAIN_H */

/************************* End of MAIN.H ************************/

/********************************Start of MAIN_E.c******************************/
/* This driver program tests compression algorithms. To cut back on
* repetitive code, this version of main is used with all the expansion
* routines. The main() routine supplied here checks for valid input and
* output files, opens them, then calls the expansion routine.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bitio.h"
#include "errhand.h"
#include "main.h"

#ifdef __STDC__
void usage_exit( char *prog_name );
#else
void usage_exit();
#endif

int main( argc, argv )
int argc;
char *argv[];
{
FILE *output;
BIT_FILE *input;

setbuf( stdout, NULL );
if ( argc < 3 )
  usage_exit( argv[0] );
input = OpenInputBitFile( argv[1] );
if ( input == NULL )
  fatal_error( "Error opening %s for input/n", argv[1] );
output = fopen( argv[2], "wb" );
if ( output == NULL )
  fatal_error( "Error opening %s for output/n", argv[2] );
printf( "/nExpanding %s to %s for output/n", argv[2], argv[1] );
printf( "Using %s/n", CompressionName );
//argc -= 3;
//argv += 3;
ExpandFile( input, output, argc-3, argv+3 );
CloseInputBitFile( input );
fclose( output );
putc('/n', stdout );
return 0;
}

/*
* This routine wants to print out the usage message called for when the
* program is run with no parameters. The first part of the Usage statement
* is supposed to be just the programname. argv[0] generally holds
* the fully qualified path name of the program being run.
*/
void usage_exit( prog_name )
char *prog_name;
{
char *short_name;
char *extension;

学习笔记:

一个字符的熵被定义为它概率的负对数.为了能以位为单位计算一个信息的熵,我们用以2为底的对数来定义: Number of bits = - Log base 2 (probability),而每个信息的熵则是简单地把所有字符的熵加总.

假设,A出现15次,而一个消息一共是39个字符,那么其熵就是-log base 2 (15/39) = 1.38

统计模型:每次读入一个字符就使用该字符出现的概率来编码.

基于字典模型:使用一个编码代替字符串.

Order0静态表需要256字节,而Order1静态表则需要65536(256*256)个字节.

因此,采用自适应的方法来解决“静态表对于与其统计特性不符的输入字符流压缩性能不好”的问题.但是自适应也有缺点,就是自适应需要一个过程,一般要求读入1000个字符后,才能产生出好的压缩比率.而自适应模型有一个优点,就是能够很好地适应局部压缩条件.

在信息论开始时一个最流行的观念就是如果在一个信息中的符号出现概率已知,那么应该存在一个字符的编码方式使得信息的保存仅需更少的空间.

Given the probabilities, a table of codes could be constructed that has several important properties:

• Different codes have different numbers of bits.

• Codes for symbols with low probabilities have more bits, and codes for symbols with high probabilities have fewer bits.

• Though the codes are of different bit lengths, they can be uniquely decoded.

The Shannon-Fano Algorithm

A Shannon-Fano tree is built according to a specification designed to define an effective code table. The actual algorithm is simple:

1. For a given list of symbols, develop a corresponding list of probabilities or frequency counts so that each symbol’s relative frequency of occurrence is known.

2. Sort the lists of symbols according to frequency, with the most frequently occurring symbols at the top and the least common at the bottom.

3. Divide the list into two parts, with the total frequency counts of the upper half being as close to the total of the bottom half as possible.

4. The upper half of the list is assigned the binary digit 0, and the lower half is assigned the digit 1. This means that the codes for the symbols in the first half will all start with 0, and the codes in the second half will all start with 1.

5. Recursively apply the steps 3 and 4 to each of the two halves, subdividing groups and adding bits to the codes until each symbol has become a corresponding code leaf on the tree.

Huffman codes have the unique prefix attribute, which means they can be correctly decoded despite being variable length.

The Shannon-Fano tree is built from the top down, starting by assigning the most significant bits to each code and working down the tree until finished. Huffman codes are built from the bottom up, starting with the leaves of the tree and working progressively closer to the root.

The procedure for building the tree is simple and elegant. The individual symbols are laid out as a string of leaf nodes that are going to be connected by a binary tree. Each node has a weight, which is simply the frequency or probability of the symbol’s appearance. The tree is then built with the following steps:

• The two free nodes with the lowest weights are located.

• A parent node for these two nodes is created. It is assigned a weight equal to the sum of the two child nodes.

• The parent node is added to the list of free nodes, and the two child nodes are removed from the list.

• One of the child nodes is designated as the path taken from the parent node when decoding a 0 bit. The other is arbitrarily set to the 1 bit.

• The previous steps are repeated until only one free node is left. This free node is designated the root of the tree.

To determine the code for a given symbol, we have to walk from the leaf node to the root of the Huffman tree, accumulating new bits as we pass through each parent node. Unfortunately, the bits are returned to us in the reverse order that we want them, which means we have to push the bits onto a stack, then pop them off to generate the code.

The codes have the unique prefix property: since no code is a prefix of another code, a stream of codes can be decoded without ambiguity.

And Huffman was able to prove that this coding method cannot be improved on with any other integral bit-width coding stream.

解决EOF压缩的方法有两种,一种是采用特殊的代码处理这一问题,第二种就是把文件的大小当作压缩数据一起进行压缩.

在哈夫曼编码中,由于压缩和解压的哈夫曼树是相同的,采用缩放并只保存每个字母对应的频率我们就可以在解压中重构哈夫曼树.同时由于每个字母的频率通过缩放后均采用无符号字符的精度保存,所以相对于一个4K的哈夫曼树结构是很省的.另一方面,我们也发现,文件中可能并不全部出现所有的字母,所以我们保存也只需要保存相应的字母即可,其余则频率值为0.这样我们就可以通过以下的结构来保存,首先指出范围,起始的ASCII码,和终止的ASCII码,接着就是在这个范围里面的每个字母对应的频率值.最后字母表以0为终结.所以一张以压缩文件”头部”的字母表可以如下所示:

next free node will be at 257.

There is an extra member in the node structure called saved_count. When a node is taken off the active list by having its count set to zero, the previous count is stored in saved_count.

Nodes with a weight of 0 are considered to be unused and will never again be selected to represent a minimum.

Using the Tree

解压

• Starting at the root node, a single bit at a time is read in by the decoder.

• If the bit is a 0, the next node is the one pointed to by the child_0 index.

• If the bit is a 1, the next node is the one pointed to by the child_1 index.

• If the new node is 256 or less, we have reached a leaf of the tree and can output the corresponding symbol.

• If the symbol was the special end-of-stream symbol, we can exit instead of sending it out.

压缩

这里存在一个问题就是对于每个字符要想得到其压缩码,我们就必须从树的叶子结点开始向上一直找到父亲结点,这可以通过增加一个指向双亲结点的指针实现,然而这还要增加一个栈用于保存在遍历过程中的01编码,然后在到达根结点时,再一次性出栈输出压缩码.这又增加缓冲空间.