计算哈夫曼编码长度

2019-11-09 19:38:40

字体：大中小

来源：转载

供稿：网友

本篇文章向大家介绍一个不用构造哈夫曼树的方法来计算哈夫曼编码的长度,这对于较大字符集有极大的优势,因为构造一个树要花费相当大的空间和时间,本算法的时间复杂度为O(nlogn),空间复杂度为O(N);参考文献<深入搜索引擎>第二章程序处理:高亮显示#include <iostream>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <unistd.h>using namespace std;#define BUFF_SIZE 4096#define HASH_SIZE 256char buff[BUFF_SIZE]; //缓冲区 int hash[HASH_SIZE]; //统计每个字符出现的次数int heap[(HASH_SIZE<<1)+2];int pos[HASH_SIZE+1][3];//内部节点 , 0和1记录子结点位置,3节录当前的深度int tlen[HASH_SIZE+1]; //记录每个叶子结点的深度int fd; //文件描述符int sym_num ; //文件中出现的符号数量int SUM(0); //文件中字符总数//初始化程序void init(const char * pathname){ memset(buff , 0 , sizeof(buff)); memset(hash , 0 , sizeof(hash)); memset(heap , 0 , sizeof(heap)); memset(pos , 0 , sizeof(pos)); memset(tlen , 0 , sizeof(tlen)); //打开文件 fd = open(pathname , O_RDONLY); if(fd < 0){ PRintf("init: %s dont exit!/n" , pathname); exit(1); }}//统计文件中每个符号出现的次数void count_symbol(){ lseek(fd , 0 , SEEK_SET); while(read(fd , buff , BUFF_SIZE)){ SUM += strlen(buff); for(int i=strlen(buff) - 1;i>=0;i--) hash[(unsigned int)(buff[i] & 0xFF)]++; } //记录出现的符号数量; for(int i = HASH_SIZE - 1; i >= 0; i--) if(hash[i])sym_num++;}//建立一个最小堆void build_min_heap(){ for(int i=sym_num;i>0;i--){ int p = i >> 1 , j = i; while(p >= 1){ if(heap[heap[p]] > heap[heap[j]]) std::swap(heap[j] , heap[p]); j = p; p >>= 1; } }}//每次取出最小数之后重新调整堆,//h 指推中元素的个数void heap_adjust(int h){ int t = 1 , p , q , l; while(t<h){ p = t<<1; q = p + 1; l = t; if(p <= h && heap[heap[p]] < heap[heap[t]])l = p; if(q <= h && heap[heap[q]] < heap[heap[l]])l = q; if(l == t)break; std::swap(heap[l] , heap[t]); t = l; }}//计算每个字符编码的长度void huff_length(){ int i , j , p , h , m1 , m2; for(i=1 , p=0;i<=sym_num;i++){ while(!hash[p]) p++; heap[sym_num + i] = hash[p]; heap[i] = sym_num + i; p++; } h = sym_num; //对1到n建立最小堆 build_min_heap(); while(h>1){ //取出最小数 m1 = heap[heap[1]]; pos[h][0] = heap[1]; heap[1] = heap[h]; h--; heap_adjust(h); //取出次小数 m2 = heap[heap[1]]; pos[h+1][1] = heap[1]; //最后数和次小数之和放在堆的最后一个位置 heap[h+1] = m1 + m2; //重新指向最新合并的结点 heap[1] = h+1; heap_adjust(h); } //统计编码长度 , 线性时间统计 int ts = sym_num << 1; for(int i=2;i<=sym_num;i++){ if(pos[i][0] <= sym_num) pos[pos[i][0]][2] = pos[i][2] + 1; else tlen[pos[i][0] - sym_num] = pos[i][2] + 1; if(pos[i][1] <= sym_num) pos[pos[i][1]][2] = pos[i][2] + 1; else tlen[pos[i][1] - sym_num] = pos[i][2] + 1; }}int main(){ init("data.dat"); count_symbol(); huff_length(); unsigned int sum = 0; for(int i=1;i<=sym_num;i++) sum += tlen[i] * heap[sym_num + i]; cout<<SUM <<"/t/t"<<sum<<"/t/t"<<sum*1.0/SUM<<endl; return 0;}

上一篇：IP黑名单校验

下一篇：Linux OpenSSH后门的添加与防范