Entropy of Bytes in a File, with C and Ruby code


by Joshua Robinson (joshua.robinson at gmail.com)
Please send me any comments, questions, suggestions, or corrections


The following is a C implementation that calculates the Shannon entropy of the bytes in a user-specified file (or list of files). The entropy is reported in bits per byte, so the maximum possible value is 8, which is reached only when all 256 byte values occur equally often. The code is not guaranteed to run correctly in all cases, as I have only tested it on my own system; if it does not work for you, please let me know!

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

/*
 * Print the Shannon entropy of the byte values in each file named on the
 * command line, one "name:\tentropy" line per file on stdout.
 *
 * The entropy is in bits per byte, so it ranges from 0 to 8. An empty file
 * is reported with entropy 0.
 *
 * Files that cannot be opened or read are reported on stderr and skipped.
 * Returns 1 when no filename was given, 0 otherwise.
 */
int main(int argc, char **argv)
{
  //check command line arguments
  if (argc < 2) {
    //argv[0] may be missing when argc is 0, so fall back to a fixed name
    fprintf(stderr, "Error, program usage: %s [inputfilename]\n",
            argc > 0 ? argv[0] : "entropy");
    return 1;
  }

  //loop over all the command line inputs
  for (int argnum = 1; argnum < argc; argnum++) {
    //open in binary mode so that no newline or end-of-file translation
    //changes the bytes we are counting
    FILE *fp = fopen(argv[argnum], "rb");
    if (fp == NULL) {
      fprintf(stderr, "Error, unable to open input file %s\n", argv[argnum]);
      continue; //gracefully skip over this bad filename
    }

    //wide counters so that files larger than 2 GiB do not overflow
    unsigned long long byte_counters[256] = {0};
    unsigned long long total_bytes = 0;
    int a;  //storage for the current data character

    //count how often each possible byte value (0-255) occurs
    while ((a = fgetc(fp)) != EOF) {
      byte_counters[a]++;
      total_bytes++;
    }

    //fgetc also returns EOF on a read error, so tell the two cases apart
    int read_failed = ferror(fp);
    fclose(fp); //finished with the input file
    if (read_failed) {
      fprintf(stderr, "Error, unable to read input file %s\n", argv[argnum]);
      continue;
    }

    //now calculate the entropy value h = -sum(p_i * log2(p_i))
    double h = 0.0;  //entropy
    for (int i = 0; i < 256; i++) {
      //skip absent byte values: log2(0) is -inf, and for an empty file
      //this also avoids dividing zero by zero
      if (byte_counters[i] == 0)
        continue;
      double p_i = (double)byte_counters[i] / (double)total_bytes;
      h -= p_i * log2(p_i);
    }
    printf("%s:\t%g\n", argv[argnum], h);  //output our entropy value for the file
  }

  return 0;
}

Ruby version

The following is the same code as above, but written in the Ruby language.


#!/usr/bin/ruby
# Ruby code to calculate the entropy of the bytes in one or more files.
#
# Usage: entropy.rb [inputfilenames] ....
#
# Prints one "name:\tentropy" line per file on stdout. The entropy is in bits
# per byte, so it ranges from 0 to 8; an empty file is reported as 0.
# Files that cannot be opened or read are reported on stderr and skipped.

if ARGV.empty?
  warn "Error, program usage: entropy.rb [inputfilenames] ...."
  exit 1  # non-zero status so that callers can detect the usage error
end

ARGV.each do |filename|  # loop over all the command line inputs
  byte_counters = Array.new(256, 0)
  total_bytes = 0

  begin
    # File.open raises an Errno error (it never returns nil) when the file
    # cannot be opened, so the failure has to be rescued rather than tested.
    # "rb" reads the raw bytes, and the block form closes the file even if
    # reading fails part-way through.
    File.open(filename, "rb") do |fp|
      # count how often each possible byte value (0-255) occurs
      fp.each_byte do |a|
        byte_counters[a] += 1
        total_bytes += 1
      end
    end
  rescue SystemCallError, IOError
    warn "Error, unable to open input file #{filename}"
    next  # gracefully skip over this bad filename
  end

  # now calculate the entropy value h = -sum(p_i * log2(p_i))
  h = 0.0  # entropy
  byte_counters.each do |count|
    # skip absent byte values: log2(0) is undefined, and for an empty file
    # this also avoids dividing zero by zero
    next if count.zero?

    p_i = count.to_f / total_bytes
    h -= p_i * Math.log2(p_i)
  end

  printf "%s:\t%g\n", filename, h  # output our entropy value for the file
end