Entropy of Bytes in a File, with C and Ruby code
by Joshua Robinson (joshua.robinson at gmail.com)
Please send me any comments, questions, suggestions, or corrections
The following is a C implementation that calculates the entropy of the bytes in a user-specified file (or list of files). The entropy is reported in bits per byte, so the maximum possible value is 8 (reached when all 256 byte values occur equally often). The code is not guaranteed to run correctly in all cases, as I have only tested it on my own system; if it does not work for you, please let me know!
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/*
 * Report the Shannon entropy, in bits per byte, of every file named on the
 * command line. One "name:<TAB>entropy" line is written to stdout per file.
 *
 * Files that cannot be opened or read are reported on stderr and skipped.
 * Exits with status 1 when no filename is given; otherwise returns 0.
 */
int main(int argc, char **argv)
{
    //check command line arguments
    if (argc < 2) {
        fprintf(stderr, "Error, program usage: %s [inputfilename]\n", argv[0]);
        exit(1);
    }

    //loop over all the command line inputs
    for (int argnum = 1; argnum < argc; argnum++) {
        //open in binary mode so that no byte is translated or treated as an
        //end-of-file marker on platforms that distinguish text from binary
        FILE *fp = fopen(argv[argnum], "rb");
        if (fp == NULL) {
            //name the file that actually failed, not always the first one
            fprintf(stderr, "Error, unable to open input file %s\n", argv[argnum]);
            continue; //gracefully skip over this bad filename
        }

        //occurrence count of each possible byte value (0-255); the counters
        //are wide so that files larger than INT_MAX bytes do not overflow
        unsigned long long byte_counters[256] = {0};
        unsigned long long total_bytes = 0;
        int a; //storage for the current data character
        while ((a = fgetc(fp)) != EOF) {
            byte_counters[a]++;
            total_bytes++;
        }

        //fgetc also returns EOF on a read error, so tell the two cases apart
        int read_failed = ferror(fp);
        fclose(fp); //finished with the input file
        if (read_failed) {
            fprintf(stderr, "Error, unable to read input file %s\n", argv[argnum]);
            continue;
        }

        //now calculate the entropy value: h = -sum(p_i * log2(p_i))
        double h = 0.0;
        for (int i = 0; i < 256; i++) {
            //bytes that never occur contribute nothing; skipping them also
            //avoids log2(0) and the 0/0 division for an empty file
            if (byte_counters[i] == 0)
                continue;
            double p_i = (double)byte_counters[i] / (double)total_bytes;
            h -= p_i * log2(p_i);
        }
        printf("%s:\t%g\n", argv[argnum], h); //output our entropy value for the file
    }
    return 0;
}
Ruby version
The following is the same code as above, but written in the Ruby language.
#!/usr/bin/ruby
# Ruby code to calculate the entropy of bytes in a file.
#
# For every filename given on the command line, prints one
# "name:<TAB>entropy" line to stdout, where entropy is in bits per byte.
# Files that cannot be opened or read are reported on stderr and skipped.
# Exits with status 1 when no filename is given.

if ARGV.empty?
  $stderr.printf("Error, program usage: entropy.rb [inputfilenames] ....\n")
  exit 1 # a usage error is a failure, so do not report success to the shell
end

ARGV.each do |filename| # loop over all the command line inputs
  # File.new raises a SystemCallError subclass (e.g. Errno::ENOENT) on failure
  # rather than returning nil, so the error has to be rescued, not tested for.
  # Binary mode keeps every byte untouched on all platforms.
  begin
    fp = File.new(filename, 'rb')
  rescue SystemCallError
    $stderr.printf("Error, unable to open input file %s\n", filename)
    next # gracefully skip over this bad filename
  end

  # occurrence count of each possible byte value (0-255)
  byte_counters = Array.new(256, 0)
  total_bytes = 0
  begin
    fp.each_byte do |byte|
      byte_counters[byte] += 1
      total_bytes += 1
    end
  rescue SystemCallError, IOError
    # e.g. the name refers to a directory, which opens but cannot be read
    $stderr.printf("Error, unable to read input file %s\n", filename)
    next
  ensure
    fp.close # finished with the input file, whether or not reading succeeded
  end

  # now calculate the entropy value: h = -sum(p_i * log2(p_i))
  h = 0.0
  byte_counters.each do |count|
    # bytes that never occur contribute nothing; skipping them also avoids
    # log2(0) and the 0/0 division for an empty file
    next if count.zero?

    p_i = count.to_f / total_bytes
    h -= p_i * Math.log2(p_i)
  end
  printf("%s:\t%g\n", filename, h) # output our entropy value for the file
end