#!/usr/bin/env perl # To get a text version of this file, click on this link #--------------------------------------------------------------------- # file information #--------------------------------------------------------------------- # Filename: bytehisto.pl # Purpose: Generates a byte histogram in CSV format # License: CC BY-NC-SA 4.0 # Original author: OldCoder (Robert Kiraly) # Revision: 180802 #--------------------------------------------------------------------- # remarks #--------------------------------------------------------------------- # 1. This program takes an input file, which may be binary, as a com- # mand-line argument and generates a CSV text file as output. The CSV # file shows number of occurrences of each input-byte value from zero # to 255. # 2. This program is largely standard boilerplate, with modifications, # down to the main routine. # 3. This program is written in Perl 5, but does show roughly how the # same task might be accomplished in 'C'. #--------------------------------------------------------------------- # important note #--------------------------------------------------------------------- # This software is provided on an AS IS basis with ABSOLUTELY NO WAR- # RANTY. The entire risk as to the quality and performance of the # software is with you. Should the software prove defective, you as- # sume the cost of all necessary servicing, repair or correction. In # no event will any of the developers, or any other party, be liable # to anyone for damages arising out of use of the software, or inabil- # ity to use the software. 
#---------------------------------------------------------------------
#                            module setup
#---------------------------------------------------------------------

require 5.16.1;

use strict;
use Carp;
use warnings;

# Trap warnings: any warning is promoted to a fatal error
$SIG{__WARN__} = sub { die @_; };

#---------------------------------------------------------------------
#                           basic constants
#---------------------------------------------------------------------

use constant ZERO  => 0;                # Zero
use constant ONE   => 1;                # One
use constant TWO   => 2;                # Two

use constant FALSE => 0;                # Boolean FALSE
use constant TRUE  => 1;                # Boolean TRUE

#---------------------------------------------------------------------
#                         program parameters
#---------------------------------------------------------------------

my $MAXSIZE = 50000000;                 # Max. file size supported (bytes)
my $CSVFILE = "output.csv";             # Output-file name (or pathname)

#---------------------------------------------------------------------
#                            main routine
#---------------------------------------------------------------------

# Main: reads the file named by the first command-line argument (taken
# from @ARGV), counts how often each byte value 0..255 occurs, and
# writes 256 "byte,count" rows to $CSVFILE, sorted by ascending count
# (ties ordered by ascending byte value).
#
# Arguments: none (the input pathname is shifted off @ARGV).
# Returns:   nothing useful.
# Dies:      with a usage message if no pathname was given; with an
#            "Error: ..." message if the input is not a readable
#            regular file, is larger than $MAXSIZE bytes, or if the
#            output file cannot be created or fully written.

sub Main
{
#---------------------------------------------------------------------
# Initial setup.

    select STDERR; $| = ONE;            # Force STDERR flush on write
    select STDOUT; $| = ONE;            # Force STDOUT flush on write

#---------------------------------------------------------------------
# Get ready to read the data.

                                        # Input-file name or pathname
    my $InputFile = shift (@ARGV);
                                        # Was one specified?
    die "Usage: bytehisto input.dat\n" unless defined $InputFile;

                                        # A reusable error-message string
    my $EM = "Error: File is invalid or inaccessible: $InputFile\n";

                                        # Confirm that it's a readable
                                        # regular file
    die $EM unless -f $InputFile && -r $InputFile;

#---------------------------------------------------------------------
# Read the data.

                                        # Open the input file.  The three-
                                        # argument form is used so that
                                        # the name is taken literally,
                                        # even if it contains leading or
                                        # trailing blanks or characters
                                        # such as '>' or '|'
    open (my $ifd, '<', $InputFile) or die $EM;
    binmode ($ifd);                     # Set binary mode

                                        # Check the file size using the
                                        # open handle,  so that the file
                                        # measured is the file read
    my @stat = stat ($ifd);             # 'C' has a similar file-status
                                        # function (fstat)
    die $EM unless scalar @stat;
    my $InputSize = $stat [7];

    die "Error: File is too large\n"
        unless $InputSize <= $MAXSIZE;

                                        # Read the entire file.  Entire-
                                        # file mode is confined to this
                                        # one read by "local"
    my $data = do { local $/; <$ifd> };
    $data = "" unless defined ($data);
    close ($ifd);                       # Close the input file

#---------------------------------------------------------------------
# Convert to an array of bytes.

# This part is Perl-specific. In 'C', the data would probably be an
# array of bytes to begin with.

    my @bytes = unpack ("C*", $data);

#---------------------------------------------------------------------
# Note regarding data.

# At this point,  @bytes is an array which contains all of the data in
# a form that can be accessed as follows:
#
#     $bytes [0]  # 1st byte as unsigned char
#     $bytes [1]  # 2nd byte as unsigned char
#     etc.

# This program may,  therefore,  be modified easily to produce a range
# of tools such as hex dumpers, editors, and patchers.  The  histogram
# produced below is simply one application.

#---------------------------------------------------------------------
# Build the histogram.

                                        # One counter for each possible
                                        # byte value from zero to 255,
                                        # each starting at zero
    my @counts = (ZERO) x 256;

                                        # Count number of occurrences of
                                        # each byte value
    for my $byte (@bytes) { $counts [$byte]++; }

# The rest of the code in this section is about sorting rows based on
# number of occurrences while retaining both byte values and the num-
# bers in question.  Each entry is carried through the sort as a pair
# [byte value, count].  The byte value is  used as  a tie-breaker  so
# that the  order of rows with equal counts  does not depend  on  the
# sort implementation.

    my @histo =
        map  { "$_->[0],$_->[1]" }
        sort { $a->[1] <=> $b->[1] or $a->[0] <=> $b->[0] }
        map  { [ $_, $counts [$_] ] } (ZERO .. 255);

#---------------------------------------------------------------------
# Display the results as a CSV file.

                                        # Open the output file
    open (my $ofd, '>', $CSVFILE) or
        die "Error: Can't create output file: $!\n$CSVFILE\n";

                                        # Write completed and sorted
                                        # entries
    for my $row (@histo)
    {
        print {$ofd} $row . "\n" or
            die "Error: Can't write output file: $!\n$CSVFILE\n";
    }
                                        # Close the output file. Buffered
                                        # write errors  (e.g.,  disk full)
                                        # only show up here
    close ($ofd) or
        die "Error: Can't write output file: $!\n$CSVFILE\n";

    print "Created $CSVFILE\n";         # Print a status message
    return;
}

#---------------------------------------------------------------------
#                            main program
#---------------------------------------------------------------------

# Run only when executed as a script.  If this file is  loaded by an-
# other program (e.g., a test),  "caller" is true and Main is left for
# that program to call.

unless (caller)
{
    Main();                             # Call the main routine
    exit ZERO;                          # Normal exit
}

1;                                      # True value for "require"