#!/usr/bin/env perl

# To get a text version of this file, click on this link

#---------------------------------------------------------------------
#                          file information
#---------------------------------------------------------------------

# Filename:         bytehisto.pl
# Purpose:          Generates a byte histogram in CSV format
# License:          CC BY-NC-SA 4.0
# Original author:  OldCoder (Robert Kiraly)
# Revision:         180802

#---------------------------------------------------------------------
#                               remarks
#---------------------------------------------------------------------

# 1. This program takes an input file, which may be binary,  as a com-
# mand-line argument and generates a CSV text file as output.  The CSV
# file shows the number of occurrences of each input-byte value,  from
# zero to 255.

# 2. This program is largely standard boilerplate,  with modifications,
# down to the main routine.

# 3. This program is written in Perl 5,  but does  show roughly how the
# same task might be accomplished in 'C'.

#---------------------------------------------------------------------
#                           important note
#---------------------------------------------------------------------

# This software is provided on an  AS IS basis with ABSOLUTELY NO WAR-
# RANTY.  The  entire risk as to the  quality and  performance of  the
# software is with you.  Should the software prove defective,  you as-
# sume the cost of all necessary  servicing, repair or correction.  In
# no event will any of the developers,  or any other party, be  liable
# to anyone for damages arising out of use of the software, or inabil-
# ity to use the software.

#---------------------------------------------------------------------
#                            module setup
#---------------------------------------------------------------------

require 5.16.1;
use strict;
use Carp;
use warnings;
                                # Promote  every  warning  to a  fatal
                                # error:  the handler re-throws  the
                                # warning text unchanged through "die"
$SIG{__WARN__} = sub
{
    die @_;
};

#---------------------------------------------------------------------
#                           basic constants
#---------------------------------------------------------------------

use constant {
    ZERO  => 0,                 # Numeric zero
    ONE   => 1,                 # Numeric one
    TWO   => 2,                 # Numeric two

    FALSE => 0,                 # Boolean false
    TRUE  => 1,                 # Boolean true
};

#---------------------------------------------------------------------
#                         program parameters
#---------------------------------------------------------------------

my $MAXSIZE = 50_000_000;       # Largest input file accepted (bytes)
my $CSVFILE = 'output.csv';     # Name (or pathname) of the CSV output

#---------------------------------------------------------------------
#                            main routine
#---------------------------------------------------------------------

# Main:  reads the input file named by the first command-line argument,
# counts the occurrences of each byte value from zero to 255, and writes
# the counts to $CSVFILE as "value,count" rows sorted by ascending count
# (ties are ordered by ascending byte value).
#
# Arguments:  none (the input pathname is taken from @ARGV)
# Returns:    nothing
# Dies:       if no input file is named;  if the input is not a  read-
#             able plain file or is larger than $MAXSIZE;  or if read-
#             ing the input or writing the output fails

sub Main
{

#---------------------------------------------------------------------
# Initial setup.

    select STDERR; $| = ONE;    # Force STDERR flush on write
    select STDOUT; $| = ONE;    # Force STDOUT flush on write

#---------------------------------------------------------------------
# Get ready to read the data.

                                # Input-file name or pathname
    my $InputFile = shift (@ARGV);
                                # Was one specified?
    die "Usage: bytehisto input.dat\n"
        unless defined $InputFile;
                                # A reusable error-message string
    my $EM = "Error: File is invalid or inaccessible: $InputFile\n";

                                # Confirm that it's a readable file
    die $EM unless -f $InputFile && -r $InputFile;
                                # Check the file size
    my @stat = stat ($InputFile);
                                # 'C' has  a similar file-status func-
                                # tion
    die $EM unless scalar @stat;
    my $InputSize = $stat [7];  # Element 7 is the size in bytes
    die "Error: File is too large\n" unless $InputSize <= $MAXSIZE;

#---------------------------------------------------------------------
# Read the data.

                                # Open  the input file.  The three-ar-
                                # gument  form is used so that  no part
                                # of the name is  taken as a mode  and
                                # no whitespace is stripped from it
    open (my $ifd, "<", $InputFile) or die $EM;

    binmode ($ifd);             # Set  binary      mode
                                # Read entire file.  "local" restricts
                                # entire-file mode to  this one  read
    my $data = do { local $/; <$ifd> };
       $data = "" unless defined ($data);

    close ($ifd);               # Close the input file

#---------------------------------------------------------------------
# Convert to an array of bytes.

# This  part is  Perl-specific.  In 'C', the data would probably be an
# array of bytes to begin with.

    my @bytes = unpack ("C*", $data);

#---------------------------------------------------------------------
# Note regarding data.

# At this point,  @bytes is an array which contains all of the data in
# a form that can be accessed as follows:
#
# $bytes [0] # 1st byte as unsigned char
# $bytes [1] # 2nd byte as unsigned char
# etc.

# This  program may, therefore, be modified easily to  produce a range
# of tools such as hex dumpers, editors, and patchers.  The  histogram
# produced below is simply one application.

#---------------------------------------------------------------------
# Build the histogram.

                                # One  counter  for each possible byte
                                # value  from zero to 255,  each  one
                                # starting at zero
    my @histo = (ZERO) x 256;

                                # Count  number of occurrences of each
                                # byte value
    for my $byte (@bytes) { $histo [$byte]++; }

# The rest of  the code in this section is about sorting rows based on
# number of occurrences while retaining both  byte values and the num-
# bers in question.  The byte values themselves are sorted, using  the
# counters as the sort key;  equal counts  fall back to byte-value or-
# der so that the output is always the same for the same input.

    my @order = sort
    {
        $histo [$a] <=> $histo [$b] || $a <=> $b;
    }
        (ZERO .. 255);
                                # Build one "value,count" row per byte
                                # value, in sorted order
    my @rows = map { $_ . "," . $histo [$_] } @order;

#---------------------------------------------------------------------
# Display the results as a CSV file.

                                # A reusable error-message prefix
    my $WM = "Error: Can't write output file:";

                                # Open  the output file
    open (my $ofd, ">", $CSVFILE) or
        die "Error: Can't create output file: $!\n$CSVFILE\n";

                                # Write completed and sorted entries
    for my $row (@rows)
    {
        print {$ofd} $row . "\n" or die "$WM $!\n$CSVFILE\n";
    }
                                # Close the output file.  Buffered
                                # write errors surface here, so  the
                                # result is checked
    close ($ofd) or die "$WM $!\n$CSVFILE\n";

    print "Created $CSVFILE\n"; # Print a status message
    return;
}

#---------------------------------------------------------------------
#                            main program
#---------------------------------------------------------------------

Main();                         # Run the main routine
exit (ZERO);                    # Exit with a success status