#!/home/richmit/bin/verGo.sh ruby
# -*- Mode:Ruby; Coding:us-ascii-unix; fill-column:160 -*-

################################################################################################################################################################
##
# @file      byteAnalysis.rb
# @author    Mitch Richling <http://www.mitchr.me/>
# @Copyright Copyright 1996,2006,2008,2015 by Mitch Richling.  All rights reserved.
# @brief     Statistical summary of bytes on STDIN or in a file.@EOL
# @Std       Ruby 2.0
# @LICENSE   @EOL
#  =============================================================================================================================================================
#  Copyright (c) 1996-2015, Mitchell Jay Richling <https://www.mitchr.me> All rights reserved.
#  
#  Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#  
#  1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer.
#  
#  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#  
#  3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without
#     specific prior written permission.
#  
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
#  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#  =============================================================================================================================================================
#            
################################################################################################################################################################

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Set defaults and process command line arguments
#
# printOptions maps every word in printOptWords to a boolean; all of them start out enabled.  An argument equal to "-word" disables that option and "+word"
# enables it.  Any other argument that starts with one or more dashes followed by "h", "all", or "none" prints the help text and exits, enables every option,
# or disables every option, respectively.  Everything else is collected in fileNames.
#
# The literals true/false are used here: the TRUE/FALSE constants were deprecated in Ruby 2.4 and removed in Ruby 3.2, where referencing them raises NameError.
printOptions  = Hash.new
fileNames     = Array.new
printOptWords = ['counts', 'class', 'color', 'wide', '8bit', 'utf8', 'eol']
printOptWords.each { |w| printOptions[w] = true }
ARGV.each do |cArg|
  # First print option word (if any) that this argument switches on or off
  optWord = printOptWords.find { |w| (cArg == "-#{w}") || (cArg == "+#{w}") }
  if (optWord) then
    # A leading "+" turns the option on, a leading "-" turns it off
    printOptions[optWord] = cArg.start_with?('+')
  elsif (cArg.match(/^-+h/)) then
    puts(' Read data from files or from STDIN, and print out statistical     ')
    puts(' information regarding byte occurrence.                             ')
    puts('                                                                  ')
    puts(' Print options (a - turns it off and a + turns it on) :           ')
    puts('   [+|-]counts Print character count table                        ')
    puts('   [+|-]color  Use color in character count table                 ')
    puts('   [+|-]wide   Use wide format for character count table          ')
    puts('   [+|-]8bit   Print upper 128 counts in the table for ASCII input')
    puts('   [+|-]class  Print character class summary                      ')
    puts('   [+|-]eol    Print EOL summary                                  ')
    puts('   [+|-]utf8   Print UTF-8 count summary                          ')
    puts(' Other options:                                                   ')
    puts('   -all    Set all print options to TRUE                          ')
    puts('   -none   Set all print options to FALSE                         ')
    puts('   -help   Print this message and exit                            ')
    exit
  elsif (cArg.match(/^-+all/)) then
    printOptWords.each { |w| printOptions[w] = true }
  elsif (cArg.match(/^-+none/)) then
    printOptWords.each { |w| printOptions[w] = false }
  else
    # Not an option: treat it as the name of a file to read
    fileNames.push(cArg)
  end
end

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Open the input streams: STDIN when no file names were given, otherwise every named file.
#
# File.open is used instead of Kernel#open so that an argument beginning with "|" is always treated as a file name and is never run as a command.  The streams
# are put in binary mode because this tool counts raw bytes; text mode may translate line endings on platforms that distinguish the two modes.
files = Array.new
if (fileNames.empty?) then
  $stdin.binmode
  files.push($stdin)
else
  files = fileNames.map { |n| File.open(n, 'rb') }
end

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Read the data
#
# Every byte of every input stream is tallied in charCounts (indexed by byte value).  cntCrLf counts the times byte 10 (LF) directly follows byte 13 (CR);
# lastChar is not reset between streams.  When the 'utf8' print option is on, each byte is also classified by its leading bit pattern:
#   utf8PfxCnt1 0xxxxxxx    utf8PfxCnt2 110xxxxx    utf8PfxCnt3 1110xxxx    utf8PfxCnt4 11110xxx
#   utf8PfxCnt5 111110xx    utf8PfxCnt6 1111110x    utf8PfxCntC 10xxxxxx (continuation)
# The patterns are mutually exclusive, so at most one counter is incremented per byte.
charCounts = Array.new
lastChar = nil
cntCrLf = 0
utf8PfxCnt1 = utf8PfxCnt2 = utf8PfxCnt3 = utf8PfxCnt4 = utf8PfxCnt5 = utf8PfxCnt6 = utf8PfxCntC = utf8PfxCntA = 0
lastEOL = 0   # Not read anywhere in the visible script
curByte = 0   # Running count of bytes read
files.each do |inFile|
  inFile.each_byte do |b|
    if (printOptions['utf8']) then
      if    ((b & 0b10000000) == 0b00000000) then
        utf8PfxCnt1 += 1
      elsif ((b & 0b11100000) == 0b11000000) then
        utf8PfxCnt2 += 1
      elsif ((b & 0b11110000) == 0b11100000) then
        utf8PfxCnt3 += 1
      elsif ((b & 0b11111000) == 0b11110000) then
        utf8PfxCnt4 += 1
      elsif ((b & 0b11111100) == 0b11111000) then
        utf8PfxCnt5 += 1
      elsif ((b & 0b11111110) == 0b11111100) then
        utf8PfxCnt6 += 1
      elsif ((b & 0b11000000) == 0b10000000) then
        utf8PfxCntC += 1
      end
    end
    charCounts[b] = (charCounts[b] || 0) + 1
    if ((lastChar == 13) && (b == 10)) then
      cntCrLf += 1
    end
    lastChar = b
    curByte += 1
  end
end
# Continuation bytes implied by the lead bytes seen: an n-byte lead byte is followed by n-1 continuation bytes
utf8PfxCntCexp = utf8PfxCnt2+utf8PfxCnt3*2+utf8PfxCnt4*3+utf8PfxCnt5*4+utf8PfxCnt6*5
# Total of all multi-byte lead and continuation bytes.  This is kept in utf8PfxCntA so that utf8PfxCnt1 keeps the 0xxxxxxx count gathered in the loop above
# instead of being overwritten by this sum.
utf8PfxCntA    = utf8PfxCnt2 + utf8PfxCnt3 + utf8PfxCnt4 + utf8PfxCnt5 + utf8PfxCnt6 + utf8PfxCntC

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Compute some stats...
#
# For each byte class two numbers are accumulated: *Cnt is the total number of bytes seen in the class, and *Unq is the number of distinct byte values in the
# class that occurred at least once.  Along the way charCounts is filled in with 0 for byte values never seen, chIdxCnt collects [byte, count] pairs, and
# pLenMax tracks the widest count when printed in decimal (never less than 4).
highCnt   = highUnq   = 0   # 127..255
chrCnt    = chrUnq    = 0   # Everything
ucCnt     = ucUnq     = 0   # A-Z
lcCnt     = lcUnq     = 0   # a-z
digCnt    = digUnq    = 0   # 0-9
puncCnt   = puncUnq   = 0   # Remaining values in 33..126
wsLowCnt  = wsLowUnq  = 0   # 9..13
spcCnt    = spcUnq    = 0   # 32
nprLowCnt = nprLowUnq = 0   # Remaining values in 0..31
pLenMax  = 4
chIdxCnt = Array.new
(0..255).each do |idx|
  cnt = charCounts[idx] || 0
  charCounts[idx] = cnt
  chIdxCnt.push([idx, cnt])
  pLenMax = [ cnt.to_s.length, pLenMax ].max
  seen = ((cnt > 0) ? 1 : 0)
  # The order of the branches matters: the specific ranges come before the catch-all ranges that contain them
  case idx
  when 9..13
    wsLowCnt += cnt
    wsLowUnq += seen
  when 32
    spcCnt += cnt
    spcUnq += seen
  when 0..31
    nprLowCnt += cnt
    nprLowUnq += seen
  when 97..122
    lcCnt += cnt
    lcUnq += seen
  when 65..90
    ucCnt += cnt
    ucUnq += seen
  when 48..57
    digCnt += cnt
    digUnq += seen
  when 33..126
    puncCnt += cnt
    puncUnq += seen
  else
    highCnt += cnt
    highUnq += seen
  end
  chrCnt += cnt
  chrUnq += seen
end
# Derived totals
wsCnt     = wsLowCnt + spcCnt
wsUnq     = wsLowUnq + spcUnq
anCnt     = ucCnt + lcCnt + digCnt
anUnq     = ucUnq + lcUnq + digUnq
prCnt     = anCnt + puncCnt + wsCnt
prUnq     = anUnq + puncUnq + wsUnq
nonPrtCnt = nprLowCnt + highCnt
nonPrtUnq = nprLowUnq + highUnq
# Printable bytes plus backspace (byte 8)
manPrt    = prCnt + charCounts[8]

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Find the smallest group of chars that make up 50% of the total char count
#
# Byte values are visited from most to least frequent (ties broken by larger byte value first) and added to topList until their combined count reaches the
# target.  The target is computed as a float: integer division would round it down for odd totals and let the loop stop short of 50%.
target  = chrCnt/2.0
curSum  = 0
topList = Hash.new
chIdxCnt.sort_by { |idx, cnt| [-cnt, -idx] }.each do |idx, cnt|
  curSum += cnt
  topList[idx] = 1
  break if (curSum >= target)
end

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Print the char counts
#
# The table is laid out column-major: 8 columns of 32 rows with the 'wide' option, otherwise 4 columns of 64 rows.  Each cell shows the byte value in octal,
# decimal, and hex, a name or glyph for values up to 127, and the count.  With the 'color' option, cells with a non-zero count are highlighted -- one color for
# bytes in topList and another for the rest.
if (printOptions['counts']) then
  # Names for the control characters and space (byte 5 is ENQ); other values up to 126 are shown as themselves
  desc = [ "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", "BS ",
           "HT ", "NL ", "VT ", "NP ", "CR ", "SO ", "SI ", "DLE", "DC1",
           "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", "CAN", "EM ", "SUB",
           "ESC", "FS ", "GS ", "RS ", "US ", "SP " ]
  desc[127] = 'DEL'
  128.upto(255) { |i| desc[i] = 'N/A' }
  0.upto(127) { |i| desc[i] = ( desc[i] ? desc[i] : i.chr ) }

  numRows = numCols = 0
  if (printOptions['wide']) then
    numRows = 32
    numCols = 8
  else
    numRows = 64
    numCols = 4
  end

  # Suppress the display of the non-ASCII part of the table if it is empty
  chrCappa = 255
  if ( !(printOptions['8bit']) || (highCnt == 0)) then
    chrCappa = 127
  end

  # Print titles (columns holding values above 127 have no 'Chr' field)
  0.upto(numCols-1) do |colNum|
    idx = colNum*numRows;
    if (idx <= chrCappa) then
      if (idx <= 127) then
        printf("| %3s %3s %2s %-3s : %#{pLenMax}s ", 'Oct', 'Dec', 'Hx', 'Chr', 'Cnt')
      else
        printf("| %3s %3s %2s : %#{pLenMax}s ", 'Oct', 'Dec', 'Hx', 'Cnt')
      end
    end
  end
  print "|\n";

  # Print table
  0.upto(numRows-1) do |rowNum|
    printf("|");
    0.upto(numCols-1) do |colNum|
      idx = rowNum+colNum*numRows
      if (idx <= chrCappa) then
        # ANSI escape sequences wrapped around the cell; empty when color is off or the count is zero
        cStart = cEnd = ''
        if (printOptions['color'] && (charCounts[idx] > 0)) then
          if (topList.member?(idx)) then
            cStart = "\e[0;43m"
            cEnd = "\e[0m"
          else
            cStart = "\e[0;46m"
            cEnd = "\e[0m"
          end
        end
        if (idx <= 127) then
          printf(" %s%03o %3d %02x %-3s : %#{pLenMax}d%s |", cStart, idx, idx, idx, desc[idx], charCounts[idx], cEnd)
        else
          printf(" %s%03o %3d %02x : %#{pLenMax}d%s |", cStart, idx, idx, idx, charCounts[idx], cEnd)
        end
      end
    end
    print "\n"
  end
  print "\n"
end

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Guess at the line ending mode
#
# eolMode becomes:
#   "NONE"     when neither LF (10) nor CR (13) occurs;
#   "UNIX"     when LFs occur and a tenth of the LF count exceeds the CR count ("-- ALMOST" is appended unless there are no CRs at all);
#   "MacOS 9"  the same test with the roles of CR and LF swapped;
#   "MSDOS"    when CR/LF pairs occur and a tenth of the pair count exceeds both the unpaired LF count and the unpaired CR count ("-- ALMOST" is appended
#              unless every LF and every CR belongs to a pair);
#   "UNKNOWN"  otherwise.
lfCnt   = charCounts[10]
crCnt   = charCounts[13]
eolCnt  = lfCnt + crCnt
eolMode =
  if (eolCnt <= 0) then
    "NONE"
  elsif ((lfCnt > 0) && (lfCnt * 0.1 > crCnt)) then
    ((crCnt == 0) ? "UNIX" : "UNIX -- ALMOST")
  elsif ((crCnt > 0) && (crCnt * 0.1 > lfCnt)) then
    ((lfCnt == 0) ? "MacOS 9" : "MacOS 9 -- ALMOST")
  elsif ((cntCrLf > 0) && (cntCrLf * 0.1 > (lfCnt-cntCrLf)) && (cntCrLf * 0.1 > (crCnt-cntCrLf))) then
    (((lfCnt == crCnt) && (cntCrLf == lfCnt)) ? "MSDOS" : "MSDOS -- ALMOST")
  else
    "UNKNOWN"
  end

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Field widths for the reports: prWid fits the total byte count plus one space; prWidB is the same but never narrower than 9
prWid  = sprintf("%d", chrCnt).length + 1
prWidB = [ prWid, 9 ].max

#---------------------------------------------------------------------------------------------------------------------------------------------------------------
# Print out some stats
#
# Each row is printed only when its count is non-zero, which also guarantees that the divisor used for its percentage (the count of the enclosing class) is
# non-zero.  Percentages are computed in floating point (100.0*...) so that the %5.1f fields show real fractions; integer division would truncate them to ".0".
# The (n/m) figures are the number of distinct byte values seen out of the m values in the class.
if (printOptions['class'] && (chrCnt > 0)) then
  printf("Character Class Breakdown\n")
  (0<chrCnt)            && printf("  Chars .............. %#{prWid}d  Chars (%d/256)                                   \n", chrCnt,                                           chrUnq)
  (0<prCnt)             && printf("    Printable ........ %#{prWid}d  | %5.1f%% --- Printable (%d/100)                 \n", prCnt,           100.0*prCnt/chrCnt,              prUnq)
  (0<puncCnt)           && printf("      Punct .......... %#{prWid}d  |            | %5.1f%% --- Punctuation (%d/32)   \n", puncCnt,         100.0*puncCnt/prCnt,             puncUnq)
  (0<charCounts[40])    && printf("        Par ( ........ %#{prWid}d  |            |            | %5.1f%% %s           \n", charCounts[40],  100.0*charCounts[40]/puncCnt,                     (charCounts[40]>charCounts[41]  ?"Unmatched: #{charCounts[40] -charCounts[41]}" :""))
  (0<charCounts[41])    && printf("        Par ) ........ %#{prWid}d  |            |            | %5.1f%% %s           \n", charCounts[41],  100.0*charCounts[41]/puncCnt,                     (charCounts[41]>charCounts[40]  ?"Unmatched: #{charCounts[41] -charCounts[40]}" :""))
  (0<charCounts[91])    && printf("        Sqr [ ........ %#{prWid}d  |            |            | %5.1f%% %s           \n", charCounts[91],  100.0*charCounts[91]/puncCnt,                     (charCounts[91]>charCounts[93]  ?"Unmatched: #{charCounts[91] -charCounts[93]}" :""))
  (0<charCounts[93])    && printf("        Sqr ] ........ %#{prWid}d  |            |            | %5.1f%% %s           \n", charCounts[93],  100.0*charCounts[93]/puncCnt,                     (charCounts[93]>charCounts[91]  ?"Unmatched: #{charCounts[93] -charCounts[91]}" :""))
  (0<charCounts[123])   && printf("        Cur { ........ %#{prWid}d  |            |            | %5.1f%% %s           \n", charCounts[123], 100.0*charCounts[123]/puncCnt,                    (charCounts[123]>charCounts[125]?"Unmatched: #{charCounts[123]-charCounts[125]}":""))
  (0<charCounts[125])   && printf("        Cur } ........ %#{prWid}d  |            |            | %5.1f%% %s           \n", charCounts[125], 100.0*charCounts[125]/puncCnt,                    (charCounts[125]>charCounts[123]?"Unmatched: #{charCounts[125]-charCounts[123]}":""))
  # 26 upper case + 26 lower case + 10 digits = 62 distinct alphanumeric byte values
  (0<anCnt)             && printf("      Alphanumeric ... %#{prWid}d  |            | %5.1f%% --- Alphanumeric (%d/62)  \n", anCnt,           100.0*anCnt/prCnt,               anUnq)
  (0<ucCnt)             && printf("        Upper ........ %#{prWid}d  |            |            | %5.1f%%              \n", ucCnt,           100.0*ucCnt/anCnt)
  (0<lcCnt)             && printf("        Lower ........ %#{prWid}d  |            |            | %5.1f%%              \n", lcCnt,           100.0*lcCnt/anCnt)
  (0<digCnt)            && printf("        Digit ........ %#{prWid}d  |            |            | %5.1f%%              \n", digCnt,          100.0*digCnt/anCnt)
  (0<wsCnt)             && printf("      Whitespace ..... %#{prWid}d  |            | %5.1f%% --- Whitespace (%d/6)     \n", wsCnt,           100.0*wsCnt/prCnt,               wsUnq)
  (0<spcCnt)            && printf("        Spaces ....... %#{prWid}d  |                         | %5.1f%%              \n", spcCnt,          100.0*spcCnt/wsCnt)
  (0<wsLowCnt)          && printf("        Below 32 ..... %#{prWid}d  |                         | %5.1f%% --- Below 32 \n", wsLowCnt,        100.0*wsLowCnt/wsCnt)
  (0<charCounts[9])     && printf("          H Tabs ..... %#{prWid}d  |                                      | %5.1f%% \n", charCounts[9],   100.0*charCounts[9]/wsLowCnt)
  (0<charCounts[11])    && printf("          V Tabs ..... %#{prWid}d  |                                      | %5.1f%% \n", charCounts[11],  100.0*charCounts[11]/wsLowCnt)
  (0<charCounts[10])    && printf("          Newlines ... %#{prWid}d  |                                      | %5.1f%% \n", charCounts[10],  100.0*charCounts[10]/wsLowCnt)
  (0<charCounts[13])    && printf("          C Returns .. %#{prWid}d  |                                      | %5.1f%% \n", charCounts[13],  100.0*charCounts[13]/wsLowCnt)
  (0<charCounts[12])    && printf("          New Page ... %#{prWid}d  |                                      | %5.1f%% \n", charCounts[12],  100.0*charCounts[12]/wsLowCnt)
  (0<nprLowCnt+highCnt) && printf("    Non-printable .... %#{prWid}d  | %5.1f%% --- Non-printable (%d/156)             \n", nonPrtCnt,       100.0*nonPrtCnt/chrCnt,          nonPrtUnq)
  (0<nprLowCnt)         && printf("      Below 32 ....... %#{prWid}d               | %5.1f%% --- Below 32 (%d/27)      \n", nprLowCnt,       100.0*nprLowCnt/nonPrtCnt,       nprLowUnq)
  (0<charCounts[8])     && printf("        BSP .......... %#{prWid}d               |            | %5.1f%%              \n", charCounts[8],   100.0*charCounts[8]/nprLowCnt)
  (0<highCnt)           && printf("      Above 126 ...... %#{prWid}d               | %5.1f%% --- Above 126 (%d/129)    \n", highCnt,         100.0*highCnt/nonPrtCnt,         highUnq)
  (0<charCounts[127])   && printf("        DEL .......... %#{prWid}d                            | %5.1f%%              \n", charCounts[127], 100.0*charCounts[127]/highCnt)

  printf("Other statistics\n")
  (0<manPrt)            && printf("  Man Printable ..... %5.1f%%\n", 100.0*manPrt/chrCnt)
end

# Line ending report: nothing is printed unless at least one LF (10) or CR (13) byte was seen
if (printOptions['eol'] && (0 < eolCnt)) then
  printf("Line Ending Analysis\n")
  if (0 < charCounts[10]) then
    printf("  LF ............ %d\n", charCounts[10])
  end
  if (0 < charCounts[13]) then
    printf("  CR ............ %d\n", charCounts[13])
  end
  printf("  CR/LF Pairs ... %d\n", cntCrLf)
  printf("  EOL Mode ...... %s\n", eolMode)
end
  
# UTF-8 report: one column per leading bit pattern, in the same order as the counters printed beneath the header (single byte, the five multi-byte lead byte
# forms, then continuation bytes).  The trailing note compares the continuation bytes seen (utf8PfxCntC) with the number implied by the lead bytes
# (utf8PfxCntCexp).
if(printOptions['utf8']) then
  # Only report when at least one multi-byte lead byte or continuation byte was seen
  utf8HighCnt = utf8PfxCnt2 + utf8PfxCnt3 + utf8PfxCnt4 + utf8PfxCnt5 + utf8PfxCnt6 + utf8PfxCntC
  if (0 < utf8HighCnt) then
    printf("UTF-8 Stats: ...... %#{prWidB}s %#{prWidB}s %#{prWidB}s %#{prWidB}s %#{prWidB}s %#{prWidB}s %#{prWidB}s \n",
           "0xxxxxxx", "110xxxxx", "1110xxxx", "11110xxx", "111110xx", "1111110x", "10xxxxxx")
    printf("             ...... %#{prWidB}d %#{prWidB}d %#{prWidB}d %#{prWidB}d %#{prWidB}d %#{prWidB}d %#{prWidB}d %s \n",
           utf8PfxCnt1, utf8PfxCnt2, utf8PfxCnt3, utf8PfxCnt4, utf8PfxCnt5, utf8PfxCnt6, utf8PfxCntC,
           (utf8PfxCntC < utf8PfxCntCexp ? "short by #{utf8PfxCntCexp-utf8PfxCntC}" : (utf8PfxCntC > utf8PfxCntCexp ? "over by #{utf8PfxCntC-utf8PfxCntCexp}" : "correct value for UTF-8 data")))
  end
end