# Split off punctuation...
# 1st argument: name of input file
# 2nd argument: name of output file
# 3rd argument (optional): list of flags:
#     f            follow word by its count in corpus
#     s            sort by frequency, not alphabetically
#     p		   ignore punctuation
#     t		   threshold -- now set at 5; frequencies below we will not sort or print.
		$Threshold = 5;
#	  r			reverse alphabetization (from end of word);
#     z			include Zipf ranking, where all words of the same frequency have the same ranking
#	  l			compute letter entropy

# delete HTML markings

if ( scalar( @ARGV ) >=  2 )
{ 
            $TheFile = $ARGV[0];
            $TheMainOutFile = ">" . $ARGV[1];
} 

 

else 
{           print  "\n    f    include word count";
			print  "\n    s    sort by frequency";
			print  "\n    p    ignore punctuation";
			print  "\n    t    threshold 5";
			print  "\n    r    reverse alphbetization (from end of word)";
			print  "\n    z    include false Zipf ranking"; 
			print  "\n    Followed by maximum number of words";
			print  "\n Not enough arguments given to operate; goodbye."; 
            exit;
}


$Options = @ARGV[2];
if ( $Options =~ m/f/ ) 
{
           $PrintFreqs = TRUE;
} 
else 
{
           $PrintFreqs = FALSE;
}

$MaxNumberOfWords = @ARGV[3];

open (OUTFILE, $TheMainOutFile);
open (INFILE, $TheFile) or die "The file $TheFile could not be found";

$WordCount=0;
$LineCount =0;
$Counter = 0;
$LongestWordlength =0;

$PreviousLetter = "START";
$Remainder = "";
while (<INFILE>)        
{		
			if ($MaxNumberOfWords and $WordCount > $MaxNumberOfWords ) { last; }

            $TheLine= $_;                               # unnecessarily explicit in real PERL
            chomp ($TheLine); 
            $LineCount =  $LineCount + 1;		

	    $TheLine =~ s/(\S)([.,;!?()'"\]])/$1 $2/g;     # \S finds a non-whitespace character
	    $TheLine =~ s/(\S)([.,;!?()'"\]])/$1 $2/g;     # \S finds a non-whitespace character


	    $TheLine =~ s/([\[\'"(])(\S)/$1 $2/g;
	    $TheLine =~ s/([\[\'"(])(\S)/$1 $2/g;

	    $TheLine =~ s/(\S)(--)/$1 $2/g;		# separate a "--" 


			
            @Words=split(/ /, $TheLine); 
            while ( @Words )     
            { 		
				$Word= lc( shift (@Words) );
			
				if ( length $Remainder > 0 )   
				{
					$Word = $Remainder.$Word;
					$Remainder = ""; 
				
				}
				if (length $Word == 0) {next;}
				if ( $Options =~ m/p/ )                                     #delete punctuation
				{ 
				   if ( $Word =~ m/^[.,;:!?()<>'"[\]]$/ )  { next; }
				   if ( $Word =~ m/--/ ) { next; }
				}

				if ( $Word =~ m/\-$/   ) 
				{	
					$Remainder = $Word;
					chop $Remainder;		
					next; 					   
				}

				#HTML:
				if ( substr ($Word,0,1) eq "<" and substr ($Word,length($Word)-1,1) eq ">" ) {next;}

				$WordCount++;              			  
				$Frequency { $Word }++;                              

				for ($q = 0; $q<length($Word); $q++) 
				{	$l = substr ($Word, $q,1);
					if (ord $l == 13 or ord $l == 13) {print "\n", $Word;}
				}

				if ( $Options =~ /l/ )
				{
					for ( $a = 0; $a < length $Word; $a++)
					{	
						$Letter = substr $Word, $a, 1;					
						$Letters{$PreviousLetter}{$Letter}++;
						$LetterCount{$Letter}++;
						$TotalLetterCount++;
						$PreviousLetter = $Letter;


					}

				}			
			
			}
}


print "finished reading.\n";





if ( $Options =~ m/t/ )                                     # threshold
{ 
    foreach $Word ( keys %Frequency )
    {
	if ( $Frequency{$Word} < $Threshold )  { $Temp{$Word} = 1; }
    }
    foreach $Word ( keys %Temp )
    {
	delete $Frequency{$Word};
    }
}





if ( $Options =~ /s/ ) 
{
              @SortedList = sort {    $Frequency{ $b } <=> $Frequency { $a }    }  keys(%Frequency) ;
}
elsif ( $Options =~ /r/ )
{
			foreach $Word ( keys %Frequency )
			{
				$TempRev{ Reverse($Word) } = $Frequency {$Word } ;
			}


			@SortedList = sort  keys ( %TempRev );
			for ($f = 0; $f <= $#SortedList; $f++)
			{
				$SortedList[$f] = Reverse ( $SortedList[$f] );
			}


}
else 
{
              @SortedList = sort  keys(%Frequency) ;
}






$NumberOfWords =  $#SortedList;
print "\nNumber of words: ", $NumberOfWords;  
print "\nWord Count : $WordCount";
$a = 1;
$ZipfRank = 0;
$PriorFreq = 0;
while (@SortedList)
{
		print OUTFILE $a;
		$Word = shift (@SortedList);
		if ($Options =~ /z/) 
		{ 
			if ($Frequency{$Word} == $PriorFreq) 
			{	print OUTFILE "\t$ZipfRank"; } 
			else 
			{ $ZipfRank++; print OUTFILE "\t$ZipfRank";}
		}
		$PriorFreq = $Frequency{$Word};

        print OUTFILE  "\t",  $Word;        
		      
		if ( $PrintFreqs eq TRUE  ) 
		{ 	
			print OUTFILE  "\t", $Frequency{$Word}, "\t", $Frequency{$Word}/$WordCount, "\t", $Frequency{$Word}* $a/$WordCount, "\t", log ( $Frequency{$Word} ),

		}
        print OUTFILE "\n"; 
		$a++;
}
	
		if ( $Options =~ /l/ )
		{
			
			
			for  $FirstLetter ( keys %Letters ) 
			{
					
					print OUTFILE "\nFollowing stats for: $FirstLetter\n";
					for $SecondLetter ( keys %{$Letters{$FirstLetter}} ) 
					{
						if ($LetterCount {$FirstLetter } == 0) {print "\nNo count: $FirstLetter\n"; next;}
						$Proportion = $Letters{$FirstLetter}{$SecondLetter}/$LetterCount{$FirstLetter};
						print OUTFILE "\n\t$SecondLetter\t$Letters{$FirstLetter}{$SecondLetter}\t$Proportion" ;
						$Entropy{$FirstLetter}  +=  H ( $Proportion) ; 

					}
					print OUTFILE "\nPreceding stats for: $FirstLetter  Entropy: $Entropy{$FirstLetter}\n\n";
					$Entropy += $LetterCount{$FirstLetter} * $Entropy{$FirstLetter} / $TotalLetterCount;
				

			}
			print OUTFILE "\n\n Conditional Entropy: $Entropy";
		}


close OUTFILE;




sub Reverse 
{	my $i;
	$string = @_[0];
	$length = length ($string);
	$out = "";
	for ($i = 0; $i < $length; $i++)	
	{
		substr ($out, $i, 1,substr ($string, $length - $i -1 , 1 ) );
	}
	return $out;
}
 
 sub H
 {
	if ( @_ == 0) {return 0;}
	return -1 * @_[0] * log (@_[0]) / log (2);
 }