# Find frequency-ranked list of bi-words.
# In this 2nd program, we allow us to divide by expected frequency.
# 1st argument: name of input file
# 2nd argument: name of output file
# 3rd argument (optional): list of flags:

#     s            sort by frequency, not alphabetically
#     e		       sort by frequency divided by expected frequency (overrides s)
#     p            ignore punctuation (not yet implemented)
#	  w			   do not use entire words

$True = 1; $False = 0;

if ( scalar( @ARGV ) >=  2 )
{ 
            $TheFile = $ARGV[0];
            $TheMainOutFile = ">" . $ARGV[1];
} 

 

else 
{           print "Too few arguments; first argument should be input file, second argument output file. Goodbye!";
   	     exit;
}


$Options = @ARGV[2];

if ( $Options =~ m/e/ ) {   $ExpFreqs = $True;   } else {  $ExpFreqs   = $False; }



open (OUTFILE, $TheMainOutFile);
open (INFILE, $TheFile) or die "The file $TheFile could not be found";

#################################################################################################

$WordCount=0;
$LineCount =0;
$Counter = 0;
$LongestWordlength =0;
$PreviousWord = "";

$MaxLength = 5;
$MinLength = 2;
# Array Count[n] counts the number of n-grams of length n.

while (<INFILE>)        
{

	$TheLine= $_;                               # unnecessarily explicit in real PERL
        chomp ($TheLine); 
        $LineCount =  $LineCount + 1;
		
		$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
		$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
    	$TheLine =~ s/(['"(])(\S)/$1 $2/;

        @Words=split(/ /, $TheLine); 
        while ($Word= lc(shift (@Words) ) )     
        {    
		

 
			if ( $Options =~ m/p/ )                                     #delete punctuation
			{ 
				if ( $Word =~ m/^[.,;:!?()<>'"[\]]$/ )  
				{ next; }
			}		
			$WordCount++;     

			$Word = "#".$Word."#";		
			$WordLength = length ($Word);
			
			@Letters = split (//, $Word );
			while ( $Letter = pop (@Letters) )
			{					
				$Counts[1]++;
				if (exists $Letters{$Letter} ) 
				{ $Letters{$Letter}++; 
				} 
				else 
				{ $Letters {$Letter} = 1 ;					
				}
			}			

			for ($a = 0; $a <=  $WordLength-$MinLength ; $a++)  
			{		
				for ($b = $MinLength; $b <= $MaxLength; $b++)
				{
					if ( $Options =~ /w/ ) { if ( $b >= $WordLength - 1) {next;} } 
					
					$Counts[$b]++;

					$Piece = substr $Word, $a, $b;
					if ( exists $Ngrams{$Piece} )
					{	$Ngrams{$Piece}++; 	} 
					else 	
					{ $Ngrams{$Piece} = 1; }
				}
				
			}		                               
        }
}

close INFILE;



print "finished reading.\n";

 

###########################################################################################3

foreach $Letter ( keys %Letters )
{	
	$Letters{$Letter} /= $Counts[1];
}

foreach $Ngram ( keys %Ngrams )
{
	$Ngrams{$Ngram} /= $Counts[length $Ngram];
	$Product = 1;
	
	for ($a = 0; $a < length $Ngram; $a++)	
	{	
		
		$Product *= $Letters{ substr ($Ngram, $a, 1) };
		
	}
	$FreqOverExpected {$Ngram} =    $Ngrams{$Ngram} * log (  $Ngrams{$Ngram} / $Product );

}

 

##############################################################################################?
if ( $Options =~ /e/) 
{
              @SortedList = sort {    $FreqOverExpected{ $b } <=> $FreqOverExpected { $a }    }  keys(%Ngrams) ;
}
elsif ( $Options =~ /s/ ) 
{
              @SortedList = sort {    $Ngrams{ $b } <=> $Ngrams { $a }    }  keys(%Ngrams) ;
}
else
{
              @SortedList = sort  keys(%Ngrams) ;
}


 
$Rank = 0;
while (@SortedList)
{
	$Rank++;
	print OUTFILE $Rank, "\t";
    $Ngram = shift (@SortedList);
	print OUTFILE "$Ngram \t$Ngrams{$Ngram} \t$FreqOverExpected{$Ngram}\n"; 	
	

}

 

close OUTFILE;