# Find frequency-ranked list of bi-words. Outputs a tab-separated matrix.

# 1st argument: name of input file
# 2nd argument: name of output file# 
# 3thd argument (optional): list of flags:
#     f            follow word by its count in corpus, etc.
#     s            sort by frequency, not alphabetically
#     t 	   threshold: eliminate all words whose count is less than 5
				$WordThreshold = 5;
				$BiwordThreshold = 2;
#     e		   divide biword frequency by expected frequency
#     p            ignore punctuation
#     o            ignore words in the top 100 most frequent list (stop list);
#	  m				make a matrix of cooccurrence of the 100 most common words;not yet implemented
#     w             sort by weighted mutual information 


$MinRank = 0;
$MaxRank = 35;  # we will look at words whose rank is between these values and consider
		# all their bigrams.







$True = 1; $False = 0;

if ( scalar( @ARGV ) >=  2 )
{ 
            $TheFile = $ARGV[0];
            $TheMainOutFile = ">" . $ARGV[1];
} 


else 
{           print "Too few arguments; first argument should be input file, second argument output file. Goodbye!";
   	     exit;
}


$Options = @ARGV[2];
if ( $Options =~ m/f/ ) {   $PrintFreqs = $True; } else {  $PrintFreqs = $False; }
if ( $Options =~ m/e/ ) {   $ExpFreqs = $True;   } else {  $ExpFreqs   = $False; }



open (OUTFILE, $TheMainOutFile);
open (INFILE, $TheFile) or die "The file $TheFile could not be found";

#################################################################################################

$WordCount=0;
$LineCount =0;
$Counter = 0;
$LongestWordlength =0;
$PreviousWord = "";


while (<INFILE>)        
{

	$TheLine= $_;                               # unnecessarily explicit in real PERL
        chomp ($TheLine); 
        $LineCount =  $LineCount + 1;
		
	$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
	$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
    	$TheLine =~ s/(['"(])(\S)/$1 $2/;

        @Words=split(/ /, $TheLine); 
        while (@Words )     
        {  
			$Word= lc(shift (@Words) );
			$Words{$Word} +=1 ;			
        }
}

close INFILE;


if ( $Options =~ m/t/ )				# threshold 
{
	foreach $Word ( keys %Words )
	{	
		if ( $Words {$Word} < $WordThreshold ) 
		{	
			$Temp{$Word} =1;
		}
	}
	foreach $Word ( keys %Temp )
	{
		delete $Words{$Word};
	}
}
if ( $Options =~ m/o/ )				# stop list
{	
	$StopListSize = 100;
	@SortedWords = sort {    $Words{ $b } <=> $Words { $a }    }  keys(%Words) ;	
	for ($n = 0; $n < $StopListSize; $n++)
	{	
		$Word = $SortedWords[$n];
		delete $Words{$Word};
	}	 
}

if ( $Options =~ m/p/ )                                     #delete punctuation
{ 
    foreach $Word ( keys %Words )
    {
	if ( $Word =~ m/^[.,;:!?()<>'"[\]]$/ )  { $Temp{$Word} = 1; }
    }
    foreach $Word ( keys %Temp )
    {
	delete $Words{$Word};
    }
}

@SortedWords = sort {    $Words{ $b } <=> $Words { $a }    }  keys(%Words) ;	
for ( $i = 0; $i < $#SortedWords; $i++ )
{	
	$Rank{ $SortedWords[$i] } = $i
}


open (INFILE, $TheFile) or die "The file $TheFile could not be found";
while (<INFILE>)        
{

	$TheLine= $_;                               # unnecessarily explicit in real PERL
        chomp ($TheLine); 
        $LineCount =  $LineCount + 1;
		
	$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
	$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
    	$TheLine =~ s/(['"(])(\S)/$1 $2/;

        @Words=split(/ /, $TheLine); 
        while (@Words )     
        {   
			$Word= lc(shift (@Words) );
			if ($PreviousWord eq "") 
			{ 
				$PreviousWord = $Word;
				next;
			}		
			
			if ( !exists $Rank{$PreviousWord} or !exists $Rank{$Word} )
			{	
				$PreviousWord = $Word;
				next;
			}
			$WordRank = $Rank{$Word};
			$PreviousRank = $Rank {$PreviousWord};
		
			if ( $WordRank < $MinRank or $WordRank > $MaxRank ) 
			{
				$PreviousWord = $Word;
				next;
			}		
			if ( $PreviousRank < $MinRank or $PreviousRank > $MaxRank ) 
			{
				$PreviousWord = $Word;
				next;
			}		

			

			$Biwords[$PreviousRank][$WordRank]   ++;
			$RowCount[$PreviousRank]++;
			$ColumnCount[$WordRank]++;
			$BiwordCount++;
        
                                 
			$PreviousWord = $Word;
                                
       }
}

print "finished reading.\n";

##############################################################################################


for ( $i = $MinRank; $i <= $MaxRank; $i++)
{

	for ( $j = $MinRank; $j <= $MaxRank; $j++)
	{
		if ( $Biwords[$i][$j] > 0)
		{
			$JointEntropy += ( $Biwords[$i][$j] / $BiwordCount) 
										* log ( $BiwordCount / $Biwords[$i][$j] )  ;
			$MutualInformation += ( $Biwords[$i][$j] / $BiwordCount) 
								* log (  $Biwords[$i][$j] *  $BiwordCount / ($RowCount[$i]*$ColumnCount[$j]) );
		}
	}
}


for ( $i = $MinRank; $i <= $MaxRank; $i++)
{
	if ( $RowCount[$i] > 0)
	{
		$PreviousEntropy +=  ( $RowCount[$i] / $BiwordCount ) * log ($BiwordCount/ $RowCount[$i] );
	}
	if ( $ColumnCount[$i] > 0)
	{
		$FollowingEntropy +=  ( $ColumnCount[$i] / $BiwordCount ) * log ($BiwordCount / $ColumnCount[$i] );
	}

}


print OUTFILE "Joint entropy: $JointEntropy\n";
print "\nJoint entropy: $JointEntropy\n";

print OUTFILE "Mutual information: $MutualInformation\n";
print "Mutual information: $MutualInformation\n";


print OUTFILE "Entropy on preceding word: $PreviousEntropy\n";
print "Entropy on preceding word: $PreviousEntropy\n";

print OUTFILE "Entropy on following word: $FollowingEntropy\n";
print "Entropy on following word: $FollowingEntropy\n";

print OUTFILE "Joint entropy + Mutual information: ", $JointEntropy+ $MutualInformation, "\n";
print "Joint entropy + Mutual information: ", $JointEntropy+ $MutualInformation, "\n";

print OUTFILE "Previous entropy + following entropy:",  $PreviousEntropy+ $FollowingEntropy, "\n";
print "Previous entropy + following entropy: ",  $PreviousEntropy+ $FollowingEntropy, "\n";




for ( $i = $MinRank; $i <= $MaxRank; $i++)
{
	print OUTFILE "\t", $SortedWords[$i];
	
}
for ( $i = $MinRank; $i <= $MaxRank; $i++)
{
	print OUTFILE "\n", $SortedWords[$i], "\t";
	for ( $j = $MinRank; $j <= $MaxRank; $j++)
	{
		print OUTFILE $Biwords[$i][$j], "\t";
	}
}




 
 

close OUTFILE;

 