# Find frequency-ranked list of bi-words.
# In this 2nd program, we allow us to divide by expected frequency.
# 1st argument: name of input file
# 2nd argument: word on left: $Word1
# 3rd argument: word to compare to it: $Word2
# 4rd argument (optional): list of flags:
#     f            follow word by its count in corpus, etc.
#     s            sort by frequency, not alphabetically
#     t 	   threshold: eliminate all words whose count is less than 5
				$WordThreshold = 5;
				$BiwordThreshold = 2;
#     e		   divide biword frequency by expected frequency
#     p            ignore punctuation
#     o            ignore words in the top 100 most frequent list (stop list);
#	  m				make a matrix of cooccurrence of the 100 most common words;not yet implemented
#     w             sort by weighted mutual information 


$True = 1; $False = 0;

if ( scalar( @ARGV ) >=  2 )
{ 
            $TheFile = $ARGV[0];
            $TheMainOutFile = ">" . $ARGV[1];
} 

 

else 
{           print "Too few arguments; first argument should be input file, second argument output file. Goodbye!";
   	     exit;
}


$Options = @ARGV[2];
if ( $Options =~ m/f/ ) {   $PrintFreqs = $True; } else {  $PrintFreqs = $False; }
if ( $Options =~ m/e/ ) {   $ExpFreqs = $True;   } else {  $ExpFreqs   = $False; }



open (OUTFILE, $TheMainOutFile);
open (INFILE, $TheFile) or die "The file $TheFile could not be found";

#################################################################################################

$WordCount=0;
$LineCount =0;
$Counter = 0;
$LongestWordlength =0;
$PreviousWord = "";


while (<INFILE>)        
{

	$TheLine= $_;                               # unnecessarily explicit in real PERL
        chomp ($TheLine); 
        $LineCount =  $LineCount + 1;
		
	$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
	$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
    	$TheLine =~ s/(['"(])(\S)/$1 $2/;

        @Words=split(/ /, $TheLine); 
        while ( @Words )     
        {   
			$Word= lc(shift (@Words) )  
			$WordCount++;     
			$Words{$Word} +=1 ;				                            
        }
}

close INFILE;


if ( $Options =~ m/t/ )				# threshold 
{
	foreach $Word ( keys %Words )
	{	
		if ( $Words {$Word} < $WordThreshold ) 
		{	
			$Temp{$Word} =1;
		}
	}
	foreach $Word ( keys %Temp )
	{
		delete $Words{$Word};
	}
}
if ( $Options =~ m/o/ )				# stop list
{	
	$StopListSize = 100;
	@SortedWords = sort {    $Words{ $b } <=> $Words { $a }    }  keys(%Words) ;	
	for ($n = 0; $n < $StopListSize; $n++)
	{	
		$Word = $SortedWords[$n];
		delete $Words{$Word};
	}	 
}

if ( $Options =~ m/p/ )                                     #delete punctuation
{ 
    foreach $Word ( keys %Words )
    {
	if ( $Word =~ m/^[.,;:!?()<>'"[\]]$/ )  { $Temp{$Word} = 1; }
    }
    foreach $Word ( keys %Temp )
    {
	delete $Words{$Word};
    }
}









open (INFILE, $TheFile) or die "The file $TheFile could not be found";
while (<INFILE>)        
{

	$TheLine= $_;                               # unnecessarily explicit in real PERL
        chomp ($TheLine); 
        $LineCount =  $LineCount + 1;
		
	$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
	$TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g;     # \S finds a non-whitespace character
    	$TheLine =~ s/(['"(])(\S)/$1 $2/;

        @Words=split(/ /, $TheLine); 
        while ( @Words )     
        {    
			$Word= lc(shift (@Words) )
			if ($PreviousWord eq "") 
			{ 
				$PreviousWord = $Word;
				next;
			}		
			
		if ( !exists $Words{$PreviousWord} or !exists $Words{$Word} )
		{
			next;
		}

		$Biword = $PreviousWord . " " . $Word; 
		$Biwords { $Biword } ++;
		$BiwordCount++;
        
                                 
		$PreviousWord = $Word;
                                
       }
}

print "finished reading.\n";


###########################################################################################3

foreach $Word ( keys  %Words  )
{	
        $WordFreqs { $Word } = $Words{ $Word } / $WordCount;
	
}

 

##############################################################################################



if ( $Options =~ /s/ ) 
{
              @SortedList = sort {    $Biwords{ $b } <=> $Biwords { $a }    }  keys(%Biwords) ;
}
elsif ( $Options =~ /w/ ) 
{	
	 for $Word ( keys %Biwords )
	 {
		$WMI{$Word} = $Biwords{$Word}/$BiwordCount  * 
					log ( $Biwords{$Word} * $BiwordCount / ($Words { First ($Word) } * $Words { Second($Word) })  );
			
	 }
	 @SortedList = sort {    $WMI{ $b } <=> $WMI { $a }    }  keys(%Biwords) ;

} else
{
              @SortedList = sort  keys(%Biwords) ;
}


 
$Rank = 0;
while (@SortedList)
{
	$Rank++;
	print OUTFILE $Rank, "\t";
        $Biword = shift (@SortedList);

        print OUTFILE $Biword ;

	if ( $Options =~ /w/ )
	{
		print OUTFILE "\t", $WMI {$Biword};

	}
              
	if ( $PrintFreqs == $True  ) 
	{ 	
		print OUTFILE  "\t", $Biwords{$Biword};
		@Pieces = split / /, $Biword;
		$Word1 = $Pieces[0];
		$Word2 = $Pieces[1];
		print OUTFILE  "\t$Word1: $Words{$Word1}\t$Word2: $Words{$Word2}";
		if ( $ExpFreqs = $True ) { print OUTFILE  "\tWMI: $WMI{$Biword}" };
	}
	print OUTFILE "\n";        

}

 

close OUTFILE;




sub First
{
	my @P = split (/ /, @_[0]);
	
	return @P[0];

}

sub Second
{	
	my @P = split (/ /, @_[0]);
	
	return @P[1];

}