# Find frequency-ranked list of bi-words. Outputs a tab-separated matrix. # 1st argument: name of input file # 2nd argument: name of output file# # 3thd argument (optional): list of flags: # f follow word by its count in corpus, etc. # s sort by frequency, not alphabetically # t threshold: eliminate all words whose count is less than 5 $WordThreshold = 5; $BiwordThreshold = 2; # e divide biword frequency by expected frequency # p ignore punctuation # o ignore words in the top 100 most frequent list (stop list); # m make a matrix of cooccurrence of the 100 most common words;not yet implemented # w sort by weighted mutual information $MinRank = 0; $MaxRank = 35; # we will look at words whose rank is between these values and consider # all their bigrams. $True = 1; $False = 0; if ( scalar( @ARGV ) >= 2 ) { $TheFile = $ARGV[0]; $TheMainOutFile = ">" . $ARGV[1]; } else { print "Too few arguments; first argument should be input file, second argument output file. Goodbye!"; exit; } $Options = @ARGV[2]; if ( $Options =~ m/f/ ) { $PrintFreqs = $True; } else { $PrintFreqs = $False; } if ( $Options =~ m/e/ ) { $ExpFreqs = $True; } else { $ExpFreqs = $False; } open (OUTFILE, $TheMainOutFile); open (INFILE, $TheFile) or die "The file $TheFile could not be found"; ################################################################################################# $WordCount=0; $LineCount =0; $Counter = 0; $LongestWordlength =0; $PreviousWord = ""; while () { $TheLine= $_; # unnecessarily explicit in real PERL chomp ($TheLine); $LineCount = $LineCount + 1; $TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g; # \S finds a non-whitespace character $TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g; # \S finds a non-whitespace character $TheLine =~ s/(['"(])(\S)/$1 $2/; @Words=split(/ /, $TheLine); while (@Words ) { $Word= lc(shift (@Words) ); $Words{$Word} +=1 ; } } close INFILE; if ( $Options =~ m/t/ ) # threshold { foreach $Word ( keys %Words ) { if ( $Words {$Word} < $WordThreshold ) { $Temp{$Word} =1; } } foreach $Word ( keys %Temp ) { delete $Words{$Word}; } } if ( $Options =~ m/o/ ) # stop list { $StopListSize = 100; @SortedWords = sort { $Words{ $b } <=> $Words { $a } } keys(%Words) ; for ($n = 0; $n < $StopListSize; $n++) { $Word = $SortedWords[$n]; delete $Words{$Word}; } } if ( $Options =~ m/p/ ) #delete punctuation { foreach $Word ( keys %Words ) { if ( $Word =~ m/^[.,;:!?()<>'"[\]]$/ ) { $Temp{$Word} = 1; } } foreach $Word ( keys %Temp ) { delete $Words{$Word}; } } @SortedWords = sort { $Words{ $b } <=> $Words { $a } } keys(%Words) ; for ( $i = 0; $i < $#SortedWords; $i++ ) { $Rank{ $SortedWords[$i] } = $i } open (INFILE, $TheFile) or die "The file $TheFile could not be found"; while () { $TheLine= $_; # unnecessarily explicit in real PERL chomp ($TheLine); $LineCount = $LineCount + 1; $TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g; # \S finds a non-whitespace character $TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g; # \S finds a non-whitespace character $TheLine =~ s/(['"(])(\S)/$1 $2/; @Words=split(/ /, $TheLine); while (@Words ) { $Word= lc(shift (@Words) ); if ($PreviousWord eq "") { $PreviousWord = $Word; next; } if ( !exists $Rank{$PreviousWord} or !exists $Rank{$Word} ) { $PreviousWord = $Word; next; } $WordRank = $Rank{$Word}; $PreviousRank = $Rank {$PreviousWord}; if ( $WordRank < $MinRank or $WordRank > $MaxRank ) { $PreviousWord = $Word; next; } if ( $PreviousRank < $MinRank or $PreviousRank > $MaxRank ) { $PreviousWord = $Word; next; } $Biwords[$PreviousRank][$WordRank] ++; $RowCount[$PreviousRank]++; $ColumnCount[$WordRank]++; $BiwordCount++; $PreviousWord = $Word; } } print "finished reading.\n"; ############################################################################################## for ( $i = $MinRank; $i <= $MaxRank; $i++) { for ( $j = $MinRank; $j <= $MaxRank; $j++) { if ( $Biwords[$i][$j] > 0) { $JointEntropy += ( $Biwords[$i][$j] / $BiwordCount) * log ( $BiwordCount / $Biwords[$i][$j] ) ; $MutualInformation += ( $Biwords[$i][$j] / $BiwordCount) * log ( $Biwords[$i][$j] * $BiwordCount / ($RowCount[$i]*$ColumnCount[$j]) ); } } } for ( $i = $MinRank; $i <= $MaxRank; $i++) { if ( $RowCount[$i] > 0) { $PreviousEntropy += ( $RowCount[$i] / $BiwordCount ) * log ($BiwordCount/ $RowCount[$i] ); } if ( $ColumnCount[$i] > 0) { $FollowingEntropy += ( $ColumnCount[$i] / $BiwordCount ) * log ($BiwordCount / $ColumnCount[$i] ); } } print OUTFILE "Joint entropy: $JointEntropy\n"; print "\nJoint entropy: $JointEntropy\n"; print OUTFILE "Mutual information: $MutualInformation\n"; print "Mutual information: $MutualInformation\n"; print OUTFILE "Entropy on preceding word: $PreviousEntropy\n"; print "Entropy on preceding word: $PreviousEntropy\n"; print OUTFILE "Entropy on following word: $FollowingEntropy\n"; print "Entropy on following word: $FollowingEntropy\n"; print OUTFILE "Joint entropy + Mutual information: ", $JointEntropy+ $MutualInformation, "\n"; print "Joint entropy + Mutual information: ", $JointEntropy+ $MutualInformation, "\n"; print OUTFILE "Previous entropy + following entropy:", $PreviousEntropy+ $FollowingEntropy, "\n"; print "Previous entropy + following entropy: ", $PreviousEntropy+ $FollowingEntropy, "\n"; for ( $i = $MinRank; $i <= $MaxRank; $i++) { print OUTFILE "\t", $SortedWords[$i]; } for ( $i = $MinRank; $i <= $MaxRank; $i++) { print OUTFILE "\n", $SortedWords[$i], "\t"; for ( $j = $MinRank; $j <= $MaxRank; $j++) { print OUTFILE $Biwords[$i][$j], "\t"; } } close OUTFILE;