# Find frequency-ranked list of bi-words. # In this 2nd program, we allow us to divide by expected frequency. # 1st argument: name of input file # 2nd argument: name of output file # 3rd argument (optional): list of flags: # s sort by frequency, not alphabetically # e sort by frequency divided by expected frequency (overrides s) # p ignore punctuation (not yet implemented) # w do not use entire words $True = 1; $False = 0; if ( scalar( @ARGV ) >= 2 ) { $TheFile = $ARGV[0]; $TheMainOutFile = ">" . $ARGV[1]; } else { print "Too few arguments; first argument should be input file, second argument output file. Goodbye!"; exit; } $Options = @ARGV[2]; if ( $Options =~ m/e/ ) { $ExpFreqs = $True; } else { $ExpFreqs = $False; } open (OUTFILE, $TheMainOutFile); open (INFILE, $TheFile) or die "The file $TheFile could not be found"; ################################################################################################# $WordCount=0; $LineCount =0; $Counter = 0; $LongestWordlength =0; $PreviousWord = ""; $MaxLength = 5; $MinLength = 2; # Array Count[n] counts the number of n-grams of length n. while () { $TheLine= $_; # unnecessarily explicit in real PERL chomp ($TheLine); $LineCount = $LineCount + 1; $TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g; # \S finds a non-whitespace character $TheLine =~ s/(\S)([.,;!?()'"])/$1 $2/g; # \S finds a non-whitespace character $TheLine =~ s/(['"(])(\S)/$1 $2/; @Words=split(/ /, $TheLine); while ($Word= lc(shift (@Words) ) ) { if ( $Options =~ m/p/ ) #delete punctuation { if ( $Word =~ m/^[.,;:!?()<>'"[\]]$/ ) { next; } } $WordCount++; $Word = "#".$Word."#"; $WordLength = length ($Word); @Letters = split (//, $Word ); while ( $Letter = pop (@Letters) ) { $Counts[1]++; if (exists $Letters{$Letter} ) { $Letters{$Letter}++; } else { $Letters {$Letter} = 1 ; } } for ($a = 0; $a <= $WordLength-$MinLength ; $a++) { for ($b = $MinLength; $b <= $MaxLength; $b++) { if ( $Options =~ /w/ ) { if ( $b >= $WordLength - 1) {next;} } $Counts[$b]++; $Piece = substr $Word, $a, $b; if ( exists $Ngrams{$Piece} ) { $Ngrams{$Piece}++; } else { $Ngrams{$Piece} = 1; } } } } } close INFILE; print "finished reading.\n"; ###########################################################################################3 foreach $Letter ( keys %Letters ) { $Letters{$Letter} /= $Counts[1]; } foreach $Ngram ( keys %Ngrams ) { $Ngrams{$Ngram} /= $Counts[length $Ngram]; $Product = 1; for ($a = 0; $a < length $Ngram; $a++) { $Product *= $Letters{ substr ($Ngram, $a, 1) }; } $FreqOverExpected {$Ngram} = $Ngrams{$Ngram} * log ( $Ngrams{$Ngram} / $Product ); } ##############################################################################################? if ( $Options =~ /e/) { @SortedList = sort { $FreqOverExpected{ $b } <=> $FreqOverExpected { $a } } keys(%Ngrams) ; } elsif ( $Options =~ /s/ ) { @SortedList = sort { $Ngrams{ $b } <=> $Ngrams { $a } } keys(%Ngrams) ; } else { @SortedList = sort keys(%Ngrams) ; } $Rank = 0; while (@SortedList) { $Rank++; print OUTFILE $Rank, "\t"; $Ngram = shift (@SortedList); print OUTFILE "$Ngram \t$Ngrams{$Ngram} \t$FreqOverExpected{$Ngram}\n"; } close OUTFILE;