# Split off punctuation... # 1st argument: name of input file # 2nd argument: name of output file # 3rd argument (optional): list of flags: # f follow word by its count in corpus # s sort by frequency, not alphabetically # p ignore punctuation # t threshold -- now set at 5; frequencies below we will not sort or print. $Threshold = 5; # r reverse alphabetization (from end of word); # z include Zipf ranking, where all words of the same frequency have the same ranking # l compute letter entropy # delete HTML markings if ( scalar( @ARGV ) >= 2 ) { $TheFile = $ARGV[0]; $TheMainOutFile = ">" . $ARGV[1]; } else { print "\n f include word count"; print "\n s sort by frequency"; print "\n p ignore punctuation"; print "\n t threshold 5"; print "\n r reverse alphbetization (from end of word)"; print "\n z include false Zipf ranking"; print "\n Followed by maximum number of words"; print "\n Not enough arguments given to operate; goodbye."; exit; } $Options = @ARGV[2]; if ( $Options =~ m/f/ ) { $PrintFreqs = TRUE; } else { $PrintFreqs = FALSE; } $MaxNumberOfWords = @ARGV[3]; open (OUTFILE, $TheMainOutFile); open (INFILE, $TheFile) or die "The file $TheFile could not be found"; $WordCount=0; $LineCount =0; $Counter = 0; $LongestWordlength =0; $PreviousLetter = "START"; $Remainder = ""; while () { if ($MaxNumberOfWords and $WordCount > $MaxNumberOfWords ) { last; } $TheLine= $_; # unnecessarily explicit in real PERL chomp ($TheLine); $LineCount = $LineCount + 1; $TheLine =~ s/(\S)([.,;!?()'"\]])/$1 $2/g; # \S finds a non-whitespace character $TheLine =~ s/(\S)([.,;!?()'"\]])/$1 $2/g; # \S finds a non-whitespace character $TheLine =~ s/([\[\'"(])(\S)/$1 $2/g; $TheLine =~ s/([\[\'"(])(\S)/$1 $2/g; $TheLine =~ s/(\S)(--)/$1 $2/g; # separate a "--" @Words=split(/ /, $TheLine); while ( @Words ) { $Word= lc( shift (@Words) ); if ( length $Remainder > 0 ) { $Word = $Remainder.$Word; $Remainder = ""; } if (length $Word == 0) {next;} if ( $Options =~ m/p/ ) #delete punctuation { if ( $Word =~ m/^[.,;:!?()<>'"[\]]$/ ) { next; } if ( $Word =~ m/--/ ) { next; } } if ( $Word =~ m/\-$/ ) { $Remainder = $Word; chop $Remainder; next; } #HTML: if ( substr ($Word,0,1) eq "<" and substr ($Word,length($Word)-1,1) eq ">" ) {next;} $WordCount++; $Frequency { $Word }++; for ($q = 0; $q $Frequency { $a } } keys(%Frequency) ; } elsif ( $Options =~ /r/ ) { foreach $Word ( keys %Frequency ) { $TempRev{ Reverse($Word) } = $Frequency {$Word } ; } @SortedList = sort keys ( %TempRev ); for ($f = 0; $f <= $#SortedList; $f++) { $SortedList[$f] = Reverse ( $SortedList[$f] ); } } else { @SortedList = sort keys(%Frequency) ; } $NumberOfWords = $#SortedList; print "\nNumber of words: ", $NumberOfWords; print "\nWord Count : $WordCount"; $a = 1; $ZipfRank = 0; $PriorFreq = 0; while (@SortedList) { print OUTFILE $a; $Word = shift (@SortedList); if ($Options =~ /z/) { if ($Frequency{$Word} == $PriorFreq) { print OUTFILE "\t$ZipfRank"; } else { $ZipfRank++; print OUTFILE "\t$ZipfRank";} } $PriorFreq = $Frequency{$Word}; print OUTFILE "\t", $Word; if ( $PrintFreqs eq TRUE ) { print OUTFILE "\t", $Frequency{$Word}, "\t", $Frequency{$Word}/$WordCount, "\t", $Frequency{$Word}* $a/$WordCount, "\t", log ( $Frequency{$Word} ), } print OUTFILE "\n"; $a++; } if ( $Options =~ /l/ ) { for $FirstLetter ( keys %Letters ) { print OUTFILE "\nFollowing stats for: $FirstLetter\n"; for $SecondLetter ( keys %{$Letters{$FirstLetter}} ) { if ($LetterCount {$FirstLetter } == 0) {print "\nNo count: $FirstLetter\n"; next;} $Proportion = $Letters{$FirstLetter}{$SecondLetter}/$LetterCount{$FirstLetter}; print OUTFILE "\n\t$SecondLetter\t$Letters{$FirstLetter}{$SecondLetter}\t$Proportion" ; $Entropy{$FirstLetter} += H ( $Proportion) ; } print OUTFILE "\nPreceding stats for: $FirstLetter Entropy: $Entropy{$FirstLetter}\n\n"; $Entropy += $LetterCount{$FirstLetter} * $Entropy{$FirstLetter} / $TotalLetterCount; } print OUTFILE "\n\n Conditional Entropy: $Entropy"; } close OUTFILE; sub Reverse { my $i; $string = @_[0]; $length = length ($string); $out = ""; for ($i = 0; $i < $length; $i++) { substr ($out, $i, 1,substr ($string, $length - $i -1 , 1 ) ); } return $out; } sub H { if ( @_ == 0) {return 0;} return -1 * @_[0] * log (@_[0]) / log (2); }