# 1st argument: name of input file # 2nd argument: name of output file # 3rd argument: v for verbose output # r for input list is reverse alphabetized (right to left) $True =1; $False = 0; if ( $#ARGV >= 1 ) { $TheFile = $ARGV[0]; $TheMainOutFile = ">" . $ARGV[1]; } else { print "Not enough arguments given to operate; goodbye."; exit; } open (OUTFILE, $TheMainOutFile); open (INFILE, $TheFile) or die "The file $TheFile could not be found"; $True = 1; $False = 0; if ( @ARGV > 2 ) { if ($ARGV[2] =~ /v/ ) { $Verbose = $True; print "\n it's verbose\n"; } else { $Verbose = $False; } if ($ARGV[2] =~ /r/ ) { $Reverse = $True; } else { $Reverse = $False; } } $WordCount=0; %Frequency; $LineCount =0; $Counter = 0; $LongestWordlength =0; while () { chomp ; @Words = split (/\s/); if ($#Words >= 1) { $Word = $Words[1]; # the first is being ignored -- it's presumably a line number. } else { $Word = $Words[0]; } if ($Reverse == $False) { $Word .= "#"; } else { $Word = "#".$Word;} $LineCount = $LineCount + 1; push @AlphabetizedList, $Word; } print "finished reading.\n"; $NumberOfWords = $#AlphabetizedList; print "\nNumber of words: ", $NumberOfWords; $Carat = 0; $MaxLength = 5; $PreviousWord = "#"; for ($i = 0; $i < $NumberOfWords; $i++) { $Word = $AlphabetizedList[$i]; print OUTFILE "\n\n$Word (Previous word: $PreviousWord)"; if ( $Reverse == $False ) { for ($j = 0 ; $j < length ($PreviousWord) && ( $j < length ($Word) ); $j++) { if ( (substr $PreviousWord, $j, 1) ne (substr $Word, $j, 1) ) { $CommonPrefix = substr $Word, 0, $j; last; } } } else { # REVERSE, for suffixes for ($j = 1 ; $j <= length ($PreviousWord) && ( $j <= length ($Word) ); $j++) { if ( LetterFromEnd ($PreviousWord, $j) ne LetterFromEnd ($Word, $j) ) { $CommonSuffix = substr $Word, length ($Word) - $j +1, $j-1; last; } } } #Clear ZelligCount past the length of Common part: if ($Reverse == $False) { while ( scalar (@ZelligCount) > length ($Word) ) { pop @ZelligCount; } }else { while ( scalar (@ZelligCount) > length ($CommonSuffix) ) { shift @ZelligCount; } } if ( $Reverse == $False ) #Normal, forward direction.... { $StartPoint = length ($CommonPrefix)+1; $EndPoint = (length ($Word)) - 1; for ( $j = $EndPoint; $j >= $StartPoint; $j-- ) { $TempCount = 1; $CurrentLetterInThisPosition = substr $Word, $j, 1; $PrefixConsidered = substr $Word,0, $j; if ($Verbose) {print OUTFILE "\n$PrefixConsidered First successor variant: $CurrentLetterInThisPosition" ;} for ( $k = $i+1 ; $k < $NumberOfWords ; $k++ ) { $TempWord = $AlphabetizedList[$k]; if (length $TempWord < $j ) { if ( $Verbose ) { print OUTFILE " \n\t1. After $TempWord, we quit."; } last; } if ( (substr $TempWord, 0, $j) ne (substr $Word, 0, $j) ) { if ( $Verbose ) {print OUTFILE " \n\t2. $TempWord doesn't agree with $Word in $j places; we'll quit.";} last; } if ( $CurrentLetterInThisPosition eq substr $TempWord, $j, 1 ) { next; } else { $CurrentLetterInThisPosition = substr $TempWord, $j, 1; $TempCount++; if ( $Verbose ) {print OUTFILE " \n\t3. After $TempWord, we increment count in column $j to $TempCount."; } } } $ZelligCount[$j] = $TempCount; print OUTFILE " Count: $TempCount " if ($VerboseFlag); } } # end of forward case else { #reverse case: if ($Verbose) {print OUTFILE "\tCommon suffix = $CommonSuffix"; print OUTFILE "; Predecessor count for common suffix: ", "@ZelligCount";} $LengthOfCommonSuffix = length ($CommonSuffix); for ($j = $LengthOfCommonSuffix+2; $j <= length($ Word ); $j++) { $TempCount = 1; $CurrentLetterInThisPosition = LetterFromEnd ($Word , $j); $SuffixConsidered = substr ($Word, length($Word)-$j+1); $p = length($Word) - $j +1; #p is the letter whose pred. variants we are counting if ($Verbose) {print OUTFILE "\n$SuffixConsidered First pred variant: $CurrentLetterInThisPosition" ;} for ( $k = $i+1 ; $k < $NumberOfWords ; $k++ ) { $TempWord = $AlphabetizedList[$k]; if (length $TempWord < $j ) { if ( $Verbose ) { print OUTFILE " \n\t1. After $TempWord, we quit."; } last; } if ( (substr $TempWord, length($TempWord) - $j+1) ne (substr $Word, length($Word) - $j+1 ) ) { if ( $Verbose ) {print OUTFILE " \n\t2. $TempWord doesn't agree with $Word in final $j places; we'll quit.";} last; } if ( $CurrentLetterInThisPosition eq LetterFromEnd( $TempWord, $j) ) { next; } else { $CurrentLetterInThisPosition = LetterFromEnd( $TempWord, $j) ; $TempCount++; if ( $Verbose ) {print OUTFILE "\n\t3. After $TempWord, we increment count in position $p to $TempCount."; print OUTFILE " New letter in this position: $CurrentLetterInThisPosition";} } } unshift @ZelligCount, $TempCount; #if ($Verbose) { print OUTFILE "\nPredecessor count: "; for ( $n = 0; $n < length ($Word)-1; $n++) {print OUTFILE $n,":", $ZelligCount[$n], " " ; } } } } # end of reverse case if ( $Reverse == $False ) { print OUTFILE "\n"; for ( $n = 0; $n < length ($Word)-1; $n++) {print OUTFILE substr ($Word,0,$n+1),":", $ZelligCount[$n+1], " " ; } for ($w = 1; $w < $EndPoint and $w < ( length $Word ) ; $w++) { if ( ( $ZelligCount[$w] < $ZelligCount[$w+1] ) and ( $ZelligCount[$w+1] > $ZelligCount[$w+2] ) ) { $Prefix = substr $Word, 0, $w+1; print OUTFILE "\nBreak after: ", $Prefix; $Prefixes{$Prefix}++; $Suffix = substr $Word, $w+1; print OUTFILE " $Suffix"; $Suffixes{$Suffix}++; } } } #end of forward case else { print OUTFILE "\n"; for ( $n = 0; $n < length ($Word)-1; $n++) {print OUTFILE substr ($Word,$n+1),":", $ZelligCount[$n], " " ; } for ($w = 2; $w < length ($Word) - 2 ; $w++) { if ( ( $ZelligCount[$w] > $ZelligCount[$w+1] ) and ( $ZelligCount[$w] > $ZelligCount[$w-1] ) ) { $Suffix = substr $Word, $w+1; print OUTFILE "\nBreak before: ", $Suffix; $Suffixes{$Suffix}++; } } } #end of reverse case $PreviousWord = $Word; } print OUTFILE "\n\n\nResults:\n"; if ( $Reverse == $False ) { $a=0; @SortedList = sort { $Prefixes{ $b } <=> $Prefixes { $a } } keys(%Prefixes) ; @SortedList2 = sort { $Suffixes{ $b } <=> $Suffixes { $a } } keys(%Suffixes) ; while (@SortedList) { $Found = shift @SortedList; print OUTFILE $a++, "\t", $Found, "\t", $Prefixes{$Found}, "\n"; } $a=0; print OUTFILE "\n\n\nSuffixes:\n\n"; while (@SortedList2) { $Found = shift @SortedList2; print OUTFILE $a++, "\t", $Found, "\t", $Suffixes{$Found}, "\n"; } } #end of forward case else { $a=0; @SortedList = sort { $Suffixes{ $b } <=> $Suffixes { $a } } keys(%Suffixes) ; while (@SortedList) { $Found = shift @SortedList; print OUTFILE $a++, "\t", $Found, "\t", $Suffixes{$Found}, "\n"; } } #end of reverse case print "...all done!"; close OUTFILE; sub Letter () { my $Word = @_[0]; $N= @_[1]; if ($N >= 0) { return substr ($Word, $N, 1); } else { return substr ($Word, length ($Word) + $N, 1 ) ; } } sub LetterFromEnd() { my $Word = @_[0]; $N= @_[1]; if ($N >= 0) { return substr ($Word, length ($Word) - $N , 1); } }