# Split off punctuation...
# 1st argument: name of input file
# 2nd argument: name of output file
# 3rd argument (optional): list of flags:
#     f            follow word by its count in corpus
#     s            sort by frequency, not alphabetically
#     p		   ignore punctuation
#     t		   threshold -- now set at 5; frequencies below we will not sort or print.
		$Threshold = 5;

if ( scalar( @ARGV ) >=  2 )
{ 
            $TheFile = $ARGV[0];
            $TheMainOutFile = ">" . $ARGV[1];
} 

 

else 
{           print  "Not enough arguments given to operate; goodbye."; 
            exit;
}


$Options = @ARGV[2];
if ( $Options =~ m/f/ ) 
{
           $PrintFreqs = TRUE;
} 
else 
{
           $PrintFreqs = FALSE;
}


open (OUTFILE, $TheMainOutFile);
open (INFILE, $TheFile) or die "The file $TheFile could not be found";

$WordCount=0;
$LineCount =0;
$Counter = 0;
$LongestWordlength =0;


$Remainder = "";
while (<INFILE>)        
{

            $TheLine= $_;                               # unnecessarily explicit in real PERL
            chomp ($TheLine); 
            $LineCount =  $LineCount + 1;		

	    $TheLine =~ s/(\S)([.,;!?()'"\]])/$1 $2/g;     # \S finds a non-whitespace character
	    $TheLine =~ s/(\S)([.,;!?()'"\]])/$1 $2/g;     # \S finds a non-whitespace character


	    $TheLine =~ s/([\[\'"(])(\S)/$1 $2/g;
	    $TheLine =~ s/([\[\'"(])(\S)/$1 $2/g;

	    $TheLine =~ s/(\S)(--)/$1 $2/g;		# separate a "--" 


			
            @Words=split(/ /, $TheLine); 
 	    while ( @Words )     
            { 		
			$Word= lc( shift (@Words) );

			if ( length $Remainder > 0 )   
			{
			    $Word = $Remainder.$Word;
			    $Remainder = ""; 
			
			}
			if ( $Options =~ m/p/ )                                     #delete punctuation
			{ 
			   if ( $Word =~ m/^[.,;:!?()<>'"[\]]$/ )  { next; }
			   if ( $Word =~ m/--/ ) { next; }
			}

			if ( $Word =~ m/\-$/   ) 
			{	
				$Remainder = $Word;
				chop $Remainder;		
				next; 
			   
			}

			$WordCount++;                

			if (exists $  Frequency {$Word}  ) 
                        {
                            $Frequency { $Word}  ++;                              
                        }
                        else 
                        {                                              
                            $Frequency {$Word } = 1.0;                                 
                        }          


            }
}


print "finished reading.\n";





if ( $Options =~ m/t/ )                                     # threshold
{ 
    foreach $Word ( keys %Frequency )
    {
	if ( $Frequency{$Word} < $Threshold )  { $Temp{$Word} = 1; }
    }
    foreach $Word ( keys %Temp )
    {
	delete $Frequency{$Word};
    }
}





if ( $Options =~ /s/ ) 
{
              @SortedList = sort {    $Frequency{ $b } <=> $Frequency { $a }    }  keys(%Frequency) ;
}
else
{
              @SortedList = sort  keys(%Frequency) ;
}






$NumberOfWords =  $#SortedList;
print "\nNumber of words: ", $NumberOfWords;  
print "\nWord Count : $WordCount";
$a = 1;
while (@SortedList)
{
        $Word = shift (@SortedList);
        print OUTFILE $a++, "\t",  $Word;

              
	if ( $PrintFreqs eq TRUE  ) 
	{ 	print OUTFILE  "\t", $Frequency{$Word}; 		
	}
        print OUTFILE "\n"; 

}

 

close OUTFILE;






