#!/usr/bin/perl

my $CMD="tokenise.pl";
my $VERSION="1.1";
my $MODIFIED="30/6/2004";

###########################################################################
# tokenises text for French TreeTagger
# Achim Stein <achim@ims.uni-stuttgart.de>
###########################################################################

# Separate characters at the beginning of words
my $begin_char='[|{(\/\´\`"»«\202\204\206\207\213\221\222\223\224\225\226\227\233°';

# Separate characters at the end of words
my $end_char=']|}\/\'\`\"),;:\!\?\%»«\202\204\205\206\207\211\213\221\222\223\224\225\226\227\233°|';

# Separate strings at the beginning of words
my $begin_string='[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';

# Separate strings at the end of words
my $end_string='-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-là';

###########################################################################
#                    DO NOT MODIFY FOLLOWING CODE !
###########################################################################

my $HELP="
---------------------------------------------------------------------------
$CMD $VERSION (c) Achim Stein $MODIFIED
---------------------------------------------------------------------------
FUNCTION: segments text for French TreeTagger
SYNTAX:   $CMD [options] <file>
OPTIONS:
  -h        print help screen
  -w        replace whitespace by SGML-Tags (use TreeTagger -sgml option!)
EXAMPLE:
tokenise.pl test.txt | tree-tagger parfile -token -lemma -sgml > test.tgd
";

###########################################################################
# parse command line
###########################################################################

use Getopt::Std;
getopts('hw');

if(defined($opt_h)) {
  print STDERR "$HELP";
  exit(1);
}

###########################################################################
# read the file
###########################################################################

while (<>) {

# delete \r
s/\r//g;

# replace blanks within SGML Tags
    while (s/(<[^<> ]*)[ \t]([^<>]*>)/$1\377$2/g) {};

# replace whitespace by SGML-Tags
if(defined($opt_w)) {
    s/\n/<internal_NL>/g;
    s/\t/<internal_TAB>/g;
    s/ /<internal_BL>/g;
}

# restore SGML Tags
    tr/\377/ /;

# put special characters around SGML Tags for tokenisation
    s/(<[^<>]*>)/\377$1\377/g;
    s/(&[^; \t\n\r]*;)/\377$1\377/g;
    s/^\377//;
    s/\377$//;
    s/\377\377/\377/g;

    @S = split("\377");
    for($i=0; $i<=$#S; $i++) {
	$_ = $S[$i];

	# skip lines with  only SGML tags
	if (/^<.*>$/) {
	    print $_,"\n";
	}
	# normal text
	else {
	    # put spaces at beginning and end
	    $_ = ' '.$_.' ';
	    # put spaces around punctuation
	    s/(\.\.\.)/ ... /g;
	    s/([;\!\?\/])([^ ])/$1 $2/g;
	    s/([.,:])([^ 0-9.])/$1 $2/g;
	    
	    @F = split;
	    for($j=0; $j<=$#F; $j++) {
		my $suffix="";
		$_ = $F[$j];
		# cut off punctuation and brackets
		do {
		    $finished = 1;
		    # preceding brackets etc.
		    if (s/^([$begin_char])(.)/$2/) {
			print $1,"\n";
			$finished = 0;
		    }
		    # following brackets etc.
		    if (s/(.)([$end_char])$/$1/) {
			$suffix = "$2\n$suffix";
			$finished = 0;
		    }
		    # cut off dot after punctuation etc.
		    if (s/([$end_char])\.$//) { 
			$suffix = ".\n$suffix";
			if ($_ eq "") {
			    $_ = $1;
			}
			else {
			    $suffix = "$1\n$suffix";
			}
			$finished = 0; 
		    }
		}
		while (!$finished);
		
		# deal with listed tokens
		if (defined($Token{$_})) {
		    print "$_\n$suffix";
		    next;
		}
		
		# deal with abbrevs like U.S.A.
		if (/^([A-Za-zÀ-ÿ]\.)+$/) {
		    print "$_\n$suffix";
		    next;
		}
		
		# ordinal numbers
		if (/^[0-9]+\.$/) {
		    print "$_\n$suffix";
		    next;
		}
		
		# deal with differnt types of dots
		if (/^(..*)\.$/ && $_ ne "...") {
		    $_ = $1;
		    $suffix = ".\n$suffix";
		    if (defined($Token{$_})) {
			print "$_\n$suffix";
			next;
		    }
		}
		
		# cut  clitics off
		while (s/^($begin_string)(.)/$2/) {
		    print $1,"\n";
		}
		while (s/(.)($end_string)$/$1/) {
		    $suffix = "$2\n$suffix";
		}
		print "$_\n$suffix";
	    }
	}
    }
}
