#!/usr/bin/perl my $CMD="tokenise.pl"; my $VERSION="1.1"; my $MODIFIED="30/6/2004"; ########################################################################### # tokenises text for French TreeTagger # Achim Stein ########################################################################### # Separate characters at the beginning of words my $begin_char='[|{(\/\Д\`"ЛЋ\202\204\206\207\213\221\222\223\224\225\226\227\233А'; # Separate characters at the end of words my $end_char=']|}\/\'\`\"),;:\!\?\%ЛЋ\202\204\205\206\207\211\213\221\222\223\224\225\226\227\233А|'; # Separate strings at the beginning of words my $begin_string='[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\''; # Separate strings at the end of words my $end_string='-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mъmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-lр'; ########################################################################### # DO NOT MODIFY FOLLOWING CODE ! ########################################################################### my $HELP=" --------------------------------------------------------------------------- $CMD $VERSION (c) Achim Stein $MODIFIED --------------------------------------------------------------------------- FUNCTION: segments text for French TreeTagger SYNTAX: $CMD [options] OPTIONS: -h print help screen -w replace whitespace by SGML-Tags (use TreeTagger -sgml option!) EXAMPLE: tokenise.pl test.txt | tree-tagger parfile -token -lemma -sgml > test.tgd "; ########################################################################### # parse command line ########################################################################### use Getopt::Std; getopts('hw'); if(defined($opt_h)) { print STDERR "$HELP"; exit(1); } ########################################################################### # read the file ########################################################################### while (<>) { # delete \r s/\r//g; # replace blanks within SGML Tags while (s/(<[^<> ]*)[ \t]([^<>]*>)/$1\377$2/g) {}; # replace whitespace by SGML-Tags if(defined($opt_w)) { s/\n//g; s/\t//g; s/ //g; } # restore SGML Tags tr/\377/ /; # put special characters around SGML Tags for tokenisation s/(<[^<>]*>)/\377$1\377/g; s/(&[^; \t\n\r]*;)/\377$1\377/g; s/^\377//; s/\377$//; s/\377\377/\377/g; @S = split("\377"); for($i=0; $i<=$#S; $i++) { $_ = $S[$i]; # skip lines with only SGML tags if (/^<.*>$/) { print $_,"\n"; } # normal text else { # put spaces at beginning and end $_ = ' '.$_.' '; # put spaces around punctuation s/(\.\.\.)/ ... /g; s/([;\!\?\/])([^ ])/$1 $2/g; s/([.,:])([^ 0-9.])/$1 $2/g; @F = split; for($j=0; $j<=$#F; $j++) { my $suffix=""; $_ = $F[$j]; # cut off punctuation and brackets do { $finished = 1; # preceding brackets etc. if (s/^([$begin_char])(.)/$2/) { print $1,"\n"; $finished = 0; } # following brackets etc. if (s/(.)([$end_char])$/$1/) { $suffix = "$2\n$suffix"; $finished = 0; } # cut off dot after punctuation etc. if (s/([$end_char])\.$//) { $suffix = ".\n$suffix"; if ($_ eq "") { $_ = $1; } else { $suffix = "$1\n$suffix"; } $finished = 0; } } while (!$finished); # deal with listed tokens if (defined($Token{$_})) { print "$_\n$suffix"; next; } # deal with abbrevs like U.S.A. if (/^([A-Za-zР-џ]\.)+$/) { print "$_\n$suffix"; next; } # ordinal numbers if (/^[0-9]+\.$/) { print "$_\n$suffix"; next; } # deal with differnt types of dots if (/^(..*)\.$/ && $_ ne "...") { $_ = $1; $suffix = ".\n$suffix"; if (defined($Token{$_})) { print "$_\n$suffix"; next; } } # cut clitics off while (s/^($begin_string)(.)/$2/) { print $1,"\n"; } while (s/(.)($end_string)$/$1/) { $suffix = "$2\n$suffix"; } print "$_\n$suffix"; } } } }