#!c:\Perl\bin\perl.exe
#
# Walk a directory tree, extract the <title> and <description> text from every
# file whose path matches /0,.*\.xml$/, run each text through TreeTagger, and
# write the collected tagger output to SORTIE.xml in the current directory.
#
# Usage: perl <this script> directory-to-walk
#
# Requires, relative to the current directory:
#   ./Treetagger/bin/tree-tagger.exe
#   ./Treetagger/lib/french.par
#   ./Treetagger/treetagger2xml.pl
# Uses toto.tmp, toto.out and toto.out.xml as scratch files.
#
# NOTE(review): this file was restored from a copy that had been collapsed
# onto two lines and had lost most text between angle brackets (the original
# header documentation, apparently describing the output structure, is gone).
# Every place where a runtime string still looks incomplete is marked with a
# NOTE(review) below; compare against the original script before relying on
# the exact output format.
use strict;
use warnings;
use utf8;

#-----------------------------------------------------------
my $rep = defined $ARGV[0] ? $ARGV[0] : '';
die "usage: perl $0 directory-to-walk\n" if $rep eq '';

# Make sure the directory name does not end with a "/".
$rep =~ s/[\/]$//;

# Accumulates everything that will be written to the output file.
my $DUMPFULL1 = "";

#----------------------------------------
my $output1 = "SORTIE.xml";
open(my $fileout, '>', $output1)
    or die "Pb a l'ouverture du fichier $output1";

#----------------------------------------
# Sequence number of the next matching file; used in the closing
# </fichierN> marker and echoed to STDOUT as a progress indicator.
my $i = 1;

parcoursarborescencefichiers($rep);    # recurse!

#----------------------------------------
# NOTE(review): these strings are kept exactly as found. The output file is
# named .xml, so the bare "\n" prints presumably carried an XML declaration
# and wrapper elements that were lost - restore them from the original.
print {$fileout} "\n";
print {$fileout} "\n";
print {$fileout} "Votre nom\n";
print {$fileout} "\n" . $DUMPFULL1 . "\n\n";
print {$fileout} "\n";
# Buffered write errors only surface at close time.
close($fileout) or die "Pb a la fermeture du fichier $output1: $!";
exit;

#----------------------------------------------
# parcoursarborescencefichiers($path)
#
# Recursively visit $path. Sub-directories are descended into; plain files
# whose full path matches /0,.*\.xml$/ are handed to _traite_fichier().
# Dies if $path cannot be opened as a directory. Returns nothing useful.
sub parcoursarborescencefichiers {
    my $path = shift(@_);

    # The whole listing is read and the handle closed before recursing, so
    # only one directory handle is open at any time.
    opendir(my $dir, $path) or die "can't open $path: $!\n";
    my @files = readdir($dir);
    closedir($dir);

    foreach my $file (@files) {
        next if $file =~ /^\.\.?$/;    # skip "." and ".."
        $file = $path . "/" . $file;
        if (-d $file) {
            parcoursarborescencefichiers($file);    # recurse!
        }
        if (-f $file) {
            if ($file =~ /0,.*\.xml$/) {
                _traite_fichier($file);
            }
        }
    }
    return;
}

#----------------------------------------------
# _traite_fichier($file)
#
# Append the entry for one matching file to $DUMPFULL1: the file name, the
# tagged title/description text found on each line, then a closing
# </fichierN> marker. Prints and increments the file counter $i.
sub _traite_fichier {
    my ($file) = @_;

    # NOTE(review): kept exactly as found. A closing </fichierN> is appended
    # below but no opening tag is emitted here, so the opening markup around
    # the file name looks lost - restore it from the original.
    $DUMPFULL1 .= " $file\n";

    # Three-arg open: $file comes from readdir() and must never be parsed
    # as an open mode.
    if (open(my $entree, '<', $file)) {
        while (my $ligne = <$entree>) {
            # NOTE(review): the opening <title> is assumed by symmetry with
            # the <description> pattern below - confirm. Without it,
            # [^<]+ would also capture the "title>" of the opening tag.
            if ($ligne =~ /<title>([^<]+)<\/title>/) {
                my $texte = $1;
                _etiquette_texte($texte);
            }
            if ($ligne =~ /<description>([^<]+)<\/description>/) {
                my $texte = $1;
                _etiquette_texte($texte);
            }
        }
        close($entree);
    }
    else {
        # Unreadable file: report it, but still emit the closing marker and
        # advance the counter so numbering stays the same as before.
        warn "can't open $file: $!\n";
    }

    $DUMPFULL1 .= "</fichier$i>";
    print $i++, "\n";
    return;
}

#----------------------------------------------
# _etiquette_texte($texte)
#
# Clean up $texte, write it one token per line to toto.tmp, run TreeTagger
# and treetagger2xml.pl on it, and append the filtered lines of
# toto.out.xml to $DUMPFULL1.
sub _etiquette_texte {
    my ($texte) = @_;

    # Substitutions are applied in this exact order; the later ones act on
    # the result of the earlier ones.
    $texte =~ s/&#39;/\'/g;
    # NOTE(review): as written this replaces a character with itself; the
    # left-hand side was presumably a character entity - confirm.
    $texte =~ s/é/é/g;
    $texte =~ s/&#34;/\"/g;
    $texte =~ s/&/\'/g;
    $texte =~ s/"/\'/g;
    # NOTE(review): same remark as for the e-acute substitution above.
    $texte =~ s/ê/ê/g;

    # One token per line is the input format used with the -token option.
    my @tokens = split(/[ ,;!?\"\'\.:()]/, $texte);

    my $tmp;
    if (!open($tmp, '>', "toto.tmp")) {
        # Without a fresh toto.tmp the tagger would re-tag stale input.
        warn "can't write toto.tmp: $!\n";
        return;
    }
    foreach my $token (@tokens) {
        print {$tmp} $token, "\n";
    }
    close($tmp) or warn "can't close toto.tmp: $!\n";

    system("./Treetagger/bin/tree-tagger.exe -token -lemma -no-unknown ./Treetagger/lib/french.par toto.tmp toto.out") == 0
        or warn "tree-tagger.exe failed (status $?)\n";
    system("perl ./Treetagger/treetagger2xml.pl toto.out") == 0
        or warn "treetagger2xml.pl failed (status $?)\n";

    my $etiquete;
    if (!open($etiquete, '<', "toto.out.xml")) {
        warn "can't open toto.out.xml: $!\n";
        return;
    }
    while (my $sortie = <$etiquete>) {
        # Drop the XML declaration of the per-text document.
        next if $sortie =~ /xml version/;
        # NOTE(review): kept exactly as found, but / / rejects every line
        # containing a space; it is presumably the remains of a pattern
        # naming a wrapper tag. A third, empty pattern followed it in the
        # source and is omitted: an empty pattern re-applies the last
        # successful regex instead of matching a fixed string. Restore both
        # patterns from the original.
        next if $sortie =~ / /;
        $DUMPFULL1 .= $sortie;
    }
    close($etiquete);
    return;
}
#----------------------------------------------