#!/usr/bin/perl
#-----------------------------------------------------------
use utf8;
use strict;
use warnings;
binmode(STDOUT, ":encoding(UTF-8)");
#-----------------------------------------------------------
# Usage:
#   perl ./BAO2.pl src/2021 3210
# The program takes 2 arguments: (1) the name of the directory tree
# (e.g. 2021) holding the RSS feeds of that year, (2) the name of the
# rubric to process (e.g. 3208 for "A la une").
#-----------------------------------------------------------
# Exactly two arguments are required; a usage error exits with a non-zero
# status so that calling shells can detect it.
if (@ARGV != 2) {
    print "Il manque un argument à votre programme....\n";
    exit 1;
}
my $rep = "$ARGV[0]";
my $RUBRIQUE = "$ARGV[1]";
# Make sure the directory name does not end with a "/".
$rep =~ s{/$}{};
# Both output handles are file-level lexicals: the subs below write to them.
open(my $output_txt, ">:encoding(UTF-8)", "corpus-titre-description.txt")
    or die "can't open corpus-titre-description.txt: $!\n";
open(my $output_xml, ">:encoding(UTF-8)", "pre-corpus-titre-description.xml")
    or die "can't open pre-corpus-titre-description.xml: $!\n";
print $output_xml "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<corpus>\n";
#----------------------------------------
parcoursarborescencefichiers($rep);    # recursive walk of the tree
#----------------------------------------
print $output_xml "</corpus>\n";
# Buffered write errors only surface at close time, so check both closes.
close($output_txt) or die "can't close corpus-titre-description.txt: $!\n";
close($output_xml) or die "can't close pre-corpus-titre-description.xml: $!\n";
#----------------------------------------
# 1. annotate corpus-titre-description.txt with udpipe
etiquetageUP();
# 2. annotate pre-corpus-titre-description.xml with treetagger
etiquetageTT();
#----------------------------------------------
# Delete the temporary/intermediate files (missing files are ignored).
unlink(
    "temp.txt",
    "temp2.txt",
    "corpus-titre-description",
    "pre-corpus-titre-description.xml",
    "corpus-titre-description.txt",
);

exit;
#----------------------------------------------
sub etiquetageUP {
    # Runs UDPipe (tokenize + tag + parse, presegmented input) on
    # corpus-titre-description.txt and redirects its output to
    # ./rst/corpus-titre-description.udpipe.
    #
    # General form of the command:
    #   udpipe --tokenize --tag --parse udpipe_model file > result_udpipe
    #
    # Takes no arguments. Returns the raw status reported by system().
    # A failure is reported on STDERR but does not abort the program.
    my $commande = "./udpipe/udpipe-1.2.0-bin/bin-osx/udpipe --tokenize --tag --parse --tokenizer=presegmented ./udpipe/modeles/french-sequoia-ud-2.5-191206.udpipe corpus-titre-description.txt > ./rst/corpus-titre-description.udpipe";
    my $statut = system($commande);
    if ($statut == -1) {
        # system() returns -1 when the command could not be started at all.
        warn "etiquetageUP: cannot run udpipe: $!\n";
    }
    elsif ($statut != 0) {
        warn sprintf("etiquetageUP: udpipe failed (exit value %d, signal %d)\n",
            $statut >> 8, $statut & 127);
    }
    return $statut;
}
#----------------------------------------------
sub etiquetageTT {
    # Runs TreeTagger on pre-corpus-titre-description.xml, redirecting its
    # output to the file corpus-titre-description, then passes that file to
    # treetagger2xml-utf8.pl.
    #
    # Takes no arguments. Both commands are always attempted, in order; a
    # failure is reported on STDERR but does not abort the program.
    # Returns the raw system() status of the second command.
    my @commandes = (
        "./tree-tagger/tree-tagger -lemma -token -no-unknown -sgml ./tree-tagger/french-utf8.par pre-corpus-titre-description.xml > corpus-titre-description",
        "perl ./tree-tagger/treetagger2xml-utf8.pl corpus-titre-description UTF8",
    );
    my $statut;
    foreach my $commande (@commandes) {
        $statut = system($commande);
        if ($statut == -1) {
            # system() returns -1 when the command could not be started at all.
            warn "etiquetageTT: cannot run '$commande': $!\n";
        }
        elsif ($statut != 0) {
            warn sprintf("etiquetageTT: command failed (exit value %d, signal %d): %s\n",
                $statut >> 8, $statut & 127, $commande);
        }
    }
    return $statut;
}
#----------------------------------------------
sub parcoursarborescencefichiers {
    # Recursively walks the directory tree rooted at $path.
    #
    # For every regular file whose full path matches /$RUBRIQUE.+\.xml$/
    # ($RUBRIQUE is interpolated as a regex pattern), the file is slurped and
    # each <item><title>...</title> ... <description>...</description> pair
    # is extracted, cleaned with nettoyage(), written as plain text to
    # $output_txt, tokenised with segmentationTD() and written as XML to
    # $output_xml.
    #
    # Parameter: $path - directory to scan.
    # Dies if $path cannot be opened as a directory. A matching file that
    # cannot be opened is reported on STDERR and skipped.
    my ($path) = @_;

    # Lexical directory handle: a bareword handle would be one global shared
    # by every level of the recursion.
    opendir(my $dir, $path) or die "can't open $path: $!\n";
    my @files = readdir($dir);
    closedir($dir);

    foreach my $file (@files) {
        next if $file =~ /^\.\.?$/;    # skip "." and ".."
        my $chemin = $path."/".$file;

        if (-d $chemin) {
            print "On entre dans le REPERTOIRE : $chemin \n";
            parcoursarborescencefichiers($chemin);    # recurse
            print "On sort du REPERTOIRE :  $chemin \n";
            next;
        }
        next unless -f $chemin;
        next unless $chemin =~ /$RUBRIQUE.+\.xml$/;

        print "Traitement du fichier $chemin \n";
        my $input;
        unless (open($input, "<:encoding(UTF-8)", $chemin)) {
            warn "can't open $chemin: $!\n";
            next;
        }
        # Slurp the whole file; $/ is only undefined inside this do-block.
        my $contenu = do { local $/; <$input> };
        close($input);
        next unless defined $contenu;

        while ($contenu =~ /<item><title>(.+?)<\/title>.+?<description>(.+?)<\/description>/gs) {
            # Copy the captures at once, before any other code runs a regex.
            my ($titre_brut, $description_brute) = ($1, $2);
            my $titre = nettoyage($titre_brut);
            my $description = nettoyage($description_brute);
            print $output_txt "$titre \n";
            print $output_txt "$description \n";
            # Tokenise title and description with the tokenizer shipped with treetagger.
            my ($titreSEG, $descriptionSEG) = segmentationTD($titre, $description);
            print $output_xml "<item><titre>\n$titreSEG\n</titre><description>\n$descriptionSEG\n</description></item>\n";
        }
    }
}
#----------------------------------------------
sub segmentationTD {
    # Tokenises a title and a description with tokenise-utf8.pl.
    #
    # Parameters: $arg1 - title text, $arg2 - description text.
    # Returns the list (tokenised title, tokenised description).
    # The title is processed first, then the description; both go through
    # the same temp.txt / temp2.txt files in the current directory.
    my ($arg1, $arg2) = @_;
    my $titresegmente = _tokenise_texte($arg1);
    my $descriptionsegmente = _tokenise_texte($arg2);
    return $titresegmente, $descriptionsegmente;
}

sub _tokenise_texte {
    # Private helper: writes $texte to temp.txt, runs tokenise-utf8.pl on it
    # with output redirected to temp2.txt, and returns the whole content of
    # temp2.txt as a single string.
    # Dies if a temporary file cannot be written or read, so that stale
    # content from a previous call is never returned. A failing tokenizer is
    # reported on STDERR only.
    my ($texte) = @_;

    # 1. write the text to temp.txt
    open(my $tmp, ">:encoding(UTF-8)", "temp.txt")
        or die "can't write temp.txt: $!\n";
    print $tmp $texte;
    close($tmp) or die "can't close temp.txt: $!\n";

    # 2. tokenise temp.txt into temp2.txt
    my $statut = system("perl ./tree-tagger/tokenise-utf8.pl temp.txt > temp2.txt");
    if ($statut == -1) {
        # system() returns -1 when the command could not be started at all.
        warn "segmentationTD: cannot run tokenise-utf8.pl: $!\n";
    }
    elsif ($statut != 0) {
        warn sprintf("segmentationTD: tokenise-utf8.pl failed (exit value %d, signal %d)\n",
            $statut >> 8, $statut & 127);
    }

    # 3. read the result back; $/ is only undefined inside the do-block, so
    #    the caller's record separator is left untouched.
    open(my $tmp2, "<:encoding(UTF-8)", "temp2.txt")
        or die "can't read temp2.txt: $!\n";
    my $segmente = do { local $/; <$tmp2> };
    close($tmp2);
    return $segmente;
}
#----------------------------------------------
sub nettoyage {
    # Cleans one extracted text fragment.
    # Parameter: the raw text.
    # Returns the text without a leading "<![CDATA[" / trailing "]]>" marker
    # and terminated by exactly one "." (any run of trailing dots is
    # collapsed into a single one).
    my ($brut) = @_;
    my $propre = $brut;
    $propre =~ s/^<!\[CDATA\[//;    # opening CDATA marker
    $propre =~ s/\]\]>$//;          # closing CDATA marker
    $propre =~ s/\.*\z//;           # drop every dot that ends the string
    return $propre . ".";
}
#----------------------------------------------