#/usr/bin/perl
#-----------------------------------------------------------
<<DOC; 
Yagmur Ozturk
usage : perl parcours-arborescence-fichiers repertoire-a-parcourir rubrique
DOC
#-----------------------------------------------------------
use XML::RSS;

#-----------------------------------------------------------
my $rep="$ARGV[0]";
my $rubrique="$ARGV[1]";
# on s'assure que le nom du répertoire ne se termine pas par un "/"
$rep=~ s/[\/]$//;
my $i=0;
my %doublons;
open(OUT, ">:encoding(utf-8)", "sortie-$rubrique-xmlrss.txt");
open(OUTXML, ">:encoding(utf-8)", "sortie-$rubrique-xmlrss.xml");
print OUTXML "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
print OUTXML "<corpus2020>\n";

#----------------------------------------
&parcoursarborescencefichiers($rep);	#recurse!
close OUT;
print OUTXML "</corpus2020>\n";
close OUTXML;
exit;
#----------------------------------------------
sub parcoursarborescencefichiers {
    my $path = shift(@_);
    opendir(DIR, $path) or die "can't open $path: $!\n";
    my @files = readdir(DIR);
    closedir(DIR);
    foreach my $file (@files) {
		next if $file =~ /^\.\.?$/;
		$file = $path."/".$file;
		if (-d $file) {
		    print "position :  $file \n";
			&parcoursarborescencefichiers($file);	#recurse!
		}
		if (-f $file) {
		    if ($file=~/$rubrique.+\.xml$/) {			
				print $i++," Processing of ", $file, "\n";
                print
                "##------------------------------------------##\n";

                #PARSING THE FILE--------xmlrss part 

				my $rss=new XML::RSS;
				eval {$rss->parsefile($file); };
				if( $@ ) {
					$@ =~ s/at \/.*?$//s;               # remove module line number
					print STDERR "\nERROR in '$file':\n$@\n";
				} 


                #EXTRACTION--------------  

				else {
					foreach my $item (@{$rss->{'items'}}) {
						my $description=$item->{'description'};
						my $titre=$item->{'title'};
						my ($titrenettoye,$descriptionnettoye) = &nettoyage($titre,$description);
						if (exists $doublons{$titrenettoye}) {
							$doublons{$titrenettoye}++;
						}
						else {
							$doublons{$titrenettoye}=1;

							print OUT "TITRE  : ", $titrenettoye, "\n";
							print OUT "DESCRIPTION : ", $descriptionnettoye, "\n";
                            print OUT "--------------------------------------------------------------------------------------\n";

							print OUTXML "\t\t<item>\n";
							print OUTXML "\t\t\t<titre>$titre</titre>\n";
							print OUTXML "\t\t\t<description>$description</description>\n";
							print OUTXML "\t\t</item>\n";
						}
					}
				}
						
			}
		}
    }
}

sub nettoyage {

my $tt = $_[0]; 
my $desc = $_[1];

$tt = $tt . "." ;  #add dots at the end of titles (?)

$desc =~s/&/et/g ;  #change et 
$tt =~s/&/et/g ;

$desc=~s/^<;!\[CDATA\[//;  #clean some words 
$desc=~s/\]\]&gt;$//;
$tt=~s/^<;!\[CDATA\[//;
$tt=~s/\]\]&gt;$//;

$desc =~ s/&#38;#39;/'/g;   
$tt =~s/&#38;#39;/'/g;

$desc =~s/&#38;#34;/"/g;
$tt =~s/&#38;#34;/"/g;

$desc =~s/&lt;.+?&gt;//g ;  #get rid of balises 
$tt =~s/&lt;.+?&gt;//g ;

return $tt, $desc;
}

#----------------------------------------------