#!/usr/bin/perl
<<DOC;
-------------------------------------------------------------------------------
Nom : ANDREA F. MONACO

BAO1_regex.pl -- nettoie et extrait le contenu textuel des balises <title> et 
<description> des fichiers XML-RSS en entrée et produit deux sorties: 

	- un fichier texte brut
	- un fichier XML restructuré

Rubriques traitées ($nom_rubrique): 'france', 'media'
Mode d'emploi : perl BAO1_regex.pl chemin_dossier nom_rubrique
-------------------------------------------------------------------------------
DOC
use strict;

my $folder = "$ARGV[0]";
my $rubrique = "$ARGV[1]";
# -----------------------------------------------------------------------------
# PREPARING ARGUMENTS
my $nom_rubrique = $rubrique;
my %rubrique_conv = ("france"=>"3224", "media"=>"3236");
my $rubrique = $rubrique_conv{"$rubrique"};
# on s'assure que le répertoire ne se termine pas par "/"
$folder=~ s/[\/]$//;
# -----------------------------------------------------------------------------
my %doublons;
my $i=1;
# -----------------------------------------------------------------------------
# OPENING FILES FOR WRITING
open my $txt, ">:encoding(utf-8)", "./sorties/txt/bao1_regex_$folder-$nom_rubrique.txt";
open my $xml, ">:encoding(utf-8)", "./sorties/xml/bao1_regex_$folder-$nom_rubrique.xml";
# -----------------------------------------------------------------------------
# PRINTING XML HEADERS
print $xml "<?xml version='1.0' encoding='utf-8'?>\n";
print $xml "<file>\n";
print $xml "\t<name>Andrea F. Monaco</name>\n";
# -----------------------------------------------------------------------------
# RETRIEVING AND PROCESSING XML FILES
&finding_files($folder);
# -----------------------------------------------------------------------------
# CLOSING FILES
print $xml "</file>\n";
close $xml;
close $txt;
exit;
# -----------------------------------------------------------------------------
# RETRIEVING AND PROCESSING XML FILES
sub finding_files {
	# annee : $folder
    my $path = shift(@_);
    # on récupère la liste des fichiers de $folder
    opendir(DIR, $path) or die "can't open $path: $!\n";
    my @files = readdir(DIR);
    closedir(DIR);
    # on ordonne la liste de fichiers
    foreach my $file (sort { $a <=> $b } @files) {
		next if $file =~ /^\.\.?$/;
		$file = $path."/".$file;
		# /mois/jour/horaire...
		if (-d $file) {
		    print "$file\t↵\n";
			&finding_files($file);
		}
		# ...les fichiers ont été trouvé !
		if (-f $file) {
		    if ($file =~ /$rubrique.+\.xml$/) {
		    	print $xml "\t<fileid>$file</fileid>\n";	
				print $i++," : $file \n";
				# --------------------------------------------------------------
				# REGEX VERSION (BAO1)
				open my $rss_in, "<:encoding(utf-8)", $file;
				my $texte = "";
				while (my $line = <$rss_in>) {
					chomp $line;
					# suppression du Carriage Return \r
					$line =~ s/\r//g;
					$texte = $texte . $line ;
				}
				close $rss_in;
				my $count = 1;
				while ($texte =~ /<item>.*?<title>([^<]*)<\/title>.*?<description>([^<]*)<\/description>.*?<\/item>/g){
					my $title = "$1";
					my $description = "$2";
					if (!exists $doublons{$title}) {
						$doublons{$title}=1;
						my($title, $description) = &cleaning($title, $description);
						print $txt "#$title\n";
						print $txt "$description\n\n";
						my($title, $description) = &xml_entities($title, $description);
						print $xml "\t<item n='$count'>\n" ;
						print $xml "\t\t<title>$title</title>\n" ;
						print $xml "\t\t<description>$description</description>\n";
						print $xml "\t</item>\n";
						$count++;	
					}
				}		
			}
		}
    }
} # finding_files
# -----------------------------------------------------------------------------
# CLEANING <TITLE> AND <DESCRIPTION> TEXT CONTENT
sub cleaning {

	my ($title, $description) = @_;

	# cleaning <title>
	$title= "$title".".";
	$title =~ s/ +\./\./g;
	$title =~ s/<[^>]+>//g;
	$title =~ s/&amp;/&/g;
	$title =~ s/&lt;/</g;
	$title =~ s/&gt;/>/g;
	$title =~ s/&quot;/"/g;
	$title =~ s/&apos;/'/g;

	# cleaning <description>
	$description =~ s/\n//g;
	$description =~ s/&#39;/'/g;
	$description =~ s/&#34;/"/g;
	$description =~ s/&#38;#39;/'/g;
	$description =~ s/&#38;#34;/"/g;
	$description =~ s/&amp;/&/g;
	$description =~ s/&lt;/</g;
	$description =~ s/&gt;/>/g;
	$description =~ s/&quot;/"/g;
	$description =~ s/&apos;/'/g;
	$description =~ s/<[^>]+>//g;

	return $title, $description;

} # cleaning

# RE-ENCODING XML ENTITIES
sub xml_entities {

	my ($title, $description) = @_;

	# re-encoding <title>
	$title =~ s/</&lt;/g;
	$title =~ s/>/&gt;/g;
	$title =~ s/&/&amp;/g;
	$title =~ s/"/&quot;/g;
	$title =~ s/'/&apos;/g;

	# re-encoding <description>
	$description =~ s/</&lt;/g;
	$description =~ s/>/&gt;/g;
	$description =~ s/&/&amp;/g;
	$description =~ s/"/&quot;/g;
	$description =~ s/'/&apos;/g;

	return $title, $description;

} # xml_entities
