#------------------------------ text cleanup ------------------------------

# supprime_html($html)
#
# Converts an HTML fragment to plain text and returns it.
#   - Line breaks in the input are flattened to spaces first, so the only
#     newlines in the result are the ones derived from block-level markup
#     (or from character entities that decode to a newline).
#   - Elements listed in @balises_a_ignorer are dropped together with their
#     content; HTML comments are dropped as well.
#   - <p>, <br>, </tr>, <h1>..<h6> and <div> tags become a newline; every
#     other tag is simply removed.
#   - Whitespace around newlines is trimmed and runs of spaces are collapsed.
# Relies on decode_entities() being in scope (it is not defined in this
# block; presumably imported from HTML::Entities elsewhere in the file).
# Returns the argument unchanged when it is undefined.
sub supprime_html {
    my $html = shift @_;
    return $html unless defined $html;

    my @balises_a_ignorer = ("applet", "code", "embed", "head", "object", "script", "server");

    # Flatten the input onto one line so that '.' in the patterns below can
    # span what used to be several lines.
    $html =~ s/[\r\n]+/ /g;

    # NOTE(review): entities are decoded BEFORE tags are stripped, so text
    # such as "&lt;b&gt;" is turned into markup and removed with the other
    # tags. Kept as-is because callers may rely on it for escaped HTML;
    # confirm this is intended.
    decode_entities($html);

    # Elements whose content must not appear in the text at all.
    foreach my $balise (@balises_a_ignorer) {
        # \b keeps e.g. "head" from matching "<header>".
        $html =~ s{<\Q$balise\E\b.*?</\Q$balise\E\s*>}{}gi;
    }

    # An empty pattern (s///) would reuse the last successful regex in Perl,
    # so the comment and <br> patterns are spelled out explicitly.
    $html =~ s{<!--.*?-->}{}g;                 # comments
    $html =~ s{</?p\b[^>]*>}{\n}gi;            # paragraphs
    $html =~ s{</?br\b[^>]*>}{\n}gi;           # line breaks
    $html =~ s{</tr\s*>}{\n}gi;                # table rows
    $html =~ s{</?h[1-6]\b[^>]*>}{\n}gi;       # headings
    $html =~ s{</?div\b[^>]*>}{\n}gi;          # sections
    $html =~ s{<.*?>}{}g;                      # any other tag

    $html =~ s/\s*\n\s*/\n/g;                  # whitespace at line start/end
    $html =~ s/ +/ /g;                         # runs of several spaces

    return $html;
}

# normalise_latin1($chaine)
#
# Returns a copy of $chaine in which a few typographic characters are
# replaced by ASCII look-alikes (curly quotes, en/em dashes, ellipsis, the
# OE/oe ligatures) and every remaining character above U+00FF is deleted,
# so the result only contains code points in the range U+0000..U+00FF.
# Returns the argument unchanged when it is undefined.
sub normalise_latin1 {
    my $chaine = shift @_;
    return $chaine unless defined $chaine;

    $chaine =~ s/[\x{2018}\x{2019}]/'/g;       # single curly quotes
    $chaine =~ s/[\x{201C}\x{201D}]/"/g;       # double curly quotes
    $chaine =~ s/[\x{2013}\x{2014}]/-/g;       # en dash, em dash
    $chaine =~ s/\x{2026}/.../g;               # horizontal ellipsis
    $chaine =~ s/\x{0152}/OE/g;                # capital OE ligature
    $chaine =~ s/\x{0153}/oe/g;                # small oe ligature

    # Anything still outside U+0000..U+00FF has no replacement: drop it.
    $chaine =~ s/[^\x{0000}-\x{00FF}]//g;

    return $chaine;
}