#!/bin/bash
#
# Builds an HTML report of downloaded web pages and their text dumps.
#
# Input (two lines on standard input):
#   1. REP   - directory holding the URL list files (whitespace-separated URLs)
#   2. tablo - path of the HTML report to create (overwritten if it exists)
#
# For the j-th regular file of REP and the i-th URL listed in it, the script:
#   - downloads the URL with curl into ./PAGES-ASPIREES/j/i.html,
#   - determines the page encoding (file -i, else a charset declared in the page),
#   - dumps the page as text with lynx into ./DUMP-TXT/j/ and, when the page is
#     not already UTF-8, converts the dump to UTF-8 with iconv,
#   - appends one row describing the outcome to table number j of the report.
# ./CONTEXTES/j is created as well; nothing in this script writes into it.
#
# Progress messages go to standard output, errors to standard error.

# Charset names accepted from a page's own declaration (ERE, used with -i).
readonly MOTIF_CHARSETS='utf-(8|16|32)|gb(k|2312|18030)|iso-8859-[0-9]{1,2}|windows-125[26]|macroman|us-ascii'
# A charset/encoding declaration. The value may be quoted or not, so only the
# "name =" part is required.
readonly MOTIF_DECLARATION='(charset|encoding) ?='
# Error texts that mark a downloaded page as unusable even when curl exits 0.
readonly MOTIF_ALERTES='(400 )?Bad request|Moved Permanently|s interdit'

# Prints a message on stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Prints $1 with the HTML special characters & < > " replaced by entities.
# sed is used rather than ${var//&/...} because the meaning of '&' in a
# replacement string differs between bash versions.
html_escape() {
  printf '%s' "$1" \
    | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

# Prints one centered table cell.
#   $1 - width attribute, $2 - cell content (already HTML),
#   $3 - optional background colour
td() {
  local largeur=$1 contenu=$2 fond=${3:-}
  if [[ -n $fond ]]; then
    printf '<td align="center" width="%s" bgcolor="%s">%s</td>' \
      "$largeur" "$fond" "$contenu"
  else
    printf '<td align="center" width="%s">%s</td>' "$largeur" "$contenu"
  fi
}

# Appends one six-column result row to the report.
#   $1 - URL number, $2 - URL (raw, escaped here), $3 - ready-made page cell,
#   $4 - curl status to display (may contain HTML), $5 - ready-made cell for
#   the initial dump, $6 - ready-made cell for the UTF-8 dump
write_row() {
  local num=$1 url cell_page=$3 retour=$4 cell_dump=$5 cell_utf8=$6
  url=$(html_escape "$2")
  {
    printf '<tr>'
    td 50 "$num"
    td 100 "<a href=\"$url\">$url</a>"
    printf '%s' "$cell_page"
    td 20 "$retour"
    printf '%s%s</tr>\n' "$cell_dump" "$cell_utf8"
  } >> "$tablo"
}

# Appends the opening of table number $1 (title row naming file $2, then the
# six column headings) to the report.
open_table() {
  local numero=$1 fichier
  fichier=$(html_escape "$2")
  cat >> "$tablo" <<EOF
<table bgcolor="white" align="center" border="1"><tr><td colspan="6" align="center" bgcolor="black"><font color="white"><h4><b>Tableau n&deg; $numero</b></h4></font><small><span style='background-color:silver;font-family:arial,helvetica,sans-serif;font-variant:small-caps;font-size:120%;padding:2px;border-right:1px solid #C90;border-bottom:1px solid #C90;'>fichier : $fichier</span></small></td></tr>
<tr><td align="center" width="50"><b>n&deg;URL</b></td><td align="center" width="100"><b>URL</b></td><td align="center" width="100"><b>PAGES ASPIREES</b></td><td align="center" width="20"><b>Ret. CURL</b></td><td align="center" width="100"><b>DUMP initial</b><br/><small>(non utf-8)</small></td><td align="center" width="100"><b>DUMP utf-8</b></td></tr>
EOF
}

# Downloads one URL, dumps it as text and records the outcome in the report.
#   $1 - table number (j), $2 - URL number within the table (i), $3 - URL
process_url() {
  local j=$1 i=$2 nom=$3
  local page="./PAGES-ASPIREES/$j/$i.html"
  local dump="./DUMP-TXT/$j/$i.txt"
  local dump_utf8="./DUMP-TXT/$j/$i-utf8.txt"
  local code_curl retourcurl alerte="" encodage verif
  local cell_page cell_utf8 cell_inconnu cell_echec

  echo "================================================================"
  echo "==========TRAITEMENT : $nom "

  # ------- download the page ------------------------------------------------
  curl -o "$page" "$nom"
  code_curl=$?
  retourcurl=$code_curl
  # -a: pages in a legacy encoding must still be searched as text.
  if [[ -f $page ]]; then
    alerte=$(grep -aEio "$MOTIF_ALERTES" "$page" | sort -u)
  fi
  if [[ -n $alerte ]]; then
    retourcurl="<small>$code_curl<br/>$alerte</small>"
  fi
  echo "RETOUR CURL : $retourcurl"

  # A curl failure, or an error text found in the page, both count as
  # "page not downloaded".
  if [[ $code_curl -ne 0 || -n $alerte ]]; then
    cell_echec=$(td 100 '<small>Page<br/>non "aspir&eacute;e"</small>' red)
    write_row "$i" "$nom" "$(td 100 'Page non aspiree...' red)" \
      "$retourcurl" "$cell_echec" "$cell_echec"
    return
  fi

  cell_page=$(td 100 "<a href=\"../PAGES-ASPIREES/$j/$i.html\">$i.html</a>")
  cell_utf8=$(td 100 "<a href=\"../DUMP-TXT/$j/$i-utf8.txt\">$i-utf8.txt</a>")
  cell_inconnu=$(td 100 '<small>Encodage<br/>non d&eacute;tect&eacute;</small>' yellow)

  # ------- encoding as reported by file(1) ----------------------------------
  encodage=$(file -b -i "$page")
  encodage=${encodage##*charset=}
  echo "ENCODAGE initial : $encodage"

  if [[ $encodage == "utf-8" ]]; then
    lynx -dump -nolist -display_charset="$encodage" "$page" > "$dump_utf8"
    write_row "$i" "$nom" "$cell_page" "$retourcurl" \
      "$(td 100 '&nbsp;-&nbsp;')" "$cell_utf8"
    return
  fi

  # ------- not UTF-8: look for a charset declared in the page itself ---------
  if ! grep -aEqi "$MOTIF_DECLARATION" "$page"; then
    echo "Pas de charset detecte : on ne fait rien pour le DUMP... "
    write_row "$i" "$nom" "$cell_page" "$retourcurl" \
      "$cell_inconnu" "$cell_inconnu"
    return
  fi

  echo "Presence d'un charset..."
  # Only declaration lines are searched, and only the first name is kept, so
  # the result is always a single word.
  encodage=$(grep -aEi "$MOTIF_DECLARATION" "$page" \
    | grep -aEio "$MOTIF_CHARSETS" \
    | head -n 1)
  echo "charset extrait : $encodage "

  # ------- make sure iconv knows the charset before converting --------------
  verif=""
  if [[ -n $encodage ]]; then
    verif=$(iconv -l | grep -Fio -- "$encodage" | sort -u)
  fi
  if [[ -z $verif ]]; then
    echo "VERIF : <$verif> ==> inconnu par inconv, on ne fait rien"
    write_row "$i" "$nom" "$cell_page" "$retourcurl" \
      "$cell_inconnu" "$cell_inconnu"
    return
  fi

  echo "VERIF : <$verif> ==> connu par inconv, c'est parti ==> lynx, iconv..."
  lynx -dump -nolist -display_charset="$encodage" "$page" > "$dump"
  echo "ENCODAGE final : $encodage (avant conversion vers utf-8)"
  if ! iconv -f "$encodage" -t utf-8 "$dump" > "$dump_utf8"; then
    echo "ATTENTION : conversion iconv incomplete pour $dump" >&2
  fi
  write_row "$i" "$nom" "$cell_page" "$retourcurl" \
    "$(td 100 "<a href=\"../DUMP-TXT/$j/$i.txt\">$i.txt</a><br/><small>($encodage)</small>")" \
    "$cell_utf8"
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

for outil in curl lynx iconv file; do
  command -v "$outil" > /dev/null || die "Commande introuvable : $outil"
done

# The exit status of read is not checked: a last line without a trailing
# newline still fills the variable. Emptiness is checked instead.
read -r REP
read -r tablo
[[ -n $REP && -d $REP ]] || die "REP n'est pas un repertoire : '$REP'"
[[ -n $tablo ]] || die "Nom du fichier tableau manquant"

# Report header. Accented characters are written as entities so the page
# renders the same whatever the encoding this script is saved in.
{
  echo '<html><head><title>tableaux de liens</title>'
  echo '<meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1" /> '
  echo '<style>'
  echo 'h1, h2, h3, p, td { font-family: Calibri, Verdana,sans-serif; } '
  echo 'h4, h5 { margin: 0px; padding: 0px; } '
  echo 'h1 {color: #000060; font-size: 16pt; font-weight: bold;} '
  echo 'h2 {color: #000060; font-size: 14pt; font-weight: bold;} '
  echo 'h3 {color: #000060; font-size: 12pt; font-weight: bold;} '
  echo 'a:visited {text-decoration: none;color: blue;}'
  echo 'a:link {text-decoration: none;color: blue;}'
  echo 'a:hover{text-decoration: none;color: red;}'
  echo '</style>'
  echo '</head><body bgcolor="silver">'
  echo "<div style='border:solid windowtext .75pt;padding:4.0pt 4.0pt 4.0pt 4.0pt;background:#D7D5D2;margin-left:80px;margin-right:80px;'><blockquote>"
  echo "<h3><span style='background-color:#FC3;font-family:arial,helvetica,sans-serif;font-variant:small-caps;font-size:120%;padding:2px;border-right:1px solid #C90;border-bottom:1px solid #C90;'>Les TABLEAUX de donn&eacute;es</span></h3>"
} > "$tablo" || die "Impossible d'ecrire dans : $tablo"

# j numbers the tables (one per URL list file).
j=1
for chemin in "$REP"/*; do
  # Skips sub-directories and the literal pattern left by an empty directory.
  [[ -f $chemin ]] || continue

  open_table "$j" "${chemin##*/}"
  mkdir -p "CONTEXTES/$j" "DUMP-TXT/$j" "PAGES-ASPIREES/$j" \
    || die "Impossible de creer les repertoires de sortie pour le tableau $j"

  # Reads the whole file as whitespace-separated words without pathname
  # expansion, so URLs containing ? * or [ are kept intact. read returns
  # non-zero at end of file; the array is filled regardless.
  urls=()
  read -r -d '' -a urls < "$chemin"

  # i numbers the URLs inside the current table.
  i=1
  for nom in "${urls[@]}"; do
    nom=${nom%$'\r'} # tolerate CRLF line endings
    [[ -n $nom ]] || continue
    process_url "$j" "$i" "$nom"
    i=$((i + 1))
  done

  {
    echo "</table>"
    echo "<p align=\"center\"><hr color=\"blue\" width=\"50%\"/></p>"
  } >> "$tablo"
  j=$((j + 1))
done

# Closes the wrapper opened in the header, then the document.
echo "</blockquote></div></body></html>" >> "$tablo"
