#!/bin/bash
# Runtime parameters, read from standard input (one value per line):
#   REP   - directory holding the URL list files (one URL per line in each file)
#   tablo - path of the HTML report that the script generates
# -r keeps backslashes in the supplied paths literal instead of treating them
# as escape characters.
read -r REP
read -r tablo

# Extended regular expression searched for in every text dump.
# NOTE(review): the spaces around the alternatives are part of the pattern
# (e.g. the French form only matches when followed by a space) and \b only
# applies to the first and last alternative. This looks unintended; a form
# such as "\b(alt1|alt2|alt3)\b" is probably what was meant -- confirm before
# changing, since it alters the reported contexts and counts.
motif="\b идентичность|identité | هوية \b"



#######################################
# Create (truncate) the HTML report and write its preamble: <head>, CSS rules,
# opening <body>, the framing <div><blockquote> and the page heading.
# Arguments: $1 - path of the HTML report
# Returns:   non-zero when the path is empty or the file cannot be written
#
# Fixes compared with the previous inline version:
#   * the heading line used ">" and therefore erased every line written
#     before it; everything now goes through a single redirection;
#   * the declared charset was ISO-8859-1 although this script (and thus the
#     accented text it emits) is UTF-8.
# NOTE(review): the <div><blockquote> opened here is not closed anywhere in
# the report; browsers recover, but confirm where it is meant to end.
#######################################
write_html_header() {
  local out=$1
  if [[ -z $out ]]; then
    echo "write_html_header: no output file given" >&2
    return 1
  fi
  # One string per output line; data is passed as printf arguments so that the
  # "%" characters of the CSS are never interpreted as format directives.
  printf '%s\n' \
    '<html><head><title>tableaux de liens</title>' \
    '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" /> ' \
    '<style>' \
    'h1, h2, h3, p, td { font-family: Calibri, Verdana,sans-serif; } ' \
    'h4, h5 { margin: 0px; padding: 0px; } ' \
    'h1 {color: #000060; font-size: 16pt; font-weight: bold;} ' \
    'h2 {color: #000060; font-size: 14pt; font-weight: bold;} ' \
    'h3 {color: #000060; font-size: 12pt; font-weight: bold;} ' \
    'a:visited {text-decoration: none;color: blue;}' \
    'a:link {text-decoration: none;color: blue;}' \
    'a:hover{text-decoration: none;color: red;}' \
    '</style>' \
    '</head><body bgcolor="silver">' \
    "<div style='border:solid windowtext .75pt;padding:4.0pt 4.0pt 4.0pt 4.0pt;background:#D7D5D2;margin-left:80px;margin-right:80px;'><blockquote>" \
    "<h3><span style='background-color:#FC3;font-family:arial,helvetica,sans-serif;font-variant:small-caps;font-size:120%;padding:2px;border-right:1px solid #C90;border-bottom:1px solid #C90;'>Les TABLEAUX de données</span></h3>" \
    > "$out"
}

write_html_header "$tablo"
#=============================================================================
# Main section: one HTML table per URL file found in "$REP".
#
# For every URL the script downloads the page (curl), detects its encoding
# (file, then a <meta ... charset> fallback), dumps it as text (lynx),
# converts the dump to UTF-8 when needed (iconv), extracts the lines and the
# number of occurrences matching "$motif", runs the external Perl context
# extractor and appends one row to the report "$tablo".
#
# Globals read:    REP, tablo, motif
# Globals written: j (table counter), i (URL counter), nbdump (dumps produced
#                  for the current table), fic, ligne, encodage, retourcurl
# Files written:   PAGES-ASPIREES/<j>/, DUMP-TXT/<j>/, CONTEXTES/<j>/,
#                  FICHIERGLOBAUX/, and rows appended to "$tablo"
#
# NOTE(review): the FICHIERGLOBAUX/*_<j>.txt files are appended to and never
# reset, so running the script twice duplicates their content -- confirm
# whether they should be truncated at the start of each table.
#=============================================================================

# CSS tail shared by every highlighted <span> label of the report.
span_css="font-family:arial,helvetica,sans-serif;font-variant:small-caps;font-size:120%;padding:2px;border-right:1px solid #C90;border-bottom:1px solid #C90;"

# Prefer the interpreter name the script has always used; fall back to plain
# "perl" only when that binary is absent (keeps the old error message if
# neither exists).
perl_bin=$(command -v perl5.16.3 || command -v perl || echo perl5.16.3)

# Escape the characters that are special in HTML text and attribute values.
# $1 - raw string; the escaped string is written to stdout.
html_escape() {
  printf '%s' "$1" \
    | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

# Succeed when iconv can convert from charset $1 to UTF-8. Asking iconv itself
# replaces the former substring search in "iconv -l", which accepted partial
# names and failed with a usage error on an empty value.
iconv_supports() {
  [[ -n $1 ]] && iconv -f "$1" -t utf-8 < /dev/null > /dev/null 2>&1
}

# Print, lower-cased, the first charset declared in a <meta> tag of file $1
# (nothing when there is none). Handles both content="...; charset=x" and
# charset="x" / charset='x'.
extract_meta_charset() {
  grep -E -i '<meta.*charset' -- "$1" \
    | grep -E -i -o "charset *= *[\"']?[a-z0-9_.:+-]+" \
    | head -n 1 \
    | sed -E "s/^[^=]*= *[\"']?//" \
    | tr 'A-Z' 'a-z'
}

# Print the opening <tr> and the two leading cells (URL number, URL link).
# $1 - URL number, $2 - URL
row_head() {
  local url_html
  url_html=$(html_escape "$2")
  printf '<tr><td align="center" width="50">%s</td><td align="center" width="100"><a href="%s">%s</a></td>' \
    "$1" "$url_html" "$url_html"
}

# Print the "downloaded page" link cell and the curl status cell.
# $1 - table number, $2 - URL number, $3 - HTML content of the status cell
page_and_status_cells() {
  printf '<td align="center" width="100"><a href="../PAGES-ASPIREES/%s/%s.html">%s.html</a></td>' \
    "$1" "$2" "$2"
  printf '<td align="center" width="20">%s</td>' "$3"
}

# Print a complete row for a page that was dumped and searched.
# $1 table, $2 URL number, $3 URL, $4 status cell HTML,
# $5 HTML for the "initial dump" cell, $6 occurrence count
row_success() {
  local tab=$1 num=$2 url=$3 status_html=$4 initial_html=$5 count=$6
  row_head "$num" "$url"
  page_and_status_cells "$tab" "$num" "$status_html"
  printf '<td align="center" width="100">%s</td>' "$initial_html"
  printf '<td align="center" width="100"><a href="../DUMP-TXT/%s/%s-utf8.txt">%s-utf8.txt</a></td>' \
    "$tab" "$num" "$num"
  printf '<td align="center" width="100"><a href="../CONTEXTES/%s/%s-utf8.txt">%s-utf8.txt</a></td>' \
    "$tab" "$num" "$num"
  printf '<td align="center" width="100"><a href="../CONTEXTES/%s/%s-utf8.html">%s-utf8.html</a></td>' \
    "$tab" "$num" "$num"
  printf "<td align=\"center\" bgcolor=\"yellow\"><span style='background-color:red;%s'>%s</span></td></tr>\n" \
    "$span_css" "$count"
}

# Print a complete row for a page whose encoding could not be determined.
# Always 9 cells, matching the table header (one of the former variants
# emitted 10).
# $1 table, $2 URL number, $3 URL, $4 status cell HTML
row_encoding_unknown() {
  local undetected='<td align="center" width="100" bgcolor="yellow"><small>Encodage<br/>non d&eacute;tect&eacute;</small></td>'
  local dash='<td align="center">&nbsp;-&nbsp;</td>'
  row_head "$2" "$3"
  page_and_status_cells "$1" "$2" "$4"
  printf '%s%s%s%s%s</tr>\n' "$undetected" "$undetected" "$dash" "$dash" "$dash"
}

# Print a complete row for a URL that could not be fetched properly.
# $1 URL number, $2 URL, $3 status cell HTML
row_fetch_failed() {
  local red='<td align="center" width="100" bgcolor="red">&nbsp;-&nbsp;</td>'
  local dash='<td align="center">&nbsp;-&nbsp;</td>'
  row_head "$1" "$2"
  printf '%s<td align="center" width="20">%s</td>%s%s%s%s%s</tr>\n' \
    "$red" "$3" "$red" "$red" "$dash" "$dash" "$dash"
}

# Search an existing UTF-8 dump (DUMP-TXT/<tab>/<num>-utf8.txt): write the
# matching lines, count the occurrences, run the Perl extractor, append the
# report row and feed the per-table global files. Increments nbdump.
# $1 table, $2 URL number, $3 URL, $4 status cell HTML,
# $5 HTML for the "initial dump" cell
process_utf8_dump() {
  local tab=$1 num=$2 url=$3 status_html=$4 initial_html=$5
  local dump="./DUMP-TXT/$tab/$num-utf8.txt"
  local ctx="./CONTEXTES/$tab/$num-utf8"
  local count

  grep -E -i -- "$motif" "$dump" > "$ctx.txt"
  # -o | wc -l counts occurrences, not lines. -i keeps the count consistent
  # with the case-insensitive contexts above (it used to be case-sensitive).
  count=$(grep -E -i -o -- "$motif" "$dump" | wc -l)
  count=${count//[[:space:]]/}

  # The extractor writes resultat-extraction.html in the current directory.
  "$perl_bin" ./PROGRAMMES/minigrepmultilingue.pl "UTF-8" "$dump" ./PROGRAMMES/motif-regexp.txt
  mv resultat-extraction.html "$ctx.html"

  row_success "$tab" "$num" "$url" "$status_html" "$initial_html" "$count" >> "$tablo"

  cat "$ctx.txt" >> "./FICHIERGLOBAUX/CONTEXTES-GLOBAUX_$tab.txt"
  cat "$dump" >> "./FICHIERGLOBAUX/DUMP-GLOBAUX_$tab.txt"
  nbdump=$((nbdump + 1))
}

# Dump a page in its own (non UTF-8) charset, convert the dump to UTF-8 and
# hand it over to process_utf8_dump.
# $1 table, $2 URL number, $3 URL, $4 status cell HTML, $5 source charset
dump_and_convert() {
  local tab=$1 num=$2 url=$3 status_html=$4 charset=$5
  local raw="./DUMP-TXT/$tab/$num.txt"

  lynx -dump -nolist -display_charset="$charset" "./PAGES-ASPIREES/$tab/$num.html" > "$raw"
  echo "ENCODAGE final : $charset (avant conversion vers utf-8)"
  iconv -f "$charset" -t utf-8 "$raw" > "./DUMP-TXT/$tab/$num-utf8.txt"
  process_utf8_dump "$tab" "$num" "$url" "$status_html" \
    "<a href=\"../DUMP-TXT/$tab/$num.txt\">$num.txt</a><br/><small>($charset)</small>"
}

if [[ ! -d $REP || -z $tablo ]]; then
  echo "Invalid parameters: REP must be an existing directory and tablo a file name." >&2
  # Leave a failing status without calling exit.
  false
else
  # The per-table global files live here; the directory was never created.
  mkdir -p FICHIERGLOBAUX

  # j counts the tables (one per URL file).
  j=1
  # Glob instead of parsing ls: file names with spaces stay in one piece.
  for chemin in "$REP"/*; do
    [[ -f $chemin ]] || continue
    fic=${chemin##*/}

    {
      printf "<table bgcolor=\"white\" align=\"center\" border=\"1\"><tr><td colspan=\"9\" align=\"center\" bgcolor=\"black\"><font color=\"white\"><h4><b>Tableau n° %s</b></h4></font><small><span style='background-color:silver;%s'>fichier : %s</span></small></td></tr>\n" \
        "$j" "$span_css" "$(html_escape "$fic")"
      printf '%s\n' '<tr><td align="center" width="50"><b>n°URL</b></td><td align="center" width="100"><b>URL</b></td><td align="center" width="100"><b>PAGES ASPIREES</b></td><td align="center" width="20"><b>Ret. CURL</b></td><td align="center" width="100"><b>DUMP initial</b><br/><small>(non utf-8)</small></td><td align="center" width="100"><b>DUMP utf-8</b></td><td align="center" width="100"><b>CONTEXTES</b></td><td align="center" width="100"><b>CONTEXTES<br/>HTML</b></td><td align="center" width="100"><b>NB Occur</b></td></tr>'
    } >> "$tablo"

    # i counts the URLs of the current file.
    i=1
    nbdump=0
    mkdir -p "CONTEXTES/$j" "DUMP-TXT/$j" "PAGES-ASPIREES/$j"

    # Read the URLs line by line on fd 3 so that no command run in the loop can
    # consume them from stdin, and so that "?" or "*" in a URL is never globbed.
    while read -r -u 3 ligne || [[ -n $ligne ]]; do
      ligne=${ligne%$'\r'}            # tolerate CRLF files
      [[ -n $ligne ]] || continue     # skip blank lines
      page="./PAGES-ASPIREES/$j/$i.html"

      echo "================================================================"
      echo "==========TRAITEMENT : $ligne "

      #------- fetch the page -------------------------------------------------
      curl -o "$page" "$ligne"
      retourcurl=$?

      # A page can be fetched "successfully" and still be an error page: look
      # for typical error texts in what was downloaded.
      contenupageaspiree=""
      if [[ -f $page ]]; then
        contenupageaspiree=$(grep -E -i -o "(400 )?Bad request|Moved Permanently|s interdit|Not Acceptable" "$page" | sort -u)
      fi
      if [[ -n $contenupageaspiree ]]; then
        # Replacing the status by this label also makes the "== 0" test below
        # fail, so such pages are reported as fetch failures.
        retourcurl="<span style='background-color:black;color:white;${span_css}'>${retourcurl}<br/>BAD</span>"
      fi
      echo "RETOUR CURL : $retourcurl"

      if [[ $retourcurl == 0 ]]; then
        encodage=$(file -i "$page" | cut -d= -f2)
        echo "ENCODAGE initial : $encodage"

        if [[ $encodage == "utf-8" ]]; then
          # Already UTF-8: dump directly, there is no "initial" dump.
          lynx -dump -nolist -display_charset="$encodage" "$page" > "./DUMP-TXT/$j/$i-utf8.txt"
          process_utf8_dump "$j" "$i" "$ligne" "$retourcurl" '&nbsp;-&nbsp;'

        elif iconv_supports "$encodage"; then
          # The charset reported by file is known to iconv: dump and convert.
          echo "VERIF : <$encodage> ==> connu par inconv, c'est parti ==> lynx, iconv..."
          dump_and_convert "$j" "$i" "$ligne" "$retourcurl" "$encodage"

        else
          # Not UTF-8 and unknown to iconv: look for a charset declared in the
          # page itself.
          echo "on cherche un charset dans la page aspiree..."
          if grep -E -i -q "<meta.*charset" "$page"; then
            echo "Presence d'un charset..."
            encodage=$(extract_meta_charset "$page")
            echo "charset extrait : $encodage "
            # Only go on when iconv knows the declared charset.
            if iconv_supports "$encodage"; then
              echo "VERIF : <$encodage> ==> connu par inconv, c'est parti ==> lynx, iconv..."
              dump_and_convert "$j" "$i" "$ligne" "$retourcurl" "$encodage"
            else
              echo "VERIF : <$encodage> ==> inconnu par inconv, on ne fait rien"
              row_encoding_unknown "$j" "$i" "$ligne" "$retourcurl" >> "$tablo"
            fi
          else
            echo "Pas de charset detecte : on ne fait rien pour le DUMP... "
            row_encoding_unknown "$j" "$i" "$ligne" "$retourcurl" >> "$tablo"
          fi
        fi
      else
        # curl failed, or the downloaded page is an error page.
        row_fetch_failed "$i" "$ligne" "$retourcurl" >> "$tablo"
      fi

      i=$((i + 1))
    done 3< "$REP/$fic"

    # Table footer: links to the global files. Attribute quotes are now
    # really emitted, "&nbsp;" is terminated and the colspans add up to 9.
    {
      printf '<tr><td align="center" colspan="4" bgcolor="silver">&nbsp;</td><td align="center" width="100"><a href="../FICHIERGLOBAUX/DUMP-GLOBAUX_%s.txt">Fichier DUMP<br/>global</a><br/><small>%s fichier(s)</small></td><td align="center" width="100"><a href="../FICHIERGLOBAUX/CONTEXTES-GLOBAUX_%s.txt">Fichier CONTEXTES<br/>global</a><br/><small>%s fichier(s)</small></td><td align="center"  bgcolor="silver">&nbsp;</td><td colspan="2" bgcolor="silver"></td></tr>\n' \
        "$j" "$nbdump" "$j" "$nbdump"
      printf '%s\n' '</table>'
    } >> "$tablo"

    j=$((j + 1))
    printf '%s\n' '<p align="center"><hr color="blue" width="50%"/></p>' >> "$tablo"
  done

  echo "</body></html>" >> "$tablo"
fi