La vie des mots sur le web - Accueil

					
						#!/bin/bash
# Build an HTML report with one table per URL list found in a directory.
#
# Input: three lines read from stdin (typically redirected from an input file):
#   1. URLS            - directory holding the URL list files (one URL per line)
#   2. fichier_tableau - path of the HTML report to create (overwritten)
#   3. motif           - extended regular expression searched in every text dump
#
# For each URL the page is fetched with wget, its charset is detected with
# `file`, a text dump is produced with lynx (and converted to UTF-8 with iconv
# when needed), then contexts, match counts and a word index are extracted.
# Results are written under ../PAGES-ASPIREES, ../DUMP-TEXT, ../CONTEXTES,
# ../INDEX and ../FICHIERS-GLOBAUX, relative to the current directory.
#
# NOTE: -e / pipefail are deliberately not enabled: grep legitimately returns 1
# when nothing matches and processing must continue with the next URL.
set -u

# -r keeps backslashes of the regular expression intact.
read -r URLS
read -r fichier_tableau
read -r motif
# Tolerate input files saved with Windows line endings.
URLS=${URLS%$'\r'}
fichier_tableau=${fichier_tableau%$'\r'}
motif=${motif%$'\r'}

if [[ -z "$URLS" || -z "$fichier_tableau" || -z "$motif" ]]; then
  echo "expected 3 input lines on stdin: URL directory, HTML output file, pattern" >&2
  exit 1
fi
if [[ ! -d "$URLS" ]]; then
  echo "not a directory: $URLS" >&2
  exit 1
fi

# Make sure every output directory exists before writing into it.
mkdir -p ../PAGES-ASPIREES ../DUMP-TEXT ../CONTEXTES ../INDEX ../FICHIERS-GLOBAUX

# Charset names recognised when falling back to the page's <meta> declaration.
readonly CHARSET_REGEX='(((utf|UTF)-(8|16|32))|(gb|GB)(k|K|2312|18030)|(iso|ISO|Iso)-8859-(\w)(\w)?|(WINDOWS|windows)-1252|(WINDOWS|windows)-1256|((m|M)(a|A)(c|C)(R|r)(O|o)(M|m)(a|A)(n|N))|us-ascii)'

# Append every argument as one line to the HTML report.
emit() {
  printf '%s\n' "$@" >> "$fichier_tableau"
}

# Print the encoding names listed by `iconv -l` that match $1 (case-insensitive,
# fixed string). Prints nothing when $1 is empty or not listed by iconv.
iconv_lookup() {
  [[ -n "$1" ]] || return 0
  iconv -l | grep -Fio -- "$1" | sort -u
}

# Write a report row for a URL that could not be processed.
# Arguments: $1 - label of the fetched-page link
#            $2 - content of the "initial dump" cell (status message)
#            $3 - content of the encoding cell
# Globals:   cpttableau, compteur, line, retourwget (read)
write_failure_row() {
  local page_label=$1 status_cell=$2 shown_encoding=$3
  emit "<tr>" \
    "<td>$compteur</td>" \
    "<td><p><a href=\"$line\">n°$compteur</a></p></td>" \
    "<td><a href=\"../PAGES-ASPIREES/$cpttableau-$compteur.html\">$page_label</a></td>" \
    "<td>$retourwget</td>" \
    "<td>$shown_encoding</td>" \
    "<td>$status_cell</td>" \
    "<td>--</td>" \
    "<td>--</td>" \
    "<td>--</td>" \
    "<td>--</td>" \
    "<td>--</td>" \
    "</tr>"
}

# Dump the text of the current URL with lynx, using the current encoding.
# Globals: cpttableau, compteur, line, encodage (read)
dump_page() {
  lynx -dump -nolist -assume_charset="$encodage" -display_charset="$encodage" "$line" \
    > "../DUMP-TEXT/$cpttableau-$compteur-initial.txt"
}

# Extract contexts, count matches, build the word index for a UTF-8 dump, write
# the report row and append the dump and contexts to the per-table global files.
# Arguments: $1 - UTF-8 dump file to analyse
#            $2 - content of the "converted dump" cell
# Globals:   cpttableau, compteur, line, retourwget, encodage, motif (read);
#            nbdump (incremented)
process_dump() {
  local dump=$1 converted_cell=$2
  local id="$cpttableau-$compteur"
  local contexts="../CONTEXTES/$id-UTF8.txt"
  local index="../INDEX/index-$id-UTF8.txt"
  local occurrences

  grep -Ei -- "$motif" "$dump" > "$contexts"

  # Context extraction with minigrepmultilingue, which writes
  # resultat-extraction.html in the current directory.
  perl ../MINIGREP/minigrepmultilingue.pl "UTF-8" "$dump" ../MINIGREP/motif.txt
  mv resultat-extraction.html "../CONTEXTES/$id-minigrep.html"

  # Count every occurrence (-o prints one match per line); `grep -c` would
  # only count matching lines.
  occurrences=$(grep -Eio -- "$motif" "$dump" | wc -l | tr -d '[:space:]')
  echo "la frequence est: $occurrences"

  # Word frequency index, most frequent first.
  grep -Eo "\w+" "$dump" | sort | uniq -c | sort -rn > "$index"

  emit "<tr>" \
    "<td>$compteur</td>" \
    "<td><a href=\"$line\">LIEN N°$compteur</a></td>" \
    "<td><a href=\"../PAGES-ASPIREES/$id.html\">PAGE N° $id</a></td>" \
    "<td>$retourwget</td>" \
    "<td>$encodage</td>" \
    "<td><a href=\"../DUMP-TEXT/$id-initial.txt\">DUMP N° $id</a></td>" \
    "<td>$converted_cell</td>" \
    "<td><a href=\"../CONTEXTES/$id-UTF8.txt\">CONTEXTE N° $id-UTF8.txt</a></td>" \
    "<td><a href=\"../CONTEXTES/$id-minigrep.html\">CONTEXTE N° $id.html</a></td>" \
    "<td>$occurrences</td>" \
    "<td><a href=\"$index\">INDEX $cpttableau-UTF8.txt</a></td>" \
    "</tr>"

  cat "$contexts" >> "../FICHIERS-GLOBAUX/CONTEXTES-GLOBAUX_$cpttableau.txt"
  cat "$dump" >> "../FICHIERS-GLOBAUX/DUMP-GLOBAUX_$cpttableau.txt"
  nbdump=$((nbdump + 1))
}

# Dump a page whose encoding is known to iconv, convert the dump to UTF-8 and
# process the converted file.
# Globals: cpttableau, compteur, encodage (read)
convert_and_process() {
  local id="$cpttableau-$compteur"
  dump_page
  iconv -f "$encodage" -t utf-8 "../DUMP-TEXT/$id-initial.txt" > "../DUMP-TEXT/$id.txt"
  process_dump "../DUMP-TEXT/$id.txt" \
    "<a href=\"../DUMP-TEXT/$id.txt\">DUMP UTF-8 N° $id</a>"
}

# HTML page header.
printf '%s\n' \
  "<html>" \
  "<head>" \
  "<link rel=\"stylesheet\" type=\"text/css\" href=\"boot.css\"/>" \
  "</head>" \
  "<body>" > "$fichier_tableau"

# Table (URL list file) counter.
cpttableau=1

# One table per URL list file of the directory.
for chemin in "$URLS"/*; do
  [[ -f "$chemin" ]] || continue
  fichier=${chemin##*/}
  echo "TRAITEMENT de fichier"

  emit "<table border=\"1\">" \
    "<tr>" \
    "<td>tableau n° $cpttableau - $fichier</td>" \
    "</tr>" \
    "<tr>" \
    "<th><b>N°</b></th>" \
    "<th><b>N° Lien</b></th>" \
    "<th><b>Page Aspirée</b></th>" \
    "<th><b>Retour WGET</b></th>" \
    "<th><b>Encodage initial</b></th>" \
    "<th><b>DUMP initial (fichier UTF-8)</b></th>" \
    "<th><b>DUMP (fichier converti en UTF-8)</b></th>" \
    "<th><b>CONTEXTE UTF-8</b></th>" \
    "<th><b>CONTEXTE HTML</b></th>" \
    "<th><b>Fq Motif dans DUMP</b></th>" \
    "<th><b>INDEX DUMP</b></th>" \
    "</tr>"

  # Row counter and number of dumps successfully produced for this table.
  compteur=1
  nbdump=0

  # Start from empty global files so reruns do not accumulate duplicates and
  # the files exist even when no URL of the list could be processed.
  dump_global="../FICHIERS-GLOBAUX/DUMP-GLOBAUX_$cpttableau.txt"
  contextes_global="../FICHIERS-GLOBAUX/CONTEXTES-GLOBAUX_$cpttableau.txt"
  : > "$dump_global"
  : > "$contextes_global"

  # URLs are read on fd 3 so that wget/lynx/perl cannot consume the list
  # through stdin; the `|| [[ -n ... ]]` part handles a missing final newline.
  while read -r line <&3 || [[ -n "$line" ]]; do
    line=${line%$'\r'}
    [[ -n "$line" ]] || continue

    page="../PAGES-ASPIREES/$cpttableau-$compteur.html"
    encodage=""

    wget --no-check-certificate -O "$page" -- "$line"
    retourwget=$?
    echo "Retour WGET : $retourwget"

    if (( retourwget != 0 )); then
      # Download failed: report it and move on.
      echo "PROBLEME LORS DE L'ASPIRATION"
      write_failure_row "PA n° $cpttableau-$compteur non aspirée" "--<br/>" "--"
    else
      # Charset of the fetched page as reported by `file`.
      encodage=$(file -i "$page" | cut -f2 -d=)
      echo "ENCODAGE :<$encodage>"

      if [[ "$encodage" == "utf-8" ]]; then
        # Already UTF-8: the initial dump is used directly.
        dump_page
        process_dump "../DUMP-TEXT/$cpttableau-$compteur-initial.txt" "--"
      else
        # Not UTF-8: check whether iconv knows the detected encoding.
        VerifEncodageIconv=$(iconv_lookup "$encodage")
        echo "$VerifEncodageIconv"

        if [[ -n "$VerifEncodageIconv" ]]; then
          echo "L'encodage est CONNU de iconv !"
          convert_and_process
        elif grep -Eqi "meta.+charset" "$page"; then
          # Fall back to the charset declared in the page's <meta> tag.
          echo "CHARSET présent dans la page html "
          charset=$(grep -Ei -m 1 "meta.+charset" "$page" \
            | grep -Eo "$CHARSET_REGEX" | head -n 1 | tr "a-z" "A-Z")
          # Use the declared charset from now on when one was recognised.
          if [[ -n "$charset" ]]; then
            encodage=$charset
          fi
          echo "CHARSET récupéré : $encodage"

          VerifEncodageIconv=$(iconv_lookup "$encodage")
          if [[ -z "$VerifEncodageIconv" ]]; then
            echo "L'encodage n'est pas connu d'iconv : on ne fait rien !"
            write_failure_row "PA n° $cpttableau-$compteur" \
              "encodage non connu de iconv<br/>" "$encodage"
          else
            echo "Le charset extrait est CONNU de iconv !"
            convert_and_process
          fi
        else
          echo "PAS DE CHARSET trouvé"
          write_failure_row "PA n° $cpttableau-$compteur" \
            "encodage non connu de iconv<br/>" "$encodage"
        fi
      fi
    fi

    # Every URL consumes one row number, whatever the outcome, so that file
    # names of consecutive URLs never collide.
    compteur=$((compteur + 1))
  done 3< "$chemin"

  # Global word indexes for this table, rebuilt from the global files.
  grep -Eo "\w+" "$dump_global" | sort | uniq -c | sort -rn \
    > "../FICHIERS-GLOBAUX/index-dump-$cpttableau.txt"
  grep -Eo "\w+" "$contextes_global" | sort | uniq -c | sort -rn \
    > "../FICHIERS-GLOBAUX/index-contexte-$cpttableau.txt"

  # Rows linking to the global files.
  emit "<tr>" \
    "<td align=\"center\" colspan=\"6\" > </td>" \
    "<td align=\"center\" width=\"100\"><a href=\"../FICHIERS-GLOBAUX/DUMP-GLOBAUX_$cpttableau.txt\">Fichier DUMP<br/>global</a><br/>$nbdump fichier(s)</td>" \
    "<td align=\"center\" width=\"100\"><a href=\"../FICHIERS-GLOBAUX/CONTEXTES-GLOBAUX_$cpttableau.txt\">Fichier CONTEXTES<br/>global</a><br/>$nbdump fichier(s)</td>" \
    "<td colspan=\"3\"></td></tr>" \
    "<tr><td align=\"center\" colspan=\"6\"> </td>" \
    "<td align=\"center\" width=\"100\"><a href=\"../FICHIERS-GLOBAUX/index-dump-$cpttableau.txt\">Index DUMP<br/>global</a><br/>$nbdump fichier(s)</td>" \
    "<td align=\"center\" width=\"100\"><a href=\"../FICHIERS-GLOBAUX/index-contexte-$cpttableau.txt\">Index CONTEXTES<br/>global</a><br/>$nbdump fichier(s)</td>" \
    "<td colspan=\"3\"></td></tr>" \
    "</table>"

  # Next URL list file.
  cpttableau=$((cpttableau + 1))
done

emit "</body>" "</html>"
le script

(script à télécharger)