#!/bin/sh
#
# Script: test-3.sh
#
# Sets up the working directories and the variables used by the rest of the
# script. The interpreter line must be the very first line of the file to
# take effect, so nothing may precede it.

# Create the working directories.
# -p keeps re-runs quiet when a directory already exists, while real failures
# (e.g. permission denied) are still reported instead of being discarded.
mkdir -p TABLEAUX PAGES-ASPIREES URLS DUMP-TEXT CONTEXT

# Initialise the variables.
want_dl=0                        # set to 1 later if the pages must be downloaded
fic1=./URLS/relation-jp.txt      # first URL list
fic2=./URLS/relation-cn.txt      # second URL list
tablo=./TABLEAUX/aspirer.html    # generated index file

# Ask whether everything has to be downloaded again.
echo "Voulez-vous telecharger les pages Web ou utiliser celles sur le disque ?"
echo "(o/N)"

# The read and the test are two separate statements: fused on one line, the
# shell treats `[`, `=` ... as variable names for `read`, which fails, and
# want_dl is never switched on.
read -r a
if [ "$a" = "o" ]; then
  want_dl=1
fi

# Start building the file that holds the links to the downloaded pages.
# NOTE(review): the strings written here contain no HTML tags although the
# target is an .html file; the markup was presumably lost before this script
# reached the repository. TODO: restore it from the original.
{
  printf '%s\n' 'tableau de liens'
  printf '\n'

  # Create the first table.
  printf '\n\n\n'

  # Table title row (original comment: title split over two cells).
  printf '\n\n\n'
} > "$tablo"

# Page counter, shared by both URL lists.
i=1

# When everything is downloaded again, clear the previous downloads and
# dumps first (this also removes the old .error marker files).
if [ "$want_dl" = 1 ]; then
  rm -f ./PAGES-ASPIREES/* ./DUMP-TEXT/*
fi

# Process every URL of the first list ($fic1): optionally download the page
# and convert it to UTF-8, dump it as text, extract the context of the
# searched words, count their occurrences and append an entry to $tablo.
#
# The list is read line by line on file descriptor 3: URLs are not
# word-split or glob-expanded (a `?` in a URL is a glob character), and the
# commands inside the loop cannot swallow the list through stdin.
while read -r nom <&3 || [ -n "$nom" ]; do
  [ -n "$nom" ] || continue   # skip blank lines

  page=./PAGES-ASPIREES/$i.html
  old=./PAGES-ASPIREES/$i.old.html
  conv=./PAGES-ASPIREES/c_$i.html
  dump=./DUMP-TEXT/$i.txt
  ctx=./CONTEXT/$i.html

  # Download the page.
  if [ "$want_dl" = 1 ]; then
    printf '%s\n' "$nom"

    # wget reports failures with several non-zero codes, not only 1.
    if ! wget -t 20 -O "$page" "$nom"; then
      printf '%s\n' "$nom" > "$page.error"
    fi

    # Convert the page to UTF-8 (replaces an earlier, commented-out variant
    # that only handled shift_JIS). Only the first declared charset is used;
    # printing every match would glue several names together.
    charset=$(perl -ne 'if (/ ?charset=([^"]+)"/) { print $1; exit }' "$page")
    mv "$page" "$old"
    if [ -n "$charset" ] && iconv -f "$charset" -t utf-8 "$old" > "$conv"; then
      # assumes the charset name holds no character special to sed
      sed "s/charset=${charset}/charset=utf-8/g" "$conv" > "$page"
    else
      # No charset declared, or conversion failed: keep the page untouched.
      cp "$old" "$conv"
      cp "$old" "$page"
    fi
  fi

  # Dump the page as text.
  lynx -nolist -dump -assume_local_charset=utf-8 -display_charset=utf-8 \
    "$page" > "$dump"

  # Number of occurrences in the text dump. grep -o prints one line per
  # match, so this counts occurrences and not merely matching lines, and it
  # yields 0 instead of an empty string when the dump is missing.
  occ1=$(grep -i -o -- "关系" "$dump" | wc -l)
  occ2=$(grep -i -o -- "関係" "$dump" | wc -l)
  occ=$((occ1 + occ2))

  # Context extraction: each matching line of the page with one line before
  # and after, the searched word wrapped in a highlighting <span>.
  # NOTE(review): apart from that <span>, the strings below carry no HTML
  # tags; the surrounding markup was presumably lost. TODO: restore it.
  {
    echo ""
    printf '\n\nRelation en japonais\n\n'
    grep -i -B1 -A1 -- "关系" "$page" \
      | sed 's/关系/<span style="background-color: red;font-weight:bold;">关系<\/span>/g'
    grep -i -B1 -A1 -- "関係" "$page" \
      | sed 's/関係/<span style="background-color: red;font-weight:bold;">関係<\/span>/g'
    printf '\n\n\n'
    printf '\n\n\n\n'
    printf '%s\n' "Occurences = $occ"
    printf '\n\n\n'
    echo ""
  } > "$ctx"

  # Add an entry to the table.
  {
    echo " "
    printf ' url %s\n\n%s\n' "$i" "$nom"
    printf ' PAGE ASPIREE %s\n' "$i"
    printf ' PAGE DUMP %s\n' "$i"
    echo " CONTEXT"
    echo " "
  } >> "$tablo"

  i=$((i + 1))
done 3< "$fic1"

# Output written between the two URL lists.
# NOTE(review): only blank lines are emitted; presumably this was the closing
# markup of the first table and the opening markup of the second one, lost
# before this script reached the repository. TODO: restore it.
{
  printf '\n'
  printf '\n\n\n\n'

  # Create the second table.
  printf '\n\n\n'
  printf '\n\n\n'
} >> "$tablo"

# Process every URL of the second list ($fic2): optionally download the page
# and convert it to UTF-8, dump it as text, extract the context of the
# searched words, count their occurrences and append an entry to $tablo.
#
# The list is read line by line on file descriptor 3: URLs are not
# word-split or glob-expanded (a `?` in a URL is a glob character), and the
# commands inside the loop cannot swallow the list through stdin.
while read -r nom <&3 || [ -n "$nom" ]; do
  [ -n "$nom" ] || continue   # skip blank lines

  page=./PAGES-ASPIREES/$i.html
  old=./PAGES-ASPIREES/$i.old.html
  conv=./PAGES-ASPIREES/c_$i.html
  dump=./DUMP-TEXT/$i.txt
  ctx=./CONTEXT/$i.html

  # Download the page.
  if [ "$want_dl" = 1 ]; then
    printf '%s\n' "$nom"

    # wget reports failures with several non-zero codes, not only 1.
    if ! wget -t 20 -O "$page" "$nom"; then
      printf '%s\n' "$nom" > "$page.error"
    fi

    # Convert the page to UTF-8 (replaces an earlier, commented-out variant
    # that only handled gb2312). Only the first declared charset is used;
    # printing every match would glue several names together.
    charset=$(perl -ne 'if (/ ?charset=([^"]+)"/) { print $1; exit }' "$page")
    mv "$page" "$old"
    if [ -n "$charset" ] && iconv -f "$charset" -t utf-8 "$old" > "$conv"; then
      # assumes the charset name holds no character special to sed
      sed "s/charset=${charset}/charset=utf-8/g" "$conv" > "$page"
    else
      # No charset declared, or conversion failed: keep the page untouched.
      cp "$old" "$conv"
      cp "$old" "$page"
    fi
  fi

  # Dump the page as text (the command must stay one logical line).
  lynx -nolist -dump -assume_local_charset=utf-8 -display_charset=utf-8 \
    "$page" > "$dump"

  # Number of occurrences in the text dump. grep -o prints one line per
  # match, so this counts occurrences and not merely matching lines, and it
  # yields 0 instead of an empty string when the dump is missing.
  occ1=$(grep -i -o -- "关系" "$dump" | wc -l)
  occ2=$(grep -i -o -- "関係" "$dump" | wc -l)
  occ=$((occ1 + occ2))

  # Context extraction: each matching line of the page with one line before
  # and after, the searched word wrapped in a highlighting <span>.
  # NOTE(review): apart from that <span>, the strings below carry no HTML
  # tags; the surrounding markup was presumably lost. TODO: restore it.
  {
    echo ""
    printf '\n\nRelation en chinois\n\n'
    grep -i -B1 -A1 -- "关系" "$page" \
      | sed 's/关系/<span style="background-color: red;font-weight:bold;">关系<\/span>/g'
    grep -i -B1 -A1 -- "関係" "$page" \
      | sed 's/関係/<span style="background-color: red;font-weight:bold;">関係<\/span>/g'
    printf '\n\n\n'
    printf '\n\n\n\n'
    printf '%s\n' "Occurences = $occ"
    printf '\n\n\n'
    echo ""
  } > "$ctx"

  # Add an entry to the table.
  {
    echo " "
    printf ' url %s\n\n%s\n' "$i" "$nom"
    printf ' PAGE ASPIREE %s\n' "$i"
    printf ' PAGE DUMP %s\n' "$i"
    echo " CONTEXT"
    echo " "
  } >> "$tablo"

  i=$((i + 1))
done 3< "$fic2"

# Final output appended to the index file.
# NOTE(review): only blank lines are emitted; presumably this was the closing
# markup of the second table and of the document, lost before this script
# reached the repository. TODO: restore it.
{
  printf '\n'
  printf '\n\n\n\n'
  printf '\n'
} >> "$tablo"