#!/usr/bin/perl
#  CooccurrencesDansFenetre2IM.pl

<<DOC; 
B. Habert
lun mar 20 19:57:26 CET 2006

               CooccurrencesDansFenetre2IM.pl :


Format d'entrée : 
     . Index :


<index>
<t c="sceptique_A" f="21"/>
<t c="failler_V" f="1"/>
<t c="station_Nc" f="2"/>
<t c="quantificateur_Nc" f="1"/>
<t c="3x3_A" f="1"/>
<t c="rabbin_Nc" f="3"/>
<t c="Dostoïevsky_Np" f="5"/>

     . Cooccurrences


<cooccurrences>
<t c1="type_Nc" c2="mental_A" cf="3" dm="10.67"/>
<t c1="Lévy-Bruhl_Np" c2="Levy-Bruhl_Np" cf="4" dm="13.50"/>
<t c1="matière_Nc" c2="contemporain_A" cf="4" dm="12.50"/>
<t c1="philosophie_Nc" c2="Alexius_Np" cf="5" dm="9.00"/>
<t c1="pensée_Nc" c2="technique_Nc" cf="4" dm="12.00"/>
<t c1="raison_Nc" c2="humain_A" cf="4" dm="6.50"/>
<t c1="Paris_Np" c2="1888_Nc" cf="4" dm="5.75"/>
<t c1="Saint-Simon_Np" c2="philosophie_Nc" cf="3" dm="8.67"/>
<t c1="langage_Nc" c2="philosophique_A" cf="3" dm="13.33"/>
<t c1="Diogène_Np" c2="vie_Nc" cf="3" dm="3.67"/>

Exemple : CooccurrencesDansFenetre2IM.pl PHILOLynx-dump-normaliseCordialLemmeCatBreve.ArticleEgalFenetre.index PHILOLynx-dump-normaliseCordialLemmeCatBreve.ArticleEgalFenetre.CoocMaxDistance20MinCooc3 5 > PHILOLynx-dump-normaliseCordialLemmeCatBreve.ArticleEgalFenetre.CoocMaxDistance20MinCooc5.IM

Exemple d'appel :  perl CooccurrencesDansFenetre2IM.pl PHILOLynx-dump-normaliseCordialLemmeCatBreveArticleEgalFenetre.Index PHILOLynx-dump-normaliseCordialLemmeCatBreveArticleEgalFenetre.CoocMaxDistance20MinCooc3 5 > PHILOLynx-dump-normaliseCordialLemmeCatBreveArticleEgalFenetreCoocMaxDistance20MinCooc5.IM


Format de sortie :

<cooccurrent1><cooccurrent2><fréquence cooccurrent1><fréquence cooccurrent2><cofréquence><distance moyenne><information mutuelle>

<?xml version="1.0" encoding="iso-8859-1"?>
<information_mutuelle occurrences="134102">
<c c1="philosophie_Nc" c2="Alexius_Np"  fc1="855" fc2="8" cf="5" dm="9.00" im="6.62"/>
<c c1="lumière_Nc" c2="lien_Nc"  fc1="77" fc2="101" cf="9" dm="9.67" im="7.28"/>
<c c1="it_Nc" c2="pt_Nc"  fc1="46" fc2="58" cf="16" dm="14.12" im="9.65"/>
<c c1="Tseu_Np" c2="Tseu_Np"  fc1="7" fc2="7" cf="5" dm="9.40" im="13.74"/>
<c c1="état_Nc" c2="philosophie_Nc"  fc1="83" fc2="855" cf="6" dm="12.00" im="3.50"/>
<c c1="catégorie_Nc" c2="der_Nc"  fc1="212" fc2="55" cf="6" dm="10.00" im="6.11"/>
<c c1="volonté_Nc" c2="social_A"  fc1="49" fc2="89" cf="5" dm="9.80" im="7.26"/>
<c c1="fragment_Nc" c2="Mill_Np"  fc1="8" fc2="10" cf="5" dm="10.40" im="13.03"/>
<c c1="externe_A" c2="ligne_Nc"  fc1="65" fc2="20" cf="5" dm="9.00" im="9.01"/>
<c c1="philosophie_Nc" c2="métaphysique_Nc"  fc1="855" fc2="50" cf="7" dm="12.57" im="4.46"/>
<c c1="dieu_Nc" c2="existence_Nc"  fc1="107" fc2="84" cf="6" dm="10.83" im="6.48"/>
<c c1="chose_Nc" c2="cause_Nc"  fc1="168" fc2="62" cf="6" dm="10.17" im="6.27"/>
<c c1="1908_Nc" c2="1908_Nc"  fc1="7" fc2="7" cf="5" dm="11.40" im="13.74"/>
<c c1="société_Nc" c2="connaissance_Nc"  fc1="103" fc2="183" cf="13" dm="10.00" im="6.53"/>
<c c1="Mill_Np" c2="James_Np"  fc1="10" fc2="38" cf="24" dm="9.04" im="13.05"/>
<c c1="philosophie_Nc" c2="externe_A"  fc1="855" fc2="65" cf="12" dm="10.25" im="4.86"/>
<c c1="http_Nc" c2="Canguilhem_Np"  fc1="8" fc2="17" cf="6" dm="10.83" im="12.53"/>
<c c1="présocratique_A" c2="liste_Nc"  fc1="36" fc2="38" cf="5" dm="8.40" im="8.94"/>
<c c1="âme_Nc" c2="existence_Nc"  fc1="83" fc2="84" cf="6" dm="9.67" im="6.85"/>
<c c1="idée_Nc" c2="homme_Nc"  fc1="133" fc2="288" cf="7" dm="12.29" im="4.62"/>
<c c1="connaissance_Nc" c2="objet_Nc"  fc1="183" fc2="143" cf="15" dm="9.87" im="6.26"/>

A faire :

Bugs et problèmes :

DOC



use strict ;
use warnings ;

# Main program: compute the pointwise mutual information (base-2 logarithm)
# of co-occurring pairs and write the result to STDOUT as an
# <information_mutuelle> XML document.
#
# Command line (exactly three arguments):
#   1. index file         : lines of the form <t c="lemma_Cat" f="21"/>
#   2. co-occurrence file : lines of the form
#                           <t c1="a_Nc" c2="b_A" cf="4" dm="6.50"/>
#   3. floor              : minimum value required both for the frequency
#                           of each member of a pair and for the pair's
#                           co-frequency
#
# Dies with the usage message on a bad command line, and with an explicit
# message when either input file cannot be opened.

my $ChaineUsage = "Usage : CooccurrencesDansFenetre2IM.pl <index><cooccurrences><fréquence plancher>\n" ; 
if (@ARGV != 3) {die $ChaineUsage ; }
my ($FichierIndex, $FichierCooccurrences, $CooccurrencePlancher) = @ARGV ;
# A non-numeric floor would silently compare as 0 and let every pair through.
if ($CooccurrencePlancher !~ /\A\d+(?:\.\d+)?\z/) {die $ChaineUsage ; }

my $Trace = 0 ;                  # set to 1 to echo what is read on STDOUT
my $Occurrences = 0 ;            # sum of all frequencies found in the index
my %Cooccurrent2Frequence = () ; # item => frequency, for items >= floor

# $PourWindows is never assigned in this file; it is kept as a package
# variable so that it can still be set from outside (presumably via
# "perl -s" -- TODO confirm).  Unset or 0 selects Unix line endings.
our $PourWindows ;
my $FinLigne = (($PourWindows || 0) == 0) ? "\n" : "\r\n" ;

# --- Pass 1: read the index -------------------------------------------------
open(my $Index, '<', $FichierIndex)
  or die "Impossible d'ouvrir le fichier d'index '$FichierIndex' : $!\n" ;
#<t c="sceptique_A" f="21"/>
while (my $LigneIndex = <$Index>) {
  next unless $LigneIndex =~ /<t c="(.+)" f="(\d+)/ ;
  my ($Cooccurrent, $Frequence) = ($1, $2) ;
  # Every indexed item counts towards the corpus size, even those below
  # the floor that are not kept in the hash.
  $Occurrences += $Frequence ;
  next if $Frequence < $CooccurrencePlancher ;
  $Cooccurrent2Frequence{$Cooccurrent} = $Frequence ;
  if ($Trace == 1) {print "$Cooccurrent\t$Frequence\n" ; }
}
close($Index) ;

# Open the second input before emitting anything, so that a failure does
# not leave a truncated XML document on STDOUT.
open(my $Cooccurrences, '<', $FichierCooccurrences)
  or die "Impossible d'ouvrir le fichier de cooccurrences '$FichierCooccurrences' : $!\n" ;

# --- Pass 2: score each co-occurring pair -----------------------------------
imprimeEnTete() ;
#<t c1="raison_Nc" c2="humain_A" cf="4" dm="6.50"/>
while (my $LigneCooc = <$Cooccurrences>) {
  if ($Trace == 1) {print $LigneCooc ; }
  next unless $LigneCooc =~ /<t c1="(.+)" c2="(.+)" cf="(\d+)" dm="([0-9.]+)/ ;
  my ($Cooc1, $Cooc2, $Cofrequence, $DistanceMoyenne) = ($1, $2, $3, $4) ;
  if ($Trace == 1) {print "$Cooc1\t$Cooc2\t$Cofrequence\t$DistanceMoyenne$FinLigne" ; }

  # Both members must have reached the floor in the index, and so must
  # the pair itself.
  next unless exists $Cooccurrent2Frequence{$Cooc1}
           && exists $Cooccurrent2Frequence{$Cooc2}
           && $Cofrequence >= $CooccurrencePlancher ;

  my $FrequenceCooc1 = $Cooccurrent2Frequence{$Cooc1} ;
  my $FrequenceCooc2 = $Cooccurrent2Frequence{$Cooc2} ;
  # With a floor of 0, zero counts can get this far; the logarithm (or the
  # division) below would then be fatal, so such pairs are skipped.
  next unless $Cofrequence > 0 && $FrequenceCooc1 > 0 && $FrequenceCooc2 > 0 ;

  my $ProbabiliteCofrequence    = $Cofrequence / $Occurrences ;
  my $ProbabiliteFrequenceCooc1 = $FrequenceCooc1 / $Occurrences ;
  my $ProbabiliteFrequenceCooc2 = $FrequenceCooc2 / $Occurrences ;
  # log2( P(c1,c2) / (P(c1) * P(c2)) )
  my $InformationMutuelle =
    log($ProbabiliteCofrequence / ($ProbabiliteFrequenceCooc1 * $ProbabiliteFrequenceCooc2)) / log(2) ;

  # The values are passed as printf arguments, never interpolated into the
  # format, so a '%' inside an item name cannot be taken as a directive.
  printf "<c c1=\"%s\" c2=\"%s\"  fc1=\"%s\" fc2=\"%s\" cf=\"%s\" dm=\"%s\" im=\"%1.2f\"/>%s",
    $Cooc1, $Cooc2, $FrequenceCooc1, $FrequenceCooc2, $Cofrequence,
    $DistanceMoyenne, $InformationMutuelle, $FinLigne ;
}
close($Cooccurrences) ;
imprimeEnPied() ;


# Print the document header on STDOUT: the XML declaration, then the opening
# <information_mutuelle> root tag whose "occurrences" attribute carries the
# current value of $Occurrences.  Each of the two lines is terminated by
# $FinLigne.  Takes no arguments; both variables come from the enclosing file.
sub imprimeEnTete{
  my $Declaration  = '<?xml version="1.0" encoding="iso-8859-1"?>' ;
  my $BaliseRacine = '<information_mutuelle occurrences="' . $Occurrences . '">' ;
  print $Declaration . $FinLigne . $BaliseRacine . $FinLigne ;
}

# Print the document footer on STDOUT: the closing </information_mutuelle>
# root tag, terminated by $FinLigne (taken from the enclosing file).
# Takes no arguments.
sub imprimeEnPied{
  my $BaliseFermante = '</information_mutuelle>' ;
  print $BaliseFermante . $FinLigne ;
}
