#!/usr/bin/perl

<<DOC; 
S.Fleury
December 2000

               makeCorpusTAGForLexico.pl :

DOC
#-------------------------------------------------------------------
sub GetParams {
    # Parse the leading command-line options (arguments starting with '-')
    # and remove them from @ARGV, leaving only the positional arguments.
    #
    # Recognised options:
    #   -help, -h   print the help text and exit
    #   -at PATH    store PATH (backslashes converted to '/') in $AskedPath
    #
    # Any other option prints the usage text and exits with status 1.
    # No parameters; no meaningful return value.
    # Side effects: sets the globals $InFile and $AskedPath, consumes @ARGV.
    print "Getting parameters...\n";

    $InFile = "";

    # "@ARGV &&" stops the loop cleanly once every argument has been consumed.
    while (@ARGV && $ARGV[0] =~ /^-/) {
        my $option = $ARGV[0];

        if ($option eq "-help" || $option eq "-h") {
            # Both spellings behave the same way: show the help once, then
            # stop.  (Previously -help printed the usage twice and -h carried
            # on processing.)
            PrintHelp();
            exit;
        }
        elsif ($option eq "-at") {
            my $value = NextArg();    # drops "-at"; the value is now $ARGV[0]
            if (!defined $value) {
                print "Option -at requires a path\n";
                PrintUsage();
                exit 1;
            }
            $AskedPath = shell_path($value);
        }
        else {
            PrintUsage();
            exit 1;
        }

        # Drop the option itself, or its value in the case of -at.
        shift @ARGV;
    }

    return;
}  # GetParams
#-------------------------------------------------------------------
sub shell_path {
  # Convert a DOS-style path to a shell-style one by turning every
  # backslash into a forward slash.
  # Parameter: the path to convert
  # Returns:   the converted path
  my ($path) = @_;
  $path =~ tr{\\}{/};
  return $path;
}
#-------------------------------------------------------------------
sub get_path_to_results {
    # Compute the directory prefix that is prepended to every result file
    # name and store it in the global $path_to_results.
    #
    # Reads the globals $CWD and $AskedPath.  $AskedPath is modified in
    # place so that it ends with a slash.
    # No parameters.

    # Default: results go to the current working directory.
    $path_to_results = $CWD;
    return unless $AskedPath;

    # Make sure the requested path ends with a slash.
    $AskedPath .= '/' if $AskedPath !~ m{/$};

    # A leading slash, optionally preceded by a drive letter ("C:/..."),
    # marks a full path; anything else is relative to the default directory.
    my $is_full_path = $AskedPath =~ m{^(?:\w:)?/};
    $path_to_results = $is_full_path
                     ? $AskedPath
                     : $path_to_results . $AskedPath;
}
#-------------------------------------------------------------------
sub PrintUsage {
    # Print the command-line synopsis on standard output.
    # No parameters; returns the result of print.
    #
    # The synopsis names the two positional arguments the main section reads
    # ($ARGV[0]: corpus XML file, $ARGV[1]: tag list file) and explains the
    # -at option, which the previous, empty "Usage:" line did not.

 print <<"E_O_T";

Usage:         $0 [-help|-h] [-at PATH] CORPUS_XML TAG_LIST

Options:       -help/-h   print this message
               -at PATH   directory in which the result files are written
                          (default: the current working directory)

Arguments:     CORPUS_XML XML corpus file to read
               TAG_LIST   file listing the tags to select, one per line
                          (upper-case tag name followed by a number)

Configuring:   see the MAIN section

E_O_T
}
#-------------------------------------------------------------------
sub PrintHelp {
    # The help text is simply the usage summary: delegate to PrintUsage.
    return PrintUsage(@_);
}
#-------------------------------------------------------------------
sub NextArg {
  # Discard the argument currently at the front of @ARGV and return the
  # one that follows it, which stays in @ARGV as its new first element.
  # No parameters.
  # Returns the new first argument, or undef when @ARGV is exhausted.
  splice(@ARGV, 0, 1);
  my ($following) = @ARGV;
  return $following;
}  # NextArg
#-------------------------------------------------------------------
sub GetCWD {

  # -- GetCWD
  # Return the current working directory with a guaranteed trailing slash.
  # No parameters
  # Returns cwd
  # The directory comes from Cwd::getcwd rather than the `pwd` command
  # (change by Calin MOSUT: `pwd` could not reach the mount point on a
  # Windows machine).

  use Cwd;

  my $cwd = getcwd();
  $cwd =~ s/\n//g;                    # drop any newline characters
  $cwd .= '/' unless $cwd =~ m#/$#;   # make sure it ends with a slash

  return $cwd;

}  # GetCWD
#-------------------------------------------------------------------
sub by_count {
    # sort() comparator: orders keys of %seen2 by their value, numerically,
    # largest first.
    my ($count_b, $count_a) = @seen2{$b, $a};
    return $count_b <=> $count_a;
}
#-------------------------------------------------------------------
sub by_count2 {
    # sort() comparator: orders keys of %seen3 by their value, numerically,
    # largest first.
    # NOTE(review): the main section stores "TRAIT<n>" labels in %seen3;
    # such strings all compare as equal under <=>.  Confirm the intended
    # ordering before using this comparator.
    my ($value_b, $value_a) = @seen3{$b, $a};
    return $value_b <=> $value_a;
}
#--------------------MAIN-------------------
# Main program.
#
#   1. Parse the options, then work out where the result files go.
#   2. Read the tag list ($ARGV[1]): every line made of an upper-case tag
#      name, some non-digit separator and a number selects that tag.
#   3. Scan the corpus ($ARGV[0]) site by site and page by page.  For every
#      selected <tagHTML ...>BEGIN element, build a key from the tag name and
#      its attributes, count it, and give it a short "TRAIT<n>" label.
#   4. Write four files under $path_to_results:
#        SELECTEDTAG.txt      full tag keys (and END@TAG markers) per page
#        SELECTEDTAGCONV.txt  the same pages using the TRAIT<n> labels
#        SELECTEDTAGFRQ.txt   tag key <TAB> count, most frequent first
#        LIST-CONVTAG.txt     TRAIT<n> label <TAB> tag key
#
# Progress traces are printed on standard output.

use strict;
use warnings;

# Shared with the subroutines defined above, which use package globals.
our ($CWD, $path_to_results, %seen2, %seen3);

# Remove spaces and the characters " . ' , ; : ! ? _ / \ # - from an
# attribute name or value, so that it can be embedded in a '#'-delimited key.
sub _strip_attr_punct {
    my ($text) = @_;
    $text =~ tr{ ".',;:!?_/\\#-}{}d;
    return $text;
}

# Start a new output line in $$text_ref once more than ten items have been
# written on the current one, and reset the per-line counter $$count_ref.
sub _wrap_after_ten {
    my ($text_ref, $count_ref) = @_;
    return if $$count_ref <= 10;
    $$text_ref .= "\n";
    $$count_ref = 0;
    return;
}

# Open a result file for writing under $path_to_results; dies on failure.
sub _open_result {
    my ($name) = @_;
    my $path = $path_to_results . $name;
    open(my $fh, '>', $path) or die "Cannot write '$path': $!\n";
    return $fh;
}

GetParams();

# The working directory must be known before get_path_to_results() runs,
# because that routine uses $CWD as the default result location.
$CWD = GetCWD();
get_path_to_results();

if (@ARGV < 2) {
    PrintUsage();
    die "Two arguments are required: the corpus XML file and the tag list file\n";
}
my ($corpus_file, $taglist_file) = @ARGV;

print $corpus_file, "\n";
print $taglist_file, "\n";
print $path_to_results, "\n";

open(my $corpus_fh, '<', $corpus_file)
    or die "Cannot read corpus file '$corpus_file': $!\n";
open(my $taglist_fh, '<', $taglist_file)
    or die "Cannot read tag list '$taglist_file': $!\n";

my $selected_fh  = _open_result("SELECTEDTAG.txt");
my $converted_fh = _open_result("SELECTEDTAGCONV.txt");
my $freq_fh      = _open_result("SELECTEDTAGFRQ.txt");
my $mapping_fh   = _open_result("LIST-CONVTAG.txt");

# %seen  : tag name  => number read from the tag list (selected tags)
# %seen2 : tag key   => number of occurrences in the corpus
# %seen3 : tag key   => TRAIT<n> label
my %seen;
%seen2 = ();
%seen3 = ();

while (my $line = <$taglist_fh>) {
    # Strip the line ending so that CRLF files match the anchored pattern.
    $line =~ s/\r?\n\z//;
    if ($line =~ /^([A-Z]+)[^0-9]+([0-9]+)$/) {
        print $1, "\t", $2, "\n";
        $seen{$1} = $2;
    }
}
close($taglist_fh);

# NOTE: an unconditional "exit" used to sit here; it made everything below
# unreachable and left the four result files empty.  It has been removed as
# a debugging leftover.

my $cptTrait = 1;    # number used for the next new TRAIT<n> label

while (defined(my $line = <$corpus_fh>)) {
    next unless $line =~ /<SITEName>([^<]+)<\/SITEName>/;

    my $sitename      = $1;
    my $selectedSite  = "";    # text for SELECTEDTAG.txt
    my $selectedSite2 = "";    # text for SELECTEDTAGCONV.txt

    # Every "defined $line" test below guards against a truncated corpus:
    # at end of file the read returns undef and the loops must stop.
    $line = <$corpus_fh>;
    while (defined $line && $line !~ /<\/SITE>/) {
        if ($line =~ /<SITEFile>/i) {
            $line = <$corpus_fh>;
            if (defined $line
                && $line =~ /<SITEFileName>([^>]*)<\/SITEFileName>/i)
            {
                my $filename = $1;
                print "\$ <SITENAME=$sitename>\n";
                $selectedSite  .= "\$ <SITENAME=$sitename>\n";
                $selectedSite2 .= "\$ <SITENAME=$sitename>\n";
                $selectedSite  .= " <PAGE=$filename>\n";
                $selectedSite2 .= " <PAGE=$filename>\n";
                print "<PAGE=$filename>\n";
            }

            $line = <$corpus_fh>;
            my $cpt  = 0;    # items on the current line of $selectedSite
            my $cpt2 = 0;    # items on the current line of $selectedSite2

            while (defined $line && $line !~ /<\/SITEFile>/) {
                if ($line =~ /<tagHTML TAGType=\"([^\"]+)\" NBATTR=\"([^\"]+)\">BEGIN/i) {
                    my $typeLink = uc($1);
                    my $attrNb   = $2;

                    if (exists $seen{$typeLink}) {
                        print "$typeLink\n";

                        # The next $attrNb lines are expected to carry the
                        # attributes of this tag, one per line.
                        my $attrValue = "";
                        for my $i (1 .. $attrNb) {
                            $line = <$corpus_fh>;
                            last unless defined $line;
                            if ($line =~ /<tagHTMLAttr TAG=\"([^\"]+)\" NUM=\"([^\"]+)\" ATTRType=\"([^\"]+)\" VALUE=\"([^\"]+)\"\/>/i) {
                                my ($attr_type, $attr_value) = ($3, $4);
                                print "$i\t$typeLink : ($attr_type) ($attr_value)\n";
                                $attrValue .= "#Type#"
                                            . _strip_attr_punct($attr_type)
                                            . "#Value#"
                                            . _strip_attr_punct($attr_value)
                                            . "#";
                            }
                        }

                        # Collapse the "##" left where two attributes meet.
                        $attrValue =~ s/\#\#/\#/g;

                        my $tmpTag = $typeLink . uc($attrValue);
                        $seen2{$tmpTag}++;
                        $selectedSite .= $tmpTag . " ";

                        # First occurrence of this key: allocate its label.
                        if (!exists $seen3{$tmpTag}) {
                            $seen3{$tmpTag} = "TRAIT" . $cptTrait;
                            $cptTrait++;
                        }
                        $selectedSite2 .= $seen3{$tmpTag} . " ";
                        print "$seen3{$tmpTag} = $tmpTag\n";

                        $cpt++;
                        $cpt2++;
                    }
                    _wrap_after_ten(\$selectedSite,  \$cpt);
                    _wrap_after_ten(\$selectedSite2, \$cpt2);
                }

                # $line may have advanced while the attributes were read.
                if (defined $line
                    && $line =~ /<tagHTML TAGType=\"([^\"]+)\">END/i)
                {
                    my $tagName = uc($1);
                    if (exists $seen{$tagName}) {
                        # Closing tags only appear in the full-key file.
                        $selectedSite .= 'END@' . $tagName . " ";
                        $cpt++;
                    }
                    _wrap_after_ten(\$selectedSite,  \$cpt);
                    _wrap_after_ten(\$selectedSite2, \$cpt2);
                }

                $line = <$corpus_fh>;
            }

            $selectedSite  .= "\n";
            $selectedSite2 .= "\n";
        }

        $line = <$corpus_fh>;
    }

    print {$selected_fh}  $selectedSite;
    print {$converted_fh} $selectedSite2;
}

for my $tag (sort by_count keys %seen2) {
    print {$freq_fh} $tag, "\t", $seen2{$tag}, "\n";
}

for my $tag (keys %seen3) {
    print {$mapping_fh} $seen3{$tag}, "\t", $tag, "\n";
}

close($corpus_fh);

# Buffered write errors only surface at close time, so check every one.
for my $fh ($selected_fh, $converted_fh, $freq_fh, $mapping_fh) {
    close($fh) or die "Error while closing a result file: $!\n";
}


