#!/usr/bin/perl
# Aligns the corpus at the level of titles and articles: the same document is
# read in six languages and written side by side into one HTML table.

# Parser state shared (and reset) by the per-language sections below.
my $i=0;          # index of the next title/article slot to fill
my $check = 0;    # 1 while inside a heading that spans several lines
my $concat="";    # text accumulated for such a multi-line heading
my $contenu = 0;  # 1 while collecting the body text that follows a heading
my $go="";        # body text accumulated for the current article

# One UTF-8 HTML input per language. The inputs are opened before the output
# file, so a missing input aborts the run before the output is created.
# Each failure names the file involved.
open(FILEIN,  "<:encoding(UTF-8)", "./frn.htm") or die "Cannot open ./frn.htm: $!";   # French
open(FILEIN2, "<:encoding(UTF-8)", "./eng.htm") or die "Cannot open ./eng.htm: $!";   # English
open(FILEIN3, "<:encoding(UTF-8)", "./jpn.htm") or die "Cannot open ./jpn.htm: $!";   # Japanese
open(FILEIN4, "<:encoding(UTF-8)", "./blg.htm") or die "Cannot open ./blg.htm: $!";   # Bulgarian
open(FILEIN5, "<:encoding(UTF-8)", "./arz.htm") or die "Cannot open ./arz.htm: $!";   # Arabic
open(FILEIN6, "<:encoding(UTF-8)", "./chn.htm") or die "Cannot open ./chn.htm: $!";   # Chinese
open(FILEOUT, ">:encoding(UTF-8)", "./alignement.html") or die "Cannot open ./alignement.html for writing: $!";

# Start of the output document; the table rows are appended once every
# language has been parsed.
print FILEOUT "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n
			   </head>\n<body style=\"background-color: green;\">\n<table style=\"background-color: white; width: 100%; margin: auto;\">\n";
	
	
####################
## FRENCH
####################
# Fills %article from FILEIN: each slot holds, in document order, either the
# text of a heading or the body text that followed it, with markup removed.
my %article=();

while (my $ligne = <FILEIN>){
	chomp $ligne;
	if ($contenu == 1){
		# Every non-heading line belongs to the current article, including
		# the line that carries the closing </div>. Markup is stripped once,
		# when the article is stored, so the raw line is accumulated here.
		if ($ligne !~ m/<h\d>/){
			$go .= $ligne;
		}
		# A new heading or a closing </div> ends the current article.
		if (($ligne =~ m/<h\d>/) || ($ligne =~ m/<\/div>/)){
			$article{$i} = $go;
			$article{$i} =~ s/<[^>]*>//g;
			$go="";
			$i++;
			$contenu = 0;
		}
	}
	if($ligne =~ m/<h\d>/){
		$article{$i} = $ligne;
		$article{$i} =~ s/<[^>]*>//g;
		$i++;
		# Body text is collected after every heading except <h3>.
		$contenu = 1 unless ($ligne =~ m/<h3>/);
	}
}

# The input may end while an article is still being collected (no closing
# </div> and no further heading); store that text instead of losing it.
if (($contenu == 1) && ($go ne "")){
	$article{$i} = $go;
	$article{$i} =~ s/<[^>]*>//g;
	$go="";
	$i++;
	$contenu = 0;
}


###################
## ENGLISH
###################
# Fills %article2 from FILEIN2: each slot holds, in document order, either the
# text of a heading or the body text that followed it, with markup removed.
$i=0;
$contenu = 0;
$go="";
my %article2=();

while (my $ligne = <FILEIN2>){
	chomp $ligne;
	if ($contenu == 1){
		my $fin_article = (($ligne =~ m/<h\d>/) || ($ligne =~ m/<\/div>/));
		if ($fin_article){
			# A new heading or a closing </div> ends the current article;
			# that terminating line itself is not part of the body.
			$article2{$i} = $go;
			$article2{$i} =~ s/<[^>]*>//g;
			$i++;
			$contenu = 0;
			$go="";
		}
		else{
			$go .= $ligne;
		}
	}
	if($ligne =~ m/<h\d>/){
		$article2{$i} = $ligne;
		$article2{$i} =~ s/<[^>]*>//g;
		$i++;
		# Body text is collected after every heading except <h3>.
		$contenu = 1 unless ($ligne =~ m/<h3>/);
	}
}

# The input may end while an article is still being collected (no closing
# </div> and no further heading); store that text instead of losing it.
if (($contenu == 1) && ($go ne "")){
	$article2{$i} = $go;
	$article2{$i} =~ s/<[^>]*>//g;
	$go="";
	$i++;
	$contenu = 0;
}


##################
## JAPANESE
##################
# Fills %article3 from FILEIN3: each slot holds, in document order, either the
# text of a heading or the body text that followed it, with markup removed.
# A heading here may be spread over several lines: $check marks that state and
# $concat accumulates the heading until its closing tag is seen.
$i=0;
$check = 0;
$concat="";
$contenu = 0;
$go="";
my %article3=();

while (my $ligne = <FILEIN3>){
	chomp $ligne;
	if ($contenu == 1){
		if (($ligne !~ m/<h\d>/) && ($ligne !~ m/<\/div>/)){
			$go .= $ligne;
		}
		# A new heading or a closing </div> ends the current article.
		if (($ligne =~ m/<h\d>/) || ($ligne =~ m/<\/div>/)){
			$article3{$i} = $go;
			$article3{$i} =~ s/<[^>]*>//g;
			$go="";
			$i++;
			$contenu = 0;
		}
	}

	# Continuation of a heading whose opening tag was on an earlier line.
	if ($check == 1){
		if ($ligne =~ m/<\/h\d>/){
			$concat .= $ligne;
			$article3{$i} = $concat;
			$article3{$i} =~ s/<[^>]*>//g;
			$check = 0;
			$i++;
			# The opening tag was on an earlier line, so the heading level
			# has to be read from the closing tag: no body after an <h3>.
			$contenu = 1 unless ($ligne =~ m/<\/h3>/);
		}
		else{
			$concat .= $ligne;
		}
	}

	# Heading opened and closed on the same line.
	if($ligne =~ m/<h\d>.*<\/h\d>/){
		$article3{$i} = $ligne;
		$article3{$i} =~ s/<[^>]*>//g;
		$i++;
		$contenu = 1 unless ($ligne =~ m/<h3>/);
	}

	# Heading opened but not closed on this line: start accumulating it.
	# This runs after the continuation branch, so the opening line is not
	# appended twice.
	if(($ligne =~ m/<h\d>/) && ($ligne !~ m/<\/h\d/)){
		$concat = $ligne;
		$check=1;
	}
}

# The input may end while an article is still being collected (no closing
# </div> and no further heading); store that text instead of losing it.
if (($contenu == 1) && ($go ne "")){
	$article3{$i} = $go;
	$article3{$i} =~ s/<[^>]*>//g;
	$go="";
	$i++;
	$contenu = 0;
}


####################
## BULGARIAN
####################
# Fills %article4 from FILEIN4: each slot holds, in document order, either the
# text of a heading or the body text that followed it, with markup removed.
$i=0;
$contenu = 0;
$go="";
my %article4=();

while (my $ligne = <FILEIN4>){
	chomp $ligne;
	if ($contenu == 1){
		my $fin_article = (($ligne =~ m/<h\d>/) || ($ligne =~ m/<\/div>/));
		if ($fin_article){
			# A new heading or a closing </div> ends the current article;
			# that terminating line itself is not part of the body.
			$article4{$i} = $go;
			$article4{$i} =~ s/<[^>]*>//g;
			$go="";
			$i++;
			$contenu = 0;
		}
		else{
			$go .= $ligne;
		}
	}
	if($ligne =~ m/<h\d>/){
		$article4{$i} = $ligne;
		$article4{$i} =~ s/<[^>]*>//g;
		$i++;
		# Body text is collected after every heading except <h3>.
		$contenu = 1 unless ($ligne =~ m/<h3>/);
	}
}

# The input may end while an article is still being collected (no closing
# </div> and no further heading); store that text instead of losing it.
if (($contenu == 1) && ($go ne "")){
	$article4{$i} = $go;
	$article4{$i} =~ s/<[^>]*>//g;
	$go="";
	$i++;
	$contenu = 0;
}


##################
## ARABIC
##################
# Fills %article5 from FILEIN5: each slot holds, in document order, either the
# text of a heading or the body text that followed it, with markup removed.
# Heading tags are matched without the closing '>' (so tags carrying
# attributes are recognised too), and a heading may span several lines:
# $check marks that state and $concat accumulates the heading text.

$i=0;
$check = 0;
$concat="";
$contenu = 0;
$go="";
my %article5=();

while (my $ligne = <FILEIN5>){
	chomp $ligne;
	# Drop anchors together with their link text. The \b keeps the pattern
	# from starting at other tags whose name merely begins with "a"
	# (<abbr>, <article>, ...).
	$ligne =~ s/<a\b.*?<\/a>//g;
	if ($contenu == 1){
		if (($ligne !~ m/<h\d/) && ($ligne !~ m/<\/div>/)){
			$go .= $ligne;
		}
		# A new heading or a closing </div> ends the current article.
		if (($ligne =~ m/<h\d/) || ($ligne =~ m/<\/div>/)){
			$article5{$i} = $go;
			$article5{$i} =~ s/<[^>]*>//g;
			$go="";
			$i++;
			$contenu = 0;
		}
	}

	# Continuation of a heading whose opening tag was on an earlier line.
	if ($check == 1){
		if ($ligne =~ m/<\/h\d/){
			$concat .= $ligne;
			$article5{$i} = $concat;
			$article5{$i} =~ s/<[^>]*>//g;
			$i++;
			$check=0;
			# The heading level is read from the closing tag: no body
			# text is collected after an <h3>.
			$contenu = 1 unless ($ligne =~ m/<\/h3/);
		}
		else{
			$concat .= $ligne;
		}
	}

	# Heading opened and closed on the same line.
	if($ligne =~ m/<h\d.*<\/h\d>/){
		$article5{$i} = $ligne;
		$article5{$i} =~ s/<[^>]*>//g;
		$i++;
		$contenu = 1 unless ($ligne =~ m/<h3/);
	}

	# Heading opened but not closed on this line: start accumulating it.
	if(($ligne =~ m/<h\d/) && ($ligne !~ m/<\/h\d/)){
		$concat = $ligne;
		$check=1;
	}
}

# The input may end while an article is still being collected (no closing
# </div> and no further heading); store that text instead of losing it.
if (($contenu == 1) && ($go ne "")){
	$article5{$i} = $go;
	$article5{$i} =~ s/<[^>]*>//g;
	$go="";
	$i++;
	$contenu = 0;
}


####################
## CHINESE
####################
# Fills %article6 from FILEIN6: each slot holds, in document order, either the
# text of a heading or the body text that followed it, with markup removed.
my %article6=();
$i=0;
$contenu = 0;
$go="";    # reuse the file-level accumulator rather than redeclaring it

while (my $ligne = <FILEIN6>){
	chomp $ligne;
	if ($contenu == 1){
		# While collecting a body, only a heading tag immediately followed
		# by text counts as the start of the next heading.
		# NOTE(review): the heading test further down accepts any <hN>, so
		# a heading whose tag is directly followed by another tag is
		# recorded as a heading without closing the article being
		# collected - confirm this matches the input markup.
		my $titre_suivant = ($ligne =~ m/<h\d>[^<]+/);
		if (!$titre_suivant){
			$go .= $ligne;
		}
		if ($titre_suivant || ($ligne =~ m/<\/div>/)){
			$article6{$i} = $go;
			$article6{$i} =~ s/<[^>]*>//g;
			$go="";
			$i++;
			$contenu = 0;
		}
	}
	if($ligne =~ m/<h\d>/){
		$article6{$i} = $ligne;
		$article6{$i} =~ s/<[^>]*>//g;
		$i++;
		# Body text is collected after every heading except <h3>.
		unless ($ligne =~ m/<h3>/){
			$contenu = 1;
		}
	}
}

# The input may end while an article is still being collected (no closing
# </div> and no further heading); store that text instead of losing it.
if (($contenu == 1) && ($go ne "")){
	$article6{$i} = $go;
	$article6{$i} =~ s/<[^>]*>//g;
	$go="";
	$i++;
	$contenu = 0;
}




# Write the alignment table: one row per slot index, one column per language
# (French, English, Japanese, Bulgarian, Arabic, Chinese), then close the
# document and every file handle.
my @colonnes = (\%article, \%article2, \%article3, \%article4, \%article5, \%article6);

# Rows cover every slot index present in at least one language, so a language
# with more entries than the others is not truncated.
my %index_connus = ();
for my $colonne (@colonnes){
	$index_connus{$_} = 1 for (keys %$colonne);
}

for my $key (sort {$a <=> $b} keys %index_connus){
	my @cellules = ();
	for my $colonne (@colonnes){
		# A language with no entry for this slot gets an empty cell.
		my $valeur = defined($colonne->{$key}) ? $colonne->{$key} : "";
		push @cellules, "<td style=\"border: solid black 1px; width:16%;\">$valeur</td>";
	}
	print FILEOUT "<tr style=\"border: solid black 1px;\">" . join("\n", @cellules) . "</tr>\n";
}

print FILEOUT "</table></body></html>";

close (FILEIN); close (FILEIN2); close (FILEIN3); close (FILEIN4); close (FILEIN5); close (FILEIN6);
# Buffered write errors only surface when the output handle is closed.
close (FILEOUT) or die "Cannot close ./alignement.html: $!";
