#!/usr/bin/perl
#--------------------------------------------------------------
use English;
#--------------------------------------------------------------
#
# This is webxref038 updated January 2001. It will work with Perl5
# without any configuring.
#
# ==============================================================
#  W E B X R E F
# ==============================================================
# Webxref is a WWW link checker and cross referencing tool, intended
# to check a local set of HTML documents for missing files,
# anchors etc, and to display information on the headers, links and all
# elements of the web documents found.
# Webxref compiles a list of HTML documents, URLs, name anchors, 
# images etc and the html files that reference those, i.e. a 
# cross-reference list. Hence the name: webxref.
# 
# Written July 1995 by Rick Jansen at SARA as part of the SURFACE 
# project (SURFnet Advanced Communication Environment)
# Updated February-March 1997 by Rick Jansen as part of the zoutmijn
# project.
# email:  rja@euronet.nl
# url:    http://www.sara.nl/Rick.Jansen
#
# New versions can be obtained from:
#   http://www.sara.nl/Rick.Jansen/Web/
#
# Updated April 2000 by Calin MOSUT as part of the ENS/France Telecom project.
# New versions can be obtained from:
#   cmosut@personal.ro
#
# Updated January 2001 by Serge Fleury as part of the ENS/France Telecom project.
# New versions can be obtained from:
#   fleury@msh-paris.fr
# 
# Usage: see PrintHelp
#
# Lists are made of:
# - html files
# - directories
# - binary files (images)
# - named anchors
# - mailto's
# - news
# - ftp
# - telnet
# - gopher
# - external URLs
# - cgi-bin scripts
# - file:'s
# - files that can't be found
# - files that are not world readable
# - directories that can't be found
# - name anchors that can't be found
# - files and directories in your website never actually used
# - files whose content matched the find parameter
# - files in which text was search-replaced
# - files older/younger than a certain date/time
# - http:// ok references
# - http:// failed references
#
# Modification history:
# 11-JUL-95 lcheck version 0.0.0
# 18-JUL-95 renamed webxref 0.0.1
# 20-JUL-95 webxref 0.0.2
# 21-JUL-95 webxref 0.0.3 root handling
# 27-JUL-95 webxref 0.0.4 metachar interpretation in substitutions fixed
# 28-JUL-95 webxref 0.0.5 pass associative array to sub
# 08-AUG-95 webxref 0.0.6 parsing with temp file
# 08-AUG-95 webxref 0.0.7 handle Welcome/welcome/index.html in case of dir
# 08-AUG-95 webxref 0.0.8 'file:' refs
# 10-AUG-95 webxref 0.0.9 Extensible DefaultFiles
# 14-AUG-95 webxref 0.1.0 Some perl lint removed, cgi-bin added
# 28-SEP-95 webxref 0.1.1 1-level external URL checking added
# 04-OCT-95 webxref 0.1.2 options -nohttp -htmlonly and -avoid added
# 04-OCT-95 webxref 0.1.3 Restriction on tags not being allowed to spread
#                         over more than 1 source line removed, thanks to
#                         Hans Hoppe (hopha@sci.kun.nl)
# 10-OCT-95 webxref 0.1.4 -silent option
# 15-APR-96 webxref 0.1.5 Temporary fix for $SOCK_STREAM
# 22-FEB-97 webxref 0.2.0 Updated internals
# 23-FEB-97 webxref 0.2.0 added -root, -noxref and -islocal
# 26-FEB-97 webxref 0.2.0 added TryExtensions, try html, htm in case of
#                         missing extension
# 01-MAR-97 webxref 0.2.1 speedup by testing files before recursing
# 01-MAR-97 webxref 0.2.2 optional testing for fluff (never referenced files)
# 02-MAR-97 webxref 0.2.3 -depth, -find and -replace 
# 02-MAR-97 webxref 0.2.4 -before -after -date -time
# 02-MAR-97 webxref 0.2.5 -one, frames
# 03-MAR-97 webxref 0.2.6 switched to less verbose output
# 03-MAR-97 webxref 0.2.6 on interrupt print output so far and exit
# 03-MAR-97 webxref 0.2.6 -long/-brief -errors -verbose 
# 04-MAR-97 webxref 0.2.7 interrupt handling, extended -help
# 04-MAR-97 webxref 0.2.8 fixed bug in ../.. constructions
# 05-MAR-97 webxref 0.2.9 area href and another ../ bug
# 05-MAR-97 webxref 0.3.0 delay in http checking
# 07-MAR-97 webxref 0.3.1 two roots from now: server root and site root
# 07-MAR-97 webxref 0.3.2 -html
# 12-MAR-97 webxref 0.3.3 -onexref -intermediair
# 13-MAR-97 webxref 0.3.4 garbage collection while printing (undef %list)
# 13-MAR-97 webxref 0.3.5 bug fixed in fluff detector and -avoid, -noint
# 12-APR-00 webxref 0.3.6 -rep -at added; -root, -files  reconsidered 
# 23-SEP-00 webxref 0.3.7 corpus XML, stat
# ===================================================================
# DISCLAIMER: the author cannot be held responsible for any damage
# resulting from using the edit- or any other functions of webxref
# or indeed any software, hardware, chemical substance, imagined
# or real (or imagined to be real) effects or by-effects of anything,
# at all, whatsoever. I didn't do it. Honestly.
# ===================================================================


#                              MAIN                                 #


#===================================================================#
#                       CONFIGURABLE THINGS:                        #
#===================================================================#

# Index documents tried, in this order, when a reference turns out to be
# a directory (like ../..) rather than a file.
@DefaultFiles = qw(
    index.html     index.htm
    welcome.html   Welcome.html
    welcome.htm    Welcome.htm
    Index.html     index.shtml
    sommaire.html  sommaire.htm
    Sommaire.html  Sommaire.htm
    README.html
);

# File extensions webxref knows about (html, htm, shtml).
@Extensions = qw(html htm shtml);

# At most 50 +/- progress marks per output line.
$MaxDots = 50;

#-------------------------------------------------------------------
# Defaults for the parameters; most can be overridden on the command
# line (see GetParams).

# What is reported, and in which form
$ReportFiles   = 1;         # Report the elements of the local html files
$Spell         = 1;         # Check html syntax to avoid parse errors
$LongReport    = 1;         # Long report unless -brief is given
$HTMLReport    = 0;         # A .txt report unless -html is given
$DelHTMLReport = 0;         # Keep the html report unless -del is given
$FullPath      = 0;         # Print full file paths if true
$Xref          = 1;         # Generate cross references

# What is inspected
$Do_External_URLs = 0;      # External URLs are not checked unless -http
$HTML_only = 0;             # 0: all referenced files are checked for links
$Avoid     = "";            # Regexp to avoid certain URLs, files,...
$MaxDepth  = 99999999;      # Max nesting level
$Fluff     = 0;             # Never-referenced files are not looked for
                            # unless -fluff is given
$HTTPDelay = 1;             # Seconds between external link checks
$Time      = '000000';      # Time of day used with -before/-after

# How chatty webxref is while working
$Silent  = 1;               # Only error msgs will be printed
$Verbose = 0;               # As little output as possible
$Dots    = 1;               # Print a + for every file checked (- if failed)
$Errors  = 0;               # Print error messages only

$NoInterrupt = 0;           # 0: webxref is interruptable

#====================================================================

$debug = 0;  # Bugs? haha! hm.
&GetParams;
die "No input file(s).\n(try webxref -help)\n" unless @ARGV;
# Assume webxref is called in the document root directory
$CWD=&GetCWD;

# Serial treatment: every remaining argument is analysed as a site of its
# own, one after the other.
# NOTE: the loop variable deliberately stays $_ (an alias into @ARGV);
# the argument is rewritten in place below.
foreach (@ARGV) {
    next if ($_ eq""||$_=~/^-/);

    # Referer recorded for everything that comes straight from the command
    # line. Set before first use so a missing input file is listed with it.
    $WebxrefReferer = '--webxref--';

    $_=&shell_path($_);
    $InFile=$_;
    if (! -e $InFile) {
        &AddedToList(*LostFileList,$InFile,$WebxrefReferer);
        print "Cannot find file $InFile\n";
        next;
    }

    # The site root is the current directory, unless the argument is a full
    # path: then it is the directory part of that path and $_ keeps only
    # the last path component.
    $SiteRoot=$CWD;
    if ($_ =~ m#^((?:\w:)?/(?:.+/)*)[^/]+/?$#) {
        $SiteRoot=$1;
        # \Q..\E: the directory is literal text, not a regexp
        s/^\Q$SiteRoot\E//;
    }

    # Sanity checks on the site root. Each check is an explicit statement so
    # that all three really run.
    if (! -e $SiteRoot) {
        print "    The site directory $SiteRoot does not exist!\n";
        next;
    }
    if (! -d $SiteRoot) {
        print "    \"$SiteRoot\" is not a directory!\n";
        next;
    }
    if (! -r $SiteRoot) {
        print "  Cannot access directory \"$SiteRoot\"!\n";
        next;
    }

    # writes the results file
    &WriteResFiles;

    # If interrupted print output so far
    # NOTE: This is unreliable if webxref was interrupted
    # asynchronously. The C-library is not re-entrant, so
    # if printing was in progress printing may well fail
    # due to malloc running into trouble. Oh well. It does
    # work sometimes.
    $SIG{INT} = 'InterruptHandler' if (! $NoInterrupt);

    # Expression to replace root filepath when printing
    $SiteRootExpr = $SiteRoot;
    if ($SiteRootExpr !~ m#/$#) { $SiteRootExpr .= '/'; }
    $SiteRootExpr =~ s/(\W)/\\$1/g;  # escape regexp chars

    # The depth limit is relative to the depth of the site root.
    # NOTE(review): $MaxDepth is increased on every iteration, so with
    # several sites the root depths add up - confirm whether that is
    # intended.
    ($d,$f,$a,$RootDepth) = &SplitFile($SiteRoot);
    # print "\nSiteRoot=$SiteRoot, \nd=$d \nf=$f \na=$a \ndep=$RootDepth\n";
    $MaxDepth = $MaxDepth + $RootDepth;
    #print "Maxdepth=$MaxDepth\n";

    print RES "<pre>\n" if ($HTMLReport);

    &GetFluffFiles($SiteRoot) if ($Fluff);
    $InFile = $SiteRoot . $_;
    print "\nChecking $InFile...\n\n";
    print "Dissecting files...\n" if ($ReportFiles);
    &GetReferences($InFile,"--webxref--");
    &UpdateARGV;

    # See if there are any never-referenced files
    &PickFluff if ($Fluff);

    print RES "</pre>\n" if ($HTMLReport);
    &PrintLists;

    $DotCount = 0;
    if ($Do_External_URLs) {
        print "External HTTP checking starts...\n" if ($Dots);

        print RES "\n\n";
        print RES "<p>" if ($HTMLReport);

        # Check external URLs
        if (!$Silent) {
            print <<"E_O_T";

- - - - - - - - - - - - - - - - - - - - - - - - - - -
Going to really check external URLs via the network.
This may take some time. Simply abort webxref if you
are out of patience.
- - - - - - - - - - - - - - - - - - - - - - - - - - -
E_O_T

        }
        &InitStatusMessages;
        &Check_External_URLs;
    }
    &PrintHTTPLists;

    # Finish and close the report files, then derive the corpus/XML reports
    print RES "</body></html>\n" if ($HTMLReport);
    close RES;
    close TRA;
    close TRASH ;
    &WriteCorpusAndXMLReports ;
    close CORPXML;
    close CORPTXT;
    &DeleteHTMLReports if ($DelHTMLReport);
    print "\n" if ($HTTP);
    print "All done. See $path_to_results\n" ;
}

# End of the main program of webxref-038.pl; the subroutines follow.

#====================================================================

#                            ROUTINES                               #

#====================================================================

sub InterruptHandler {

 # -- InterruptHandler
 # Installed by the main program as the INT signal handler (unless
 # -noint was given). Prints the lists accumulated so far, finishes
 # and closes the report files, writes the corpus/XML reports and
 # exits. Never returns.

 $| = 0;
 print "\n\n** Interrupt!\n";
 &PrintLists;

 # An HTML report needs its closing tags before the files are closed
 if ($HTMLReport) {
   foreach my $report (\*RES, \*REP) {
     print $report "</body></html>";
   }
 }
 foreach my $handle (\*RES, \*REP, \*TRA) {
   close $handle;
 }

 &WriteCorpusAndXMLReports ;
 foreach my $handle (\*CORPXML, \*CORPTXT) {
   close $handle;
 }

 &DeleteHTMLReports if ($DelHTMLReport);
 exit;
}

#====================================================================

sub GetParams {

 # -- GetParams
 # Initialises the statistics globals, then consumes the leading options
 # (the arguments starting with '-') from @ARGV and stores them in the
 # global settings. What is left in @ARGV afterwards are the files/sites
 # to inspect. -files/-f ends option processing explicitly.
 # An unknown option prints the usage text and exits; -help/-h prints
 # the full help (PrintHelp exits).
 # No parameters, no meaningful return value.

 print "Getting parameters...\n";

 # Output formats for the statistics
  $print_subform =
'@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<      @>>>>>>>>>>>>';
  $print_subform2 =
'@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<';

 # Bookkeeping hashes and counters start out empty/zero
 %seen = ();
 %elements=();
 %elements2=();
 %elements3=();
 %seenElt = ();
 $fileNumber = 0;
 $imgNb  = $imgExtNb  = $imgIntNb  = 0;
 $imgNb1 = $imgExtNb1 = $imgIntNb1 = 0;
 $linkNb = 0;
 $extHypertextLinkNb = $intHypertextLinkNb = 0;
 $intDocFileLinkNb = $intHtmlFileLinkNb = 0;
 $extCgiLinkNb = $extNewsLinkNb = $extFtpLinkNb = $extGopherLinkNb = 0;
 $extLinkNb = $intLinkNb = 0;
 $extMailNb = 0;
 $intAncNb = $extAncNb = 0;
 $eltNb = $txtEltNb = 0;
 $txtbrut="";
 $metaTitle = $metaContentType = $metaDescription = "-";
 $metaGenerator = $metaAuthor = $metaKeywords = "-";

 $InFile = "";

 # Options that take a value fetch it with NextArg, which shifts @ARGV
 # once; the shift at the bottom of the loop then removes the value.
 while ($ARGV[0] =~ /^-/) {
   my $opt = $ARGV[0];

   if    ($opt eq "-help" || $opt eq "-h")      { &PrintHelp; }

   elsif ($opt eq "-noxref")                    { $Xref = 0; }
   elsif ($opt eq "-xref" || $opt eq "-x")      { $Xref = 1; }
   elsif ($opt eq "-onexref")                   { $OneXref = 1; }
   elsif ($opt eq "-htmlonly")                  { $HTML_only = 1; }

   elsif ($opt eq "-http")                      { $Do_External_URLs = 1; }
   elsif ($opt eq "-delay")                     { $HTTPDelay = &NextArg; }

   elsif ($opt eq "-nofluff")                   { $Fluff = 0; }
   elsif ($opt eq "-fluff")                     { $Fluff = 1; }

   elsif ($opt eq "-silent" || $opt eq "-s")    { $Silent = 1; $Dots = 0; }
   elsif ($opt eq "-verbose" || $opt eq "-v")   { $Verbose = 1; $Silent = 0;
                                                  $Dots = 0; $Errors = 1; }

   elsif ($opt eq "-noint")                     { $NoInterrupt = 1; }
   elsif ($opt eq "-spell")                     { $Spell = 1; }

   elsif ($opt eq "-errors" || $opt eq "-error"
                            || $opt eq "-e")    { $Errors = 1; }

   elsif ($opt eq "-brief")                     { $LongReport=0; }
   elsif ($opt eq "-html")                      { $HTMLReport=1; }
   elsif ($opt eq "-del")                       { $DelHTMLReport=1; }
   # -rep is listed in the usage text as the counterpart of -norep
   elsif ($opt eq "-rep")                       { $ReportFiles = 1; }
   elsif ($opt eq "-norep")                     { $ReportFiles = 0; }
   elsif ($opt eq "-files" || $opt eq "-f")     { shift @ARGV; last; }

   elsif ($opt eq "-islocal")                   { $IsLocal = &NextArg; }
   elsif ($opt eq "-avoid" || $opt eq "-a")     { $Avoid = &NextArg; }

   elsif ($opt eq "-at")                        { $AskedPath = &NextArg;
                                                  $AskedPath = &shell_path($AskedPath); }

   elsif ($opt eq "-fullpath")                  { $FullPath=1;}
   elsif ($opt eq "-depth")                     { $MaxDepth = &NextArg; }
   elsif ($opt eq "-one" || $opt eq "-1")       { $One = 1; }

   elsif ($opt eq "-date")                      { $Date = &NextArg; }
   elsif ($opt eq "-time")                      { $Time = &NextArg; }
   elsif ($opt eq "-before")                    { $Before = 1; }
   elsif ($opt eq "-after")                     { $After = 1; }

   # Fixed text to find/replace: kept as given in $OrgFindExpr and
   # turned into a literal-matching regexp in $FindExpr
   elsif ($opt eq "-find" || $opt eq "-replace") {
                                                  $FindExpr = &NextArg;
                                                  $OrgFindExpr = $FindExpr;
                                                  # escape regexp chars
                                                  $FindExpr =~ s/(\W)/\\$1/g;
                                                }
   # Regexp to find/replace: used as given
   elsif ($opt eq "-findexpr" || $opt eq "-replaceexpr") {
                                                  $FindExpr = &NextArg;
                                                  $OrgFindExpr = $FindExpr;
                                                }
   elsif ($opt eq "-by")                        { $Replacement = &NextArg; }

   elsif ($opt eq "-intermediair")              { $Intermediair = 1; }

   else {&PrintUsage; exit;}

   shift @ARGV;
 }

 if ($After || $Before) {
   die "You cannot specify both -before AND -after\n" if ($Before && $After);
   # Anchored: the whole value has to be 4 or 6 digits
   die "-time format must be hhmmss or hhmm"
   unless ($Time =~ /^\d\d\d\d(?:\d\d)?$/);
   die "-date format must be yymmdd or yymm"
   unless ($Date =~ /^\d\d\d\d(?:\d\d)?$/);

   $Before = 0 if ($After);  # We don't use $After
   $TimeStamp = &ConvertTimeStamp($Date,$Time);
 }

 if ($IsLocal) {
   $IsLocal =~ s#^http://##;   # Delete http://
   # NOTE(review): this only removes slashes at the very start of the
   # string; it does not delete a port number. Confirm the intent.
   $IsLocal =~ s#/*##;
   $IsLocal =~ s/(\W)/\\$1/g;  # escape regexp chars
 }
}  # GetParams

#====================================================================

sub NextArg {

  # -- NextArg
  # Removes the first element of @ARGV (the option being processed) and
  # returns the element that is first now: the value of that option.
  # The value itself stays in @ARGV.

  shift(@ARGV);
  my $value = $ARGV[0];
  return $value;

}  # NextArg

#====================================================================

sub PrintUsage {

 # -- PrintUsage
 # Prints the short usage summary (call forms and the list of options).
 # No parameters.

 my $usage = <<'E_O_T';

Usage:          webxref -options file.html || webxref -options site/
		    then navigate to "working_directory/res[](site)/"

                webxref -options site1 site2
		    then see "res[](site1)" and "res[](site2)"

                webxref -at "path" file.html
		    then navigate to "path/res[](site)/"


Options:       -help/-h -noxref -xref/-x -onexref -fluff -htmlonly 
               -rep -norep  -at -http -delay seconds
	         -silent/-s -verbose/-v -errors/-e -noint
	         -spell -html -del -brief -fullpath
	         -islocal <address> -avoid/-a <regexp>
	         -one/-1 -depth <depth>
	         -date <yymmdd> -time <hhmmss> -before -after
	         -find <string> -findexpr <regexp>>
	         -replace <string> -replaceexpr <regexp> -by <string/expr>


Configuring:   see the MAIN section

E_O_T

 print $usage;
}

#====================================================================

sub PrintHelp {

# -- PrintHelp
# Prints the usage summary followed by the full help text, then exits.
# No parameters. Never returns.

&PrintUsage;

# The here-document is single-quoted: the help text is literal, so the
# backslash sequences shown in it (\n, \.) are printed as written.
print <<'E_O_T';
=========================================
Which parameters to use for what purpose:
=========================================

Webxref checks the given file and follows the links in that file. While
working it lets you know it's alive by printing to STDOUT verbose messages.
It also prints in the report file a '+' for each file checked ok, and a '-'
for each file with a problem.

Default webxref gives for each file found on your local disks a report
on its headers,tag elements(with attributes-values) and links. After parsing
it as a string,the routine DissectFile will display data from the HTML syntax
tree. The routine was inspired by the htmlscript "dissectsite.hts" found at
"http://worldwidemart.com/scripts/htmlscript/dissect/". Specify -norep to
discharge it or see the main section for configuring.
  
A webxref run can take some time. You can, however, interrupt webxref with
ctrl-c (Unix). Webxref will report only the files it has inspected up to that
moment and exit. (*New!*)(Note: this is not reliable! webxref is not interruptable
at any time, due to the C-libraries not being re-entrant. (This probably does not
interest you at all, but it's not the author's fault.)) Specify -noint if you don't
want webxref to try and generate output after an interrupt.

When the whole site has been searched, all links have been inspected and
all its .html, .htm files found have been dissected, webxref prints a report.
Actual default is a long report in .txt form. The option "-html", lets you
change the form default. The option "-at", allows you to choose a directory
on your disk to put the results in. See also the examples.

If you want more information while webxref is working specify -verbose to get
messages on every file or -errors to see only files with problems. With -silent
webxref prints few messages while working.

Webxref keeps track of which html-documents are being linked to from other
documents. This is called cross-referencing, hence webxref's name. If you are
not interested in this, specify -noxref, so you won't be told where things have
failed and probably have to run webxref again. If you're just interested in one
location where a file is referenced specify -onexref. This saves memory too.

If you need to know if there are files and/or directories in your site that
are not referenced at all by any pages in your site specify -fluff. 

If you want to only inspect files that really have the .html or .htm extension
specify -htmlonly

If you specify -fullpath, you'll get the full paths for files. Default, the file
names will be abbreviated: /u/people/rick/www/a.html is printed as "a.html"
(webxref is called from ~/rick/www).

If you use full URLs in your site referring to your own site, say "www.sara.nl" is
your www-address and you use links like <a href=www.sara.nl/rick/index.html> then
tell webxref that "www.sara.nl" actually can be found on the local machine with:
-islocal 'www.sara.nl'

If you want to avoid certain files use the -avoid parameter to specify which
files to avoid. 

If you want to limit the number of files webxref inspects you may want to limit
the scan to 1 or 2 directories deep in the file system. If you specify -depth 0
only files in the current directory are inspected.

If you just want to check if links in a file are valid specify -one (or -1). Only
the links present in the file are tested, but no more. Use this with -files
to specify a collection of files to just check those files.

Specify -http if you want webxref  to check if the http:// links work. After all
local files are inspected. This may be time-consuming. To avoid overloading
a webserver there is a delay of 1 second between checks. If you want longer
or shorter delays specify the number of seconds with -delay. (Longer delays may
be necessary if a lot of links refer to the same webserver.)

To see if you have files or directories that were modified last before or after
a certain date/time use: -before/-after -date yymmdd -time hhmmss. If -before
is given files are reported that were modified before the date given, with -after
files last modified after the date given are reported.

Default, simply list the files or directories at the end of the command. To tell
webxref which files to inspect use -files or -f. Webxref generates different
results directories only if the files given as arguments are from different sites.

Webxref can search and even search-replace text, see later.

=======================
What the parameters do:
=======================
While checking webxref prints messages to STDOUT according to:
 -silent/-s	Few messages, list problems at the end of the run.
-verbose/-v	Print information while checking files.
-errors/-e	Print errors when they occur, even when -silent.

Webxref generates a report according to:

-noint	    Do not generate output on interrupt
-norep      The routine elements is discharged
-spell     Checks html files for syntax errors
-brief      List just problems.
-xref/-x    List which files reference files (cross-references) (default).
-noxref	    Do not list which files reference files.
-html	    Print report in .html form.
-del        Deletes HtmlReportFiles
-at         Lets you choose a directory to put the results in

Webxref inspects files/directories according to:

-fluff		List which files/directories are never used.
-htmlonly	Only inspect files with the .html/.htm extension.

-fullpath	Print full-length filenames.
-islocal url	'www.mymachine.nl' is actually a local file reference.
-avoid regexp	Avoid files with names matching regexp for inspection.
-depth number	The maximum directory nesting level.
		0 means: current directory only,
		1 means: directories from the current directory.
		100 probably means there is no restriction in
		how deep webxref is allowed to find files.
-one/-1		Specify -one if you just want to check the links
		from the given file(s) and no further link following.
-http		Check external URLs via the network.
-delay seconds	Wait the specified number of seconds between HTTP checks
-date -time	Date [yymm<dd>], time [hhmm<ss>].
-before -after	List files that are modified before or after
		the date/time given with -date and -time.

=================
Find/replacement: ** EXPERT ONLY **
=================

Webxref can scan your site for files containing certain text. To find fixed
text use -find. To find text using e.g. wildcards use -findexpr. The Perl
expression is matched with the text of the file under test. Take care to not
have the shell interpret '*' and '/' by using appropriate quoting. Search is
always case-insensitive.  Webxref does search/replace beyond end-of-line. I.e.
newlines are matched, and can even be inserted (use \n).

To replace text with something else use -replace and -replaceexpr and -by. The
string or expression you specify with -replace or -replaceexpr is replaced by
the string you specify with -by. In case of editing, a backup file with a random
numeric extension is placed next to the resulting file. E.g. when index.html is
edited there'll be a file "index.html.1234" or something similar.(DISCLAIMER:
the author cannot be held responsible for any damage resulting from using the
edit- or any other functions of webxref or indeed any software, hardware, chemical
substance, imagined or real (or imagined to be real) effects or by-effects of
anything, at all, whatsoever.)

-find string	report files containing the given string
-findexpr regexp	report files containing the given expression
-replace string	*REPLACE* string by the string given with -by
-replaceexpr regexp *REPLACE* regexpr by the string given with -by
-by string		replacement string (or regexp)
-nobackup		Not implemented on purpose.

========
Examples
========

webxref file.htm(l) or webxref site/
        Lists every file encountered in directories, reports problems,
        dissects .html, .htm and writes the list of the reports in
        "/res(site)[]/analysis_results.html[txt]".
webxref site1/ site2/
        Analyse site(directory)1 then site(directory)2
webxref -at path file.htm(l)
        Lets you choose a directory on you disk where to put the results
webxref -norep file.html
        lists files encountered in directories and reports problems 
webxref -html index.html
        lets you get the reports in .html form
webxref -one index.html
	just check the links in index.html, don't follow the links
webxref -one *.html
	Check only the links in the html-files in the current dir.
webxref -depth 0 index.html
	Check index.html, but don't check files in directories
	that are deeper in the file system. 
webxref -http file.html
	Checks file.html and external URLs
webxref -htmlonly file.html
	Checks file.html, but only files with the .html/htm extension
webxref -avoid '.*Archive.*' file.html
	Checks file.html but avoids files with names containing
	'Archive'
webxref -avoid '.*Archive.*|.*Distribution.*' file.html
	Same as above, but also files with names containing
	'Distribution'
webxref -islocal www.sara.nl
	Treat things like '<a href=http://www.sara.nl/rick' as a 
	local reference, as if it would have been '<a href=/rick'

webxref -fluff index.html
	Checks index.html and reports files in the directories 
	encountered that were not referenced by index.html or any 
	file linked to from there.
webxref -silent index.html
	Just report problems at the end of the run. This may take
	a while with a big website.
webxref -silent -errors index.html
	Prints only problems while scanning, and the final report.
webxref -verbose index.html
	Prints a message for every file under test.
webxref -brief -silent index.html
	Does not print messages while scanning, and generates a
	short report, i.e. lists just problems.
webxref -before -date 970823 -time 1200 index.html
	Reports files last modified before August 23rd 1997
webxref -find 'me.gif' index.html
	Reports a list of pages containing the text 'me.gif'
webxref -findexpr '<img .*\.gif' index.html
	Reports files containing links to gif files.
webxref -replace 'me' -by 'you' -one index.html
	Replace 'me' by 'you' in index.html one-ly.
  
E_O_T

exit;
}

#====================================================================

sub GetCWD {

  # -- GetCWD
  # Returns the current working directory, always terminated by a slash.
  # No parameters.
  # Uses getcwd from Cwd.pm instead of the `pwd` command (change by
  # Calin MOSUT: `pwd` could not reach the mount point on a Windows
  # machine).

  use Cwd;

  my $here = getcwd;
  $here =~ s/\n//g;                       # no newlines in the path
  $here .= '/' unless ($here =~ m{/\z});  # guarantee the trailing slash

  return $here;

}  # GetCWD

#====================================================================

sub SplitFile {

  # -- SplitFile($filename)
  # Split filename into base directory, filename, anchor and depth
  # Returns (dir,file,anchor,depth)
  #   dir     directory part, ending in a slash, or "" if there is none;
  #           "name/../" pairs are collapsed
  #   file    last path component without the anchor
  #   anchor  "#name" if the component has a non-empty anchor
  #   depth   number of path components (as produced by split on '/')
  #           minus 2; for a name ending in a slash: minus 1
  # A name ending in a slash is a directory reference and is returned
  # unchanged as dir, with empty file and anchor. "/" has depth 0.

  my ($file) = @_;

  # The recursion "level" of webxref is linked to the depth
  # we're currently operating at on the file system
  my @parts = split(/\//,$file);
  my $depth = scalar(@parts);

  # If ends with a slash it's a directory ref
  # If there's no slash it can still be a directory ref, but
  # we'll find that out later
  return ($file,'','',0)        if ($file eq "/");
  return ($file,'','',$depth-1) if ($file =~ m#/$#);

  my $filename = pop(@parts);
  $depth = $depth-2;

  my $anchor;
  ($filename,$anchor) = split(/#/,$filename);
  if ($anchor) { $anchor = '#' . $anchor; }

  # Directory part, with a trailing slash unless it is empty
  my $dir = join('/',@parts);
  if ($dir ne "" && $dir !~ /\/$/) { $dir .= '/'; }

  # Collapse name/../ constructions. The lookahead keeps "." and ".."
  # from being taken for the name: "../../" has to stay as it is.
  # What is removed always ends in a slash, so $dir still ends in a
  # slash (or is empty) afterwards.
  while ($dir =~ s#(?!\.\.?/)[^/]+/\.\./##) { ; }

  return ($dir,$filename,$anchor,$depth);

}  # SplitFile

#====================================================================

sub SplitURL {

  # -- SplitURL(url)
  # Split the parameter given as if it's an url.
  # Method: http, ftp, telnet, news, ... (lower case), 'CGI' for a
  # reference starting with /cgi-bin/, 'file' for everything else.
  # Returns (method,rest) where rest is the url without the method
  # prefix; for 'file' rest is the url itself (minus leading whitespace).

  # http://www.sara.nl:80/rick
  # http://www.sara.nl:80/cgi-bin/wibble
  # http://www.sara.nl:80/cgi-bin/wibble?param
  # http://www.sara.nl:80/cgi-bin/wibble?param&param2

  my ($URL) = @_;
  my ($method,$rest);

  $URL =~ s/^\s+//;  # Remove leading whitespace

  # http://  gopher://  news://  etc
  if ($URL =~ m#^(\w+)://(.*)#) {
    $method = lc($1);
    $rest = $2;
  }
  #  //something is http too
  elsif ($URL =~ m#^//(.*)#) {
    $method = 'http';
    $rest = $1;
  }

  # mailto:
  elsif ($URL =~ m#^(\w+):([^/].+)#) {
    # (a mount point like "c:/" does not match here, so it is
    # considered as a file)
    $method = lc($1);
    $rest = $2;
  }

  elsif ($URL =~ m#^/cgi-bin/(.+)#i) {
    $method = 'CGI';
    $rest = $1;
  }
  else {
    $method = 'file'; # Can be dir too, will find out later
    $rest = $URL;
  }

  return ($method,$rest);

}  # SplitURL

#====================================================================

sub AddedToList {

  # -- AddedToList(list,url,referer)
  # Record url in the hash passed as a typeglob (e.g. *HTTPList), together
  # with the file that refers to it. The hash value is the referer, or a
  # space separated list of referers when cross referencing ($Xref) is on.
  # Often AddedToList is just called to add a referer of the file.
  # Returns 1 if url was newly added
  # Returns 0 if url was already listed

  local(*list) = shift(@_);   # %list is now an alias of the caller's hash
  my ($url,$referer) = @_;
  # print "\nADD: $url,$referer\n";

  # Drive-letter paths (navigation under Windows, by Calin MOSUT) are
  # stored in file:/ form
  $url     = "file:/$url"     if ($url =~ m#^\w:/#);
  $referer = "file:/$referer" if ($referer =~ m#^\w:/#);

  # New
  unless ($list{$url}) {
    $list{$url} = "$referer";
    return 1;
  }

  # Already listed: only the referer is added
  $list{$url} .= " $referer" if ($Xref);
  return 0;
}  # AddedToList

#====================================================================

sub PrintDot {

  # -- PrintDot($Dot)
  # Prints a dot to the report file (RES) with autoflush on, so the
  # progress marks show up in the report while webxref is working.
  # (Actually: a '+' for an ok file, a '-' for files
  # with problems)
  # After $MaxDots marks a newline is printed and the count ($DotCount)
  # starts again.
  # Called for every file in certain modes.

  my ($Dot) = @_;

  # $| belongs to the currently selected output handle, so RES has to be
  # selected while autoflush is switched on and restored.
  my $PrevHandle = select(RES);
  my $Save = $|;  # Save autoflush
  $| = 1;

  print RES "$Dot";

  $DotCount++;
  if ($DotCount >= $MaxDots) {
    print RES "\n";
    $DotCount = 0;
  }

  $| = $Save;
  select($PrevHandle);

}  # PrintDot

#====================================================================
 
sub GetReferences {

  # -- GetReferences($link,$referer)
  # Get all references from the link(file) and check those recursively.
  # Link can be a file, or a ref in the form http:// etc
  #
  # $link     the reference to classify and, if it is a local file, inspect
  # $referer  the file in which $link was found
  #
  # Every reference ends up in one of the global lists (FTPList, HTTPList,
  # CGIList, LostFileList, DirList, ImageFileList, FileList, ...) through
  # AddedToList, which returns 1 for a new entry and 0 for a known one.
  # The return value of GetReferences itself is not meaningful.

  # Note: the files referenced are kept as full filesystem paths
  # to those files. This is done to ensure that references to 
  # the file /u/user/file.html is the same as a reference "../file.html"

  local($link,$referer) = @_;
  # print "\nGetRefs:\nLink: $link\nReferer: $referer\n";
  # These vars are pushed onto the stack each recursive call
  # (they are dynamically scoped: DoDirectory, called below, reads and
  # changes $filename, $dir and $file and reads $referer)
  local($dir,$file,$anchor);
  local($Old_Dir);
  local($filename);

  # $method, $rest (and $depth, $cwd, ... further down) are plain globals
  ($method,$rest) = SplitURL($link);
  # print "GetReferences link: $link  method: $method\n";

  # Non-file, non-http methods are only recorded; nothing is inspected
  if    ($method eq 'http')   { ; }
  elsif ($method eq 'ftp')    { return unless &AddedToList(*FTPList, $link, $referer);   }
  elsif ($method eq 'telnet') { return unless &AddedToList(*TelnetList, $link, $referer);  }
  elsif ($method eq 'gopher') { return unless &AddedToList(*GopherList, $link, $referer);  }
  elsif ($method eq 'mailto') { return unless &AddedToList(*MailList, $link, $referer);    }
  elsif ($method eq 'news')   { return unless &AddedToList(*NewsList, $link, $referer);     }
  elsif ($method eq 'CGI')    { ; }
  elsif ($method eq 'file')   { ; }
  else  { print TRA "sub GetReferences warn:\n   Unknown method '$method' Link: '$link'\n\n"; }
 
  # Apply the -islocal thingie: try to map a http reference to
  # a local file on the machine specified with -islocal. 
  # If http check if it's a local reference after all (-islocal parameter!)
  if ($method eq 'http') {
    if (! $IsLocal) {
      return unless &AddedToList(*HTTPList, $link, $referer);
    }
    else {
      # NOTE(review): $IsLocal is interpolated into the pattern as-is, so
      # dots in a host name match any character.
      if ($link =~ m#^http://$IsLocal(/?.*)#i) {  # We have a match!
        $method = 'file';
        $link = $1;

        # Could be a cgi call now...
        ($method,$rest) = SplitURL($link);
      }
    }
  }


  # If CGI just try to check if the script is present
  if ($method eq 'CGI') {
    # Delete parameters of cgi script
    $link =~ s/\?.*//;
    if (-e "$SiteRoot$link") { 
      return unless &AddedToList(*CGIList, $link, $referer); 
    }
    else {
      return unless &AddedToList(*LostCGIList, $link, $referer);
    }

    return;
  } # cgi



  # Everything that is not a local file (this includes http references
  # that did not match -islocal) is done here
  return if ($method ne 'file');



  # Apparently what we have ended up with at this point
  # is a reference to a file of some sort. This "file" 
  # can also still be a directory. It can also be a name 
  # anchor in the file.
  ($dir,$file,$anchor,$depth) = &SplitFile($link);
  #print "split: $dir $file\n";


  # Apply the regexp to avoid certain files
  # (/o: the pattern is compiled once, with the first value of $Avoid)
  if ($Avoid) {
    #print "Avoid: $Avoid  File: $file  Dir: ",&PrintFile($dir),"\n";
    if (&PrintFile($file) =~ m/$Avoid/o) {
      print "  Avoided file ",&PrintFile($file),"\n" if (!$Silent) || ($Errors);
      return;
    }
    if (&PrintFile($dir) =~ m/$Avoid/o) {
      print "  Avoided directory ",&PrintFile($dir),"\n" if (!$Silent) || ($Errors);
      return;
    }
  }


  $cwd = &GetCWD;
  if ($dir eq "") {
    $dir = $cwd;
  }

  # Move to the specified directory to obtain the expanded 
  # file path
  # NOTE(review): the result of chdir is not checked, and the early
  # returns further down leave the process in the new directory; only
  # the normal end of this sub changes back to $Old_Dir.
  if (-d $dir) {
    $Old_Dir = $cwd;  
    chdir($dir);
    $dir = &GetCWD;
    $filename = $dir . $file;

    # See which files are in this directory if we are checking 
    # for unreferenced files, if not done before
    &GetFluffFiles($dir) if ($Fluff && (! $FluffScannedDirList{$dir}));
  }
  else {
    # The directory part does not exist: report and record as lost file
    &PrintDot("-") if ($Dots);
    print "\n" if ($Errors && $Dots);
    print "  ", &PrintFile($dir.$file)," could not be found\n" 
      if (!$Silent) || ($Errors);
    print "    Referenced by: ",&PrintFile($referer),"\n" 
      if (!$Silent) || ($Errors);
    &AddedToList(*LostFileList,$dir.$file,$referer);
    return;
  }


  # Add to the list of already tested files if not inspected before
  # If the "file" is a directory try Welcome/welcome/index.html etc.
  # DoDirectory replaces $filename (and $dir, $file) by the default file
  # it found, or by a placeholder name that fails the -f test below.
  if (-d $filename) { 
    return unless &AddedToList(*DirList,$filename,$referer);

    &DoDirectory; 
  }


  # Not found?
  if (! -f $filename) {

    # If the file lacks an extension try the default
    # extensions html, htm etc.
    $SecondChance = &TryExtensions($filename);
    if (! $SecondChance) {
      &PrintDot("-") if ($Dots);
      print "\n" if ($Errors && $Dots);
      print "  ", &PrintFile($filename)," could not be found\n"
        if (!$Silent) || ($Errors);
      print "    Referenced by: ",&PrintFile($referer),"\n"
        if (!$Silent) || ($Errors);
  
      # Add to list of lost files
      &AddedToList(*LostFileList,$filename,$referer);
      return;
    }
    else {  # Apparently adding an extension did the trick
      $filename = $SecondChance;
    }
  }



  # World readable? (do not use -r, doesn't work for root)
  ($dev,$ino,$mode,$nlink,
  $uid,$gid,
  $rdev,$size,
  $atime,$mtime,$ctime,
  $blksize,$blocks) = stat($filename);
  $readmode = ($mode & 4);  # the "read by others" permission bit
  if ($readmode == 0) {
    # Not world readable, add to list
    &AddedToList(*UnreadableList,$filename,$referer);
  }

  # Check if we need to list this file
  &CheckTimeStamp($filename,$mtime) if ($Before || $After);

  # Binary file? (pictures,...)
  if (-B $filename) {
    &AddedToList(*ImageFileList,$filename,$referer);
    return;
  }


  if ($HTML_only) {
    # Filename *must* have extension .html or .htm, else we don't inspect it.
    if ($filename !~ /.*\.htm[l]*$/i) {return;}
  }

  # Apply the regexp to avoid certain files
  # (this time against the expanded path)
  if ($Avoid && ($filename =~ m/$Avoid/)) { 
    #print "** The above file is avoided.\n" if (!$Silent) || ($Errors);
    return;
  }


  # else it's a text (html)file
  # return if we already inspected it
  return unless &AddedToList(*FileList,$filename,$referer);


  # We now have a (html) file to check for further links, name anchors etc.

  # But not if we're just checking the One file
  if ($One) {
    #print "** ref eq INFILE\n" if ($filename eq $InFile);
    #print "** ref eq websrfe\n" if ($filename eq $WebxrefReferer);
    #print "RETURNED\n" if (($filename ne $InFile) && ($filename ne $WebxrefReferer));
    return if (($filename ne $InFile) && ($filename ne $WebxrefReferer));
  }

  print "Checking: ",&PrintFile($filename),"\n" if (!$Silent);
  #&PrintDot('+') if (($Dots) && (!$Silent));
  &PrintDot('+') if ($Dots);

  # DissectFile returns the references found in the file and records the
  # name anchors it defines in %AnchorList
  local(@newlist) = DissectFile($filename);


  #print "done DissectFile from ",&PrintFile($filename),":\n", join("\n",@newlist),"\n";

  # Now see if the anchor we were after was found
  # NOTE(review): a link that carries an anchor returns here, so the
  # references collected in @newlist are not followed on this visit.
  if ($anchor) { 
    if (!defined($AnchorList{"$filename$anchor"})) {
      return unless &AddedToList(*LostAnchorList,"$filename$anchor",$referer);
      print "   Anchor ",&PrintFile($anchor)," is 1NOT present in file ",
          &PrintFile($filename),"\n" if (!$Silent) || ($Errors);
      print "\n" if ($Errors && $Dots);
      &PrintDot('-') if ($Dots);
    }
    else {
      # Anchor found, add referer
      AddedToList(*AnchorList,"$filename$anchor",$referer);
    }
    return;
  }

  # Walk the list and check everything is there
  foreach $file (@newlist) {

    ($method,$rest) = SplitURL($file);
    if ($method eq 'file') { 
      if ($file =~ s#^/##) {  # root reference
        $Notlocal_file = "$SiteRoot$file";
      }
      else {  # Reference relative to directory
        $Notlocal_file = $dir . $file; 
      }
    }
    else {  # Not file but http, news etc
      $Notlocal_file = $file;
    }

    $Notlocal_ref_filename = $filename;

    # Don't go deeper than he wanted (ooh!)
    ($dummy,$dummy,$dummy,$depth) = SplitFile($Notlocal_file);
    #print "Depth of $Notlocal_file is $depth, Max Depth is $MaxDepth\n";
    #print "we skip\n" if ($depth > $MaxDepth);
    next if ($depth > $MaxDepth);

    # Prevent from recursing if there's an easy check...
    next if (&AlreadyChecked($Notlocal_file,$Notlocal_ref_filename));


    # $Notlocal_file and $Notlocal_ref_filename are globals; they are
    # copied into the callee's own $link/$referer before it recurses.
    &GetReferences($Notlocal_file, $Notlocal_ref_filename);
  } # foreach

  # Back to the directory we were in when this call started
  chdir($Old_Dir) if ($dir ne $Old_Dir);

} #sub GetReferences

#====================================================================

sub DissectFile {

  # -- DissectFile($filename)
  # Read the html file and extract all elements.
  # Returns the list of references (hrefs, image sources, frame/form/...
  # targets) found in the file, without the references to name anchors
  # inside the file itself. Returns an empty list if the file cannot be
  # opened.
  #
  # Side effects:
  # - name anchors defined in the file are added to %AnchorList
  # - local anchors that are referenced but not defined are added to
  #   %LostAnchorList and reported
  # - DoFind is run on the file content when $FindExpr is defined
  # - with $ReportFiles set, @headers, @scripts and @links are filled;
  #   they are dynamically scoped, presumably so that &elements can use
  #   them (elements is defined elsewhere -- TODO confirm)

  local($filename) = @_;
  local(@Tags,@headers,@scripts,%links,@links,%Newlist,@Newlist,%LocalAnchorsFound,%LocalAnchorsWanted);
  local($elements,$tag,$Link);

  # Three-argument open: the name is taken literally, whatever special
  # characters (leading '>', trailing '|', blanks) it may contain.
  unless (open(HTML, '<', $filename)) {
    print TRA "sub DissectFile warn:\n   Could not open file '$filename'\n\n";
    return; 
  }

  # Read the file into a big string and remove crud in between tags.
  # print "Opening $filename\n";  
  @Tags = <HTML>;
  close(HTML);
  $elements = join('',@Tags);
  # Note down files matching the find (expression)
  &DoFind($filename,$elements) if (defined($FindExpr));
  if ($ReportFiles) {
    if ($elements =~ m#<(TITLE)>(.+)</TITLE>#si) {
      push @headers,"$1=$2";
    }
    while ($elements =~ m#<SCRIPT.+?>(.+?)</SCRIPT>#sig) {
      push @scripts, "$1";
    }
    # Blank out the script bodies so their content is not parsed as tags
    $elements =~ s#(<SCRIPT.+?>).+?(</SCRIPT>)#$1 $2#sig;
    $elements = &spell($elements);
  }
  # Every element of @Tags now starts right after a '<'
  @Tags = split/</, $elements;
  for(@Tags) {
    s/\n/ /g;
    s/>.*//;  # Keep the tag itself, drop the text following it
    # print "tag: $_\n";
    if ($ReportFiles) {
      if (m#^(?:META|ISINDEX|STILE|BASE|LINK)#i) {
        # elements from the <HEAD>...</HEAD> section
        push @headers,"$_";
        # extract the headers
      }
      if (m#^(A|IMG|FRAME|AREA|FORM|BODY|BASE|LINK|SCRIPT|INPUT|APPLET|EMBED).+(HREF|SRC|ACTION|BACKGROUND|CODEBASE|URL)\s*=\s*"?([^"\s]*)"?#i){ 
        # link elements
        $links{"$1 $2 = $3"} = 1;   
        # extract the links
      }
    }
    # <a href/name
    if (/^A\s+/i) {
      #print "-anchor: $_\n";
      # -- a href
      if (m#HREF\s*=\s*"?([^"\s]*)"?#i) {
        $Link = $1;
        #print "  href: -$Link-\n";
        # Link to name anchor within current document? (<a href=#anchor>)
        if ($Link =~ m/^#/) {
          #print "  -$filename$Link- wanted\n";
          # Special case for Intermediair: do not generate error for "href=file.html#"
          # (empty name anchor)
          #print "  -$Link- wanted\n";
          if ($Intermediair) {
            next if ($Link eq "#");
          }
          $LocalAnchorsWanted{"$filename$Link"} = 1;
        }
        # Link to another document?  a href=file.html#anchor
        elsif ($Link =~ m/#/) {
          $Link =~ m/(.+)#(.+)/;
          # print "LINK: $Link  $1 $file - equal?\n";
          # $file is not set here; it is the dynamically scoped $file of
          # the calling GetReferences
          if ($1 eq $file) {  # Current file after all
            $LocalAnchorsWanted{("$filename" . '#' . "$2")} = 1;
          } 
          else {
            $Link =~ s/#.+$//;
            $Newlist{$Link} = 1;
          }
        }
        else {  # Just a file ref
          $Newlist{$Link} = 1;
        }
      }
      elsif (m#NAME\s*=\s*"?([^"\s]*)"?#i) {
        # -- a name=...
        $Link = $1;
        #print "  name: $Link\n";
        #print "  -$filename$Link- found\n";
        $LocalAnchorsFound{"$filename#$Link"} = 1;
      }
    }
    # <img src=...
    # NB: <img and src= must be on same line
    elsif (/^IMG/i) {
      if (m#SRC\s*=\s*"?([^"\s]*)"?#i) {
        $Link = $1;
        #print "  img: $Link\n";
        # Add file to the list
        $Newlist{$Link} = 1;
      }
      else { print TRA "sub DissectFile warn:\n   Image parse error in '$filename':\n  $_\n\n"; }
    }
    elsif (/^(?:FRAME|AREA|FORM|BODY|BASE|LINK|SCRIPT|INPUT|APPLET|EMBED)\s+/i) {
      if (m#(?:SRC|HREF|ACTION|BACKGROUND|CODEBASE)\s*=\s*"?([^"\s]*)"?#i) {
        $Link = $1;
        # Add file to the list
        $Newlist{$Link} = 1;
      }
    }
  } 
  # The link list is needed only once all tags have been seen, so it is
  # built here instead of once per tag.
  @links = keys (%links) if ($ReportFiles);
  &elements;
  # gets all components of the web document found

  #print "LocalAnchorsFound: \n",join("\n",keys(%LocalAnchorsFound)),"\n";
  #print "\nLocalAnchorsWanted: \n",join("\n",keys(%LocalAnchorsWanted)),"\n";


  # Add the local anchors found to the global list
  foreach $Anchor (keys(%LocalAnchorsFound)) {
    &AddedToList(*AnchorList,$Anchor,$filename);
  }

  # Check if the locally referenced anchors are there
  # and remove them from Newlist if they are

  foreach $Anchor (keys(%LocalAnchorsWanted)) {
    if (!defined($LocalAnchorsFound{$Anchor})) {
      &AddedToList(*LostAnchorList,$Anchor,$filename);
      print "\n" if ($Errors && $Dots);
      &PrintDot('-') if ($Dots);
      print "  Anchor ",&PrintFile($Anchor)," is 2NOT present in file ",
            &PrintFile($filename),"\n" if (!$Silent) || ($Errors);
    }
    else {
      $Newlist{$Anchor} = 0; # Remove from list
    }
  }
  # Only the entries still marked are handed back
  foreach (keys(%Newlist)) {
    if ($Newlist{$_}) { push(@Newlist, $_);}
  }
  # print "\nnewlist: \n",join("\n",@Newlist),"\n";

  return @Newlist;

}  # DissectFile

#====================================================================

sub AlreadyChecked {

  # -- AlreadyChecked($file,$referer) 
  # Return 1 if $file is already present in one of the result lists,
  # 0 if it is not. The lists are consulted in the order given below and
  # the search stops at the first one that knows $file. When cross
  # references are kept ($Xref) the referer is added to that list.

  local($file,$referer) = @_;
  local(*known);  # alias for the list being looked at

  foreach my $candidate (*FileList, *LostFileList, *UnreadableList,
                         *ImageFileList, *DirList, *DirNotFoundList,
                         *HTTPList, *CGIList, *LostCGIList,
                         *AnchorList, *LostAnchorList) {
    *known = $candidate;
    next unless ($known{$file});

    &AddedToList($candidate,$file,$referer) if $Xref;
    return 1;
  }

  return 0;

}  # AlreadyChecked

#====================================================================

sub TryExtensions {

  # -- TryExtensions(file)
  # Second chance for a $file that could not be located: if its name
  # has no extension, each of the default extensions in @Extensions
  # (html, htm, shtml etc.) is appended in turn.
  # Returns the filename plus extension for the first combination that
  # exists as a plain file, or "" if there is none or if the name
  # already has an extension.

  local($file) = @_;
  local($d,$f,$extension);

  ($d,$f) = &SplitFile($file);
  ($f,$extension) = split(/\./, $f);

  # The file does have an extension: nothing to try
  return "" if ($extension);

  foreach $extension (@Extensions) {
    my $candidate = "$file.$extension";
    return $candidate if (-f $candidate);
  }

  return "";  # Not found

}  # TryExtensions

#====================================================================

sub DoDirectory {

  # -- DoDirectory
  # Called by GetReferences when $filename turns out to be a directory.
  # Takes no arguments: $filename, $dir, $file and $referer are the
  # dynamically scoped variables of the calling GetReferences.
  # Looks for one of the @DefaultFiles (Welcome/welcome/index.html etc.)
  # in the directory. If one is there, moves to the directory and makes
  # $dir, $file and $filename refer to that file. If not, the problem is
  # reported, $filename is replaced by a placeholder name and that name
  # is recorded in %LostFileList.

  #print "xx $filename is a directory, trying Welcome/welcome/index.html.\n";

  $found = 0;
  foreach my $candidate (@DefaultFiles) {
    #print "Trying $candidate\n";
    next unless (-f ($filename . '/' . $candidate));

    $dirname = $filename;
    $file = $candidate;
    $found = 1;
    last;
  }

  unless ($found) {
    my $report = ((!$Silent) || ($Errors));

    print "\n" if ($Errors && $Dots);
    &PrintDot("-") if ($Dots);
    if ($report) {
      print "  No Welcome/welcome/index.html can be found in ",
            &PrintFile($filename),"\n";
      print "    Referenced by: ",&PrintFile($referer),"\n";
    }
    $filename = "$filename/index or welcome file";

    # Add to list of lost files
    if (defined($LostFileList{$filename})) {
      $LostFileList{$filename} .= " $referer";
    }
    else {
      $LostFileList{$filename} = $referer;
    }
    return;
  }

  # Move to the specified directory
  chdir($dirname);
  $dir = &GetCWD;
  $filename = $dir . $file;

  # See which files are in this directory if we are checking 
  # for unreferenced files, if not done before
  unless (! $Fluff || $FluffScannedDirList{$dir}) {
    &GetFluffFiles($dir);
  }

}  # DoDirectory

#====================================================================

sub GetFluffFiles {

  # -- GetFluffFiles($dir)
  # If we're checking for never referenced files a list
  # is maintained of all files in all directories we have
  # encountered. This list is checked against all files
  # referenced. Any files left over are considered fluff.
  # GetFluffFiles reads directory $dir and marks its plain files in
  # %FluffFileList and its subdirectories (with a trailing /) in
  # %FluffDirectoryList; names starting with a dot are skipped.
  # $dir is marked in %FluffScannedDirList, also when it could not be
  # read, so it is not tried again.
  # The directory entries are tested with their full path, so the result
  # does not depend on $dir being the current working directory.

  local($dir) = @_;
  $dir .= '/' if ($dir !~ m#/$#); # Add / if not there

  local(@Files);

  # Read dir contents
  #print "Fluff-dir: $dir\n";
  @Everything = ();
  if (opendir(DIR,"$dir")) {
    @Everything = readdir(DIR);
    closedir(DIR);
  }
  else {
    print TRA "sub GetFluffFiles warn:\n   Could not read directory '$dir'\n\n";
  }

  # Separate in list of files and list of dirs
  @Files = grep !/^\./, @Everything;  # No .-files
  @Directories = grep { -d "$dir$_" } @Files;
  @Files = grep { -f "$dir$_" } @Files;

  # Add them to the fluff-list
  foreach (@Files) {
    #print "f: $dir$_\n";
    $FluffFileList{"$dir$_"} = 1;
  }
  foreach (@Directories) {
    $_ .= '/' if (! m#/$#); # Add / if not there
    #print "d: $dir$_\n";
    $FluffDirectoryList{"$dir$_"} = 1;
  }

  # Mark this directory scanned
  #print "fsdl: $dir\n";
  $FluffScannedDirList{$dir} = 1;

}  # GetFluffFiles

#====================================================================

sub PickFluff {

  # -- PickFluff
  # Determine the files and directories that were never referenced but
  # that did reside in the directories we have scanned.
  # Actually, entries in the fluff lists are unmarked if we have
  # encountered them while checking; what is still marked afterwards is
  # appended to @FluffFiles and @FluffDirectories. Both arrays are
  # sorted, so the report does not depend on hash order.
  # %FluffFileList, %FluffDirectoryList and %FluffScannedDirList are
  # released when done.


  # ----- Files
  foreach (keys(%FileList), keys(%LostFileList), keys(%ImageFileList)) {
    $FluffFileList{$_} = 0;  # unmark
  }

  # The actual fluff files
  push(@FluffFiles, grep { $FluffFileList{$_} } keys(%FluffFileList));
  undef %FluffFileList;
  @FluffFiles = sort(@FluffFiles);


  # ----- Directories
  foreach (keys(%DirList), keys(%LostDirList), keys(%FluffScannedDirList)) {
    $FluffDirectoryList{$_} = 0;  # unmark
  }

  # The actual fluff directories
  push(@FluffDirectories,
       grep { $FluffDirectoryList{$_} } keys(%FluffDirectoryList));
  undef %FluffDirectoryList;
  undef %FluffScannedDirList;
  @FluffDirectories = sort(@FluffDirectories);

}  # PickFluff

#====================================================================

sub DoFind {

  # -- DoFind($filename,$html-file-as-string)
  # Match the text of the html-doc to the find expression $FindExpr
  # (case-insensitive; /o: compiled once with its first value).
  # A matching file is noted in %FindFiles. If a $Replacement is given
  # as well, every match is replaced and the file is rewritten through
  # WriteFile; the outcome is noted in %EditedOkList or %EditFailedList.
  # A file that already is in one of those lists is not edited again:
  # a second edit would overwrite the backup of the original text.

  local($filename,$content) = @_;
  #print "DOFIND: f=$filename\n\ncont:\n----$content\n----\nfind: $FindExpr\n";

  return unless (defined($FindExpr));
  return unless ($content =~ /$FindExpr/moi);

  $FindFiles{$filename} = 1;

  # EDIT!
  return unless (defined($Replacement));

  # Just in case we've been here before...
  # (%EditedFile is kept from the original check; nothing visible in
  # this part of the file fills it -- the edit lists are what gets set.)
  return if (defined($EditedFile{$filename})
          || defined($EditedOkList{$filename})
          || defined($EditFailedList{$filename}));

  $oldcontent = $content;
  $content =~ s/$FindExpr/$Replacement/gmoi;
  if (&WriteFile($filename,$oldcontent,$content)) {
    $EditedOkList{$filename} = 1;
  }
  else {
    $EditFailedList{$filename} = 1;
  }

}  # DoFind

#====================================================================

sub WriteFile {

  # -- WriteFile($filename,$oldcontent,$content)
  # Write $content to $filename, while making a backup of $oldcontent
  # in $filename.$$ (pid), e.g. a.html.12345
  # Returns 1 if both files were written completely, 0 otherwise.
  # $filename is only touched after the backup has been written and
  # closed without error, so a failing backup (full disk, no permission)
  # cannot cost the original text.

  my($filename,$oldcontent,$content) = @_;
  my($backupfile) = $filename . '.' . $$;

  # First make backup: $filename.$$ (pid) --> a.html.12345
  # Three-argument open: the names are taken literally.
  open(BAKFILE, '>', $backupfile) || return 0;
  unless (print BAKFILE $oldcontent) {
    close(BAKFILE);
    return 0;
  }
  # Buffered write errors show up at close
  close(BAKFILE) || return 0;

  # Write new edited version
  open(NEWFILE, '>', $filename) || return 0;
  unless (print NEWFILE $content) {
    close(NEWFILE);
    return 0;
  }
  close(NEWFILE) || return 0;

  return 1;

}  # WriteFile

#====================================================================

sub PrintFindHeader {

  # -- PrintFindHeader
  # Returns the header for the list of files that matched the find
  # expression; the expression as given ($OrgFindExpr) is appended in
  # double quotes when it is defined.

  my $quoted = defined($OrgFindExpr) ? "\"$OrgFindExpr\"" : '';

  return 'Files matching ' . $quoted;

}  # PrintFindHeader

#====================================================================

sub PrintEditHeader {

  # -- PrintEditHeader($ok)
  # Returns the header for one of the two search-and-replace lists:
  # the "successful" header if $ok is true, else the "failed" header.
  # The find expression ($OrgFindExpr) and the $Replacement are quoted
  # in the header when they are defined.

  my($ok) = @_;

  my @parts = ('Files where replacing ');

  push(@parts, "\"$OrgFindExpr\"")       if (defined($OrgFindExpr));
  push(@parts, "  by \"$Replacement\"")  if (defined($Replacement));
  push(@parts, $ok ? ' was succesful:' : ' FAILED:');

  return join('', @parts);

}  # PrintEditHeader
  
#====================================================================

# Time/date stuff


sub CheckTimeStamp {

  # -- CheckTimeStamp($filename,$mtime)
  # Notes $filename in %TimeList if its modification time $mtime meets
  # the date/time criterion: not later than $TimeStamp when $Before is
  # set, not earlier than $TimeStamp otherwise.

  my($filename,$mtime) = @_;

  my $selected = $Before ? ($mtime <= $TimeStamp)
                         : ($mtime >= $TimeStamp);

  $TimeList{$filename} = 1 if ($selected);

}  # CheckTimeStamp

#====================================================================

sub PrintTimeStamp {

  # -- PrintTimeStamp($mtime)
  # Returns $mtime (seconds since the epoch) as a local date/time
  # string:  yyyy/mm/dd at hh:mm:ss
  # localtime counts months from 0 and years from 1900; both are
  # corrected here, and every field is zero padded.

  my($mtime) = @_;

  my($sec,$min,$hour,$mday,$mon,$year) = localtime($mtime);

  return sprintf("%04d/%02d/%02d at %02d:%02d:%02d",
                 $year + 1900, $mon + 1, $mday, $hour, $min, $sec);

}  # PrintTimeStamp

#====================================================================

sub ConvertTimeStamp {

  # -- ConvertTimeStamp($Date,$Time)
  # Convert $Date and $Time given as parameters to seconds since the
  # epoch (local time).
  # Date: yymm(dd)   Time: hhmm(ss)
  # A missing day counts as 1, missing seconds as 0. If $Time is empty
  # or does not have the form hhmm(ss), midnight is used. Returns
  # nothing if $Date does not have the form yymm(dd).
  # The two-digit year is passed on as it is; Time::Local decides in
  # which century it lies.

  # Perl library module
  use Time::Local;

  my($Date,$Time) = @_;

  my($year,$month);
  my $day = 1;
  my($hour,$min,$sec) = (0,0,0);

  # The captures are only used after a successful match: a failed match
  # leaves $1..$3 as they were set by the previous successful one.
  return unless (defined($Date) && $Date =~ m/(\d\d)(\d\d)(\d\d)?/);
  $year = $1;
  $month = $2-1;  # timelocal counts months from 0
  $day = $3 if defined($3);

  if (defined($Time) && $Time =~ m/(\d\d)(\d\d)(\d\d)?/) {
    $hour = $1;
    $min = $2;
    $sec = $3 if defined($3);
  }

  #print "\n\nhour $hour\n";
  #print "min $min\n";
  #print "sec $sec\n";
  #print "day $day\n";
  #print "mon $month\n";
  #print "year $year\n";
  #print "ts:", timelocal($sec,$min,$hour,$day,$month,$year);

  # From Perl library module
  return timelocal($sec,$min,$hour,$day,$month,$year);

}  # ConvertTimeStamp

#====================================================================

sub Check_External_URLs {

  # -- Check_External_URLs
  # Tries every URL in %HTTPList (in sorted order) with Check_URL,
  # waiting $HTTPDelay seconds before each request. URLs that already
  # have an entry in %HTTPStatusList are skipped.
  # A URL whose status is in %OkStatusMsgs goes to %HTTP_OK_List,
  # together with its referers; every other URL goes to
  # %HTTP_Fail_List, together with the status code.

  local($URL);
  local(@SortedList) = sort (keys (%HTTPList));
  my $numberof;

  print "\n\n----------------Checking external URLs:\n\n" unless ($Silent);

  foreach $URL (@SortedList) {

    # Already checked on this one
    next if (defined($HTTPStatusList{$URL}));

    sleep($HTTPDelay);
    $rcode = &Check_URL($URL);

    $numberof++;
    print "url: $numberof\n"; 

    if (defined($OkStatusMsgs{$rcode})) {
      # URL is ok, server responds and all. 
      print "  Ok\n" unless ($Silent);
      &PrintDot('+') if ($Dots);
      &AddedToList(*HTTP_OK_List,$URL,$HTTPList{$URL});  # The references
      next;
    }

    # Something is wrong. Use the message for the code if there is one.
    &PrintDot('-') if ($Dots);
    print "\n" if ($Errors && $Dots);
    if ((!$Silent) || ($Errors)) {
      if (defined($FailStatusMsgs{$rcode})) {
        print "     Failed: $FailStatusMsgs{$rcode}\n";
      }
      else {
        print "     Failed with code $rcode\n";
      }
    }
    &AddedToList(*HTTP_Fail_List,$URL,$rcode);
  }
} #Check_External_URLs

#====================================================================

sub Check_URL {

  # -- Check_URL($URL)
  # Sends a HEAD request for $URL to its server and returns the status
  # code of the response, i.e. the second blank-separated field of the
  # first response line.
  # http://host:port/path
  #
  # Returns nothing if $URL is not a http URL or cannot be taken apart,
  # and a negative code if no request could be made:
  #   -1  host lookup failed      -3  bind failed
  #   -2  socket creation failed  -4  connect failed
  #
  # Apart from $URL all variables used here ($host, $port, $path, the
  # socket handle S, ...) are globals.

  local($URL) = @_;

  if ($URL !~ m#^http://.*#i) { 
    print TRA "sub Check_URL warn:\n   '$URL': wrong format http!\n\n";
    return;
  }
  else {

    # Get the host and port
    # ($3 is the whole path including its leading /, or empty)
    #if ($URL =~ m#^http://([\w-\.]+):?(\d*)($|/(.*))#i) {
    if ($URL =~ m#^http://([^:/]+):?(\d*)($|/(.*))#i) {
      $host = $1;
      $port = $2;
      $path = $3;
      #print "URL:$URL host:$host port:$port path:$path\n";
    }
    else {
      print TRA "sub Check_URL warn:\n   Unrecognized URL: '$URL'\n\n";
      return;
    }
    if ($path eq "") { $path = '/'; }
    if ($port eq "") { $port = 80; }

    # Delete name anchor. (check if the anchor is present in the doc?)
    $path =~ s/#.*//;
    # Delete parameters
    #$path =~ s/\?.*//;

    #print "-->\n URL: $URL\n host: $host\n port: $port\n path: $path\n";
  }


  # The following is largely taken from the Camel book, chapter 6

  # Socket constants and address layout are spelled out here instead of
  # being taken from a library module.
  # NOTE(review): these values are not the same on every platform; see
  # the retry with $SOCK_STREAM = 2 below.
  $AF_INET = 2;
  $SOCK_STREAM = 1;

  # pack template for a socket address: address family, port, 4 address
  # bytes, 8 bytes of padding
  $sockaddr = 'S n a4 x8';  # (Bless you)

  # Name of this machine, as printed by the external hostname command
  chop($hostname = `hostname`);

  ($name,$aliases,$proto) = getprotobyname('tcp');
  # A port given by service name is translated to its number
  ($name,$aliases,$port) = getservbyname($port,'tcp') unless $port =~ /^\d+$/;
  ($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname);
  if (!(($name,$aliases,$type,$len,$thataddr) = gethostbyname($host))) {
    return -1;
  }

  # Local address (port 0) and address of the server
  $this = pack($sockaddr, $AF_INET, 0, $thisaddr);
  $that = pack($sockaddr, $AF_INET, $port, $thataddr);

  # Make the socket filehandle.
  # ** Temporary fix, this is NOT The way to do it. 15-APR-96
  # But we'll still use it anyway, cannot rely on Perl to be 
  # installed correctly everywhere. 
  # (if the first attempt fails the call is repeated with 2 as the
  # value for SOCK_STREAM)
  if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) {
    $SOCK_STREAM = 2;
    if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) { return -2; }
  }

  # Give the socket an address
  if (!(bind(S, $this))) {
    return -3;
  }

  if (!(connect(S,$that))) {
    return -4;
  }

  # Switch on autoflush for the socket, then select STDOUT again
  select(S); $| = 1; select(STDOUT);

  # NOTE(review): the request lines end in a bare \n and no Host: header
  # is sent.
  print S "HEAD $path HTTP/1.0\n\n";

  # First line of the response: protocol, status code, reason
  $response = <S>;
  ($protocol, $status) = split(/ /, $response);
  # Read and discard the rest of the response
  while (<S>) {
    #print;
  }
  close(S);

  #print "Status: $status\n";
  return $status;
}

#====================================================================

sub SortUnique {

  # -- SortUnique(@list)
  # Returns the elements of the list given, sorted and without doubles.

  my %seen;

  # A hash key exists only once, which takes care of the doubles
  @seen{@_} = (1) x @_;

  return (sort(keys(%seen)));
}  # SortUnique

#====================================================================

sub PrintFile {
  # -- PrintFile($file)
  # Returns the filename as it is to be shown: unchanged if $FullPath
  # is set, else without a leading server root and then without a
  # leading site root (each may be preceded by "file:/").
  my($shown) = @_;

  return $shown if ($FullPath);

  foreach my $root ($ServerRootExpr, $SiteRootExpr) {
    $shown =~ s#^(?:file:/)?$root##;  # delete root from path
  }
  return $shown;
}

#====================================================================

sub PrintList {
  # -- PrintList(*list, $header, $left, $right, $NoXref)
  # Prints one of the result lists to RES, as text or (with $HTMLReport)
  # as a table.
  #
  # *list    typeglob of the hash to print; the keys are the items, the
  #          values the blank-separated referers of each item
  # $header  title of the list; the number of items is appended
  # $left    heading of the item column
  # $right   heading of the referer column
  # $NoXref  if true no referers are printed for this list, even when
  #          cross references are asked for ($Xref or $OneXref)
  #
  # Nothing is printed for an empty list. For %HTTP_Fail_List the values
  # are status codes; the referers are then taken from %HTTPList.
  # With $OneXref only the first item is printed.
  # The list is emptied afterwards to retrieve memory, unless its name
  # contains "HTTPList".
  local(*list, $header, $left, $right, $NoXref) = @_;

  local($i);
  local(@SortedFileList);
  local(@SortedReferList);

  # Don't list empty lists
  return if (! %list);

  # $_[0] still is the typeglob passed in; as a string it contains the
  # name of the list (*main::HTTP_Fail_List)
  my $IsFailList = ($_[0] =~ /HTTP_Fail_List/);

  # Referers are wanted for this list?
  my $ShowXref = (($Xref || $OneXref) && !$NoXref);

  @SortedFileList = sort(keys(%list));
  # Append number to header
  $header = "$header ".($#SortedFileList+1);

  print RES "<pre><b>\n" if ($HTMLReport);
  print RES "\n\n",'-' x length($header);
  print RES "<br>" if ($HTMLReport);
  print RES "\n$header\n";
  print RES " <br>" if ($HTMLReport);
  print RES "",'-' x length($header), "\n\n";
  print RES "</b></pre>\n" if ($HTMLReport);
 
  if ($HTMLReport) {
    # Heading row of the table; every row is closed where it ends
    print RES "<center><table border cols=2 width=80%><tr><td><center>$left</center></td>";
    print RES "<td><center>$right</center></td>" if ($ShowXref);
    print RES "</tr>\n";
  }
  
  foreach (@SortedFileList) {
    if ($HTMLReport) {
      print RES "<tr><td><a href=$_>",&PrintFile($_),"</a></td>";
    }
    else {
      print RES &PrintFile($_),"\n";
    }
    if ($ShowXref) {
      @SortedReferList = &SortUnique(split(/ /,$list{$_}));
      @HTTPReferList = &SortUnique(split(/ /,$HTTPList{$_}));
      if ($HTMLReport) {
        print RES "<td>\n";
        foreach $i (@SortedReferList) {
          if ($IsFailList) {
            # $i is a status code here
            print RES "<b>Status: $i $FailStatusMsgs{$i}</b><br> Referenced by:<br>\n";
            foreach $lostURL (@HTTPReferList) {
              print RES "<a href=$lostURL>",&PrintFile($lostURL),"</a><br>\n";
            }
          }
          else {
            print RES "<a href=$i>",&PrintFile($i),"</a><br>\n";
          }
        }
        print RES "</td>\n";
      }
      elsif ($IsFailList) {
        foreach $i (@SortedReferList) {
          print RES "   Status: $i $FailStatusMsgs{$i}\n    Referenced by:\n";
          foreach $lostURL (@HTTPReferList) {
            print RES "      ",&PrintFile($lostURL),"\n";
          }
        }
      }
      else {
        print RES "  Referenced by:\n";
        foreach $i (@SortedReferList) {
          print RES "    ",&PrintFile($i),"\n";
        }
      }
    }
    print RES "</tr>\n" if ($HTMLReport);
    last if ($OneXref);
    #print "\n";
  }
  print RES "</table></center>\n" if ($HTMLReport);

  # CAUTION! TRY TO RETRIEVE MEMORY 13.3.97
  undef (%list) unless ($_[0] =~ /HTTPList/) ;
  # we save here an option (-http)

}# sub PrintList

#====================================================================


sub PrintFluff {
  # -- PrintFluff(*list, $header, $left)
  # Prints one of the fluff lists (files or directories never
  # referenced) to RES, as text or (with $HTMLReport) as a table with
  # one column.
  # The Fluff list is an array, not a hash
  #
  # *list    typeglob of the array to print
  # $header  title of the list; the number of items is appended
  # $left    heading of the column
  local(*list, $header,$left) = @_;

  # Don't list empty lists
  return if (! @list);

  # Append number to header
  $header = "$header ".($#list+1);
  print RES "<pre><b>\n" if ($HTMLReport);
  print RES "\n\n", '-' x length($header);
  print RES "<br>" if ($HTMLReport);
  print RES "\n$header\n";
  print RES " <br>" if ($HTMLReport);
  print RES '-' x length($header), "\n\n";
  print RES "</b></pre>\n" if ($HTMLReport);
  
  # Heading row of the table; every row is closed where it ends
  print RES "<center><table border cols=1 width=80%><tr><td><center>$left</center></td></tr>\n" if ($HTMLReport);
  foreach (@list) {
    if ($HTMLReport) {
      print RES "<tr><td><a href=$_>",&PrintFile($_),"</a></td></tr>\n";
    }
    else {
      print RES &PrintFile($_)," \n";
    }
  }
  print RES "</table></center>\n" if ($HTMLReport);

}  # PrintFluff

#====================================================================

sub PrintLists {

  # -- PrintLists
  # Prints all result lists in a fixed order. The lists of what was
  # found are printed for a long report only ($LongReport), the lists
  # of problems, fluff, find/replace results and date/time matches
  # always. PrintList and PrintFluff skip lists that are empty.

  # Print lists
  print "Printing lists...\n";

  if ($LongReport) {
    # ---- What was found: dissected files, web documents, directories,
    # images, mailto's, ftp's, telnets, gophers, news, http's, file:'s,
    # cgi-bin scripts/forms and name anchors
    &PrintList(*ReportFiles,"Generated report files:","Report Files","Initial Files",0);
    &PrintList(*FileList,"Web documents found:","Files","Referenced by",0);
    &PrintList(*DirList,"Directories:","Directories","Referenced by",0);
    &PrintList(*ImageFileList,"Images:","Images","Referenced by",0);
    &PrintList(*MailList,"Mailto:","Mailto","Referenced by",0);
    &PrintList(*FTPList,"ftp:","ftp","Referenced by",0);
    &PrintList(*TelnetList,"telnet:","telnet","Referenced by",0);
    &PrintList(*GopherList,"gopher:","gopher","Referenced by",0);
    &PrintList(*NewsList,"News:","News","Referenced by",0);
    &PrintList(*HTTPList,"External URLs:","URL","Referenced by",0);
    &PrintList(*ExtFileList,"External files:","Files","Referenced by",0);
    &PrintList(*CGIList,"cgi-bin scripts/forms:","scripts/forms","Referenced by",0);
    &PrintList(*AnchorList,"Name anchors found:","Anchors","Referenced by",0);
  }

  # ---- PROBLEM SECTION -----

  # Files and cgi-bin scripts/forms that can't be found
  &PrintList(*LostFileList,"Files not found:","Files","Referenced by",0);
  &PrintList(*LostCGIList,"cgi-bin scripts/forms not found:","Scripts/forms","Referenced by",0);

  # Files that are not world readable
  # Override Xref, as reference list for non-world-readable files is not kept
  &PrintList(*UnreadableList,"Files not world readable:","Files","Referenced by",1);

  # Directories and name anchors that can't be found
  &PrintList(*DirNotFoundList,"Directories not found:","Directories","Referenced by",0);
  &PrintList(*LostAnchorList,"Name anchors not found:","Anchors","Referenced by",0);


  # Files and directories found in the directories but not referenced
  &PrintFluff(*FluffFiles,"Files never referenced:","Files");
  &PrintFluff(*FluffDirectories,"Directories never referenced:","Directories");

  # Files that matched $Find/$FindExpr
  &PrintList(*FindFiles, &PrintFindHeader,"Files","Referenced by",1);
 
  # Files successfully edited, and files not edited
  &PrintList(*EditedOkList, &PrintEditHeader(1),"Files","Referenced by",1);
  &PrintList(*EditFailedList, &PrintEditHeader(0),"Files","Referenced by",1);

  # Files matching the date/time criterion
  my $TimeHeader = $Before ? "Files last modified before: "
                           : "Files last modified after: ";
  &PrintList(*TimeList, $TimeHeader.&PrintTimeStamp($TimeStamp).":",
             "Files","Referenced by",1);

  print "\nDone.\n" if ($HTML_only);


} #sub PrintLists

#====================================================================

sub PrintHTTPLists {
  # Print the two result lists of the URL check through PrintList
  # (defined elsewhere in this file): the URLs that checked ok first,
  # the failed ones second.  %HTTPList is discarded afterwards.
  PrintList(*HTTP_OK_List, 'URLs checked ok:', 'URLs', 'Referenced by', 0);
  PrintList(*HTTP_Fail_List, 'Failed URLs:', 'URLs', 'References', 0);
  undef %HTTPList;
}

#====================================================================

sub InitStatusMessages {

  # Fill the two global lookup tables that map a status code to the
  # message text used for it.  Both hashes are overwritten completely.

  # Codes that have an "ok" message.
  %OkStatusMsgs = (
    200 => "OK 200",
    201 => "CREATED 201",
    202 => "Accepted 202",
    203 => "Partial Information 203",
    204 => "No Response 204",
  );

  # Codes that have a failure message.  Judging by their texts, the
  # negative codes describe failures to reach the server at all.
  %FailStatusMsgs = (
    -1  => "Could not lookup server",
    -2  => "Could not open socket",
    -3  => "Could not bind socket",
    -4  => "Could not connect",
    301 => "Found, but moved",
    302 => "Found, but data resides under different URL (add a /)",
    303 => "Method",
    304 => "Not Modified",
    400 => "Bad request",
    401 => "Unauthorized",
    402 => "PaymentRequired",
    403 => "Forbidden",
    404 => "Not found",
    500 => "Internal Error",
    501 => "Not implemented",
    502 => "Service temporarily overloaded",
    503 => "Gateway timeout ",
    600 => "Bad request",
    601 => "Not implemented",
    602 => "Connection failed (host not found?)",
    603 => "Timed out",
  );

}  # sub InitStatusMessages

#====================================================================

sub WriteCorpusAndXMLReports {
  # Finish the corpus/XML output for the current site.
  #
  #  1. Re-read the global results file ($path_to_results$fileResultsGlobal)
  #     and copy selected summary lines, plus the list of generated report
  #     files, into CORPXML.
  #  2. Append the collected meta data and the element/link counters to
  #     CORPXML and close the <SITE> element in CORPXML and CORPTXT.
  #  3. Dump the tag, attribute and word frequency tables to TRACEF.
  #
  # The filehandles CORPXML, CORPTXT and TRACEF are expected to be open
  # already; they are not opened here.  Dies when the results file cannot
  # be opened.  The sort routines by_count, by_count3, by_count7 and
  # by_count8 are defined elsewhere in this file.

  open(READFILE, "$path_to_results$fileResultsGlobal") or die "can't open : $!";

  # Summary lines that are copied as they are: [ line prefix, XML element ].
  my @summary = (
    [ 'Name anchors found',     'SITEWebDocumentAnchorFound'    ],
    [ 'Name anchors not found', 'SITEWebDocumentAnchorNotFound' ],
    [ 'Files not found',        'SITEWebDocumentFileNotFound'   ],
    [ 'External URLs',          'SITEWebDocumentUrls'           ],
    [ 'Mailto',                 'SITEWebDocumentMailTo'         ],
    [ 'Images',                 'SITEWebDocumentImages'         ],
    [ 'Web documents found',    'SITEWebDocumentFound'          ],
  );

  while ($line = <READFILE>) {
    foreach my $entry (@summary) {
      my ($prefix, $xml_tag) = @$entry;
      if ($line =~ /^\Q$prefix\E:(.*)/) {
        print CORPXML "<$xml_tag> $1 </$xml_tag>\n";
      }
    }

    if ($line =~ /^Generated report files:(.*)/) {
      print CORPXML "<SITEWebDocumentDissected> $1 </SITEWebDocumentDissected>\n";
      $reportNb = 0;
      # The line right after the heading is discarded; scanning starts
      # with the one that follows it.
      $line = <READFILE>;
      $line = <READFILE>;
      print CORPXML "<SITEWebDocumentReports>\n";
      # Scan up to the dashed separator.  The defined() test ends the scan
      # at end of file too; without it a missing separator would loop
      # forever, because <READFILE> keeps returning undef.
      until (!defined($line) || $line =~ /^------------/) {
        if ($line =~ /<a href=rep[0-9]+\(([^<>]*\).html*)/) {
          $reportNb++;
          $tmpfilename = $1;
          $tmpfilename =~ s/(\(|\))//g;
          print CORPXML "<SITEReportFile NUM=\"$reportNb\">$tmpfilename</SITEReportFile>\n";
        }
        $line = <READFILE>;
      }
      print CORPXML "</SITEWebDocumentReports>\n";
    }
  }
  close READFILE;

  # Meta data collected from the document headers.
  print CORPXML "<SITEFileMeta>\n";
  print CORPXML "<SITEFileContent>", &translate($metaContentType), "</SITEFileContent>\n";
  print CORPXML "<SITEFileDescription>", &translate($metaDescription), "</SITEFileDescription>\n";
  print CORPXML "<SITEFileGenerator>", &translate($metaGenerator), "</SITEFileGenerator>\n";
  print CORPXML "<SITEFileKeywords>", &translate($metaKeywords), "</SITEFileKeywords>\n";
  print CORPXML "<SITEFileTitle>", &translate($metaTitle), "</SITEFileTitle>\n";
  print CORPXML "<SITEFileAuthor>", &translate($metaAuthor), "</SITEFileAuthor>\n";
  print CORPXML "</SITEFileMeta>\n";

  # Element, image and link counters.
  print CORPXML "<SITEFileStructure>\n";
  print CORPXML "<SITEFileElements>\n";
  print CORPXML "<SITEFileElementsNb>$eltNb</SITEFileElementsNb>\n";
  print CORPXML "</SITEFileElements>\n";
  print CORPXML "<SITEFileTxtElements>$txtEltNb</SITEFileTxtElements>\n";
  print CORPXML "<SITEFileImageNb>$imgNb1</SITEFileImageNb>\n";
  print CORPXML "<SITEFileImageDesc>$imgNb<EXTImage>$imgExtNb</EXTImage><INTImage>$imgIntNb</INTImage></SITEFileImageDesc>\n";
  print CORPXML "<SITEFileLinks>\n";
  print CORPXML "<SITEFileLinksNumber>$linkNb</SITEFileLinksNumber>\n";
  print CORPXML "<SITEFileExternalLinks>$extLinkNb</SITEFileExternalLinks>\n";
  print CORPXML "<SITEFileInternalLinks>$intLinkNb</SITEFileInternalLinks>\n";
  print CORPXML "<SITEFileHtmlFileLinks>$intHtmlFileLinkNb</SITEFileHtmlFileLinks>\n";
  print CORPXML "<SITEFileExtHypertextualLinks>$extHypertextLinkNb</SITEFileExtHypertextualLinks>\n";
  print CORPXML "<SITEFileIntHypertextualLinks>$intHypertextLinkNb</SITEFileIntHypertextualLinks>\n";
  print CORPXML "<SITEFileIntDocFileLinks>$intDocFileLinkNb</SITEFileIntDocFileLinks>\n";
  print CORPXML "<SITEFileExternalCgiLinks>$extCgiLinkNb</SITEFileExternalCgiLinks>\n";
  print CORPXML "<SITEFileExternalNewsLinks>$extNewsLinkNb</SITEFileExternalNewsLinks>\n";
  print CORPXML "<SITEFileExternalFtpLinks>$extFtpLinkNb</SITEFileExternalFtpLinks>\n";
  print CORPXML "<SITEFileExternalGopherLinks>$extGopherLinkNb</SITEFileExternalGopherLinks>\n";
  print CORPXML "<SITEFileExternalMail>$extMailNb</SITEFileExternalMail>\n";
  print CORPXML "<SITEFileInternalAnchor>$intAncNb</SITEFileInternalAnchor>\n";
  print CORPXML "<SITEFileExternalAnchor>$extAncNb</SITEFileExternalAnchor>\n";
  print CORPXML "</SITEFileLinks>\n";
  print CORPXML "</SITEFileStructure>\n";
  print CORPXML "</SITE>\n";
  print CORPTXT "</SITE>\n";

  # Frequency tables.  The three *by_site hashes are emptied while they
  # are written out.
  print TRACEF "<StatWordElement site=\"$rootSite\">\n";
  print TRACEF "<TAGS>\n";

  print TRACEF "<ELEMENTS>\n";
  foreach $element (sort by_count3 keys %elements) {
    print TRACEF "<ITEM>$element</ITEM><FRQ>$elements{$element}</FRQ>\n";
  }
  print TRACEF "</ELEMENTS>\n";

  print TRACEF "<ELEMENTS DISTRIBUTION>\n";
  foreach $ele3 (sort { $el3by_site{$b} <=> $el3by_site{$a} or $a cmp $b; } (keys(%el3by_site))) {
    print TRACEF "<ITEM>$ele3</ITEM><FRQ>$el3by_site{$ele3}</FRQ>\n";
    delete($el3by_site{$ele3});
  }
  print TRACEF "</ELEMENTS DISTRIBUTION>\n";

  print TRACEF "<ELEMENTS_ATTR>\n";
  foreach $element (sort by_count7 keys %elements2) {
    print TRACEF "<ITEM>$element</ITEM><FRQ>$elements2{$element}</FRQ>\n";
  }
  print TRACEF "</ELEMENTS_ATTR>\n";

  print TRACEF "<ELEMENTS_ATTR DISTRIBUTION>\n";
  foreach (sort { $el4by_site{$b} <=> $el4by_site{$a} or $a cmp $b; } (keys(%el4by_site))) {
    print TRACEF "<ITEM>$_</ITEM><FRQ>$el4by_site{$_}</FRQ>\n";
    delete($el4by_site{$_});
  }
  print TRACEF "</ELEMENTS_ATTR DISTRIBUTION>\n";

  print TRACEF "<ELEMENTS_ATTRVALUE>\n";
  foreach $element (sort by_count8 keys %elements3) {
    print TRACEF "<ITEM>$element</ITEM><FRQ>$elements3{$element}</FRQ>\n";
  }
  print TRACEF "</ELEMENTS_ATTRVALUE>\n";

  print TRACEF "<ELEMENTS_ATTRVALUE DISTRIBUTION>\n";
  foreach (sort { $el5by_site{$b} <=> $el5by_site{$a} or $a cmp $b; } (keys(%el5by_site))) {
    print TRACEF "<ITEM>$_</ITEM><FRQ>$el5by_site{$_}</FRQ>\n";
    delete($el5by_site{$_});
  }
  print TRACEF "</ELEMENTS_ATTRVALUE DISTRIBUTION>\n";

  print TRACEF "</TAGS>\n";

  # Word forms with their frequencies, followed by the number of distinct
  # forms and the total number of occurrences.
  print TRACEF "<WORDS>\n";
  $tmpc = 0;
  $tmpd = 0;
  foreach $word (sort by_count keys %seen) {
    print TRACEF "<ITEM>$word</ITEM><FRQ>$seen{$word}</FRQ>\n";
    $tmpc++;
    $tmpd += $seen{$word};
  }
  print TRACEF "<TOTALFORM>$tmpc</TOTALFORM>\n";
  print TRACEF "<TOTALOCCUR>$tmpd</TOTALOCCUR>\n";
  print TRACEF "</WORDS>\n";
  print TRACEF "</StatWordElement>\n";
}
#===========================================================================

sub DeleteHTMLReports {
  # Delete generated report files from the directory $path_to_results.
  #
  # A directory entry is removed when its name contains "rep<digits>"
  # followed somewhere by "htm", or contains "analysis_results".  Each
  # removed name is printed to standard output.  A file that cannot be
  # deleted only produces a warning.
  print "Delete $path_to_results*.htm in progress\n";

  # Without a readable directory there is nothing to delete; report the
  # reason instead of silently doing nothing.
  unless (opendir(DIRREP, "$path_to_results")) {
    warn "having trouble reading directory $path_to_results: $! \n";
    return;
  }

  while (defined($file = readdir(DIRREP))) {
    if (($file =~ /rep[0-9]+.*htm/) || ($file =~ /analysis_results/)) {
      print "$file\n";
      $file1 = "$path_to_results" . "$file";
      unlink($file1) || warn "having trouble deleting $file1: $! \n";
    }
  }
  closedir(DIRREP);

} # sub DeleteHTMLReports

#====================================================================           

sub translate {
  # Decode HTML character entities in the given string(s).
  #
  # Called in scalar or list context the arguments are copied and the
  # decoded copies are returned (the first one only in scalar context).
  # Called in void context the caller's arguments are modified in place.
  #
  # Both lookup tables are package globals and are rebuilt on every call.
  # %char2entity is filled here but not read anywhere inside this sub.
  #
  # NOTE(review): many table values below are empty strings.  Presumably
  # they once held Latin-1 characters that were lost in a re-encoding of
  # this file - confirm against the HTML::Entities table.
  #
  # this is taken from HTML::Entities.pm

  %entity2char = (
  # Some normal chars that have special meaning in SGML context
 amp    => '&',  # ampersand 
'gt'    => '>',  # greater than
'lt'    => '<',  # less than
 quot   => '"',  # double quote


 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
 AElig	=> '',  # capital AE diphthong (ligature)
 Aacute	=> '',  # capital A, acute accent
 Acirc	=> '',  # capital A, circumflex accent
 Agrave	=> '',  # capital A, grave accent
 Aring	=> '',  # capital A, ring
 Atilde	=> '',  # capital A, tilde
 Auml       => '',  # capital A, dieresis or umlaut mark
 Ccedil	=> '',  # capital C, cedilla
 ETH        => '',  # capital Eth, Icelandic
 Eacute	=> '',  # capital E, acute accent
 Ecirc	=> '',  # capital E, circumflex accent
 Egrave	=> '',  # capital E, grave accent
 Euml       => '',  # capital E, dieresis or umlaut mark
 Dstrok     => '',  # capital D with stroke
 Iacute	=> '',  # capital I, acute accent
 Icirc	=> '',  # capital I, circumflex accent
 Igrave	=> '',  # capital I, grave accent
 Iuml       => '',  # capital I, dieresis or umlaut mark
 Ntilde	=> '',  # capital N, tilde
 Oacute	=> '',  # capital O, acute accent
 Ocirc	=> '',  # capital O, circumflex accent
 OElig      => '',  # capital OE diphthong (ligature)
 Ograve	=> '',  # capital O, grave accent
 Oslash	=> '',  # capital O, slash
 Otilde	=> '',  # capital O, tilde
 Ouml       => '',  # capital O, dieresis or umlaut mark
 THORN	=> '',  # capital THORN, Icelandic
 Uacute	=> '',  # capital U, acute accent
 Ucirc	=> '',  # capital U, circumflex accent
 Ugrave	=> '',  # capital U, grave accent
 Uuml       => '',  # capital U, dieresis or umlaut mark
 Yacute	=> '',  # capital Y, acute accent
 aelig      => '',  # small ae diphthong; this key is listed again below
 aacute	=> '',  # small a, acute accent
 acirc	=> '',  # small a, circumflex accent
 aelig	=> '',  # small ae diphthong (ligature)
 agrave	=> '',  # small a, grave accent
 aring	=> '',  # small a, ring
 atilde	=> '',  # small a, tilde
 auml       => '',  # small a, dieresis or umlaut mark
 ccedil	=> '',  # small c, cedilla
 eacute	=> '',  # small e, acute accent
 ecirc	=> '',  # small e, circumflex accent
 egrave	=> '',  # small e, grave accent
 eth        => '',  # small eth, Icelandic
 euml       => '',  # small e, dieresis or umlaut mark
 iacute	=> '',  # small i, acute accent
 icirc	=> '',  # small i, circumflex accent
 igrave	=> '',  # small i, grave accent
 iuml       => '',  # small i, dieresis or umlaut mark
 ntilde	=> '',  # small n, tilde
 oacute	=> '',  # small o, acute accent
 ocirc	=> '',  # small o, circumflex accent
 ograve	=> '',  # small o, grave accent
 oslash	=> '',  # small o, slash
 otilde	=> '',  # small o, tilde
 ouml       => '',  # small o, dieresis or umlaut mark
 pound	=> '',  # symbol
 szlig	=> '',  # small sharp s, German (sz ligature)
 thorn	=> '',  # small thorn, Icelandic
 uacute	=> '',  # small u, acute accent
 ucirc	=> '',  # small u, circumflex accent
 ugrave	=> '',  # small u, grave accent
 uuml       => '',  # small u, dieresis or umlaut mark
 yacute	=> '',  # small y, acute accent
 yuml       => '',  # small y, dieresis or umlaut mark

 acute  => '',
 brkbar => '',
 cedil  => '',
 cent   => '',
 copy   => '(c)',
 curren => '',
 deg    => '',
 divide => '',
 emdash => '-',
 endash => '',
 frac14 => '1/4',
 frac12 => '1/2',
 frac34 => '3/4',
 hellip => '...',
 hibar  => '',
 iexcl  => '',
 iquest => '',
 laquo  => '"',
 macr   => '',
 micro  => '',
 middot => '',
 nbsp   => ' ',
 not    => '',
 ordf   => '',
 ordm   => '',
 para   => '',
 plusmn => '',
 raquo  => '"',
 reg    => '(r)',
 sect   => '',
 shy    => '',
 sup1   => '',
 sup2   => '',
 sup3   => '',
'times' => '',    # times is a keyword in perl
 uml    => '',
 yen    => '',
);

  # Make the opposite mapping (character -> "&name;").  Characters that
  # appear as a value more than once keep whichever entity each() yields last.
  while (($entity, $char) = each(%entity2char)) {
    $char2entity{$char} = "&$entity;";
  }

  # Fill in missing entities: every character 0..255 without a named
  # entity gets its numeric form "&#N;"
  for (0 .. 255) {
    next if exists $char2entity{chr($_)};
    $char2entity{chr($_)} = "&#$_;";
  }

  # Pick the strings to work on: a private copy when the caller uses the
  # return value, the caller's own arguments when called in void context.
  my $array;
  if (defined wantarray) {
    $array = [@_]; # copy
  }
  else {
    $array = \@_;  # modify in-place
  }
  my $c;
  for (@$array) {
    # "&amp;" (semicolon optional) becomes a bare "&" first, so text such
    # as "&amp;lt;" is decoded a second time by the rules below
    s/&amp;?/&/g;

    # decimal and hexadecimal character references below 256 become the
    # character itself; larger ones are left untouched
    s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
    s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
    # named entities: when the lookup is false (unknown name, or a table
    # value that is empty) the matched text is kept and a ";" is appended,
    # even if the matched text already ended in ";"
    s/(&(\w+);?)/$entity2char{$2} || "$1;"/eg;

    # remove a few specific entity spellings that are still present
    s/&Oelig;?//g;
    s/&oelig;?//g;
    s/&eaacute;?//g;
    s/&\#9484;?//g;
    s/&ecric;?//g;
    s/&laqno;?/"/g;
    # collapse every run of white space (newlines included) to one blank
    s#\s+# #sg;

    # local spelling
  }
  wantarray ? @$array : $array->[0];
}

#===================================================================

sub splitText {
  # Count the words of a text fragment.
  #
  # The text is first run through translate() (defined elsewhere in this
  # file).  Every word - a word character followed by any number of word
  # characters or hyphens - is lower-cased and counted in the global
  # hashes %seen and %seen2.  An empty text is ignored.
  #
  # The match runs on the lexical copy rather than on $_, so a caller that
  # is inside a "for (@list)" loop does not get its current element
  # overwritten with the text.
  my ($text) = @_;
  return if ($text eq "");

  $text = &translate($text);
  while ($text =~ /(\w[\w-]*)/g) {
    my $word = lc $1;
    $seen{$word}++;
    $seen2{$word}++;
  }
}# sub splitText

#====================================================================

sub display {
  # Write the text currently held in the global $tmp to the REP
  # filehandle, followed by a newline.
  print REP $tmp . "\n";
}

#====================================================================

sub shell_path {
  # Return a copy of the given path in which every backslash has been
  # replaced by a forward slash.  The argument itself is not changed.
  my ($path) = @_;
  $path =~ tr{\\}{/};
  return $path;
}

#====================================================================

sub spell {
  # Try to repair malformed tags in a piece of HTML and return the
  # repaired text.  Work is done only when the global $Spell is true;
  # otherwise the sub falls through without an explicit return value.
  #
  # The character \240 (octal) serves as a temporary marker: it is put in
  # front of everything that looks like the start of a tag, the passes
  # below use it to find tags that were never closed, and all markers are
  # removed again at the end.  Tags that still look wrong after the
  # automatic passes are reported on the TRA filehandle together with the
  # global $filename.
  #
  # The passes depend on each other's output; their order matters.
  if ($Spell) {
    # checks files for html errors
    local ($input) = @_;
    # any \240 already in the input becomes a plain blank (\040) so that
    # it cannot be mistaken for a marker
    $input =~ s#\240#\040#g;
    $input =~ s#(<[!/a-zA-Z][a-zA-Z0-9\.\-]*)#\240$1#g;
    $input =~ s#\s([a-zA-Z][a-zA-Z0-9\.\-_]*\s*=["']?)# >_$1#g;
    # the two lines above mark all tags (with \240) and all
    # attribute-value couples (with " >_")
    $input =~ s#(_[^\s]+?\s*=\s*"[^"<]*?)>#$1">#sg;
    $input =~ s#(_[^\s]+?\s*=\s*'[^'<]*?)>#$1'>#sg;
    # the two lines above correct right truncated values (closing quote
    # missing before ">")
    $input =~ s#(_[a-zA-Z][a-zA-Z0-9\.\-_]*\s*=)\s*([^\s"'][^"'<>]*?)(["'])#$1$3$2$3#sg;
    # the line above tries to manage left truncated values (opening quote
    # missing)
    $input =~ s#>_##g;
    $input =~ s#(<![^-][^<>]*?(['"])[^<>]*?\2)\s*\240#$1>#sg;
    # the line above adds a missing ">" to a declaration tag
    $input =~ s#(<[a-zA-Z][a-zA-Z0-9\.\-]*(?:\s*[a-zA-Z][a-zA-Z0-9\.\-_]*\s*=\s*(?:(["'])[^<>]*?\2|[^\s<>]*))*)\s*\240#$1>#sg;
    # the line above adds a closing ">" after possible attribute-value
    # couples
    $input =~ s#(</[a-zA-Z][a-zA-Z0-9\.\-]*)[\s\240]+>?#$1>#sg;
    # the line above corrects a missing ">" at the end of an end tag
    $input =~ s#(<!--[^>]*?--)\s*\240#$1>#sg;
    # the line above adds a closing ">" to comment tags

    # Whatever still runs from "<" to a marker without a ">" in between
    # is handled one occurrence at a time; each case is logged to TRA.
    # The last branch uses the same pattern as the loop condition and
    # removes the marker it matched.
    # NOTE(review): "[[^<>]*?" in the start-tag branch is a character
    # class that also contains "[" and "^"; "[^<>]*?" may have been
    # intended - confirm.
    while ($input =~ m#<[^_][^>]+?\240#sg) {
      print TRA "sub spell warn:\n";
      if ($input =~ s#(<![^-][^<>]*?(['"])[^<>]*?\2)([^<>]+)\240#$1>$3#s){print TRA "   Not conforming declaration tag: '$1$3' found at '$filename'\n\n     Interpreted as '$1> $3'\n\n";}
      elsif ($input =~ s#(<[a-zA-Z][a-zA-Z0-9\.\-]*[^<>]*[a-zA-Z][a-zA-Z0-9\.\-_]\s*=\s*(["'])[[^<>]*?\2)([^<=>]+)\240#$1>$3#s){print TRA "   Not conforming start tag: '$1$3' found at '$filename'\n\n    Interpreted as '$1> $3'\n\n";}
      elsif ($input =~ s#(<!--[^>]*?--)([^>]*?)\240#$1>$2#s){print TRA "   Not conforming comment tag: '$1$2' found at '$filename'\n\n    Interpreted as '$1> $2'\n\n";}
      elsif ($input =~ s#(<[^_][^>]+?)\240#$1#s) {print TRA "   Not conforming tag or simple mask: '$1' found at '$filename'\n\n     May have caused parse errors\n\n";}
    }
    $input =~ s#\240##g;
    # deletes the remaining marks
    # print "$input\n";
    return $input;
  }
}

#====================================================================

sub get_path_to_results {
  # Work out, and create if necessary, the directory that receives the
  # result files of the current run.
  #
  # Reads the globals $CWD, $AskedPath, $InFile and $SiteRoot.
  # Sets $path_to_results to "<base>res<loop>(<site>)/" and $rootSite to
  # <site>; increments the global counter $loop on every call.
  #
  #   <base>  $AskedPath when it is an absolute path, $CWD followed by
  #           $AskedPath when it is relative, plain $CWD when no path was
  #           asked for ($CWD presumably ends in "/" - confirm)
  #   <site>  last path component of $InFile when that is a directory,
  #           otherwise the last component of $SiteRoot
  #
  # Dies when <base> is not a directory, or when the results directory
  # does not exist and cannot be created.
  $loop++;
  # to add to the file name
  $path_to_results = $CWD;
  # by default the results go to the current working directory
  my $site;
  if ($AskedPath) {
    # a path_to_results was asked for; make sure it ends with a slash
    $AskedPath .= '/' unless ($AskedPath =~ m#/$#);
    if ($AskedPath =~ m#^(?:\w:)?/#) {
      # a full path (optionally with a drive letter) replaces the default
      $path_to_results = $AskedPath;
    }
    else {
      # a relative path is appended to the working directory
      $path_to_results .= "$AskedPath";
    }
  }
  die (<<EOM) unless (-d $path_to_results);
  "$path_to_results" is not a valid path. $!\n
EOM

  if (-d $InFile) {
    if ($InFile =~ m#([^/]+)/?$#) {
      # we use the directory name
      $site = $1;
    }
  }
  else {
    if ($SiteRoot =~ m#([^/]+)/$#) {
      # because we need different names for the results files
      $site = $1;
    }
  }
  # colons (as in a drive letter or "host:port") are dropped from the name
  $site =~ s/://g;

  # Create the directory unless a previous run already did.  "or" is
  # required here: with "||" the die would be bound to the mode argument
  # and could never run.
  my $results_dir = "$path_to_results/res$loop($site)";
  unless (-d $results_dir) {
    mkdir($results_dir, 2047) or die (<<EOM);
  Cannot create the results directory at $path_to_results! $!
EOM
  }
  $path_to_results .= "res$loop($site)/";

  $rootSite = $site;
}

#====================================================================

sub WriteResFiles {
  # Open all output files of a run inside the results directory and write
  # their opening lines.
  #
  # get_path_to_results (defined elsewhere in this file) is called first
  # to set $path_to_results and $rootSite.  The following bareword
  # filehandles are then opened for writing and left open for the rest of
  # the program:
  #
  #   RES        analysis_results.txt, or .html when $HTMLReport is set
  #   TRACEF     StatWordFull.txt
  #   TRACEf     StatWordByFile.txt
  #   INDEXFILE  indexFile.txt
  #   CORPXML    corp-<rootSite>.xml
  #   CORPTXT    corp-<rootSite>.txt
  #   TRA        trace.txt
  #   TRASH      trash.txt
  #
  # Sets the globals $fileResultsGlobal, $tmp, $tmp2 and $tmp3.
  # Dies, naming the file and the system error, when an open fails.
  &get_path_to_results;
  # where to put the results
  $tmp = "analysis_results.txt";
  $tmp = "analysis_results.html" if ($HTMLReport);
  $fileResultsGlobal = $tmp;
  $tmp2 = "corp-" . $rootSite . ".xml";
  $tmp3 = "corp-" . $rootSite . ".txt";

  open(RES, ">$path_to_results$tmp")
    || die "Can't open $path_to_results$tmp: $!\n";

  open(TRACEF, ">$path_to_results" . "StatWordFull.txt")
    || die "Can't open ${path_to_results}StatWordFull.txt: $!\n";
  open(TRACEf, ">$path_to_results" . "StatWordByFile.txt")
    || die "Can't open ${path_to_results}StatWordByFile.txt: $!\n";
  open(INDEXFILE, ">$path_to_results" . "indexFile.txt")
    || die "Can't open ${path_to_results}indexFile.txt: $!\n";

  open(CORPXML, ">$path_to_results$tmp2")
    || die "Can't open $path_to_results$tmp2: $!\n";
  open(CORPTXT, ">$path_to_results$tmp3")
    || die "Can't open $path_to_results$tmp3: $!\n";
  open(TRA, ">$path_to_results" . "trace.txt")
    || die "Can't open ${path_to_results}trace.txt: $!\n";
  open(TRASH, ">$path_to_results" . "trash.txt")
    || die "Can't open ${path_to_results}trash.txt: $!\n";

  # Hello: banner of the results file, plain text or HTML
  $tmp=(<<EOM);
Site Analysis Results
Webxref version 0.3.5, 13-Mar-97 by Rick Jansen
Updated 12-Apr-2000 as part of the ENS/France Telecom project
EOM
  $tmp=(<<EOM) if ($HTMLReport);
<html><head><title>Webxref output</title></head><body><h1><center>Site Analysis Results<br>Webxref version 0.3.5, 13-Mar-97 by Rick Jansen</center></h1><br><h3>Updated 12-Apr-2000 as part of the ENS/France Telecom project</h3><hr>
EOM

  print RES "$tmp\n";
  # the <SITE> element opened here is closed by WriteCorpusAndXMLReports
  print CORPXML "<SITE>\n";
  print CORPXML "<SITEName> $rootSite </SITEName>\n";
  print CORPTXT "<SITEName> $rootSite </SITEName>\n";

}# sub WriteResFiles

#====================================================================

sub UpdateARGV {
  # Blank out the command line arguments that need no further processing.
  #
  # An element of @ARGV is set to undef when it names the file that is
  # currently processed ($InFile) or a file that is already a key of
  # %FileList (stored there as "file:/<path>").  On Windows ($^O matches
  # MSWin) both comparisons ignore case.  Arguments that are not full
  # paths are taken relative to $CWD.
  #
  # Comparisons are made on a copy of the argument that went through
  # shell_path (defined elsewhere in this file), so that an argument
  # written with backslashes is recognised as well; the result of that
  # call used to be thrown away.  @ARGV elements that are kept are left
  # exactly as they were given.
  #
  # Side effects: sets the globals $argv and $key, changes back to $CWD
  # and undefines $filenumber for every argument.

  for (@ARGV) {

    my $arg = &shell_path($_); # always compare on a shell path
    $argv = $CWD . $arg;
    # default, short path is given
    if ($arg =~ m#^(?:\w:)?/(?:.+/)*[^/]+/?$#) {
      # if fullpath
      $argv = $arg;
    }

    # sort unique: the file being processed is not queued again
    undef $_ if ($argv eq $InFile) ||
      ($^O =~ /MSWin/) && (lc($argv) eq lc($InFile));

    $argv = "file:/$argv";
    for $key (keys %FileList) {
      if (($key eq $argv) ||
          (lc($key) eq lc($argv)) && ($^O =~ /MSWin/)) {
        # already checked; no need to look at the remaining keys
        undef($_);
        last;
      }
    }

    chdir ($CWD);
    # return to the initial working directory
    undef($filenumber);
  }
}

#====================================================================

sub parse {
  # Incremental HTML tokenizer (method; call as $parser->parse($chunk)).
  #
  # The chunk in $_[0] is appended to the buffer $self->{'_buf'}.  Complete
  # tokens are then removed from the front of the buffer and handed to the
  # handler methods text, comment, declaration, end and start of $self.
  # The sub returns $self as soon as the buffer is empty or holds only the
  # beginning of a token, which is kept for the next call.
  #
  # Note that $self is assigned without "my" and is therefore a package
  # global, not a variable private to this call.
  #
  # this is taken from HTML::Parser.pm and adapted
  $self = shift;
  my $buf = \ $self->{'_buf'};
  $$buf .= $_[0];

  # Parse html text in $$buf.  The strategy is to remove complete
  # tokens from the beginning of $$buf until we can't decide whether
  # it is a token or not, or the $$buf is empty.
  while (1) {  # the loop will end by returning when text is parsed
    # First we try to pull off any plain text (anything before a "<" char)
    if ($$buf =~ s|^([^<]+)||) {
      unless (length $$buf) {
	my $text = $1;
       # At the end of the buffer, we should not parse white space
        # but leave it for parsing on the next round.
	if ($text =~ s|(\s+)$||) {
	  $$buf = $1;
          # Same treatment for chopped up entities.
        }
        elsif ($text =~ s/(&(?:(?:\#\d*)?|\w*))$//) {
	  $$buf = $1;
	};
        $self->text($text);
          return $self;
    }
        else {
          $self->text($1);
      }

  }
      # Netscape's buggy comments are easy to handle
      elsif ($self->{'_netscape_comment'} && $$buf =~ m|^(<!--)|) {
        if ($$buf =~ s|^<!--(.*?)-->||s) {
          $self->comment($1);
        }
        else {
         return $self;  # must wait until we see the end of it
        }
	# Then, markup declarations (usually either <!DOCTYPE...> or a comment)
      }
      elsif ($$buf =~ s|^(<!)||) {
        my $eaten = $1;
        my $text = '';
        my @com = ();  # keeps comments until we have seen the end
        # Eat text and beginning of comment
        while ($$buf =~ s|^(([^>]*?)--)||) {
          $eaten .= $1;
	  $text .= $2;
	  # Look for end of comment
	  if ($$buf =~ s|^((.*?)--)||s) {
	    $eaten .= $1;
            push(@com, $2);
	  }
	  else {
	    # Need more data to get all comment text.
	    $$buf = $eaten . $$buf;
	    return $self;
	  }
        }
	# Can we finish the tag
	if ($$buf =~ s|^([^>]*)>||) {
	  $text .= $1;
	  $self->declaration($text) if $text =~ /\S/;
	  # then tell about all the comments we found
	  for (@com) {
	    $self->comment($_);
	  }
	}
	else {
	  $$buf = $eaten . $$buf;  # must start with it all next time
	  return $self;
	}
      }
      # Then, look for an end tag
      elsif ($$buf =~ s|^</||) {
	# end tag
	if ($$buf =~ s|^([a-zA-Z][a-zA-Z0-9\.\-]*)\s*>||) {
	  $self->end(lc($1));
	}
	elsif ($$buf =~ m|^[a-zA-Z]*[a-zA-Z0-9\.\-]*\s*$|) {
	  $$buf = "</" . $$buf;  # need more data to be sure
	  return $self;
	}
	else {
	  # it is plain text after all
	  $self->text("</");
	}
	# Then, finally we look for a start tag
      }
      elsif ($$buf =~ s|^<||) {
        # start tag
        my $eaten = '<';  
        # This first thing we must find is a tag name.  RFC1866 says:
        #   A name consists of a letter followed by letters,
        #   digits, periods, or hyphens. The length of a name is
        #   limited to 72 characters by the `NAMELEN' parameter in
        #   the SGML declaration for HTML, 9.5, "SGML Declaration
        #   for HTML".  In a start-tag, the element name must
        #   immediately follow the tag open delimiter `<'.
        if ($$buf =~ s|^(([a-zA-Z][a-zA-Z0-9\.\-]*)\s*)||) {
          $eaten .= $1;
	  my $tag = $2;
	  my %attr;
          my @attrseq;

          # Then we would like to find some attributes
          #
          # Arrgh!! Since stupid Netscape violates RFC1866 by
          # using "_" in attribute names (like "ADD_DATE") of
          # their bookmarks.html, we allow this too.
          while ($$buf =~ s|^(([a-zA-Z][a-zA-Z0-9\.\-_]*)\s*)||) {
            $eaten .= $1;
            my $attr = $2;
            my $val;
            # The attribute might take an optional value (first we
            # check for an unquoted value)
            if ($$buf =~ s|(^=\s*([^\"\'>\s][^>\s]*)\s*)||) {
              $eaten .= $1;
	      $val = $2;
	      # or quoted by " or '
	    }
	    elsif ($$buf =~ s|(^=\s*([\"\'])(.*?)\2\s*)||s) {
	      $eaten .= $1;
	      $val = $3;
	      # truncated just after the '=' or inside the attribute
	    }
	    elsif ($$buf =~ m|^(=\s*)$| or
	      $$buf =~ m|^(=\s*[\"\'].*)|s) {
	      $$buf = "$eaten$1";
	      return $self;
	    }
	    else {
	      # assume attribute with implicit value
	      $val = $attr;
            }
            # a repeated attribute overwrites the value in %attr but is
            # recorded again in @attrseq
            $attr{$attr} = $val;
            push(@attrseq, $attr);
          }

          # At the end there should be a closing ">"
          if ($$buf =~ s|^>||) {
            $self->start($tag, \%attr, \@attrseq, "$eaten>");
          }
          elsif (length $$buf) {
            # Not a conforming start tag, regard it as normal text
            $self->text($eaten);
          }
          else {
            $$buf = $eaten;  # need more data to know
            return $self;
          }
        }
        elsif (length $$buf) {
        $self->text($eaten);
        }
        else {
          $$buf = $eaten . $$buf;  # need more data to parse
          return $self;
        }    
      }
      elsif (length $$buf) {
        die; # This should never happen
      } 
    else {
      # The buffer is empty now
      return $self;
    }
  }
  # NOTE(review): the statements below appear to be unreachable - the
  # loop above is "while (1)" and is only ever left through "return" or
  # "die".  Confirm whether the "Links from" section is produced elsewhere.
  unless ($HTMLReport) {
    print REP "\n\n\n",'-'x(length($name)+12),"\nLinks from: $name\n",'-'x(length($name)+12),"\n\n";
  }
  else { 
    print REP "<p><h2>Links from: $name</h2><br>";
  }
  &links;
  print REP "</body></html>" if ($HTMLReport);

}# sub parse

#====================================================================

sub eof {
  # Method: tell the parser that no more data will follow by calling its
  # parse method with undef.  The result of that call is passed on.
  my ($parser) = @_;
  return $parser->parse(undef);
}

#====================================================================

sub declaration {
  # Parser handler for a markup declaration such as <!DOCTYPE ...>.
  # The declaration text is written to the report through display (as a
  # one-row table when $HTMLReport is set, as a plain line otherwise) and,
  # entity-decoded by translate, to CORPXML.
  my ($self, $decl) = @_;

  if ($HTMLReport) {
    $tmp = "<table border><tr><td align=\"center\" valign=\"center\"><b>DECLARATION</b></td><td align = \"left\" valign=\"center\"><b>$decl</b></td></tr></table>";
  }
  else {
    $tmp = "\nDECLARATION: $decl";
  }
  &display;

  print CORPXML "<DECLARATION>", &translate($decl), "</DECLARATION>\n";
}

#====================================================================

sub headers {
  # Report the entries of the global @headers and pick the document meta
  # data out of them.
  #
  # Every entry that contains "=" followed by at least one character is
  # rewritten IN PLACE ("=" becomes ": ", quotes are removed; in text
  # mode the entry is also replaced by its translate()d form), numbered,
  # sent to the report through display and written to CORPXML.
  #
  # The meta data globals ($metaTitle, $metaContentType, $metaAuthor,
  # $metaDescription, $metaGenerator, $metaKeywords) are taken from the
  # formatted report line.  All patterns below expect the
  # "<b>HEADER_n</b> ...<br>" layout, which is only produced when
  # $HTMLReport is set; in text mode none of them can match.
  my ($numberof);

  for (@headers) {
    # skip entries that have no "=value" part
    next unless (/\=.+$/);
    s/=/: /g;
    s/['"]//g;
    $numberof++;

    if (/\n/) {
      # must handle header values with embedded newlines with care
      s/\s+$//;
      # trailing newlines and space must go
      s/\n\n+/\n/g;
      # no empty lines
      s/\n(\S)/\n $1/g;
      # initial space for continuation
    }
    
    unless ($HTMLReport) {
      $_ = &translate($_);
      $tmp = "HEADER_$numberof $_";
    }
    else {
      $tmp = "<b>HEADER_$numberof</b> $_<br>";
    }   
    &display;

  # $line is a private copy of the formatted report line
  my $line=$tmp;
  print CORPXML "<HEADER NUM=\"$numberof\">",&translate($_),"</HEADER>\n";
	if ($line=~/^<b>HEADER_.*<\/b> TITLE:(.*)<br>/i){
            $stmp=$1;
            $stmp=translate($stmp);
	    $metaTitle=$stmp;
	}

	# unlike the other patterns this one is not anchored with "^"
	if ($line=~/<b>HEADER_.*<\/b> meta http-equiv: Content-Type content:(.*)<br>/i){
            $stmp=$1;
            $stmp=translate($stmp);
	    $metaContentType=$stmp;
	}
	if ($line=~/^<b>HEADER_.*<\/b> META Name: author Content:(.*)<br>/i){
            $stmp=$1;
            $stmp=translate($stmp);
	    $metaAuthor=$stmp;
	}
	if ($line=~/^<b>HEADER_.*<\/b> META Name: description Content:(.*)<br>/i){
            $stmp=$1;
            $stmp=translate($stmp);
	    $metaDescription=$stmp;
	}

	# NOTE(review): this branch looks unreachable.  It requires that no
	# "<" follows "Content:" up to the end of $line, but in HTML mode
	# $line always ends in "<br>".  If it ever ran, it would read from
	# READFILE, which is not opened in this sub, and the "until" loop
	# has no end-of-file test.  Confirm before relying on it.
	if ($line=~/^<b>HEADER_.*<\/b> META Name: description Content:([^<]*)$/i){
            $stmp=$1;
            $stmp=translate($stmp);
	    $metaDescription=$stmp;
	    $line=<READFILE>;
		 until ($line=~/<br>/){
                     $line=translate($line);
		     $metaDescription = "$metaDescription"."$line";
		     $line=<READFILE>;
		 }
	    $line=~/(.*)<br>/;
            $stmp=$1;
            $stmp=translate($stmp);
	    $metaDescription = "$metaDescription"."$stmp";
	}

	if ($line=~/^<b>HEADER_.*<\/b> META NAME: GENERATOR CONTENT:(.*)<br>/i){
            $stmp=$1;
            $stmp=translate($stmp);
	    $metaGenerator=$stmp;
	}
	if ($line=~/^<b>HEADER_.*<\/b> META Name: keywords Content:(.*)<br>/i){
            $stmp=$1;
            $stmp=translate($stmp);
	    $metaKeywords=$stmp;
	}

  }
}
#====================================================================

sub scripts {
  # Report every entry of the global @scripts.
  #
  # The HTML comment delimiters "<!--" and "-->" are removed from each
  # entry in place.  In text mode the numbered script text goes to the
  # report through display; in HTML mode nothing is displayed.  CORPXML
  # always receives a numbered placeholder element without the script text.
  my $count = 0;

  for my $body (@scripts) {
    $count++;
    $body =~ s/(?:<!--|-->)//g;
    $tmp = "SCRIPT_$count $body";
    unless ($HTMLReport) {
      &display;
    }
    print CORPXML "<SCRIPTCONTENT NUM=\"$count\" VALUE=\"INDISPONIBLE\"/>\n";
  }
}

#====================================================================

sub comment {
  # Parser handler for an HTML comment.  An optional leading ">" and any
  # white space at the start of the comment text are dropped; the rest is
  # written to the report through display (as a one-row table when
  # $HTMLReport is set) and, entity-decoded by translate, to CORPXML.
  my ($self, $comment) = @_;

  $comment =~ s/^>?\s*//;

  $tmp = $HTMLReport
    ? "<table border><tr><td align=\"center\" valign=\"center\"><b>COMMENT</b></td><td align = \"left\" valign=\"center\"><b>$comment</b></td></tr></table>"
    : "\nCOMMENT: $comment";
  &display;

  print CORPXML "<COMMENT>", &translate($comment), "</COMMENT>\n";

  }

#====================================================================

sub text {
  # Parser handler for a run of plain text between tags.
  #
  # Empty text is ignored.  Otherwise the words are counted by splitText,
  # the text element counter $txtEltNb is raised, and the text is written
  # to the report (REP, partly through display) and to CORPXML.  Text that
  # consists of white space only is reported as "Blank space", adds one
  # blank to $simple_text and is recorded as "TXT" through fill; any other
  # text is appended to $simple_text (entity-decoded first in text mode).
  my ($self, $text) = @_;
  return if ($text eq "");

  &splitText($text);
  $txtEltNb++;

  # opening part of the report entry
  if ($HTMLReport) {
    print REP "<table border><tr><td align=\"center\" valign=\"center\"><b>TEXT OBJECT</b></td><td align = \"left\" valign=\"center\">";
  }
  else {
    print REP "\nTEXT OBJECT:";
  }

  if ($text =~ m#^\s+$#s) {
    # only (not visible) space characters were found
    $tmp = $HTMLReport ? "<b>Blank space</b>" : " Blank space\n";
    &display;
    print CORPXML "<SITEFileTxtBrut TYPE=\"BLANKSPACE\"> </SITEFileTxtBrut>\n";
    $simple_text .= " ";
    &fill(*tags, "TXT");
    &fill(*tagAttrib, "TXT");
    &fill(*tagAttrVal, "TXT");
  }
  else {
    unless ($HTMLReport) {
      $text = &translate($text);
    }
    $tmp = $HTMLReport ? "<pre>$text</pre>" : " $text";
    &display;
    print CORPXML "<SITEFileTxtBrut>\n", &translate($text), "\n</SITEFileTxtBrut>\n";
    $simple_text .= "$text";
  }

  # closing part of the report entry (HTML only)
  print REP "</td></tr></table>" if ($HTMLReport);
}

#====================================================================

sub start {
  # Parser handler for a start tag.
  #
  #   $tag      element name as found by the parser
  #   $attr     reference to a HASH: attribute name => value
  #   $attrseq  reference to an ARRAY: attribute names in the order found
  #
  # Writes the tag and its attributes to the report (through display and
  # REP) and to CORPXML, and updates the global statistics: element
  # counter $eltNb, image counter $imgNb1, the per-element hashes
  # (%seenElt, %el3, %elements), the per-attribute hashes (%el4,
  # %elements2) and the per-attribute-value hashes (%el5, %elements3),
  # plus whatever fill records.  An <a> tag appends "<surf>" to
  # $simple_text; p, br, hr, li and body call write_text, and p also
  # calls elements_distribution (all defined elsewhere in this file).
  #
  # Attributes are listed in the order of "keys %$attr", not in document
  # order, and the attribute count is the length of @$attrseq.
  my($self, $tag, $attr, $attrseq) = @_;
  # $attr is reference to a HASH, $attrseq is reference to an ARRAY
  my $all_attributes = scalar(@$attrseq);
  # all attributes found
  my $numberof;
  # counter

  $tmp = "\nTAG OBJECT: $tag\nNUMBER OF ATTRIBUTES: $all_attributes";
  $tmp = "<table border><tr><td><b>TAG OBJECT</b><br><i>ELEMENT:</i> $tag<br><i>NUMBER OF ATTRIBUTES:</i> $all_attributes</td><td>" if ($HTMLReport);
  print CORPXML "<tagHTML TAGType=\"$tag\" NBATTR=\"$all_attributes\">BEGIN-$tag\n";
  if (lc($tag) eq 'a') { $simple_text .= "<surf>" ; }

  $eltNb++;
  $seenElt{uc($tag)}++;
  &fill(*tags, uc($tag));
  $el3{uc($tag)}++;
  $elements{uc($tag)}++;
  # Count images by tag name.  The count used to be taken from the HTML
  # form of $tmp and therefore stayed empty whenever a text report was
  # written.
  if (lc($tag) eq 'img') {
    $imgNb1++;
    }

  &display;

  if ($all_attributes == 0) {
    # such as in "<P>"
    $tmp="NO ATTRIBUTE-VALUE PAIRS";
    $tmp="<b>No Attribute-Value Pairs</b>" if ($HTMLReport);
    &display;
    }

  else {
    $tmp="ATTRIBUTES AND VALUES:";
    $tmp="<b>ATTRIBUTES AND VALUES:</b><br>" if ($HTMLReport);
    &display;

    foreach (keys (%$attr)) {
      $numberof++;
      $tmp="ATTRIBUTE_$numberof: $_; VALUE_$numberof: $$attr{$_}";
      $tmp="<table border><tr><td><i>Attribute_$numberof:</i></td><td>$_<br></td></tr><tr><td><i>Value_$numberof:</i></td><td>$$attr{$_}<br></td></tr></table>" if ($HTMLReport);
      # entity-decoded value, used for the XML output and the statistics
      my $tmp78 = &translate($$attr{$_});

      print CORPXML "<tagHTMLAttr TAG=\"$tag\" NUM=\"$numberof\" ATTRType=\"$_\" VALUE=\"$tmp78\"/>\n";
      # statistics key "TAG(ATTR)"
      my $tagAttr = "$tag"."\("."$_"."\)";
      &fill(*tagAttrib, uc($tagAttr)) ;
      $el4{uc($tagAttr)}++;
      $elements2{uc($tagAttr)}++;
      # statistics key "TAG(ATTR=VALUE)"
      $tagAttr = "$tag"."\("."$_"."\="."$tmp78"."\)";
      &fill(*tagAttrVal, uc($tagAttr)) ;
      $el5{uc($tagAttr)}++;
      $elements3{uc($tagAttr)}++;

      &display;
      }
    }

  if ((lc($tag) eq 'p') or (lc($tag) eq 'br') or (lc($tag) eq 'hr') or (lc($tag) eq 'li') or (lc($tag) eq 'body')) { &write_text; }

  if (lc($tag) eq 'p') { &elements_distribution ; }

  print REP "</td></tr></table>\n" if ($HTMLReport);
  print CORPXML "</tagHTML>\n";
}

#====================================================================

sub end {
  # Parser handler for an end tag.  The tag is written to CORPXML and,
  # through display, to the report.  A closing </a> appends "</surf>" to
  # $simple_text; a closing </p> or </tr> calls write_text (defined
  # elsewhere in this file) before the report line is displayed.
  my ($self, $tag) = @_;
  my $lower = lc($tag);

  # this gives the end tag
  if ($HTMLReport) {
    $tmp = "<table border><tr><td><b>TAG OBJECT</b><br><i>ELEMENT:</i> /$tag</td></tr></table>";
  }
  else {
    $tmp = "\nTAG OBJECT: /$tag";
  }
  print CORPXML "<tagHTML TAGType=\"$tag\">END-$tag</tagHTML>\n";

  if ($lower eq 'a') {
    $simple_text .= "</surf>";
  }
  elsif ($lower eq 'p' or $lower eq 'tr') {
    &write_text;
  }
  &display;
}

#====================================================================

sub links {
  # Classifies every entry of @links, emits one report line per link through
  # &display (plus a <tagHTML> record on CORPXML for HTML reports) and updates
  # the link/image counters and the %elements / %el3 frequency tables.
  #
  # Reads:  @links (entries of the form "TAG ATTR = TARGET"), $HTMLReport,
  #         $ServerRoot.
  # Writes: $tmp, $linkToPrint, $reference, the *Nb counters, %elements, %el3.
  #
  # Fixes relative to the previous version:
  #  - the two "file:" branches appended nothing but *replaced* $tmp, losing
  #    the "LINK_n " prefix, so those links were never counted;
  #  - the cgi-bin branch wrote an empty TYPELink and a label no counter
  #    pattern could match;
  #  - the NEWS/FTP/GOPHER counter patterns looked for "(HTTP_NEWS)" etc.
  #    while the labels actually produced are "(NEWS)", "(FTP)", "(GOPHER)";
  #  - external images were tallied in %elt3 instead of %el3;
  #  - "[INTERNAL]+" / "[EXTERNAL]+" were character classes, not words.

  my $tag;
  my $attr;
  my $link;
  my $numberof;

  # One row per label: scope, kind, overall counter, per-category counter.
  # "INTERNAL (FILE)" and "EXTERNAL (ANCHOR)" are not produced by any branch
  # below; their rows are kept so the counters stay wired as before.
  my @tally = (
    [ 'INTERNAL', 'IMAGE',        \$imgNb,  \$imgIntNb          ],
    [ 'EXTERNAL', 'IMAGE',        \$imgNb,  \$imgExtNb          ],
    [ 'INTERNAL', 'FILE',         \$linkNb, \$intLinkNb         ],
    [ 'EXTERNAL', 'HYPERTEXTUAL', \$linkNb, \$extHypertextLinkNb ],
    [ 'INTERNAL', 'HYPERTEXTUAL', \$linkNb, \$intHypertextLinkNb ],
    [ 'INTERNAL', 'HTMLFILE',     \$linkNb, \$intHtmlFileLinkNb ],
    [ 'INTERNAL', 'DOCFILE',      \$linkNb, \$intDocFileLinkNb  ],
    [ 'INTERNAL', 'ANCHOR',       \$linkNb, \$intAncNb          ],
    [ 'EXTERNAL', 'ANCHOR',       \$linkNb, \$extAncNb          ],
    [ 'EXTERNAL', 'MAILTO',       \$linkNb, \$extMailNb         ],
    [ 'EXTERNAL', 'HTTP',         \$linkNb, \$extLinkNb         ],
    [ 'EXTERNAL', 'HTTP_CGI',     \$linkNb, \$extCgiLinkNb      ],
    [ 'EXTERNAL', 'NEWS',         \$linkNb, \$extNewsLinkNb     ],
    [ 'EXTERNAL', 'FTP',          \$linkNb, \$extFtpLinkNb      ],
    [ 'EXTERNAL', 'GOPHER',       \$linkNb, \$extGopherLinkNb   ],
  );

  for (@links) {
    # entries that do not look like "TAG ATTR = TARGET" are ignored
    next unless (m#^(\w+) (\w+) = (.+)#i);
    $tag = $1;
    $attr = $2;
    $link = $3;

    $numberof++;
    # counts the selected links
    $tmp="LINK_$numberof ";
    $linkToPrint = "<tagHTML  TAGType=\"LINK\" NUM=\"$numberof\"";

    # link class; the order of the tests matters (first match wins)
    if ($link =~ /mailto\s*:/i) {
      # test for MAILTO links
      $tmp .= "EXTERNAL (MAILTO)";
      $linkToPrint .= " TYPELink=\"EXTERNAL_MAILTO\"";
      }
    elsif ($link =~ /gopher\s*:/i) {
      # test for external GOPHER links
      $tmp .= "EXTERNAL (GOPHER)";
      $linkToPrint .= " TYPELink=\"EXTERNAL_GOPHER\"";
      }
    elsif ($link =~ /news\s*:/i) {
      # test for external NEWS links
      $tmp .= "EXTERNAL (NEWS)";
      $linkToPrint .= " TYPELink=\"EXTERNAL_NEWS\"";
      }
    elsif ($link =~ /ftp\s*:/i) {
      # test for external FTP links
      $tmp .= "EXTERNAL (FTP)";
      $linkToPrint .= " TYPELink=\"EXTERNAL_FTP\"";
      }
    elsif (($tag =~ /img/i) && ($link =~ /^http/i)) {
      # test for external IMAGE links
      $tmp .= "EXTERNAL (IMAGE)";
      $linkToPrint .= " TYPELink=\"EXTERNAL_IMAGE\"";
      }
    elsif ($link =~ /cgi-bin/i) {
      # test for CGI links
      $tmp .= "EXTERNAL (HTTP_CGI)";
      $linkToPrint .= " TYPELink=\"EXTERNAL_HTTP_CGI\"";
      }
    elsif ($link =~ /http\s*:/i) {
      # test for external HTTP links
      $tmp .= "EXTERNAL (HTTP)";
      $linkToPrint .= " TYPELink=\"EXTERNAL_HTTP\"";
      }
    elsif (($tag =~ /link/i) && ($link =~ /^http/i)) {
      # test for external HYPERTEXTUAL links
      $tmp .= "EXTERNAL (HYPERTEXTUAL)";
      $linkToPrint .= " TYPELink=\"EXTERNAL_HYPERTEXTUAL\"";
      }
    elsif (($link =~ /file\s*:/i)&&($link =~ /html?$/i)) {
      # test for internal HTMLFILE links (file: URL, no $ServerRoot prefix)
      $tmp .= "INTERNAL (HTMLFILE)";
      $linkToPrint .= " TYPELink=\"INTERNAL_HTMLFILE\"";
      }
    elsif ($link =~ /file\s*:/i) {
      # test for internal DOCFILE links (file: URL, no $ServerRoot prefix)
      $tmp .= "INTERNAL (DOCFILE)";
      $linkToPrint .= " TYPELink=\"INTERNAL_DOCFILE\"";
      }
    elsif (($tag =~ /img/i) || ($attr =~ /BACKGROUND/i)) {
      # test for internal IMAGE links
      $tmp .= "INTERNAL (IMAGE)";
      $linkToPrint .= " TYPELink=\"INTERNAL_IMAGE\"";
      $link = $ServerRoot.$link;
      }
    elsif ($tag =~ /link/i) {
      # test for internal HYPERTEXTUAL links
      $tmp .= "INTERNAL (HYPERTEXTUAL)";
      $linkToPrint .= " TYPELink=\"INTERNAL_HYPERTEXTUAL\"";
      $link = $ServerRoot.$link;
      }
    elsif ($link =~ /\#/) {
      # test for internal ANCHORS links
      if ($link =~ /^\#/) {
        # anchor inside the current page: not reported, not numbered
        $numberof--;
        next;
        }
      $tmp .= "INTERNAL (ANCHOR)";
      $linkToPrint .= " TYPELink=\"INTERNAL_ANCHOR\"";
      $link = $ServerRoot.$link;
      }
    elsif ($link =~ /html?$/) {
      # test for internal HTML files
      $tmp .= "INTERNAL (HTMLFILE)";
      $linkToPrint .= " TYPELink=\"INTERNAL_HTMLFILE\"";
      $link = $ServerRoot.$link;
      }
    else {
      # some internal link
      $tmp .= "INTERNAL (DOCFILE)";
      $linkToPrint .= " TYPELink=\"INTERNAL_DOCFILE\"";
      $link = $ServerRoot.$link;
      }

    $reference = $link;
    # navigation under windows: a drive-letter root needs a file:/ prefix
    $reference = "file:/$reference" if (($link !~ /^http:/)&&($ServerRoot =~ m#^\w:/#));
    $link = &PrintFile($link);

    unless ($HTMLReport) {
      $tmp .= " $tag $attr: $link";
      }
    else {
      $tmp .= "</b> $tag $attr: <a href=$reference>$link</a><br>";
      $linkToPrint .= " TAG=\"$tag\"/>\n";
      print CORPXML $linkToPrint;
      }
    print REP "<b>"if ($HTMLReport);
    &display;

    # Statistics are derived from the finished report line.
    # NOTE(review): the pattern needs "</b>" right after the label, which is
    # how the HTML form of $tmp is built above; the plain-text form appends
    # " TAG ATTR: LINK" first, so nothing is tallied there unless &display
    # rewrites $tmp - confirm this is intended.
    my $line ="<b>".$tmp."<\/b>";

    for my $row (@tally) {
      my ($scope, $kind, $overall, $category) = @$row;
      next unless ($line =~ m{^<b>LINK_.* (\Q$scope\E \(\Q$kind\E\))</b>}i);
      my $label = $1;
      $$overall++;
      $$category++;
      $elements{$label}++;
      $el3{$label}++;
      }
    }
  }

#====================================================================

sub fill {
  # Appends one value to the array that lives in the typeglob given as first
  # argument, e.g. &fill(*tagAttrib, $value). Returns what push returns.
  my ($target, $value) = @_ ;
  local *array = $target ;   # alias @array to the caller's array
  push @array, $value ;
  }

#====================================================================

sub write_text {
  # Flushes the text collected in $simple_text:
  #  - runs it through &translate,
  #  - unwraps every <surf>...</surf> span (anchor text) and remembers it,
  #  - removes anchor texts that look like addresses or navigation labels
  #    and logs them to TRASH as "$filename: text",
  #  - prints the remaining text to CORPTXT unless it is empty or blanks only,
  #  - resets $simple_text and updates the three n-gram tables from
  #    @tags, @tagAttrib and @tagAttrVal.
  # $count is incremented once per anchor text seen.

  $simple_text = &translate($simple_text) ;
  while ($simple_text =~ s#<surf>(.*?)<\/surf>#$1#s) {
    push @surf_words, $1 ;
    }

  if (@surf_words) {

    foreach (@surf_words) {
      $count++ ;
      # NOTE(review): "prcdent" looks like a French word that lost its
      # accented letters somewhere - confirm against the intended word list.
      if (/(?:\@|http\:|www|suivant|prcdent|retour|revenir|sommaire|menu|accueil|next|back|home)/i) {
        # The anchor text comes from the analysed page, so it must be matched
        # literally: unescaped it could hold regex metacharacters ("C++",
        # an unbalanced "(") that abort the run or delete the wrong text.
        $simple_text =~ s#\Q$_\E##s ;
        print TRASH "$filename: $_\n" ;
        }
      }
    undef (@surf_words) ;
    }
  unless ($simple_text =~ /^ *$/) { print CORPTXT "$simple_text\n" ; }

  $simple_text = "" ;
  &elements_distribution(*el3by_file, *tags) ;
  &elements_distribution(*el4by_file, *tagAttrib) ;
  &elements_distribution(*el5by_file, *tagAttrVal) ;

  } # sub write_text

#=====================================================================

sub elements_distribution {
  # Counts runs of consecutive elements (n-grams).
  #   first argument  - typeglob of the hash that receives the counts
  #   second argument - typeglob of the array holding the element sequence
  # For every start position, the runs of 2 up to 6 consecutive elements
  # beginning there are joined with "_" and their count in the hash is
  # incremented. The array is emptied afterwards. Always returns 1.
  #
  # The loop variables are lexical: they used to be the package globals
  # $i, $j and $max, which every call overwrote for the whole program.

  local (*by_file, *elems) = @_ ;

  my $last = $#elems ;
  for my $start (0 .. $last - 1) {
    my $segment = $elems[$start] ;

    # a run never extends more than five elements past its start
    my $stop = $start + 5 ;
    if ($stop > $last) { $stop = $last ; }

    for my $next ($start + 1 .. $stop) {
      $segment .= "_$elems[$next]" ;
      $by_file{$segment}++ ;
      }
    }
  undef (@elems) ;
  return 1 ;
  }

 #=====================================================================

sub elements {
  # reports all components of a web page
  #
  # Works on the globals set up by the caller: $filename (the page), and the
  # data read by the helpers called below (&headers, &scripts, the parser
  # callbacks, &links). Nothing happens unless $ReportFiles is true and
  # $filename ends in "htm" or "html".
  #
  # Output handles used: REP (the per-file report, opened and closed here),
  # CORPXML, CORPTXT, INDEXFILE, TRACEf and TRA (expected to be open already).
  if (($ReportFiles)&&($filename =~ m#html?$#)) {
    
    my $self = bless { '_buf','','_netscape_comment',0};
    # the object constructor
    # (one-argument bless: the object belongs to the current package, whose
    # parse/eof methods are called further down)

    $filenumber++;
    print "file $filenumber\n";
    s#^file:/##;
    # deletes what we added for navigation under windows
    # NOTE(review): this substitution works on $_, not on $filename - confirm
    # which variable the caller expects to be stripped.
	
    # split the file name into its directory and base name
    if ($filename =~ m#^(.+/)([^/]+)$#){
      $ServerRoot = "$1";
      # new: replaces "-root"
      $name = $2;
      $report = "($name";
      $report=~ s#\.s?html?$#\)\.txt#;
      # the name of the report file
      $ServerRootExpr = $ServerRoot;
      $ServerRootExpr =~ s#(\W)#\\$1#g;  # escape regexp chars      
    }
    # NOTE(review): when $filename contains no "/", $name and $report keep
    # the values left by the previous call.
    $report =~ s#txt$#html# if ($HTMLReport);
	
    unless (open(REP, ">$path_to_results"."rep$filenumber$report")){
      # the results file
      print TRA "sub elements warn:\n   Could not open 'rep$filenumber$report'\. $!\n\n";
      return;
    }
   
    # report header; the HTML form also opens the per-file corpus records,
    # resets the per-file frequency tables and adds a line to the index file
    unless ($HTMLReport) {
      print REP "Document Analysis Results\nAnalysis of: $filename\nby Calin MOSUT\n\n\n",'-'x(length($name)+29),"\nFormatted MIME Headers from: $name\n",'-'x(length($name)+29),"\n\n";
    }
    else {
      print REP "<html><head><title>Document Analysis Results</title></head><body><h1><center>Document Analysis Results</center></h1><b>Analysis of: </b>$filename<br><b>by Calin MOSUT</b><hr><p><h2>\tFormatted MIME Headers from: $name</h2><br>\n";
      print CORPXML "<SITEFile>\n<SITEFileName>",$filename,"</SITEFileName>\n<SITEReportFileName>",&PrintFile($filename),"</SITEReportFileName>\n";
      print CORPTXT "<SITEFile>\n<SITEFileName>",$filename,"</SITEFileName>\n";

      %seen2 = () ;
      %el3 = () ;
      %el4 = () ;
      %el5 = () ;

      # $fileNumber (capital N) is a different counter from $filenumber:
      # it only advances for HTML reports and numbers the index entries
      $fileNumber++;
	formline($print_subform2, "$rootSite-$fileNumber",$filename);
      print INDEXFILE $ACCUMULATOR, "\n";
      $ACCUMULATOR = '';
    }
    &headers;
    # scripts section, only when the first entry of @scripts is not empty
    unless ($scripts[0] eq "") {
      unless ($HTMLReport) {
        print REP "\n\n\n",'-'x(length($name)+14),"\nScripts from: $name\n",'-'x(length($name)+14),"\n\n";
      }
      else {
        print REP "<p><h2>Scripts from: $name</h2><br>This section can be displayed safely only in .txt form\n";
      }
      &scripts;
    }
    unless ($HTMLReport) {
      print REP "\n\n\n",'-'x(length($name)+13),"\nElements of: $name\n",'-'x(length($name)+13),"\n\n";  
    }
    else {
      print REP "<p><h2>Elements of: $name</h2><br>\n";
    }
    # run the parser over the document text held in $elements
    $self->parse("$elements");
    $self->eof;

    # links section, only when the first entry of @links is not empty
    unless ($links[0] eq "") {
      unless ($HTMLReport) {
        print REP "\n\n\n",'-'x(length($name)+12),"\nLinks from: $name\n",'-'x(length($name)+12),"\n\n";
      }
      else { 
        print REP "<p><h2>Links from: $name</h2><br>\n";
      }
      &links;
    }

    print REP "</body></html>\n" if ($HTMLReport);
    # NOTE(review): the closing </SITEFile> records are written in both
    # report modes, while the opening ones above are HTML-report only.
    print CORPXML "</SITEFile>\n";
    print CORPTXT "\n</SITEFile>\n";

    # --- per-file statistics written to the trace file ---
    print TRACEf "<TAGS>\n"; 
    print TRACEf "<SITE>$rootSite</SITE>\n<PAGE>$filename</PAGE>\n";

    # frequencies from %el3, most frequent first
    print TRACEf "<ELEMENTS>\n";
    foreach $el (sort by_count4 keys %el3) {
      print TRACEf "<ITEM>$el</ITEM><FRQ>$el3{$el}</FRQ>\n";
      }
    print TRACEf "</ELEMENTS>\n";

    # n-grams from %el3by_file: those seen at least twice are printed and
    # added to the site-wide totals; every entry is then removed so the
    # table is empty for the next file
    print TRACEf "<ELEMENTS DISTRIBUTION>\n" ;
    foreach (sort { $el3by_file{$b} <=> $el3by_file{$a} or $a cmp $b ; }(keys (%el3by_file))) {
      if ($el3by_file{$_} >= 2) {
        print TRACEf "<ITEM>$_</ITEM><FRQ>$el3by_file{$_}</FRQ>\n";
        if (exists ($el3by_site{$_})) {
          $el3by_site{$_} = $el3by_site{$_} + $el3by_file{$_} ;
          }
        else { $el3by_site{$_} = $el3by_file{$_} ; }
        }
      delete ($el3by_file{$_}); 
      }
    print TRACEf "</ELEMENTS DISTRIBUTION>\n" ;
 
    # same two steps for %el4 / %el4by_file
    print TRACEf "<ELEMENTS_ATTR>\n"; 
    foreach $el (sort by_count5 keys %el4) {
      print TRACEf "<ITEM>$el</ITEM><FRQ>$el4{$el}</FRQ>\n";
      }
    print TRACEf "</ELEMENTS_ATTR>\n";

    print TRACEf "<ELEMENTS_ATTR DISTRIBUTION>\n" ;
    foreach (sort {$el4by_file{$b} <=> $el4by_file{$a} or $a cmp $b ;}(keys (%el4by_file))) {
      if ($el4by_file{$_} >= 2) {
        print TRACEf "<ITEM>$_</ITEM><FRQ>$el4by_file{$_}</FRQ>\n";
        if (exists ($el4by_site{$_})) {
          $el4by_site{$_} = $el4by_site{$_} + $el4by_file{$_} ;
          }
        else { $el4by_site{$_} = $el4by_file{$_} ; }
        }
      delete ($el4by_file{$_}); 
      }
    print TRACEf "</ELEMENTS_ATTR DISTRIBUTION>\n" ;

    # and for %el5 / %el5by_file
    print TRACEf "<ELEMENTS_ATTRVALUE>\n"; 
    foreach $el (sort by_count6 keys %el5) {
      print TRACEf "<ITEM>$el</ITEM><FRQ>$el5{$el}</FRQ>\n";
      }
    print TRACEf "</ELEMENTS_ATTRVALUE>\n";

    print TRACEf "<ELEMENTS_ATTRVALUE DISTRIBUTION>\n" ;
    foreach (sort {$el5by_file{$b} <=> $el5by_file{$a} or $a cmp $b ;} (keys (%el5by_file))) {
      if ($el5by_file{$_} >= 2) {
        print TRACEf "<ITEM>$_</ITEM><FRQ>$el5by_file{$_}</FRQ>\n";
        if (exists ($el5by_site{$_})) {
          $el5by_site{$_} = $el5by_site{$_} + $el5by_file{$_} ;
          }
        else { $el5by_site{$_} = $el5by_file{$_} ; }
        }
      delete ($el5by_file{$_}); 
      }
    print TRACEf "</ELEMENTS_ATTRVALUE DISTRIBUTION>\n" ;

    print TRACEf "</TAGS>\n"; 

    # frequencies from %seen2; $tmpc counts the distinct keys, $tmpd sums
    # their frequencies
    print TRACEf "<WORDS>\n"; 
    print TRACEf "<SITE>$rootSite</SITE>\n<PAGE>$filename</PAGE>\n"; 
    $tmpc=0;
    $tmpd=0;
    foreach $word (sort by_count2 keys %seen2) {
      $tmpc++;
      $tmpd+=$seen2{$word};
      print TRACEf "<ITEM>$word</ITEM><FRQ>$seen2{$word}</FRQ>\n";
      }

    print TRACEf "<TOTALFORM>$tmpc</TOTALFORM>\n";
    print TRACEf "<TOTALOCCUR>$tmpd</TOTALOCCUR>\n";
    print TRACEf "</WORDS>\n"; 

    # hand the report name and the analysed file to AddedToList
    &AddedToList(*ReportFiles, "rep$filenumber$report", "$filename");
    close REP;
    # the end of the work session
  } 
} # sub elements

#=====================================================================

sub by_count {
    # Sort comparator: keys of %seen by descending count. Keys with equal
    # counts are ordered alphabetically so the result no longer depends on
    # hash iteration order (same tie-break as the inline sorts in elements).
    return $seen{$b} <=> $seen{$a} || $a cmp $b;
}
#=====================================================================

sub by_count2 {
    # Sort comparator: keys of %seen2 by descending count. Keys with equal
    # counts are ordered alphabetically so the result no longer depends on
    # hash iteration order.
    return $seen2{$b} <=> $seen2{$a} || $a cmp $b;
}
#=====================================================================

sub by_count3 {
    # Sort comparator: keys of %elements by descending count. Keys with
    # equal counts are ordered alphabetically so the result no longer depends
    # on hash iteration order.
    return $elements{$b} <=> $elements{$a} || $a cmp $b;
}
#=====================================================================

sub by_count4 {
    # Sort comparator: keys of %el3 by descending count. Keys with equal
    # counts are ordered alphabetically so the result no longer depends on
    # hash iteration order.
    return $el3{$b} <=> $el3{$a} || $a cmp $b;
}
#=====================================================================

sub by_count5 {
    # Sort comparator: keys of %el4 by descending count. Keys with equal
    # counts are ordered alphabetically so the result no longer depends on
    # hash iteration order.
    return $el4{$b} <=> $el4{$a} || $a cmp $b;
}
#=====================================================================

sub by_count6 {
    # Sort comparator: keys of %el5 by descending count. Keys with equal
    # counts are ordered alphabetically so the result no longer depends on
    # hash iteration order.
    return $el5{$b} <=> $el5{$a} || $a cmp $b;
}
#=====================================================================

sub by_count7 {
    # Sort comparator: keys of %elements2 by descending count. Keys with
    # equal counts are ordered alphabetically so the result no longer depends
    # on hash iteration order.
    return $elements2{$b} <=> $elements2{$a} || $a cmp $b;
}
#=====================================================================

sub by_count8 {
    # Sort comparator: keys of %elements3 by descending count. Keys with
    # equal counts are ordered alphabetically so the result no longer depends
    # on hash iteration order.
    return $elements3{$b} <=> $elements3{$a} || $a cmp $b;
}

# This is the last line of the webxref script really.
# If this line is missing
