#!/usr/local/bin/perl
# Use -*- perl -*- mode
#
# We search for files named robots.txt in the main document tree and
# user public_html directories, read each file in turn, and collect the
# Disallow directives for every User-agent named.  At the end we
# write a new /robots.txt .
#
# We check the general syntax and authority of lines and complain to stderr.
#
# Beware: this overwrites your /robots.txt !!!!
# First copy /robots.txt to /robots.txt.input !!!!
#
# Original source: <URL:http://www.iki.fi/hyvatti/sw/makerobots.perl>
# Author: Jaakko Hyvätti <Jaakko.Hyvatti@iki.fi>
# Copyright: GNU General Public Licence

# Site defaults.  All three may be overridden from the command line:
#   makerobots [docroot [homedir [servername]]]
$root   = "/usr/local/etc/httpd/htdocs";
$home   = "/home";
$server = "www.my.domain";
$debug  = 0;		# when true, annotate each Disallow with its origin

$root   = $ARGV[0] if @ARGV > 0;
$home   = $ARGV[1] if @ARGV > 1;
$server = $ARGV[2] if @ARGV > 2;

# Read one robots.txt file and merge its Disallow lines into the global
# %disallow table, keyed by User-agent.
#
#   $fname        - path of the robots.txt file to read
#   $removeprefix - filesystem prefix to strip from its directory
#   $addprefix    - URL prefix substituted for $removeprefix
#
# A file may only disallow URLs under its own directory (after the
# prefix rewrite); unauthorized lines and syntax errors are reported
# to stderr and ignored.
sub readfile {
    my ($fname, $removeprefix, $addprefix) = @_;

    # URL prefix this file has authority over: its own directory with
    # the filesystem prefix rewritten to the corresponding URL prefix.
    my $prefix = $fname;
    $prefix =~ s,[^/]*$,,;		# strip file name, keep directory
    if (substr ($prefix, 0, length ($removeprefix)) eq $removeprefix) {
	$prefix = $addprefix . substr ($prefix, length ($removeprefix));
    } else {
	die "Internal error:$fname,$removeprefix,$addprefix.";
    }

    # Three-argument open with a lexical handle; the old two-argument
    # form would let a crafted file name alter the open mode.
    open (my $fh, '<', $fname) || die "Open $fname";
    # Current record's User-agent.  Lexical, so state can no longer
    # leak from one file into the next when a file does not end with
    # a blank line (the old global $useragent did leak).
    my $useragent;
    while (<$fh>) {
	my $comment = "";
	$comment = $1 if s/\#(.*)$//;	# strip and store comments
	s/\s+$//;			# strip trailing spaces (and newline)
	unless ($_) {			# empty line separates records
	    undef $useragent;
	} elsif (/^User-agent\s*:?\s*(.*)/i) { # flexible matching of keywords
	    print STDERR "$fname:$.:warning:no empty line between records\n"
		if defined $useragent;
	    # Capture group instead of $', which slows every regex in
	    # the program; length() so a robot named "0" is not dropped.
	    if (length $1) {
		$useragent = $1;
	    } else {
		undef $useragent;
		print STDERR "$fname:$.:empty User-agent\n";
	    }
	} elsif (/^Disallow\s*:?\s*(.*)/i) { # flexible matching of keywords
	    my $path = $1;
	    if (defined $useragent) {
		# Authority check: only paths under this file's own
		# URL prefix may be excluded here.
		if ($prefix eq substr ($path, 0, length ($prefix))) {
		    if ($debug) {
			$disallow{$useragent} .=
			    "Disallow: $path\t# $fname:$. $comment\n";
		    } else {
			$disallow{$useragent} .= "Disallow: $path\n";
		    }
		} else {
		    print STDERR "$fname:$.:unauthorized line \"$_\"\n";
		}
	    } else {
		print STDERR "$fname:$.:\"$_\" outside a record\n";
	    }
	} else {
	    print STDERR "$fname:$.:syntax error in \"$_\"\n";
	}
    }
    close ($fh) || die "Close $fname";
}

# Gather Disallow records from every robots.txt we are willing to trust.

# The hand-maintained master file in the document root goes first.
&readfile ("$root/robots.txt.input", $root, "");

# robots.txt files in subdirectories of the document tree.
if ($root && -d $root) {
    # List-form pipe open runs find without a shell, so a $root
    # containing spaces or shell metacharacters cannot inject commands
    # (the old 'open (P, "find $root ...|")' could).
    open (my $find, "-|", "find", $root,
	  "-mindepth", "2", "-type", "f", "-name", "robots.txt")
	|| die "open find";
    while (<$find>) {
	chomp;
	&readfile ($_, $root, "");
    }
    close ($find);
}

# robots.txt files in user public_html directories.  Scanning the
# directory ourselves replaces the old shell pipeline through ls,
# which both interpolated $home into a shell command and then parsed
# the paths back out of ls's output with an unquoted regex.
if ($home && -d $home) {
    opendir (my $dh, $home) || die "opendir $home";
    foreach my $user (sort (readdir ($dh))) {	# sort for reproducible order
	next if $user eq '.' || $user eq '..';
	my $f = "$home/$user/public_html/robots.txt";
	&readfile ($f, "$home/$user/public_html/", "/~$user/") if -f $f;
    }
    closedir ($dh);
}

# Write the combined robots.txt.  Build it under a temporary name and
# rename() it into place, so readers never see a half-written file.
open (my $out, '>', "$root/robots.txt.tmp")
    || die "Open $root/robots.txt.tmp for write";

# Use $server in the advertised URL; the old code hardcoded
# "www.my.domain" even though $server is configurable from argv.
print $out "# This is <URL:http://$server/robots.txt>.  See\n";
print $out "# <URL:http://info.webcrawler.com/mak/projects/robots/norobots.html>\n";
print $out "# for more information about robot exclusion standard.\n";
print $out "#\n";
print $out "# This file is automatically generated using script\n";
print $out "# <URL:http://www.iki.fi/hyvatti/sw/makerobots.perl>\n";
print $out "# from: Jaakko Hyvätti <Jaakko.Hyvatti\@iki.fi>\n\n";

# Sorted keys make successive runs produce identical files (each()
# iterates in hash order, which varies between runs).
foreach my $useragent (sort (keys (%disallow))) {
    print $out "User-agent: $useragent\n$disallow{$useragent}\n"
	unless "*" eq $useragent;
}

# Write "User-agent: *" last, for beauty
print $out "User-agent: *\n" . $disallow{"*"} . "\n" if defined $disallow{"*"};
print $out "# eof\n";

# close() on a write handle must be checked: buffered write errors
# (e.g. disk full) only surface here.
close ($out) || die "Close $root/robots.txt.tmp";

rename ("$root/robots.txt.tmp", "$root/robots.txt") || die "rename temp";

