#!/usr/bin/perl -w

# $HOME/.corpus should contain settings for these values:
#
# corpus=/home/corpus/corpus
# tree=/home/corpus/cvs/spamassassin
# tmp=/home/corpus/tmp
# tagtime=/home/corpus/log/tagtime
# html=/home/html/root/users/corpus
# username=joe
# password=xyzzy

use strict;
use POSIX qw(nice);

nice(15);

my $configuration = "$ENV{HOME}/.corpus";
my %opt;
my $revision = "unknown";
my %revision;
my @files;
my $skip = '';

&configure;
&version;
&init;
&update;
&locate;
&rename;
&current;
&clean_up;

sub version {
    my $line;
    if (open(TESTING, "$opt{tree}/rules/70_cvs_rules_under_test.cf")) {
	chomp($line = <TESTING>);
	if ($line =~ m/.*Revision:\s*(\S+).*/) {
	    $revision = $1;
	}
	close(TESTING);
    }
}

sub configure {
    # does rough equivalent of source
    open(C, $configuration) || die "open failed: $configuration: $!\n";
    while(<C>) {
	chomp;
	s/#.*//;
	if (/^(.*?)\s*=\s*(.*)/) {
	    $opt{$1} = $2;
	}
    }
    close(C);
}

sub clean_up {
    system "rm -f $opt{tmp}/*.$$";
}

sub init {
    $SIG{INT} = \&clean_up;
    $SIG{TERM} = \&clean_up;

    $ENV{RSYNC_PASSWORD} = $opt{password};
    $ENV{TIME} = '%e,%U,%S';
    $ENV{TZ} = 'UTC';
}

sub update {
    chdir $opt{corpus};
    system "rsync -CPcvuzbt --timeout=60 $opt{username}" . '@rsync.spamassassin.org::corpus/* .';
}

sub locate {
    chdir "$opt{tree}/masses";
    opendir(CORPUS, $opt{corpus});
    @files = sort readdir(CORPUS);
    closedir(CORPUS);

    @files = grep { /^(?:spam|nonspam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
    @files = grep {
	my $time = 0;
	my $tag = 0;
	$revision{$_} = "unknown";
	open(FILE, "$opt{corpus}/$_");
	while (my $line = <FILE>) {
	    last if $line !~ /^#/;
	    $time++ if $line =~ /\b(?!08)\d\d:\d\d:\d\d\b/;
	    $tag++ if $line =~ /CURRENT_CORPORA_SUBMIT_VERSION/;
	    $tag++ if $line =~ /CURRENT_CORPORA_WEEKLY_VERSION/ && /-net-/;
	    $revision{$_} = $1 if $line =~ m/CVS revision:\s*(\S+)/;
	}
	close(FILE);
	if (!$time) {
	    $skip .= "# skipped $_: time is between 0800 UTC and 0900 UTC\n";
	}
	if (!$tag) {
	    $skip .= "# skipped $_: tag not CURRENT_CORPORA_SUBMIT_VERSION\n";
	}
	($time && $tag);
    } @files;
}

sub rename {
    use File::Copy qw(move);

    my $hour = (gmtime(time))[2];
    if ($hour == 9) {
	chdir $opt{html};
	opendir(HTML, $opt{html});
	my @html = readdir(HTML);
	closedir(HTML);
	@html = grep { -f } @html;
	for (@html) {
	    move($_, "last/$_");
	}
    }
}

sub sort_all {
    my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
    my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);

    $a2 ||= '';
    $b2 ||= '';
    my $n = ($a1 cmp $b1) || ($a2 cmp $b2);
    $n -= 1000 if $a =~ /^OVERALL/;
    $n += 1000 if $b =~ /^OVERALL/;
    $n -= 100 if $a1 =~ /^\(all messages\)/;
    $n += 100 if $b1 =~ /^\(all messages\)/;
    $n -= 10 if $a1 =~ /^\(all messages as \%\)/;
    $n += 10 if $b1 =~ /^\(all messages as \%\)/;
    return $n;
}

sub current {
    for my $class ("DETAILS", "HTML", "NET") {
	for my $age ("new", "all", "1day", "2day", "7day") {
	    my @ham = grep { /^(?:nonspam|ham)/ } @files;
	    my @spam = grep { /^spam/ } @files;

	    chdir $opt{corpus};

	    next if ($class eq "NET" && $age !~ /^(?:new|all|7day)$/);

	    # net vs. local
	    my @ham_net = grep { /-net-/ } @ham;
	    my @spam_net = grep { /-net-/ } @spam;
	    if ($class eq "NET") {
		@ham = @ham_net;
		@spam = @spam_net;
	    }
	    else {
		# remove duplicates
		for my $net (@ham_net) {
		    @ham = grep { $_ ne $net } @ham;
		}
		for my $net (@spam_net) {
		    @spam = grep { $_ ne $net } @spam;
		}
	    }
	    
	    # age
	    if ($class eq "NET" && $age ne "7day") {
		@ham = grep { -M "$_" < 10 } @ham;
		@spam = grep { -M "$_" < 10 } @spam;
		# find most recent CVS revision
		my $wanted = 0.0;
		for (@spam, @ham) {
		    $wanted = $revision{$_} if ($revision{$_} > $wanted);
		}
		@spam = grep { $revision{$_} eq $wanted } @spam;
		@ham = grep { $revision{$_} eq $wanted } @ham;
	    }
	    elsif ($age =~ /^(?:new|all)$/) {
		@ham = grep { -M "$_" < -M $opt{tagtime} } @ham;
		@spam = grep { -M "$_" < -M $opt{tagtime} } @spam;
		@ham = grep { $revision{$_} eq $revision } @ham;
		@spam = grep { $revision{$_} eq $revision } @spam;
	    }
	    elsif ($age =~ /(\d+)day/) {
		my $mtime = $1;
		@ham = grep { -M "$_" < $mtime } @ham;
		@spam = grep { -M "$_" < $mtime } @spam;
	    }
	    
	    open(OUT, "> $opt{html}/$class.$age");
	    print OUT "# ham results used: " . join(" ", @ham) . "\n";
	    print OUT "# spam results used: " . join(" ", @spam) . "\n";
	    for (@ham) {
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
	    }
	    for (@spam) {
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
	    }

	    my $flags = "";
	    $flags = "-t net -s 1" if $class eq "NET";
	    $flags = "-M HTML_MESSAGE" if $class eq "HTML";

	    if ($age eq "all") {
		my %spam;
		my %ham;
		my @output;
		
		for my $file (@spam) {
		    $spam{$1} = $file if ($file =~ m/-(\w+)\.log$/);
		}
		for my $file (@ham) {
		    $ham{$1} = $file if ($file =~ m/-(\w+)\.log$/);
		}
		unlink "$opt{tmp}/ham.log.$$";
		unlink "$opt{tmp}/spam.log.$$";
		next unless (scalar keys %spam && scalar keys %ham);
		for my $user (sort keys %spam) {
		    next unless defined $ham{$user};
		    chdir "$opt{tree}/masses";
		    system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$");
		    system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$");
		    open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |");
		    while(<IN>) {
			chomp;
			push @output, "$_:$user\n";
		    }
		    close(IN);
		}
		open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
		while(<IN>) {
		    push @output, $_;
		}
		close(IN);
		for (sort sort_all @output) {
		    print OUT $_;
		}
	    }
	    elsif (@ham && @spam) {
		# get logs
		system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
		system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
	
		chdir "$opt{tree}/masses";
		open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
		while(<IN>) {
		    print(OUT);
		}
		close(IN);
	    }
	    close(OUT);
	}
    }
}
