#!/usr/bin/perl
#
# analyses a set of mboxes and figures out frequencies and average precedences
# of each host/IPaddr range in the Received headers.  This allows us to track
# frequent relayers, or forged hostnames.  "precedence" BTW is the position of
# the Received header in the list, normalized between 0..100; higher means
# nearer to the recipient.

foreach my $file (@ARGV) {
  open (IN, "<$file") or warn "cannot read $file";
  domail();
  close IN;
}
report();
exit;

sub domail {
  my $rcvd = '';
  while (<IN>) {

  nextline:
    /^$/ and last;
    /^Received: / or next;
    $rcvd .= $_;

  nextrcvdline:
    $_ = <IN>;
    if (/^\s+/) {
      $rcvd .= $_;
      goto nextrcvdline;
    } else {
      goto nextline;
    }
  }

  $rcvd =~ s/\s+/ /gs; study $rcvd;
  my $host_precedence = 0;

  my @hits = ();
  while ($rcvd =~ /\b
            (?:
              (?:from|by)\s([-_+a-zA-Z0-9]\S+)\s\(
              | ([-_+a-zA-Z0-9]+\.[-_+a-zA-Z0-9]+\.[-_+a-zA-Z0-9\.]+)
            )
          \b/gx)
  {
    $_ = $1; $_ ||= $2;
    next if /^\d\.\d/;      # version numbers
    next if /\d\.\d\./;     # version numbers
    push (@hits, $_);
  }

  my $hitcalibration = 99.0 / ($#hits+1);

  foreach $_ (@hits) {
    $host_precedence += int ($hitcalibration);

    if (/^[\d\.]+$/) {      # IPs
      s/\.\d+$//gs;         # turn into a network

      $hits{$_}++;
      $desc{$_} = "iprange: $_";
      $prec{$_} ||= '';
      $prec{$_} .= $host_precedence." ";

    } else {
      $_ = lc $_;

      $hits{$_}++;
      $desc{$_} = "host: $_";
      $prec{$_} ||= '';
      $prec{$_} .= $host_precedence." ";

      s/^.*(\.[^\.]+\.[^\.]+)$/$1/gs;
      if (/\S/) {
        $hits{$_}++;
        $desc{$_} = "netname: $_";
        $prec{$_} ||= '';
        $prec{$_} .= $host_precedence." ";
      }
    }
  }
}

sub report {
  foreach my $host (sort {$hits{$b} <=> $hits{$a}} keys %hits) {
    my $avgprec = average_precedences ($prec{$host});
    printf "%6d %3d:  %s\n",
                  $hits{$host}, $avgprec, $desc{$host};
  }
}

sub average_precedences {
  my $precs = shift;
  my $avgprec = 0;
  my $numprec = 0;
  foreach my $prec (split (' ', $precs)) {
    $avgprec += $prec; $numprec++;
  }
  $avgprec /= $numprec;
  $avgprec /= 10;
}
