#!/usr/bin/perl -w # db-to-text2.pl exports and imports bayes DB into/from a text file # adapted a bit by Andy Spiegl (spamassassin.andy@spiegl.de) # from http://spamassassin.taint.org/devel/db-to-text.pl.txt # to check for (and remove) atimes which lie in the future # # this also fixes a bug of "inventing" new keys because of newlines in the # value part of the token. But db-text2.pl CANNOT be used to dump bayes_seen! # You can safely use db-to-text.pl for that. ############################################################ # How to use this script: # # First sync the journal before running this script! # One way to do so is: # sa-learn --rebuild # # After (backing up! and) running # db-to-text2.pl -o bayes_toks > bayes_toks.txt # db-to-text2.pl -i bayes_toks < bayes_toks.txt # you probably want to run # sa-learn -D --force-expire # and then should NOT get this message anymore: # "debug: bayes: couldn't find a good delta atime, need more token difference, skipping expire." use strict; # WARNING: please set this variable to the correct value! # check "sa-learn --dump magic | head -1" my $db_version=2; # '^M^A^G^I^C' my $MAGIC_STRING = "\015\001\007\011\003"; my $into; if ($ARGV[0] && $ARGV[0] eq '-i') { $into = 1; } elsif ($ARGV[0] && $ARGV[0] eq '-o') { $into = 0; } else { die " usage: db-to-text -o dbfile > textdump usage: db-to-text -i dbfile < textdump "; } use DB_File; use Fcntl; shift @ARGV; my $file = shift @ARGV; my ($k, $v, $count, $resetcount); my ($ts, $th, $atime); $count = 0; $resetcount = 0; # TEXT to DB if ($into) { my %out; tie (%out, 'DB_File', $file, O_WRONLY|O_CREAT, 0600); while (<>) { $k="";$v="";$ts=0;$th=0;$atime=0; /(.*?)<\/key>/ and $k = $1; /(.*?)<\/val>/ and $v = $1; /(.*?)<\/ts>/ and $ts = $1; /(.*?)<\/th>/ and $th = $1; /(.*?)<\/atime>/ and $atime = $1; $v = &tok_pack($ts, $th, $atime) unless $v; $out{$k} = $v; $count++; print STDERR "." if ($count % 1000 == 0); } untie %out; print STDERR "\n$file: $count keys copied\n"; } else # DB to TEXT { my %in; my %magic; my $magic; tie (%in, 'DB_File', $file, O_RDONLY, 0600); while (($k, $v) = each %in) { if ($k =~ /<\/key>/ || $v =~ /<\/val>/) { print STDERR "oops"; next; } # save the ^M^A^G^I^C tokens for later if ( $k =~ /^$MAGIC_STRING/ ) { $magic = $k; $magic =~ s/^$MAGIC_STRING//; #print STDERR "\nMagic token: $magic: $v\n"; $magic{$magic} = $v; next; } # unpack the token ($ts, $th, $atime) = &tok_unpack($v); # reset atime if it is in the future if ($atime > time) { print STDERR "\nResetting atime of key in the future:\n $k$ts$th$atime\n"; $atime = time; $resetcount++; } print "$k$ts$th$atime\n"; $count++; print STDERR "." if ($count % 1000 == 0); } untie %in; print STDERR "\n$file: $count keys copied\n"; print STDERR "$file: $resetcount future-keys reset\n\n"; # print magic tokens my ($magickey, $magicval); foreach $magickey (keys %magic) { $magicval = $magic{$magickey}; # only check atime values if ($magickey =~ /^(LASTEXPIRE|LASTJOURNALSYNC|NEWESTAGE|OLDESTAGE|RUNNINGEXPIRE)$/) { if ($magicval > time) { # resetting can't be later than today print STDERR "resetting magic token \"$magickey\"\n"; $magicval = time; } } print "$MAGIC_STRING$magickey$magicval\n"; } } print STDERR "done.\n"; exit; # helpers copied from Mail::SpamAssassin::BayesStore.pm use constant FORMAT_FLAG => 0xc0; # 11000000 use constant ONE_BYTE_FORMAT => 0xc0; # 11000000 use constant TWO_LONGS_FORMAT => 0x00; # 00000000 use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000 use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111 sub tok_unpack { my ($value) = @_; $value ||= 0; my ($packed, $atime); if ( $db_version == 0 ) { ($packed, $atime) = unpack("CS", $value); } elsif ( $db_version == 1 || $db_version == 2 ) { ($packed, $atime) = unpack("CV", $value); } if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) { return (($packed & ONE_BYTE_SSS_BITS) >> 3, $packed & ONE_BYTE_HHH_BITS, $atime || 0); } elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) { my ($packed, $ts, $th, $atime); if ( $db_version == 0 ) { ($packed, $ts, $th, $atime) = unpack("CLLS", $value); } elsif ( $db_version == 1 ) { ($packed, $ts, $th, $atime) = unpack("CVVV", $value); } elsif ( $db_version == 2 ) { ($packed, $ts, $th, $atime) = unpack("CVVV", $value); } return ($ts || 0, $th || 0, $atime || 0); } # other formats would go here... else { warn "unknown packing format for Bayes db, please re-learn: $packed"; return (0, 0, 0); } } sub tok_pack { my ($ts, $th, $atime) = @_; $ts ||= 0; $th ||= 0; $atime ||= 0; if ($ts < 8 && $th < 8) { return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime); } else { return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime); } }