#!/usr/bin/perl
#
# kill_bad_patterns - filter out a number of risky rule strings, or spoor from the
# spamtrapping process itself

use strict;
use warnings;

# Return true when the pattern part of $line is dominated by high-bit bytes
# (or "." wildcards) and is therefore too unspecific to keep.
#
# The input is assumed to look like "body NAME /pattern/" (that is the shape
# the prefix-stripping substitution below expects); lines of any other shape
# are measured as a whole.
sub _is_mostly_high_bit {
  my ($line) = @_;

  my $pattern = $line;

  # Strip the "body NAME /" prefix BEFORE squeezing out whitespace: the
  # prefix regex relies on the whitespace separators, so doing it the other
  # way round meant it never matched and the prefix was counted as low-bit
  # pattern text.
  $pattern =~ s/^body\s+\S+\s+\///;
  $pattern =~ s/\s//g;
  $pattern =~ s/\/$//;

  # NOTE(review): "." is counted as "high" here but is also included in the
  # "low" count below; this looks like it may be unintentional, but the
  # behaviour is kept as-is -- confirm before changing.
  my $c_hi = () = $pattern =~ /[.\x80-\xff]/g;
  my $c_lo = () = $pattern =~ /[^\x80-\xff]/g;

  return ($c_hi > $c_lo && $c_hi > 6) ? 1 : 0;
}

# Return true when $line (one input line, trailing newline included) must
# be filtered out of the output: it hits the blacklist of risky strings, it
# is an S-O-hit% statistics comment, or it consists mostly of high-bit bytes.
sub _is_bad_pattern {
  my ($line) = @_;

  # Blacklist of risky strings.  The input holds regex source text, so
  # metacharacters in it are backslash-escaped; a literal backslash in the
  # input is matched here with a doubled backslash.
  # (Do not use the regex delimiter or sigils inside these comments.)
  return 1 if $line =~ /
      # spamtrap strings
      (?:taint\\.org|jmason\\.org|spamassassin\\.org|spamtrap|

      # long _________ or ---------- separators
        [-_=\s]{30}|
        \/[-_=\s]+\/|
        \*\*\*\*\*{20}|
        \\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*\\\*|

      # markup: escaped {Spam?} or {Spam} subject tags, escaped **SPAM**
        DOCTYPE\sHTML\sPUBLIC|
        \\\{Spam(?:\\\?)?\\\}|
        \\\*\\\*SPAM\\\*\\\*|

      # dates
        \d+\s+[A-Z][a-z]{2}\s\S+\s..:..:..\s|

      # phish text (the first entry accepts its dots escaped or bare)
        PayPal\sInc\\?\.\sAll\srights\sreserved\\?\.|
        about\seBay\\'s\scommunication\spolicies\\.|
        See\sour\sPrivacy\sPolicy\sand\sUser\sAgreement\sif\syou\shave|
        Learn\show\syou\scan\sprotect\syourself\sfrom\sspoof|
        access\sto\syour\saccount\swas\slimited\\.\sYour\saccount\saccess\swill\sremain|
        In\saccordance\swith\sPayPal\\'s\sUser\sAgreement|
        To\ssecure\syour\saccount\sand\squickly\srestore\sfull\saccess|
        To\slearn\smore\sabout\sprotecting\syourself\sfrom\sfraud,\svisit\sthe\sSecurity\sCenter|
        PayPal\swill\snever\sask\syou\sto\senter\syour\spassword\sin\san\semail|

      # asian spam; could be any text
        \s\.\.\s\.\.\s\.\.\s\.\.\s|

      # bug 6143
        \\x\{00}.\\x\{00}.\\x\{00}.\\x\{00}.\\x\{00}.\\x\{00}

        )/ix;

  # skip S/O/hit% comments for publishing
  # #  1.000   0.967   0.000
  return 1 if $line =~ /^#\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+/;

  # skip high-bit strings, too
  return 1 if _is_mostly_high_bit($line);

  return 0;
}

# Main filter: copy every input line that is not a bad pattern to STDOUT.
while (my $line = <>) {
  next if _is_bad_pattern($line);
  print $line;
}
