Zarafa batch scanning spam

Recently I made the switch from Scalix to Zarafa for our email system. In making the transition, I found this post on how someone integrated SpamAssassin with a public-folder-based ham/spam setup. This seems to be the most common setup currently, based on various Zarafa forum postings. While this provided a quick solution during the transition, I didn’t like the single-message approach used to pull a message from IMAP and send it to sa-learn. The following is a Perl script that I used with Scalix in a personal ham/spam folder setup. It has been adapted to a public folder setup for use with Zarafa. The result is a significant improvement in scan times for large corpora.

#!/usr/bin/perl

#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions
#are met:
#
#1. Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#2. Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
#THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
#IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
#OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
#IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
#INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
#NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
#THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use Mail::IMAPClient;
use File::Temp qw/ mkstemp /;
use Time::ParseDate;

# --- Diagnostics ------------------------------------------------------
my $debug     = 0;    # true: print progress and captured sa-learn output
my $imapdebug = 0;    # passed to Mail::IMAPClient as its Debug option

# Extra switches spliced into the sa-learn training command lines.
# Handy values while troubleshooting: "-D" or "-D learn".
my $saopts = '';

# --- IMAP account used to read the training folders --------------------
my $mailhost = 'localhost';
my $username = 'mailadmin';
my $password = 'MailadminSecretPassword';

# --- Folders users drop their samples into -----------------------------
my $hamfolder  = 'Public folders/LearnAsHam';
my $spamfolder = 'Public folders/LearnAsSpam';

# System account sa-learn is run as (through su); the temporary message
# file is chown'ed to this account before sa-learn reads it.
my $sa_db_owner = 'amavis';

# # # # # # # # # # Do not edit below  # # # # # # # # # #

# Entry point: train on the ham folder, then on the spam folder, and
# finish with a single database sync (the training runs use --no-sync).
sub main {
    learn_ham();
    learn_spam();
    sync_sa_db();
}

# Export the ham training folder to a temporary file, feed that file to
# sa-learn as ham, then remove the file.
sub learn_ham {
    my $ham_dump = folder_to_file($hamfolder);

    # sa-learn runs as $sa_db_owner, so hand the file over to that account.
    chown_by_name( $sa_db_owner, $ham_dump );
    learn_as_ham($ham_dump);

    unlink $ham_dump;
}

# Export the spam training folder to a temporary file, feed that file to
# sa-learn as spam, then remove the file.
sub learn_spam {
    my $spam_dump = folder_to_file($spamfolder);

    # sa-learn runs as $sa_db_owner, so hand the file over to that account.
    chown_by_name( $sa_db_owner, $spam_dump );
    learn_as_spam($spam_dump);

    unlink $spam_dump;
}

# Run "sa-learn --sync" as $sa_db_owner so that the learning queued by the
# earlier --no-sync runs is written to the SpamAssassin database.
#
# Takes no arguments.  The command's output is printed when $debug is set.
# A non-zero wait status is reported on STDERR instead of being ignored.
sub sync_sa_db {
    my $sarebuild = `su $sa_db_owner -c \'/usr/bin/sa-learn --sync\'`;

    # $? is the wait status of the backtick command (-1 if it could not be
    # started at all).
    warn "sa-learn --sync failed (wait status $?)\n" if $? != 0;

    print "-------\nRebuild: ", $sarebuild, "\n-------\n" if $debug;
}

# Change the owner and group of a file to those of a named system account.
#
# Parameters:
#   $user - login name to look up with getpwnam
#   $file - path of the file to chown
#
# Returns the number of files changed (1 on success, 0 on failure), which
# is what chown itself returns.  An unknown user yields 0 and a warning;
# previously the empty getpwnam result made chown treat $file as the uid.
sub chown_by_name {
    my ( $user, $file ) = @_;

    # Fields 2 and 3 of the passwd entry are the uid and the primary gid.
    my ( $uid, $gid ) = ( getpwnam($user) )[ 2, 3 ];

    unless ( defined $uid ) {
        warn "chown_by_name: unknown user '$user'\n";
        return 0;
    }

    return chown( $uid, $gid, $file );
}

# Feed a file of messages to sa-learn as ham, running it as $sa_db_owner.
#
# Parameters:
#   $hfile - path of the file written by folder_to_file
#
# folder_to_file writes one "From " separator line per message, i.e. an
# mbox-style file, so --mbox is passed to have sa-learn read it as a
# mailbox rather than as one single message.  --no-sync defers the
# database sync to sync_sa_db.  The command's output is printed when
# $debug is set; a non-zero wait status is reported on STDERR.
sub learn_as_ham {
    my ($hfile) = @_;

    my $salearn =
      `su $sa_db_owner -c \'/usr/bin/sa-learn $saopts --no-sync --mbox --ham $hfile\'`;

    # $? is the wait status of the backtick command.
    warn "sa-learn --ham failed for $hfile (wait status $?)\n" if $? != 0;

    print "-------\nHam: ", $salearn, "\n-------\n" if $debug;
}

# Feed a file of messages to sa-learn as spam, running it as $sa_db_owner.
#
# Parameters:
#   $sfile - path of the file written by folder_to_file
#
# folder_to_file writes one "From " separator line per message, i.e. an
# mbox-style file, so --mbox is passed to have sa-learn read it as a
# mailbox rather than as one single message.  --no-sync defers the
# database sync to sync_sa_db.  The command's output is printed when
# $debug is set; a non-zero wait status is reported on STDERR.
sub learn_as_spam {
    my ($sfile) = @_;

    my $salearn =
      `su $sa_db_owner -c \'/usr/bin/sa-learn $saopts --no-sync --mbox --spam $sfile\'`;

    # $? is the wait status of the backtick command.
    warn "sa-learn --spam failed for $sfile (wait status $?)\n" if $? != 0;

    print "-------\nSpam: ", $salearn, "\n-------\n" if $debug;
}

# Dump every message of an IMAP folder into one mbox-style temporary file
# and delete the exported messages from the folder.
#
# Parameters:
#   $folder - name of the IMAP folder to export
#
# Returns the path of the temporary file (/tmp/sa-scanner_XXXXX); the
# caller is responsible for removing it.  Dies with "IMAP Login Failed"
# when the IMAP client object cannot be created.  Errors raised while
# exporting individual messages are caught and printed, and whatever was
# written up to that point is still returned.
sub folder_to_file {
    my ($folder) = @_;

    my $imap = Mail::IMAPClient->new(
        Server   => $mailhost,
        User     => $username,
        Password => $password,
        Debug    => $imapdebug
    );

    if ( !defined($imap) ) { die "IMAP Login Failed"; }

    # Create the temp file only after the login succeeded, so a failed
    # login does not leave an empty file behind in /tmp.
    my ( $ffd, $file ) = mkstemp("/tmp/sa-scanner_XXXXX");

    # If debugging, print out the total count for the mailbox
    if ($debug) {
        my $count = $imap->message_count($folder);
        print $count, " msgs to process\n";

        print "folder=$folder\n";
        print "file=$file\n";
    }

    # Only ask for the message list when the folder could be selected;
    # otherwise fall through with an empty list and an empty file.
    my @msgs;
    if ( $imap->select($folder) ) {
        @msgs = grep { defined } $imap->messages;
    }
    else {
        warn "Unable to select IMAP folder '$folder'\n";
    }

    if ($debug) {
        print "\t\t" . scalar(@msgs) . " messages in $folder.\n";
    }

    eval {
        for my $msg ( reverse(@msgs) ) {
            my @envelope = $imap->fetch( $msg, "envelope" );

            # Pieces of the mbox "From " separator line, scoped per message.
            my ( $user, $dom, $host, $date );

            if ( $envelope[0] =~ /^\* [0-9]+ FETCH \(UID [0-9]+ ENVELOPE \("(.*)" (NIL|"[^"]*") \(\((NIL|"[^"]*") (NIL|"[^"]*") (NIL|"[^"]*") (NIL|"[^"]*")\)\)/) {

                # Copy the captures at once, before any other regex runs.
                my $sent;
                ( $sent, $dom, $user, $host ) = ( $1, $4, $5, $6 );

                # Fall back to "now" when the envelope date cannot be parsed.
                my $epoch = parsedate($sent);
                $epoch = time() unless defined $epoch;
                $date = localtime($epoch);
            }
            else {
                # Unrecognised FETCH response: use a placeholder sender
                # and the current time.
                $user = '"daemon"';
                $dom  = "NIL";
                $host = "NIL";
                $date = localtime( time() );
            }

            # Strip the IMAP quoting; NIL (or anything unquoted) becomes
            # an empty string.
            $dom  =~ s/^"(.*)"$/\%$1/ or $dom  = "";
            $user =~ s/^"(.*)"$/$1/   or $user = "";
            $host =~ s/^"(.*)"$/\@$1/ or $host = "";

            # The separator line needs a non-empty sender token.
            my $sender = $user . $dom . $host;
            $sender = "daemon" if $sender eq "";

            my $msg_txt = $imap->message_string($msg);
            if ( !defined $msg_txt ) {

                # Leave the message in the folder rather than deleting
                # something that was never written to the file.
                warn "Unable to fetch message $msg from '$folder'; skipping\n";
                next;
            }

            # mbox quoting: every line that starts with "From " must be
            # prefixed with ">" so it is not mistaken for a separator.
            # The newline is kept and all occurrences are handled.
            $msg_txt =~ s/\nFrom /\n>From /g;

            print {$ffd} "From ", $sender, "  ", $date, "\n", $msg_txt, "\n";

            $imap->delete_message($msg);
        }
    };

    if ($@) {    # $@ contains the exception that was thrown
        print "ERROR: $@\n";
    }

    close($ffd) or warn "Unable to close $file: $!\n";

    # Remove the messages flagged as deleted, close the mailbox and end
    # the IMAP session so it is not left open on the server.
    $imap->expunge();
    $imap->close();
    $imap->logout();

    return $file;
}

# Kick off the learn-ham / learn-spam / sync sequence.
main();

Lyric tagging audio files

I’ve been working on cleaning up the files on my computer and thought about adding proper ID3 or metadata tags to the mp3 and ogg files. Musicbrainz’s Picard is a wonderful tool to go back and help organize, rename, and add tags to most (practically all) audio file types. One thing that bothered me, though, was that no plugin existed to automatically add lyrics to the tags. Since Picard had already run through and done its magic, I decided to throw a little Python at the problem and create a script that would search for the lyrics on the web and add them to a given file. A quick bit of research turned up Mutagen, a Python library that is even used by Musicbrainz’s Picard, and http://lyrics.wikia.com/ as a search site with a predictable URL pattern.

The result is add_lyrics_tag.py below, which takes as its only argument the fully qualified path and filename of an Ogg Vorbis or MP3 file. If lyrics can be found, they are automatically added to the metadata section of an Ogg Vorbis file or to the USLT portion of the ID3 section of an MP3.

#!/usr/bin/python

#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions
#are met:
#
#1. Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#2. Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
#THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
#IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
#OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
#IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
#INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
#NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
#THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys, codecs, os
import urllib2
from mutagen.oggvorbis import OggVorbis
from mutagen.id3 import ID3, USLT, TALB
from mutagen.easyid3 import EasyID3

def execute(cmd):
	"""Run ``cmd`` through the shell and return everything it wrote to stdout.

	The command's exit status is not inspected; a failing command simply
	yields whatever output it produced (possibly an empty string).
	"""
	pipe = os.popen(cmd)
	try:
		return pipe.read()
	finally:
		# Close the pipe explicitly instead of leaving it to garbage
		# collection, so the child process is reaped right away.
		pipe.close()

def getlyrics(artist, title):
	"""Fetch the lyrics for ``artist``/``title`` from lyrics.wikia.com.

	The page http://lyrics.wikia.com/<artist>:<title> is dumped as text with
	w3m and filtered by the perl/tail/sed pipeline below. Returns whatever
	that pipeline prints; an empty string means nothing was found.

	Artist and title are percent-encoded before being placed in the URL.
	They come from file tags, so quotes, backticks or "$(...)" in them must
	not reach the shell unescaped, and spaces or non-ASCII characters must
	not produce a malformed URL.
	"""
	try:
		from urllib import quote  # Python 2
	except ImportError:
		from urllib.parse import quote  # Python 3

	def _urlquote(text):
		# Encode text to UTF-8 bytes first: Python 2's quote() cannot
		# handle non-ASCII unicode strings.
		if not isinstance(text, bytes):
			text = text.encode("utf-8")
		return quote(text)

	url = "http://lyrics.wikia.com/" + _urlquote(artist) + ":" + _urlquote(title)

	# Only the text between the two lines mentioning "Ringtone" is kept;
	# tail drops the first of those lines and sed strips a "[...]phone" tag.
	cmd = ("w3m -dump -no-cookie -T text/html \"" + url + "\""
		" | perl -e 'BEGIN { $true = 0; } while(<>) { if ($_ =~ /Ringtone/){ $true=$true?0:1;} if ($true){ print $_; } }'"
		" | tail -n+2 | sed 's/\\[...\\]phone//'")
	return execute(cmd)

def main():
	"""Tag the audio file named by the first command-line argument.

	Dispatches on the file extension: ".mp3" goes to tagmp3 and ".ogg" to
	tagogg; anything else is reported and left untouched. The extension is
	compared case-insensitively so files such as "SONG.MP3" are handled too.
	"""
	filename = sys.argv[1]

	extension = os.path.splitext(filename)[1]
	kind = extension.lower()

	if kind == ".mp3":
		tagmp3(filename)
	elif kind == ".ogg":
		tagogg(filename)
	else:
		print("unable to handle extension: " + extension)

def tagogg(filename):
	"""Look up lyrics for an Ogg Vorbis file and store them in its "lyrics" tag.

	Artist and title are read from the file's existing tags (a missing tag
	raises KeyError). When getlyrics returns nothing but whitespace the file
	is left unchanged and a message is printed.
	"""
	audio = OggVorbis(filename)

	artist = audio['artist'][0]
	title = audio['title'][0]
	lyrics = getlyrics(artist, title)

	# The lyrics come from a shell pipeline and may be a byte string; turn
	# them into text before storing them as a tag value.
	# NOTE(review): assumes the pipeline emits UTF-8 - confirm for your locale.
	if isinstance(lyrics, bytes):
		lyrics = lyrics.decode("utf-8", "replace")

	# strip() so that output consisting only of blank lines is not stored.
	if lyrics.strip():
		print("Tagging lyrics into " + filename)
		audio["lyrics"] = lyrics
		audio.save()
	else:
		print("Unable to find lyrics for \"" + artist + "/" + title + "\"")

def tagmp3(filename):
	"""Look up lyrics for an MP3 file and store them in a USLT ID3 frame.

	Artist and title are read through EasyID3 (a missing tag raises
	KeyError). When getlyrics returns nothing but whitespace the file is
	left unchanged and a message is printed.
	"""
	audio = EasyID3(filename)

	artist = audio['artist'][0]
	title = audio['title'][0]
	lyrics = getlyrics(artist, title)

	# The frame below is declared as UTF-8 text (encoding=3), so hand it
	# text rather than the raw bytes a shell pipeline may have produced.
	# NOTE(review): assumes the pipeline emits UTF-8 - confirm for your locale.
	if isinstance(lyrics, bytes):
		lyrics = lyrics.decode("utf-8", "replace")

	# strip() so that output consisting only of blank lines is not stored.
	if lyrics.strip():
		print("Tagging lyrics into " + filename)
		id3 = ID3(filename)
		id3.add(USLT(encoding=3, lang="und", text=lyrics))
		id3.save()
	else:
		print("Unable to find lyrics for \"" + artist + "/" + title + "\"")

if __name__ == "__main__":
	# Without a file argument there is nothing to do; say so up front.
	# Previously the error handler below indexed sys.argv[1] itself and
	# died with an IndexError traceback.
	if len(sys.argv) < 2:
		sys.exit("usage: " + sys.argv[0] + " <file.mp3|file.ogg>")

	try:
		main()
	except Exception:
		# Best effort: report the file and carry on, so a batch run over
		# many files is not stopped by one bad file. Catching Exception
		# rather than everything lets Ctrl-C still interrupt the run.
		print("Unable to process file: " + sys.argv[1])

This can all be driven with a little bit of bash scripting:

# find hands each path to -exec as one argument without going through a
# shell, so {} needs no quoting; escaped quotes would become part of the
# file name passed to the script.
find /mnt/albums -name "*.mp3" -type f -exec add_lyrics_tag.py {} \;
find /mnt/albums -name "*.ogg" -type f -exec add_lyrics_tag.py {} \;

There are several things that can be done to improve on this script:

  • Only do tagging if the lyrics tag does not already exist
  • Integrate with Picard as a real plugin
  • Leverage off of Mutagen more to clean the code up by removing the obvious duplication
  • Add ability to specify a directory instead of a single file
  • The original inspiration for this came from the MPD Hacks page, and I would like to see it integrated back with it in some way (X Window display, conky, etc). This would avoid the constant fetching from the web of the lyrics by looking for a potential copy within the audio file first.