Finding a few generic algorithms..

Late today, I found an interesting site that had a few Generic C# algorithms. One of those algorithms was a simple implementation of a Convex Hull. The implementation was tightly integrated into a single class with even a small test case. I changed the interface a bit to use a set of generic lists and return a tuple rather than allow yet another definition of a Point class to be defined (aka PostGis’s Point class).

// Finding a convex hull in the plane
// This program requires .Net version 2.0.
// Peter Sestoft (sestoft@itu.dk) * Java 2000-10-07, GC# 2001-10-27

using System;
using System.Collections.Generic;

// ------------------------------------------------------------

// Find the convex hull of a point set in the plane

// An implementation of Graham's (1972) point elimination algorithm,
// as modified by Andrew (1979) to find lower and upper hull separately.

// This implementation correctly handles duplicate points, and
// multiple points with the same x-coordinate.

// 1. Sort the points lexicographically by increasing (x,y), thus 
//    finding also a leftmost point L and a rightmost point R.
// 2. Partition the point set into two lists, upper and lower, according as 
//    point is above or below the segment LR.  The upper list begins with 
//    L and ends with R; the lower list begins with R and ends with L.
// 3. Traverse the point lists clockwise, eliminating all but the extreme
//    points (thus eliminating also duplicate points).
// 4. Eliminate L from lower and R from upper, if necessary.
// 5. Join the point lists (in clockwise order) in an array.

namespace Util
{
	public class ConvexHull
	{
		public static Tuple<List<double>, List<double>> CalculateConvexHull(List<double> lats, List<double> lons)
		{
			List<double> latshull = new List<double>();
			List<double> lonshull = new List<double>();

			if(lats.Count != lons.Count) {
				return new Tuple<List<double>, List<double>>(latshull, lonshull);
			}

			int count = lats.Count;
			List<Point> points = new List<Point>();

			for(int i = 0;i < lats.Count;i++) {
				points.Add(new Point(lons[i], lats[i]));
			}

			Point[] hull = CalculateConvexHull(points.ToArray());

			foreach(Point p in hull) {
				lonshull.Add(p.x);
				latshull.Add(p.y);
			}

			return new Tuple<List<double>, List<double>>(latshull, lonshull);
		}

		private static Point[] CalculateConvexHull(Point[] pts)
		{
			// Sort points lexicographically by increasing (x, y)
			int N = pts.Length;
			Polysort.Quicksort<Point>(pts);
			Point left = pts[0], right = pts[N - 1];
			// Partition into lower hull and upper hull
			CDLL<Point> lower = new CDLL<Point>(left), upper = new CDLL<Point>(left);
			for (int i = 0; i < N; i++) {
				double det = Point.Area2(left, right, pts[i]);
				if (det > 0) {
					upper = upper.Append(new CDLL<Point>(pts[i]));
				} else if (det < 0) {
					lower = lower.Prepend(new CDLL<Point>(pts[i]));
				}
			}
			lower = lower.Prepend(new CDLL<Point>(right));
			upper = upper.Append(new CDLL<Point>(right)).Next;
			// Eliminate points not on the hull
			eliminate(lower);
			eliminate(upper);
			// Eliminate duplicate endpoints
			if (lower.Prev.val.Equals(upper.val)) {
				lower.Prev.Delete();
			}
			if (upper.Prev.val.Equals(lower.val)) {
				upper.Prev.Delete();
			}
			// Join the lower and upper hull
			Point[] res = new Point[lower.Size() + upper.Size()];
			lower.CopyInto(res, 0);
			upper.CopyInto(res, lower.Size());
			return res;
		}

		// Graham's scan
		private static void eliminate(CDLL<Point> start)
		{
			CDLL<Point> v = start, w = start.Prev;
			bool fwd = false;
			while (v.Next != start || !fwd) {
				if (v.Next == w) {
					fwd = true;
				}
				if (Point.Area2(v.val, v.Next.val, v.Next.Next.val) < 0) // right turn
				{
					v = v.Next;
				} else {
					// left turn or straight
					v.Next.Delete();
					v = v.Prev;
				}
			}
		}
	}

// ------------------------------------------------------------

// Points in the plane

	internal class Point : Ordered<Point>
	{
		private static readonly Random rnd = new Random();

		public double x, y;

		public Point(double x, double y)
		{
			this.x = x;
			this.y = y;
		}

		public override string ToString()
		{
			return "(" + x + ", " + y + ")";
		}

		public static Point Random(int w, int h)
		{
			return new Point(rnd.Next(w), rnd.Next(h));
		}

		public bool Equals(Point p2)
		{
			return x == p2.x && y == p2.y;
		}

		public override bool Less(Ordered<Point> o2)
		{
			Point p2 = (Point) o2;
			return x < p2.x || x == p2.x && y < p2.y;
		}

		// Twice the signed area of the triangle (p0, p1, p2)
		public static double Area2(Point p0, Point p1, Point p2)
		{
			return p0.x * ( p1.y - p2.y ) + p1.x * ( p2.y - p0.y ) + p2.x * ( p0.y - p1.y );
		}
	}

// ------------------------------------------------------------

// Circular doubly linked lists of T

	internal class CDLL<T>
	{
		private CDLL<T> prev, next; // not null, except in deleted elements
		public T val;

		// A new CDLL node is a one-element circular list
		public CDLL(T val)
		{
			this.val = val;
			next = prev = this;
		}

		public CDLL<T> Prev { get { return prev; } }

		public CDLL<T> Next { get { return next; } }

		// Delete: adjust the remaining elements, make this one point nowhere
		public void Delete()
		{
			next.prev = prev;
			prev.next = next;
			next = prev = null;
		}

		public CDLL<T> Prepend(CDLL<T> elt)
		{
			elt.next = this;
			elt.prev = prev;
			prev.next = elt;
			prev = elt;
			return elt;
		}

		public CDLL<T> Append(CDLL<T> elt)
		{
			elt.prev = this;
			elt.next = next;
			next.prev = elt;
			next = elt;
			return elt;
		}

		public int Size()
		{
			int count = 0;
			CDLL<T> node = this;
			do {
				count++;
				node = node.next;
			} while (node != this);
			return count;
		}

		public void PrintFwd()
		{
			CDLL<T> node = this;
			do {
				Console.WriteLine(node.val);
				node = node.next;
			} while (node != this);
			Console.WriteLine();
		}

		public void CopyInto(T[] vals, int i)
		{
			CDLL<T> node = this;
			do {
				vals[i++] = node.val; // still, implicit checkcasts at runtime 
				node = node.next;
			} while (node != this);
		}
	}

// ------------------------------------------------------------

	internal class Polysort
	{
		private static void swap<T>(T[] arr, int s, int t)
		{
			T tmp = arr[s];
			arr[s] = arr[t];
			arr[t] = tmp;
		}

		// Typed OO-style quicksort a la Hoare/Wirth

		private static void qsort<T>(Ordered<T>[] arr, int a, int b)
		{
			// sort arr[a..b]
			if (a < b) {
				int i = a, j = b;
				Ordered<T> x = arr[( i + j ) / 2];
				do {
					while (arr[i].Less(x)) {
						i++;
					}
					while (x.Less(arr[j])) {
						j--;
					}
					if (i <= j) {
						swap<Ordered<T>>(arr, i, j);
						i++;
						j--;
					}
				} while (i <= j);
				qsort<T>(arr, a, j);
				qsort<T>(arr, i, b);
			}
		}

		public static void Quicksort<T>(Ordered<T>[] arr)
		{
			qsort<T>(arr, 0, arr.Length - 1);
		}
	}

	public abstract class Ordered<T>
	{
		public abstract bool Less(Ordered<T> that);
	}

// ------------------------------------------------------------

	/*
	internal class TestCH
	{
		private static void Main(string[] args)
		{
			if (args.Length == 1) {
				int N = int.Parse(args[0]);
				Point[] pts = new Point[N];
				for (int i = 0; i < N; i++) {
					pts[i] = Point.Random(500, 500);
				}
				Point[] chpts = ConvexHull.CalculateConvexHull(pts);
				Console.WriteLine("Area is " + area(chpts));
				print(chpts);
			} else {
				Console.WriteLine("Usage: TestCH <pointcount>\n");
			}
		}

		// The centroid of a point set
		public static Point centroid(Point[] pts)
		{
			int N = pts.Length;
			double sumx = 0, sumy = 0;
			for (int i = 0; i < N; i++) {
				sumx += pts[i].x;
				sumy += pts[i].y;
			}
			return new Point(sumx / N, sumy / N);
		}

		// The area of a polygon (represented by an array of ordered vertices)
		public static double area(Point[] pts)
		{
			int N = pts.Length;
			Point centr = centroid(pts);
			double area2 = 0;
			for (int i = 0; i < N; i++) {
				area2 += Point.Area2(centr, pts[i], pts[( i + 1 ) % N]);
			}
			return Math.Abs(area2 / 2);
		}

		public static void print(Point[] pts)
		{
			int N = pts.Length;
			for (int i = 0; i < N; i++) {
				Console.WriteLine(pts[i]);
			}
		}
	}
	 */
}

LRU Cache in C#

So far its been a fast paced week going by. I thought I might need a generic Least Recently Used (LRU) cache implementation to solve a problem. While I was on travel and had some down time, I went ahead and threw together a quick implementation that uses the traditional linked list / map approach. I added a few additional convenience methods that expose more of a dictionary feel overall. If I had additional time, I would probably look at fully implementing the IDictionary or IList interfaces. Maybe later this weekend, I’ll be able to see if it solves the problem for which is was originally created…

/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.   
*/

using System.Collections.Generic;

namespace Util
{
	public class LruCache<K, V>
	{
		private class LruCacheItem
		{
			public K key;
			public V value;

			public LruCacheItem(K k, V v)
			{
				key = k;
				value = v;
			}
		}

		private int capacity;

		private Dictionary<K, LinkedListNode<LruCacheItem>> map =
			new Dictionary<K, LinkedListNode<LruCacheItem>>();

		private LinkedList<LruCacheItem> list = new LinkedList<LruCacheItem>();

		public LruCache(int capacity_)
		{
			capacity = capacity_;
		}

		public V this[K key]
		{
			get
			{
				lock (this) {
					return retrieve(key);
				}
			}
			set
			{
				lock (this) {
					store(key, value);
				}
			}
		}

		private V retrieve(K key)
		{
			LinkedListNode<LruCacheItem> node;
			if (map.TryGetValue(key, out node)) {
				V value = node.Value.value;

				list.Remove(node);
				list.AddLast(node);
				return value;
			}
			return default( V );
		}

		private void store(K key, V val)
		{
			LinkedListNode<LruCacheItem> node;
			if(map.TryGetValue(key,out node)) {
				list.Remove(node);
				list.AddLast(node);
				return;
			}

			if (map.Count >= capacity) {
				RemoveFirst();
			}

			LruCacheItem cacheItem = new LruCacheItem(key, val);
			LinkedListNode<LruCacheItem> node = new LinkedListNode<LruCacheItem>(cacheItem);
			list.AddLast(node);
			map.Add(key, node);
		}

		public bool TryGetValue(K k, out V v)
		{
			LinkedListNode<LruCacheItem> item;
			if (map.TryGetValue(k, out item)) {
				v = item.Value.value;
				return true;
			}

			v = default( V );
			return false;
		}

		public bool Contains(K k)
		{
			lock (this) {
				return map.ContainsKey(k);
			}
		}

		public void Clear()
		{
			lock (this) {
				list.Clear();
				map.Clear();
			}
		}

		protected void RemoveFirst()
		{
			LinkedListNode<LruCacheItem> node = list.First;
			list.RemoveFirst();
			map.Remove(node.Value.key);
		}
	}
}

Zarafa batch scanning spam

Recently I made the switch from Scalix to Zarafa for our email system. In making the transition, I found this post on how someone integrated Spamassassin with a public based ham/spam folder setup. This seems to be the most common setup currently based on various Zarafa forums postings. While this provided a quick solution during the transition, I didn’t like the single message approach used to pull a message from IMAP and send to sa-learn. The following is a perl script that I used with Scalix in a personal ham/spam folder setup. It has been adopted to a public folder setup for use with Zarafa. The result is a significant improvement in scan times for large corpora.

#!/usr/bin/perl

#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions
#are met:
#
#1. Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#2. Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
#THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
#IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
#OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
#IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
#INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
#NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
#THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use Mail::IMAPClient;
use File::Temp qw/ mkstemp /;
use Time::ParseDate;

my $debug    = 0;
my $imapdebug    = 0;
my $saopts    = "";
#my $saopts    = "-D";
#my $saopts    = "-D learn";
my $mailhost = "localhost";
my $username = 'mailadmin';
my $password = "MailadminSecretPassword";

my $hamfolder = "Public folders/LearnAsHam";
my $spamfolder = "Public folders/LearnAsSpam";

my $sa_db_owner = "amavis";

# # # # # # # # # # Do no edit below  # # # # # # # # # #

sub main {

    &learn_ham();
    &learn_spam();
    &sync_sa_db();
}

sub learn_ham {
    my $file = &folder_to_file($hamfolder);

    &chown_by_name($sa_db_owner, $file);

    &learn_as_ham($file);

    unlink "$file";
}

sub learn_spam {
    my $file = &folder_to_file($spamfolder);

    &chown_by_name($sa_db_owner, $file);

    &learn_as_spam($file);

    unlink "$file";
}

sub sync_sa_db {
    my $sarebuild = `su $sa_db_owner -c \'/usr/bin/sa-learn --sync\'`;
    print "-------\nRebuild: ", $sarebuild, "\n-------\n" if $debug;
}

sub chown_by_name {
    my ( $user, $file ) = @_;
    return chown( ( getpwnam($user) )[ 2, 3 ], $file );
}

sub learn_as_ham {
    my $salearn;
    my ($hfile) = @_;

    $salearn = `su $sa_db_owner -c \'/usr/bin/sa-learn $saopts --no-sync  --ham $hfile\'`;

    print "-------\nHam: ", $salearn, "\n-------\n" if $debug;
}

sub learn_as_spam {
    my $salearn;
    my ($sfile) = @_;

    $salearn = `su $sa_db_owner -c \'/usr/bin/sa-learn $saopts --no-sync  --spam $sfile\'`;

    print "-------\nSpam: ", $salearn, "\n-------\n" if $debug;
}

sub folder_to_file {
    my ( $folder ) = @_;
    my $file;
    my $ffd;

    ( $ffd, $file ) = mkstemp("/tmp/sa-scanner_XXXXX");

    my $imap = Mail::IMAPClient->new(
        Server   => $mailhost,
        User     => $username,
        Password => $password,
        Debug    => $imapdebug
    );

    if ( !defined($imap) ) { die "IMAP Login Failed"; }

    # If debugging, print out the total counts for each mailbox
    if ($debug) {
        my $count = $imap->message_count($folder);
        print $count, " msgs to process\n";

        print "folder=$folder\n";
        print "file=$file\n";
    }

    # Process the ham mailbox
    $imap->select($folder);

    my @msgs = $imap->messages;

    if ($debug) {
        print "\t\t" . scalar(@msgs) . " messages in $folder.\n";
    }

    eval {
        for my $msg ( reverse(@msgs) )
        {
            my @envelope = $imap->fetch( $msg, "envelope" );

            if ( $envelope[0] !~ /^\* [0-9]+ FETCH \(UID [0-9]+ ENVELOPE \("(.*)" (NIL|"[^"]*") \(\((NIL|"[^"]*") (NIL|"[^"]*") (NIL|"[^"]*") (NIL|"[^"]*")\)\)/) {
                if ($debug) {
                    #print(STDERR "Bizarre output from fetch: ".$envelope[0]."\n") ;
                }
                $user = '"daemon"';
                $dom  = "NIL";
                $host = "NIL";
                $date = localtime( time() );
            } else {
                $dom  = $4;
                $user = $5;
                $host = $6;
                $date = localtime( parsedate($1) );
            }

            $dom  =~ s/^"(.*)"$/\%$1/ or $dom  = "";
            $user =~ s/^"(.*)"$/$1/   or $user = "";
            $host =~ s/^"(.*)"$/\@$1/ or $host = "";

            $msg_txt = $imap->message_string($msg);
            $msg_txt =~ s/[\n]From />From /;

            print $ffd "From ", $user, $dom, $host, "  ", $date, "\n", $msg_txt, "\n";

            $imap->delete_message($msg);
        }
    };

    if ($@) {    # $@ contains the exception that was thrown
        print "ERROR: $@\n";
    }

    close($ffd);

    $imap->expunge();
    $imap->close();

    return $file;
}

&main();

Lyric tagging audio files

I’ve been working with cleaning up the files on my computer and thought about adding proper ID3 or metadata tags to the mp3 and ogg files. Musicbrainz’s Picard is a wonderful tool to go back and help organize, rename, and add tags to most (practically all) audio file types. One thing bothered me though is that no plugin existed to automatically add lyrics to the tags. Since Picard had already run through and done its magic, I decided to throw a little python at the problem and create a script that would search for the lyrics on the web and add them to a given file. A quick bit of research turned up Mutagen a python library that is even used by Musicbrainz’s Picard and http://lyrics.wikia.com/ as a search site with a predictable URL pattern.

The result is add_lyrics_tag.py below that takes as its only argument the fully qualified path and filename to an Ogg Vorbis or MP3 file. If lyrics can be found, the file is automatically added to the metadata section of an Ogg Vorbis file or the USLT portion of the ID3 section of an MP3.

#!/usr/bin/python

#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions
#are met:
#
#1. Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#2. Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
#THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
#IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
#OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
#IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
#INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
#NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
#THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys, codecs, os
import urllib2
from mutagen.oggvorbis import OggVorbis
from mutagen.id3 import ID3, USLT, TALB
from mutagen.easyid3 import EasyID3

def execute(cmd):
	a = os.popen(cmd)
	output = a.readlines()
	output =  "".join(output)
	return output

def getlyrics(artist, title):
	cmd = "w3m -dump -no-cookie -T text/html \"http://lyrics.wikia.com/" + artist + ":" + title + "\" | perl -e 'BEGIN { $true = 0; } while(<>) { if ($_ =~ /Ringtone/){ $true=$true?0:1;} if ($true){ print $_; } }' | tail -n+2 | sed 's/\[...\]phone//'"
	text = execute(cmd)
	#print "%s/%s/(%s)" % (artist, title, text)
	return text

def main():
	filename = sys.argv[1]

	basename, extension = os.path.splitext(filename)

	if(extension == ".mp3"):
		tagmp3(filename)
	elif(extension == ".ogg"):
		tagogg(filename)
	else:
		print "unable to handle extension: " + extension

def tagogg(filename):
	audio = OggVorbis(filename)

	artist = audio['artist'][0]
	title = audio['title'][0]
	lyrics = getlyrics(artist, title)

	if(len(lyrics) > 0):
		print "Tagging lyrics into " + filename
		audio["lyrics"] = lyrics
		audio.save()
	else:
		print "Unable to find lyrics for \"" + artist + "/" + title + "\""

def tagmp3(filename):
	audio = EasyID3(filename)

	artist = audio['artist'][0]
	title = audio['title'][0]
	lyrics = getlyrics(artist, title)

	if(len(lyrics) > 0):
		print "Tagging lyrics into " + filename
		id3 = ID3(filename)
		id3.add(USLT(encoding=3, lang="und", text=lyrics))
		id3.save()
	else:
		print "Unable to find lyrics for \"" + artist + "/" + title + "\""

if __name__ == "__main__":
	try:
		main()
	except:
		print "Unable to process file: " + sys.argv[1]

This can all be driven with a little bit of bash scripting..

find /mnt/albums -name "*.mp3" -type f -exec add_lyrics_tag.py \"{}\" \;
find /mnt/albums -name "*.ogg" -type f -exec add_lyrics_tag.py \"{}\" \;

There are several things that can be done to improve on this script..

  • Only do tagging if the lyrics tag does not already exist
  • Integrate with Picard as a real plugin
  • Leverage off of Mutagen more to clean the code up by removing the obvious duplication
  • Add ability to specify a directory instead of a single file
  • The original inspiration for this came from the MPD Hacks page, and I would like to see it integrated back with it in some way (X Window display, conky, etc). This would avoid the constant fetching from the web of the lyrics by looking for a potential copy within the audio file first.