# CAUTION: for teaching purposes only
# You should use LIBSVM from http://www.csie.ntu.edu.tw/~cjlin/libsvm/
#
# Demonstration script: count words in a "spam" and a "ham" mailbox, use the
# most frequent words of each as a descriptor vector, train Bio.SVM on the
# first messages of both mailboxes and classify one further message of each.

import string
import mailbox
import itertools

from Bio import SVM

VERBOSE = 1

spam_filename = "/Users/dalke/Library/Mail/Mailboxes/spam.mbox/mbox"
ham_filename = \
    "/Users/dalke/Library/Mail/POP-dalke@mail230.pair.com/INBOX.mbox/mbox"

# convert everything except letters and digits into spaces
transtable = (" "*ord("0") +
              "0123456789" +
              " "*(ord("A")-ord("9")-1) +
              "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
              " "*(ord("a")-ord("Z")-1) +
              "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +  # convert lowercase into upper
              " "*(255-ord("z")))
assert len(transtable) == 256, len(transtable)


def get_counts(infile):
    """Return a dict mapping each word in the start of infile to its count.

    At most 10000 characters are read from infile.  The text is passed
    through transtable (non-alphanumerics become spaces, lowercase becomes
    uppercase) and split on whitespace; only words longer than 2 and at
    most 10 characters are counted.
    """
    text = infile.read(10000)
    counts = {}
    for word in text.translate(transtable).split():
        if 2 < len(word) <= 10:
            counts[word] = counts.get(word, 0) + 1
    return counts


def add_counts(d1, d2):
    """Return a new dict holding the per-key sums of d1 and d2.

    Neither input dictionary is modified.
    """
    merged = d1.copy()
    for key, value in d2.items():
        merged[key] = merged.get(key, 0) + value
    return merged


def get_mailbox_counts(mbox, max_messages=201):
    """Return the combined word counts of the leading messages in mbox.

    mbox is iterated and get_counts() is applied to the .fp of each message.
    At most max_messages messages are consumed; the default of 201 matches
    the original loop, which stopped after the message with index 200.
    """
    totals = {}
    for msg in itertools.islice(mbox, 0, max_messages):
        # Merge in place: copying the running total for every message (as
        # add_counts() would) makes the accumulation quadratic.
        for word, count in get_counts(msg.fp).items():
            totals[word] = totals.get(word, 0) + count
    return totals


def _count_mailbox_file(filename):
    """Open the mbox file at filename, count its words and close the file."""
    mbox_file = open(filename)
    try:
        return get_mailbox_counts(mailbox.PortableUnixMailbox(mbox_file))
    finally:
        mbox_file.close()


spam_counts = _count_mailbox_file(spam_filename)
ham_counts = _count_mailbox_file(ham_filename)


def print_total_info(spam_counts, ham_counts):
    """Print a table of every counted word with its ham and spam counts."""
    total = add_counts(spam_counts, ham_counts)
    all_words = list(total.keys())
    all_words.sort()  # sorted alphabetically
    print(" Counts ")
    print(" Word Ham Spam")
    print("---------- ----- ------")
    for word in all_words:
        print("%s %6d %6d" % (word.rjust(10),
                              ham_counts.get(word, 0),
                              spam_counts.get(word, 0)))


# Debugging aid, deliberately switched off by the leading 0.
if 0 and VERBOSE:
    print_total_info(spam_counts, ham_counts)


# Get the top N words in each data set
def get_top_N(d, N):
    """Return the N highest-valued (key, value) pairs of d, largest first.

    Pairs are ordered by descending value; equal values are ordered by
    descending key.
    """
    ranked = [(value, key) for (key, value) in d.items()]
    ranked.sort()
    ranked.reverse()
    return [(key, value) for (value, key) in ranked[:N]]


top_N = 10
top_ham = get_top_N(ham_counts, top_N)
top_spam = get_top_N(spam_counts, top_N)

if VERBOSE:
    # Take a look at the data
    print("Top %s words in ham" % top_N)
    for word, count in top_ham:
        print("%s %6d" % (word.rjust(10), count))
    print("Top %s words in spam" % top_N)
    for word, count in top_spam:
        print("%s %6d" % (word.rjust(10), count))

# I don't know if I really need to scale the input so the
# expected values are in the range [0,1]
d = add_counts(dict(top_ham), dict(top_spam))
# Same ordering as get_top_N: descending count, then descending word.
descriptor_data = get_top_N(d, len(d))
descriptor_names = [name for (name, count) in descriptor_data]
num = float(len(descriptor_names))
descriptor_scale = [count/num for (name, count) in descriptor_data]
descriptor_data = list(zip(descriptor_names, descriptor_scale))

if VERBOSE:
    print("Using descriptor vector of size %s" % len(descriptor_data))
    for (name, scale) in descriptor_data:
        print("%s %3.2f" % (name, scale))


def compute_descriptors(msg, descriptor_data):
    """Return the descriptor vector of msg as a list of floats.

    descriptor_data is a sequence of (word, scale) pairs.  Each entry of the
    result is the count of that word in msg.fp (see get_counts) divided by
    its scale, capped at 1.0.
    """
    counts = get_counts(msg.fp)
    return [min(counts.get(name, 0)/scale, 1.0)
            for (name, scale) in descriptor_data]


def get_training_set(mbox, descriptor_data, N=95):
    """Return the descriptor vectors of the first N messages in mbox."""
    return [compute_descriptors(msg, descriptor_data)
            for msg in itertools.islice(mbox, 0, N)]


# These files stay open past training: the tests below pull one more message
# from each mailbox.  They are closed at the end of the script.
ham_file = open(ham_filename)
ham_mbox = mailbox.PortableUnixMailbox(ham_file)
ham_training_set = get_training_set(ham_mbox, descriptor_data)

spam_file = open(spam_filename)
spam_mbox = mailbox.PortableUnixMailbox(spam_file)
spam_training_set = get_training_set(spam_mbox, descriptor_data)

# Force some speedups by using Numeric's implementations
import Numeric
import multiarray
SVM._dot = SVM.LinearKernel.__call__ = multiarray.matrixproduct
training_set = Numeric.array(ham_training_set + spam_training_set,
                             Numeric.Float)

# Use -1 for spam, 1 for ham
results = Numeric.array(
    [1] * len(ham_training_set) + [-1] * len(spam_training_set),
    Numeric.Float)

ham_spam_svm = SVM.train(training_set, results)

print("\n============= HAM TESTS")
print("First ham in training set: %s"
      % (SVM.classify(ham_spam_svm, ham_training_set[0]),))

# The next message after the ones consumed for training.
msg = ham_mbox.next()
desc = compute_descriptors(msg, descriptor_data)
print("Unknown ham: %s" % (SVM.classify(ham_spam_svm, desc),))
print("="*60)
msg.rewindbody()
print(msg.fp.read(1000))
print("="*60)

print("\n\n============= SPAM TESTS")
print("First spam in training set: %s"
      % (SVM.classify(ham_spam_svm, spam_training_set[0]),))

msg = spam_mbox.next()
desc = compute_descriptors(msg, descriptor_data)
print("Unknown spam: %s" % (SVM.classify(ham_spam_svm, desc),))
print("="*60)
msg.rewindbody()
print(msg.fp.read(1000))
print("="*60)

ham_file.close()
spam_file.close()