## IMPORTS

import re
from collections import Counter
from math import sqrt


## TOKENIZER

# Matches any single character that is neither a word character nor whitespace.
_NON_WORD_CHARS = re.compile(r"[^\w\s]")


def get_words(text):
    """Return the words of ``text`` as a list of uppercase strings.

    Characters that are neither word characters nor whitespace are deleted
    (not replaced by a space), so "it's" becomes "ITS" and "well-known"
    becomes "WELLKNOWN". The remainder is split on runs of whitespace.
    """
    return [word.upper() for word in _NON_WORD_CHARS.sub("", text).split()]


## GENERAL DATA STRUCTURE UTILITIES

def occurrences(items):
    """Return a dictionary mapping each distinct item to its occurrence count.

    ``items`` is any iterable of hashable values. Counting is done in a
    single pass; an empty input yields an empty dictionary.
    """
    return dict(Counter(items))


## Z VALUE CALCULATION

def z_values(text, overall):
    """Calculate a z value for each distinct word of ``text`` relative to ``overall``.

    ``text`` and ``overall`` are lists of words. For every distinct word in
    ``text`` the proportion of ``text`` it makes up (p_hat) is compared with
    the proportion of ``overall`` it makes up (p_0):

        z = (p_hat - p_0) / sqrt(p_0 * (1 - p_0) / len(text))

    Returns a dictionary mapping word to z value.

    Raises KeyError if a word of ``text`` does not occur in ``overall``.
    """
    text_counts = occurrences(text)
    overall_counts = occurrences(overall)
    n_text = len(text)
    n_overall = len(overall)

    result = {}
    for word, count in text_counts.items():
        p_hat = float(count) / n_text
        p_0 = float(overall_counts[word]) / n_overall
        std_error = sqrt((p_0 * (1 - p_0)) / n_text)
        if std_error == 0:
            # Only possible when p_0 == 1, i.e. ``overall`` consists of a
            # single distinct word. ``text`` can then only contain that same
            # word (anything else raises KeyError above), so p_hat == p_0 and
            # there is no deviation to report; avoid dividing zero by zero.
            result[word] = 0.0
        else:
            result[word] = (p_hat - p_0) / std_error
    return result


## Z VALUE FILTERING

def filter_z(z_values, threshold):
    """Return the (item, z) pairs that lie beyond ``threshold``, most extreme first.

    The sign of ``threshold`` selects the direction:

    * positive: keep items with z strictly greater than ``threshold``,
      sorted by descending z;
    * negative: keep items with z strictly less than ``threshold``,
      sorted by ascending z.

    ``z_values`` is a dictionary mapping item to z value. The result is a
    list of ``(item, z)`` tuples.

    Raises ValueError if ``threshold`` is zero, because the direction would
    be undefined.
    """
    if threshold == 0:
        raise ValueError("threshold must be non-zero: its sign selects the direction")

    above = threshold > 0
    if above:
        selected = [(item, z) for item, z in z_values.items() if z > threshold]
    else:
        selected = [(item, z) for item, z in z_values.items() if z < threshold]

    # Sorting is stable in both directions, so items with equal z values keep
    # the relative order in which the dictionary yielded them.
    selected.sort(key=lambda pair: pair[1], reverse=above)
    return selected


## RUN

file_list = ["chapter1.txt", "chapter2.txt", "chapter3.txt", "chapter4.txt", "chapter5.txt"]


def read_words(filename):
    """Read ``filename`` and return its contents as a list of uppercase words."""
    # The context manager closes the file even if reading fails.
    with open(filename) as handle:
        return get_words(handle.read())


def main():
    """Print the unusually frequent and unusually rare words of each file in ``file_list``.

    Every file is compared against the concatenation of all files. Words
    with z > 4 are reported as unusually frequent and words with z < -2 as
    unusually rare.
    """
    text_list = [(filename, read_words(filename)) for filename in file_list]

    overall = []
    for _, text in text_list:
        overall += text

    for filename, text in text_list:
        z = z_values(text, overall)
        unusually_frequent = filter_z(z, 4)
        unusually_rare = filter_z(z, -2)

        # print("") rather than print(): with a single string argument the
        # call also parses as a Python 2 print statement and emits the same
        # blank line.
        print("")
        print("Unusually frequent in %s" % filename)
        print("----------------------" + ("-" * len(filename)))
        print("\n".join(["%-30s %s" % pair for pair in unusually_frequent]))

        print("")
        print("Unusually rare in %s" % filename)
        print("------------------" + ("-" * len(filename)))
        print("\n".join(["%-30s %s" % pair for pair in unusually_rare]))


if __name__ == "__main__":
    main()