## TOKENIZER

def get_words(text):
    """Split a text into a list of uppercase words.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is deleted first, so "it's" becomes "ITS".
    The remaining text is split on runs of whitespace and each piece is
    uppercased. An empty or all-punctuation text yields an empty list.
    """
    import re

    without_punctuation = re.sub(r"[^\w\s]", "", text)

    words = []
    for token in without_punctuation.split():
        words.append(token.upper())
    return words


## GENERAL DATA STRUCTURE UTILITIES

from sets import Set

def occurrences(items):
    """Given a list of items, return a dictionary mapping item to occurrence count.

    Items must be hashable because they are used as dictionary keys.
    An empty list yields an empty dictionary.
    """
    # Count in a single pass. Calling items.count() once per distinct item
    # rescans the whole list each time, which is quadratic for texts with
    # many distinct words.
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return counts


## Z VALUE CALCULATION

from math import sqrt

def z_values(text, overall):
    """Compute a z value for every distinct word of `text` against `overall`.

    `text` and `overall` are lists of words. For each distinct word in
    `text` the result maps the word to

        (p_hat - p_0) / sqrt(p_0 * (1 - p_0) / len(text))

    where p_hat is the word's share of `text` and p_0 is its share of
    `overall`. An empty `text` yields an empty dictionary.

    Raises KeyError if a word of `text` does not occur in `overall`, and
    ZeroDivisionError if a word makes up all of `overall` (p_0 == 1), since
    the denominator is then zero.
    """
    text_counts = occurrences(text)
    overall_counts = occurrences(overall)
    text_size = len(text)
    overall_size = len(overall)

    def z_score(word):
        # float() keeps the divisions fractional under Python 2 integer division.
        observed = float(text_counts[word]) / text_size
        expected = float(overall_counts[word]) / overall_size
        standard_error = sqrt((expected * (1 - expected)) / text_size)
        return (observed - expected) / standard_error

    return dict((word, z_score(word)) for word in text_counts)


## Z VALUE FILTERING

def filter_z(z_values, threshold):
    """Return only those items that meet the given z value threshold.

    `z_values` is a dictionary mapping item to z value. The sign of
    `threshold` selects the direction of the filter:

    * threshold >= 0: keep items whose z value is strictly greater than the
      threshold, sorted from highest to lowest z value.
    * threshold < 0: keep items whose z value is strictly less than the
      threshold, sorted from lowest to highest z value.

    The result is a list of (item, z value) tuples, most extreme first.
    Items with equal z values keep the dictionary's iteration order.
    A threshold of 0 is treated as non-negative (it used to raise
    ZeroDivisionError while working out the sign).
    """
    keep_low = threshold < 0

    if keep_low:
        kept = [(item, z) for (item, z) in z_values.items() if z < threshold]
    else:
        kept = [(item, z) for (item, z) in z_values.items() if z > threshold]

    # Key-based sorting avoids cmp(), which Python 3 removed; reverse=True
    # still keeps the relative order of items with equal z values.
    kept.sort(key=lambda pair: pair[1], reverse=not keep_low)
    return kept


## RUN

file_list = ["chapter1.txt", "chapter2.txt", "chapter3.txt", "chapter4.txt", "chapter5.txt"]


def _read_words(filename):
    """Return the words of the named file, closing the file once it is read."""
    with open(filename) as handle:
        return get_words(handle.read())


def _print_report(heading, pairs):
    """Print a blank line, the underlined heading, then one "item z" row per pair."""
    # Single-argument print(...) calls give the same output whether print is
    # a statement or a function.
    print("")
    print(heading)
    print("-" * len(heading))
    print("\n".join(["%-30s %s" % pair for pair in pairs]))


# One (filename, words) entry per chapter, in file_list order.
text_list = [(filename, _read_words(filename)) for filename in file_list]

# The reference corpus is the concatenation of every chapter's words.
overall = []
for (filename, text) in text_list:
    overall += text


for (filename, text) in text_list:
    z = z_values(text, overall)

    # The thresholds are asymmetric: z > 4 counts as frequent, z < -2 as rare.
    unusually_frequent = filter_z(z, 4)
    unusually_rare = filter_z(z, -2)

    _print_report("Unusually frequent in %s" % filename, unusually_frequent)
    _print_report("Unusually rare in %s" % filename, unusually_rare)