#!/usr/bin/python2.3
# This is -*- Python -*-

import sys
import string

# Extend the module search path with the parent directory before importing
# gnoetics (presumably that is where the module lives -- confirm).
sys.path.append("..")
import gnoetics


def _ratio(numerator, denominator):
    """Return numerator/denominator as a float, or 0.0 if denominator is 0."""
    if not denominator:
        return 0.0
    return numerator / float(denominator)


def _top_share(counts, fraction, total):
    """Return the percentage of `total` covered by the leading counts.

    `counts` must be sorted in descending order; the first
    int(len(counts) * fraction) entries are summed.
    """
    k = int(len(counts) * fraction)
    return 100 * _ratio(sum(counts[:k]), total)


def text_stats(txt):
    """Print a one-line vocabulary summary for a single text.

    `txt` must provide get_length(), get_token(i) and get_title(); each
    token must provide is_punctuation(), get_word() and in_dictionary().

    Columns printed, in order:
      * title (a leading "the " is dropped, long titles are cut to 20 chars)
      * number of non-punctuation tokens
      * number of distinct words
      * non-punctuation tokens per distinct word
      * non-punctuation tokens per punctuation token
      * percentage of distinct words whose first occurrence is reported
        as in_dictionary()
      * percentage of non-punctuation tokens accounted for by the most
        frequent 10% and 5% of distinct words

    Any ratio whose denominator is zero (empty text, text without
    punctuation, ...) is reported as 0 instead of raising
    ZeroDivisionError.
    """
    N = txt.get_length()

    uniq = {}        # word -> number of occurrences
    known = 0        # distinct words found in the dictionary
    non_punct = 0    # tokens that are not punctuation

    for i in xrange(N):
        t = txt.get_token(i)
        if t.is_punctuation():
            continue
        non_punct += 1
        word = t.get_word()
        seen = uniq.get(word, 0)
        # Only ask the dictionary once per distinct word.
        if seen == 0 and t.in_dictionary():
            known += 1
        uniq[word] = seen + 1

    n = len(uniq)
    r = _ratio(non_punct, n)
    kp = 100 * _ratio(known, n)
    punct_r = _ratio(non_punct, N - non_punct)

    # Only the occurrence counts matter for the "top x%" shares, so sort
    # the counts alone, largest first (no key=/sorted() in Python 2.3).
    counts = list(uniq.values())
    counts.sort()
    counts.reverse()

    p10 = _top_share(counts, 0.10, non_punct)
    p05 = _top_share(counts, 0.05, non_punct)

    title = txt.get_title()
    if title[:4].lower() == "the ":
        title = title[4:]
    if len(title) > 20:
        title = title[:17] + "..."

    # Single parenthesised argument: identical output under Python 2.
    print("%20s | %6d %5d | %4.1f | %3.1f | %2.0f%% | %2.0f%% %2.0f%%"
          % (title, non_punct, n, r, punct_r, kp, p10, p05))


lib = gnoetics.Library("../texts-ts")
for txt in lib:
    text_stats(txt)