#!/usr/bin/env python3
# This is -*- Python -*-
"""Heuristic sentence tokenizer: reads text (optionally carrying an
@BEGIN_HEADER/@END_HEADER block), strips light HTML and stray punctuation,
emits a stream of word, *punct*, and *break* tokens, and drops sentences
that look unreasonable."""

import re
import sys

dash_re = re.compile(r" *--+")
ellipsis_re = re.compile(r" *(\. *){3,}")

abbrev_list = ("dr.", "mr.", "ms.", "mrs.", "m.", "st.")

def is_abbrev(word):
    return word.lower() in abbrev_list

def is_terminal(word):
    # A word ends a sentence if it ends in a single "." (not an ellipsis
    # and not a known abbreviation), or in "?" or "!".
    if word[-1] == "." and (len(word) == 1 or word[-2] != "."):
        return not is_abbrev(word)
    return word[-1] in ("?", "!")

def strip_single_quotes(word):
    while word and word[0] == "'":
        word = word[1:].strip()
    # Keep the trailing quote of a possessive plural ("dogs'").
    while word and word[-1] == "'" and word[-2:] != "s'":
        word = word[:-1].strip()
    return word

def strip_trailing_punctuation(word):
    word = word.strip()
    while word and word[-1] in (".", ",", ";", ":", "-", "!", "?"):
        word = word[:-1].strip()
    return strip_single_quotes(word)

def split_punctuation(word):
    # Split a token into (word, trailing-punctuation-or-None).
    word = strip_single_quotes(word)
    if word == "":
        return word, None
    if word[-1] in (",", ";", ":", "-"):
        return strip_trailing_punctuation(word[:-1]), word[-1]
    elif word[-3:] == "...":
        return strip_trailing_punctuation(word[:-3]), word[-3:]
    else:
        return word, None

def punct_token(p):
    return "*punct* %s" % p

def is_punct_token(t):
    return t.startswith("*punct*")

def break_token():
    return "*break*"

###############################################################################

basic_tags = ("p", "br", "b", "i", "h1", "h2", "h3", "h4",
              "strong", "html", "head", "body", "title", "meta")

entity_re = re.compile("&[A-Za-z0-9]+;")

def process_line(line):
    line = line.strip().lower()
    if not line:
        return []
    line_tokens = []
    # strip out quotes and other misc. punctuation
    line = line.replace('"', "")
    line = line.replace('`', "")
    line = line.replace('*', "")
    line = line.replace('_', "")
    # strip out basic html (opening and closing tags, plus entities)
    for tag in basic_tags:
        line = line.replace("<%s>" % tag, "")
        line = line.replace("</%s>" % tag, "")
    line = entity_re.sub("", line)
    # Better handling for adjacent punctuation and single-quotes
    line = line.replace(",'", ",' ")
    line = line.replace(".'", ".' ")
    line = line.replace("?'", "?' ")
    line = line.replace("!'", "!' ")
    # remove single quotes, but only on word boundaries
    i = 0
    while i != -1:
        i = line.find("'", i)
        if i != -1:
            if (i == 0 or line[i-1] == " ") or \
               (i == len(line)-1 or line[i+1] == " "):
                line = line[:i] + line[i+1:]
            else:
                i += 1
    # map -- to commas, in a semi-intelligent way
    if line.find("--") != -1:
        line = dash_re.sub(", ", line)
    # do something smart with ellipses
    if line.find(". .") != -1:
        line = ellipsis_re.sub("... ", line)
    # remove square brackets
    line = line.replace("[", "")
    line = line.replace("]", "")
    # map parens to commas
    line = line.replace(")", ", ")
    line = line.replace("(", ", ")
    tokens = line.split()
    for token in tokens:
        token = token.strip()
        if token:
            if is_terminal(token):
                if len(token) > 1:
                    line_tokens.append(token[:-1])
                line_tokens.append(punct_token(token[-1]))
                line_tokens.append(break_token())
            else:
                word, punc = split_punctuation(token)
                if word:
                    line_tokens.append(word)
                if punc:
                    line_tokens.append(punct_token(punc))
    return line_tokens

###############################################################################
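# Worked example (an illustrative trace, not from the original source):
# process_line('He said, "Hello, Dr. Smith!"') lowercases the line, drops
# the double quotes, splits trailing commas off as *punct* tokens, keeps
# the abbreviation "dr." intact, and emits a *break* after the terminal
# "!", yielding:
#
#   ['he', 'said', '*punct* ,', 'hello', '*punct* ,',
#    'dr.', 'smith', '*punct* !', '*break*']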
def sentence_looks_reasonable(sentence):
    # We don't like short sentences.
    if len(sentence) < 5:
        return False
    # We don't like leading punctuation.
    if is_punct_token(sentence[0]):
        return False
    # We don't like a lack of trailing punctuation.
    if not is_punct_token(sentence[-1]):
        return False
    # We don't like weird one-character words.
    for word in sentence:
        if len(word) == 1 and word not in "aio0123456789&":
            return False
    # We don't like adjacent punctuation.
    for i in range(len(sentence)-1):
        if is_punct_token(sentence[i]) and is_punct_token(sentence[i+1]):
            return False
    return True

def clean_sentence(sentence):
    sentence = list(sentence)
    # Remove adjacent duplicate punctuation
    i = 0
    while i < len(sentence)-1:
        if is_punct_token(sentence[i]) and sentence[i] == sentence[i+1]:
            sentence.pop(i)
        else:
            i += 1
    # Merge hyphenated words
    i = 0
    while i < len(sentence)-2:
        if is_punct_token(sentence[i+1]) and sentence[i+1][-1] == "-":
            sentence[i] = sentence[i] + "-" + sentence[i+2]
            sentence.pop(i+1)
            sentence.pop(i+1)
        else:
            i += 1
    return sentence

def tokens_to_sentences(token_list):
    brk = break_token()
    sentences = []
    this_sentence = []
    for t in token_list:
        if t == brk:
            if this_sentence:
                sentences.append(clean_sentence(this_sentence))
                this_sentence = []
        else:
            this_sentence.append(t)
    return sentences

def write_sentence_as_tokens(sentence):
    for t in sentence:
        print(t)
    print(break_token())

###############################################################################

if len(sys.argv) > 1:
    f = open(sys.argv[1])
else:
    f = sys.stdin

tokens = []
header = []
in_header = False
for line in f:
    line = line.strip()
    if line.startswith("@BEGIN_HEADER"):
        in_header = True
    elif line.startswith("@END_HEADER"):
        in_header = False
    elif in_header:
        header.append(line)
    else:
        tokens.extend(process_line(line))

sentences = tokens_to_sentences(tokens)
sys.stderr.write("Found %d sentences.\n" % len(sentences))
if not sentences:
    sys.exit(0)

good_sentences = []
bad_sentences = []
for s in sentences:
    if sentence_looks_reasonable(s):
        good_sentences.append(s)
    else:
        bad_sentences.append(s)

bad_p = 100.0 * len(bad_sentences) / len(sentences)
sys.stderr.write("Dropping %d suspicious sentences. (%.1f%%)\n"
                 % (len(bad_sentences), bad_p))

for line in header:
    print(line)

if len(good_sentences) > 10:
    for s in good_sentences:
        write_sentence_as_tokens(s)
else:
    sys.stderr.write("Ignoring: Too few good sentences.\n")
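# Example invocation (a sketch; the script and file names are hypothetical,
# and the counts shown are illustrative):
#
#   $ python3 tokenize_sentences.py corpus.txt > tokens.txt
#   Found 120 sentences.
#   Dropping 14 suspicious sentences. (11.7%)
#
# Progress messages go to stderr; the header (if any) and the token stream
# go to stdout. An input file may optionally carry a verbatim header block
# that is passed through ahead of the tokens:
#
#   @BEGIN_HEADER
#   title: some document
#   @END_HEADER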