#!/usr/bin/python
# This is -*- Python -*-

import string, sys, xml.sax, xml.sax.handler, re

# NOTE(review): this rebinds the `feature_validation` attribute of the
# xml.sax.handler module to False; it does not call setFeature() on a
# parser.  Presumably intended to switch validation off -- confirm that it
# actually has that effect.
xml.sax.handler.feature_validation = False

# Optional spaces followed by a run of two or more hyphens.
# (Not referenced by the code visible in this file.)
dash_re = re.compile(" *--+")
# Optional spaces followed by three or more periods, each of which may be
# followed by spaces.  (Not referenced by the code visible in this file.)
ellipsis_re = re.compile(" *(\. *){3,}")

# Lower-case tokens that end in a period but are looked up by is_abbrev()
# so that they are not mistaken for the end of a sentence.
abbrev_list = ("dr.", "mr.", "ms.", "mrs.", "m.", "st.")

def is_abbrev(word):
    """Return True when *word*, compared case-insensitively, is one of the
    entries in abbrev_list."""
    lowered = word.lower()
    return lowered in abbrev_list

def is_terminal(word):
    """Return True if *word* ends with sentence-terminating punctuation.

    A token ending in "?" or "!" is terminal.  A token ending in a single
    "." is terminal unless is_abbrev() recognises it; a token ending in two
    or more periods is not treated as terminal here.  An empty token is
    never terminal.
    """
    if not word:
        # Nothing to inspect; indexing word[-1] would raise IndexError.
        return False
    if word.endswith(".") and not word.endswith(".."):
        return not is_abbrev(word)
    return word[-1] in ("?", "!")

def strip_single_quotes(word):
    """Remove leading and trailing single quotes from *word*.

    Whitespace exposed by removing a quote is stripped as well.  A trailing
    quote that directly follows an "s" (as in "dogs'", presumably a plural
    possessive) is left in place.  Returns the cleaned string, which may be
    empty.
    """
    while word.startswith("'"):
        word = word[1:].strip()
    # Remove exactly one character (the quote) per pass.  Slicing off two
    # characters here would also discard the letter before the quote.
    while word.endswith("'") and not word.endswith("s'"):
        word = word[:-1].strip()
    return word

def strip_trailing_punctuation(word):
    """Trim whitespace and any trailing run of . , ; : - ! ? from *word*,
    then hand the result to strip_single_quotes() and return what it gives
    back."""
    trailing_marks = (".", ",", ";", ":", "-", "!", "?")
    cleaned = word.strip()
    # Peel one mark at a time; re-strip so whitespace between marks goes too.
    while cleaned.endswith(trailing_marks):
        cleaned = cleaned[:-1].strip()
    return strip_single_quotes(cleaned)

def split_punctuation(word):
    """Split a token into a ``(word, punctuation)`` pair.

    Surrounding single quotes are removed first via strip_single_quotes().
    If the token then ends in one of , ; : - that character is returned as
    the punctuation; if it ends in "..." the ellipsis is returned.  In both
    cases the remaining text is cleaned with strip_trailing_punctuation().
    Otherwise the punctuation element is None.  The word element may be an
    empty string.
    """
    word = strip_single_quotes(word)
    if not word:
        # The token consisted only of single quotes; indexing word[-1]
        # below would raise IndexError.
        return word, None

    last = word[-1]
    if last in (",", ";", ":", "-"):
        return strip_trailing_punctuation(word[:-1]), last
    if word.endswith("..."):
        return strip_trailing_punctuation(word[:-3]), word[-3:]
    return word, None

class TextScanner:

    def __init__(self):
        pass

    def process_chunk(self, chunk):
        chunk = chunk.strip().lower()
        if not chunk:
            return

        # remove quotes
        chunk = chunk.replace('"', "")

        # remove square brackets
        chunk = chunk.replace("[", "")
        chunk = chunk.replace("]", "")

        # map parens to commas
        chunk = chunk.replace(")", ", ")
        chunk = chunk.replace("(", ", ")

        tokens = chunk.split()

        for token in tokens:
            token = token.strip()
            if token:
                if is_terminal(token):
                    if len(token) > 1:
                        print token[:-1]
                    print "*punct*", token[-1]
                    print "*break*"
                else:
                    word, punc = split_punctuation(token)
                    if word:
                        print word
                    if punc:
                        print "*punct*", punc
        
    def process(self):
        assert 0


class TextScanner_XML(TextScanner,
                      xml.sax.handler.ContentHandler,
                      xml.sax.handler.DTDHandler,
                      xml.sax.handler.EntityResolver,
                      xml.sax.handler.ErrorHandler):

    def __init__(self, filename):
        TextScanner.__init__(self)

        self.__para = 0
        self.__bookbody = 0
        self.__block = 0
        self.__pending = []
        
        self.__filename = filename

    def startElement(self, name, attrs):
        if name == "para":
            self.__para += 1
        elif name == "bookbody":
            self.__bookbody += 1
        elif name == "reference":
            self.__block += 1
        
    def endElement(self, name):
        if name == "para":
            self.__para -= 1
            if  self.__para == 0:
                self.__process_pending()
        elif name == "bookbody":
            self.__bookbody -= 1
        elif name == "reference":
            self.__block -= 1

    def characters(self, content):
        if self.__para > 0 and self.__bookbody > 0 and self.__block == 0:
            line = content.encode("utf-8")
            if line and line != "\n":
                self.__pending.append(line)

    def __process_pending(self):
        if not self.__pending:
            return

        # If a ' is on a line by itself, it almost certainly came
        # from expanding an &apos;
        pend = []
        while self.__pending:
            line = self.__pending.pop(0)
            if line == "'":
                next = ""
                if self.__pending:
                    next = self.__pending.pop(0)
                if not pend:
                    pend = [" "]
                pend[-1] = pend[-1] + "'" + next
            else:
                pend.append(line)

        line = string.join(pend, " ")

        print line


    def process(self):
        parser = xml.sax.make_parser()
        parser.setContentHandler(self)
        parser.parse(self.__filename)

        
# Script entry point: scan the XML file named by the first command-line
# argument.  Guarded so that importing this module does not start a parse.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Report the missing argument instead of failing with IndexError.
        sys.stderr.write("usage: %s XMLFILE\n" % sys.argv[0])
        sys.exit(1)
    scanner = TextScanner_XML(sys.argv[1])
    scanner.process()