# A parser for SGML, using the derived class as static DTD.

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).

# sgmlop support added by fredrik@pythonware.com (April 6, 1998)

import re
import string

# sgmlop is an optional accelerator; when it is not installed the name
# is bound to None so that later code can test for its presence.
try:
    import sgmlop
except ImportError:
    sgmlop = None

# standard entity defs: entity name -> replacement character

ENTITYDEFS = {
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': '\''
    }

# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

# --------------------------------------------------------------------
# original re-based SGML parser

# The two characters that can start markup: '&' or '<'.
interesting = re.compile('[&<]')

# A possibly unfinished construct: '&' optionally followed by a partial
# entity name or '#digits', or '<' optionally followed by a partial
# start tag, end tag or '<!' declaration (no closing bracket yet).
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# '&name' followed by one non-alphanumeric character; group 1 is the name.
entityref = re.compile('&([a-zA-Z][a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' followed by one non-digit character; group 1 is the digits.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter or by '>' (the latter is the empty tag '<>').
starttagopen = re.compile('<[>a-zA-Z]')
# Opening of a short tag: '<name/'.
shorttagopen = re.compile('<[a-zA-Z][a-zA-Z0-9]*/')
# Complete short tag '<name/data/'; groups are (name, data).
shorttag = re.compile('<([a-zA-Z][a-zA-Z0-9]*)/([^/]*)/')
# NOTE(review): this pattern reached us as 'a-zA-Z]' -- the leading
# '</[<>' was evidently lost to markup stripping.  Restored to match the
# standard-library sgmllib; confirm against the upstream file.
endtagopen = re.compile('</[<>a-zA-Z]')
# Either angle bracket.
endbracket = re.compile('[<>]')
# NOTE(review): this pattern reached us as ']*>' -- restored to the
# '<!...>' declaration form used by the standard-library sgmllib; confirm.
special = re.compile('<![^<>]*>')
# NOTE(review): the pattern text was cut off after the opening quote.
# '<!--' is the comment opener used by the standard-library sgmllib;
# confirm against the upstream file.
commentopen = re.compile('<!--')