#!/usr/bin/python
#
# Demonstration script: parse a fragment of HTML into a DOM tree, dump that
# tree first as HTML and then as XML, and finally list the hyperlinks in the
# document by asking it for all of its A elements through
# getElementsByTagName().
#
# NOTE(review): HTML_DATA holds non-ASCII characters; depending on the
# interpreter version a source-encoding declaration (PEP 263) matching the
# file's real byte encoding may be required -- confirm before adding one.

from xml.dom.html_builder import HtmlBuilder
from xml.dom.writer import HtmlWriter
from xml.dom import core

# Sample input handed to the builder.
# NOTE(review): as it stands the sample contains no tags at all, so no A
# elements would be found in it -- presumably the markup was lost somewhere;
# confirm against the intended sample.
HTML_DATA = """ Les HOWTO Linux

Les HOWTO Linux

Les Howto que vous trouverez ci-dessous sont en français. Ils peuvent etre trouvés dans les formats suivants sur le site ftp.lip6.fr dans le répertoire /pub/linux/french/docs/HOWTO :

"""

# Build the tree: create a builder and push the whole sample through it.
b = HtmlBuilder()
b.feed(HTML_DATA)

# The finished tree is exposed as the builder's `document` attribute.
doc = b.document

# --- Rendering 1: HTML, produced by a writer object ------------------------
# Single-argument parenthesised prints read the same whether `print` is a
# statement or a function.
print("============")
print("HTML version")
w = HtmlWriter()
w.write(b.document)

# --- Rendering 2: XML, produced by the document itself ---------------------
print("\n===========")
print("XML version")
print(doc.toxml())

# --- Hyperlink listing -----------------------------------------------------
print("\n===========")
print("Links in the document")

# Every A element in the document.
links = doc.getElementsByTagName('A')
for anchor in links:
    # Only Text nodes that are *direct* children of the A element are
    # gathered here; text wrapped in further markup inside the link is
    # missed.  Walking all of the element's descendants would be needed to
    # pick that up as well.
    label = "".join([kid.value for kid in anchor.childNodes
                     if kid.nodeType == core.TEXT_NODE])

    # Anchors whose HREF attribute comes back as the empty string are
    # skipped; the rest are reported with their visible text.
    href = anchor.getAttribute('HREF')
    if href == "":
        continue
    print("HREF= %s %s" % (href, label))

# Finally, show the collection object returned by getElementsByTagName().
print(links)