# Parse weather.html with html5lib, walk the resulting DOM tree as a token
# stream, and print every token except those whose 'name' is in ``passtags``.
import sys

import html5lib
from html5lib import treebuilders, treewalkers, serializer
from html5lib.filters import sanitizer

# Parser configured to build a DOM tree.
p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

# The context manager closes the file even if parsing raises; the original
# open()/close() pair leaked the handle in that case.
with open("weather.html", "r") as f:
    dom_tree = p.parse(f)

# Tree walker that yields the DOM tree as a sequence of tokens.
walker = treewalkers.getTreeWalker("dom")
stream = walker(dom_tree)

# Tag names whose tokens are skipped (not printed) by the loop below.
passtags = [
    u'a', u'h1', u'h2', u'h3', u'h4', u'em', u'strong', u'img',
    u'dl', u'dt', u'dd',
]

for token in stream:
    # `in` replaces dict.has_key(), which is deprecated and absent in
    # Python 3. Tokens without a 'name' key are never skipped.
    if 'name' in token and token['name'] in passtags:
        continue
    # NOTE(review): the original line breaks were lost; this assumes the
    # print sat at loop level, so tokens without a 'name' are printed too.
    # Confirm against the intended output.
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(token)