#! /bin/env python ## Network Programming Project 3 ## HTML parser - extract the 5-day and detailed weather forcast from ## www.wunderground.com import html5lib, sys, re from html5lib import treebuilders, treewalkers import urllib, urllib2 def wordWrap( s, n ): ## Wrap lines that are too long by inserted line feeds between words. L = len( s ) if L <= n: return s words = s.split( " " ) ret = "" ch = 0 for i in words: if (ch + len(i) < n ): ret += " " + i ch += len(i) else: ret += "\n" + i ch = len(i) return ret def testGet(x, i): """A List function: return x[i] if it exists, else empty string""" if i < len(x): return x[i] return '' class mytable: def __init__(self): self.nrows = 0 self.cols = 0 self.col = 0 self.rows = [] self.txt = '' def trTag( self ): # new row self.row = self.nrows self.nrows = self.nrows + 1 self.rows.append( [] ) def trEnd( self ): self.col = 0 def tdTag( self ): # new col if self.col == self.cols: self.cols = self.cols + 1 self.rows[self.row].append( [] ) def tdEnd(self ): self.col = self.col + 1 def addData( self, data ): self.txt += data self.rows[self.row][self.col].append( data ) def txtMatch( self, st ): """Boolean: Determine if table data holds a desired string""" m = re.search( st, self.txt) if m is None: return False else: return True def txtLen( self ): return len(self.txt) def getTableData( self ): return self.rows def printTable( self ): """A generic table printer, depending on the data, it may or or may not get the job done. See getTableData if not.""" for x in self.rows: for y in x: for z in y: print "%s\n" % wordWrap( z, 60 ) ## End of mytable Class def printWUTable( tbl ): if tbl.txtMatch( "State Extremes" ): return if tbl.txtMatch( "Quarter" ): return if tbl.txtMatch( "Blog" ): return if tbl.txtMatch( "Calendar" ): return if tbl.txtMatch( "Statement" ): return ## Any forecast has the word 'day' in it if tbl.txtMatch( "day" ): ## print "Generic Print" ## tbl.printTable() data = tbl.getTableData() ## if tbl.txtMatch( "Updated" ): ## printDetailed( data ) ## return tblLen = tbl.txtLen() #print "table length: %d" % tblLen if tblLen < 500: # determing which table it is by length may # more reliable than by content -- they keep changing # the web page content print5Day( data ) return printDetailed( data ) def printDetailed( data ): global detailedShown if detailedShown: return # already did it -- stop random printing detailedShown = True print "Detailed Forecast:" if len( data[0] ) > 1: print "\t%s\n" %''.join( data[0][1] ) for day in data[1:]: if len(day) > 1: forecast = day[1] print "%s:" % forecast[0] print "%s\n" % wordWrap( forecast[1], 60 ) def print5Day( data ): # quick 5 day forecast #print data global fiveDayShown if fiveDayShown: return # already did it -- stop random printing fiveDayShown = True print "5 Day Forecast" ## fiveDay is the data table transformed ## With data transformed, it will be easier to print based ## on the day to day forcast. ## This uses a nested list comprehension cols = 5 fiveDay = [[testGet(x,i) for x in data] for i in range(cols)] #print fiveDay for day in fiveDay: print "%s:" % ''.join( day[0] ) temps = re.sub( ' \xb0 ', u'\xb0', ' '.join( day[1] )) print temps for forecast in day[2:]: fo = ' '.join( forecast ) if re.search( 'Hourly', fo ): continue print fo sys.stdout.write( '\n' ) # main code zipcode = '67401' url = "http://www.wunderground.com/cgi-bin/findweather/getForecast" data = urllib.urlencode([('query', zipcode)]) req = urllib2.Request(url) p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom")) dom_tree = p.parse(urllib2.urlopen(req, data).read()) walker = treewalkers.getTreeWalker("dom") stream = walker(dom_tree) maintags = [ u'html', u'head', u'body', u'title' ] tabletags = [ u'table', u'tr', u'td' ] listtags = [ u'ul', u'li', u'ol' ] passtags = [ u'a', u'h1', u'h2', u'h3', u'h4',u'em', u'strong', u'br', \ u'img', u'dl', u'dt', u'dd' ] fiveDayShown = False detailedShown = False doingTable = False tables = [] # A stack of tables for nested tables for token in stream: if token.has_key('name'): if token['name'] in passtags: continue else: tName = token[ 'name' ] tType = token[ 'type' ] if tType == 'StartTag': if tName in tabletags: if tName == u'table': tbl = mytable() tables.append( tbl ) doingTable = True else: if tName == u'tr': tbl.trTag() else: tbl.tdTag() continue if tType == 'EndTag': if tName in tabletags: if tName == u'table': printWUTable( tbl ) if len( tables ): tbl = tables.pop() else: doingTable = False else: if tName == u'tr': tbl.trEnd() else: tbl.tdEnd() continue if tType == 'Characters': if doingTable: tbl.addData( token['data'] ) raw_input("Press Enter to Exit")