ColombianTeamAnalysis

Attachment 'BeautifulSoup.py'

Download

   1 """Beautiful Soup
   2 Elixir and Tonic
   3 "The Screen-Scraper's Friend"
   4 http://www.crummy.com/software/BeautifulSoup/
   5 
   6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
   7 tree representation. It provides methods and Pythonic idioms that make
   8 it easy to navigate, search, and modify the tree.
   9 
  10 A well-formed XML/HTML document yields a well-formed data
  11 structure. An ill-formed XML/HTML document yields a correspondingly
  12 ill-formed data structure. If your document is only locally
  13 well-formed, you can use this library to find and process the
  14 well-formed part of it. The BeautifulSoup class 
  15 
  16 Beautiful Soup works with Python 2.2 and up. It has no external
  17 dependencies, but you'll have more success at converting data to UTF-8
  18 if you also install these three packages:
  19 
  20 * chardet, for auto-detecting character encodings
  21   http://chardet.feedparser.org/
  22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
  23   by stock Python.
  24   http://cjkpython.i18n.org/
  25 
  26 Beautiful Soup defines classes for two main parsing strategies:
  27     
  28  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  29    language that kind of looks like XML.
  30 
  31  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  32    or invalid. This class has web browser-like heuristics for
  33    obtaining a sensible parse tree in the face of common HTML errors.
  34 
  35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
  36 the encoding of an HTML or XML document, and converting it to
  37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
  38 
  39 For more than you ever wanted to know about Beautiful Soup, see the
  40 documentation:
  41 http://www.crummy.com/software/BeautifulSoup/documentation.html
  42 
  43 """
  44 from __future__ import generators
  45 
  46 __author__ = "Leonard Richardson (leonardr@segfault.org)"
  47 __version__ = "3.0.4"
  48 __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
  49 __license__ = "PSF"
  50 
  51 from sgmllib import SGMLParser, SGMLParseError
  52 import codecs
  53 import types
  54 import re
  55 import sgmllib
  56 try:
  57   from htmlentitydefs import name2codepoint
  58 except ImportError:
  59   name2codepoint = {}
  60 
# This hack makes Beautiful Soup able to parse XML with namespaces:
# the tag-name pattern installed here accepts ':' (as well as '-', '_'
# and '.') after the first letter of a tag name.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# Default value of the `encoding` argument of the rendering methods
# (__str__, __repr__, prettify, renderContents) defined below.
DEFAULT_OUTPUT_ENCODING = "utf-8"
  65 
  66 # First, the classes that represent markup elements.
  67 
  68 class PageElement:
  69     """Contains the navigational information for some part of the page
  70     (either a tag or a piece of text)"""
  71 
  72     def setup(self, parent=None, previous=None):
  73         """Sets up the initial relations between this element and
  74         other elements."""        
  75         self.parent = parent
  76         self.previous = previous
  77         self.next = None
  78         self.previousSibling = None
  79         self.nextSibling = None
  80         if self.parent and self.parent.contents:
  81             self.previousSibling = self.parent.contents[-1]
  82             self.previousSibling.nextSibling = self
  83 
  84     def replaceWith(self, replaceWith):        
  85         oldParent = self.parent
  86         myIndex = self.parent.contents.index(self)
  87         if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
  88             # We're replacing this element with one of its siblings.
  89             index = self.parent.contents.index(replaceWith)
  90             if index and index < myIndex:
  91                 # Furthermore, it comes before this element. That
  92                 # means that when we extract it, the index of this
  93                 # element will change.
  94                 myIndex = myIndex - 1
  95         self.extract()        
  96         oldParent.insert(myIndex, replaceWith)
  97         
  98     def extract(self):
  99         """Destructively rips this element out of the tree."""        
 100         if self.parent:
 101             try:
 102                 self.parent.contents.remove(self)
 103             except ValueError:
 104                 pass
 105 
 106         #Find the two elements that would be next to each other if
 107         #this element (and any children) hadn't been parsed. Connect
 108         #the two.        
 109         lastChild = self._lastRecursiveChild()
 110         nextElement = lastChild.next
 111 
 112         if self.previous:
 113             self.previous.next = nextElement
 114         if nextElement:
 115             nextElement.previous = self.previous
 116         self.previous = None
 117         lastChild.next = None
 118 
 119         self.parent = None        
 120         if self.previousSibling:
 121             self.previousSibling.nextSibling = self.nextSibling
 122         if self.nextSibling:
 123             self.nextSibling.previousSibling = self.previousSibling
 124         self.previousSibling = self.nextSibling = None       
 125 
 126     def _lastRecursiveChild(self):
 127         "Finds the last element beneath this object to be parsed."
 128         lastChild = self
 129         while hasattr(lastChild, 'contents') and lastChild.contents:
 130             lastChild = lastChild.contents[-1]
 131         return lastChild
 132 
 133     def insert(self, position, newChild):
 134         if (isinstance(newChild, basestring)
 135             or isinstance(newChild, unicode)) \
 136             and not isinstance(newChild, NavigableString):
 137             newChild = NavigableString(newChild)        
 138 
 139         position =  min(position, len(self.contents))
 140         if hasattr(newChild, 'parent') and newChild.parent != None:
 141             # We're 'inserting' an element that's already one
 142             # of this object's children. 
 143             if newChild.parent == self:
 144                 index = self.find(newChild)
 145                 if index and index < position:
 146                     # Furthermore we're moving it further down the
 147                     # list of this object's children. That means that
 148                     # when we extract this element, our target index
 149                     # will jump down one.
 150                     position = position - 1
 151             newChild.extract()
 152             
 153         newChild.parent = self
 154         previousChild = None
 155         if position == 0:
 156             newChild.previousSibling = None
 157             newChild.previous = self
 158         else:
 159             previousChild = self.contents[position-1]
 160             newChild.previousSibling = previousChild
 161             newChild.previousSibling.nextSibling = newChild
 162             newChild.previous = previousChild._lastRecursiveChild()
 163         if newChild.previous:
 164             newChild.previous.next = newChild        
 165 
 166         newChildsLastElement = newChild._lastRecursiveChild()
 167 
 168         if position >= len(self.contents):
 169             newChild.nextSibling = None
 170             
 171             parent = self
 172             parentsNextSibling = None
 173             while not parentsNextSibling:
 174                 parentsNextSibling = parent.nextSibling
 175                 parent = parent.parent
 176                 if not parent: # This is the last element in the document.
 177                     break
 178             if parentsNextSibling:
 179                 newChildsLastElement.next = parentsNextSibling
 180             else:
 181                 newChildsLastElement.next = None
 182         else:
 183             nextChild = self.contents[position]            
 184             newChild.nextSibling = nextChild            
 185             if newChild.nextSibling:
 186                 newChild.nextSibling.previousSibling = newChild
 187             newChildsLastElement.next = nextChild
 188 
 189         if newChildsLastElement.next:
 190             newChildsLastElement.next.previous = newChildsLastElement
 191         self.contents.insert(position, newChild)
 192 
 193     def findNext(self, name=None, attrs={}, text=None, **kwargs):
 194         """Returns the first item that matches the given criteria and
 195         appears after this Tag in the document."""
 196         return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
 197 
 198     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
 199                     **kwargs):
 200         """Returns all items that match the given criteria and appear
 201         before after Tag in the document."""
 202         return self._findAll(name, attrs, text, limit, self.nextGenerator)
 203 
 204     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
 205         """Returns the closest sibling to this Tag that matches the
 206         given criteria and appears after this Tag in the document."""
 207         return self._findOne(self.findNextSiblings, name, attrs, text,
 208                              **kwargs)
 209 
 210     def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
 211                          **kwargs):
 212         """Returns the siblings of this Tag that match the given
 213         criteria and appear after this Tag in the document."""
 214         return self._findAll(name, attrs, text, limit,
 215                              self.nextSiblingGenerator, **kwargs)
 216     fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
 217 
 218     def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
 219         """Returns the first item that matches the given criteria and
 220         appears before this Tag in the document."""
 221         return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
 222 
 223     def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
 224                         **kwargs):
 225         """Returns all items that match the given criteria and appear
 226         before this Tag in the document."""
 227         return self._findAll(name, attrs, text, limit, self.previousGenerator,
 228                            **kwargs)
 229     fetchPrevious = findAllPrevious # Compatibility with pre-3.x
 230 
 231     def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
 232         """Returns the closest sibling to this Tag that matches the
 233         given criteria and appears before this Tag in the document."""
 234         return self._findOne(self.findPreviousSiblings, name, attrs, text,
 235                              **kwargs)
 236 
 237     def findPreviousSiblings(self, name=None, attrs={}, text=None,
 238                              limit=None, **kwargs):
 239         """Returns the siblings of this Tag that match the given
 240         criteria and appear before this Tag in the document."""
 241         return self._findAll(name, attrs, text, limit,
 242                              self.previousSiblingGenerator, **kwargs)
 243     fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
 244 
 245     def findParent(self, name=None, attrs={}, **kwargs):
 246         """Returns the closest parent of this Tag that matches the given
 247         criteria."""
 248         # NOTE: We can't use _findOne because findParents takes a different
 249         # set of arguments.
 250         r = None
 251         l = self.findParents(name, attrs, 1)
 252         if l:
 253             r = l[0]
 254         return r
 255 
 256     def findParents(self, name=None, attrs={}, limit=None, **kwargs):
 257         """Returns the parents of this Tag that match the given
 258         criteria."""
 259 
 260         return self._findAll(name, attrs, None, limit, self.parentGenerator,
 261                              **kwargs)
 262     fetchParents = findParents # Compatibility with pre-3.x
 263 
 264     #These methods do the real heavy lifting.
 265 
 266     def _findOne(self, method, name, attrs, text, **kwargs):
 267         r = None
 268         l = method(name, attrs, text, 1, **kwargs)
 269         if l:
 270             r = l[0]
 271         return r
 272     
 273     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
 274         "Iterates over a generator looking for things that match."
 275 
 276         if isinstance(name, SoupStrainer):
 277             strainer = name
 278         else:
 279             # Build a SoupStrainer
 280             strainer = SoupStrainer(name, attrs, text, **kwargs)
 281         results = ResultSet(strainer)
 282         g = generator()
 283         while True:
 284             try:
 285                 i = g.next()
 286             except StopIteration:
 287                 break
 288             if i:
 289                 found = strainer.search(i)
 290                 if found:
 291                     results.append(found)
 292                     if limit and len(results) >= limit:
 293                         break
 294         return results
 295 
 296     #These Generators can be used to navigate starting from both
 297     #NavigableStrings and Tags.                
 298     def nextGenerator(self):
 299         i = self
 300         while i:
 301             i = i.next
 302             yield i
 303 
 304     def nextSiblingGenerator(self):
 305         i = self
 306         while i:
 307             i = i.nextSibling
 308             yield i
 309 
 310     def previousGenerator(self):
 311         i = self
 312         while i:
 313             i = i.previous
 314             yield i
 315 
 316     def previousSiblingGenerator(self):
 317         i = self
 318         while i:
 319             i = i.previousSibling
 320             yield i
 321 
 322     def parentGenerator(self):
 323         i = self
 324         while i:
 325             i = i.parent
 326             yield i
 327 
 328     # Utility methods
 329     def substituteEncoding(self, str, encoding=None):
 330         encoding = encoding or "utf-8"
 331         return str.replace("%SOUP-ENCODING%", encoding)    
 332 
 333     def toEncoding(self, s, encoding=None):
 334         """Encodes an object to a string in some encoding, or to Unicode.
 335         ."""
 336         if isinstance(s, unicode):
 337             if encoding:
 338                 s = s.encode(encoding)
 339         elif isinstance(s, str):
 340             if encoding:
 341                 s = s.encode(encoding)
 342             else:
 343                 s = unicode(s)
 344         else:
 345             if encoding:
 346                 s  = self.toEncoding(str(s), encoding)
 347             else:
 348                 s = unicode(s)
 349         return s
 350 
 351 class NavigableString(unicode, PageElement):
 352 
 353     def __getattr__(self, attr):
 354         """text.string gives you text. This is for backwards
 355         compatibility for Navigable*String, but for CData* it lets you
 356         get the string without the CData wrapper."""
 357         if attr == 'string':
 358             return self
 359         else:
 360             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 361 
 362     def __unicode__(self):
 363         return self.__str__(None)
 364 
 365     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 366         if encoding:
 367             return self.encode(encoding)
 368         else:
 369             return self
 370         
 371 class CData(NavigableString):
 372 
 373     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 374         return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
 375 
 376 class ProcessingInstruction(NavigableString):
 377     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 378         output = self
 379         if "%SOUP-ENCODING%" in output:
 380             output = self.substituteEncoding(output, encoding)
 381         return "<?%s?>" % self.toEncoding(output, encoding)
 382 
 383 class Comment(NavigableString):
 384     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 385         return "<!--%s-->" % NavigableString.__str__(self, encoding)    
 386 
 387 class Declaration(NavigableString):
 388     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 389         return "<!%s>" % NavigableString.__str__(self, encoding)        
 390 
 391 class Tag(PageElement):
 392 
 393     """Represents a found HTML tag with its attributes and contents."""
 394 
    # Maps a special character to the name of the entity used to escape
    # it; __str__ looks up '<', '>' and '&' here when rendering attribute
    # values.
    # NOTE(review): "squot" and "quote" are not the standard XML entity
    # names (apos / quot) -- confirm before relying on those two entries.
    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
                                      '"' : "quote",
                                      "&" : "amp",
                                      "<" : "lt",
                                      ">" : "gt" }
 400 
 401     def __init__(self, parser, name, attrs=None, parent=None,
 402                  previous=None):
 403         "Basic constructor."
 404 
 405         # We don't actually store the parser object: that lets extracted
 406         # chunks be garbage-collected
 407         self.parserClass = parser.__class__
 408         self.isSelfClosing = parser.isSelfClosingTag(name)
 409         self.name = name
 410         if attrs == None:
 411             attrs = []
 412         self.attrs = attrs
 413         self.contents = []
 414         self.setup(parent, previous)
 415         self.hidden = False
 416         self.containsSubstitutions = False
 417 
 418     def get(self, key, default=None):
 419         """Returns the value of the 'key' attribute for the tag, or
 420         the value given for 'default' if it doesn't have that
 421         attribute."""
 422         return self._getAttrMap().get(key, default)    
 423 
 424     def has_key(self, key):
 425         return self._getAttrMap().has_key(key)
 426 
 427     def __getitem__(self, key):
 428         """tag[key] returns the value of the 'key' attribute for the tag,
 429         and throws an exception if it's not there."""
 430         return self._getAttrMap()[key]
 431 
 432     def __iter__(self):
 433         "Iterating over a tag iterates over its contents."
 434         return iter(self.contents)
 435 
 436     def __len__(self):
 437         "The length of a tag is the length of its list of contents."
 438         return len(self.contents)
 439 
 440     def __contains__(self, x):
 441         return x in self.contents
 442 
    def __nonzero__(self):
        """A tag is non-None even if it has no contents.

        Defining this keeps truth tests from falling back on __len__,
        which would make a tag with empty contents evaluate as false."""
        return True
 446 
 447     def __setitem__(self, key, value):        
 448         """Setting tag[key] sets the value of the 'key' attribute for the
 449         tag."""
 450         self._getAttrMap()
 451         self.attrMap[key] = value
 452         found = False
 453         for i in range(0, len(self.attrs)):
 454             if self.attrs[i][0] == key:
 455                 self.attrs[i] = (key, value)
 456                 found = True
 457         if not found:
 458             self.attrs.append((key, value))
 459         self._getAttrMap()[key] = value
 460 
 461     def __delitem__(self, key):
 462         "Deleting tag[key] deletes all 'key' attributes for the tag."
 463         for item in self.attrs:
 464             if item[0] == key:
 465                 self.attrs.remove(item)
 466                 #We don't break because bad HTML can define the same
 467                 #attribute multiple times.
 468             self._getAttrMap()
 469             if self.attrMap.has_key(key):
 470                 del self.attrMap[key]
 471 
 472     def __call__(self, *args, **kwargs):
 473         """Calling a tag like a function is the same as calling its
 474         findAll() method. Eg. tag('a') returns a list of all the A tags
 475         found within this tag."""
 476         return apply(self.findAll, args, kwargs)
 477 
 478     def __getattr__(self, tag):
 479         #print "Getattr %s.%s" % (self.__class__, tag)
 480         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
 481             return self.find(tag[:-3])
 482         elif tag.find('__') != 0:
 483             return self.find(tag)
 484 
 485     def __eq__(self, other):
 486         """Returns true iff this tag has the same name, the same attributes,
 487         and the same contents (recursively) as the given tag.
 488 
 489         NOTE: right now this will return false if two tags have the
 490         same attributes in a different order. Should this be fixed?"""
 491         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
 492             return False
 493         for i in range(0, len(self.contents)):
 494             if self.contents[i] != other.contents[i]:
 495                 return False
 496         return True
 497 
 498     def __ne__(self, other):
 499         """Returns true iff this tag is not identical to the other tag,
 500         as defined in __eq__."""
 501         return not self == other
 502 
 503     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 504         """Renders this tag as a string."""
 505         return self.__str__(encoding)
 506 
 507     def __unicode__(self):
 508         return self.__str__(None)
 509 
 510     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
 511                 prettyPrint=False, indentLevel=0):
 512         """Returns a string or Unicode representation of this tag and
 513         its contents. To get Unicode, pass None for encoding.
 514 
 515         NOTE: since Python's HTML parser consumes whitespace, this
 516         method is not certain to reproduce the whitespace present in
 517         the original string."""
 518 
 519         encodedName = self.toEncoding(self.name, encoding)
 520 
 521         attrs = []
 522         if self.attrs:
 523             for key, val in self.attrs:
 524                 fmt = '%s="%s"'
 525                 if isString(val):                    
 526                     if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
 527                         val = self.substituteEncoding(val, encoding)
 528 
 529                     # The attribute value either:
 530                     #
 531                     # * Contains no embedded double quotes or single quotes.
 532                     #   No problem: we enclose it in double quotes.
 533                     # * Contains embedded single quotes. No problem:
 534                     #   double quotes work here too.
 535                     # * Contains embedded double quotes. No problem:
 536                     #   we enclose it in single quotes.
 537                     # * Embeds both single _and_ double quotes. This
 538                     #   can't happen naturally, but it can happen if
 539                     #   you modify an attribute value after parsing
 540                     #   the document. Now we have a bit of a
 541                     #   problem. We solve it by enclosing the
 542                     #   attribute in single quotes, and escaping any
 543                     #   embedded single quotes to XML entities.
 544                     if '"' in val:
 545                         fmt = "%s='%s'"
 546                         # This can't happen naturally, but it can happen
 547                         # if you modify an attribute value after parsing.
 548                         if "'" in val:
 549                             val = val.replace("'", "&squot;")
 550 
 551                     # Now we're okay w/r/t quotes. But the attribute
 552                     # value might also contain angle brackets, or
 553                     # ampersands that aren't part of entities. We need
 554                     # to escape those to XML entities too.
 555                     val = re.sub("([<>]|&(?![^\s]+;))",
 556                                  lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
 557                                  val)
 558                                       
 559                 attrs.append(fmt % (self.toEncoding(key, encoding),
 560                                     self.toEncoding(val, encoding)))
 561         close = ''
 562         closeTag = ''
 563         if self.isSelfClosing:
 564             close = ' /'
 565         else:
 566             closeTag = '</%s>' % encodedName
 567 
 568         indentTag, indentContents = 0, 0
 569         if prettyPrint:
 570             indentTag = indentLevel
 571             space = (' ' * (indentTag-1))
 572             indentContents = indentTag + 1
 573         contents = self.renderContents(encoding, prettyPrint, indentContents)
 574         if self.hidden:
 575             s = contents
 576         else:
 577             s = []
 578             attributeString = ''
 579             if attrs:
 580                 attributeString = ' ' + ' '.join(attrs)            
 581             if prettyPrint:
 582                 s.append(space)
 583             s.append('<%s%s%s>' % (encodedName, attributeString, close))
 584             if prettyPrint:
 585                 s.append("\n")
 586             s.append(contents)
 587             if prettyPrint and contents and contents[-1] != "\n":
 588                 s.append("\n")
 589             if prettyPrint and closeTag:
 590                 s.append(space)
 591             s.append(closeTag)
 592             if prettyPrint and closeTag and self.nextSibling:
 593                 s.append("\n")
 594             s = ''.join(s)
 595         return s
 596 
 597     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
 598         return self.__str__(encoding, True)
 599 
 600     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
 601                        prettyPrint=False, indentLevel=0):
 602         """Renders the contents of this tag as a string in the given
 603         encoding. If encoding is None, returns a Unicode string.."""
 604         s=[]
 605         for c in self:
 606             text = None
 607             if isinstance(c, NavigableString):
 608                 text = c.__str__(encoding)
 609             elif isinstance(c, Tag):
 610                 s.append(c.__str__(encoding, prettyPrint, indentLevel))
 611             if text and prettyPrint:
 612                 text = text.strip()              
 613             if text:
 614                 if prettyPrint:
 615                     s.append(" " * (indentLevel-1))
 616                 s.append(text)
 617                 if prettyPrint:
 618                     s.append("\n")
 619         return ''.join(s)    
 620 
 621     #Soup methods
 622 
 623     def find(self, name=None, attrs={}, recursive=True, text=None,
 624              **kwargs):
 625         """Return only the first child of this Tag matching the given
 626         criteria."""
 627         r = None
 628         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
 629         if l:
 630             r = l[0]
 631         return r
 632     findChild = find
 633 
 634     def findAll(self, name=None, attrs={}, recursive=True, text=None,
 635                 limit=None, **kwargs):
 636         """Extracts a list of Tag objects that match the given
 637         criteria.  You can specify the name of the Tag and any
 638         attributes you want the Tag to have.
 639 
 640         The value of a key-value pair in the 'attrs' map can be a
 641         string, a list of strings, a regular expression object, or a
 642         callable that takes a string and returns whether or not the
 643         string matches for some custom definition of 'matches'. The
 644         same is true of the tag name."""
 645         generator = self.recursiveChildGenerator
 646         if not recursive:
 647             generator = self.childGenerator
 648         return self._findAll(name, attrs, text, limit, generator, **kwargs)
 649     findChildren = findAll
 650 
 651     # Pre-3.x compatibility methods
 652     first = find
 653     fetch = findAll
 654     
 655     def fetchText(self, text=None, recursive=True, limit=None):
 656         return self.findAll(text=text, recursive=recursive, limit=limit)
 657 
 658     def firstText(self, text=None, recursive=True):
 659         return self.find(text=text, recursive=recursive)
 660     
 661     #Utility methods
 662 
    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        Only the contents list is touched: the parent, sibling and
        previous/next links of `tag` are not updated here."""
        self.contents.append(tag)
 666 
 667     #Private methods
 668 
 669     def _getAttrMap(self):
 670         """Initializes a map representation of this tag's attributes,
 671         if not already initialized."""
 672         if not getattr(self, 'attrMap'):
 673             self.attrMap = {}
 674             for (key, value) in self.attrs:
 675                 self.attrMap[key] = value 
 676         return self.attrMap
 677 
 678     #Generator methods
 679     def childGenerator(self):
 680         for i in range(0, len(self.contents)):
 681             yield self.contents[i]
 682         raise StopIteration
 683     
 684     def recursiveChildGenerator(self):
 685         stack = [(self, 0)]
 686         while stack:
 687             tag, start = stack.pop()
 688             if isinstance(tag, Tag):            
 689                 for i in range(start, len(tag.contents)):
 690                     a = tag.contents[i]
 691                     yield a
 692                     if isinstance(a, Tag) and tag.contents:
 693                         if i < len(tag.contents) - 1:
 694                             stack.append((tag, i+1))
 695                         stack.append((a, 0))
 696                         break
 697         raise StopIteration
 698 
 699 # Next, a couple classes to represent queries and their results.
 700 class SoupStrainer:
 701     """Encapsulates a number of ways of matching a markup element (tag or
 702     text)."""
 703 
 704     def __init__(self, name=None, attrs={}, text=None, **kwargs):
 705         self.name = name
 706         if isString(attrs):
 707             kwargs['class'] = attrs
 708             attrs = None
 709         if kwargs:
 710             if attrs:
 711                 attrs = attrs.copy()
 712                 attrs.update(kwargs)
 713             else:
 714                 attrs = kwargs
 715         self.attrs = attrs
 716         self.text = text
 717 
 718     def __str__(self):
 719         if self.text:
 720             return self.text
 721         else:
 722             return "%s|%s" % (self.name, self.attrs)
 723     
 724     def searchTag(self, markupName=None, markupAttrs={}):
 725         found = None
 726         markup = None
 727         if isinstance(markupName, Tag):
 728             markup = markupName
 729             markupAttrs = markup
 730         callFunctionWithTagData = callable(self.name) \
 731                                 and not isinstance(markupName, Tag)
 732 
 733         if (not self.name) \
 734                or callFunctionWithTagData \
 735                or (markup and self._matches(markup, self.name)) \
 736                or (not markup and self._matches(markupName, self.name)):
 737             if callFunctionWithTagData:
 738                 match = self.name(markupName, markupAttrs)
 739             else:
 740                 match = True            
 741                 markupAttrMap = None
 742                 for attr, matchAgainst in self.attrs.items():
 743                     if not markupAttrMap:
 744                          if hasattr(markupAttrs, 'get'):
 745                             markupAttrMap = markupAttrs
 746                          else:
 747                             markupAttrMap = {}
 748                             for k,v in markupAttrs:
 749                                 markupAttrMap[k] = v
 750                     attrValue = markupAttrMap.get(attr)
 751                     if not self._matches(attrValue, matchAgainst):
 752                         match = False
 753                         break
 754             if match:
 755                 if markup:
 756                     found = markup
 757                 else:
 758                     found = markupName
 759         return found
 760 
 761     def search(self, markup):
 762         #print 'looking for %s in %s' % (self, markup)
 763         found = None
 764         # If given a list of items, scan it for a text element that
 765         # matches.        
 766         if isList(markup) and not isinstance(markup, Tag):
 767             for element in markup:
 768                 if isinstance(element, NavigableString) \
 769                        and self.search(element):
 770                     found = element
 771                     break
 772         # If it's a Tag, make sure its name or attributes match.
 773         # Don't bother with Tags if we're searching for text.
 774         elif isinstance(markup, Tag):
 775             if not self.text:
 776                 found = self.searchTag(markup)
 777         # If it's text, make sure the text matches.
 778         elif isinstance(markup, NavigableString) or \
 779                  isString(markup):
 780             if self._matches(markup, self.text):
 781                 found = markup
 782         else:
 783             raise Exception, "I don't know how to match against a %s" \
 784                   % markup.__class__
 785         return found
 786         
 787     def _matches(self, markup, matchAgainst):    
 788         #print "Matching %s against %s" % (markup, matchAgainst)
 789         result = False
 790         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
 791             result = markup != None
 792         elif callable(matchAgainst):
 793             result = matchAgainst(markup)
 794         else:
 795             #Custom match methods take the tag as an argument, but all
 796             #other ways of matching match the tag name as a string.
 797             if isinstance(markup, Tag):
 798                 markup = markup.name
 799             if markup and not isString(markup):
 800                 markup = unicode(markup)
 801             #Now we know that chunk is either a string, or None.
 802             if hasattr(matchAgainst, 'match'):
 803                 # It's a regexp object.
 804                 result = markup and matchAgainst.search(markup)
 805             elif isList(matchAgainst):
 806                 result = markup in matchAgainst
 807             elif hasattr(matchAgainst, 'items'):
 808                 result = markup.has_key(matchAgainst)
 809             elif matchAgainst and isString(markup):
 810                 if isinstance(markup, unicode):
 811                     matchAgainst = unicode(matchAgainst)
 812                 else:
 813                     matchAgainst = str(matchAgainst)
 814 
 815             if not result:
 816                 result = matchAgainst == markup
 817         return result
 818 
 819 class ResultSet(list):
 820     """A ResultSet is just a list that keeps track of the SoupStrainer
 821     that created it."""
 822     def __init__(self, source):
 823         list.__init__([])
 824         self.source = source
 825 
 826 # Now, some helper functions.
 827 
 828 def isList(l):
 829     """Convenience method that works with all 2.x versions of Python
 830     to determine whether or not something is listlike."""
 831     return hasattr(l, '__iter__') \
 832            or (type(l) in (types.ListType, types.TupleType))
 833 
 834 def isString(s):
 835     """Convenience method that works with all 2.x versions of Python
 836     to determine whether or not something is stringlike."""
 837     try:
 838         return isinstance(s, unicode) or isintance(s, basestring) 
 839     except NameError:
 840         return isinstance(s, str)
 841 
 842 def buildTagMap(default, *args):
 843     """Turns a list of maps, lists, or scalars into a single map.
 844     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
 845     NESTING_RESET_TAGS maps out of lists and partial maps."""
 846     built = {}
 847     for portion in args:
 848         if hasattr(portion, 'items'):
 849             #It's a map. Merge it.
 850             for k,v in portion.items():
 851                 built[k] = v
 852         elif isList(portion):
 853             #It's a list. Map each item to the default.
 854             for k in portion:
 855                 built[k] = default
 856         else:
 857             #It's a scalar. Map it to the default.
 858             built[portion] = default
 859     return built
 860 
 861 # Now, the parser classes.
 862 
 863 class BeautifulStoneSoup(Tag, SGMLParser):
 864 
 865     """This class contains the basic parser and search code. It defines
 866     a parser that knows nothing about tag behavior except for the
 867     following:
 868    
 869       You can't close a tag without closing all the tags it encloses.
 870       That is, "<foo><bar></foo>" actually means
 871       "<foo><bar></bar></foo>".
 872 
 873     [Another possible explanation is "<foo><bar /></foo>", but since
 874     this class defines no SELF_CLOSING_TAGS, it will never use that
 875     explanation.]
 876 
 877     This class is useful for parsing XML or made-up markup languages,
 878     or when BeautifulSoup makes an assumption counter to what you were
 879     expecting."""
 880 
 881     XML_ENTITY_LIST = {}
 882     for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
 883         XML_ENTITY_LIST[i] = True 
 884 
 885     SELF_CLOSING_TAGS = {}
 886     NESTABLE_TAGS = {}
 887     RESET_NESTING_TAGS = {}
 888     QUOTE_TAGS = {}
 889 
 890     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
 891                        lambda x: x.group(1) + ' />'),
 892                       (re.compile('<!\s+([^<>]*)>'),
 893                        lambda x: '<!' + x.group(1) + '>')
 894                       ]
 895 
 896     ROOT_TAG_NAME = u'[document]'
 897 
 898     HTML_ENTITIES = "html"
 899     XML_ENTITIES = "xml"
 900 
 901     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
 902                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
 903                  convertEntities=None, selfClosingTags=None):
 904         """The Soup object is initialized as the 'root tag', and the
 905         provided markup (which can be a string or a file-like object)
 906         is fed into the underlying parser. 
 907 
 908         sgmllib will process most bad HTML, and the BeautifulSoup
 909         class has some tricks for dealing with some HTML that kills
 910         sgmllib, but Beautiful Soup can nonetheless choke or lose data
 911         if your data uses self-closing tags or declarations
 912         incorrectly.
 913 
 914         By default, Beautiful Soup uses regexes to sanitize input,
 915         avoiding the vast majority of these problems. If the problems
 916         don't apply to you, pass in False for markupMassage, and
 917         you'll get better performance.
 918 
 919         The default parser massage techniques fix the two most common
 920         instances of invalid HTML that choke sgmllib:
 921 
 922          <br/> (No space between name of closing tag and tag close)
 923          <! --Comment--> (Extraneous whitespace in declaration)
 924 
 925         You can pass in a custom list of (RE object, replace method)
 926         tuples to get Beautiful Soup to scrub your input the way you
 927         want."""
 928 
 929         self.parseOnlyThese = parseOnlyThese
 930         self.fromEncoding = fromEncoding
 931         self.smartQuotesTo = smartQuotesTo
 932         self.convertEntities = convertEntities
 933         if self.convertEntities:
 934             # It doesn't make sense to convert encoded characters to
 935             # entities even while you're converting entities to Unicode.
 936             # Just convert it all to Unicode.
 937             self.smartQuotesTo = None
 938         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
 939         SGMLParser.__init__(self)
 940             
 941         if hasattr(markup, 'read'):        # It's a file-type object.
 942             markup = markup.read()
 943         self.markup = markup
 944         self.markupMassage = markupMassage
 945         try:
 946             self._feed()
 947         except StopParsing:
 948             pass
 949         self.markup = None                 # The markup can now be GCed
 950         
 951     def _feed(self, inDocumentEncoding=None):
 952         # Convert the document to Unicode.
 953         markup = self.markup
 954         if isinstance(markup, unicode):
 955             if not hasattr(self, 'originalEncoding'):
 956                 self.originalEncoding = None
 957         else:
 958             dammit = UnicodeDammit\
 959                      (markup, [self.fromEncoding, inDocumentEncoding],
 960                       smartQuotesTo=self.smartQuotesTo)
 961             markup = dammit.unicode
 962             self.originalEncoding = dammit.originalEncoding
 963         if markup:
 964             if self.markupMassage:
 965                 if not isList(self.markupMassage):
 966                     self.markupMassage = self.MARKUP_MASSAGE            
 967                 for fix, m in self.markupMassage:
 968                     markup = fix.sub(m, markup)
 969         self.reset()
 970 
 971         SGMLParser.feed(self, markup)
 972         # Close out any unfinished strings and close all the open tags.
 973         self.endData()
 974         while self.currentTag.name != self.ROOT_TAG_NAME:
 975             self.popTag()
 976 
 977     def __getattr__(self, methodName):
 978         """This method routes method call requests to either the SGMLParser
 979         superclass or the Tag superclass, depending on the method name."""
 980         #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
 981 
 982         if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
 983                or methodName.find('do_') == 0:
 984             return SGMLParser.__getattr__(self, methodName)
 985         elif methodName.find('__') != 0:
 986             return Tag.__getattr__(self, methodName)
 987         else:
 988             raise AttributeError
 989 
 990     def isSelfClosingTag(self, name):
 991         """Returns true iff the given string is the name of a
 992         self-closing tag according to this parser."""
 993         return self.SELF_CLOSING_TAGS.has_key(name) \
 994                or self.instanceSelfClosingTags.has_key(name)
 995             
 996     def reset(self):
 997         Tag.__init__(self, self, self.ROOT_TAG_NAME)
 998         self.hidden = 1
 999         SGMLParser.reset(self)
1000         self.currentData = []
1001         self.currentTag = None
1002         self.tagStack = []
1003         self.quoteStack = []
1004         self.pushTag(self)
1005     
1006     def popTag(self):
1007         tag = self.tagStack.pop()
1008         # Tags with just one string-owning child get the child as a
1009         # 'string' property, so that soup.tag.string is shorthand for
1010         # soup.tag.contents[0]
1011         if len(self.currentTag.contents) == 1 and \
1012            isinstance(self.currentTag.contents[0], NavigableString):
1013             self.currentTag.string = self.currentTag.contents[0]
1014 
1015         #print "Pop", tag.name
1016         if self.tagStack:
1017             self.currentTag = self.tagStack[-1]
1018         return self.currentTag
1019 
1020     def pushTag(self, tag):
1021         #print "Push", tag.name
1022         if self.currentTag:
1023             self.currentTag.append(tag)
1024         self.tagStack.append(tag)
1025         self.currentTag = self.tagStack[-1]
1026 
1027     def endData(self, containerClass=NavigableString):
1028         if self.currentData:
1029             currentData = ''.join(self.currentData)
1030             if not currentData.strip():
1031                 if '\n' in currentData:
1032                     currentData = '\n'
1033                 else:
1034                     currentData = ' '
1035             self.currentData = []
1036             if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1037                    (not self.parseOnlyThese.text or \
1038                     not self.parseOnlyThese.search(currentData)):
1039                 return
1040             o = containerClass(currentData)
1041             o.setup(self.currentTag, self.previous)
1042             if self.previous:
1043                 self.previous.next = o
1044             self.previous = o
1045             self.currentTag.contents.append(o)
1046 
1047 
1048     def _popToTag(self, name, inclusivePop=True):
1049         """Pops the tag stack up to and including the most recent
1050         instance of the given tag. If inclusivePop is false, pops the tag
1051         stack up to but *not* including the most recent instqance of
1052         the given tag."""
1053         #print "Popping to %s" % name
1054         if name == self.ROOT_TAG_NAME:
1055             return            
1056 
1057         numPops = 0
1058         mostRecentTag = None
1059         for i in range(len(self.tagStack)-1, 0, -1):
1060             if name == self.tagStack[i].name:
1061                 numPops = len(self.tagStack)-i
1062                 break
1063         if not inclusivePop:
1064             numPops = numPops - 1
1065 
1066         for i in range(0, numPops):
1067             mostRecentTag = self.popTag()
1068         return mostRecentTag    
1069 
1070     def _smartPop(self, name):
1071 
1072         """We need to pop up to the previous tag of this type, unless
1073         one of this tag's nesting reset triggers comes between this
1074         tag and the previous tag of this type, OR unless this tag is a
1075         generic nesting trigger and another generic nesting trigger
1076         comes between this tag and the previous tag of this type.
1077 
1078         Examples:
1079          <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1080          <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1081          <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1082          <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1083 
1084          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1085          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1086          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1087         """
1088 
1089         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1090         isNestable = nestingResetTriggers != None
1091         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1092         popTo = None
1093         inclusive = True
1094         for i in range(len(self.tagStack)-1, 0, -1):
1095             p = self.tagStack[i]
1096             if (not p or p.name == name) and not isNestable:
1097                 #Non-nestable tags get popped to the top or to their
1098                 #last occurance.
1099                 popTo = name
1100                 break
1101             if (nestingResetTriggers != None
1102                 and p.name in nestingResetTriggers) \
1103                 or (nestingResetTriggers == None and isResetNesting
1104                     and self.RESET_NESTING_TAGS.has_key(p.name)):
1105                 
1106                 #If we encounter one of the nesting reset triggers
1107                 #peculiar to this tag, or we encounter another tag
1108                 #that causes nesting to reset, pop up to but not
1109                 #including that tag.
1110                 popTo = p.name
1111                 inclusive = False
1112                 break
1113             p = p.parent
1114         if popTo:
1115             self._popToTag(popTo, inclusive)
1116 
1117     def unknown_starttag(self, name, attrs, selfClosing=0):
1118         #print "Start tag %s: %s" % (name, attrs)
1119         if self.quoteStack:
1120             #This is not a real tag.
1121             #print "<%s> is not real!" % name
1122             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1123             self.handle_data('<%s%s>' % (name, attrs))
1124             return        
1125         self.endData()
1126 
1127         if not self.isSelfClosingTag(name) and not selfClosing:
1128             self._smartPop(name)
1129 
1130         if self.parseOnlyThese and len(self.tagStack) <= 1 \
1131                and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1132             return
1133 
1134         tag = Tag(self, name, attrs, self.currentTag, self.previous)
1135         if self.previous:
1136             self.previous.next = tag
1137         self.previous = tag
1138         self.pushTag(tag)
1139         if selfClosing or self.isSelfClosingTag(name):
1140             self.popTag()                
1141         if name in self.QUOTE_TAGS:
1142             #print "Beginning quote (%s)" % name
1143             self.quoteStack.append(name)
1144             self.literal = 1
1145         return tag
1146 
1147     def unknown_endtag(self, name):
1148         #print "End tag %s" % name
1149         if self.quoteStack and self.quoteStack[-1] != name:
1150             #This is not a real end tag.
1151             #print "</%s> is not real!" % name
1152             self.handle_data('</%s>' % name)
1153             return
1154         self.endData()
1155         self._popToTag(name)
1156         if self.quoteStack and self.quoteStack[-1] == name:
1157             self.quoteStack.pop()
1158             self.literal = (len(self.quoteStack) > 0)
1159 
1160     def handle_data(self, data):
1161         self.currentData.append(data)
1162 
1163     def _toStringSubclass(self, text, subclass):
1164         """Adds a certain piece of text to the tree as a NavigableString
1165         subclass."""
1166         self.endData()
1167         self.handle_data(text)
1168         self.endData(subclass)
1169 
1170     def handle_pi(self, text):
1171         """Handle a processing instruction as a ProcessingInstruction
1172         object, possibly one with a %SOUP-ENCODING% slot into which an
1173         encoding will be plugged later."""
1174         if text[:3] == "xml":
1175             text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
1176         self._toStringSubclass(text, ProcessingInstruction)
1177 
1178     def handle_comment(self, text):
1179         "Handle comments as Comment objects."
1180         self._toStringSubclass(text, Comment)
1181 
1182     def handle_charref(self, ref):
1183         "Handle character references as data."
1184         if self.convertEntities in [self.HTML_ENTITIES,
1185                                     self.XML_ENTITIES]:
1186             data = unichr(int(ref))
1187         else:
1188             data = '&#%s;' % ref
1189         self.handle_data(data)
1190 
1191     def handle_entityref(self, ref):
1192         """Handle entity references as data, possibly converting known
1193         HTML entity references to the corresponding Unicode
1194         characters."""
1195         data = None
1196         if self.convertEntities == self.HTML_ENTITIES or \
1197                (self.convertEntities == self.XML_ENTITIES and \
1198                 self.XML_ENTITY_LIST.get(ref)):
1199             try:
1200                 data = unichr(name2codepoint[ref])
1201             except KeyError:
1202                 pass
1203         if not data:
1204             data = '&%s;' % ref
1205         self.handle_data(data)
1206         
1207     def handle_decl(self, data):
1208         "Handle DOCTYPEs and the like as Declaration objects."
1209         self._toStringSubclass(data, Declaration)
1210 
1211     def parse_declaration(self, i):
1212         """Treat a bogus SGML declaration as raw data. Treat a CDATA
1213         declaration as a CData object."""
1214         j = None
1215         if self.rawdata[i:i+9] == '<![CDATA[':
1216              k = self.rawdata.find(']]>', i)
1217              if k == -1:
1218                  k = len(self.rawdata)
1219              data = self.rawdata[i+9:k]
1220              j = k+3
1221              self._toStringSubclass(data, CData)
1222         else:
1223             try:
1224                 j = SGMLParser.parse_declaration(self, i)
1225             except SGMLParseError:
1226                 toHandle = self.rawdata[i:]
1227                 self.handle_data(toHandle)
1228                 j = i + len(toHandle)
1229         return j
1230 
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    # Charset declared by a META tag; start_meta assigns it on the
    # instance before re-parsing. Defined on the class so start_meta
    # can test it before that without going through __getattr__.
    declaredHTMLEncoding = None

    def __init__(self, *args, **kwargs):
        """Takes the same arguments as BeautifulStoneSoup, except that
        smartQuotesTo defaults to HTML_ENTITIES."""
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning.

        attrs is the list of (key, value) pairs of the META tag. On
        the first pass, a newly discovered charset makes this method
        re-feed the document and raise StopParsing. Otherwise the
        charset in the tag is replaced by a %SOUP-ENCODING% slot and
        the tag is added to the tree."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if self.declaredHTMLEncoding or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    # The substitution must run on the content
                    # attribute itself, which is not necessarily the
                    # last attribute seen by the loop above.
                    newAttr = self.CHARSET_RE.sub(
                        lambda m: m.group(1) + "%SOUP-ENCODING%",
                        contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1377 
class StopParsing(Exception):
    """Raised to abandon the parse in progress. BeautifulSoup.start_meta
    raises it after re-feeding the document, and the
    BeautifulStoneSoup constructor catches it around _feed()."""
1380    
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags additionally treated as nestable by this class.
    # 'big' and 'strong' are listed twice; the duplicates collapse
    # into single keys when NESTABLE_TAGS is built below.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    # Block tags additionally treated as nestable by this class.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # BeautifulSoup's nestable tags plus the two lists above; each
    # added tag gets an empty list of nesting reset triggers.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1416 
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""
    
    # NOTE(review): buildTagMap's first parameter is the default
    # value, not a key, so this call passes no portions and yields an
    # empty map. If 'noscript' was meant to be a key, the call would
    # be buildTagMap(None, 'noscript') -- confirm the intent.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    # Empty: _smartPop finds no nesting reset triggers for any tag.
    NESTABLE_TAGS = {}
1429 
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Copies the string of a single-string tag into its parent
        as an attribute, then pops the tag as BeautifulStoneSoup
        does. Returns the tag that is current after the pop."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            # An attribute the parent already has is never overwritten.
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        # Pass the result on: _popToTag uses popTag's return value.
        return BeautifulStoneSoup.popTag(self)
1460 
# Enterprise class names! It has come to our attention that some people
# think the names of the Beautiful Soup parser classes are too silly
# and "unprofessional" for use in enterprise screen-scraping. We feel
# your pain! For such-minded folk, the Beautiful Soup Consortium And
# All-Night Kosher Bakery recommends renaming this file to
# "RobustParser.py" (or, in cases of extreme enterpriseness,
# "RobustParserBeanInterface.class") and using the following
# enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly name for BeautifulStoneSoup."""


class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly name for BeautifulSoup."""


class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly name for ICantBelieveItsBeautifulSoup."""


class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly name for MinimalSoup."""


class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly name for BeautifulSOAP."""
1479 
1480 ######################################################
1481 #
1482 # Bonus library: Unicode, Dammit
1483 #
1484 # This class forces XML data into a standard format (usually to UTF-8
1485 # or Unicode).  It is heavily based on code from Mark Pilgrim's
1486 # Universal Feed Parser. It does not rewrite the XML or HTML to
1487 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1488 # (XML) and BeautifulSoup.start_meta (HTML).
1489 
# Optional dependency: chardet autodetects character encodings.
# Download from http://chardet.feedparser.org/
# When it is not installed, the name is bound to None so that callers
# can simply test "if chardet:" before using it. (An unconditional
# "chardet = None" used to follow this block, which disabled
# autodetection even when the import succeeded.)
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    chardet = None

# Optional dependencies: cjkcodecs and iconv_codec make Python know
# about more character encodings. They are imported only for the side
# effect of registering codecs, so a missing package is not an error.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass
1511 
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    After construction:
      unicode          -- the converted document, or None if every
                          attempted encoding failed.
      originalEncoding -- the codec name that worked, or None.
      triedEncodings   -- codec names attempted, in order.
    """

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    # Matches a single byte in the 0x80-0x9f range, which is where
    # windows-1252 keeps its "smart" punctuation. Compiled once here
    # rather than on every conversion attempt.
    _SMART_QUOTE_RE = re.compile("([\x80-\x9f])")

    # Matches an XML declaration at the start of the document and
    # captures the value of its encoding pseudo-attribute.
    _XML_DECLARATION_RE = re.compile(
        r'''^<\?.*encoding=['"](.*?)['"].*\?>''')

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        """Convert markup to Unicode, trying encodings in priority order.

        markup            -- the document, as a byte string or unicode.
        overrideEncodings -- encodings to try before anything detected
                             from the document itself (only iterated,
                             never modified).
        smartQuotesTo     -- 'xml' to turn MS smart quotes into numeric
                             character references, any other true value
                             for named HTML entities, or a false value
                             to leave them alone.

        Order of attempts: overrideEncodings, the encoding declared in
        the document, the encoding sniffed from its first bytes,
        chardet's guess (when chardet is available), and finally utf-8
        and windows-1252.
        """
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = self._firstConversion(
            list(overrideEncodings) + [documentEncoding, sniffedEncoding])

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            u = self._firstConversion(("utf-8", "windows-1252"))
        self.unicode = u
        if not u: self.originalEncoding = None

    def _firstConversion(self, encodings):
        """Returns the result of the first encoding in `encodings` that
        _convertFrom accepts, or None if none of them works."""
        converted = None
        for proposedEncoding in encodings:
            converted = self._convertFrom(proposedEncoding)
            if converted: break
        return converted

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity.

        Entries of MS_CHARS that are plain strings are returned as-is;
        (entity name, hex code point) tuples are rendered as '&#xHEX;'
        when smartQuotesTo is 'xml' and as '&name;' otherwise."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Tries to decode self.markup using the proposed encoding.

        On success, stores the decoded text in self.markup, records the
        codec name in self.originalEncoding and returns the text.
        Returns None if the encoding is empty, was already tried, or
        fails to decode the markup."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
            markup = self._SMART_QUOTE_RE.sub(
                lambda match: self._subMSChar(match.group(1)), markup)

        try:
            u = self._toUnicode(markup, proposed)
        except Exception:
            # A failed decode just means "try the next candidate".
            return None
        self.markup = u
        self.originalEncoding = proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases

        A leading byte order mark takes precedence over `encoding` and
        is stripped before decoding.'''

        # strip Byte Order Mark (if present)
        # The UTF-16 checks exclude a following '\x00\x00' so that the
        # UTF-32LE mark (which begins with the UTF-16LE mark) falls
        # through to its own branch below.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return unicode(data, encoding)

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.

        Returns a (data, declared, sniffed) tuple: the document
        (re-encoded as UTF-8 when a UTF-16/UTF-32 signature or a UTF-8
        byte order mark was recognized, translated to ASCII when it
        looked like EBCDIC), the lowercased encoding named in the XML
        declaration or None, and the encoding implied by the first
        bytes or None. Detection is best-effort: any error while
        sniffing simply means no declaration is reported."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
            xml_encoding_match = self._XML_DECLARATION_RE.match(xml_data)
        except Exception:
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            # A declaration that names only an encoding family says
            # nothing about byte order, so prefer what was sniffed.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Maps a charset name to a codec name Python recognizes.

        Tries, in order: the CHARSET_ALIASES entry (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores. If none of them is a known
        codec, the charset is returned unchanged."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Returns charset if codecs.lookup knows it, None if it does
        not. A false charset is returned as-is."""
        if not charset: return charset
        try:
            codecs.lookup(charset)
        except LookupError:
            return None
        return charset

    # Translation table for _ebcdic_to_ascii; built on first use.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translates the byte string s through the EBCDIC table.

        The 256-entry translation table is built on first use and
        cached on the class."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Replacements for the bytes 0x80-0x9f. A tuple is
    # (HTML entity name, hex Unicode code point); a plain string is
    # substituted literally.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # The code point here used to be empty, which produced
                 # the invalid reference '&#x;' in 'xml' mode.
                 '\x9f' : ('Yuml', '178'),}
1759 
1760 #######################################################################
1761 
1762 
# When run as a script, act as an HTML pretty-printer: read a document
# from standard input and write the prettified parse to standard output.
if __name__ == '__main__':
    import sys
    markup = sys.stdin.read()
    soup = BeautifulSoup(markup)
    print(soup.prettify())

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2007-12-11 15:52:37, 67.9 KB) [[attachment:BeautifulSoup.py]]
  • [get | view] (2008-04-01 21:50:20, 3.2 KB) [[attachment:UbuntuEstadisticasMoinMoin.py]]
  • [get | view] (2008-03-28 03:17:50, 21.0 KB) [[attachment:ubuntu-co_stats.ods]]
  • [get | view] (2008-03-28 03:17:00, 2.1 KB) [[attachment:ubuntu-co_stats.py]]
 All files | Selected Files: delete move to page

You are not allowed to attach a file to this page.