ColombianTeamAnalysis
Attachment 'BeautifulSoup.py'
Download 1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
5
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
9
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it. The BeautifulSoup class
15
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
19
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
25
26 Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
34
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43 """
44 from __future__ import generators
45
46 __author__ = "Leonard Richardson (leonardr@segfault.org)"
47 __version__ = "3.0.4"
48 __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
49 __license__ = "PSF"
50
51 from sgmllib import SGMLParser, SGMLParseError
52 import codecs
53 import types
54 import re
55 import sgmllib
56 try:
57 from htmlentitydefs import name2codepoint
58 except ImportError:
59 name2codepoint = {}
60
61 #This hack makes Beautiful Soup able to parse XML with namespaces
62 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
63
64 DEFAULT_OUTPUT_ENCODING = "utf-8"
65
66 # First, the classes that represent markup elements.
67
68 class PageElement:
69 """Contains the navigational information for some part of the page
70 (either a tag or a piece of text)"""
71
72 def setup(self, parent=None, previous=None):
73 """Sets up the initial relations between this element and
74 other elements."""
75 self.parent = parent
76 self.previous = previous
77 self.next = None
78 self.previousSibling = None
79 self.nextSibling = None
80 if self.parent and self.parent.contents:
81 self.previousSibling = self.parent.contents[-1]
82 self.previousSibling.nextSibling = self
83
84 def replaceWith(self, replaceWith):
85 oldParent = self.parent
86 myIndex = self.parent.contents.index(self)
87 if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
88 # We're replacing this element with one of its siblings.
89 index = self.parent.contents.index(replaceWith)
90 if index and index < myIndex:
91 # Furthermore, it comes before this element. That
92 # means that when we extract it, the index of this
93 # element will change.
94 myIndex = myIndex - 1
95 self.extract()
96 oldParent.insert(myIndex, replaceWith)
97
98 def extract(self):
99 """Destructively rips this element out of the tree."""
100 if self.parent:
101 try:
102 self.parent.contents.remove(self)
103 except ValueError:
104 pass
105
106 #Find the two elements that would be next to each other if
107 #this element (and any children) hadn't been parsed. Connect
108 #the two.
109 lastChild = self._lastRecursiveChild()
110 nextElement = lastChild.next
111
112 if self.previous:
113 self.previous.next = nextElement
114 if nextElement:
115 nextElement.previous = self.previous
116 self.previous = None
117 lastChild.next = None
118
119 self.parent = None
120 if self.previousSibling:
121 self.previousSibling.nextSibling = self.nextSibling
122 if self.nextSibling:
123 self.nextSibling.previousSibling = self.previousSibling
124 self.previousSibling = self.nextSibling = None
125
126 def _lastRecursiveChild(self):
127 "Finds the last element beneath this object to be parsed."
128 lastChild = self
129 while hasattr(lastChild, 'contents') and lastChild.contents:
130 lastChild = lastChild.contents[-1]
131 return lastChild
132
133 def insert(self, position, newChild):
134 if (isinstance(newChild, basestring)
135 or isinstance(newChild, unicode)) \
136 and not isinstance(newChild, NavigableString):
137 newChild = NavigableString(newChild)
138
139 position = min(position, len(self.contents))
140 if hasattr(newChild, 'parent') and newChild.parent != None:
141 # We're 'inserting' an element that's already one
142 # of this object's children.
143 if newChild.parent == self:
144 index = self.find(newChild)
145 if index and index < position:
146 # Furthermore we're moving it further down the
147 # list of this object's children. That means that
148 # when we extract this element, our target index
149 # will jump down one.
150 position = position - 1
151 newChild.extract()
152
153 newChild.parent = self
154 previousChild = None
155 if position == 0:
156 newChild.previousSibling = None
157 newChild.previous = self
158 else:
159 previousChild = self.contents[position-1]
160 newChild.previousSibling = previousChild
161 newChild.previousSibling.nextSibling = newChild
162 newChild.previous = previousChild._lastRecursiveChild()
163 if newChild.previous:
164 newChild.previous.next = newChild
165
166 newChildsLastElement = newChild._lastRecursiveChild()
167
168 if position >= len(self.contents):
169 newChild.nextSibling = None
170
171 parent = self
172 parentsNextSibling = None
173 while not parentsNextSibling:
174 parentsNextSibling = parent.nextSibling
175 parent = parent.parent
176 if not parent: # This is the last element in the document.
177 break
178 if parentsNextSibling:
179 newChildsLastElement.next = parentsNextSibling
180 else:
181 newChildsLastElement.next = None
182 else:
183 nextChild = self.contents[position]
184 newChild.nextSibling = nextChild
185 if newChild.nextSibling:
186 newChild.nextSibling.previousSibling = newChild
187 newChildsLastElement.next = nextChild
188
189 if newChildsLastElement.next:
190 newChildsLastElement.next.previous = newChildsLastElement
191 self.contents.insert(position, newChild)
192
193 def findNext(self, name=None, attrs={}, text=None, **kwargs):
194 """Returns the first item that matches the given criteria and
195 appears after this Tag in the document."""
196 return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
197
198 def findAllNext(self, name=None, attrs={}, text=None, limit=None,
199 **kwargs):
200 """Returns all items that match the given criteria and appear
201 before after Tag in the document."""
202 return self._findAll(name, attrs, text, limit, self.nextGenerator)
203
204 def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
205 """Returns the closest sibling to this Tag that matches the
206 given criteria and appears after this Tag in the document."""
207 return self._findOne(self.findNextSiblings, name, attrs, text,
208 **kwargs)
209
210 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
211 **kwargs):
212 """Returns the siblings of this Tag that match the given
213 criteria and appear after this Tag in the document."""
214 return self._findAll(name, attrs, text, limit,
215 self.nextSiblingGenerator, **kwargs)
216 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
217
218 def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
219 """Returns the first item that matches the given criteria and
220 appears before this Tag in the document."""
221 return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
222
223 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
224 **kwargs):
225 """Returns all items that match the given criteria and appear
226 before this Tag in the document."""
227 return self._findAll(name, attrs, text, limit, self.previousGenerator,
228 **kwargs)
229 fetchPrevious = findAllPrevious # Compatibility with pre-3.x
230
231 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
232 """Returns the closest sibling to this Tag that matches the
233 given criteria and appears before this Tag in the document."""
234 return self._findOne(self.findPreviousSiblings, name, attrs, text,
235 **kwargs)
236
237 def findPreviousSiblings(self, name=None, attrs={}, text=None,
238 limit=None, **kwargs):
239 """Returns the siblings of this Tag that match the given
240 criteria and appear before this Tag in the document."""
241 return self._findAll(name, attrs, text, limit,
242 self.previousSiblingGenerator, **kwargs)
243 fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
244
245 def findParent(self, name=None, attrs={}, **kwargs):
246 """Returns the closest parent of this Tag that matches the given
247 criteria."""
248 # NOTE: We can't use _findOne because findParents takes a different
249 # set of arguments.
250 r = None
251 l = self.findParents(name, attrs, 1)
252 if l:
253 r = l[0]
254 return r
255
256 def findParents(self, name=None, attrs={}, limit=None, **kwargs):
257 """Returns the parents of this Tag that match the given
258 criteria."""
259
260 return self._findAll(name, attrs, None, limit, self.parentGenerator,
261 **kwargs)
262 fetchParents = findParents # Compatibility with pre-3.x
263
264 #These methods do the real heavy lifting.
265
266 def _findOne(self, method, name, attrs, text, **kwargs):
267 r = None
268 l = method(name, attrs, text, 1, **kwargs)
269 if l:
270 r = l[0]
271 return r
272
273 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
274 "Iterates over a generator looking for things that match."
275
276 if isinstance(name, SoupStrainer):
277 strainer = name
278 else:
279 # Build a SoupStrainer
280 strainer = SoupStrainer(name, attrs, text, **kwargs)
281 results = ResultSet(strainer)
282 g = generator()
283 while True:
284 try:
285 i = g.next()
286 except StopIteration:
287 break
288 if i:
289 found = strainer.search(i)
290 if found:
291 results.append(found)
292 if limit and len(results) >= limit:
293 break
294 return results
295
296 #These Generators can be used to navigate starting from both
297 #NavigableStrings and Tags.
298 def nextGenerator(self):
299 i = self
300 while i:
301 i = i.next
302 yield i
303
304 def nextSiblingGenerator(self):
305 i = self
306 while i:
307 i = i.nextSibling
308 yield i
309
310 def previousGenerator(self):
311 i = self
312 while i:
313 i = i.previous
314 yield i
315
316 def previousSiblingGenerator(self):
317 i = self
318 while i:
319 i = i.previousSibling
320 yield i
321
322 def parentGenerator(self):
323 i = self
324 while i:
325 i = i.parent
326 yield i
327
328 # Utility methods
329 def substituteEncoding(self, str, encoding=None):
330 encoding = encoding or "utf-8"
331 return str.replace("%SOUP-ENCODING%", encoding)
332
333 def toEncoding(self, s, encoding=None):
334 """Encodes an object to a string in some encoding, or to Unicode.
335 ."""
336 if isinstance(s, unicode):
337 if encoding:
338 s = s.encode(encoding)
339 elif isinstance(s, str):
340 if encoding:
341 s = s.encode(encoding)
342 else:
343 s = unicode(s)
344 else:
345 if encoding:
346 s = self.toEncoding(str(s), encoding)
347 else:
348 s = unicode(s)
349 return s
350
351 class NavigableString(unicode, PageElement):
352
353 def __getattr__(self, attr):
354 """text.string gives you text. This is for backwards
355 compatibility for Navigable*String, but for CData* it lets you
356 get the string without the CData wrapper."""
357 if attr == 'string':
358 return self
359 else:
360 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
361
362 def __unicode__(self):
363 return self.__str__(None)
364
365 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
366 if encoding:
367 return self.encode(encoding)
368 else:
369 return self
370
371 class CData(NavigableString):
372
373 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
374 return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
375
376 class ProcessingInstruction(NavigableString):
377 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
378 output = self
379 if "%SOUP-ENCODING%" in output:
380 output = self.substituteEncoding(output, encoding)
381 return "<?%s?>" % self.toEncoding(output, encoding)
382
383 class Comment(NavigableString):
384 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
385 return "<!--%s-->" % NavigableString.__str__(self, encoding)
386
387 class Declaration(NavigableString):
388 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
389 return "<!%s>" % NavigableString.__str__(self, encoding)
390
391 class Tag(PageElement):
392
393 """Represents a found HTML tag with its attributes and contents."""
394
395 XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
396 '"' : "quote",
397 "&" : "amp",
398 "<" : "lt",
399 ">" : "gt" }
400
401 def __init__(self, parser, name, attrs=None, parent=None,
402 previous=None):
403 "Basic constructor."
404
405 # We don't actually store the parser object: that lets extracted
406 # chunks be garbage-collected
407 self.parserClass = parser.__class__
408 self.isSelfClosing = parser.isSelfClosingTag(name)
409 self.name = name
410 if attrs == None:
411 attrs = []
412 self.attrs = attrs
413 self.contents = []
414 self.setup(parent, previous)
415 self.hidden = False
416 self.containsSubstitutions = False
417
418 def get(self, key, default=None):
419 """Returns the value of the 'key' attribute for the tag, or
420 the value given for 'default' if it doesn't have that
421 attribute."""
422 return self._getAttrMap().get(key, default)
423
424 def has_key(self, key):
425 return self._getAttrMap().has_key(key)
426
427 def __getitem__(self, key):
428 """tag[key] returns the value of the 'key' attribute for the tag,
429 and throws an exception if it's not there."""
430 return self._getAttrMap()[key]
431
432 def __iter__(self):
433 "Iterating over a tag iterates over its contents."
434 return iter(self.contents)
435
436 def __len__(self):
437 "The length of a tag is the length of its list of contents."
438 return len(self.contents)
439
440 def __contains__(self, x):
441 return x in self.contents
442
443 def __nonzero__(self):
444 "A tag is non-None even if it has no contents."
445 return True
446
447 def __setitem__(self, key, value):
448 """Setting tag[key] sets the value of the 'key' attribute for the
449 tag."""
450 self._getAttrMap()
451 self.attrMap[key] = value
452 found = False
453 for i in range(0, len(self.attrs)):
454 if self.attrs[i][0] == key:
455 self.attrs[i] = (key, value)
456 found = True
457 if not found:
458 self.attrs.append((key, value))
459 self._getAttrMap()[key] = value
460
461 def __delitem__(self, key):
462 "Deleting tag[key] deletes all 'key' attributes for the tag."
463 for item in self.attrs:
464 if item[0] == key:
465 self.attrs.remove(item)
466 #We don't break because bad HTML can define the same
467 #attribute multiple times.
468 self._getAttrMap()
469 if self.attrMap.has_key(key):
470 del self.attrMap[key]
471
472 def __call__(self, *args, **kwargs):
473 """Calling a tag like a function is the same as calling its
474 findAll() method. Eg. tag('a') returns a list of all the A tags
475 found within this tag."""
476 return apply(self.findAll, args, kwargs)
477
478 def __getattr__(self, tag):
479 #print "Getattr %s.%s" % (self.__class__, tag)
480 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
481 return self.find(tag[:-3])
482 elif tag.find('__') != 0:
483 return self.find(tag)
484
485 def __eq__(self, other):
486 """Returns true iff this tag has the same name, the same attributes,
487 and the same contents (recursively) as the given tag.
488
489 NOTE: right now this will return false if two tags have the
490 same attributes in a different order. Should this be fixed?"""
491 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
492 return False
493 for i in range(0, len(self.contents)):
494 if self.contents[i] != other.contents[i]:
495 return False
496 return True
497
498 def __ne__(self, other):
499 """Returns true iff this tag is not identical to the other tag,
500 as defined in __eq__."""
501 return not self == other
502
503 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
504 """Renders this tag as a string."""
505 return self.__str__(encoding)
506
507 def __unicode__(self):
508 return self.__str__(None)
509
510 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
511 prettyPrint=False, indentLevel=0):
512 """Returns a string or Unicode representation of this tag and
513 its contents. To get Unicode, pass None for encoding.
514
515 NOTE: since Python's HTML parser consumes whitespace, this
516 method is not certain to reproduce the whitespace present in
517 the original string."""
518
519 encodedName = self.toEncoding(self.name, encoding)
520
521 attrs = []
522 if self.attrs:
523 for key, val in self.attrs:
524 fmt = '%s="%s"'
525 if isString(val):
526 if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
527 val = self.substituteEncoding(val, encoding)
528
529 # The attribute value either:
530 #
531 # * Contains no embedded double quotes or single quotes.
532 # No problem: we enclose it in double quotes.
533 # * Contains embedded single quotes. No problem:
534 # double quotes work here too.
535 # * Contains embedded double quotes. No problem:
536 # we enclose it in single quotes.
537 # * Embeds both single _and_ double quotes. This
538 # can't happen naturally, but it can happen if
539 # you modify an attribute value after parsing
540 # the document. Now we have a bit of a
541 # problem. We solve it by enclosing the
542 # attribute in single quotes, and escaping any
543 # embedded single quotes to XML entities.
544 if '"' in val:
545 fmt = "%s='%s'"
546 # This can't happen naturally, but it can happen
547 # if you modify an attribute value after parsing.
548 if "'" in val:
549 val = val.replace("'", "&squot;")
550
551 # Now we're okay w/r/t quotes. But the attribute
552 # value might also contain angle brackets, or
553 # ampersands that aren't part of entities. We need
554 # to escape those to XML entities too.
555 val = re.sub("([<>]|&(?![^\s]+;))",
556 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
557 val)
558
559 attrs.append(fmt % (self.toEncoding(key, encoding),
560 self.toEncoding(val, encoding)))
561 close = ''
562 closeTag = ''
563 if self.isSelfClosing:
564 close = ' /'
565 else:
566 closeTag = '</%s>' % encodedName
567
568 indentTag, indentContents = 0, 0
569 if prettyPrint:
570 indentTag = indentLevel
571 space = (' ' * (indentTag-1))
572 indentContents = indentTag + 1
573 contents = self.renderContents(encoding, prettyPrint, indentContents)
574 if self.hidden:
575 s = contents
576 else:
577 s = []
578 attributeString = ''
579 if attrs:
580 attributeString = ' ' + ' '.join(attrs)
581 if prettyPrint:
582 s.append(space)
583 s.append('<%s%s%s>' % (encodedName, attributeString, close))
584 if prettyPrint:
585 s.append("\n")
586 s.append(contents)
587 if prettyPrint and contents and contents[-1] != "\n":
588 s.append("\n")
589 if prettyPrint and closeTag:
590 s.append(space)
591 s.append(closeTag)
592 if prettyPrint and closeTag and self.nextSibling:
593 s.append("\n")
594 s = ''.join(s)
595 return s
596
597 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
598 return self.__str__(encoding, True)
599
600 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
601 prettyPrint=False, indentLevel=0):
602 """Renders the contents of this tag as a string in the given
603 encoding. If encoding is None, returns a Unicode string.."""
604 s=[]
605 for c in self:
606 text = None
607 if isinstance(c, NavigableString):
608 text = c.__str__(encoding)
609 elif isinstance(c, Tag):
610 s.append(c.__str__(encoding, prettyPrint, indentLevel))
611 if text and prettyPrint:
612 text = text.strip()
613 if text:
614 if prettyPrint:
615 s.append(" " * (indentLevel-1))
616 s.append(text)
617 if prettyPrint:
618 s.append("\n")
619 return ''.join(s)
620
621 #Soup methods
622
623 def find(self, name=None, attrs={}, recursive=True, text=None,
624 **kwargs):
625 """Return only the first child of this Tag matching the given
626 criteria."""
627 r = None
628 l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
629 if l:
630 r = l[0]
631 return r
632 findChild = find
633
634 def findAll(self, name=None, attrs={}, recursive=True, text=None,
635 limit=None, **kwargs):
636 """Extracts a list of Tag objects that match the given
637 criteria. You can specify the name of the Tag and any
638 attributes you want the Tag to have.
639
640 The value of a key-value pair in the 'attrs' map can be a
641 string, a list of strings, a regular expression object, or a
642 callable that takes a string and returns whether or not the
643 string matches for some custom definition of 'matches'. The
644 same is true of the tag name."""
645 generator = self.recursiveChildGenerator
646 if not recursive:
647 generator = self.childGenerator
648 return self._findAll(name, attrs, text, limit, generator, **kwargs)
649 findChildren = findAll
650
651 # Pre-3.x compatibility methods
652 first = find
653 fetch = findAll
654
655 def fetchText(self, text=None, recursive=True, limit=None):
656 return self.findAll(text=text, recursive=recursive, limit=limit)
657
658 def firstText(self, text=None, recursive=True):
659 return self.find(text=text, recursive=recursive)
660
661 #Utility methods
662
663 def append(self, tag):
664 """Appends the given tag to the contents of this tag."""
665 self.contents.append(tag)
666
667 #Private methods
668
669 def _getAttrMap(self):
670 """Initializes a map representation of this tag's attributes,
671 if not already initialized."""
672 if not getattr(self, 'attrMap'):
673 self.attrMap = {}
674 for (key, value) in self.attrs:
675 self.attrMap[key] = value
676 return self.attrMap
677
678 #Generator methods
679 def childGenerator(self):
680 for i in range(0, len(self.contents)):
681 yield self.contents[i]
682 raise StopIteration
683
684 def recursiveChildGenerator(self):
685 stack = [(self, 0)]
686 while stack:
687 tag, start = stack.pop()
688 if isinstance(tag, Tag):
689 for i in range(start, len(tag.contents)):
690 a = tag.contents[i]
691 yield a
692 if isinstance(a, Tag) and tag.contents:
693 if i < len(tag.contents) - 1:
694 stack.append((tag, i+1))
695 stack.append((a, 0))
696 break
697 raise StopIteration
698
699 # Next, a couple classes to represent queries and their results.
700 class SoupStrainer:
701 """Encapsulates a number of ways of matching a markup element (tag or
702 text)."""
703
704 def __init__(self, name=None, attrs={}, text=None, **kwargs):
705 self.name = name
706 if isString(attrs):
707 kwargs['class'] = attrs
708 attrs = None
709 if kwargs:
710 if attrs:
711 attrs = attrs.copy()
712 attrs.update(kwargs)
713 else:
714 attrs = kwargs
715 self.attrs = attrs
716 self.text = text
717
718 def __str__(self):
719 if self.text:
720 return self.text
721 else:
722 return "%s|%s" % (self.name, self.attrs)
723
724 def searchTag(self, markupName=None, markupAttrs={}):
725 found = None
726 markup = None
727 if isinstance(markupName, Tag):
728 markup = markupName
729 markupAttrs = markup
730 callFunctionWithTagData = callable(self.name) \
731 and not isinstance(markupName, Tag)
732
733 if (not self.name) \
734 or callFunctionWithTagData \
735 or (markup and self._matches(markup, self.name)) \
736 or (not markup and self._matches(markupName, self.name)):
737 if callFunctionWithTagData:
738 match = self.name(markupName, markupAttrs)
739 else:
740 match = True
741 markupAttrMap = None
742 for attr, matchAgainst in self.attrs.items():
743 if not markupAttrMap:
744 if hasattr(markupAttrs, 'get'):
745 markupAttrMap = markupAttrs
746 else:
747 markupAttrMap = {}
748 for k,v in markupAttrs:
749 markupAttrMap[k] = v
750 attrValue = markupAttrMap.get(attr)
751 if not self._matches(attrValue, matchAgainst):
752 match = False
753 break
754 if match:
755 if markup:
756 found = markup
757 else:
758 found = markupName
759 return found
760
761 def search(self, markup):
762 #print 'looking for %s in %s' % (self, markup)
763 found = None
764 # If given a list of items, scan it for a text element that
765 # matches.
766 if isList(markup) and not isinstance(markup, Tag):
767 for element in markup:
768 if isinstance(element, NavigableString) \
769 and self.search(element):
770 found = element
771 break
772 # If it's a Tag, make sure its name or attributes match.
773 # Don't bother with Tags if we're searching for text.
774 elif isinstance(markup, Tag):
775 if not self.text:
776 found = self.searchTag(markup)
777 # If it's text, make sure the text matches.
778 elif isinstance(markup, NavigableString) or \
779 isString(markup):
780 if self._matches(markup, self.text):
781 found = markup
782 else:
783 raise Exception, "I don't know how to match against a %s" \
784 % markup.__class__
785 return found
786
787 def _matches(self, markup, matchAgainst):
788 #print "Matching %s against %s" % (markup, matchAgainst)
789 result = False
790 if matchAgainst == True and type(matchAgainst) == types.BooleanType:
791 result = markup != None
792 elif callable(matchAgainst):
793 result = matchAgainst(markup)
794 else:
795 #Custom match methods take the tag as an argument, but all
796 #other ways of matching match the tag name as a string.
797 if isinstance(markup, Tag):
798 markup = markup.name
799 if markup and not isString(markup):
800 markup = unicode(markup)
801 #Now we know that chunk is either a string, or None.
802 if hasattr(matchAgainst, 'match'):
803 # It's a regexp object.
804 result = markup and matchAgainst.search(markup)
805 elif isList(matchAgainst):
806 result = markup in matchAgainst
807 elif hasattr(matchAgainst, 'items'):
808 result = markup.has_key(matchAgainst)
809 elif matchAgainst and isString(markup):
810 if isinstance(markup, unicode):
811 matchAgainst = unicode(matchAgainst)
812 else:
813 matchAgainst = str(matchAgainst)
814
815 if not result:
816 result = matchAgainst == markup
817 return result
818
819 class ResultSet(list):
820 """A ResultSet is just a list that keeps track of the SoupStrainer
821 that created it."""
822 def __init__(self, source):
823 list.__init__([])
824 self.source = source
825
826 # Now, some helper functions.
827
828 def isList(l):
829 """Convenience method that works with all 2.x versions of Python
830 to determine whether or not something is listlike."""
831 return hasattr(l, '__iter__') \
832 or (type(l) in (types.ListType, types.TupleType))
833
834 def isString(s):
835 """Convenience method that works with all 2.x versions of Python
836 to determine whether or not something is stringlike."""
837 try:
838 return isinstance(s, unicode) or isintance(s, basestring)
839 except NameError:
840 return isinstance(s, str)
841
842 def buildTagMap(default, *args):
843 """Turns a list of maps, lists, or scalars into a single map.
844 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
845 NESTING_RESET_TAGS maps out of lists and partial maps."""
846 built = {}
847 for portion in args:
848 if hasattr(portion, 'items'):
849 #It's a map. Merge it.
850 for k,v in portion.items():
851 built[k] = v
852 elif isList(portion):
853 #It's a list. Map each item to the default.
854 for k in portion:
855 built[k] = default
856 else:
857 #It's a scalar. Map it to the default.
858 built[portion] = default
859 return built
860
861 # Now, the parser classes.
862
863 class BeautifulStoneSoup(Tag, SGMLParser):
864
865 """This class contains the basic parser and search code. It defines
866 a parser that knows nothing about tag behavior except for the
867 following:
868
869 You can't close a tag without closing all the tags it encloses.
870 That is, "<foo><bar></foo>" actually means
871 "<foo><bar></bar></foo>".
872
873 [Another possible explanation is "<foo><bar /></foo>", but since
874 this class defines no SELF_CLOSING_TAGS, it will never use that
875 explanation.]
876
877 This class is useful for parsing XML or made-up markup languages,
878 or when BeautifulSoup makes an assumption counter to what you were
879 expecting."""
880
881 XML_ENTITY_LIST = {}
882 for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
883 XML_ENTITY_LIST[i] = True
884
885 SELF_CLOSING_TAGS = {}
886 NESTABLE_TAGS = {}
887 RESET_NESTING_TAGS = {}
888 QUOTE_TAGS = {}
889
890 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
891 lambda x: x.group(1) + ' />'),
892 (re.compile('<!\s+([^<>]*)>'),
893 lambda x: '<!' + x.group(1) + '>')
894 ]
895
896 ROOT_TAG_NAME = u'[document]'
897
898 HTML_ENTITIES = "html"
899 XML_ENTITIES = "xml"
900
901 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
902 markupMassage=True, smartQuotesTo=XML_ENTITIES,
903 convertEntities=None, selfClosingTags=None):
904 """The Soup object is initialized as the 'root tag', and the
905 provided markup (which can be a string or a file-like object)
906 is fed into the underlying parser.
907
908 sgmllib will process most bad HTML, and the BeautifulSoup
909 class has some tricks for dealing with some HTML that kills
910 sgmllib, but Beautiful Soup can nonetheless choke or lose data
911 if your data uses self-closing tags or declarations
912 incorrectly.
913
914 By default, Beautiful Soup uses regexes to sanitize input,
915 avoiding the vast majority of these problems. If the problems
916 don't apply to you, pass in False for markupMassage, and
917 you'll get better performance.
918
919 The default parser massage techniques fix the two most common
920 instances of invalid HTML that choke sgmllib:
921
922 <br/> (No space between name of closing tag and tag close)
923 <! --Comment--> (Extraneous whitespace in declaration)
924
925 You can pass in a custom list of (RE object, replace method)
926 tuples to get Beautiful Soup to scrub your input the way you
927 want."""
928
929 self.parseOnlyThese = parseOnlyThese
930 self.fromEncoding = fromEncoding
931 self.smartQuotesTo = smartQuotesTo
932 self.convertEntities = convertEntities
933 if self.convertEntities:
934 # It doesn't make sense to convert encoded characters to
935 # entities even while you're converting entities to Unicode.
936 # Just convert it all to Unicode.
937 self.smartQuotesTo = None
938 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
939 SGMLParser.__init__(self)
940
941 if hasattr(markup, 'read'): # It's a file-type object.
942 markup = markup.read()
943 self.markup = markup
944 self.markupMassage = markupMassage
945 try:
946 self._feed()
947 except StopParsing:
948 pass
949 self.markup = None # The markup can now be GCed
950
951 def _feed(self, inDocumentEncoding=None):
952 # Convert the document to Unicode.
953 markup = self.markup
954 if isinstance(markup, unicode):
955 if not hasattr(self, 'originalEncoding'):
956 self.originalEncoding = None
957 else:
958 dammit = UnicodeDammit\
959 (markup, [self.fromEncoding, inDocumentEncoding],
960 smartQuotesTo=self.smartQuotesTo)
961 markup = dammit.unicode
962 self.originalEncoding = dammit.originalEncoding
963 if markup:
964 if self.markupMassage:
965 if not isList(self.markupMassage):
966 self.markupMassage = self.MARKUP_MASSAGE
967 for fix, m in self.markupMassage:
968 markup = fix.sub(m, markup)
969 self.reset()
970
971 SGMLParser.feed(self, markup)
972 # Close out any unfinished strings and close all the open tags.
973 self.endData()
974 while self.currentTag.name != self.ROOT_TAG_NAME:
975 self.popTag()
976
977 def __getattr__(self, methodName):
978 """This method routes method call requests to either the SGMLParser
979 superclass or the Tag superclass, depending on the method name."""
980 #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
981
982 if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
983 or methodName.find('do_') == 0:
984 return SGMLParser.__getattr__(self, methodName)
985 elif methodName.find('__') != 0:
986 return Tag.__getattr__(self, methodName)
987 else:
988 raise AttributeError
989
990 def isSelfClosingTag(self, name):
991 """Returns true iff the given string is the name of a
992 self-closing tag according to this parser."""
993 return self.SELF_CLOSING_TAGS.has_key(name) \
994 or self.instanceSelfClosingTags.has_key(name)
995
996 def reset(self):
997 Tag.__init__(self, self, self.ROOT_TAG_NAME)
998 self.hidden = 1
999 SGMLParser.reset(self)
1000 self.currentData = []
1001 self.currentTag = None
1002 self.tagStack = []
1003 self.quoteStack = []
1004 self.pushTag(self)
1005
1006 def popTag(self):
1007 tag = self.tagStack.pop()
1008 # Tags with just one string-owning child get the child as a
1009 # 'string' property, so that soup.tag.string is shorthand for
1010 # soup.tag.contents[0]
1011 if len(self.currentTag.contents) == 1 and \
1012 isinstance(self.currentTag.contents[0], NavigableString):
1013 self.currentTag.string = self.currentTag.contents[0]
1014
1015 #print "Pop", tag.name
1016 if self.tagStack:
1017 self.currentTag = self.tagStack[-1]
1018 return self.currentTag
1019
1020 def pushTag(self, tag):
1021 #print "Push", tag.name
1022 if self.currentTag:
1023 self.currentTag.append(tag)
1024 self.tagStack.append(tag)
1025 self.currentTag = self.tagStack[-1]
1026
1027 def endData(self, containerClass=NavigableString):
1028 if self.currentData:
1029 currentData = ''.join(self.currentData)
1030 if not currentData.strip():
1031 if '\n' in currentData:
1032 currentData = '\n'
1033 else:
1034 currentData = ' '
1035 self.currentData = []
1036 if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1037 (not self.parseOnlyThese.text or \
1038 not self.parseOnlyThese.search(currentData)):
1039 return
1040 o = containerClass(currentData)
1041 o.setup(self.currentTag, self.previous)
1042 if self.previous:
1043 self.previous.next = o
1044 self.previous = o
1045 self.currentTag.contents.append(o)
1046
1047
1048 def _popToTag(self, name, inclusivePop=True):
1049 """Pops the tag stack up to and including the most recent
1050 instance of the given tag. If inclusivePop is false, pops the tag
1051 stack up to but *not* including the most recent instqance of
1052 the given tag."""
1053 #print "Popping to %s" % name
1054 if name == self.ROOT_TAG_NAME:
1055 return
1056
1057 numPops = 0
1058 mostRecentTag = None
1059 for i in range(len(self.tagStack)-1, 0, -1):
1060 if name == self.tagStack[i].name:
1061 numPops = len(self.tagStack)-i
1062 break
1063 if not inclusivePop:
1064 numPops = numPops - 1
1065
1066 for i in range(0, numPops):
1067 mostRecentTag = self.popTag()
1068 return mostRecentTag
1069
1070 def _smartPop(self, name):
1071
1072 """We need to pop up to the previous tag of this type, unless
1073 one of this tag's nesting reset triggers comes between this
1074 tag and the previous tag of this type, OR unless this tag is a
1075 generic nesting trigger and another generic nesting trigger
1076 comes between this tag and the previous tag of this type.
1077
1078 Examples:
1079 <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1080 <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1081 <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1082 <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1083
1084 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1085 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1086 <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1087 """
1088
1089 nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1090 isNestable = nestingResetTriggers != None
1091 isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1092 popTo = None
1093 inclusive = True
1094 for i in range(len(self.tagStack)-1, 0, -1):
1095 p = self.tagStack[i]
1096 if (not p or p.name == name) and not isNestable:
1097 #Non-nestable tags get popped to the top or to their
1098 #last occurance.
1099 popTo = name
1100 break
1101 if (nestingResetTriggers != None
1102 and p.name in nestingResetTriggers) \
1103 or (nestingResetTriggers == None and isResetNesting
1104 and self.RESET_NESTING_TAGS.has_key(p.name)):
1105
1106 #If we encounter one of the nesting reset triggers
1107 #peculiar to this tag, or we encounter another tag
1108 #that causes nesting to reset, pop up to but not
1109 #including that tag.
1110 popTo = p.name
1111 inclusive = False
1112 break
1113 p = p.parent
1114 if popTo:
1115 self._popToTag(popTo, inclusive)
1116
1117 def unknown_starttag(self, name, attrs, selfClosing=0):
1118 #print "Start tag %s: %s" % (name, attrs)
1119 if self.quoteStack:
1120 #This is not a real tag.
1121 #print "<%s> is not real!" % name
1122 attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1123 self.handle_data('<%s%s>' % (name, attrs))
1124 return
1125 self.endData()
1126
1127 if not self.isSelfClosingTag(name) and not selfClosing:
1128 self._smartPop(name)
1129
1130 if self.parseOnlyThese and len(self.tagStack) <= 1 \
1131 and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1132 return
1133
1134 tag = Tag(self, name, attrs, self.currentTag, self.previous)
1135 if self.previous:
1136 self.previous.next = tag
1137 self.previous = tag
1138 self.pushTag(tag)
1139 if selfClosing or self.isSelfClosingTag(name):
1140 self.popTag()
1141 if name in self.QUOTE_TAGS:
1142 #print "Beginning quote (%s)" % name
1143 self.quoteStack.append(name)
1144 self.literal = 1
1145 return tag
1146
1147 def unknown_endtag(self, name):
1148 #print "End tag %s" % name
1149 if self.quoteStack and self.quoteStack[-1] != name:
1150 #This is not a real end tag.
1151 #print "</%s> is not real!" % name
1152 self.handle_data('</%s>' % name)
1153 return
1154 self.endData()
1155 self._popToTag(name)
1156 if self.quoteStack and self.quoteStack[-1] == name:
1157 self.quoteStack.pop()
1158 self.literal = (len(self.quoteStack) > 0)
1159
1160 def handle_data(self, data):
1161 self.currentData.append(data)
1162
1163 def _toStringSubclass(self, text, subclass):
1164 """Adds a certain piece of text to the tree as a NavigableString
1165 subclass."""
1166 self.endData()
1167 self.handle_data(text)
1168 self.endData(subclass)
1169
1170 def handle_pi(self, text):
1171 """Handle a processing instruction as a ProcessingInstruction
1172 object, possibly one with a %SOUP-ENCODING% slot into which an
1173 encoding will be plugged later."""
1174 if text[:3] == "xml":
1175 text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
1176 self._toStringSubclass(text, ProcessingInstruction)
1177
1178 def handle_comment(self, text):
1179 "Handle comments as Comment objects."
1180 self._toStringSubclass(text, Comment)
1181
1182 def handle_charref(self, ref):
1183 "Handle character references as data."
1184 if self.convertEntities in [self.HTML_ENTITIES,
1185 self.XML_ENTITIES]:
1186 data = unichr(int(ref))
1187 else:
1188 data = '&#%s;' % ref
1189 self.handle_data(data)
1190
1191 def handle_entityref(self, ref):
1192 """Handle entity references as data, possibly converting known
1193 HTML entity references to the corresponding Unicode
1194 characters."""
1195 data = None
1196 if self.convertEntities == self.HTML_ENTITIES or \
1197 (self.convertEntities == self.XML_ENTITIES and \
1198 self.XML_ENTITY_LIST.get(ref)):
1199 try:
1200 data = unichr(name2codepoint[ref])
1201 except KeyError:
1202 pass
1203 if not data:
1204 data = '&%s;' % ref
1205 self.handle_data(data)
1206
1207 def handle_decl(self, data):
1208 "Handle DOCTYPEs and the like as Declaration objects."
1209 self._toStringSubclass(data, Declaration)
1210
1211 def parse_declaration(self, i):
1212 """Treat a bogus SGML declaration as raw data. Treat a CDATA
1213 declaration as a CData object."""
1214 j = None
1215 if self.rawdata[i:i+9] == '<![CDATA[':
1216 k = self.rawdata.find(']]>', i)
1217 if k == -1:
1218 k = len(self.rawdata)
1219 data = self.rawdata[i+9:k]
1220 j = k+3
1221 self._toStringSubclass(data, CData)
1222 else:
1223 try:
1224 j = SGMLParser.parse_declaration(self, i)
1225 except SGMLParseError:
1226 toHandle = self.rawdata[i:]
1227 self.handle_data(toHandle)
1228 j = i + len(toHandle)
1229 return j
1230
1231 class BeautifulSoup(BeautifulStoneSoup):
1232
1233 """This parser knows the following facts about HTML:
1234
1235 * Some tags have no closing tag and should be interpreted as being
1236 closed as soon as they are encountered.
1237
1238 * The text inside some tags (ie. 'script') may contain tags which
1239 are not really part of the document and which should be parsed
1240 as text, not tags. If you want to parse the text as tags, you can
1241 always fetch it and parse it explicitly.
1242
1243 * Tag nesting rules:
1244
1245 Most tags can't be nested at all. For instance, the occurance of
1246 a <p> tag should implicitly close the previous <p> tag.
1247
1248 <p>Para1<p>Para2
1249 should be transformed into:
1250 <p>Para1</p><p>Para2
1251
1252 Some tags can be nested arbitrarily. For instance, the occurance
1253 of a <blockquote> tag should _not_ implicitly close the previous
1254 <blockquote> tag.
1255
1256 Alice said: <blockquote>Bob said: <blockquote>Blah
1257 should NOT be transformed into:
1258 Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1259
1260 Some tags can be nested, but the nesting is reset by the
1261 interposition of other tags. For instance, a <tr> tag should
1262 implicitly close the previous <tr> tag within the same <table>,
1263 but not close a <tr> tag in another table.
1264
1265 <table><tr>Blah<tr>Blah
1266 should be transformed into:
1267 <table><tr>Blah</tr><tr>Blah
1268 but,
1269 <tr>Blah<table><tr>Blah
1270 should NOT be transformed into
1271 <tr>Blah<table></tr><tr>Blah
1272
1273 Differing assumptions about tag nesting rules are a major source
1274 of problems with the BeautifulSoup class. If BeautifulSoup is not
1275 treating as nestable a tag your page author treats as nestable,
1276 try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1277 BeautifulStoneSoup before writing your own subclass."""
1278
1279 def __init__(self, *args, **kwargs):
1280 if not kwargs.has_key('smartQuotesTo'):
1281 kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1282 BeautifulStoneSoup.__init__(self, *args, **kwargs)
1283
1284 SELF_CLOSING_TAGS = buildTagMap(None,
1285 ['br' , 'hr', 'input', 'img', 'meta',
1286 'spacer', 'link', 'frame', 'base'])
1287
1288 QUOTE_TAGS = {'script': None}
1289
1290 #According to the HTML standard, each of these inline tags can
1291 #contain another tag of the same type. Furthermore, it's common
1292 #to actually use these tags this way.
1293 NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1294 'center']
1295
1296 #According to the HTML standard, these block tags can contain
1297 #another tag of the same type. Furthermore, it's common
1298 #to actually use these tags this way.
1299 NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1300
1301 #Lists can contain other lists, but there are restrictions.
1302 NESTABLE_LIST_TAGS = { 'ol' : [],
1303 'ul' : [],
1304 'li' : ['ul', 'ol'],
1305 'dl' : [],
1306 'dd' : ['dl'],
1307 'dt' : ['dl'] }
1308
1309 #Tables can contain other tables, but there are restrictions.
1310 NESTABLE_TABLE_TAGS = {'table' : [],
1311 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1312 'td' : ['tr'],
1313 'th' : ['tr'],
1314 'thead' : ['table'],
1315 'tbody' : ['table'],
1316 'tfoot' : ['table'],
1317 }
1318
1319 NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1320
1321 #If one of these tags is encountered, all tags up to the next tag of
1322 #this type are popped.
1323 RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1324 NON_NESTABLE_BLOCK_TAGS,
1325 NESTABLE_LIST_TAGS,
1326 NESTABLE_TABLE_TAGS)
1327
1328 NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1329 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1330
1331 # Used to detect the charset in a META tag; see start_meta
1332 CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1333
1334 def start_meta(self, attrs):
1335 """Beautiful Soup can detect a charset included in a META tag,
1336 try to convert the document to that charset, and re-parse the
1337 document from the beginning."""
1338 httpEquiv = None
1339 contentType = None
1340 contentTypeIndex = None
1341 tagNeedsEncodingSubstitution = False
1342
1343 for i in range(0, len(attrs)):
1344 key, value = attrs[i]
1345 key = key.lower()
1346 if key == 'http-equiv':
1347 httpEquiv = value
1348 elif key == 'content':
1349 contentType = value
1350 contentTypeIndex = i
1351
1352 if httpEquiv and contentType: # It's an interesting meta tag.
1353 match = self.CHARSET_RE.search(contentType)
1354 if match:
1355 if getattr(self, 'declaredHTMLEncoding') or \
1356 (self.originalEncoding == self.fromEncoding):
1357 # This is our second pass through the document, or
1358 # else an encoding was specified explicitly and it
1359 # worked. Rewrite the meta tag.
1360 newAttr = self.CHARSET_RE.sub\
1361 (lambda(match):match.group(1) +
1362 "%SOUP-ENCODING%", value)
1363 attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1364 newAttr)
1365 tagNeedsEncodingSubstitution = True
1366 else:
1367 # This is our first pass through the document.
1368 # Go through it again with the new information.
1369 newCharset = match.group(3)
1370 if newCharset and newCharset != self.originalEncoding:
1371 self.declaredHTMLEncoding = newCharset
1372 self._feed(self.declaredHTMLEncoding)
1373 raise StopParsing
1374 tag = self.unknown_starttag("meta", attrs)
1375 if tag and tagNeedsEncodingSubstitution:
1376 tag.containsSubstitutions = True
1377
1378 class StopParsing(Exception):
1379 pass
1380
1381 class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1382
1383 """The BeautifulSoup class is oriented towards skipping over
1384 common HTML errors like unclosed tags. However, sometimes it makes
1385 errors of its own. For instance, consider this fragment:
1386
1387 <b>Foo<b>Bar</b></b>
1388
1389 This is perfectly valid (if bizarre) HTML. However, the
1390 BeautifulSoup class will implicitly close the first b tag when it
1391 encounters the second 'b'. It will think the author wrote
1392 "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1393 there's no real-world reason to bold something that's already
1394 bold. When it encounters '</b></b>' it will close two more 'b'
1395 tags, for a grand total of three tags closed instead of two. This
1396 can throw off the rest of your document structure. The same is
1397 true of a number of other tags, listed below.
1398
1399 It's much more common for someone to forget to close a 'b' tag
1400 than to actually use nested 'b' tags, and the BeautifulSoup class
1401 handles the common case. This class handles the not-co-common
1402 case: where you can't believe someone wrote what they did, but
1403 it's valid HTML and BeautifulSoup screwed up by assuming it
1404 wouldn't be."""
1405
1406 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1407 ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1408 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1409 'big']
1410
1411 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1412
1413 NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1414 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1415 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1416
1417 class MinimalSoup(BeautifulSoup):
1418 """The MinimalSoup class is for parsing HTML that contains
1419 pathologically bad markup. It makes no assumptions about tag
1420 nesting, but it does know which tags are self-closing, that
1421 <script> tags contain Javascript and should not be parsed, that
1422 META tags may contain encoding information, and so on.
1423
1424 This also makes it better for subclassing than BeautifulStoneSoup
1425 or BeautifulSoup."""
1426
1427 RESET_NESTING_TAGS = buildTagMap('noscript')
1428 NESTABLE_TAGS = {}
1429
1430 class BeautifulSOAP(BeautifulStoneSoup):
1431 """This class will push a tag with only a single string child into
1432 the tag's parent as an attribute. The attribute's name is the tag
1433 name, and the value is the string child. An example should give
1434 the flavor of the change:
1435
1436 <foo><bar>baz</bar></foo>
1437 =>
1438 <foo bar="baz"><bar>baz</bar></foo>
1439
1440 You can then access fooTag['bar'] instead of fooTag.barTag.string.
1441
1442 This is, of course, useful for scraping structures that tend to
1443 use subelements instead of attributes, such as SOAP messages. Note
1444 that it modifies its input, so don't print the modified version
1445 out.
1446
1447 I'm not sure how many people really want to use this class; let me
1448 know if you do. Mainly I like the name."""
1449
1450 def popTag(self):
1451 if len(self.tagStack) > 1:
1452 tag = self.tagStack[-1]
1453 parent = self.tagStack[-2]
1454 parent._getAttrMap()
1455 if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1456 isinstance(tag.contents[0], NavigableString) and
1457 not parent.attrMap.has_key(tag.name)):
1458 parent[tag.name] = tag.contents[0]
1459 BeautifulStoneSoup.popTag(self)
1460
1461 #Enterprise class names! It has come to our attention that some people
1462 #think the names of the Beautiful Soup parser classes are too silly
1463 #and "unprofessional" for use in enterprise screen-scraping. We feel
1464 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1465 #All-Night Kosher Bakery recommends renaming this file to
1466 #"RobustParser.py" (or, in cases of extreme enterprisness,
1467 #"RobustParserBeanInterface.class") and using the following
1468 #enterprise-friendly class aliases:
1469 class RobustXMLParser(BeautifulStoneSoup):
1470 pass
1471 class RobustHTMLParser(BeautifulSoup):
1472 pass
1473 class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1474 pass
1475 class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1476 pass
1477 class SimplifyingSOAPParser(BeautifulSOAP):
1478 pass
1479
1480 ######################################################
1481 #
1482 # Bonus library: Unicode, Dammit
1483 #
1484 # This class forces XML data into a standard format (usually to UTF-8
1485 # or Unicode). It is heavily based on code from Mark Pilgrim's
1486 # Universal Feed Parser. It does not rewrite the XML or HTML to
1487 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1488 # (XML) and BeautifulSoup.start_meta (HTML).
1489
1490 # Autodetects character encodings.
1491 # Download from http://chardet.feedparser.org/
1492 try:
1493 import chardet
1494 # import chardet.constants
1495 # chardet.constants._debug = 1
1496 except:
1497 chardet = None
1498 chardet = None
1499
1500 # cjkcodecs and iconv_codec make Python know about more character encodings.
1501 # Both are available from http://cjkpython.i18n.org/
1502 # They're built in if you use Python 2.4.
1503 try:
1504 import cjkcodecs.aliases
1505 except:
1506 pass
1507 try:
1508 import iconv_codec
1509 except:
1510 pass
1511
1512 class UnicodeDammit:
1513 """A class for detecting the encoding of a *ML document and
1514 converting it to a Unicode string. If the source encoding is
1515 windows-1252, can replace MS smart quotes with their HTML or XML
1516 equivalents."""
1517
1518 # This dictionary maps commonly seen values for "charset" in HTML
1519 # meta tags to the corresponding Python codec names. It only covers
1520 # values that aren't in Python's aliases and can't be determined
1521 # by the heuristics in find_codec.
1522 CHARSET_ALIASES = { "macintosh" : "mac-roman",
1523 "x-sjis" : "shift-jis" }
1524
1525 def __init__(self, markup, overrideEncodings=[],
1526 smartQuotesTo='xml'):
1527 self.markup, documentEncoding, sniffedEncoding = \
1528 self._detectEncoding(markup)
1529 self.smartQuotesTo = smartQuotesTo
1530 self.triedEncodings = []
1531 if markup == '' or isinstance(markup, unicode):
1532 self.originalEncoding = None
1533 self.unicode = unicode(markup)
1534 return
1535
1536 u = None
1537 for proposedEncoding in overrideEncodings:
1538 u = self._convertFrom(proposedEncoding)
1539 if u: break
1540 if not u:
1541 for proposedEncoding in (documentEncoding, sniffedEncoding):
1542 u = self._convertFrom(proposedEncoding)
1543 if u: break
1544
1545 # If no luck and we have auto-detection library, try that:
1546 if not u and chardet and not isinstance(self.markup, unicode):
1547 u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1548
1549 # As a last resort, try utf-8 and windows-1252:
1550 if not u:
1551 for proposed_encoding in ("utf-8", "windows-1252"):
1552 u = self._convertFrom(proposed_encoding)
1553 if u: break
1554 self.unicode = u
1555 if not u: self.originalEncoding = None
1556
1557 def _subMSChar(self, orig):
1558 """Changes a MS smart quote character to an XML or HTML
1559 entity."""
1560 sub = self.MS_CHARS.get(orig)
1561 if type(sub) == types.TupleType:
1562 if self.smartQuotesTo == 'xml':
1563 sub = '&#x%s;' % sub[1]
1564 else:
1565 sub = '&%s;' % sub[0]
1566 return sub
1567
1568 def _convertFrom(self, proposed):
1569 proposed = self.find_codec(proposed)
1570 if not proposed or proposed in self.triedEncodings:
1571 return None
1572 self.triedEncodings.append(proposed)
1573 markup = self.markup
1574
1575 # Convert smart quotes to HTML if coming from an encoding
1576 # that might have them.
1577 if self.smartQuotesTo and proposed.lower() in("windows-1252",
1578 "iso-8859-1",
1579 "iso-8859-2"):
1580 markup = re.compile("([\x80-\x9f])").sub \
1581 (lambda(x): self._subMSChar(x.group(1)),
1582 markup)
1583
1584 try:
1585 # print "Trying to convert document to %s" % proposed
1586 u = self._toUnicode(markup, proposed)
1587 self.markup = u
1588 self.originalEncoding = proposed
1589 except Exception, e:
1590 # print "That didn't work!"
1591 # print e
1592 return None
1593 #print "Correct encoding: %s" % proposed
1594 return self.markup
1595
1596 def _toUnicode(self, data, encoding):
1597 '''Given a string and its encoding, decodes the string into Unicode.
1598 %encoding is a string recognized by encodings.aliases'''
1599
1600 # strip Byte Order Mark (if present)
1601 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1602 and (data[2:4] != '\x00\x00'):
1603 encoding = 'utf-16be'
1604 data = data[2:]
1605 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1606 and (data[2:4] != '\x00\x00'):
1607 encoding = 'utf-16le'
1608 data = data[2:]
1609 elif data[:3] == '\xef\xbb\xbf':
1610 encoding = 'utf-8'
1611 data = data[3:]
1612 elif data[:4] == '\x00\x00\xfe\xff':
1613 encoding = 'utf-32be'
1614 data = data[4:]
1615 elif data[:4] == '\xff\xfe\x00\x00':
1616 encoding = 'utf-32le'
1617 data = data[4:]
1618 newdata = unicode(data, encoding)
1619 return newdata
1620
1621 def _detectEncoding(self, xml_data):
1622 """Given a document, tries to detect its XML encoding."""
1623 xml_encoding = sniffed_xml_encoding = None
1624 try:
1625 if xml_data[:4] == '\x4c\x6f\xa7\x94':
1626 # EBCDIC
1627 xml_data = self._ebcdic_to_ascii(xml_data)
1628 elif xml_data[:4] == '\x00\x3c\x00\x3f':
1629 # UTF-16BE
1630 sniffed_xml_encoding = 'utf-16be'
1631 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1632 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1633 and (xml_data[2:4] != '\x00\x00'):
1634 # UTF-16BE with BOM
1635 sniffed_xml_encoding = 'utf-16be'
1636 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1637 elif xml_data[:4] == '\x3c\x00\x3f\x00':
1638 # UTF-16LE
1639 sniffed_xml_encoding = 'utf-16le'
1640 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1641 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1642 (xml_data[2:4] != '\x00\x00'):
1643 # UTF-16LE with BOM
1644 sniffed_xml_encoding = 'utf-16le'
1645 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1646 elif xml_data[:4] == '\x00\x00\x00\x3c':
1647 # UTF-32BE
1648 sniffed_xml_encoding = 'utf-32be'
1649 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1650 elif xml_data[:4] == '\x3c\x00\x00\x00':
1651 # UTF-32LE
1652 sniffed_xml_encoding = 'utf-32le'
1653 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1654 elif xml_data[:4] == '\x00\x00\xfe\xff':
1655 # UTF-32BE with BOM
1656 sniffed_xml_encoding = 'utf-32be'
1657 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1658 elif xml_data[:4] == '\xff\xfe\x00\x00':
1659 # UTF-32LE with BOM
1660 sniffed_xml_encoding = 'utf-32le'
1661 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1662 elif xml_data[:3] == '\xef\xbb\xbf':
1663 # UTF-8 with BOM
1664 sniffed_xml_encoding = 'utf-8'
1665 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1666 else:
1667 sniffed_xml_encoding = 'ascii'
1668 pass
1669 xml_encoding_match = re.compile \
1670 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1671 .match(xml_data)
1672 except:
1673 xml_encoding_match = None
1674 if xml_encoding_match:
1675 xml_encoding = xml_encoding_match.groups()[0].lower()
1676 if sniffed_xml_encoding and \
1677 (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1678 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1679 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1680 'utf16', 'u16')):
1681 xml_encoding = sniffed_xml_encoding
1682 return xml_data, xml_encoding, sniffed_xml_encoding
1683
1684
1685 def find_codec(self, charset):
1686 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1687 or (charset and self._codec(charset.replace("-", ""))) \
1688 or (charset and self._codec(charset.replace("-", "_"))) \
1689 or charset
1690
1691 def _codec(self, charset):
1692 if not charset: return charset
1693 codec = None
1694 try:
1695 codecs.lookup(charset)
1696 codec = charset
1697 except LookupError:
1698 pass
1699 return codec
1700
1701 EBCDIC_TO_ASCII_MAP = None
1702 def _ebcdic_to_ascii(self, s):
1703 c = self.__class__
1704 if not c.EBCDIC_TO_ASCII_MAP:
1705 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1706 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1707 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1708 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1709 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1710 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1711 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1712 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1713 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1714 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1715 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1716 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1717 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1718 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1719 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1720 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1721 250,251,252,253,254,255)
1722 import string
1723 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1724 ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1725 return s.translate(c.EBCDIC_TO_ASCII_MAP)
1726
1727 MS_CHARS = { '\x80' : ('euro', '20AC'),
1728 '\x81' : ' ',
1729 '\x82' : ('sbquo', '201A'),
1730 '\x83' : ('fnof', '192'),
1731 '\x84' : ('bdquo', '201E'),
1732 '\x85' : ('hellip', '2026'),
1733 '\x86' : ('dagger', '2020'),
1734 '\x87' : ('Dagger', '2021'),
1735 '\x88' : ('circ', '2C6'),
1736 '\x89' : ('permil', '2030'),
1737 '\x8A' : ('Scaron', '160'),
1738 '\x8B' : ('lsaquo', '2039'),
1739 '\x8C' : ('OElig', '152'),
1740 '\x8D' : '?',
1741 '\x8E' : ('#x17D', '17D'),
1742 '\x8F' : '?',
1743 '\x90' : '?',
1744 '\x91' : ('lsquo', '2018'),
1745 '\x92' : ('rsquo', '2019'),
1746 '\x93' : ('ldquo', '201C'),
1747 '\x94' : ('rdquo', '201D'),
1748 '\x95' : ('bull', '2022'),
1749 '\x96' : ('ndash', '2013'),
1750 '\x97' : ('mdash', '2014'),
1751 '\x98' : ('tilde', '2DC'),
1752 '\x99' : ('trade', '2122'),
1753 '\x9a' : ('scaron', '161'),
1754 '\x9b' : ('rsaquo', '203A'),
1755 '\x9c' : ('oelig', '153'),
1756 '\x9d' : '?',
1757 '\x9e' : ('#x17E', '17E'),
1758 '\x9f' : ('Yuml', ''),}
1759
1760 #######################################################################
1761
1762
1763 #By default, act as an HTML pretty-printer.
1764 if __name__ == '__main__':
1765 import sys
1766 soup = BeautifulSoup(sys.stdin.read())
1767 print soup.prettify()
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.