530 lines
		
	
	
		
			20 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			530 lines
		
	
	
		
			20 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| 
								 | 
							
								"""Beautiful Soup
							 | 
						||
| 
								 | 
							
								Elixir and Tonic
							 | 
						||
| 
								 | 
							
								"The Screen-Scraper's Friend"
							 | 
						||
| 
								 | 
							
								http://www.crummy.com/software/BeautifulSoup/
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Beautiful Soup uses a pluggable XML or HTML parser to parse a
							 | 
						||
| 
								 | 
							
								(possibly invalid) document into a tree representation. Beautiful Soup
							 | 
						||
| 
								 | 
							
								provides methods and Pythonic idioms that make it easy to navigate,
							 | 
						||
| 
								 | 
							
								search, and modify the parse tree.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								Beautiful Soup works with Python 2.7 and up. It works better if lxml
							 | 
						||
| 
								 | 
							
								and/or html5lib is installed.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								For more than you ever wanted to know about Beautiful Soup, see the
							 | 
						||
| 
								 | 
							
								documentation:
							 | 
						||
| 
								 | 
							
								http://www.crummy.com/software/BeautifulSoup/bs4/doc/
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								"""
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Use of this source code is governed by a BSD-style license that can be
							 | 
						||
| 
								 | 
							
								# found in the LICENSE file.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								__author__ = "Leonard Richardson (leonardr@segfault.org)"
							 | 
						||
| 
								 | 
							
								__version__ = "4.6.0"
							 | 
						||
| 
								 | 
							
								__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
							 | 
						||
| 
								 | 
							
								__license__ = "MIT"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								__all__ = ['BeautifulSoup']
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								import os
							 | 
						||
| 
								 | 
							
								import re
							 | 
						||
| 
								 | 
							
								import traceback
							 | 
						||
| 
								 | 
							
								import warnings
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from .builder import builder_registry, ParserRejectedMarkup
							 | 
						||
| 
								 | 
							
								from .dammit import UnicodeDammit
							 | 
						||
| 
								 | 
							
								from .element import (
							 | 
						||
| 
								 | 
							
								    CData,
							 | 
						||
| 
								 | 
							
								    Comment,
							 | 
						||
| 
								 | 
							
								    DEFAULT_OUTPUT_ENCODING,
							 | 
						||
| 
								 | 
							
								    Declaration,
							 | 
						||
| 
								 | 
							
								    Doctype,
							 | 
						||
| 
								 | 
							
								    NavigableString,
							 | 
						||
| 
								 | 
							
								    PageElement,
							 | 
						||
| 
								 | 
							
								    ProcessingInstruction,
							 | 
						||
| 
								 | 
							
								    ResultSet,
							 | 
						||
| 
								 | 
							
								    SoupStrainer,
							 | 
						||
| 
								 | 
							
								    Tag,
							 | 
						||
| 
								 | 
							
								    )
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# The very first thing we do is give a useful error if someone is
							 | 
						||
| 
								 | 
							
								# running this code under Python 3 without converting it.
							 | 
						||
| 
								 | 
							
								'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class BeautifulSoup(Tag):
							 | 
						||
| 
								 | 
							
								    """
							 | 
						||
| 
								 | 
							
								    This class defines the basic interface called by the tree builders.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    These methods will be called by the parser:
							 | 
						||
| 
								 | 
							
								      reset()
							 | 
						||
| 
								 | 
							
								      feed(markup)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    The tree builder may call these methods from its feed() implementation:
							 | 
						||
| 
								 | 
							
								      handle_starttag(name, attrs) # See note about return value
							 | 
						||
| 
								 | 
							
								      handle_endtag(name)
							 | 
						||
| 
								 | 
							
								      handle_data(data) # Appends to the current data node
							 | 
						||
| 
								 | 
							
								      endData(containerClass=NavigableString) # Ends the current data node
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    No matter how complicated the underlying parser is, you should be
							 | 
						||
| 
								 | 
							
								    able to build a tree using 'start tag' events, 'end tag' events,
							 | 
						||
| 
								 | 
							
								    'data' events, and "done with data" events.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    If you encounter an empty-element tag (aka a self-closing tag,
							 | 
						||
| 
								 | 
							
								    like HTML's <br> tag), call handle_starttag and then
							 | 
						||
| 
								 | 
							
								    handle_endtag.
							 | 
						||
| 
								 | 
							
								    """
							 | 
						||
| 
								 | 
							
								    ROOT_TAG_NAME = u'[document]'
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    # If the end-user gives no indication which tree builder they
							 | 
						||
| 
								 | 
							
								    # want, look for one with these features.
							 | 
						||
| 
								 | 
							
								    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    # Template for the warning issued when the constructor had to pick
								    # a tree builder on the caller's behalf. It is filled in with the
								    # keys markup_type, parser, line_number and filename.
								    NO_PARSER_SPECIFIED_WARNING = (
								        "No parser was explicitly specified, so I'm using the best "
								        "available %(markup_type)s parser for this system "
								        "(\"%(parser)s\"). This usually isn't a problem, but if you run "
								        "this code on another system, or in a different virtual "
								        "environment, it may use a different parser and behave "
								        "differently.\n\n"
								        "The code that caused this warning is on line %(line_number)s "
								        "of the file %(filename)s. To get rid of this warning, change "
								        "code that looks like this:\n\n"
								        " BeautifulSoup(YOUR_MARKUP)\n\n"
								        "to this:\n\n"
								        " BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
								    )
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def __init__(self, markup="", features=None, builder=None,
								                 parse_only=None, from_encoding=None, exclude_encodings=None,
								                 **kwargs):
								        """The Soup object is initialized as the 'root tag', and the
								        provided markup (which can be a string or a file-like object)
								        is fed into the underlying parser.

								        :param markup: The document to parse: a byte string, a Unicode
								            string, or any object with a read() method.
								        :param features: A feature name, or a list of feature names,
								            used to look up a tree builder in the builder registry when
								            no `builder` is given. Defaults to DEFAULT_BUILDER_FEATURES.
								        :param builder: A tree builder instance to use instead of
								            looking one up by `features`.
								        :param parse_only: Stored as self.parse_only, which endData()
								            consults to decide whether a top-level string is kept.
								        :param from_encoding: Passed to the builder's prepare_markup().
								            Ignored, with a warning, when `markup` is already Unicode.
								        :param exclude_encodings: Passed to the builder's
								            prepare_markup().
								        :param kwargs: Only legacy Beautiful Soup 3 argument names are
								            accepted; each one produces a warning.
								        :raise TypeError: If an unrecognized keyword argument is given.
								        :raise FeatureNotFound: If no registered tree builder offers
								            the requested features.
								        """

								        # Beautiful Soup 3 arguments that no longer do anything. Every
								        # one of them is removed from kwargs, so that it is not reported
								        # as an unexpected keyword further down, and triggers a warning.
								        ignored_bs3_arguments = (
								            ('convertEntities',
								             "Entities are always converted to Unicode characters."),
								            ('markupMassage',
								             "The tree builder is responsible for any necessary markup "
								             "massage."),
								            ('smartQuotesTo',
								             "Smart quotes are always converted to Unicode characters."),
								            ('selfClosingTags',
								             "The tree builder is responsible for understanding "
								             "self-closing tags."),
								            ('isHTML',
								             "Suggest you use features='lxml' for HTML and "
								             "features='lxml-xml' for XML."),
								        )
								        for legacy_name, advice in ignored_bs3_arguments:
								            if legacy_name in kwargs:
								                del kwargs[legacy_name]
								                warnings.warn(
								                    "BS4 does not respect the %s argument to the "
								                    "BeautifulSoup constructor. %s" % (legacy_name, advice))

								        def deprecated_argument(old_name, new_name):
								            """Remove a renamed argument from kwargs and return its
								            value (None if absent), warning about the rename."""
								            if old_name in kwargs:
								                warnings.warn(
								                    'The "%s" argument to the BeautifulSoup constructor '
								                    'has been renamed to "%s."' % (old_name, new_name))
								                value = kwargs[old_name]
								                del kwargs[old_name]
								                return value
								            return None

								        parse_only = parse_only or deprecated_argument(
								            "parseOnlyThese", "parse_only")

								        from_encoding = from_encoding or deprecated_argument(
								            "fromEncoding", "from_encoding")

								        if from_encoding and isinstance(markup, unicode):
								            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
								            from_encoding = None

								        # Anything still left in kwargs is not an argument we know.
								        if len(kwargs) > 0:
								            arg = kwargs.keys().pop()
								            raise TypeError(
								                "__init__() got an unexpected keyword argument '%s'" % arg)

								        if builder is None:
								            original_features = features
								            if isinstance(features, basestring):
								                features = [features]
								            if features is None or len(features) == 0:
								                features = self.DEFAULT_BUILDER_FEATURES
								            builder_class = builder_registry.lookup(*features)
								            if builder_class is None:
								                raise FeatureNotFound(
								                    "Couldn't find a tree builder with the features you "
								                    "requested: %s. Do you need to install a parser library?"
								                    % ",".join(features))
								            builder = builder_class()
								            # Warn unless the caller asked for this builder by its
								            # name or by one of its alternate names.
								            if not (original_features == builder.NAME or
								                    original_features in builder.ALTERNATE_NAMES):
								                if builder.is_xml:
								                    markup_type = "XML"
								                else:
								                    markup_type = "HTML"

								                # extract_stack() lists frames oldest first, so with
								                # limit=2 entry 0 is the code that called this
								                # constructor and entry 1 is this method itself.
								                caller = traceback.extract_stack(limit=2)[0]
								                filename = caller[0]
								                line_number = caller[1]
								                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
								                    filename=filename,
								                    line_number=line_number,
								                    parser=builder.NAME,
								                    markup_type=markup_type))

								        self.builder = builder
								        self.is_xml = builder.is_xml
								        self.known_xml = self.is_xml
								        self.builder.soup = self

								        self.parse_only = parse_only

								        if hasattr(markup, 'read'):        # It's a file-type object.
								            markup = markup.read()
								        elif len(markup) <= 256 and (
								                (isinstance(markup, bytes) and not b'<' in markup)
								                or (isinstance(markup, unicode) and not u'<' in markup)
								        ):
								            # Print out warnings for a couple beginner problems
								            # involving passing non-markup to Beautiful Soup.
								            # Beautiful Soup will still parse the input as markup,
								            # just in case that's what the user really wants.
								            if (isinstance(markup, unicode)
								                and not os.path.supports_unicode_filenames):
								                possible_filename = markup.encode("utf8")
								            else:
								                possible_filename = markup
								            is_file = False
								            try:
								                is_file = os.path.exists(possible_filename)
								            except Exception:
								                # This is almost certainly a problem involving
								                # characters not valid in filenames on this
								                # system. Just let it go.
								                pass
								            if is_file:
								                if isinstance(markup, unicode):
								                    markup = markup.encode("utf8")
								                warnings.warn(
								                    '"%s" looks like a filename, not markup. You should'
								                    ' probably open this file and pass the filehandle into'
								                    ' Beautiful Soup.' % markup)
								            self._check_markup_is_url(markup)

								        # prepare_markup() yields candidate interpretations of the
								        # markup; stop at the first one the parser does not reject.
								        for (self.markup, self.original_encoding, self.declared_html_encoding,
								         self.contains_replacement_characters) in (
								             self.builder.prepare_markup(
								                 markup, from_encoding, exclude_encodings=exclude_encodings)):
								            self.reset()
								            try:
								                self._feed()
								                break
								            except ParserRejectedMarkup:
								                pass

								        # Clear out the markup and remove the builder's circular
								        # reference to this object.
								        self.markup = None
								        self.builder.soup = None
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def __copy__(self):
								        """Return a new soup made by re-parsing this one's UTF-8 output.

								        The duplicate is built by the same class and with the same
								        builder object as this soup.
								        """
								        utf8_markup = self.encode('utf-8')
								        soup_class = type(self)
								        duplicate = soup_class(
								            utf8_markup, builder=self.builder, from_encoding='utf-8')

								        # The duplicate was parsed from UTF-8, which is not
								        # necessarily how the original document was encoded, so carry
								        # the original's .original_encoding over.
								        duplicate.original_encoding = self.original_encoding
								        return duplicate
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def __getstate__(self):
							 | 
						||
| 
								 | 
							
								        # Frequently a tree builder can't be pickled.
							 | 
						||
| 
								 | 
							
								        d = dict(self.__dict__)
							 | 
						||
| 
								 | 
							
								        if 'builder' in d and not self.builder.picklable:
							 | 
						||
| 
								 | 
							
								            d['builder'] = None
							 | 
						||
| 
								 | 
							
								        return d
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    @staticmethod
							 | 
						||
| 
								 | 
							
								    def _check_markup_is_url(markup):
							 | 
						||
| 
								 | 
							
								        """ 
							 | 
						||
| 
								 | 
							
								        Check if markup looks like it's actually a url and raise a warning 
							 | 
						||
| 
								 | 
							
								        if so. Markup can be unicode or str (py2) / bytes (py3).
							 | 
						||
| 
								 | 
							
								        """
							 | 
						||
| 
								 | 
							
								        if isinstance(markup, bytes):
							 | 
						||
| 
								 | 
							
								            space = b' '
							 | 
						||
| 
								 | 
							
								            cant_start_with = (b"http:", b"https:")
							 | 
						||
| 
								 | 
							
								        elif isinstance(markup, unicode):
							 | 
						||
| 
								 | 
							
								            space = u' '
							 | 
						||
| 
								 | 
							
								            cant_start_with = (u"http:", u"https:")
							 | 
						||
| 
								 | 
							
								        else:
							 | 
						||
| 
								 | 
							
								            return
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        if any(markup.startswith(prefix) for prefix in cant_start_with):
							 | 
						||
| 
								 | 
							
								            if not space in markup:
							 | 
						||
| 
								 | 
							
								                if isinstance(markup, bytes):
							 | 
						||
| 
								 | 
							
								                    decoded_markup = markup.decode('utf-8', 'replace')
							 | 
						||
| 
								 | 
							
								                else:
							 | 
						||
| 
								 | 
							
								                    decoded_markup = markup
							 | 
						||
| 
								 | 
							
								                warnings.warn(
							 | 
						||
| 
								 | 
							
								                    '"%s" looks like a URL. Beautiful Soup is not an'
							 | 
						||
| 
								 | 
							
								                    ' HTTP client. You should probably use an HTTP client like'
							 | 
						||
| 
								 | 
							
								                    ' requests to get the document behind the URL, and feed'
							 | 
						||
| 
								 | 
							
								                    ' that document to Beautiful Soup.' % decoded_markup
							 | 
						||
| 
								 | 
							
								                )
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def _feed(self):
							 | 
						||
| 
								 | 
							
								        # Convert the document to Unicode.
							 | 
						||
| 
								 | 
							
								        self.builder.reset()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        self.builder.feed(self.markup)
							 | 
						||
| 
								 | 
							
								        # Close out any unfinished strings and close all the open tags.
							 | 
						||
| 
								 | 
							
								        self.endData()
							 | 
						||
| 
								 | 
							
								        while self.currentTag.name != self.ROOT_TAG_NAME:
							 | 
						||
| 
								 | 
							
								            self.popTag()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def reset(self):
								        """Re-initialize this object as an empty root tag and clear
								        all parsing state."""
								        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
								        self.hidden = 1
								        self.builder.reset()

								        # Start with no pending text and empty tag stacks...
								        self.tagStack, self.preserve_whitespace_tag_stack = [], []
								        self.currentTag = None
								        self.current_data = []

								        # ...then make this object itself the bottom of the tag stack.
								        self.pushTag(self)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
								        """Build and return a new Tag that uses this soup's builder.

								        Extra keyword arguments are handed to the Tag as its
								        attributes.
								        """
								        fresh_tag = Tag(
								            None, self.builder, name, namespace, nsprefix, attrs)
								        return fresh_tag
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def new_string(self, s, subclass=NavigableString):
							 | 
						||
| 
								 | 
							
								        """Create a new NavigableString associated with this soup."""
							 | 
						||
| 
								 | 
							
								        return subclass(s)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def insert_before(self, successor):
								        """Always raise NotImplementedError: this operation is not
								        supported on BeautifulSoup objects."""
								        message = "BeautifulSoup objects don't support insert_before()."
								        raise NotImplementedError(message)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def insert_after(self, successor):
								        """Always raise NotImplementedError: this operation is not
								        supported on BeautifulSoup objects."""
								        message = "BeautifulSoup objects don't support insert_after()."
								        raise NotImplementedError(message)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def popTag(self):
							 | 
						||
| 
								 | 
							
								        tag = self.tagStack.pop()
							 | 
						||
| 
								 | 
							
								        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
							 | 
						||
| 
								 | 
							
								            self.preserve_whitespace_tag_stack.pop()
							 | 
						||
| 
								 | 
							
								        #print "Pop", tag.name
							 | 
						||
| 
								 | 
							
								        if self.tagStack:
							 | 
						||
| 
								 | 
							
								            self.currentTag = self.tagStack[-1]
							 | 
						||
| 
								 | 
							
								        return self.currentTag
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def pushTag(self, tag):
							 | 
						||
| 
								 | 
							
								        #print "Push", tag.name
							 | 
						||
| 
								 | 
							
								        if self.currentTag:
							 | 
						||
| 
								 | 
							
								            self.currentTag.contents.append(tag)
							 | 
						||
| 
								 | 
							
								        self.tagStack.append(tag)
							 | 
						||
| 
								 | 
							
								        self.currentTag = self.tagStack[-1]
							 | 
						||
| 
								 | 
							
								        if tag.name in self.builder.preserve_whitespace_tags:
							 | 
						||
| 
								 | 
							
								            self.preserve_whitespace_tag_stack.append(tag)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def endData(self, containerClass=NavigableString):
								        """Flush the buffered character data into the parse tree.

								        The pieces collected in ``self.current_data`` are joined into
								        one string, the buffer is emptied, and the string is wrapped
								        in `containerClass` and passed to object_was_parsed(). Nothing
								        happens when the buffer is empty.

								        :param containerClass: Callable used to wrap the joined text
								            before it is added to the tree.
								        """
								        if not self.current_data:
								            return

								        text = u''.join(self.current_data)

								        # Outside of whitespace-preserving tags, a string made up of
								        # nothing but ASCII spaces is collapsed to one newline (if it
								        # contained any) or one space.
								        if not self.preserve_whitespace_tag_stack:
								            only_spaces = all(ch in self.ASCII_SPACES for ch in text)
								            if only_spaces:
								                text = '\n' if '\n' in text else ' '

								        # The collector is emptied before deciding whether to keep
								        # the string, so rejected text is discarded as well.
								        self.current_data = []

								        # With a strainer active and at most one entry on the tag
								        # stack, the string is kept only if the strainer has a text
								        # criterion and that criterion matches.
								        strainer = self.parse_only
								        if strainer and len(self.tagStack) <= 1:
								            if not strainer.text or not strainer.search(text):
								                return

								        self.object_was_parsed(containerClass(text))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def object_was_parsed(self, o, parent=None, most_recent_element=None):
								        """Add an object to the parse tree.

								        `o` is set up with its neighbours, recorded as the most
								        recently parsed element and appended to ``parent.contents``.
								        If `parent` already has a next sibling, the element and
								        sibling links around `o` are then rebuilt from its position
								        in ``parent.contents``.

								        :param o: The element to add.
								        :param parent: The element that receives `o`; defaults to
								            ``self.currentTag``.
								        :param most_recent_element: The element to use as the
								            previous element of `o`; defaults to
								            ``self._most_recent_element``.
								        :raises ValueError: If `o` is not found in
								            ``parent.contents`` after being appended to it.
								        """
								        # Absence is tested with "is None" throughout this method
								        # rather than by truthiness: a node that evaluates as false
								        # is still a real node and must be linked like any other.
								        if parent is None:
								            parent = self.currentTag
								        if most_recent_element is not None:
								            previous_element = most_recent_element
								        else:
								            previous_element = self._most_recent_element

								        next_element = previous_sibling = next_sibling = None
								        if isinstance(o, Tag):
								            # A Tag may already carry links; keep them.
								            next_element = o.next_element
								            next_sibling = o.next_sibling
								            previous_sibling = o.previous_sibling
								            if previous_element is None:
								                previous_element = o.previous_element

								        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

								        self._most_recent_element = o
								        parent.contents.append(o)

								        if parent.next_sibling is not None:
								            # This node is being inserted into an element that has
								            # already been parsed. Deal with any dangling references.

								            # Locate o by identity, searching from the end because
								            # it was just appended.
								            index = len(parent.contents)-1
								            while index >= 0:
								                if parent.contents[index] is o:
								                    break
								                index -= 1
								            else:
								                raise ValueError(
								                    "Error building tree: supposedly %r was inserted "
								                    "into %r after the fact, but I don't see it!" % (
								                        o, parent
								                    )
								                )

								            # Work out the neighbours from o's position in parent.
								            if index == 0:
								                previous_element = parent
								                previous_sibling = None
								            else:
								                previous_element = previous_sibling = parent.contents[index-1]
								            if index == len(parent.contents)-1:
								                next_element = parent.next_sibling
								                next_sibling = None
								            else:
								                next_element = next_sibling = parent.contents[index+1]

								            # Wire the links in both directions.
								            o.previous_element = previous_element
								            if previous_element is not None:
								                previous_element.next_element = o
								            o.next_element = next_element
								            if next_element is not None:
								                next_element.previous_element = o
								            o.next_sibling = next_sibling
								            if next_sibling is not None:
								                next_sibling.previous_sibling = o
								            o.previous_sibling = previous_sibling
								            if previous_sibling is not None:
								                previous_sibling.next_sibling = o
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
								        """Pop the tag stack back to the most recent matching tag.

								        Tags are popped from the top of the stack until one is found
								        whose name and prefix equal `name` and `nsprefix`. That tag
								        is popped too when `inclusivePop` is true and left in place
								        otherwise. The entry at the bottom of the stack is never
								        examined or popped, and a request for ROOT_TAG_NAME does
								        nothing.

								        :return: The last tag that was popped, or None if nothing
								            was popped.
								        """
								        if name == self.ROOT_TAG_NAME:
								            # The BeautifulSoup object itself can never be popped.
								            return

								        popped = None
								        depth = len(self.tagStack) - 1
								        while depth > 0:
								            candidate = self.tagStack[depth]
								            matched = (name == candidate.name
								                       and nsprefix == candidate.prefix)
								            if matched and not inclusivePop:
								                # Stop just above the matching tag.
								                break
								            popped = self.popTag()
								            if matched:
								                break
								            depth -= 1

								        return popped
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def handle_starttag(self, name, namespace, nsprefix, attrs):
								        """Push a start tag on to the stack.

								        Any buffered character data is flushed first. A new Tag is
								        then created with the current tag and the most recently
								        parsed element as its context, linked after that element,
								        and pushed on to the tag stack.

								        If this method returns None, the tag was rejected by the
								        SoupStrainer. You should proceed as if the tag had not occurred
								        in the document. For instance, if this was a self-closing tag,
								        don't call handle_endtag.

								        :return: The new Tag, or None if it was rejected.
								        """
								        self.endData()

								        # With a strainer active and at most one entry on the tag
								        # stack, reject the tag if the strainer is text-based or
								        # does not match this tag.
								        strainer = self.parse_only
								        if strainer and len(self.tagStack) <= 1:
								            if strainer.text or not strainer.search_tag(name, attrs):
								                return None

								        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
								                  self.currentTag, self._most_recent_element)
								        # NOTE(review): a constructor call normally cannot yield
								        # None; the check is kept as it was.
								        if tag is None:
								            return tag
								        # Test for absence with "is not None", not truthiness, so a
								        # preceding node that evaluates as false still gets its
								        # next_element link.
								        if self._most_recent_element is not None:
								            self._most_recent_element.next_element = tag
								        self._most_recent_element = tag
								        self.pushTag(tag)
								        return tag
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def handle_endtag(self, name, nsprefix=None):
								        """Handle an end tag.

								        Flushes any buffered character data, then pops the tag stack
								        up to and including the most recent tag matching `name` and
								        `nsprefix`.
								        """
								        # Flush pending text before the stack changes.
								        self.endData()
								        self._popToTag(name, nsprefix)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def handle_data(self, data):
								        """Buffer a piece of character data.

								        The data is only added to ``self.current_data`` here; it is
								        joined and placed in the tree later by endData().
								        """
								        pending = self.current_data
								        pending.append(data)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def decode(self, pretty_print=False,
								               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
								               formatter="minimal"):
								        """Returns a string or Unicode representation of this document.
								        To get Unicode, pass None for encoding.

								        :param pretty_print: If true, the parent class's decode() is
								            called with an indent level of 0, otherwise with None.
								        :param eventual_encoding: Encoding named in the XML
								            declaration that is prepended for XML documents; None
								            leaves the encoding attribute out. Also passed on to the
								            parent class's decode().
								        :param formatter: Passed on unchanged to the parent class's
								            decode().
								        """
								        if self.is_xml:
								            # Print the XML declaration
								            encoding_part = ''
								            # Identity test: only a real None drops the attribute.
								            if eventual_encoding is not None:
								                encoding_part = ' encoding="%s"' % eventual_encoding
								            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
								        else:
								            prefix = u''
								        indent_level = 0 if pretty_print else None
								        return prefix + super(BeautifulSoup, self).decode(
								            indent_level, eventual_encoding, formatter)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Short aliases for the main class, so that it can be imported with
								# less typing: 'from bs4 import _soup' (or 'from bs4 import _s').
								_s = _soup = BeautifulSoup
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class BeautifulStoneSoup(BeautifulSoup):
								    """Deprecated interface to an XML parser.

								    Constructing an instance issues a warning and forces the
								    ``features`` keyword argument to ``'xml'``; all other arguments
								    are passed to the parent constructor unchanged.
								    """

								    def __init__(self, *args, **kwargs):
								        # Any 'features' value supplied by the caller is overwritten.
								        kwargs.update(features='xml')
								        deprecation_notice = (
								            'The BeautifulStoneSoup class is deprecated. Instead of using '
								            'it, pass features="xml" into the BeautifulSoup constructor.')
								        warnings.warn(deprecation_notice)
								        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class StopParsing(Exception):
								    """Exception for stopping a parse.

								    NOTE(review): nothing in this part of the file raises or catches
								    it; check the raise sites for the exact semantics.
								    """
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class FeatureNotFound(ValueError):
								    """ValueError subclass, so ``except ValueError`` also catches it.

								    NOTE(review): presumably raised when a requested parser feature
								    is unavailable -- confirm at the raise sites.
								    """
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# By default, act as an HTML pretty-printer: read a document from
								# standard input and write the prettified markup to standard output.
								if __name__ == '__main__':
								    import sys
								    soup = BeautifulSoup(sys.stdin)
								    # Call form of print: valid on Python 2.7 and on Python 3, where
								    # the print statement is a syntax error.
								    print(soup.prettify())
							 |