322 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			322 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| 
								 | 
							
								# Use of this source code is governed by a BSD-style license that can be
							 | 
						||
| 
								 | 
							
								# found in the LICENSE file.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from collections import defaultdict
							 | 
						||
| 
								 | 
							
								import itertools
							 | 
						||
| 
								 | 
							
								import sys
							 | 
						||
| 
								 | 
							
								from ..element import (
							 | 
						||
| 
								 | 
							
								    CharsetMetaAttributeValue,
							 | 
						||
| 
								 | 
							
								    ContentMetaAttributeValue,
							 | 
						||
| 
								 | 
							
								    HTMLAwareEntitySubstitution,
							 | 
						||
| 
								 | 
							
								    whitespace_re
							 | 
						||
| 
								 | 
							
								    )
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Public names of this module. register_treebuilders_from() appends the
								# name of every builder class it copies in.
								__all__ = [
								    'HTMLTreeBuilder',
								    'SAXTreeBuilder',
								    'TreeBuilder',
								    'TreeBuilderRegistry',
								    ]

								# Some useful features for a TreeBuilder to have.
								# Presumably these are the strings builder classes list in their
								# ``features`` attribute and callers pass to
								# TreeBuilderRegistry.lookup() -- confirm against the concrete builders.
								FAST = 'fast'
								PERMISSIVE = 'permissive'
								STRICT = 'strict'
								XML = 'xml'
								HTML = 'html'
								HTML_5 = 'html5'
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class TreeBuilderRegistry(object):
								    """Remember tree builder classes and pick one by requested features.

								    Builders are kept newest-first, both overall and per feature, so a
								    later registration outranks an earlier one.
								    """

								    def __init__(self):
								        # Every registered builder class, most recent first.
								        self.builders = []
								        # Feature string -> builder classes advertising it, most recent
								        # first.
								        self.builders_for_feature = defaultdict(list)

								    def register(self, treebuilder_class):
								        """Register a treebuilder based on its advertised features.

								        The class is put at the front of the list for each entry of its
								        ``features`` attribute and at the front of the overall list.
								        """
								        for advertised in treebuilder_class.features:
								            self.builders_for_feature[advertised].insert(
								                0, treebuilder_class)
								        self.builders.insert(0, treebuilder_class)

								    def lookup(self, *features):
								        """Return the best registered builder for ``features``, or None.

								        With no features, the most recently registered builder is
								        returned. Otherwise features that no builder advertises are
								        skipped; the first remaining feature fixes the priority order
								        and each later one narrows the set of acceptable builders.
								        None is returned when nothing is registered, when no requested
								        feature is known, or when no builder has all the known ones.
								        """
								        if not self.builders:
								            # There are no builders at all.
								            return None

								        if not features:
								            # Nothing specific was requested: newest builder wins.
								            return self.builders[0]

								        ranked = None     # builders for the first known feature, in order
								        acceptable = None  # builders that have every known feature so far
								        for wanted in features:
								            # .get() so that an unknown feature does not create an
								            # empty entry in the defaultdict.
								            matching = self.builders_for_feature.get(wanted, [])
								            if not matching:
								                continue
								            if ranked is None:
								                ranked = matching
								                acceptable = set(matching)
								            else:
								                # Drop builders that lack this feature.
								                acceptable &= set(matching)

								        if acceptable is None:
								            return None
								        # Walk the priority-ordered list and take the first survivor.
								        return next(
								            (builder for builder in ranked if builder in acceptable),
								            None)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# The BeautifulSoup class will take feature lists from developers and use them
								# to look up builders in this registry.
								# This is the shared module-level instance that
								# register_treebuilders_from() registers builder classes with.
								builder_registry = TreeBuilderRegistry()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class TreeBuilder(object):
								    """Turn a document into a Beautiful Soup object tree.

								    This base class only supplies defaults: feed() is not implemented
								    and the class attributes below describe no particular markup
								    language. Subclasses override what they need.
								    """

								    NAME = "[Unknown tree builder]"
								    ALTERNATE_NAMES = []
								    # Feature strings; TreeBuilderRegistry.register() files the class
								    # under each of them.
								    features = []

								    is_xml = False
								    picklable = False
								    preserve_whitespace_tags = set()
								    empty_element_tags = None # A tag will be considered an empty-element
								                              # tag when and only when it has no contents.

								    # A value for these tag/attribute combinations is a space- or
								    # comma-separated list of CDATA, rather than a single CDATA.
								    # Keys are lowercase tag names, or '*' for every tag; values are
								    # collections of attribute names.
								    cdata_list_attributes = {}

								    # Type(s) treated as text when splitting attribute values. Python 2
								    # has ``basestring``; Python 3 removed it, so fall back to ``str``
								    # instead of failing with NameError while parsing.
								    try:
								        _string_types = basestring
								    except NameError:
								        _string_types = str

								    def __init__(self):
								        # No soup object is attached at construction time.
								        self.soup = None

								    def reset(self):
								        """Do nothing; a hook for subclasses that keep parser state."""
								        pass

								    def can_be_empty_element(self, tag_name):
								        """Might a tag with this name be an empty-element tag?

								        The final markup may or may not actually present this tag as
								        self-closing.

								        For instance: an HTMLBuilder does not consider a <p> tag to be
								        an empty-element tag (it's not in
								        HTMLBuilder.empty_element_tags). This means an empty <p> tag
								        will be presented as "<p></p>", not "<p />".

								        The default implementation has no opinion about which tags are
								        empty-element tags, so a tag will be presented as an
								        empty-element tag if and only if it has no contents.
								        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
								        be left alone.

								        Returns True when ``empty_element_tags`` is None, otherwise
								        whether ``tag_name`` is in ``empty_element_tags``.
								        """
								        if self.empty_element_tags is None:
								            return True
								        return tag_name in self.empty_element_tags

								    def feed(self, markup):
								        """Parse ``markup``; must be provided by a subclass.

								        Raises NotImplementedError in this base class.
								        """
								        raise NotImplementedError()

								    def prepare_markup(self, markup, user_specified_encoding=None,
								                       document_declared_encoding=None):
								        """Return ``(markup, None, None, False)`` with ``markup`` untouched.

								        Both encoding arguments are ignored here. The last three tuple
								        slots are presumably the original encoding, the declared
								        encoding and a "contains replacement characters" flag --
								        TODO confirm against the caller.
								        """
								        return markup, None, None, False

								    def test_fragment_to_document(self, fragment):
								        """Wrap an HTML fragment to make it look like a document.

								        Different parsers do this differently. For instance, lxml
								        introduces an empty <head> tag, and html5lib
								        doesn't. Abstracting this away lets us write simple tests
								        which run HTML fragments through the parser and compare the
								        results against other HTML fragments.

								        This method should not be used outside of tests. The base
								        implementation returns ``fragment`` unchanged.
								        """
								        return fragment

								    def set_up_substitutions(self, tag):
								        """Return False; the base builder sets up no substitutions."""
								        return False

								    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
								        """Replaces class="foo bar" with class=["foo", "bar"]

								        Modifies its input in place.

								        ``tag_name`` is lowercased before it is looked up in
								        ``cdata_list_attributes``; attributes listed under '*' apply to
								        every tag. Returns ``attrs`` (the same object that was passed
								        in).
								        """
								        if not attrs:
								            return attrs
								        if self.cdata_list_attributes:
								            universal = self.cdata_list_attributes.get('*', [])
								            tag_specific = self.cdata_list_attributes.get(
								                tag_name.lower(), None)
								            for attr in attrs.keys():
								                if attr in universal or (tag_specific and attr in tag_specific):
								                    # We have a "class"-type attribute whose string
								                    # value is a whitespace-separated list of
								                    # values. Split it into a list.
								                    value = attrs[attr]
								                    if isinstance(value, self._string_types):
								                        values = whitespace_re.split(value)
								                    else:
								                        # html5lib sometimes calls setAttributes twice
								                        # for the same tag when rearranging the parse
								                        # tree. On the second call the attribute value
								                        # here is already a list.  If this happens,
								                        # leave the value alone rather than trying to
								                        # split it again.
								                        values = value
								                    # Only existing keys are reassigned, so the dict
								                    # does not change size during iteration.
								                    attrs[attr] = values
								        return attrs
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class SAXTreeBuilder(TreeBuilder):
								    """A Beautiful Soup treebuilder that listens for SAX events.

								    Element start, element end and character events are forwarded to
								    ``self.soup``; namespace and document-boundary events are ignored.
								    """

								    def feed(self, markup):
								        """Not implemented here; raises NotImplementedError."""
								        raise NotImplementedError()

								    def close(self):
								        """Do nothing."""
								        pass

								    def startElement(self, name, attrs):
								        """Forward an element start to ``self.soup.handle_starttag``.

								        ``attrs`` is any object with ``items()``. A key that is a tuple
								        (as passed through by startElementNS) is reduced to its second
								        item; any other key is used as it is. Previously every key was
								        indexed with ``[1]``, which mangled plain string names.
								        """
								        attrs = dict(
								            (key[1] if isinstance(key, tuple) else key, value)
								            for key, value in attrs.items())
								        self.soup.handle_starttag(name, attrs)

								    def endElement(self, name):
								        """Forward an element end to ``self.soup.handle_endtag``."""
								        self.soup.handle_endtag(name)

								    def startElementNS(self, nsTuple, nodeName, attrs):
								        """Handle a namespaced element start like a plain one."""
								        # Throw away (ns, nodeName) for now.
								        self.startElement(nodeName, attrs)

								    def endElementNS(self, nsTuple, nodeName):
								        """Handle a namespaced element end like a plain one."""
								        # Throw away (ns, nodeName) for now.
								        self.endElement(nodeName)

								    def startPrefixMapping(self, prefix, nodeValue):
								        """Do nothing."""
								        # Ignore the prefix for now.
								        pass

								    def endPrefixMapping(self, prefix):
								        """Do nothing."""
								        # Ignore the prefix for now.
								        pass

								    def characters(self, content):
								        """Forward character data to ``self.soup.handle_data``."""
								        self.soup.handle_data(content)

								    def startDocument(self):
								        """Do nothing."""
								        pass

								    def endDocument(self):
								        """Do nothing."""
								        pass
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class HTMLTreeBuilder(TreeBuilder):
								    """This TreeBuilder knows facts about HTML.

								    Such as which tags are empty-element tags.
								    """

								    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
								    empty_element_tags = set([
								        # These are from HTML5.
								        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
								        'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track',
								        'wbr',

								        # These are from HTML4, removed in HTML5.
								        'spacer', 'frame'
								    ])

								    # The HTML standard defines these attributes as containing a
								    # space-separated list of values, not a single value. That is,
								    # class="foo bar" means that the 'class' attribute has two values,
								    # 'foo' and 'bar', not the single value 'foo bar'.  When we
								    # encounter one of these attributes, we will parse its value into
								    # a list of values if possible. Upon output, the list will be
								    # converted back into a string.
								    cdata_list_attributes = {
								        "*" : ['class', 'accesskey', 'dropzone'],
								        "a" : ['rel', 'rev'],
								        "link" :  ['rel', 'rev'],
								        "td" : ["headers"],
								        "th" : ["headers"],
								        "form" : ["accept-charset"],
								        "object" : ["archive"],

								        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
								        "area" : ["rel"],
								        "icon" : ["sizes"],
								        "iframe" : ["sandbox"],
								        "output" : ["for"],
								        }

								    def set_up_substitutions(self, tag):
								        """Swap a <meta> tag's encoding declaration for a stand-in object.

								        ``tag`` must provide ``name``, ``get()`` and item assignment.
								        Tags other than <meta> are left alone. A ``charset`` attribute
								        is replaced with a CharsetMetaAttributeValue; otherwise, when
								        ``http-equiv`` equals "content-type" (case-insensitively) and
								        ``content`` is present, ``content`` is replaced with a
								        ContentMetaAttributeValue.

								        Returns True only when the ``charset`` attribute was replaced.
								        """
								        # We are only interested in <meta> tags
								        if tag.name != 'meta':
								            return False

								        http_equiv = tag.get('http-equiv')
								        content = tag.get('content')
								        charset = tag.get('charset')

								        # We are interested in <meta> tags that say what encoding the
								        # document was originally in. This means HTML 5-style <meta>
								        # tags that provide the "charset" attribute. It also means
								        # HTML 4-style <meta> tags that provide the "content"
								        # attribute and have "http-equiv" set to "content-type".
								        #
								        # In both cases we will replace the value of the appropriate
								        # attribute with a standin object that can take on any
								        # encoding.
								        meta_encoding = None
								        if charset is not None:
								            # HTML 5 style:
								            # <meta charset="utf8">
								            meta_encoding = charset
								            tag['charset'] = CharsetMetaAttributeValue(charset)

								        elif (content is not None and http_equiv is not None
								              and http_equiv.lower() == 'content-type'):
								            # HTML 4 style:
								            # <meta http-equiv="content-type" content="text/html; charset=utf8">
								            # NOTE(review): meta_encoding stays None on this branch, so
								            # the method returns False even though a substitution was
								            # installed -- confirm whether callers rely on that.
								            tag['content'] = ContentMetaAttributeValue(content)

								        return (meta_encoding is not None)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def register_treebuilders_from(module):
								    """Copy TreeBuilders from the given module into this module.

								    Every name in ``module.__all__`` that refers to a TreeBuilder
								    subclass is set as an attribute of this module, appended to this
								    module's ``__all__`` and registered with ``builder_registry``.
								    Other names are skipped.
								    """
								    # Look this module up under the name it was actually imported as;
								    # a hard-coded dotted path raises KeyError as soon as the package
								    # is installed or vendored under a different name.
								    this_module = sys.modules[__name__]
								    for name in module.__all__:
								        obj = getattr(module, name)

								        # issubclass() raises TypeError for anything that is not a
								        # class, so check that first.
								        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
								            setattr(this_module, name, obj)
								            this_module.__all__.append(name)
								            # Register the builder while we're at it.
								            this_module.builder_registry.register(obj)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class ParserRejectedMarkup(Exception):
								    """Plain Exception subclass that adds no behaviour of its own.

								    Nothing in this module raises it; presumably the parser-specific
								    builders do when a parser refuses some markup -- TODO confirm.
								    """
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Builders are registered in reverse order of priority, so that custom
								# builder registrations will take precedence. In general, we want lxml
								# to take precedence over html5lib, because it's faster. And we only
								# want to use HTMLParser as a last resort.
								# NOTE(review): only the _htmlparser module is registered in the code
								# visible here; the lxml/html5lib remarks above may be leftovers --
								# confirm.
								from . import _htmlparser
								register_treebuilders_from(_htmlparser)
							 |