parent
8227df459a
commit
ad3f37f8ac
@ -0,0 +1,43 @@
|
||||
Behold, mortal, the origins of Beautiful Soup...
|
||||
================================================
|
||||
|
||||
Leonard Richardson is the primary programmer.
|
||||
|
||||
Aaron DeVore is awesome.
|
||||
|
||||
Mark Pilgrim provided the encoding detection code that forms the base
|
||||
of UnicodeDammit.
|
||||
|
||||
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
|
||||
Soup 4 working under Python 3.
|
||||
|
||||
Simon Willison wrote soupselect, which was used to make Beautiful Soup
|
||||
support CSS selectors.
|
||||
|
||||
Sam Ruby helped with a lot of edge cases.
|
||||
|
||||
Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
|
||||
work in solving the nestable tags conundrum.
|
||||
|
||||
An incomplete list of people who have contributed patches to Beautiful
|
||||
Soup:
|
||||
|
||||
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
|
||||
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
|
||||
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
|
||||
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
|
||||
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
|
||||
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
|
||||
Webster, Paul Wright, Danny Yoo
|
||||
|
||||
An incomplete list of people who made suggestions or found bugs or
|
||||
found ways to break Beautiful Soup:
|
||||
|
||||
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
|
||||
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
|
||||
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
|
||||
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
|
||||
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
|
||||
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
|
||||
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
|
||||
Sousa Rocha, Yichun Wei, Per Vognsen
|
@ -0,0 +1,27 @@
|
||||
Beautiful Soup is made available under the MIT license:
|
||||
|
||||
Copyright (c) 2004-2015 Leonard Richardson
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
Beautiful Soup incorporates code from the html5lib library, which is
|
||||
also made available under the MIT license. Copyright (c) 2006-2013
|
||||
James Graham and other contributors
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,63 @@
|
||||
= Introduction =
|
||||
|
||||
>>> from bs4 import BeautifulSoup
|
||||
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
|
||||
>>> print soup.prettify()
|
||||
<html>
|
||||
<body>
|
||||
<p>
|
||||
Some
|
||||
<b>
|
||||
bad
|
||||
<i>
|
||||
HTML
|
||||
</i>
|
||||
</b>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
>>> soup.find(text="bad")
|
||||
u'bad'
|
||||
|
||||
>>> soup.i
|
||||
<i>HTML</i>
|
||||
|
||||
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
|
||||
>>> print soup.prettify()
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<tag1>
|
||||
Some
|
||||
<tag2 />
|
||||
bad
|
||||
<tag3>
|
||||
XML
|
||||
</tag3>
|
||||
</tag1>
|
||||
|
||||
= Full documentation =
|
||||
|
||||
The bs4/doc/ directory contains full documentation in Sphinx
|
||||
format. Run "make html" in that directory to create HTML
|
||||
documentation.
|
||||
|
||||
= Running the unit tests =
|
||||
|
||||
Beautiful Soup supports unit test discovery from the project root directory:
|
||||
|
||||
$ nosetests
|
||||
|
||||
$ python -m unittest discover -s bs4 # Python 2.7 and up
|
||||
|
||||
If you checked out the source tree, you should see a script in the
|
||||
home directory called test-all-versions. This script will run the unit
|
||||
tests under Python 2.7, then create a temporary Python 3 conversion of
|
||||
the source and run the unit tests again under Python 3.
|
||||
|
||||
= Links =
|
||||
|
||||
Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
|
||||
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
http://readthedocs.org/docs/beautiful-soup-4/
|
||||
Discussion group: http://groups.google.com/group/beautifulsoup/
|
||||
Development: https://code.launchpad.net/beautifulsoup/
|
||||
Bug tracker: https://bugs.launchpad.net/beautifulsoup/
|
@ -0,0 +1,31 @@
|
||||
Additions
|
||||
---------
|
||||
|
||||
More of the jQuery API: nextUntil?
|
||||
|
||||
Optimizations
|
||||
-------------
|
||||
|
||||
The html5lib tree builder doesn't use the standard tree-building API,
|
||||
which worries me and has resulted in a number of bugs.
|
||||
|
||||
markup_attr_map can be optimized since it's always a map now.
|
||||
|
||||
Upon encountering UTF-16LE data or some other uncommon serialization
|
||||
of Unicode, UnicodeDammit will convert the data to Unicode, then
|
||||
encode it as UTF-8. This is wasteful because it will just get decoded
|
||||
back to Unicode.
|
||||
|
||||
CDATA
|
||||
-----
|
||||
|
||||
The elementtree XMLParser has a strip_cdata argument that, when set to
|
||||
False, should allow Beautiful Soup to preserve CDATA sections instead
|
||||
of treating them as text. Except it doesn't. (This argument is also
|
||||
present for HTMLParser, and also does nothing there.)
|
||||
|
||||
Currently, html5lib converts CDATA sections into comments. An
|
||||
as-yet-unreleased version of html5lib changes the parser's handling of
|
||||
CDATA sections to allow CDATA sections in tags like <svg> and
|
||||
<math>. The HTML5TreeBuilder will need to be updated to create CData
|
||||
objects instead of Comment objects in this situation.
|
@ -0,0 +1,100 @@
|
||||
from __future__ import absolute_import
|
||||
from bs4.dammit import EntitySubstitution
|
||||
|
||||
class Formatter(EntitySubstitution):
    """Strategy object controlling how a parse tree is rendered as a string.

    Part of the strategy follows from the markup language in use
    (HTML4, HTML5 or XML); the rest is configurable by the caller.
    """

    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    # Recognized values for the `language` constructor argument.
    HTML = 'html'
    XML = 'xml'

    # Fallback settings used for any language other than XML.
    HTML_DEFAULTS = dict(
        cdata_containing_tags=set(["script", "style"]),
    )

    def _default(self, language, value, kwarg):
        """Resolve the effective value of a configurable setting.

        :param language: The markup language the formatter targets.
        :param value: The value supplied by the caller, or None.
        :param kwarg: Name of the setting; used as the key into
            HTML_DEFAULTS.
        :return: `value` when it is not None; otherwise an empty set
            for XML, or HTML_DEFAULTS[kwarg] for anything else.
        """
        if value is None:
            if language == self.XML:
                return set()
            return self.HTML_DEFAULTS[kwarg]
        return value

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
    ):
        """Configure the formatter.

        :param language: Markup language; compared against Formatter.XML
            when choosing defaults.
        :param entity_substitution: Callable applied by substitute() to
            strings; a false value disables substitution.
        :param void_element_close_prefix: By default, represent void
            elements as <tag/> rather than <tag>
        :param cdata_containing_tags: Names of tags whose strings
            substitute() leaves untouched. When None, a
            language-dependent default is used.
        """
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )
        self.void_element_close_prefix = void_element_close_prefix
        self.entity_substitution = entity_substitution
        self.language = language

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution."""
        if not self.entity_substitution:
            # Substitution is disabled; hand the string back as-is.
            return ns
        # Imported here rather than at module level.
        from .element import NavigableString
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags
        )
        if inside_cdata_tag:
            # Strings directly inside these tags are emitted unchanged.
            return ns
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute."""
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want.

        This implementation returns the (name, value) pairs of
        `tag.attrs` as a sorted list.
        """
        pairs = list(tag.attrs.items())
        pairs.sort()
        return pairs
|
||||
|
||||
|
||||
class HTMLFormatter(Formatter):
    """A Formatter whose language is always Formatter.HTML."""

    # Named formatter instances, keyed by alias.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Forward all arguments to Formatter, with HTML as the language."""
        super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
||||
|
||||
|
||||
class XMLFormatter(Formatter):
    """A Formatter whose language is always Formatter.XML."""

    # Named formatter instances, keyed by alias.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Forward all arguments to Formatter, with XML as the language."""
        super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
||||
|
||||
|
||||
# Set up aliases for the default formatters.

# HTML output: named HTML entities, void elements written as <tag/>.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
# HTML5 output: same entity handling, but no close prefix on void elements.
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None
)
# Minimal output: XML-style substitution only.
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# No entity substitution at all.
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)

XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# Register a real XMLFormatter here, as the HTML registry does with
# HTMLFormatter. Wrapping one Formatter in another would pass a Formatter
# instance as `language`, so the entry would not be an XMLFormatter and
# would pick up the HTML cdata_containing_tags defaults.
XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
)
|
Loading…
Reference in new issue