|
| None | initialize_soup (self, BeautifulSoup soup) |
| |
| _ParserOrParserClass | default_parser (self, Optional[_Encoding] encoding) |
| |
| _LXMLParser | parser_for (self, Optional[_Encoding] encoding) |
| |
| | __init__ (self, Optional[etree.XMLParser] parser=None, Optional[Set[str]] empty_element_tags=None, bool huge_tree=False, **Any kwargs) |
| |
| Iterable[Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]] | prepare_markup (self, _RawMarkup markup, Optional[_Encoding] user_specified_encoding=None, Optional[_Encoding] document_declared_encoding=None, Optional[_Encodings] exclude_encodings=None) |
| |
| None | feed (self, _RawMarkup markup) |
| |
| None | close (self) |
| |
| None | start (self, str|bytes tag, Dict[str|bytes, str|bytes] attrib, _NamespaceMapping nsmap={}) |
| |
| None | end (self, str|bytes tag) |
| |
| None | pi (self, str target, str data) |
| |
| None | data (self, str|bytes data) |
| |
| None | doctype (self, str name, str pubid, str system) |
| |
| None | comment (self, str|bytes text) |
| |
| str | test_fragment_to_document (self, str fragment) |
| |
| None | reset (self) |
| |
| bool | can_be_empty_element (self, str tag_name) |
| |
| bool | set_up_substitutions (self, Tag tag) |
| |
|
|
Type | DEFAULT_PARSER_CLASS = etree.XMLParser |
| |
|
bool | is_xml = True |
| |
|
bool | huge_tree |
| |
|
Type | processing_instruction_class [ProcessingInstruction] |
| |
|
str | NAME = "lxml-xml" |
| |
|
list | ALTERNATE_NAMES = ["xml"] |
| |
|
list | features = [NAME, LXML, XML, FAST, PERMISSIVE] |
| |
|
int | CHUNK_SIZE = 512 |
| |
|
_NamespaceMapping | DEFAULT_NSMAPS = dict(xml="http://www.w3.org/XML/1998/namespace") |
| |
|
_InvertedNamespaceMapping | DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) |
| |
|
List | nsmaps [Optional[_InvertedNamespaceMapping]] |
| |
|
Optional | empty_element_tags [Set[str]] |
| |
|
Any | parser |
| |
|
Any | USE_DEFAULT = object() |
| |
|
str | NAME = "[Unknown tree builder]" |
| |
|
list | ALTERNATE_NAMES = [] |
| |
|
list | features = [] |
| |
|
bool | is_xml = False |
| |
|
bool | picklable = False |
| |
|
Optional | soup [BeautifulSoup] |
| |
|
Optional | empty_element_tags = None |
| |
|
Dict | cdata_list_attributes [str, Set[str]] |
| |
|
Set | preserve_whitespace_tags [str] |
| |
|
Dict | string_containers [str, Type[NavigableString]] |
| |
|
bool | tracks_line_numbers |
| |
|
Dict | DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(set) |
| |
|
Set | DEFAULT_PRESERVE_WHITESPACE_TAGS = set() |
| |
|
dict | DEFAULT_STRING_CONTAINERS = {} |
| |
|
Optional | DEFAULT_EMPTY_ELEMENT_TAGS = None |
| |
|
bool | TRACKS_LINE_NUMBERS = False |
| |
| Iterable[Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]] bs4.builder._lxml.LXMLTreeBuilderForXML.prepare_markup (self, _RawMarkup markup, Optional[_Encoding] user_specified_encoding = None, Optional[_Encoding] document_declared_encoding = None, Optional[_Encodings] exclude_encodings = None) |
| |
Run any preliminary steps necessary to make incoming markup
acceptable to the parser.
lxml really wants to get a bytestring and convert it to
Unicode itself. So instead of using UnicodeDammit to convert
the bytestring to Unicode using different encodings, this
implementation uses EncodingDetector to iterate over the
encodings, and tell lxml to try to parse the document as each
one in turn.
:param markup: Some markup -- hopefully a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples: (markup, encoding, declared encoding,
has undergone character replacement).
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
Reimplemented from bs4.builder.TreeBuilder.