Module gazpacho.soup
View Source
from collections import Counter
from html.parser import HTMLParser
from random import sample
import re
from typing import Any, Dict, Optional, Tuple, Union, List
import warnings
from .get import get
from .utils import match, recover_html_and_attrs
class Soup(HTMLParser):
    """\
    HTML Soup Parser

    Attributes:

    - html: content to parse
    - tag: element to match
    - attrs: element attributes to match
    - text: inner data

    Methods:

    - find: matching content by element tag (and attributes)
    - strip: brackets, tags, and attributes from inner data
    - get: alternate initializer

    Deprecations:

    - remove_tags: (as of 1.0) use strip

    Examples:

    ```
    from gazpacho import Soup

    html = "<div><p class='a'>1</p><p class='a'>2</p><p class='b'>3</p></div>"
    url = "https://www.gazpacho.xyz"

    soup = Soup(html)
    soup = Soup.get(url)
    ```
    """

    def __init__(self, html: Optional[str] = None) -> None:
        """\
        Arguments:

        - html: content to parse
        """
        super().__init__()
        # Normalize falsy input (None, "") to an empty string so feed() is safe
        self.html = "" if not html else html
        self.tag: Optional[str] = None
        self.attrs: Optional[Dict[Any, Any]] = None
        self.text: Optional[str] = None

    def __dir__(self):
        # Restrict tab-completion to the public API
        return ["attrs", "find", "get", "html", "strip", "tag", "text"]

    def __repr__(self) -> str:
        return self.html

    @classmethod
    def get(
        cls,
        url: str,
        params: Optional[Dict[str, str]] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> "Soup":
        """\
        Initialize with gazpacho.get
        """
        html = get(url, params, headers)
        # gazpacho.get may return parsed JSON (non-str); only HTML text is parseable here
        if not isinstance(html, str):
            raise Exception(f"Unable to retrieve contents from {url}")
        return cls(html)

    @property
    def _active(self) -> bool:
        # True while the parser is inside a matched element
        # (at least one matched tag opened and not yet closed)
        return sum(self.counter.values()) > 0

    @staticmethod
    def _void(tag: str) -> bool:
        # Void HTML elements never receive a closing tag,
        # so their counter entry must be decremented immediately
        return tag in [
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "keygen",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        ]

    def _handle_start(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        # Shared logic for handle_starttag/handle_startendtag:
        # either open a new match group or extend the current one.
        html, attrs_dict = recover_html_and_attrs(tag, attrs)
        query_attrs = {} if not self.attrs else self.attrs
        matching = match(query_attrs, attrs_dict, partial=self.partial)
        if (tag == self.tag) and (matching) and (not self._active):
            # Start a new match group for this element
            self.groups.append(Soup())
            self.groups[-1].tag = tag
            self.groups[-1].attrs = attrs_dict
            self.groups[-1].html += html
            self.counter[tag] += 1
            return
        if self._active:
            # Nested tag inside an active match: accumulate its raw HTML
            self.groups[-1].html += html
            self.counter[tag] += 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        self._handle_start(tag, attrs)
        if self._active:
            if self._void(tag):
                # Void elements (e.g. <br>) have no end tag; balance the count now
                self.counter[tag] -= 1

    def handle_startendtag(
        self, tag: str, attrs: List[Tuple[str, Optional[str]]]
    ) -> None:
        # XHTML-style self-closing tag (<img />): open and close in one step
        self._handle_start(tag, attrs)
        if self._active:
            self.counter[tag] -= 1

    def handle_data(self, data: str) -> None:
        if self._active:
            if self.groups[-1].text is None:
                # First run of inner text becomes the group's .text
                self.groups[-1].text = data.strip()
            self.groups[-1].html += data

    def handle_endtag(self, tag: str) -> None:
        if self._active:
            self.groups[-1].html += f"</{tag}>"
            self.counter[tag] -= 1

    def strip(self, whitespace: bool = True) -> str:
        """\
        Strip brackets, tags, and attributes from inner text

        Arguments:

        - whitespace: remove extra whitespace characters

        Example:

        ```
        html = "<span>AB<b>C</b>D</span>"
        soup = Soup(html)
        soup.find("span").text
        # AB
        soup.strip()
        # ABCD
        ```
        """
        text = re.sub("<[^>]+>", "", self.html)
        if whitespace:
            # Collapse runs of whitespace to single spaces
            text = " ".join(text.split())
        return text

    def remove_tags(self, strip: bool = True) -> str:
        """\
        Now: .strip()
        """
        message = "Marked for removal; use .strip()"
        warnings.warn(message, category=FutureWarning, stacklevel=2)
        return self.strip(whitespace=strip)

    def find(
        self,
        tag: str,
        attrs: Optional[Dict[str, str]] = None,
        *,
        partial: bool = True,
        mode: str = "automatic",
        strict: Optional[bool] = None,
    ) -> Optional[Union[List["Soup"], "Soup"]]:
        """\
        Return matching HTML elements

        Arguments:

        - tag: target element tag
        - attrs: target element attributes
        - partial: match on attributes
        - mode: override return behavior {'auto/automatic', 'all/list', 'first'}

        Deprecations:

        - strict: (as of 1.0) use partial=

        Examples:

        ```
        soup.find('p', {'class': 'a'})
        # [<p class="a">1</p>, <p class="a">2</p>]

        soup.find('p', {'class': 'a'}, mode='first')
        # <p class="a">1</p>

        result = soup.find('p', {'class': 'b'}, mode='auto')
        print(result)
        # <p class="b">3</p>

        print(result.text)
        # 3
        ```
        """
        # Resolve the deprecated strict= flag BEFORE storing partial.
        # (Previously self.partial was assigned first, so strict= was
        # silently ignored by the matcher.)
        if strict is not None:
            message = "Marked for removal; use partial="
            warnings.warn(message, category=FutureWarning, stacklevel=2)
            partial = not strict
        self.counter: Counter = Counter()
        self.groups: List = []
        self.tag = tag
        self.attrs = attrs
        self.partial = partial
        self.feed(self.html)
        automatic_modes = ["auto", "automatic"]
        all_modes = ["all", "list"]  # renamed from `all` to avoid shadowing the builtin
        first_modes = ["first"]
        last_modes = ["last"]  # undocumented
        random_modes = ["random"]  # undocumented
        if not self.groups:
            if mode in all_modes:
                return []
            else:
                return None
        elif mode in automatic_modes:
            # Single match unwraps to a Soup; multiple matches return a list
            if len(self.groups) == 1:
                return self.groups[0]
            else:
                return self.groups
        elif mode in all_modes:
            return self.groups
        elif mode in first_modes:
            return self.groups[0]
        elif mode in last_modes:
            return self.groups[-1]
        elif mode in random_modes:
            return sample(self.groups, k=1)[0]
        else:
            raise ValueError(mode)
Classes
Soup
class Soup(
html: Union[str, NoneType] = None
)
HTML Soup Parser
Attributes:
- html: content to parse
- tag: element to match
- attrs: element attributes to match
- text: inner data
Methods:
- find: matching content by element tag (and attributes)
- strip: brackets, tags, and attributes from inner data
- get: alternate initializer
Deprecations:
- remove_tags: (as of 1.0) use strip
Examples:
from gazpacho import Soup
html = "<div><p class='a'>1</p><p class='a'>2</p><p class='b'>3</p></div>"
url = "https://www.gazpacho.xyz"
soup = Soup(html)
soup = Soup.get(url)
View Source
class Soup(HTMLParser):
"""\
HTML Soup Parser
Attributes:
- html: content to parse
- tag: element to match
- attrs: element attributes to match
- text: inner data
Methods:
- find: matching content by element tag (and attributes)
- strip: brackets, tags, and attributes from inner data
- get: alternate initializer
Deprecations:
- remove_tags: (as of 1.0) use strip
Examples:
```
from gazpacho import Soup
html = "<div><p class='a'>1</p><p class='a'>2</p><p class='b'>3</p></div>"
url = "https://www.gazpacho.xyz"
soup = Soup(html)
soup = Soup.get(url)
```
"""
def __init__(self, html: Optional[str] = None) -> None:
"""\
Arguments:
- html: content to parse
"""
super().__init__()
self.html = "" if not html else html
self.tag: Optional[str] = None
self.attrs: Optional[Dict[Any, Any]] = None
self.text: Optional[str] = None
def __dir__(self):
return ["attrs", "find", "get", "html", "strip", "tag", "text"]
def __repr__(self) -> str:
return self.html
@classmethod
def get(
cls,
url: str,
params: Optional[Dict[str, str]] = None,
headers: Optional[Dict[str, str]] = None,
) -> "Soup":
"""\
Initialize with gazpacho.get
"""
html = get(url, params, headers)
if not isinstance(html, str):
raise Exception(f"Unable to retrieve contents from {url}")
return cls(html)
@property
def _active(self) -> bool:
return sum(self.counter.values()) > 0
@staticmethod
def _void(tag: str) -> bool:
return tag in [
"area",
"base",
"br",
"col",
"embed",
"hr",
"img",
"input",
"keygen",
"link",
"meta",
"param",
"source",
"track",
"wbr",
]
def _handle_start(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
html, attrs_dict = recover_html_and_attrs(tag, attrs)
query_attrs = {} if not self.attrs else self.attrs
matching = match(query_attrs, attrs_dict, partial=self.partial)
if (tag == self.tag) and (matching) and (not self._active):
self.groups.append(Soup())
self.groups[-1].tag = tag
self.groups[-1].attrs = attrs_dict
self.groups[-1].html += html
self.counter[tag] += 1
return
if self._active:
self.groups[-1].html += html
self.counter[tag] += 1
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
self._handle_start(tag, attrs)
if self._active:
if self._void(tag):
self.counter[tag] -= 1
def handle_startendtag(
self, tag: str, attrs: List[Tuple[str, Optional[str]]]
) -> None:
self._handle_start(tag, attrs)
if self._active:
self.counter[tag] -= 1
def handle_data(self, data: str) -> None:
if self._active:
if self.groups[-1].text is None:
self.groups[-1].text = data.strip()
self.groups[-1].html += data
def handle_endtag(self, tag: str) -> None:
if self._active:
self.groups[-1].html += f"</{tag}>"
self.counter[tag] -= 1
def strip(self, whitespace: bool = True) -> str:
"""\
Strip brackets, tags, and attributes from inner text
Arguments:
- whitespace: remove extra whitespace characters
Example:
```
html = "<span>AB<b>C</b>D</span>"
soup = Soup(html)
soup.find("span").text
# AB
soup.strip()
# ABCD
```
"""
text = re.sub("<[^>]+>", "", self.html)
if whitespace:
text = " ".join(text.split())
return text
def remove_tags(self, strip: bool = True) -> str:
"""\
Now: .strip()
"""
message = "Marked for removal; use .strip()"
warnings.warn(message, category=FutureWarning, stacklevel=2)
return self.strip(whitespace=strip)
def find(
self,
tag: str,
attrs: Optional[Dict[str, str]] = None,
*,
partial: bool = True,
mode: str = "automatic",
strict: Optional[bool] = None,
) -> Optional[Union[List["Soup"], "Soup"]]:
"""\
Return matching HTML elements
Arguments:
- tag: target element tag
- attrs: target element attributes
- partial: match on attributes
- mode: override return behavior {'auto/automatic', 'all/list', 'first'}
Deprecations:
- strict: (as of 1.0) use partial=
Examples:
```
soup.find('p', {'class': 'a'})
# [<p class="a">1</p>, <p class="a">2</p>]
soup.find('p', {'class': 'a'}, mode='first')
# <p class="a">1</p>
result = soup.find('p', {'class': 'b'}, mode='auto')
print(result)
# <p class="b">3</p>
print(result.text)
# 3
```
"""
self.counter: Counter = Counter()
self.groups: List = []
self.tag = tag
self.attrs = attrs
self.partial = partial
if strict is not None:
message = "Marked for removal; use partial="
warnings.warn(message, category=FutureWarning, stacklevel=2)
partial = not strict
self.feed(self.html)
automatic = ["auto", "automatic"]
all = ["all", "list"]
first = ["first"]
last = ["last"] # undocumented
random = ["random"] # undocumented
if not self.groups:
if mode in all:
return []
else:
return None
elif mode in automatic:
if len(self.groups) == 1:
return self.groups[0]
else:
return self.groups
elif mode in all:
return self.groups
elif mode in first:
return self.groups[0]
elif mode in last:
return self.groups[-1]
elif mode in random:
return sample(self.groups, k=1)[0]
else:
raise ValueError(mode)
Ancestors (in MRO)
- html.parser.HTMLParser
- _markupbase.ParserBase
Class variables
CDATA_CONTENT_ELEMENTS
Static methods
get
def get(
url: str,
params: Union[Dict[str, str], NoneType] = None,
headers: Union[Dict[str, str], NoneType] = None
) -> 'Soup'
Initialize with gazpacho.get
View Source
@classmethod
def get(
cls,
url: str,
params: Optional[Dict[str, str]] = None,
headers: Optional[Dict[str, str]] = None,
) -> "Soup":
"""\
Initialize with gazpacho.get
"""
html = get(url, params, headers)
if not isinstance(html, str):
raise Exception(f"Unable to retrieve contents from {url}")
return cls(html)
Methods
check_for_whole_start_tag
def check_for_whole_start_tag(
self,
i
)
View Source
def check_for_whole_start_tag(self, i):
rawdata = self.rawdata
m = locatestarttagend_tolerant.match(rawdata, i)
if m:
j = m.end()
next = rawdata[j:j+1]
if next == ">":
return j + 1
if next == "/":
if rawdata.startswith("/>", j):
return j + 2
if rawdata.startswith("/", j):
# buffer boundary
return -1
# else bogus input
if j > i:
return j
else:
return i + 1
if next == "":
# end of input
return -1
if next in ("abcdefghijklmnopqrstuvwxyz=/"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
# end of input in or before attribute value, or we have the
# '/' from a '/>' ending
return -1
if j > i:
return j
else:
return i + 1
raise AssertionError("we should not get here!")
clear_cdata_mode
def clear_cdata_mode(
self
)
View Source
def clear_cdata_mode(self):
self.interesting = interesting_normal
self.cdata_elem = None
close
def close(
self
)
Handle any buffered data.
View Source
def close(self):
"""Handle any buffered data."""
self.goahead(1)
error
def error(
self,
message
)
View Source
def error(self, message):
raise NotImplementedError(
"subclasses of ParserBase must override error()")
feed
def feed(
self,
data
)
Feed data to the parser.
Call this as often as you want, with as little or as much text as you want (may include '\n').
View Source
def feed(self, data):
r"""Feed data to the parser.
Call this as often as you want, with as little or as much text
as you want (may include '\n').
"""
self.rawdata = self.rawdata + data
self.goahead(0)
find
def find(
self,
tag: str,
attrs: Union[Dict[str, str], NoneType] = None,
*,
partial: bool = True,
mode: str = 'automatic',
strict: Union[bool, NoneType] = None
) -> Union[List[ForwardRef('Soup')], ForwardRef('Soup'), NoneType]
Return matching HTML elements
Arguments:
- tag: target element tag
- attrs: target element attributes
- partial: match on attributes
- mode: override return behavior {'auto/automatic', 'all/list', 'first'}
Deprecations:
- strict: (as of 1.0) use partial=
Examples:
soup.find('p', {'class': 'a'})
# [<p class="a">1</p>, <p class="a">2</p>]
soup.find('p', {'class': 'a'}, mode='first')
# <p class="a">1</p>
result = soup.find('p', {'class': 'b'}, mode='auto')
print(result)
# <p class="b">3</p>
print(result.text)
# 3
View Source
def find(
self,
tag: str,
attrs: Optional[Dict[str, str]] = None,
*,
partial: bool = True,
mode: str = "automatic",
strict: Optional[bool] = None,
) -> Optional[Union[List["Soup"], "Soup"]]:
"""\
Return matching HTML elements
Arguments:
- tag: target element tag
- attrs: target element attributes
- partial: match on attributes
- mode: override return behavior {'auto/automatic', 'all/list', 'first'}
Deprecations:
- strict: (as of 1.0) use partial=
Examples:
```
soup.find('p', {'class': 'a'})
# [<p class="a">1</p>, <p class="a">2</p>]
soup.find('p', {'class': 'a'}, mode='first')
# <p class="a">1</p>
result = soup.find('p', {'class': 'b'}, mode='auto')
print(result)
# <p class="b">3</p>
print(result.text)
# 3
```
"""
self.counter: Counter = Counter()
self.groups: List = []
self.tag = tag
self.attrs = attrs
self.partial = partial
if strict is not None:
message = "Marked for removal; use partial="
warnings.warn(message, category=FutureWarning, stacklevel=2)
partial = not strict
self.feed(self.html)
automatic = ["auto", "automatic"]
all = ["all", "list"]
first = ["first"]
last = ["last"] # undocumented
random = ["random"] # undocumented
if not self.groups:
if mode in all:
return []
else:
return None
elif mode in automatic:
if len(self.groups) == 1:
return self.groups[0]
else:
return self.groups
elif mode in all:
return self.groups
elif mode in first:
return self.groups[0]
elif mode in last:
return self.groups[-1]
elif mode in random:
return sample(self.groups, k=1)[0]
else:
raise ValueError(mode)
get_starttag_text
def get_starttag_text(
self
)
Return full source of start tag: '<...>'.
View Source
def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text
getpos
def getpos(
self
)
Return current line number and offset.
View Source
def getpos(self):
"""Return current line number and offset."""
return self.lineno, self.offset
goahead
def goahead(
self,
end
)
View Source
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n:
if self.convert_charrefs and not self.cdata_elem:
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
# or there's more text incoming. If the latter is True,
# we can't pass the text to handle_data in case we have
# a charref cut in half at end. Try to determine if
# this is the case before proceeding by looking for an
# & near the end and see if it's followed by a space or ;.
amppos = rawdata.rfind('&', max(i, n-34))
if (amppos >= 0 and
not re.compile(r'[\s;]').search(rawdata, amppos)):
break # wait till we get all the text
j = n
else:
match = self.interesting.search(rawdata, i) # < or &
if match:
j = match.start()
else:
if self.cdata_elem:
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
if i == n: break
startswith = rawdata.startswith
if startswith('<', i):
if starttagopen.match(rawdata, i): # < + letter
k = self.parse_starttag(i)
elif startswith("</", i):
k = self.parse_endtag(i)
elif startswith("<!--", i):
k = self.parse_comment(i)
elif startswith("<?", i):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
elif (i + 1) < n:
self.handle_data("<")
k = i + 1
else:
break
if k < 0:
if not end:
break
k = rawdata.find('>', i + 1)
if k < 0:
k = rawdata.find('<', i + 1)
if k < 0:
k = i + 1
else:
k += 1
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:k]))
else:
self.handle_data(rawdata[i:k])
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
if match:
name = match.group()[2:-1]
self.handle_charref(name)
k = match.end()
if not startswith(';', k-1):
k = k - 1
i = self.updatepos(i, k)
continue
else:
if ";" in rawdata[i:]: # bail by consuming &#
self.handle_data(rawdata[i:i+2])
i = self.updatepos(i, i+2)
break
elif startswith('&', i):
match = entityref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_entityref(name)
k = match.end()
if not startswith(';', k-1):
k = k - 1
i = self.updatepos(i, k)
continue
match = incomplete.match(rawdata, i)
if match:
# match.group() will contain at least 2 chars
if end and match.group() == rawdata[i:]:
k = match.end()
if k <= i:
k = n
i = self.updatepos(i, i + 1)
# incomplete
break
elif (i + 1) < n:
# not the end of the buffer, and can't be confused
# with some other construct
self.handle_data("&")
i = self.updatepos(i, i + 1)
else:
break
else:
assert 0, "interesting.search() lied"
# end while
if end and i < n and not self.cdata_elem:
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
i = self.updatepos(i, n)
self.rawdata = rawdata[i:]
handle_charref
def handle_charref(
self,
name
)
View Source
def handle_charref(self, name):
pass
handle_comment
def handle_comment(
self,
data
)
View Source
def handle_comment(self, data):
pass
handle_data
def handle_data(
self,
data: str
) -> None
View Source
def handle_data(self, data: str) -> None:
if self._active:
if self.groups[-1].text is None:
self.groups[-1].text = data.strip()
self.groups[-1].html += data
handle_decl
def handle_decl(
self,
decl
)
View Source
def handle_decl(self, decl):
pass
handle_endtag
def handle_endtag(
self,
tag: str
) -> None
View Source
def handle_endtag(self, tag: str) -> None:
if self._active:
self.groups[-1].html += f"</{tag}>"
self.counter[tag] -= 1
handle_entityref
def handle_entityref(
self,
name
)
View Source
def handle_entityref(self, name):
pass
handle_pi
def handle_pi(
self,
data
)
View Source
def handle_pi(self, data):
pass
handle_startendtag
def handle_startendtag(
self,
tag: str,
attrs: List[Tuple[str, Union[str, NoneType]]]
) -> None
View Source
def handle_startendtag(
self, tag: str, attrs: List[Tuple[str, Optional[str]]]
) -> None:
self._handle_start(tag, attrs)
if self._active:
self.counter[tag] -= 1
handle_starttag
def handle_starttag(
self,
tag: str,
attrs: List[Tuple[str, Union[str, NoneType]]]
) -> None
View Source
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
self._handle_start(tag, attrs)
if self._active:
if self._void(tag):
self.counter[tag] -= 1
parse_bogus_comment
def parse_bogus_comment(
self,
i,
report=1
)
View Source
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
'parse_comment()')
pos = rawdata.find('>', i+2)
if pos == -1:
return -1
if report:
self.handle_comment(rawdata[i+2:pos])
return pos + 1
parse_comment
def parse_comment(
self,
i,
report=1
)
View Source
def parse_comment(self, i, report=1):
rawdata = self.rawdata
if rawdata[i:i+4] != '<!--':
self.error('unexpected call to parse_comment()')
match = _commentclose.search(rawdata, i+4)
if not match:
return -1
if report:
j = match.start(0)
self.handle_comment(rawdata[i+4: j])
return match.end(0)
parse_declaration
def parse_declaration(
self,
i
)
View Source
def parse_declaration(self, i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
# ISO 8879:1986, however, has more complex
# declaration syntax for elements in <!...>, including:
# --comment--
# [marked section]
# name in the following list: ENTITY, DOCTYPE, ELEMENT,
# ATTLIST, NOTATION, SHORTREF, USEMAP,
# LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
rawdata = self.rawdata
j = i + 2
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
if rawdata[j:j+1] == ">":
# the empty comment <!>
return j + 1
if rawdata[j:j+1] in ("-", ""):
# Start of comment followed by buffer boundary,
# or just a buffer boundary.
return -1
# A simple, practical version could look like: ((name|stringlit) S*) + '>'
n = len(rawdata)
if rawdata[j:j+2] == '--': #comment
# Locate --.*-- as the body of the comment
return self.parse_comment(i)
elif rawdata[j] == '[': #marked section
# Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
# Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
# Note that this is extended by Microsoft Office "Save as Web" function
# to include [if...] and [endif].
return self.parse_marked_section(i)
else: #all other declaration elements
decltype, j = self._scan_name(j, i)
if j < 0:
return j
if decltype == "doctype":
self._decl_otherchars = ''
while j < n:
c = rawdata[j]
if c == ">":
# end of declaration syntax
data = rawdata[i+2:j]
if decltype == "doctype":
self.handle_decl(data)
else:
# According to the HTML5 specs sections "8.2.4.44 Bogus
# comment state" and "8.2.4.45 Markup declaration open
# state", a comment token should be emitted.
# Calling unknown_decl provides more flexibility though.
self.unknown_decl(data)
return j + 1
if c in "\"'":
m = _declstringlit_match(rawdata, j)
if not m:
return -1 # incomplete
j = m.end()
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
name, j = self._scan_name(j, i)
elif c in self._decl_otherchars:
j = j + 1
elif c == "[":
# this could be handled in a separate doctype parser
if decltype == "doctype":
j = self._parse_doctype_subset(j + 1, i)
elif decltype in {"attlist", "linktype", "link", "element"}:
# must tolerate []'d groups in a content model in an element declaration
# also in data attribute specifications of attlist declaration
# also link type declaration subsets in linktype declarations
# also link attribute specification lists in link declarations
self.error("unsupported '[' char in %s declaration" % decltype)
else:
self.error("unexpected '[' char in declaration")
else:
self.error(
"unexpected %r char in declaration" % rawdata[j])
if j < 0:
return j
return -1 # incomplete
parse_endtag
def parse_endtag(
self,
i
)
View Source
def parse_endtag(self, i):
rawdata = self.rawdata
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
match = endendtag.search(rawdata, i+1) # >
if not match:
return -1
gtpos = match.end()
match = endtagfind.match(rawdata, i) # </ + tag + >
if not match:
if self.cdata_elem is not None:
self.handle_data(rawdata[i:gtpos])
return gtpos
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
namematch = tagfind_tolerant.match(rawdata, i+2)
if not namematch:
# w3.org/TR/html5/tokenization.html#end-tag-open-state
if rawdata[i:i+3] == '</>':
return i+3
else:
return self.parse_bogus_comment(i)
tagname = namematch.group(1).lower()
# consume and ignore other stuff between the name and the >
# Note: this is not 100% correct, since we might have things like
# </tag attr=">">, but looking for > after the name should cover
# most of the cases and is much simpler
gtpos = rawdata.find('>', namematch.end())
self.handle_endtag(tagname)
return gtpos+1
elem = match.group(1).lower() # script or style
if self.cdata_elem is not None:
if elem != self.cdata_elem:
self.handle_data(rawdata[i:gtpos])
return gtpos
self.handle_endtag(elem)
self.clear_cdata_mode()
return gtpos
parse_html_declaration
def parse_html_declaration(
self,
i
)
View Source
def parse_html_declaration(self, i):
rawdata = self.rawdata
assert rawdata[i:i+2] == '<!', ('unexpected call to '
'parse_html_declaration()')
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
elif rawdata[i:i+3] == '<![':
return self.parse_marked_section(i)
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)
if gtpos == -1:
return -1
self.handle_decl(rawdata[i+2:gtpos])
return gtpos+1
else:
return self.parse_bogus_comment(i)
parse_marked_section
def parse_marked_section(
self,
i,
report=1
)
View Source
def parse_marked_section(self, i, report=1):
rawdata= self.rawdata
assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
sectName, j = self._scan_name( i+3, i )
if j < 0:
return j
if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
# look for standard ]]> ending
match= _markedsectionclose.search(rawdata, i+3)
elif sectName in {"if", "else", "endif"}:
# look for MS Office ]> ending
match= _msmarkedsectionclose.search(rawdata, i+3)
else:
self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
if not match:
return -1
if report:
j = match.start(0)
self.unknown_decl(rawdata[i+3: j])
return match.end(0)
parse_pi
def parse_pi(
self,
i
)
View Source
def parse_pi(self, i):
rawdata = self.rawdata
assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
match = piclose.search(rawdata, i+2) # >
if not match:
return -1
j = match.start()
self.handle_pi(rawdata[i+2: j])
j = match.end()
return j
parse_starttag
def parse_starttag(
self,
i
)
View Source
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind_tolerant.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = match.group(1).lower()
while k < endpos:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
remove_tags
def remove_tags(
self,
strip: bool = True
) -> str
Now: .strip()
View Source
def remove_tags(self, strip: bool = True) -> str:
"""\
Now: .strip()
"""
message = "Marked for removal; use .strip()"
warnings.warn(message, category=FutureWarning, stacklevel=2)
return self.strip(whitespace=strip)
reset
def reset(
self
)
Reset this instance. Loses all unprocessed data.
View Source
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.rawdata = ''
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
_markupbase.ParserBase.reset(self)
set_cdata_mode
def set_cdata_mode(
self,
elem
)
View Source
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
strip
def strip(
self,
whitespace: bool = True
) -> str
Strip brackets, tags, and attributes from inner text
Arguments:
- whitespace: remove extra whitespace characters
Example:
html = "<span>AB<b>C</b>D</span>"
soup = Soup(html)
soup.find("span").text
# AB
soup.strip()
# ABCD
View Source
def strip(self, whitespace: bool = True) -> str:
"""\
Strip brackets, tags, and attributes from inner text
Arguments:
- whitespace: remove extra whitespace characters
Example:
```
html = "<span>AB<b>C</b>D</span>"
soup = Soup(html)
soup.find("span").text
# AB
soup.strip()
# ABCD
```
"""
text = re.sub("<[^>]+>", "", self.html)
if whitespace:
text = " ".join(text.split())
return text
unescape
def unescape(
self,
s
)
View Source
def unescape(self, s):
warnings.warn('The unescape method is deprecated and will be removed '
'in 3.5, use html.unescape() instead.',
DeprecationWarning, stacklevel=2)
return unescape(s)
unknown_decl
def unknown_decl(
self,
data
)
View Source
def unknown_decl(self, data):
pass
updatepos
def updatepos(
self,
i,
j
)
View Source
def updatepos(self, i, j):
if i >= j:
return j
rawdata = self.rawdata
nlines = rawdata.count("\n", i, j)
if nlines:
self.lineno = self.lineno + nlines
pos = rawdata.rindex("\n", i, j) # Should not fail
self.offset = j-(pos+1)
else:
self.offset = self.offset + j-i
return j