Module gazpacho.soup

View Source
from collections import Counter
from html.parser import HTMLParser
from random import sample
import re
from typing import Any, Dict, List, Optional, Tuple, Union
import warnings

from .get import get
from .utils import match, recover_html_and_attrs


class Soup(HTMLParser):
    """\
    HTML Soup Parser

    Attributes:

    - html: content to parse
    - tag: element to match
    - attrs: element attributes to match
    - text: inner data

    Methods:

    - find: matching content by element tag (and attributes)
    - strip: brackets, tags, and attributes from inner data
    - get: alternate initializer

    Deprecations:

    - remove_tags: (as of 1.0) use strip

    Examples:

    ```
    from gazpacho import Soup

    html = "<div><p class='a'>1</p><p class='a'>2</p><p class='b'>3</p></div>"
    url = "https://www.gazpacho.xyz"

    soup = Soup(html)
    soup = Soup.get(url)
    ```
    """

    def __init__(self, html: Optional[str] = None) -> None:
        """\
        Arguments:

        - html: content to parse
        """
        super().__init__()
        self.html = "" if not html else html
        self.tag: Optional[str] = None
        self.attrs: Optional[Dict[Any, Any]] = None
        self.text: Optional[str] = None

    def __dir__(self):
        return ["attrs", "find", "get", "html", "strip", "tag", "text"]

    def __repr__(self) -> str:
        return self.html

    @classmethod
    def get(
        cls,
        url: str,
        params: Optional[Dict[str, str]] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> "Soup":
        """\
        Initialize with gazpacho.get
        """
        html = get(url, params, headers)
        if not isinstance(html, str):
            raise Exception(f"Unable to retrieve contents from {url}")
        return cls(html)

    @property
    def _active(self) -> bool:
        # True while the parser is inside a matched element
        return sum(self.counter.values()) > 0

    @staticmethod
    def _void(tag: str) -> bool:
        # void elements never receive a closing tag
        return tag in [
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "keygen",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        ]

    def _handle_start(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        html, attrs_dict = recover_html_and_attrs(tag, attrs)
        query_attrs = {} if not self.attrs else self.attrs
        matching = match(query_attrs, attrs_dict, partial=self.partial)
        # open a new capture group on a fresh match
        if (tag == self.tag) and (matching) and (not self._active):
            self.groups.append(Soup())
            self.groups[-1].tag = tag
            self.groups[-1].attrs = attrs_dict
            self.groups[-1].html += html
            self.counter[tag] += 1
            return
        # otherwise accumulate nested content into the open group
        if self._active:
            self.groups[-1].html += html
            self.counter[tag] += 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        self._handle_start(tag, attrs)
        if self._active:
            if self._void(tag):
                self.counter[tag] -= 1

    def handle_startendtag(
        self, tag: str, attrs: List[Tuple[str, Optional[str]]]
    ) -> None:
        self._handle_start(tag, attrs)
        if self._active:
            self.counter[tag] -= 1

    def handle_data(self, data: str) -> None:
        if self._active:
            if self.groups[-1].text is None:
                self.groups[-1].text = data.strip()
            self.groups[-1].html += data

    def handle_endtag(self, tag: str) -> None:
        if self._active:
            self.groups[-1].html += f"</{tag}>"
            self.counter[tag] -= 1

    def strip(self, whitespace: bool = True) -> str:
        """\
        Strip brackets, tags, and attributes from inner text

        Arguments:

        - whitespace: remove extra whitespace characters

        Example:

        ```
        html = "<span>AB<b>C</b>D</span>"
        soup = Soup(html)
        soup.find("span").text
        # AB
        soup.strip()
        # ABCD
        ```
        """
        text = re.sub("<[^>]+>", "", self.html)
        if whitespace:
            text = " ".join(text.split())
        return text

    def remove_tags(self, strip: bool = True) -> str:
        """\
        Now: .strip()
        """
        message = "Marked for removal; use .strip()"
        warnings.warn(message, category=FutureWarning, stacklevel=2)
        return self.strip(whitespace=strip)

    def find(
        self,
        tag: str,
        attrs: Optional[Dict[str, str]] = None,
        *,
        partial: bool = True,
        mode: str = "automatic",
        strict: Optional[bool] = None,
    ) -> Optional[Union[List["Soup"], "Soup"]]:
        """\
        Return matching HTML elements

        Arguments:

        - tag: target element tag
        - attrs: target element attributes
        - partial: match on attributes
        - mode: override return behavior {'auto/automatic', 'all/list', 'first'}

        Deprecations:

        - strict: (as of 1.0) use partial=

        Examples:

        ```
        soup.find('p', {'class': 'a'})
        # [<p class="a">1</p>, <p class="a">2</p>]

        soup.find('p', {'class': 'a'}, mode='first')
        # <p class="a">1</p>

        result = soup.find('p', {'class': 'b'}, mode='auto')
        print(result)
        # <p class="b">3</p>

        print(result.text)
        # 3
        ```
        """
        self.counter: Counter = Counter()
        self.groups: List = []
        self.tag = tag
        self.attrs = attrs
        # resolve the deprecated strict= flag before storing the match mode,
        # so that strict= actually affects matching
        if strict is not None:
            message = "Marked for removal; use partial="
            warnings.warn(message, category=FutureWarning, stacklevel=2)
            partial = not strict
        self.partial = partial
        self.feed(self.html)
        automatic = ["auto", "automatic"]
        all = ["all", "list"]
        first = ["first"]
        last = ["last"]  # undocumented
        random = ["random"]  # undocumented
        if not self.groups:
            if mode in all:
                return []
            else:
                return None
        elif mode in automatic:
            if len(self.groups) == 1:
                return self.groups[0]
            else:
                return self.groups
        elif mode in all:
            return self.groups
        elif mode in first:
            return self.groups[0]
        elif mode in last:
            return self.groups[-1]
        elif mode in random:
            return sample(self.groups, k=1)[0]
        else:
            raise ValueError(mode)
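
One consequence of the counter bookkeeping in _handle_start: once a match is open, further start tags of the same name are absorbed into the open group rather than starting a new one, so nested elements come back as a single result. A small sketch of that behavior, traced from the source above:

soup = Soup("<div><div>inner</div></div>")
result = soup.find("div")
print(result)
# <div><div>inner</div></div>
print(result.text)
# inner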

Classes

Soup

class Soup(
    html: Union[str, NoneType] = None
)

HTML Soup Parser

Attributes:

  • html: content to parse
  • tag: element to match
  • attrs: element attributes to match
  • text: inner data

Methods:

  • find: matching content by element tag (and attributes)
  • strip: brackets, tags, and attributes from inner data
  • get: alternate initializer

Deprecations:

  • remove_tags: (as of 1.0) use strip

Examples:

from gazpacho import Soup

html = "<div><p class='a'>1</p><p class='a'>2</p><p class='b'>3</p></div>"
url = "https://www.gazpacho.xyz"

soup = Soup(html)
soup = Soup.get(url)
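
A short continuation of this example: find and strip operate on the parsed soup (the find output mirrors the examples further down this page; the strip result follows from its tag-removing regex):

soup = Soup(html)
soup.find("p", {"class": "a"})
# [<p class="a">1</p>, <p class="a">2</p>]

soup.strip()
# 123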

Ancestors (in MRO)

  • html.parser.HTMLParser
  • _markupbase.ParserBase

Class variables

CDATA_CONTENT_ELEMENTS

Static methods

get
def get(
    url: str,
    params: Union[Dict[str, str], NoneType] = None,
    headers: Union[Dict[str, str], NoneType] = None
) -> 'Soup'

Initialize with gazpacho.get
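
A minimal sketch of this alternate constructor; the params and headers values below are illustrative only, not part of the documented example:

soup = Soup.get("https://www.gazpacho.xyz")

# query parameters and headers are forwarded to gazpacho.get
soup = Soup.get(
    "https://www.gazpacho.xyz",
    params={"q": "soup"},
    headers={"User-Agent": "gazpacho"},
)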


Methods

check_for_whole_start_tag
def check_for_whole_start_tag(
    self,
    i
)
clear_cdata_mode
def clear_cdata_mode(
    self
)
close
def close(
    self
)

Handle any buffered data.

error
def error(
    self,
    message
)
feed
def feed(
    self,
    data
)

Feed data to the parser.

Call this as often as you want, with as little or as much text as you want (may include '\n').

find
def find(
    self,
    tag: str,
    attrs: Union[Dict[str, str], NoneType] = None,
    *,
    partial: bool = True,
    mode: str = 'automatic',
    strict: Union[bool, NoneType] = None
) -> Union[List[ForwardRef('Soup')], ForwardRef('Soup'), NoneType]

Return matching HTML elements

Arguments:

  • tag: target element tag
  • attrs: target element attributes
  • partial: match on attributes
  • mode: override return behavior {'auto/automatic', 'all/list', 'first'}

Deprecations:

  • strict: (as of 1.0) use partial=

Examples:

soup.find('p', {'class': 'a'})
# [<p class="a">1</p>, <p class="a">2</p>]

soup.find('p', {'class': 'a'}, mode='first')
# <p class="a">1</p>

result = soup.find('p', {'class': 'b'}, mode='auto')
print(result)
# <p class="b">3</p>

print(result.text)
# 3
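
The partial flag governs attribute matching: with partial=True (the default) the queried value may match part of an element's attribute, while partial=False requires an exact match. A hedged sketch with made-up markup:

soup = Soup("<div class='foo bar'></div>")

soup.find("div", {"class": "foo"})
# <div class="foo bar"></div>

soup.find("div", {"class": "foo"}, partial=False)
# None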
get_starttag_text
def get_starttag_text(
    self
)

Return full source of start tag: '<...>'.

getpos
def getpos(
    self
)

Return current line number and offset.

goahead
def goahead(
    self,
    end
)
handle_charref
def handle_charref(
    self,
    name
)
handle_comment
def handle_comment(
    self,
    data
)
handle_data
def handle_data(
    self,
    data: str
) -> None
handle_decl
def handle_decl(
    self,
    decl
)
handle_endtag
def handle_endtag(
    self,
    tag: str
) -> None
handle_entityref
def handle_entityref(
    self,
    name
)
handle_pi
def handle_pi(
    self,
    data
)
handle_startendtag
def handle_startendtag(
    self,
    tag: str,
    attrs: List[Tuple[str, Union[str, NoneType]]]
) -> None
handle_starttag
def handle_starttag(
    self,
    tag: str,
    attrs: List[Tuple[str, Union[str, NoneType]]]
) -> None
parse_bogus_comment
def parse_bogus_comment(
    self,
    i,
    report=1
)
parse_comment
def parse_comment(
    self,
    i,
    report=1
)
parse_declaration
def parse_declaration(
    self,
    i
)
parse_endtag
def parse_endtag(
    self,
    i
)
parse_html_declaration
def parse_html_declaration(
    self,
    i
)
parse_marked_section
def parse_marked_section(
    self,
    i,
    report=1
)
parse_pi
def parse_pi(
    self,
    i
)
parse_starttag
def parse_starttag(
    self,
    i
)
remove_tags
def remove_tags(
    self,
    strip: bool = True
) -> str

Now: .strip()

reset
def reset(
    self
)

Reset this instance. Loses all unprocessed data.

set_cdata_mode
def set_cdata_mode(
    self,
    elem
)
strip
def strip(
    self,
    whitespace: bool = True
) -> str

Strip brackets, tags, and attributes from inner text

Arguments:

  • whitespace: remove extra whitespace characters

Example:

html = "<span>AB<b>C</b>D</span>"
soup = Soup(html)
soup.find("span").text
# AB
soup.strip()
# ABCD
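
The whitespace flag only affects the post-processing step; a small sketch with illustrative input:

html = "<p>Hello   <b>world</b></p>"
Soup(html).strip()
# Hello world

Soup(html).strip(whitespace=False)
# Hello   world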
unescape
def unescape(
    self,
    s
)
unknown_decl
def unknown_decl(
    self,
    data
)
updatepos
def updatepos(
    self,
    i,
    j
)