Module gazpacho.soup

View Source

from collections import Counter

from html.parser import HTMLParser

from random import sample

import re

from typing import Any, Dict, Optional, Tuple, Union, List

import warnings

from .get import get

from .utils import match, recover_html_and_attrs

class Soup(HTMLParser):

    """\

    HTML Soup Parser

    Attributes:

    - html: content to parse

    - tag: element to match

    - attrs: element attributes to match

    - text: inner data

    Methods:

    - find: matching content by element tag (and attributes)

    - strip: brackets, tags, and attributes from inner data

    - get: alternate initializer

    Deprecations:

    - remove_tags: (as of 1.0) use strip

    Examples:

    ```

    from gazpacho import Soup

    html = "<div><p class='a'>1</p><p class='a'>2</p><p class='b'>3</p></div>"

    url = "https://www.gazpacho.xyz"

    soup = Soup(html)

    soup = Soup.get(url)

    ```

    """

    def __init__(self, html: Optional[str] = None) -> None:

        """\

        Arguments:

        - html: content to parse

        """

        super().__init__()

        self.html = "" if not html else html

        self.tag: Optional[str] = None

        self.attrs: Optional[Dict[Any, Any]] = None

        self.text: Optional[str] = None

    def __dir__(self):

        return ["attrs", "find", "get", "html", "strip", "tag", "text"]

    def __repr__(self) -> str:

        return self.html

    @classmethod

    def get(

        cls,

        url: str,

        params: Optional[Dict[str, str]] = None,

        headers: Optional[Dict[str, str]] = None,

    ) -> "Soup":

        """\

        Intialize with gazpacho.get

        """

        html = get(url, params, headers)

        if not isinstance(html, str):

            raise Exception(f"Unable to retrieve contents from {url}")

        return cls(html)

    @property

    def _active(self) -> bool:

        return sum(self.counter.values()) > 0

    @staticmethod

    def _void(tag: str) -> bool:

        return tag in [

            "area",

            "base",

            "br",

            "col",

            "embed",

            "hr",

            "img",

            "input",

            "keygen",

            "link",

            "meta",

            "param",

            "source",

            "track",

            "wbr",

        ]

    def _handle_start(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:

        html, attrs_dict = recover_html_and_attrs(tag, attrs)

        query_attrs = {} if not self.attrs else self.attrs

        matching = match(query_attrs, attrs_dict, partial=self.partial)

        if (tag == self.tag) and (matching) and (not self._active):

            self.groups.append(Soup())

            self.groups[-1].tag = tag

            self.groups[-1].attrs = attrs_dict

            self.groups[-1].html += html

            self.counter[tag] += 1

            return

        if self._active:

            self.groups[-1].html += html

            self.counter[tag] += 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:

        self._handle_start(tag, attrs)

        if self._active:

            if self._void(tag):

                self.counter[tag] -= 1

    def handle_startendtag(

        self, tag: str, attrs: List[Tuple[str, Optional[str]]]

    ) -> None:

        self._handle_start(tag, attrs)

        if self._active:

            self.counter[tag] -= 1

    def handle_data(self, data: str) -> None:

        if self._active:

            if self.groups[-1].text is None:

                self.groups[-1].text = data.strip()

            self.groups[-1].html += data

    def handle_endtag(self, tag: str) -> None:

        if self._active:

            self.groups[-1].html += f"</{tag}>"

            self.counter[tag] -= 1

    def strip(self, whitespace: bool = True) -> str:

        """\

        Strip brackets, tags, and attributes from inner text

        Arguments:

        - whitespace: remove extra whitespace characters

        Example:

        ```

        html = "<span>AB<b>C</b>D</span>"

        soup = Soup(html)

        soup.find("span").text

        # AB

        soup.strip()

        # ABCD

        ```

        """

        text = re.sub("<[^>]+>", "", self.html)

        if whitespace:

            text = " ".join(text.split())

        return text

    def remove_tags(self, strip: bool = True) -> str:

        """\

        Now: .strip()

        """

        message = "Marked for removal; use .strip()"

        warnings.warn(message, category=FutureWarning, stacklevel=2)

        return self.strip(whitespace=strip)

    def find(

        self,

        tag: str,

        attrs: Optional[Dict[str, str]] = None,

        *,

        partial: bool = True,

        mode: str = "automatic",

        strict: Optional[bool] = None,

    ) -> Optional[Union[List["Soup"], "Soup"]]:

        """\

        Return matching HTML elements

        Arguments:

        - tag: target element tag

        - attrs: target element attributes

        - partial: match on attributes

        - mode: override return behavior {'auto/automatic', 'all/list', 'first'}

        Deprecations:

        - strict: (as of 1.0) use partial=

        Examples:

        ```

        soup.find('p', {'class': 'a'})

        # [<p class="a">1</p>, <p class="a">2</p>]

        soup.find('p', {'class': 'a'}, mode='first')

        # <p class="a">1</p>

        result = soup.find('p', {'class': 'b'}, mode='auto')

        print(result)

        # <p class="b">3</p>

        print(result.text)

        # 3

        ```

        """

        self.counter: Counter = Counter()

        self.groups: List = []

        self.tag = tag

        self.attrs = attrs

        self.partial = partial

        if strict is not None:

            message = "Marked for removal; use partial="

            warnings.warn(message, category=FutureWarning, stacklevel=2)

            partial = not strict

        self.feed(self.html)

        automatic = ["auto", "automatic"]

        all = ["all", "list"]

        first = ["first"]

        last = ["last"]  # undocumented

        random = ["random"]  # undocumented

        if not self.groups:

            if mode in all:

                return []

            else:

                return None

        elif mode in automatic:

            if len(self.groups) == 1:

                return self.groups[0]

            else:

                return self.groups

        elif mode in all:

            return self.groups

        elif mode in first:

            return self.groups[0]

        elif mode in last:

            return self.groups[-1]

        elif mode in random:

            return sample(self.groups, k=1)[0]

        else:

            raise ValueError(mode)

Classes

Soup

class Soup(
    html: Union[str, NoneType] = None
)

HTML Soup Parser

Attributes:

html: content to parse
tag: element to match
attrs: element attributes to match
text: inner data

Methods:

find: matching content by element tag (and attributes)
strip: brackets, tags, and attributes from inner data
get: alternate initializer

Deprecations:

remove_tags: (as of 1.0) use strip

Examples:

from gazpacho import Soup

html = "<div><p class='a'>1</p><p class='a'>2</p><p class='b'>3</p></div>"
url = "https://www.gazpacho.xyz"

soup = Soup(html)
soup = Soup.get(url)

View Source

class Soup(HTMLParser):

    """\

    HTML Soup Parser

    Attributes:

    - html: content to parse

    - tag: element to match

    - attrs: element attributes to match

    - text: inner data

    Methods:

    - find: matching content by element tag (and attributes)

    - strip: brackets, tags, and attributes from inner data

    - get: alternate initializer

    Deprecations:

    - remove_tags: (as of 1.0) use strip

    Examples:

    ```

    from gazpacho import Soup

    html = "<div><p class='a'>1</p><p class='a'>2</p><p class='b'>3</p></div>"

    url = "https://www.gazpacho.xyz"

    soup = Soup(html)

    soup = Soup.get(url)

    ```

    """

    def __init__(self, html: Optional[str] = None) -> None:

        """\

        Arguments:

        - html: content to parse

        """

        super().__init__()

        self.html = "" if not html else html

        self.tag: Optional[str] = None

        self.attrs: Optional[Dict[Any, Any]] = None

        self.text: Optional[str] = None

    def __dir__(self):

        return ["attrs", "find", "get", "html", "strip", "tag", "text"]

    def __repr__(self) -> str:

        return self.html

    @classmethod

    def get(

        cls,

        url: str,

        params: Optional[Dict[str, str]] = None,

        headers: Optional[Dict[str, str]] = None,

    ) -> "Soup":

        """\

        Intialize with gazpacho.get

        """

        html = get(url, params, headers)

        if not isinstance(html, str):

            raise Exception(f"Unable to retrieve contents from {url}")

        return cls(html)

    @property

    def _active(self) -> bool:

        return sum(self.counter.values()) > 0

    @staticmethod

    def _void(tag: str) -> bool:

        return tag in [

            "area",

            "base",

            "br",

            "col",

            "embed",

            "hr",

            "img",

            "input",

            "keygen",

            "link",

            "meta",

            "param",

            "source",

            "track",

            "wbr",

        ]

    def _handle_start(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:

        html, attrs_dict = recover_html_and_attrs(tag, attrs)

        query_attrs = {} if not self.attrs else self.attrs

        matching = match(query_attrs, attrs_dict, partial=self.partial)

        if (tag == self.tag) and (matching) and (not self._active):

            self.groups.append(Soup())

            self.groups[-1].tag = tag

            self.groups[-1].attrs = attrs_dict

            self.groups[-1].html += html

            self.counter[tag] += 1

            return

        if self._active:

            self.groups[-1].html += html

            self.counter[tag] += 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:

        self._handle_start(tag, attrs)

        if self._active:

            if self._void(tag):

                self.counter[tag] -= 1

    def handle_startendtag(

        self, tag: str, attrs: List[Tuple[str, Optional[str]]]

    ) -> None:

        self._handle_start(tag, attrs)

        if self._active:

            self.counter[tag] -= 1

    def handle_data(self, data: str) -> None:

        if self._active:

            if self.groups[-1].text is None:

                self.groups[-1].text = data.strip()

            self.groups[-1].html += data

    def handle_endtag(self, tag: str) -> None:

        if self._active:

            self.groups[-1].html += f"</{tag}>"

            self.counter[tag] -= 1

    def strip(self, whitespace: bool = True) -> str:

        """\

        Strip brackets, tags, and attributes from inner text

        Arguments:

        - whitespace: remove extra whitespace characters

        Example:

        ```

        html = "<span>AB<b>C</b>D</span>"

        soup = Soup(html)

        soup.find("span").text

        # AB

        soup.strip()

        # ABCD

        ```

        """

        text = re.sub("<[^>]+>", "", self.html)

        if whitespace:

            text = " ".join(text.split())

        return text

    def remove_tags(self, strip: bool = True) -> str:

        """\

        Now: .strip()

        """

        message = "Marked for removal; use .strip()"

        warnings.warn(message, category=FutureWarning, stacklevel=2)

        return self.strip(whitespace=strip)

    def find(

        self,

        tag: str,

        attrs: Optional[Dict[str, str]] = None,

        *,

        partial: bool = True,

        mode: str = "automatic",

        strict: Optional[bool] = None,

    ) -> Optional[Union[List["Soup"], "Soup"]]:

        """\

        Return matching HTML elements

        Arguments:

        - tag: target element tag

        - attrs: target element attributes

        - partial: match on attributes

        - mode: override return behavior {'auto/automatic', 'all/list', 'first'}

        Deprecations:

        - strict: (as of 1.0) use partial=

        Examples:

        ```

        soup.find('p', {'class': 'a'})

        # [<p class="a">1</p>, <p class="a">2</p>]

        soup.find('p', {'class': 'a'}, mode='first')

        # <p class="a">1</p>

        result = soup.find('p', {'class': 'b'}, mode='auto')

        print(result)

        # <p class="b">3</p>

        print(result.text)

        # 3

        ```

        """

        self.counter: Counter = Counter()

        self.groups: List = []

        self.tag = tag

        self.attrs = attrs

        self.partial = partial

        if strict is not None:

            message = "Marked for removal; use partial="

            warnings.warn(message, category=FutureWarning, stacklevel=2)

            partial = not strict

        self.feed(self.html)

        automatic = ["auto", "automatic"]

        all = ["all", "list"]

        first = ["first"]

        last = ["last"]  # undocumented

        random = ["random"]  # undocumented

        if not self.groups:

            if mode in all:

                return []

            else:

                return None

        elif mode in automatic:

            if len(self.groups) == 1:

                return self.groups[0]

            else:

                return self.groups

        elif mode in all:

            return self.groups

        elif mode in first:

            return self.groups[0]

        elif mode in last:

            return self.groups[-1]

        elif mode in random:

            return sample(self.groups, k=1)[0]

        else:

            raise ValueError(mode)

Ancestors (in MRO)

html.parser.HTMLParser
_markupbase.ParserBase

Class variables

CDATA_CONTENT_ELEMENTS

Static methods

get

def get(
    url: str,
    params: Union[Dict[str, str], NoneType] = None,
    headers: Union[Dict[str, str], NoneType] = None
) -> 'Soup'

Intialize with gazpacho.get

View Source

    @classmethod

    def get(

        cls,

        url: str,

        params: Optional[Dict[str, str]] = None,

        headers: Optional[Dict[str, str]] = None,

    ) -> "Soup":

        """\

        Intialize with gazpacho.get

        """

        html = get(url, params, headers)

        if not isinstance(html, str):

            raise Exception(f"Unable to retrieve contents from {url}")

        return cls(html)

Methods

check_for_whole_start_tag

def check_for_whole_start_tag(
    self,
    i
)

View Source

    def check_for_whole_start_tag(self, i):

        rawdata = self.rawdata

        m = locatestarttagend_tolerant.match(rawdata, i)

        if m:

            j = m.end()

            next = rawdata[j:j+1]

            if next == ">":

                return j + 1

            if next == "/":

                if rawdata.startswith("/>", j):

                    return j + 2

                if rawdata.startswith("/", j):

                    # buffer boundary

                    return -1

                # else bogus input

                if j > i:

                    return j

                else:

                    return i + 1

            if next == "":

                # end of input

                return -1

            if next in ("abcdefghijklmnopqrstuvwxyz=/"

                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):

                # end of input in or before attribute value, or we have the

                # '/' from a '/>' ending

                return -1

            if j > i:

                return j

            else:

                return i + 1

        raise AssertionError("we should not get here!")

clear_cdata_mode

def clear_cdata_mode(
    self
)

View Source

    def clear_cdata_mode(self):

        self.interesting = interesting_normal

        self.cdata_elem = None

close

def close(
    self
)

Handle any buffered data.

View Source

    def close(self):

        """Handle any buffered data."""

        self.goahead(1)

error

def error(
    self,
    message
)

View Source

    def error(self, message):

        raise NotImplementedError(

            "subclasses of ParserBase must override error()")

feed

def feed(
    self,
    data
)

Feed data to the parser.

Call this as often as you want, with as little or as much text as you want (may include '\n').

View Source

    def feed(self, data):

        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text

        as you want (may include '\n').

        """

        self.rawdata = self.rawdata + data

        self.goahead(0)

find

def find(
    self,
    tag: str,
    attrs: Union[Dict[str, str], NoneType] = None,
    *,
    partial: bool = True,
    mode: str = 'automatic',
    strict: Union[bool, NoneType] = None
) -> Union[List[ForwardRef('Soup')], ForwardRef('Soup'), NoneType]

Return matching HTML elements

Arguments:

tag: target element tag
attrs: target element attributes
partial: match on attributes
mode: override return behavior {'auto/automatic', 'all/list', 'first'}

Deprecations:

strict: (as of 1.0) use partial=

Examples:

soup.find('p', {'class': 'a'})
# [<p class="a">1</p>, <p class="a">2</p>]

soup.find('p', {'class': 'a'}, mode='first')
# <p class="a">1</p>

result = soup.find('p', {'class': 'b'}, mode='auto')
print(result)
# <p class="b">3</p>

print(result.text)
# 3

View Source

    def find(

        self,

        tag: str,

        attrs: Optional[Dict[str, str]] = None,

        *,

        partial: bool = True,

        mode: str = "automatic",

        strict: Optional[bool] = None,

    ) -> Optional[Union[List["Soup"], "Soup"]]:

        """\

        Return matching HTML elements

        Arguments:

        - tag: target element tag

        - attrs: target element attributes

        - partial: match on attributes

        - mode: override return behavior {'auto/automatic', 'all/list', 'first'}

        Deprecations:

        - strict: (as of 1.0) use partial=

        Examples:

        ```

        soup.find('p', {'class': 'a'})

        # [<p class="a">1</p>, <p class="a">2</p>]

        soup.find('p', {'class': 'a'}, mode='first')

        # <p class="a">1</p>

        result = soup.find('p', {'class': 'b'}, mode='auto')

        print(result)

        # <p class="b">3</p>

        print(result.text)

        # 3

        ```

        """

        self.counter: Counter = Counter()

        self.groups: List = []

        self.tag = tag

        self.attrs = attrs

        self.partial = partial

        if strict is not None:

            message = "Marked for removal; use partial="

            warnings.warn(message, category=FutureWarning, stacklevel=2)

            partial = not strict

        self.feed(self.html)

        automatic = ["auto", "automatic"]

        all = ["all", "list"]

        first = ["first"]

        last = ["last"]  # undocumented

        random = ["random"]  # undocumented

        if not self.groups:

            if mode in all:

                return []

            else:

                return None

        elif mode in automatic:

            if len(self.groups) == 1:

                return self.groups[0]

            else:

                return self.groups

        elif mode in all:

            return self.groups

        elif mode in first:

            return self.groups[0]

        elif mode in last:

            return self.groups[-1]

        elif mode in random:

            return sample(self.groups, k=1)[0]

        else:

            raise ValueError(mode)

get_starttag_text

def get_starttag_text(
    self
)

Return full source of start tag: '<...>'.

View Source

    def get_starttag_text(self):

        """Return full source of start tag: '<...>'."""

        return self.__starttag_text

getpos

def getpos(
    self
)

Return current line number and offset.

View Source

    def getpos(self):

        """Return current line number and offset."""

        return self.lineno, self.offset

goahead

def goahead(
    self,
    end
)

View Source

    def goahead(self, end):

        rawdata = self.rawdata

        i = 0

        n = len(rawdata)

        while i < n:

            if self.convert_charrefs and not self.cdata_elem:

                j = rawdata.find('<', i)

                if j < 0:

                    # if we can't find the next <, either we are at the end

                    # or there's more text incoming.  If the latter is True,

                    # we can't pass the text to handle_data in case we have

                    # a charref cut in half at end.  Try to determine if

                    # this is the case before proceeding by looking for an

                    # & near the end and see if it's followed by a space or ;.

                    amppos = rawdata.rfind('&', max(i, n-34))

                    if (amppos >= 0 and

                        not re.compile(r'[\s;]').search(rawdata, amppos)):

                        break  # wait till we get all the text

                    j = n

            else:

                match = self.interesting.search(rawdata, i)  # < or &

                if match:

                    j = match.start()

                else:

                    if self.cdata_elem:

                        break

                    j = n

            if i < j:

                if self.convert_charrefs and not self.cdata_elem:

                    self.handle_data(unescape(rawdata[i:j]))

                else:

                    self.handle_data(rawdata[i:j])

            i = self.updatepos(i, j)

            if i == n: break

            startswith = rawdata.startswith

            if startswith('<', i):

                if starttagopen.match(rawdata, i): # < + letter

                    k = self.parse_starttag(i)

                elif startswith("</", i):

                    k = self.parse_endtag(i)

                elif startswith("<!--", i):

                    k = self.parse_comment(i)

                elif startswith("<?", i):

                    k = self.parse_pi(i)

                elif startswith("<!", i):

                    k = self.parse_html_declaration(i)

                elif (i + 1) < n:

                    self.handle_data("<")

                    k = i + 1

                else:

                    break

                if k < 0:

                    if not end:

                        break

                    k = rawdata.find('>', i + 1)

                    if k < 0:

                        k = rawdata.find('<', i + 1)

                        if k < 0:

                            k = i + 1

                    else:

                        k += 1

                    if self.convert_charrefs and not self.cdata_elem:

                        self.handle_data(unescape(rawdata[i:k]))

                    else:

                        self.handle_data(rawdata[i:k])

                i = self.updatepos(i, k)

            elif startswith("&#", i):

                match = charref.match(rawdata, i)

                if match:

                    name = match.group()[2:-1]

                    self.handle_charref(name)

                    k = match.end()

                    if not startswith(';', k-1):

                        k = k - 1

                    i = self.updatepos(i, k)

                    continue

                else:

                    if ";" in rawdata[i:]:  # bail by consuming &#

                        self.handle_data(rawdata[i:i+2])

                        i = self.updatepos(i, i+2)

                    break

            elif startswith('&', i):

                match = entityref.match(rawdata, i)

                if match:

                    name = match.group(1)

                    self.handle_entityref(name)

                    k = match.end()

                    if not startswith(';', k-1):

                        k = k - 1

                    i = self.updatepos(i, k)

                    continue

                match = incomplete.match(rawdata, i)

                if match:

                    # match.group() will contain at least 2 chars

                    if end and match.group() == rawdata[i:]:

                        k = match.end()

                        if k <= i:

                            k = n

                        i = self.updatepos(i, i + 1)

                    # incomplete

                    break

                elif (i + 1) < n:

                    # not the end of the buffer, and can't be confused

                    # with some other construct

                    self.handle_data("&")

                    i = self.updatepos(i, i + 1)

                else:

                    break

            else:

                assert 0, "interesting.search() lied"

        # end while

        if end and i < n and not self.cdata_elem:

            if self.convert_charrefs and not self.cdata_elem:

                self.handle_data(unescape(rawdata[i:n]))

            else:

                self.handle_data(rawdata[i:n])

            i = self.updatepos(i, n)

        self.rawdata = rawdata[i:]

handle_charref

def handle_charref(
    self,
    name
)

View Source

    def handle_charref(self, name):

        pass

handle_comment

def handle_comment(
    self,
    data
)

View Source

    def handle_comment(self, data):

        pass

handle_data

def handle_data(
    self,
    data: str
) -> None

View Source

    def handle_data(self, data: str) -> None:

        if self._active:

            if self.groups[-1].text is None:

                self.groups[-1].text = data.strip()

            self.groups[-1].html += data

handle_decl

def handle_decl(
    self,
    decl
)

View Source

    def handle_decl(self, decl):

        pass

handle_endtag

def handle_endtag(
    self,
    tag: str
) -> None

View Source

    def handle_endtag(self, tag: str) -> None:

        if self._active:

            self.groups[-1].html += f"</{tag}>"

            self.counter[tag] -= 1

handle_entityref

def handle_entityref(
    self,
    name
)

View Source

    def handle_entityref(self, name):

        pass

handle_pi

def handle_pi(
    self,
    data
)

View Source

    def handle_pi(self, data):

        pass

handle_startendtag

def handle_startendtag(
    self,
    tag: str,
    attrs: List[Tuple[str, Union[str, NoneType]]]
) -> None

View Source

    def handle_startendtag(

        self, tag: str, attrs: List[Tuple[str, Optional[str]]]

    ) -> None:

        self._handle_start(tag, attrs)

        if self._active:

            self.counter[tag] -= 1

handle_starttag

def handle_starttag(
    self,
    tag: str,
    attrs: List[Tuple[str, Union[str, NoneType]]]
) -> None

View Source

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:

        self._handle_start(tag, attrs)

        if self._active:

            if self._void(tag):

                self.counter[tag] -= 1

parse_bogus_comment

def parse_bogus_comment(
    self,
    i,
    report=1
)

View Source

    def parse_bogus_comment(self, i, report=1):

        rawdata = self.rawdata

        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '

                                                'parse_comment()')

        pos = rawdata.find('>', i+2)

        if pos == -1:

            return -1

        if report:

            self.handle_comment(rawdata[i+2:pos])

        return pos + 1

parse_comment

def parse_comment(
    self,
    i,
    report=1
)

View Source

    def parse_comment(self, i, report=1):

        rawdata = self.rawdata

        if rawdata[i:i+4] != '<!--':

            self.error('unexpected call to parse_comment()')

        match = _commentclose.search(rawdata, i+4)

        if not match:

            return -1

        if report:

            j = match.start(0)

            self.handle_comment(rawdata[i+4: j])

        return match.end(0)

parse_declaration

def parse_declaration(
    self,
    i
)

View Source

    def parse_declaration(self, i):

        # This is some sort of declaration; in "HTML as

        # deployed," this should only be the document type

        # declaration ("<!DOCTYPE html...>").

        # ISO 8879:1986, however, has more complex

        # declaration syntax for elements in <!...>, including:

        # --comment--

        # [marked section]

        # name in the following list: ENTITY, DOCTYPE, ELEMENT,

        # ATTLIST, NOTATION, SHORTREF, USEMAP,

        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM

        rawdata = self.rawdata

        j = i + 2

        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"

        if rawdata[j:j+1] == ">":

            # the empty comment <!>

            return j + 1

        if rawdata[j:j+1] in ("-", ""):

            # Start of comment followed by buffer boundary,

            # or just a buffer boundary.

            return -1

        # A simple, practical version could look like: ((name|stringlit) S*) + '>'

        n = len(rawdata)

        if rawdata[j:j+2] == '--': #comment

            # Locate --.*-- as the body of the comment

            return self.parse_comment(i)

        elif rawdata[j] == '[': #marked section

            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section

            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA

            # Note that this is extended by Microsoft Office "Save as Web" function

            # to include [if...] and [endif].

            return self.parse_marked_section(i)

        else: #all other declaration elements

            decltype, j = self._scan_name(j, i)

        if j < 0:

            return j

        if decltype == "doctype":

            self._decl_otherchars = ''

        while j < n:

            c = rawdata[j]

            if c == ">":

                # end of declaration syntax

                data = rawdata[i+2:j]

                if decltype == "doctype":

                    self.handle_decl(data)

                else:

                    # According to the HTML5 specs sections "8.2.4.44 Bogus

                    # comment state" and "8.2.4.45 Markup declaration open

                    # state", a comment token should be emitted.

                    # Calling unknown_decl provides more flexibility though.

                    self.unknown_decl(data)

                return j + 1

            if c in "\"'":

                m = _declstringlit_match(rawdata, j)

                if not m:

                    return -1 # incomplete

                j = m.end()

            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":

                name, j = self._scan_name(j, i)

            elif c in self._decl_otherchars:

                j = j + 1

            elif c == "[":

                # this could be handled in a separate doctype parser

                if decltype == "doctype":

                    j = self._parse_doctype_subset(j + 1, i)

                elif decltype in {"attlist", "linktype", "link", "element"}:

                    # must tolerate []'d groups in a content model in an element declaration

                    # also in data attribute specifications of attlist declaration

                    # also link type declaration subsets in linktype declarations

                    # also link attribute specification lists in link declarations

                    self.error("unsupported '[' char in %s declaration" % decltype)

                else:

                    self.error("unexpected '[' char in declaration")

            else:

                self.error(

                    "unexpected %r char in declaration" % rawdata[j])

            if j < 0:

                return j

        return -1 # incomplete

parse_endtag

def parse_endtag(
    self,
    i
)

View Source

    def parse_endtag(self, i):

        rawdata = self.rawdata

        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"

        match = endendtag.search(rawdata, i+1) # >

        if not match:

            return -1

        gtpos = match.end()

        match = endtagfind.match(rawdata, i) # </ + tag + >

        if not match:

            if self.cdata_elem is not None:

                self.handle_data(rawdata[i:gtpos])

                return gtpos

            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state

            namematch = tagfind_tolerant.match(rawdata, i+2)

            if not namematch:

                # w3.org/TR/html5/tokenization.html#end-tag-open-state

                if rawdata[i:i+3] == '</>':

                    return i+3

                else:

                    return self.parse_bogus_comment(i)

            tagname = namematch.group(1).lower()

            # consume and ignore other stuff between the name and the >

            # Note: this is not 100% correct, since we might have things like

            # </tag attr=">">, but looking for > after tha name should cover

            # most of the cases and is much simpler

            gtpos = rawdata.find('>', namematch.end())

            self.handle_endtag(tagname)

            return gtpos+1

        elem = match.group(1).lower() # script or style

        if self.cdata_elem is not None:

            if elem != self.cdata_elem:

                self.handle_data(rawdata[i:gtpos])

                return gtpos

        self.handle_endtag(elem)

        self.clear_cdata_mode()

        return gtpos

parse_html_declaration

def parse_html_declaration(
    self,
    i
)

View Source

    def parse_html_declaration(self, i):

        rawdata = self.rawdata

        assert rawdata[i:i+2] == '<!', ('unexpected call to '

                                        'parse_html_declaration()')

        if rawdata[i:i+4] == '<!--':

            # this case is actually already handled in goahead()

            return self.parse_comment(i)

        elif rawdata[i:i+3] == '<![':

            return self.parse_marked_section(i)

        elif rawdata[i:i+9].lower() == '<!doctype':

            # find the closing >

            gtpos = rawdata.find('>', i+9)

            if gtpos == -1:

                return -1

            self.handle_decl(rawdata[i+2:gtpos])

            return gtpos+1

        else:

            return self.parse_bogus_comment(i)

parse_marked_section

def parse_marked_section(
    self,
    i,
    report=1
)

View Source

    def parse_marked_section(self, i, report=1):

        rawdata= self.rawdata

        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"

        sectName, j = self._scan_name( i+3, i )

        if j < 0:

            return j

        if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:

            # look for standard ]]> ending

            match= _markedsectionclose.search(rawdata, i+3)

        elif sectName in {"if", "else", "endif"}:

            # look for MS Office ]> ending

            match= _msmarkedsectionclose.search(rawdata, i+3)

        else:

            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])

        if not match:

            return -1

        if report:

            j = match.start(0)

            self.unknown_decl(rawdata[i+3: j])

        return match.end(0)

parse_pi

def parse_pi(
    self,
    i
)

View Source

    def parse_pi(self, i):

        rawdata = self.rawdata

        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'

        match = piclose.search(rawdata, i+2) # >

        if not match:

            return -1

        j = match.start()

        self.handle_pi(rawdata[i+2: j])

        j = match.end()

        return j

parse_starttag

def parse_starttag(
    self,
    i
)

View Source

    def parse_starttag(self, i):

        self.__starttag_text = None

        endpos = self.check_for_whole_start_tag(i)

        if endpos < 0:

            return endpos

        rawdata = self.rawdata

        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs

        attrs = []

        match = tagfind_tolerant.match(rawdata, i+1)

        assert match, 'unexpected call to parse_starttag()'

        k = match.end()

        self.lasttag = tag = match.group(1).lower()

        while k < endpos:

            m = attrfind_tolerant.match(rawdata, k)

            if not m:

                break

            attrname, rest, attrvalue = m.group(1, 2, 3)

            if not rest:

                attrvalue = None

            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \

                 attrvalue[:1] == '"' == attrvalue[-1:]:

                attrvalue = attrvalue[1:-1]

            if attrvalue:

                attrvalue = unescape(attrvalue)

            attrs.append((attrname.lower(), attrvalue))

            k = m.end()

        end = rawdata[k:endpos].strip()

        if end not in (">", "/>"):

            lineno, offset = self.getpos()

            if "\n" in self.__starttag_text:

                lineno = lineno + self.__starttag_text.count("\n")

                offset = len(self.__starttag_text) \

                         - self.__starttag_text.rfind("\n")

            else:

                offset = offset + len(self.__starttag_text)

            self.handle_data(rawdata[i:endpos])

            return endpos

        if end.endswith('/>'):

            # XHTML-style empty tag: <span attr="value" />

            self.handle_startendtag(tag, attrs)

        else:

            self.handle_starttag(tag, attrs)

            if tag in self.CDATA_CONTENT_ELEMENTS:

                self.set_cdata_mode(tag)

        return endpos

remove_tags

def remove_tags(
    self,
    strip: bool = True
) -> str

Now: .strip()

View Source

    def remove_tags(self, strip: bool = True) -> str:

        """\

        Now: .strip()

        """

        message = "Marked for removal; use .strip()"

        warnings.warn(message, category=FutureWarning, stacklevel=2)

        return self.strip(whitespace=strip)

reset

def reset(
    self
)

Reset this instance. Loses all unprocessed data.

View Source

    def reset(self):

        """Reset this instance.  Loses all unprocessed data."""

        self.rawdata = ''

        self.lasttag = '???'

        self.interesting = interesting_normal

        self.cdata_elem = None

        _markupbase.ParserBase.reset(self)

set_cdata_mode

def set_cdata_mode(
    self,
    elem
)

View Source

    def set_cdata_mode(self, elem):

        self.cdata_elem = elem.lower()

        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

strip

def strip(
    self,
    whitespace: bool = True
) -> str

Strip brackets, tags, and attributes from inner text

Arguments:

whitespace: remove extra whitespace characters

Example:

html = "<span>AB<b>C</b>D</span>"
soup = Soup(html)
soup.find("span").text
# AB
soup.strip()
# ABCD

View Source

    def strip(self, whitespace: bool = True) -> str:

        """\

        Strip brackets, tags, and attributes from inner text

        Arguments:

        - whitespace: remove extra whitespace characters

        Example:

        ```

        html = "<span>AB<b>C</b>D</span>"

        soup = Soup(html)

        soup.find("span").text

        # AB

        soup.strip()

        # ABCD

        ```

        """

        text = re.sub("<[^>]+>", "", self.html)

        if whitespace:

            text = " ".join(text.split())

        return text

unescape

def unescape(
    self,
    s
)

View Source

    def unescape(self, s):

        warnings.warn('The unescape method is deprecated and will be removed '

                      'in 3.5, use html.unescape() instead.',

                      DeprecationWarning, stacklevel=2)

        return unescape(s)

unknown_decl

def unknown_decl(
    self,
    data
)

View Source

    def unknown_decl(self, data):

        pass

updatepos

def updatepos(
    self,
    i,
    j
)

View Source

    def updatepos(self, i, j):

        if i >= j:

            return j

        rawdata = self.rawdata

        nlines = rawdata.count("\n", i, j)

        if nlines:

            self.lineno = self.lineno + nlines

            pos = rawdata.rindex("\n", i, j) # Should not fail

            self.offset = j-(pos+1)

        else:

            self.offset = self.offset + j-i

        return j