Skip to content

Module gazpacho.get

View Source
import json

from typing import Dict, Optional, Union

from urllib.error import HTTPError as UrllibHTTPError

from urllib.parse import quote, urlencode, urlsplit, urlunsplit

from urllib.request import build_opener

class HTTPError(Exception):

    def __init__(self, code: int, msg: str) -> None:

        self.code = code

        self.msg = msg

    def __str__(self):

        return f"{self.code} - {self.msg}"

def sanitize(url: str) -> str:

    """\

    Sanitize and format a URL

    Arguments:

    - url: target page

    """

    scheme, netloc, path, query, fragment = urlsplit(url)

    if not scheme:

        scheme, netloc, path, query, fragment = urlsplit(f"http://{url}")

    path = quote(path)

    url = urlunsplit((scheme, netloc, path, query, fragment))

    return url

def get(

    url: str,

    params: Optional[Dict[str, str]] = None,

    headers: Optional[Dict[str, str]] = None,

) -> Union[str, dict]:

    """Retrive url contents

    Params:

    - url: target page

    - params: GET request payload

    - headers: GET request headers

    Example:

    ```

    get('https://httpbin.org/anything', {'soup': 'gazpacho'})

    ```

    """

    url = sanitize(url)

    opener = build_opener()

    if params:

        url += "?" + urlencode(params)

    if headers:

        for h in headers.items():

            opener.addheaders = [h]

    if (headers and not headers.get("User-Agent")) or not headers:

        UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:80.0) Gecko/20100101 Firefox/80.0"

        opener.addheaders = [("User-Agent", UA)]

    try:

        with opener.open(url) as response:

            content = response.read().decode("utf-8")

            if response.headers.get_content_type() == "application/json":

                content = json.loads(content)

    except UrllibHTTPError as e:

        raise HTTPError(e.code, e.msg) from None

    return content

Functions

get

def get(
    url: str,
    params: Union[Dict[str, str], NoneType] = None,
    headers: Union[Dict[str, str], NoneType] = None
) -> Union[str, dict]

Retrive url contents

Params:

  • url: target page
  • params: GET request payload
  • headers: GET request headers

Example:

get('https://httpbin.org/anything', {'soup': 'gazpacho'})
View Source
def get(

    url: str,

    params: Optional[Dict[str, str]] = None,

    headers: Optional[Dict[str, str]] = None,

) -> Union[str, dict]:

    """Retrive url contents

    Params:

    - url: target page

    - params: GET request payload

    - headers: GET request headers

    Example:

    ```

    get('https://httpbin.org/anything', {'soup': 'gazpacho'})

    ```

    """

    url = sanitize(url)

    opener = build_opener()

    if params:

        url += "?" + urlencode(params)

    if headers:

        for h in headers.items():

            opener.addheaders = [h]

    if (headers and not headers.get("User-Agent")) or not headers:

        UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:80.0) Gecko/20100101 Firefox/80.0"

        opener.addheaders = [("User-Agent", UA)]

    try:

        with opener.open(url) as response:

            content = response.read().decode("utf-8")

            if response.headers.get_content_type() == "application/json":

                content = json.loads(content)

    except UrllibHTTPError as e:

        raise HTTPError(e.code, e.msg) from None

    return content

sanitize

def sanitize(
    url: str
) -> str

Sanitize and format a URL

Arguments:

  • url: target page
View Source
def sanitize(url: str) -> str:

    """\

    Sanitize and format a URL

    Arguments:

    - url: target page

    """

    scheme, netloc, path, query, fragment = urlsplit(url)

    if not scheme:

        scheme, netloc, path, query, fragment = urlsplit(f"http://{url}")

    path = quote(path)

    url = urlunsplit((scheme, netloc, path, query, fragment))

    return url

Classes

HTTPError

class HTTPError(
    code: int,
    msg: str
)

Common base class for all non-exit exceptions.

View Source
class HTTPError(Exception):

    def __init__(self, code: int, msg: str) -> None:

        self.code = code

        self.msg = msg

    def __str__(self):

        return f"{self.code} - {self.msg}"

Ancestors (in MRO)

  • builtins.Exception
  • builtins.BaseException

Class variables

args

Methods

with_traceback
def with_traceback(
    ...
)

Exception.with_traceback(tb) -- set self.traceback to tb and return self.