# Source code for pydarkstar.scrubbing.scrubber

from ..darkobject import DarkObject
from bs4 import BeautifulSoup
import requests
import logging
import time
import bs4


class Scrubber(DarkObject):
    """
    Base scrubber built on :class:`DarkObject`.

    Provides :meth:`scrub`, which returns an empty dict in this class
    (presumably overridden by concrete scrubbers -- confirm against the
    subclasses), and the static helper :meth:`soup`, which downloads a page
    and parses it with BeautifulSoup.
    """

    def __init__(self):
        super().__init__()

    def scrub(self):
        """
        Get item metadata.

        :return: item metadata; this implementation always returns an empty dict
        :rtype: dict
        """
        return {}

    # noinspection PyBroadException
    @staticmethod
    def soup(url, absolute: bool = False, **kwargs):
        """
        Open URL and create tag soup.

        The download is attempted up to 10 times, sleeping one second after
        each failed attempt. If the last attempt fails as well, its exception
        is re-raised to the caller.

        :param url: website string
        :type url: str

        :param absolute: perform double get request to find absolute url
        :type absolute: bool

        :param kwargs: sent as the query-string parameters of the page request
                       (they are not sent with the extra request made when
                       ``absolute`` is true)

        :return: parsed page, built with ``html5lib`` when that parser is
                 available and with the built-in ``html.parser`` otherwise
        :rtype: bs4.BeautifulSoup

        :raises Exception: whatever the final download attempt raised
        """
        max_tries = 10
        # Seconds to wait for the server to connect / send data.  Without a
        # timeout, requests may block forever on a stalled server, so the
        # retry loop below would never get a chance to run.
        timeout = 30

        handle = ''
        for i in range(max_tries):
            # noinspection PyPep8
            try:
                if absolute:
                    # Response.url is the final location after redirects.
                    url = requests.get(url, timeout=timeout).url
                handle = requests.get(url, params=kwargs, timeout=timeout).text
                break
            except Exception:
                logging.exception('urlopen failed (attempt %d)', i + 1)
                if i == max_tries - 1:
                    logging.error('the maximum urlopen attempts have been reached')
                    raise
                # brief pause before the next attempt
                time.sleep(1)

        try:
            s = BeautifulSoup(handle, features='html5lib')
        except bs4.FeatureNotFound:
            # html5lib is an optional dependency; fall back to the parser
            # that ships with the standard library.
            s = BeautifulSoup(handle, features='html.parser')

        return s


# Nothing happens when this module is executed directly as a script.
if __name__ == "__main__":
    pass