Source code for grab.spider.task

from __future__ import absolute_import
from datetime import datetime, timedelta

from grab.spider.error import SpiderMisuseError
from grab.base import copy_config
from grab.util.warning import warn


class BaseTask(object):
    pass


class Task(BaseTask):
    """
    Task for spider.
    """
    def __init__(self, name=None, url=None, grab=None, grab_config=None,
                 priority=None, priority_set_explicitly=True,
                 network_try_count=0, task_try_count=1,
                 disable_cache=False, refresh_cache=False,
                 valid_status=None, use_proxylist=True, cache_timeout=None,
                 delay=None, raw=False, callback=None,
                 fallback_name=None,
                 **kwargs):
        """
        Create `Task` object.

        If more than one of the `url`, `grab` and `grab_config` options
        is non-empty, they are processed in the following order:

        * `grab` overwrites `grab_config`
        * `grab_config` overwrites `url`

        Args:
            :param name: name of the task. After a successful network
                operation the task's result will be passed to the
                `task_<name>` method.
            :param url: URL of the network document. Any task requires
                the `url` or `grab` option to be specified.
            :param grab: configured `Grab` instance. You can use this
                option when the `url` option is not enough. Do not forget
                to configure the `url` option of the `Grab` instance,
                because in this case the `url` option of the `Task`
                constructor will be overwritten with `grab.config['url']`.
            :param priority: priority of the Task. Tasks with lower
                priority will be processed earlier. By default each new
                task is assigned a random priority from the range
                (80, 100).
            :param priority_set_explicitly: internal flag which tells
                whether the task priority was assigned manually or
                generated by the spider according to priority generation
                rules.
            :param network_try_count: you will probably not need to use
                it. It is used internally to control how many times this
                task was restarted due to network errors. The `Spider`
                instance has a `network_try_limit` option. When the
                `network_try_count` attribute of the task exceeds the
                `network_try_limit` attribute, processing of the task is
                abandoned.
            :param task_try_count: the same as `network_try_count`, but
                it is increased only when you use the `clone` method. You
                can also set it manually. It is useful if you want to
                restart the task after it was cancelled due to multiple
                network errors. As you might have guessed, there is a
                `task_try_limit` option in the `Spider` instance. Together
                the `network_try_count` and `network_try_limit` options
                guarantee that you will not get an infinite loop of
                restarting some task.
            :param disable_cache: if `True`, disable the cache subsystem.
                The document will be fetched from the network and will
                not be saved to the cache.
            :param refresh_cache: if `True`, the document will be fetched
                from the network and saved to the cache.
            :param valid_status: extra status codes which count as valid.
            :param use_proxylist: whether to use the proxy list which was
                configured via the `setup_proxylist` method of the spider.
            :param delay: if specified, tells the spider to schedule the
                task and execute it after `delay` seconds.
            :param raw: if `raw` is True, the network response is
                forwarded to the corresponding handler without any check
                of the HTTP status code or network error; if `raw` is
                False (the default), a failed response is put back into
                the task queue, or, if the tries limit is reached,
                processing of the request is finished.
            :param callback: if you pass a function in the `callback`
                option, the network response will be passed to this
                callback, the usual 'task_*' handler will be ignored and
                no error will be raised if such a 'task_*' handler does
                not exist.
            :param fallback_name: the name of the method that is called
                when the spider gives up on the task (due to multiple
                network errors).

        Any non-standard named arguments passed to the `Task` constructor
        will be saved as attributes of the object. You can get their
        values later as attributes or with the `get` method, which allows
        you to specify a default value for missing attributes.
        """
        if name == 'generator':
            # The name "generator" is restricted: a `task_generator`
            # handler could not be created because this name is already
            # used for the special method which generates new tasks
            raise SpiderMisuseError('Task name could not be "generator"')
        self.name = name
        if url is None and grab is None and grab_config is None:
            raise SpiderMisuseError('Either url, grab or grab_config argument '
                                    'of Task constructor should not be None')
        if url is not None and grab is not None:
            raise SpiderMisuseError('Options url and grab could not be used '
                                    'together')
        if url is not None and grab_config is not None:
            raise SpiderMisuseError('Options url and grab_config could not be '
                                    'used together')
        if grab is not None and grab_config is not None:
            raise SpiderMisuseError(
                'Options grab and grab_config could not be used together')
        if grab:
            self.setup_grab_config(grab.dump_config())
        elif grab_config:
            self.setup_grab_config(grab_config)
        else:
            self.grab_config = None
            self.url = url
        if valid_status is None:
            self.valid_status = []
        else:
            self.valid_status = valid_status
        self.process_delay_option(delay)
        self.cache_timeout = cache_timeout
        if cache_timeout is not None:
            warn(
                'Option `cache_timeout` is deprecated and'
                ' is not supported anymore'
            )
        self.fallback_name = fallback_name
        self.priority_set_explicitly = priority_set_explicitly
        self.priority = priority
        self.network_try_count = network_try_count
        self.task_try_count = task_try_count
        self.disable_cache = disable_cache
        self.refresh_cache = refresh_cache
        self.use_proxylist = use_proxylist
        self.raw = raw
        self.callback = callback
        self.coroutines_stack = []
        for key, value in kwargs.items():
            setattr(self, key, value)
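
    # A minimal constructor sketch. The URL and task name below are
    # hypothetical, chosen only to illustrate the two mutually exclusive
    # forms (`url` versus a preconfigured `Grab` instance, assuming
    # `from grab import Grab`):
    #
    #     t1 = Task('page', url='http://example.com/catalog')
    #
    #     g = Grab()
    #     g.setup(url='http://example.com/catalog')
    #     t2 = Task('page', grab=g)
    #
    # After the network operation completes, both tasks are routed to a
    # `task_page` handler of the spider.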

    def get(self, key, default=None):
        """
        Return the value of the attribute, or `default` if such an
        attribute does not exist.
        """
        return getattr(self, key, default)
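
    # For example (attribute and values are hypothetical; `page_number`
    # is stored via the **kwargs handling in the constructor):
    #
    #     task = Task('page', url='http://example.com/', page_number=5)
    #     task.get('page_number')     # -> 5
    #     task.get('missing', 'n/a')  # -> 'n/a'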

    def process_delay_option(self, delay):
        if delay:
            self.schedule_time = datetime.utcnow() + timedelta(seconds=delay)
        else:
            self.schedule_time = None

    def setup_grab_config(self, grab_config):
        self.grab_config = copy_config(grab_config)
        self.url = grab_config['url']
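
    # For example, a task created with `delay=5` (an illustrative value)
    # gets a `schedule_time` roughly five seconds in the future, which
    # the spider uses to postpone execution:
    #
    #     task = Task('page', url='http://example.com/', delay=5)
    #     # task.schedule_time == datetime.utcnow() + timedelta(seconds=5)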

    def clone(self, **kwargs):
        """
        Clone Task instance.

        Reset network_try_count, increase task_try_count.
        Reset the priority attribute if it was not set explicitly.
        """
        # First, create an exact copy of the current Task object
        attr_copy = self.__dict__.copy()
        if attr_copy.get('grab_config') is not None:
            del attr_copy['url']
        if not attr_copy['priority_set_explicitly']:
            attr_copy['priority'] = None
        task = Task(**attr_copy)

        # Reset some task properties if they have not
        # been set explicitly in kwargs
        if 'network_try_count' not in kwargs:
            task.network_try_count = 0
        if 'task_try_count' not in kwargs:
            task.task_try_count = self.task_try_count + 1
        if 'refresh_cache' not in kwargs:
            task.refresh_cache = False
        if 'disable_cache' not in kwargs:
            task.disable_cache = False

        if kwargs.get('url') is not None and kwargs.get('grab') is not None:
            raise SpiderMisuseError('Options url and grab could not be '
                                    'used together')
        if (kwargs.get('url') is not None
                and kwargs.get('grab_config') is not None):
            raise SpiderMisuseError('Options url and grab_config could not '
                                    'be used together')
        if (kwargs.get('grab') is not None
                and kwargs.get('grab_config') is not None):
            raise SpiderMisuseError('Options grab and grab_config could not '
                                    'be used together')

        if kwargs.get('grab'):
            task.setup_grab_config(kwargs['grab'].dump_config())
            del kwargs['grab']
        elif kwargs.get('grab_config'):
            task.setup_grab_config(kwargs['grab_config'])
            del kwargs['grab_config']
        elif kwargs.get('url'):
            task.url = kwargs['url']
            if task.grab_config:
                task.grab_config['url'] = kwargs['url']
            del kwargs['url']

        for key, value in kwargs.items():
            setattr(task, key, value)
        task.process_delay_option(None)
        return task
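
    # A minimal clone sketch (hypothetical values). Cloning resets
    # `network_try_count` and bumps `task_try_count`, so it is the usual
    # way to reschedule a task that failed due to network errors:
    #
    #     task = Task('page', url='http://example.com/', priority=50)
    #     retry = task.clone(url='http://example.com/?attempt=2')
    #     # retry.network_try_count == 0
    #     # retry.task_try_count == task.task_try_count + 1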

    def __repr__(self):
        return '<Task: %s>' % self.url

    def __lt__(self, other):
        return self.priority < other.priority

    def __eq__(self, other):
        if not self.priority or not other.priority:
            return True
        else:
            return self.priority == other.priority

    def get_fallback_handler(self, spider):
        if self.fallback_name:
            return getattr(spider, self.fallback_name)
        elif self.name:
            fb_name = 'task_%s_fallback' % self.name
            if hasattr(spider, fb_name):
                return getattr(spider, fb_name)
        else:
            return None
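
# Usage sketch: a spider typically yields Task objects from
# `task_generator` and receives results in `task_<name>` handlers. The
# spider class, URLs and the fallback handler signature below are
# illustrative assumptions, not part of this module:
#
#     from grab.spider import Spider, Task
#
#     class ExampleSpider(Spider):
#         def task_generator(self):
#             yield Task('page', url='http://example.com/', priority=10)
#
#         def task_page(self, grab, task):
#             # Schedule a follow-up request; `get_fallback_handler`
#             # will resolve `task_page_fallback` by name if the task
#             # is eventually abandoned.
#             yield task.clone(url='http://example.com/next')
#
#         def task_page_fallback(self, task):
#             pass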