from __future__ import absolute_import
from datetime import datetime, timedelta
from grab.spider.error import SpiderMisuseError
from grab.base import copy_config
from grab.util.warning import warn
class BaseTask(object):
pass
class Task(BaseTask):
"""
Task for spider.
"""
    def __init__(self, name=None, url=None, grab=None, grab_config=None,
priority=None, priority_set_explicitly=True,
network_try_count=0, task_try_count=1,
disable_cache=False, refresh_cache=False,
valid_status=None, use_proxylist=True,
cache_timeout=None, delay=None,
raw=False, callback=None,
fallback_name=None,
**kwargs):
"""
        Create a `Task` object.

        If more than one of the `url`, `grab` and `grab_config` options
        is non-empty, they are processed in the following order:

        * `grab` overrides `grab_config`
        * `grab_config` overrides `url`
Args:
        :param name: name of the task. After a successful network
            operation the task's result will be passed to the
            `task_<name>` method.
        :param url: URL of the network document. Every task requires
            either the `url` or the `grab` option to be specified.
        :param grab: configured `Grab` instance. You can use this
            option when the `url` option is not enough. Do not forget
            to configure the `url` option of the `Grab` instance,
            because in this case the `url` option of the `Task`
            constructor will be overwritten with `grab.config['url']`.
        :param priority: priority of the Task. Tasks with lower
            priority will be processed earlier. By default each new
            task is assigned a random priority from the range
            (80, 100).
        :param priority_set_explicitly: internal flag which tells
            whether the task priority was assigned manually or
            generated by the spider according to its priority
            generation rules.
        :param network_try_count: you will probably not need to use
            it. It is used internally to count how many times this
            task was restarted due to network errors. The `Spider`
            instance has a `network_try_limit` option; when the
            `network_try_count` attribute of the task exceeds
            `network_try_limit`, processing of the task is abandoned.
        :param task_try_count: the same as `network_try_count`, but it
            is increased only when you use the `clone` method. You can
            also set it manually. It is useful if you want to restart
            the task after it was cancelled due to multiple network
            errors. As you might have guessed, there is a
            `task_try_limit` option in the `Spider` instance. Together
            the `network_try_limit` and `task_try_limit` options
            guarantee that you will not get an infinite loop of
            restarting some task.
        :param disable_cache: if `True`, disable the cache subsystem.
            The document will be fetched from the network and will not
            be saved to the cache.
        :param refresh_cache: if `True`, the document will be fetched
            from the network and then saved to the cache.
        :param valid_status: extra status codes which count as valid.
        :param use_proxylist: whether to use the proxy list configured
            via the `setup_proxylist` method of the spider.
        :param delay: if specified, tells the spider to schedule the
            task and execute it after `delay` seconds.
        :param raw: if `raw` is `True` then the network response is
            forwarded to the corresponding handler without any check
            of the HTTP status code or network errors; if `raw` is
            `False` (the default) then a failed response is put back
            into the task queue, or, if the tries limit has been
            reached, the processing of this request is finished.
        :param callback: if you pass a function in the `callback`
            option then the network response will be passed to that
            callback, the usual `task_*` handler will be ignored, and
            no error will be raised if such a `task_*` handler does
            not exist.
        :param fallback_name: the name of the method that is called
            when the spider gives up on the task (due to multiple
            network errors).

        Any non-standard named arguments passed to the `Task`
        constructor will be saved as attributes of the object. You can
        read their values later as attributes or with the `get`
        method, which allows you to supply a default value for
        attributes that do not exist.
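
        Usage example (a minimal sketch; the handler name, URL and the
        extra `tag` attribute are illustrative)::

            task = Task('page', url='http://example.com/', tag='home')
            # After a successful fetch the spider calls the `task_page`
            # handler; `task.tag` is available inside that handler.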
"""
if name == 'generator':
# The name "generator" is restricted because
# `task_generator` handler could not be created because
# this name is already used for special method which
# generates new tasks
raise SpiderMisuseError('Task name could not be "generator"')
self.name = name
if url is None and grab is None and grab_config is None:
            raise SpiderMisuseError('One of the url, grab or grab_config '
                                    'arguments of the Task constructor '
                                    'must be provided')
if url is not None and grab is not None:
raise SpiderMisuseError('Options url and grab could not be used '
'together')
if url is not None and grab_config is not None:
raise SpiderMisuseError('Options url and grab_config could not be '
'used together')
if grab is not None and grab_config is not None:
raise SpiderMisuseError(
'Options grab and grab_config could not be used together')
if grab:
self.setup_grab_config(grab.dump_config())
elif grab_config:
self.setup_grab_config(grab_config)
else:
self.grab_config = None
self.url = url
if valid_status is None:
self.valid_status = []
else:
self.valid_status = valid_status
self.process_delay_option(delay)
self.cache_timeout = cache_timeout
if cache_timeout is not None:
warn(
'Option `cache_timeout` is deprecated and'
' is not supported anymore'
)
self.fallback_name = fallback_name
self.priority_set_explicitly = priority_set_explicitly
self.priority = priority
self.network_try_count = network_try_count
self.task_try_count = task_try_count
self.disable_cache = disable_cache
self.refresh_cache = refresh_cache
self.use_proxylist = use_proxylist
self.raw = raw
self.callback = callback
self.coroutines_stack = []
for key, value in kwargs.items():
setattr(self, key, value)
    def get(self, key, default=None):
"""
        Return the value of the attribute `key`, or `default` if such
        an attribute does not exist.
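
        Example (the `foo` attribute is illustrative)::

            task = Task('page', url='http://example.com/', foo='bar')
            task.get('foo')           # -> 'bar'
            task.get('missing', 42)   # -> 42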
"""
return getattr(self, key, default)
def process_delay_option(self, delay):
if delay:
self.schedule_time = datetime.utcnow() + timedelta(seconds=delay)
else:
self.schedule_time = None
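
    # Example of the `delay` option (illustrative URL): a task created as
    # Task('page', url='http://example.com/', delay=5) gets `schedule_time`
    # set to roughly utcnow() + 5 seconds, and the spider does not execute
    # it before that moment.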
def setup_grab_config(self, grab_config):
self.grab_config = copy_config(grab_config)
self.url = grab_config['url']
    def clone(self, **kwargs):
"""
        Clone the Task instance.

        Resets `network_try_count`, increases `task_try_count`, and
        resets the `priority` attribute if it was not set explicitly.
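
        Example (illustrative URLs)::

            task = Task('page', url='http://example.com/page/1')
            next_task = task.clone(url='http://example.com/page/2')
            # next_task.task_try_count == task.task_try_count + 1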
"""
# First, create exact copy of the current Task object
attr_copy = self.__dict__.copy()
if attr_copy.get('grab_config') is not None:
del attr_copy['url']
if not attr_copy['priority_set_explicitly']:
attr_copy['priority'] = None
task = Task(**attr_copy)
# Reset some task properties if they have not
# been set explicitly in kwargs
if 'network_try_count' not in kwargs:
task.network_try_count = 0
if 'task_try_count' not in kwargs:
task.task_try_count = self.task_try_count + 1
if 'refresh_cache' not in kwargs:
task.refresh_cache = False
if 'disable_cache' not in kwargs:
task.disable_cache = False
if kwargs.get('url') is not None and kwargs.get('grab') is not None:
raise SpiderMisuseError('Options url and grab could not be '
'used together')
if (kwargs.get('url') is not None and
kwargs.get('grab_config') is not None):
raise SpiderMisuseError('Options url and grab_config could not '
'be used together')
if (kwargs.get('grab') is not None and
kwargs.get('grab_config') is not None):
raise SpiderMisuseError('Options grab and grab_config could not '
'be used together')
if kwargs.get('grab'):
task.setup_grab_config(kwargs['grab'].dump_config())
del kwargs['grab']
elif kwargs.get('grab_config'):
task.setup_grab_config(kwargs['grab_config'])
del kwargs['grab_config']
elif kwargs.get('url'):
task.url = kwargs['url']
if task.grab_config:
task.grab_config['url'] = kwargs['url']
del kwargs['url']
for key, value in kwargs.items():
setattr(task, key, value)
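        # Note: the clone is scheduled for immediate execution; any
        # `schedule_time` of the original task is discarded below.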
task.process_delay_option(None)
return task
def __repr__(self):
return '<Task: %s>' % self.url
def __lt__(self, other):
return self.priority < other.priority
def __eq__(self, other):
        # A task with no assigned priority compares equal to any other
        # task, so comparisons never fail inside the priority queue.
        if not self.priority or not other.priority:
            return True
else:
return self.priority == other.priority
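
    # The comparison hooks above let `Task` objects be stored in a
    # priority queue. A minimal sketch (URLs are illustrative):
    #
    #     from queue import PriorityQueue
    #     queue = PriorityQueue()
    #     queue.put(Task('a', url='http://example.com/a', priority=50))
    #     queue.put(Task('b', url='http://example.com/b', priority=10))
    #     queue.get()  # the priority=10 task comes out first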
def get_fallback_handler(self, spider):
if self.fallback_name:
return getattr(spider, self.fallback_name)
elif self.name:
fb_name = 'task_%s_fallback' % self.name
if hasattr(spider, fb_name):
return getattr(spider, fb_name)
else:
return None
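
# Fallback resolution sketch (illustrative; the spider class and the
# handler signatures below are assumptions, not part of this module):
# for a task created as Task(name='page', ...) with no explicit
# `fallback_name`, `get_fallback_handler` returns the spider's
# `task_page_fallback` method if it exists:
#
#     class ExampleSpider(Spider):
#         def task_page(self, grab, task):
#             ...
#
#         def task_page_fallback(self, task):
#             ...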