diff --git a/README.md b/README.md index 52c81f11..62833815 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,12 @@ Options: --server=SERVERS servers, default: ['http://localhost:6800'] --database-url=DATABASE_URL SpiderKeeper metadata database default: sqlite:////home/souche/SpiderKeeper.db + --feed-uri=FEED_URI FEED_URI scrapy setting, default: None + --feed-format=FEED_FORMAT + FEED_FORMAT scrapy setting, default: None + --export-uri=EXPORT_URI + Export uri (use if export uri differs from FEED_URI), + default: None --no-auth disable basic auth -v, --verbose log level @@ -80,6 +86,31 @@ Visit: ``` +## Feed options + +- `FEED_URI` - path that is used by scrapy to store the feed. +All storages (s3, ftp, local filesystem) are supported. +- `FEED_FORMAT` - exported file format +- `EXPORT_URI` - path where the feed can be retrieved from. + +`FEED_URI` and `EXPORT_URI` can contain the following params: +- `%(name)s` - spider name +- `%(create_time)s` - time of job execution start +- `%(job_id)s` - job execution id +- any other params from `Args` set while adding jobs. + +If `EXPORT_URI` is not defined, the export uri is equal to `FEED_URI`. +If `FEED_URI` is also not defined, it is not passed to the spider. +The same applies to `FEED_FORMAT`. + +Example: +``` +FEED_FORMAT = 'csv' +FEED_URI = 's3://bucket/%(name)s/%(job_id)s_%(create_time)s.csv' +EXPORT_URI = 'https://s3.amazonaws.com/bucket/%(name)s/%(job_id)s_%(create_time)s.csv' +``` +Note: you need to install `boto3` for uploading to `s3`. 
+ ## TODO - [ ] Job dashboard support filter - [x] User Authentication diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py index 2e47cfab..0c164562 100644 --- a/SpiderKeeper/app/proxy/spiderctrl.py +++ b/SpiderKeeper/app/proxy/spiderctrl.py @@ -1,6 +1,7 @@ import datetime import random -from functools import reduce + +from flask import current_app from SpiderKeeper.app import db from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, JobPriority @@ -147,16 +148,71 @@ def start_spider(self, job_instance): for i in range(threshold): leaders.append(random.choice(candidates)) for leader in leaders: - serviec_job_id = leader.start_spider(project.project_name, spider_name, arguments) job_execution = JobExecution() job_execution.project_id = job_instance.project_id - job_execution.service_job_execution_id = serviec_job_id job_execution.job_instance_id = job_instance.id job_execution.create_time = datetime.datetime.now() job_execution.running_on = leader.server db.session.add(job_execution) db.session.commit() + feed_settings = self.get_feed_params( + job_execution, + spider_name, + arguments + ) + if feed_settings: + arguments['setting'] = feed_settings + + service_job_id = leader.start_spider( + project.project_name, + spider_name, + arguments + ) + + job_execution.service_job_execution_id = service_job_id + db.session.commit() + + def get_feed_params(self, job_execution, spider_name, args): + """Pass FEED_URI and FEED_FORMAT params to spider settings. + + Save EXPORT_URI to db as well. 
+ + """ + custom_settings = [] + feed_uri, export_uri = self.get_feed_uri( + job_execution, + spider_name, + args + ) + if feed_uri: + job_execution.export_uri = export_uri + custom_settings.append( + 'FEED_URI={}'.format(feed_uri)) + feed_format = current_app.config.get('FEED_FORMAT') + if feed_format: + custom_settings.append( + 'FEED_FORMAT={}'.format(feed_format) + ) + return custom_settings + + @staticmethod + def get_feed_uri(job_execution, spider_name, args): + """Pass params to FEED_URI and EXPORT_URI and return the result.""" + feed_uri = current_app.config.get('FEED_URI') + export_uri = current_app.config.get('EXPORT_URI') + if not feed_uri: + return None, None + params = { + 'name': spider_name, + 'job_id': job_execution.id, + 'create_time': + job_execution.create_time.strftime('%Y-%m-%d_%H-%M-%S') + } + params.update({key: value[0] for key, value in args.items()}) + export_uri = export_uri if export_uri else feed_uri + return feed_uri % params, export_uri % params + def cancel_spider(self, job_execution): job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id) project = Project.find_project_by_id(job_instance.project_id) diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index 5376602b..4e2526cd 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -151,13 +151,14 @@ class JobExecution(Base): __tablename__ = 'sk_job_execution' project_id = db.Column(db.INTEGER, nullable=False, index=True) - service_job_execution_id = db.Column(db.String(50), nullable=False, index=True) + service_job_execution_id = db.Column(db.String(50), index=True) job_instance_id = db.Column(db.INTEGER, nullable=False, index=True) create_time = db.Column(db.DATETIME) start_time = db.Column(db.DATETIME) end_time = db.Column(db.DATETIME) running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING) running_on = db.Column(db.Text) + export_uri = db.Column(db.Text, nullable=True) def 
to_dict(self): job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first() diff --git a/SpiderKeeper/config.py b/SpiderKeeper/config.py index 8f0722d9..21b954af 100644 --- a/SpiderKeeper/config.py +++ b/SpiderKeeper/config.py @@ -40,3 +40,8 @@ BASIC_AUTH_USERNAME = 'admin' BASIC_AUTH_PASSWORD = 'admin' BASIC_AUTH_FORCE = True + +# feed params +FEED_FORMAT = None +FEED_URI = None +EXPORT_URI = None diff --git a/SpiderKeeper/run.py b/SpiderKeeper/run.py index c50887f9..94e159fd 100644 --- a/SpiderKeeper/run.py +++ b/SpiderKeeper/run.py @@ -13,7 +13,10 @@ def main(): SQLALCHEMY_DATABASE_URI=opts.database_url, BASIC_AUTH_USERNAME=opts.username, BASIC_AUTH_PASSWORD=opts.password, - NO_AUTH=opts.no_auth + NO_AUTH=opts.no_auth, + FEED_URI=opts.feed_uri, + FEED_FORMAT=opts.feed_format, + EXPORT_URI=opts.export_uri, )) if opts.verbose: app.logger.setLevel(logging.DEBUG) @@ -56,7 +59,18 @@ def parse_opts(config): help='SpiderKeeper metadata database default: %s' % config.get('SQLALCHEMY_DATABASE_URI'), dest='database_url', default=config.get('SQLALCHEMY_DATABASE_URI')) - + parser.add_option("--feed-uri", + help='FEED_URI scrapy setting, default: %s' % config.get('FEED_URI'), + dest='feed_uri', + default=config.get('FEED_URI')) + parser.add_option("--feed-format", + help='FEED_FORMAT scrapy setting, default: %s' % config.get('FEED_FORMAT'), + dest='feed_format', + default=config.get('FEED_FORMAT')) + parser.add_option("--export-uri", + help='Export uri (use if export uri differs from FEED_URI), default: %s' % config.get('EXPORT_URI'), + dest='export_uri', + default=config.get('EXPORT_URI')) parser.add_option("--no-auth", help="disable basic auth", dest='no_auth',