# Feature: define feed params (FEED_URI, FEED_FORMAT, EXPORT_URI) #61

Status: Open. Wants to merge 4 commits into `master`.
README.md: 31 additions, 0 deletions

@@ -49,6 +49,12 @@ Options:
   --server=SERVERS      servers, default: ['http://localhost:6800']
   --database-url=DATABASE_URL
                         SpiderKeeper metadata database default: sqlite:////home/souche/SpiderKeeper.db
+  --feed-uri=FEED_URI   FEED_URI scrapy setting, default: None
+  --feed-format=FEED_FORMAT
+                        FEED_FORMAT scrapy setting, default: None
+  --export-uri=EXPORT_URI
+                        Export uri (use if export uri differs from FEED_URI),
+                        default: None
   --no-auth             disable basic auth
   -v, --verbose         log level

@@ -80,6 +86,31 @@ Visit:
## Feed options

- `FEED_URI` - path Scrapy uses to store the feed.
  All Scrapy feed storages (S3, FTP, local filesystem) are supported.
- `FEED_FORMAT` - format of the exported file (e.g. `csv`, `json`)
- `EXPORT_URI` - path the feed can later be retrieved from.

`FEED_URI` and `EXPORT_URI` can contain the following params:
- `%(name)s` - spider name
- `%(create_time)s` - time of job execution start
- `%(job_id)s` - job execution id
- any other params from the `Args` field set when adding a job.

If `EXPORT_URI` is not defined, the export URI falls back to `FEED_URI`.
If `FEED_URI` is not defined either, it is simply not passed to the spider;
the same applies to an undefined `FEED_FORMAT`.

Example:
```
FEED_FORMAT = 'csv'
FEED_URI = 's3://bucket/%(name)s/%(job_id)s_%(create_time)s.csv'
EXPORT_URI = 'https://s3.amazonaws.com/bucket/%(name)s/%(job_id)s_%(create_time)s.csv'
```
Note: `boto3` needs to be installed for uploads to S3.
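
As a quick illustration of how the `%(param)s` placeholders resolve, here is a minimal sketch of the interpolation SpiderKeeper applies (the spider name, job id, and timestamp below are made up):

```
# Minimal sketch of the %(param)s substitution; all values are hypothetical.
params = {
    'name': 'demo_spider',                 # spider name
    'job_id': 42,                          # job execution id
    'create_time': '2018-03-01_14-05-00',  # time of job execution start
}

feed_uri = 's3://bucket/%(name)s/%(job_id)s_%(create_time)s.csv' % params
print(feed_uri)  # s3://bucket/demo_spider/42_2018-03-01_14-05-00.csv
```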

## TODO
- [ ] Job dashboard support filter
- [x] User Authentication
SpiderKeeper/app/proxy/spiderctrl.py: 59 additions, 3 deletions

@@ -1,6 +1,7 @@
 import datetime
 import random
 from functools import reduce
-
+from flask import current_app
+
 from SpiderKeeper.app import db
 from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, JobPriority
@@ -147,16 +148,71 @@ def start_spider(self, job_instance):
         for i in range(threshold):
             leaders.append(random.choice(candidates))
         for leader in leaders:
-            serviec_job_id = leader.start_spider(project.project_name, spider_name, arguments)
             job_execution = JobExecution()
             job_execution.project_id = job_instance.project_id
-            job_execution.service_job_execution_id = serviec_job_id
             job_execution.job_instance_id = job_instance.id
             job_execution.create_time = datetime.datetime.now()
             job_execution.running_on = leader.server
             db.session.add(job_execution)
             db.session.commit()
+
+            feed_settings = self.get_feed_params(
+                job_execution,
+                spider_name,
+                arguments
+            )
+            if feed_settings:
+                arguments['setting'] = feed_settings
+
+            service_job_id = leader.start_spider(
+                project.project_name,
+                spider_name,
+                arguments
+            )
+
+            job_execution.service_job_execution_id = service_job_id
+            db.session.commit()
+
+    def get_feed_params(self, job_execution, spider_name, args):
+        """Build FEED_URI and FEED_FORMAT entries for the spider settings.
+
+        Save the resolved EXPORT_URI to the db as well.
+        """
+        custom_settings = []
+        feed_uri, export_uri = self.get_feed_uri(
+            job_execution,
+            spider_name,
+            args
+        )
+        if feed_uri:
+            job_execution.export_uri = export_uri
+            custom_settings.append(
+                'FEED_URI={}'.format(feed_uri))
+        feed_format = current_app.config.get('FEED_FORMAT')
+        if feed_format:
+            custom_settings.append(
+                'FEED_FORMAT={}'.format(feed_format)
+            )
+        return custom_settings
+
+    @staticmethod
+    def get_feed_uri(job_execution, spider_name, args):
+        """Interpolate job params into FEED_URI and EXPORT_URI and return both."""
+        feed_uri = current_app.config.get('FEED_URI')
+        export_uri = current_app.config.get('EXPORT_URI')
+        if not feed_uri:
+            return None, None
+        params = {
+            'name': spider_name,
+            'job_id': job_execution.id,
+            'create_time':
+                job_execution.create_time.strftime('%Y-%m-%d_%H-%M-%S')
+        }
+        params.update({key: value[0] for key, value in args.items()})
+        export_uri = export_uri if export_uri else feed_uri
+        return feed_uri % params, export_uri % params

     def cancel_spider(self, job_execution):
         job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
         project = Project.find_project_by_id(job_instance.project_id)
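For context on where the `custom_settings` list ends up: `leader.start_spider` forwards the job to scrapyd, whose `schedule.json` endpoint accepts repeated `setting` fields. A rough sketch of that hand-off (the helper name and payload shape are assumptions, not taken from this diff):

```
# Hypothetical sketch of the scrapyd hand-off; each 'FEED_URI=...' or
# 'FEED_FORMAT=...' string becomes one repeated `setting` form field.
import requests

def schedule_with_settings(server, project, spider, settings):
    data = [('project', project), ('spider', spider)]
    data.extend(('setting', s) for s in settings)  # e.g. 'FEED_URI=s3://...'
    return requests.post(server + '/schedule.json', data=data).json()

# schedule_with_settings('http://localhost:6800', 'myproject', 'demo_spider',
#                        ['FEED_URI=s3://bucket/out.csv', 'FEED_FORMAT=csv'])
```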
SpiderKeeper/app/spider/model.py: 2 additions, 1 deletion

@@ -151,13 +151,14 @@ class JobExecution(Base):
     __tablename__ = 'sk_job_execution'

     project_id = db.Column(db.INTEGER, nullable=False, index=True)
-    service_job_execution_id = db.Column(db.String(50), nullable=False, index=True)
+    service_job_execution_id = db.Column(db.String(50), index=True)
     job_instance_id = db.Column(db.INTEGER, nullable=False, index=True)
     create_time = db.Column(db.DATETIME)
     start_time = db.Column(db.DATETIME)
     end_time = db.Column(db.DATETIME)
     running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING)
     running_on = db.Column(db.Text)
+    export_uri = db.Column(db.Text, nullable=True)

     def to_dict(self):
         job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first()
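The schema change has two parts: `service_job_execution_id` becomes nullable because the execution row is now committed before scrapyd returns a job id, and `export_uri` is a new column. The diff ships no migration, so an existing database presumably needs the column added by hand; a minimal sketch for the default SQLite setup (database path taken from the README, otherwise adjust):

```
# One-off migration sketch for pre-existing databases; adjust the URL as needed.
from sqlalchemy import create_engine, text

engine = create_engine('sqlite:////home/souche/SpiderKeeper.db')
with engine.begin() as conn:  # begin() commits the DDL on exit
    conn.execute(text('ALTER TABLE sk_job_execution ADD COLUMN export_uri TEXT'))
```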
SpiderKeeper/config.py: 5 additions, 0 deletions

@@ -40,3 +40,8 @@
 BASIC_AUTH_USERNAME = 'admin'
 BASIC_AUTH_PASSWORD = 'admin'
 BASIC_AUTH_FORCE = True
+
+# feed params
+FEED_FORMAT = None
+FEED_URI = None
+EXPORT_URI = None
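
Since `run.py` falls back to these values, the feed settings can also be set here directly instead of via CLI flags; for example (bucket name is illustrative):

```
# Illustrative overrides; any %(param)s placeholder documented in the README works.
FEED_FORMAT = 'csv'
FEED_URI = 's3://my-bucket/%(name)s/%(job_id)s_%(create_time)s.csv'
EXPORT_URI = 'https://s3.amazonaws.com/my-bucket/%(name)s/%(job_id)s_%(create_time)s.csv'
```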
SpiderKeeper/run.py: 16 additions, 2 deletions

@@ -13,7 +13,10 @@ def main():
         SQLALCHEMY_DATABASE_URI=opts.database_url,
         BASIC_AUTH_USERNAME=opts.username,
         BASIC_AUTH_PASSWORD=opts.password,
-        NO_AUTH=opts.no_auth
+        NO_AUTH=opts.no_auth,
+        FEED_URI=opts.feed_uri,
+        FEED_FORMAT=opts.feed_format,
+        EXPORT_URI=opts.export_uri,
     ))
     if opts.verbose:
         app.logger.setLevel(logging.DEBUG)
@@ -56,7 +59,18 @@ def parse_opts(config):
                       help='SpiderKeeper metadata database default: %s' % config.get('SQLALCHEMY_DATABASE_URI'),
                       dest='database_url',
                       default=config.get('SQLALCHEMY_DATABASE_URI'))
-
+    parser.add_option("--feed-uri",
+                      help='FEED_URI scrapy setting, default: %s' % config.get('FEED_URI'),
+                      dest='feed_uri',
+                      default=config.get('FEED_URI'))
+    parser.add_option("--feed-format",
+                      help='FEED_FORMAT scrapy setting, default: %s' % config.get('FEED_FORMAT'),
+                      dest='feed_format',
+                      default=config.get('FEED_FORMAT'))
+    parser.add_option("--export-uri",
+                      help='Export uri (use if export uri differs from FEED_URI), default: %s' % config.get('EXPORT_URI'),
+                      dest='export_uri',
+                      default=config.get('EXPORT_URI'))
     parser.add_option("--no-auth",
                       help="disable basic auth",
                       dest='no_auth',
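
Putting the new flags together, a launch might look like the following (bucket is a placeholder; the single quotes keep the shell away from the `%(...)s` placeholders):

```
spiderkeeper --server=http://localhost:6800 \
    --feed-format=csv \
    --feed-uri='s3://my-bucket/%(name)s/%(job_id)s_%(create_time)s.csv' \
    --export-uri='https://s3.amazonaws.com/my-bucket/%(name)s/%(job_id)s_%(create_time)s.csv'
```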