Skip to content

Commit

Permalink
Merge pull request #169 from pricingassistant/0.9.x
Browse files Browse the repository at this point in the history
0.9.x
  • Loading branch information
FlorianPerucki authored Aug 30, 2018
2 parents 9851294 + a6d89c4 commit 657dff6
Show file tree
Hide file tree
Showing 138 changed files with 4,788 additions and 2,214 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ develop-eggs
lib
lib64
__pycache__
.cache

# Installer logs
pip-log.txt
Expand Down Expand Up @@ -44,3 +45,5 @@ mrq-config.py
dump.rdb
supervisord.pid
memory_traces
mrq/dashboard/static/node_modules/
.vscode
6 changes: 4 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ services:
env:
- PYTHON_BIN=python
- PYTHON_BIN=python3
- PYTHON_BIN=/pypy/bin/pypy

before_install:
- docker ps
Expand All @@ -15,5 +16,6 @@ before_install:

# TODO: coveralls?
script:
- docker run -i -t -v `pwd`:/app:rw -w /app mrq_local $PYTHON_BIN -m pylint --errors-only --init-hook="import sys; sys.path.append('.')" -d E1103 --rcfile .pylintrc mrq
- docker run -i -t -v `pwd`:/app:rw -w /app mrq_local $PYTHON_BIN -m pytest tests/ -v --junitxml=pytest-report.xml --cov mrq --cov-report term
- docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq $PYTHON_BIN -m pylint --errors-only --init-hook="import sys; sys.path.append('.')" -d E1103 --rcfile .pylintrc mrq
# - docker run -i -t -v `pwd`:/app:rw -w /app mrq_local $PYTHON_BIN -m pytest tests/ --collect-only
- docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq $PYTHON_BIN -m pytest tests/ -v --junitxml=pytest-report.xml --cov mrq --cov-report term --timeout-method=thread --timeout=240
41 changes: 32 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ FROM debian:jessie
# https://github.com/docker-library/buildpack-deps/issues/40
#

RUN echo \
'deb ftp://ftp.us.debian.org/debian/ jessie main\n \
deb ftp://ftp.us.debian.org/debian/ jessie-updates main\n \
deb http://security.debian.org jessie/updates main\n' \
> /etc/apt/sources.list

RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10
RUN echo "deb http://repo.mongodb.org/apt/debian wheezy/mongodb-org/3.0 main" > /etc/apt/sources.list.d/mongodb-org-3.0.list
# RUN echo \
# 'deb ftp://ftp.us.debian.org/debian/ jessie main\n \
# deb ftp://ftp.us.debian.org/debian/ jessie-updates main\n \
# deb http://security.debian.org jessie/updates main\n' \
# > /etc/apt/sources.list

RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0C49F3730359A14518585931BC711F9BA15703C6
RUN echo "deb http://repo.mongodb.org/apt/debian jessie/mongodb-org/3.4 main" > /etc/apt/sources.list.d/mongodb-org-3.4.list
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl \
Expand All @@ -21,17 +21,27 @@ RUN apt-get update && \
python-pip \
python3-pip \
python3-dev \
make \
git \
vim \
mongodb-org-server \
bzip2 \
mongodb-org \
nginx redis-server \
g++ \
&& \
apt-get clean -y && \
rm -rf /var/lib/apt/lists/*

RUN curl -sL https://deb.nodesource.com/setup_7.x | bash -
RUN apt-get install -y --no-install-recommends nodejs

# Download pypy
RUN curl -sL 'https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-5.8-1-linux_x86_64-portable.tar.bz2' > /pypy.tar.bz2 && tar jxvf /pypy.tar.bz2 && rm -rf /pypy.tar.bz2 && mv /pypy-* /pypy

# Upgrade pip
RUN pip install --upgrade --ignore-installed pip
RUN pip3 install --upgrade --ignore-installed pip
RUN /pypy/bin/pypy -m ensurepip

ADD requirements-heroku.txt /app/requirements-heroku.txt
ADD requirements-base.txt /app/requirements-base.txt
Expand All @@ -50,8 +60,21 @@ RUN pip install -r /app/requirements-heroku.txt && \
pip install -r /app/requirements-dashboard.txt && \
rm -rf ~/.cache

RUN /pypy/bin/pip install -r /app/requirements-heroku.txt && \
/pypy/bin/pip install -r /app/requirements-base.txt && \
/pypy/bin/pip install -r /app/requirements-dev.txt && \
/pypy/bin/pip install -r /app/requirements-dashboard.txt && \
rm -rf ~/.cache

RUN mkdir -p /data/db

RUN ln -s /app/mrq/bin/mrq_run.py /usr/bin/mrq-run
RUN ln -s /app/mrq/bin/mrq_worker.py /usr/bin/mrq-worker
RUN ln -s /app/mrq/bin/mrq_agent.py /usr/bin/mrq-agent
RUN ln -s /app/mrq/dashboard/app.py /usr/bin/mrq-dashboard

ENV PYTHONPATH /app

VOLUME ["/data"]
WORKDIR /app

Expand Down
3 changes: 3 additions & 0 deletions Dockerfile-with-code
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Application image: layers the MRQ source tree on top of the prebuilt
# environment image. The Makefile `docker` target builds and tags
# pricingassistant/mrq-env from the main Dockerfile immediately before
# building this file, which is why the base is referenced by the ":latest" tag.
# NOTE(review): pin a versioned tag or digest here if reproducible builds
# outside of that Makefile flow are required.
FROM pricingassistant/mrq-env:latest

# COPY instead of ADD: this is a plain local directory, so none of ADD's extra
# behaviors (local tar auto-extraction, remote URL fetch) are wanted.
COPY ./mrq /app/mrq
1 change: 0 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
include *.md
recursive-include mrq/supervisord_templates *
include requirements*
recursive-include mrq/dashboard/static *
recursive-include mrq/dashboard/templates *
Expand Down
50 changes: 29 additions & 21 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,52 +1,57 @@
docker:
docker build -t mrq_local .
docker build -t pricingassistant/mrq-env .
docker build -t pricingassistant/mrq -f Dockerfile-with-code .

docker_push:
docker push pricingassistant/mrq-env:latest
docker push pricingassistant/mrq:latest

test: docker
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app mrq_local python -m pytest tests/ -v --instafail"
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env python -m pytest tests/ -v --instafail"

test3: docker
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app mrq_local python3 -m pytest tests/ -v --instafail"
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env python3 -m pytest tests/ -v --instafail"

testpypy: docker
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env /pypy/bin/pypy -m pytest tests/ -v --instafail"

shell:
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -p 8000:8000 -v `pwd`:/app:rw -w /app mrq_local bash"
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -p 8000:8000 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env bash"

reshell:
# Reconnect in the current taskqueue container
sh -c 'docker exec -t -i `docker ps | grep pricingassistant/mrq-env | cut -f 1 -d " "` bash'

shell_noport:
sh -c "docker run --rm -i -t -v `pwd`:/app:rw -w /app mrq_local bash"
sh -c "docker run --rm -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq-env bash"

docs_serve:
sh -c "docker run --rm -i -t-p 8000:8000 -v `pwd`:/app:rw -w /app mrq_local mkdocs serve"
sh -c "docker run --rm -i -t -p 8000:8000 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env mkdocs serve"

lint: docker
docker run -i -t -v `pwd`:/app:rw -w /app mrq_local pylint --init-hook="import sys; sys.path.append('.')" --rcfile .pylintrc mrq
docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq-env pylint -j 0 --init-hook="import sys; sys.path.append('.')" --rcfile .pylintrc mrq

linterrors: docker
docker run -i -t -v `pwd`:/app:rw -w /app mrq_local pylint --errors-only --init-hook="import sys; sys.path.append('.')" -d E1103 --rcfile .pylintrc mrq
docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq-env pylint -j 0 --errors-only --init-hook="import sys; sys.path.append('.')" --rcfile .pylintrc mrq

linterrors3: docker
docker run -i -t -v `pwd`:/app:rw -w /app mrq_local python3 -m pylint --errors-only --init-hook="import sys; sys.path.append('.')" -d E1103 --rcfile .pylintrc mrq
docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq-env python3 -m pylint -j 0 --errors-only --init-hook="import sys; sys.path.append('.')" --rcfile .pylintrc mrq

virtualenv:
virtualenv venv --distribute

virtualenv_pypy:
virtualenv -p /usr/bin/pypy pypy --distribute
virtualenv venv --distribute --python=python2.7

deps:
pip install -r requirements.txt
pip install -r requirements-dev.txt
pip install -r requirements-dashboard.txt

deps_pypy:
pip install git+git://github.com/schmir/gevent@pypy-hacks
pip install cffi
pip install git+git://github.com/gevent-on-pypy/pypycore
export GEVENT_LOOP=pypycore.loop
pip install -r requirements-pypy.txt

clean:
find . -path ./venv -prune -o -name "*.pyc" -exec rm {} \;
find . -name __pycache__ | xargs rm -r

build_dashboard:
cd mrq/dashboard/static && npm install && mkdir -p bin && npm run build

dashboard:
python mrq/dashboard/app.py

Expand All @@ -62,8 +67,11 @@ pep8:
autopep8:
autopep8 --max-line-length 99 -aaaaaaaa --in-place --recursive mrq

pypi: linterrors
pypi: linterrors linterrors3
python setup.py sdist upload

build_docs:
python scripts/propagate_docs.py

ensureindexes:
mrq-run mrq.basetasks.indexes.EnsureIndexes
9 changes: 3 additions & 6 deletions docs/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ The following general flags can be passed as command-line arguments to either **
- `--mongodb_jobs, --mongodb`: MongoDB URI for the jobs, scheduled_jobs & workers database. Defaults to **mongodb://127.0.0.1:27017/mrq**.
- `--mongodb_logs`: MongoDB URI for the logs database. "0" will disable remote logs, "1" will use main MongoDB. Defaults to **1**.
- `--mongodb_logs_size`: If provided, caps the log collection at that number of bytes.
- `--no_mongodb_ensure_indexes`: If provided, skip the creation of MongoDB indexes at worker startup.
- `--redis`: Redis URI. Defaults to **redis://127.0.0.1:6379**.
- `--redis_prefix`: Redis key prefix. Defaults to "mrq".
- `--redis_max_connections`: Redis max connection pool size. Defaults to **1000**.
Expand Down Expand Up @@ -46,16 +45,14 @@ You can pass additional configuration flags:

- `--max_jobs`: Gevent: max number of jobs to do before quitting. Use as a workaround for memory leaks in your tasks. Defaults to **0**
- `--max_memory`: Max memory (in Mb) after which the process will be shut down. Use with `--processes [1-N]`
to have supervisord automatically respawn the worker when this happens. Defaults to **1**
to have the worker automatically respawn when this happens. Defaults to **1**
- `--greenlets, --gevent, --g`: Max number of greenlets to use. Defaults to **1**.
- `--processes, --p`: Number of processes to launch with supervisord. Defaults to **0** (no supervisord).
- `--supervisord_template`: Path of supervisord template to use. Defaults to **supervisord_templates/default.conf**.
- `--processes, --p`: Number of processes to launch. Defaults to **0**.
- `--scheduler`: Run the scheduler. Defaults to **false**.
- `--scheduler_interval`: Seconds between scheduler checks. Defaults to **60** seconds, only ints are acceptable.
- `--report_interval`: Seconds between worker reports to MongoDB. Defaults to **10** seconds, floats are acceptable too.
- `--report_file`: Filepath of a json dump of the worker status. Disabled if none.
- `--subqueues_refresh_interval`: Seconds between worker refreshes of the known subqueues.
- `--subqueues_delimiter`: Delimiter between main queue and subqueue names.
- `--paused_queues_refresh_interval`: Seconds between worker refreshes of the paused queues list.
- `--admin_port`: Start an admin server on this port, if provided. Incompatible with --processes. Defaults to **0**
- `--admin_ip`: IP for the admin server to listen on. Use "0.0.0.0" to allow access from outside. Defaults to **127.0.0.1**.
Expand All @@ -71,7 +68,7 @@ The default is to run tasks one at a time. You should obviously change this beha

This will start 30 greenlets over 3 UNIX processes. Each of them will run 10 jobs at the same time.

As soon as you use the `--processes` option (even with `--processes=1`) then supervisord will be used to control the processes. It is quite useful to manage long-running instances.
The worker manages its own processes autonomously. This is quite useful for managing long-running instances.

### Simulating network latency

Expand Down
6 changes: 2 additions & 4 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ Remember, put mrq-config.py in your workers directory.
MONGODB_JOBS = "mongodb://127.0.0.1:27017/mrq" # MongoDB URI for the jobs, scheduled_jobs & workers database.Defaults to mongodb://127.0.0.1:27017/mrq
MONGODB_LOGS = 1 #MongoDB URI for the logs database."0" will disable remote logs, "1" will use main MongoDB.Defaults to 1
MONGODB_LOGS_SIZE = None #If provided, sets the log collection to capped to that amount of bytes.
NO_MONGODB_ENSURE_INDEXES = None #If provided, skip the creation of MongoDB indexes at worker startup.

#Redis settings
REDIS = "redis://127.0.0.1:6379" #Redis URI.Defaults to redis://127.0.0.1:6379
Expand Down Expand Up @@ -57,10 +56,9 @@ USE_LARGE_JOB_IDS = False #Do not use compacted job IDs in Redis. For compatibil
QUEUES = ("default",) # The queues to listen on.Defaults to default , which will listen on all queues.
MAX_JOBS = 0 #Gevent:max number of jobs to do before quitting. Workaround for memory leaks in your tasks. Defaults to 0
MAX_TIME = 0 # max number of seconds a worker runs before quitting
MAX_MEMORY = 1 #Max memory (in Mb) after which the process will be shut down. Use with PROCESS = [1-N] to have supervisord automatically respawn the worker when this happens.Defaults to 1
MAX_MEMORY = 1 # Max memory (in Mb) after which the process will be shut down. Use with PROCESSES = [1-N] to have the worker automatically respawned when this happens. Defaults to 1
GRENLETS = 1 #Max number of greenlets to use.Defaults to 1.
PROCESSES = 0 #Number of processes to launch with supervisord.Defaults to 0.
SUPERVISORD_TEMPLATE = "supervisord_templates/default.conf" #Path of supervisord template to use. Defaults to supervisord_templates/default.conf.
PROCESSES = 0 # Number of processes to launch. Defaults to 0.
SCHEDULER = False #Run the scheduler.Defaults to False.
SCHEDULER_INTERVAL = 60 #Seconds between scheduler checks.Defaults to 60 seconds, only ints are acceptable.
REPORT_INTERVAL = 10.5 #Seconds between worker reports to MongoDB.Defaults to 10 seconds, floats are acceptable too.
Expand Down
8 changes: 0 additions & 8 deletions docs/design.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/get-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,3 @@ This was a preview on the very basic features of MRQ. What makes it actually use
* You can run multiple workers in parallel. Each worker can also run multiple greenlets in parallel.
* Workers can dequeue from multiple queues
* You can queue jobs from your Python code to avoid using `mrq-run` from the command-line.

These features will be demonstrated in a future example of a simple web crawler.
1 change: 0 additions & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ MRQ was first developed at [Pricing Assistant](http://pricingassistant.com) and
* **Great [dashboard](http://mrq.readthedocs.org/en/latest/dashboard/):** Have visibility and control on everything: queued jobs, current jobs, worker status, ...
* **Per-job logs:** Get the log output of each task separately in the dashboard
* **Gevent worker:** IO-bound tasks can be done in parallel in the same UNIX process for maximum throughput
* **Supervisord integration:** CPU-bound tasks can be split across several UNIX processes with a single command-line flag
* **Job management:** You can retry, requeue, cancel jobs from the code or the dashboard.
* **Performance:** Bulk job queueing, easy job profiling
* **Easy [configuration](http://mrq.readthedocs.org/en/latest/configuration):** Every aspect of MRQ is configurable through command-line flags or a configuration file
Expand Down
13 changes: 2 additions & 11 deletions docs/jobs-maintenance.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,9 @@ SCHEDULER_TASKS = [
"interval": 3600
},
# This will requeue jobs 'lost' between redis.blpop() and mongo.update(status=started).
# This can happen only when the worker is killed brutally in the middle of dequeue_jobs()
# This will make sure MRQ's indexes are built
{
"path": "mrq.basetasks.cleaning.RequeueLostJobs",
"params": {},
"interval": 24 * 3600
},
# This will clean the list of known queues in Redis. It will mostly remove empty queues
# so that they are not displayed in the dashboard anymore.
{
"path": "mrq.basetasks.cleaning.CleanKnownQueues",
"path": "mrq.basetasks.indexes.EnsureIndexes",
"params": {},
"interval": 24 * 3600
}
Expand Down
2 changes: 1 addition & 1 deletion docs/jobs.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ However, to be reliable a task queue needs to prepare for everything that can go
* ```retry```: The method `task.retry()` was called to interrupt the job but mark it for being retried later. This may be useful when calling unreliable 3rd-party services.
* ```maxretries```: The task was retried too many times. Max retries default to 3 and can be configured globally or per task. At this point it should be up to you to cancel them or requeue them again.

Only jobs in statuses `success` and `cancel` will be cleaned from MongoDB after a delay of `result_ttl` seconds (see [Task configuration](configuration.md))
Jobs in status `success` will be cleaned from MongoDB after a delay of `result_ttl` seconds (see [Task configuration](configuration.md)).

## Task API

Expand Down
4 changes: 3 additions & 1 deletion docs/performance.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Performance
# Worker performance

Performance is an explicit goal of MRQ as it was first developed at [Pricing Assistant](http://www.pricingassistant.com/) for crawling billions of web pages.

Expand All @@ -8,6 +8,8 @@ On a regular Macbook Pro, we see 1300 jobs/second in a single worker process wit

However, what we are really measuring there is MongoDB's write performance. An install of MRQ with properly scaled MongoDB and Redis instances is capable of much more.

For more, see our tutorial on [Queue performance](queue-performance.md).

## PyPy support

Earlier in its development MRQ was tested successfully on PyPy but we are waiting for better PyPy+gevent support to continue working on it, as performance was worse than CPython.
Expand Down
Loading

0 comments on commit 657dff6

Please sign in to comment.