From 49ba42651ec1373d218a6dae3a83f2cac8ed0f81 Mon Sep 17 00:00:00 2001
From: Eric Fahlgren
Date: Sun, 20 Apr 2025 12:58:43 -0700
Subject: [PATCH] rq: add a garbage collector to the worker

Implement a maintenance hook on the standard redis queue worker to do
garbage collection on expired builds. When a result expires from the
queue, its data will be removed from the public/store/ directory at the
regular maintenance interval (default is every 600 seconds).

Signed-off-by: Eric Fahlgren
---
 .github/workflows/publish.yml |   2 +-
 asu/__init__.py               |   2 +
 asu/rq.py                     | 117 +++++++++++++++++++++++++++++++++++
 podman-compose.yml            |   2 +-
 4 files changed, 121 insertions(+), 2 deletions(-)
 create mode 100644 asu/rq.py

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index e37b2e2d..18fb0e4f 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -29,7 +29,7 @@ jobs:
       - name: Set __version__ and poetry version
         run: |
           TAG="$(git describe --tags --always | awk -F"-" '{if (NF>1) {print substr($1, 2)".post"$2} else {print substr($1, 2)}}')"
-          echo "__version__ = \"$TAG\"" > asu/__init__.py
+          sed "s/__version__.*/__version__ = \"$TAG\"/" -i asu/__init__.py
           poetry version "$TAG"
 
       - name: Build and publish PyPi package
diff --git a/asu/__init__.py b/asu/__init__.py
index 6c8e6b97..5d03f98d 100644
--- a/asu/__init__.py
+++ b/asu/__init__.py
@@ -1 +1,3 @@
 __version__ = "0.0.0"
+
+from .rq import GCWorker as GCWorker
diff --git a/asu/rq.py b/asu/rq.py
new file mode 100644
index 00000000..7c842796
--- /dev/null
+++ b/asu/rq.py
@@ -0,0 +1,117 @@
+"""RQ worker that garbage-collects expired builds and unused podman objects."""
+
+import re
+from pathlib import Path
+from shutil import rmtree
+
+from podman import PodmanClient
+from rq import Queue, Worker
+from rq.job import Job
+
+from asu.config import settings
+from asu.util import log, get_podman
+
+REQUEST_HASH_LENGTH = 64
+store: Path = settings.public_path / "store"
+# NOTE(review): evaluated at import time, and asu/__init__.py imports this
+# module, so every `import asu` calls get_podman() -- confirm that is
+# acceptable for processes that are not workers.
+podman: PodmanClient = get_podman()
+
+
+class GCWorker(Worker):
+    """A Worker class that does periodic garbage collection on ASU's
+    public store directory. We tie into the standard `Worker` maintenance
+    sequence, so the period is controlled by the base class. You may change
+    the garbage collection frequency in podman-compose.yml by adding a
+    `--maintenance-interval` option to the startup command as follows (the
+    default is 600 seconds)::
+
+        command: rqworker ... --maintenance-interval 1800
+    """
+
+    # Matches names made of exactly REQUEST_HASH_LENGTH lowercase hex digits.
+    hash_match = re.compile(f"^[0-9a-f]{{{REQUEST_HASH_LENGTH}}}$")
+
+    def _is_expired(self, request_hash: str) -> bool:
+        """Return True when none of this worker's queues holds the job.
+
+        With no queues at all there is nothing to consult, so the build is
+        treated as live rather than deleted on missing evidence.
+        """
+        expired = False
+        queue: Queue
+        for queue in self.queues:
+            job: Job | None = queue.fetch_job(request_hash)
+            log.debug(f" Found {request_hash = } {job = }")
+            if job is not None:
+                return False
+            expired = True
+        return expired
+
+    def clean_store(self) -> None:
+        """Delete build directories whose job is gone from every queue.
+
+        Only directories whose name looks like a request hash are
+        considered. A directory is removed, and counted, at most once, and
+        only when no queue served by this worker still holds its job. A
+        directory that cannot be removed is logged and skipped so that one
+        bad entry does not abort the sweep.
+
+        For performance testing, the store directory was mounted on a
+        slow external USB hard drive. A typical timing result showed ~1000
+        directories deleted per second on that test system. The synthetic
+        test directories were created containing 10 files in each.
+        File count dominated the timing, with file size being relatively
+        insignificant, likely due to `stat` calls being the bottleneck.
+        (Just for comparison, tests against store mounted on a fast SSD
+        were about twice as fast.)::
+
+            Cleaning /mnt/slow/public/store: deleted 5000/5000 builds
+            Timing analysis for clean_store: 5.081s
+        """
+        deleted: int = 0
+        total: int = 0
+        build_dir: Path
+        for build_dir in store.glob("*"):
+            name = build_dir.name
+            if not build_dir.is_dir() or not self.hash_match.match(name):
+                continue
+            total += 1
+            if not self._is_expired(name):
+                continue
+            try:
+                rmtree(build_dir)
+            except OSError as exc:
+                log.warning(f"Could not remove {build_dir}: {exc}")
+                continue
+            deleted += 1
+
+        log.info(f"Cleaning {store}: deleted {deleted}/{total} builds")
+
+    def clean_podman(self) -> None:
+        """Reclaim space from the various podman disk entities as they are orphaned."""
+        managers = (
+            ("containers", podman.containers),
+            ("images", podman.images),
+            ("volumes", podman.volumes),
+        )
+        for label, manager in managers:
+            removed = manager.prune()
+            # Treat a missing or null size as zero so the format cannot fail.
+            reclaimed = removed.get("SpaceReclaimed") or 0
+            log.info(f"Reclaimed {reclaimed:,d}B from {label}")
+
+    def run_maintenance_tasks(self) -> None:
+        """Run the base class maintenance, then both cleanup passes.
+
+        A failing cleanup pass is logged and swallowed here on purpose:
+        garbage collection is auxiliary and should not stop the worker
+        from building.
+        """
+        super().run_maintenance_tasks()
+        for task in (self.clean_store, self.clean_podman):
+            try:
+                task()
+            except Exception:
+                log.exception(f"Maintenance task {task.__name__} failed")
diff --git a/podman-compose.yml b/podman-compose.yml
index 3c467125..5c6804a6 100644
--- a/podman-compose.yml
+++ b/podman-compose.yml
@@ -22,7 +22,7 @@ services:
       context: .
       dockerfile: Containerfile
     restart: unless-stopped
-    command: rqworker --logging_level INFO
+    command: rqworker --logging_level INFO --with-scheduler --worker-class asu.GCWorker
     env_file: .env
     environment:
       REDIS_URL: "redis://redis:6379/0"