Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# CI pipeline: lint -> validate compute block config -> test -> build image.
# Each job waits for the previous one via `needs`.
name: CI
on:
  push:
    branches:
      - "main"
  pull_request:

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  lint-python:
    name: Lint Python
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          cache: "pip"

      - name: Run flake8
        uses: py-actions/flake8@v2

  validate-compute-block:
    name: Validate Compute Block Config
    runs-on: ubuntu-latest
    needs: lint-python
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Same pip cache setting as the other Python jobs in this workflow.
          cache: "pip"

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      # Fails the job when cbc.yaml is missing or when _compare_configs
      # raises for the code-defined block vs. the one loaded from cbc.yaml.
      - name: Check cbcs
        run: |
          python3 - <<'EOF'
          import main

          from scystream.sdk.config import load_config, get_compute_block
          from scystream.sdk.config.config_loader import _compare_configs
          from pathlib import Path

          CBC_PATH = Path("cbc.yaml")

          if not CBC_PATH.exists():
              raise FileNotFoundError("cbc.yaml not found in repo root.")

          block_from_code = get_compute_block()
          block_from_yaml = load_config(str(CBC_PATH))

          _compare_configs(block_from_code, block_from_yaml)

          print("cbc.yaml matches python code definition")
          EOF

  run-test:
    name: Run Tests
    runs-on: ubuntu-latest
    needs: validate-compute-block
    # Service containers the test suite talks to on localhost:9000 (MinIO)
    # and localhost:5432 (Postgres).
    services:
      minio:
        # NOTE(review): differs from the quay.io/minio/minio image used in
        # docker-compose.yml - presumably because a service container cannot
        # be given a start command; confirm.
        image: lazybit/minio
        ports:
          - "9000:9000"
        env:
          MINIO_ROOT_USER: minioadmin
          MINIO_ROOT_PASSWORD: minioadmin
        options: >-
          --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
          --health-interval 5s
          --health-retries 5
          --health-timeout 5s
      postgres:
        image: postgres:15
        ports:
          - "5432:5432"
        env:
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
        options: >-
          --health-cmd="pg_isready -U postgres"
          --health-interval=5s
          --health-retries=10
          --health-timeout=5s
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          cache: "pip"

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Run Tests
        run: pytest -vv

  build:
    name: Build docker image
    runs-on: ubuntu-latest
    needs: run-test
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      # The registry is ghcr.io (see env.REGISTRY), authenticated with the
      # workflow's GITHUB_TOKEN.
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Tags: the PR ref on pull_request events, plus `latest` only when the
      # ref is refs/heads/main.
      - name: Extract metadata for docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/topic-modeling
          tags: |
            type=ref, event=pr
            type=raw, value=latest, enable=${{ (github.ref == format('refs/heads/{0}', 'main')) }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

44 changes: 0 additions & 44 deletions .github/workflows/docker.yaml

This file was deleted.

Empty file added algorithms/__init__.py
Empty file.
15 changes: 14 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,17 @@ services:
- POSTGRES_PASSWORD=postgres
- POSTGRES_DB=postgres
ports:
- "5432:5432"
- "5432:5432"

# MinIO object store for local development: port 9000 is published alongside
# 9001, the console address set in `command`.
minio:
  image: "quay.io/minio/minio"
  restart: "always"
  command: 'server /data --console-address ":9001"'
  environment:
    MINIO_ROOT_USER: "minioadmin"
    MINIO_ROOT_PASSWORD: "minioadmin"
    MINIO_LOG_LEVEL: "debug"
  ports:
    - "9000:9000"
    - "9001:9001"

Binary file added dtm.pkl
Binary file not shown.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ numpy==2.3.3
pandas==2.3.2
SQLAlchemy==2.0.43
psycopg2-binary==2.9.10
pytest==9.0.1
Empty file added test/__init__.py
Empty file.
152 changes: 152 additions & 0 deletions test/test_lda_entrypoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import os
import boto3
import pytest
import psycopg2
import time
import pandas as pd

from pathlib import Path
from botocore.exceptions import ClientError
from main import lda_topic_modeling

# MinIO credentials; same values as MINIO_ROOT_USER / MINIO_ROOT_PASSWORD in
# the CI service container and docker-compose.yml.
MINIO_USER = "minioadmin"
MINIO_PWD = "minioadmin"
# Bucket the s3_minio fixture creates and the test uploads its inputs to.
BUCKET_NAME = "testbucket"

# Postgres credentials; same values as POSTGRES_USER / POSTGRES_PASSWORD in
# the CI service container and docker-compose.yml.
POSTGRES_USER = "postgres"
POSTGRES_PWD = "postgres"

# Expected column count of the doc-topic table asserted in the test.
N_TOPICS = 5


def ensure_bucket(s3, bucket):
    """Make sure ``bucket`` exists on the given S3 client.

    The bucket is probed with ``head_bucket``. A ``ClientError`` whose error
    code is ``"404"`` or ``"NoSuchBucket"`` leads to ``create_bucket``; a
    ``ClientError`` with any other code is re-raised unchanged.
    """
    try:
        s3.head_bucket(Bucket=bucket)
    except ClientError as err:
        code = err.response["Error"]["Code"]
        if code != "404" and code != "NoSuchBucket":
            raise
        s3.create_bucket(Bucket=bucket)


def download_to_tmp(s3, bucket, key):
    """Download ``key`` from ``bucket`` into ``/tmp`` and return the local path.

    Slashes in the key are replaced with underscores so the object lands
    directly inside ``/tmp`` rather than in nested directories.
    """
    flat_name = "_".join(key.split("/"))
    local_file = Path("/tmp").joinpath(flat_name)
    s3.download_file(bucket, key, str(local_file))
    return local_file


@pytest.fixture
def s3_minio():
    """Return a boto3 S3 client for the MinIO endpoint on localhost:9000.

    The client authenticates with MINIO_USER / MINIO_PWD, and BUCKET_NAME is
    created through ``ensure_bucket`` before the client is handed to the test.
    """
    minio_settings = {
        "endpoint_url": "http://localhost:9000",
        "aws_access_key_id": MINIO_USER,
        "aws_secret_access_key": MINIO_PWD,
    }
    s3_client = boto3.client("s3", **minio_settings)
    ensure_bucket(s3_client, BUCKET_NAME)
    return s3_client


@pytest.fixture(scope="session")
def postgres_conn():
    """Wait until postgres is ready, then yield a live connection.

    Connection attempts are retried up to 30 times with a one second pause
    between attempts. The connection is switched to autocommit, yielded once,
    and closed on teardown.

    Raises:
        RuntimeError: if no attempt succeeds; the last connection error is
            attached as the cause.
    """
    conn = None
    last_error = None
    for _ in range(30):
        try:
            conn = psycopg2.connect(
                host="127.0.0.1",
                port=5432,
                user=POSTGRES_USER,
                password=POSTGRES_PWD,
                database="postgres"
            )
            break
        except Exception as exc:
            # Only the connect call is retried; keep the error for reporting.
            last_error = exc
            time.sleep(1)

    if conn is None:
        raise RuntimeError("Postgres did not start") from last_error

    conn.autocommit = True
    # The yield sits outside the retry loop so a teardown error can never be
    # swallowed and turned into a reconnect plus a second yield.
    try:
        yield conn
    finally:
        conn.close()


def test_lda_entrypoint(s3_minio, postgres_conn):
    """Run ``lda_topic_modeling`` end to end against MinIO and Postgres.

    Uploads the pickled ``dtm`` and ``vocab`` files from ``test/files`` to the
    test bucket, sets the environment variables listed below, calls the
    entrypoint, then checks the two Postgres tables named in that
    environment.
    """
    input_dtm_file_name = "dtm"
    input_vocab_file_name = "vocab"

    doc_topic_table_name = "doc_topic"
    topic_terms_table_name = "topic_terms"

    files_dir = Path(__file__).parent / "files"
    dtm_bytes = (files_dir / f"{input_dtm_file_name}.pkl").read_bytes()
    vocab_bytes = (files_dir / f"{input_vocab_file_name}.pkl").read_bytes()

    s3_minio.put_object(
        Bucket=BUCKET_NAME,
        Key=f"{input_dtm_file_name}.pkl",
        Body=dtm_bytes
    )
    s3_minio.put_object(
        Bucket=BUCKET_NAME,
        Key=f"{input_vocab_file_name}.pkl",
        Body=vocab_bytes
    )

    # NOTE(review): the key prefixes presumably match the input/output names
    # declared by the compute block - confirm against main.py / cbc.yaml.
    env = {
        # Derived from the constant so this setting and the column-count
        # assertion below cannot drift apart.
        "N_TOPICS": str(N_TOPICS),

        "dtm_S3_HOST": "http://127.0.0.1",
        "dtm_S3_PORT": "9000",
        "dtm_S3_ACCESS_KEY": MINIO_USER,
        "dtm_S3_SECRET_KEY": MINIO_PWD,
        "dtm_BUCKET_NAME": BUCKET_NAME,
        "dtm_FILE_PATH": "",
        "dtm_FILE_NAME": input_dtm_file_name,
        "dtm_FILE_EXT": "pkl",

        "vocab_S3_HOST": "http://127.0.0.1",
        "vocab_S3_PORT": "9000",
        "vocab_S3_ACCESS_KEY": MINIO_USER,
        "vocab_S3_SECRET_KEY": MINIO_PWD,
        "vocab_BUCKET_NAME": BUCKET_NAME,
        "vocab_FILE_PATH": "",
        "vocab_FILE_NAME": input_vocab_file_name,
        "vocab_FILE_EXT": "pkl",

        "docs_to_topics_PG_HOST": "127.0.0.1",
        "docs_to_topics_PG_PORT": "5432",
        "docs_to_topics_PG_USER": POSTGRES_USER,
        "docs_to_topics_PG_PASS": POSTGRES_PWD,
        "docs_to_topics_DB_TABLE": doc_topic_table_name,

        "top_terms_per_topic_PG_HOST": "127.0.0.1",
        "top_terms_per_topic_PG_PORT": "5432",
        "top_terms_per_topic_PG_USER": POSTGRES_USER,
        "top_terms_per_topic_PG_PASS": POSTGRES_PWD,
        "top_terms_per_topic_DB_TABLE": topic_terms_table_name,
    }

    os.environ.update(env)

    lda_topic_modeling()

    # The context manager closes the cursor even when an assertion fails.
    with postgres_conn.cursor() as cur:
        # 1. doc-topic distribution
        cur.execute(f"SELECT * FROM {doc_topic_table_name} ORDER BY 1;")
        doc_topics = pd.DataFrame(
            cur.fetchall(),
            columns=[desc[0] for desc in cur.description],
        )
        # NOTE(review): 26 is presumably the number of documents in the dtm
        # fixture - confirm.
        assert len(doc_topics) == 26
        assert doc_topics.shape[1] == N_TOPICS

        # 2. topic-term listing
        # Each replacement field stays on one line: a line break inside the
        # braces of a single-quoted f-string is a SyntaxError before 3.12.
        cur.execute(
            f"SELECT * FROM {topic_terms_table_name} "
            "ORDER BY topic_id, weight DESC;"
        )
        topic_terms = pd.DataFrame(
            cur.fetchall(),
            columns=[desc[0] for desc in cur.description],
        )
        assert len(topic_terms) > 0
        assert "term" in topic_terms.columns
Loading
Loading