Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# CI pipeline: lint -> validate compute-block config -> test -> build & push image.
name: CI
on:
  push:
    branches:
      - "main"
  pull_request:

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  lint-python:
    name: Lint Python
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          cache: "pip"

      - name: Run flake8
        uses: py-actions/flake8@v2

  validate-compute-block:
    name: Validate Compute Block Config
    runs-on: ubuntu-latest
    needs: lint-python
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Cache pip like the sibling jobs do (was missing here).
          cache: "pip"

      # Fixed typo: "Intall" -> "Install".
      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Check cbcs
        run: |
          python3 - <<'EOF'
          import main

          from scystream.sdk.config import load_config, get_compute_block
          from scystream.sdk.config.config_loader import _compare_configs
          from pathlib import Path

          CBC_PATH = Path("cbc.yaml")

          if not CBC_PATH.exists():
              raise FileNotFoundError("cbc.yaml not found in repo root.")

          block_from_code = get_compute_block()
          block_from_yaml = load_config(str(CBC_PATH))

          _compare_configs(block_from_code, block_from_yaml)

          print("cbc.yaml matches python code definition")
          EOF

  run-test:
    name: Run Tests
    runs-on: ubuntu-latest
    needs: validate-compute-block
    services:
      minio:
        image: lazybit/minio
        ports:
          - 9000:9000
        env:
          MINIO_ROOT_USER: minioadmin
          MINIO_ROOT_PASSWORD: minioadmin
        options: >-
          --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
          --health-interval 5s
          --health-retries 5
          --health-timeout 5s
      postgres:
        image: postgres:15
        ports:
          - 5432:5432
        env:
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
        options: >-
          --health-cmd="pg_isready -U postgres"
          --health-interval=5s
          --health-retries=10
          --health-timeout=5s
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          cache: "pip"

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Run Tests
        run: pytest -vv

  build:
    name: Build docker image
    runs-on: ubuntu-latest
    needs: run-test
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      # Renamed for accuracy: this logs in to GHCR (REGISTRY=ghcr.io), not Docker Hub.
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/topic-modeling
          # Spaces removed from the CSV attribute lists (metadata-action tag syntax).
          tags: |
            type=ref,event=pr
            type=raw,value=latest,enable=${{ (github.ref == format('refs/heads/{0}', 'main')) }}

      # NOTE(review): push: true also runs for pull_request events; PRs from forks
      # have a read-only GITHUB_TOKEN and will fail the push — confirm intended.
      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

44 changes: 0 additions & 44 deletions .github/workflows/docker.yaml

This file was deleted.

Empty file added algorithms/__init__.py
Empty file.
25 changes: 23 additions & 2 deletions algorithms/lda.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import logging

import numpy as np
import pandas as pd

from sklearn.decomposition import LatentDirichletAllocation

logger = logging.getLogger(__name__)


class LDAModeler:
def __init__(
Expand All @@ -24,6 +28,11 @@ def __init__(
self.random_state = random_state
self.n_top_words = n_top_words

logger.debug(
f"Initialized LDAModeler: topics={n_topics}, iter={max_iter}, "
f"learning='{learning_method}', n_top_words={n_top_words}"
)

self.lda = LatentDirichletAllocation(
n_components=n_topics,
max_iter=max_iter,
Expand All @@ -34,22 +43,31 @@ def __init__(
self.doc_topic_dist = None

def fit(self):
    """Fit the wrapped LDA model on the stored document-term matrix.

    Mutates ``self.lda`` in place; returns ``None``.
    """
    # Lazy %-style logging args: the message is only formatted if the
    # record is actually emitted (avoids f-string work on disabled levels).
    logger.info("Running LDA fit(), DTM shape %s", self.dtm.shape)
    self.lda.fit(self.dtm)
    logger.info("LDA model fitted successfully")

def extract_doc_topics(self) -> pd.DataFrame:
    """Generate the document-topic distribution as a DataFrame.

    Runs ``self.lda.transform`` on the stored DTM, caches the raw
    distribution on ``self.doc_topic_dist``, and returns it as a
    DataFrame with one ``topic_i`` column per topic.
    """
    logger.info("Extracting doc-topics...")
    self.doc_topic_dist = self.lda.transform(self.dtm)

    # (Removed stray leftover `return pd.DataFrame(` line from the diff —
    # the cached distribution is wrapped exactly once below.)
    df = pd.DataFrame(
        self.doc_topic_dist,
        columns=[f"topic_{i}" for i in range(self.n_topics)],
    )

    # Lazy %-style logging args instead of eager f-string formatting.
    logger.debug("Extracted doc-topic distribution DataFrame shape=%s", df.shape)
    return df

def extract_topic_terms(self):
"""
Generate topic and top-terms DataFrame
"""
logger.info("Extracting top terms per topic...")
idx2term = {idx: term for term, idx in self.vocab.items()}
topic_rows = []

Expand All @@ -62,4 +80,7 @@ def extract_topic_terms(self):
"term": idx2term[i],
"weight": topic[i]
})
return pd.DataFrame(topic_rows)

df = pd.DataFrame(topic_rows)
logger.info(f"Generated topic_terms DataFrame rows={df.shape[0]}")
return df
15 changes: 14 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,17 @@ services:
- POSTGRES_PASSWORD=postgres
- POSTGRES_DB=postgres
ports:
- "5432:5432"
- "5432:5432"

  # Local S3-compatible object store for development.
  minio:
    image: quay.io/minio/minio
    restart: always
    # 9000 = S3 API, 9001 = web console (see ports below).
    command: server /data --console-address ":9001"
    environment:
      - MINIO_ROOT_USER=minioadmin
      - MINIO_ROOT_PASSWORD=minioadmin
      # NOTE(review): debug log level left enabled — confirm intended for dev only.
      - MINIO_LOG_LEVEL=debug
    ports:
      - "9000:9000"
      - "9001:9001"

Binary file added dtm.pkl
Binary file not shown.
26 changes: 24 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import pickle

from scystream.sdk.core import entrypoint
Expand All @@ -13,6 +14,12 @@

from algorithms.lda import LDAModeler

logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class DTMFileInput(FileSettings, InputSettings):
__identifier__ = "dtm"
Expand Down Expand Up @@ -48,23 +55,38 @@ class LDATopicModeling(EnvSettings):


def write_df_to_postgres(df, settings: PostgresSettings):
    """Write *df* to the Postgres table configured in *settings*.

    Replaces the table if it already exists (``if_exists="replace"``).
    """
    logger.info("Writing DataFrame to DB table '%s'…", settings.DB_TABLE)

    # Fixed: the diff left the DSN f-string pair duplicated, which would have
    # concatenated into a malformed URL. Build it exactly once.
    # NOTE(review): no database name follows the trailing "/" — the driver
    # falls back to its default; confirm this is intended.
    engine = create_engine(
        f"postgresql+psycopg2://{settings.PG_USER}:{settings.PG_PASS}"
        f"@{settings.PG_HOST}:{int(settings.PG_PORT)}/"
    )
    df.to_sql(settings.DB_TABLE, engine, if_exists="replace", index=False)
    # Lazy %-style logging args instead of eager f-string formatting.
    logger.info("Successfully wrote %d rows to '%s'.", len(df), settings.DB_TABLE)


@entrypoint(LDATopicModeling)
def lda_topic_modeling(settings):
logger.info("Starting LDA topic modeling pipeline…")

logger.info("Downloading vocabulary file...")
S3Operations.download(settings.vocab, "vocab.pkl")

logger.info("Loading vocab.pkl from disk...")
with open("vocab.pkl", "rb") as f:
vocab = pickle.load(f)

logger.info(f"Loaded vocab with {len(vocab)} terms.")

logger.info("Downloading DTM file...")
S3Operations.download(settings.dtm, "dtm.pkl")

logger.info("Loading dtm.pkl from disk...")
with open("dtm.pkl", "rb") as f:
dtm = pickle.load(f)

logger.info(f"Loaded DTM with shape {dtm.shape}")

# TODO: Check if dtm and vocab is of correct schema
lda = LDAModeler(
dtm=dtm,
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ numpy==2.3.3
pandas==2.3.2
SQLAlchemy==2.0.43
psycopg2-binary==2.9.10
pytest==9.0.1
Empty file added test/__init__.py
Empty file.
Loading
Loading