44 commits
7305d05
Initial version of ETL job
gleonard-m Jul 9, 2025
b69a042
Removed deleted project
gleonard-m Jul 9, 2025
3e051a7
Add Glenda's utils, a dev runbook doc, and rename some args and env vars
mashalifshin Aug 6, 2025
7d21e54
WIP: First pass refactor almost complete
mashalifshin Aug 12, 2025
89a4cb5
Working first-pass refactor
mashalifshin Aug 12, 2025
3ad955b
Add helper functions for the main for loops, add config objects, add …
mashalifshin Aug 13, 2025
caee055
Read job config from GCS bucket
mashalifshin Aug 16, 2025
4fd78a5
Tweak error handling, whitespace, and add more comments
mashalifshin Aug 16, 2025
c7f3bfd
Whoops, remove manual testing of Nimbus fetching error handling
mashalifshin Aug 18, 2025
cfbe5fe
Write job output to file in gcs bucket
mashalifshin Aug 18, 2025
f8e1e7b
Give log files an extension
mashalifshin Aug 18, 2025
ee4e345
Tweak log message
mashalifshin Aug 18, 2025
6133976
WIP tests and add some hopefully helpful type hints
mashalifshin Aug 29, 2025
e9350e0
Add dap collection helper tests and fix bug with bucket indexing
mashalifshin Aug 30, 2025
6ab2eae
Remove copypasta job clause in circleci config
mashalifshin Aug 30, 2025
a0bb7f7
Bump up the process timeout threshold
mashalifshin Aug 30, 2025
7146c86
Make batch_start a param again as that will come from the airflow job
mashalifshin Aug 30, 2025
210a696
Configure batch duration per-experiment
mashalifshin Aug 30, 2025
5074a9b
Remove unused test
mashalifshin Aug 30, 2025
258fe4b
Clean up the offset for bucket keys handling by pushing it down into …
mashalifshin Sep 2, 2025
c3856ef
Fix CI job for running tests, and a bit of cleanup
mashalifshin Sep 3, 2025
55d9003
Reduce mock responses to mostly just necessary fields
mashalifshin Sep 4, 2025
4630ae6
Rough first pass for populating collect_start and collect_end dynamic…
mashalifshin Sep 10, 2025
77b2fcc
Add tests for top-level writing to BQ helper
mashalifshin Sep 12, 2025
e2d5b7b
Redact the task veclen and value count when printing IncrementalityBr…
mashalifshin Sep 12, 2025
6f5aed7
Lint
mashalifshin Sep 12, 2025
5f197ec
Fix bigquery client mocking to fix tests in CI
mashalifshin Sep 20, 2025
05cd838
Rename hpke_token to auth_token
mashalifshin Sep 24, 2025
12cfa71
Fix batch interval logic to make batch_end_date inclusive, and WIP un…
mashalifshin Sep 25, 2025
8d531f0
Give batch duration a default of 7 days and make it optional in config
mashalifshin Sep 25, 2025
8cb20c8
Add README and tests for default duration
mashalifshin Sep 25, 2025
bcbd562
Lint and format (and re-enable logging to bucket that was accidentall…
mashalifshin Sep 25, 2025
58be045
Fix bug where created_at was being written to be BQ as a datetime and…
mashalifshin Sep 26, 2025
81670c3
Fix up exception handling in main loop for throwing to airflow and up…
mashalifshin Sep 26, 2025
14942a9
Remove some unnecessary models
mashalifshin Sep 26, 2025
681c121
Expect created_at to be a string in insert_rows_json calls
mashalifshin Oct 1, 2025
a9ca652
Date mocking attempt 1: from unittest.mock docs, partial mocking section
mashalifshin Oct 1, 2025
f87c915
Date mocking attempt 2: define a wrapper function on the class and mo…
mashalifshin Oct 1, 2025
1eb5271
Date mocking attempt 3: classmethod on MockDate wrapper class
mashalifshin Oct 1, 2025
2a42873
Date mocking attempt 4: Frozen mock class
mashalifshin Oct 2, 2025
ccacd16
Date mocking success: Patch a wrapper function on experiment class in…
mashalifshin Oct 2, 2025
252fa3a
Remove unnecessary DAP docs and generalize examples
mashalifshin Oct 3, 2025
22cf4fa
Clean up done todo and use example value in test mocks
mashalifshin Oct 3, 2025
e006488
Separate params for config bucket gcp project and BQ table gcp project
mashalifshin Oct 4, 2025
32 changes: 31 additions & 1 deletion .circleci/config.yml
@@ -65,6 +65,22 @@ jobs:
          name: Verify CI config is up-to-date
          command: docker run docker-etl:build python3 -m docker_etl.ci_config --dry-run | diff -B .circleci/config.yml -

  build-job-ads-incrementality-dap-collector:
    docker:
      - image: << pipeline.parameters.git-image >>
    steps:
      - checkout
      - compare-branch:
          pattern: ^jobs/ads-incrementality-dap-collector/
      - setup_remote_docker:
          version: << pipeline.parameters.docker-version >>
      - run:
          name: Build Docker image
          command: docker build -t app:build jobs/ads-incrementality-dap-collector/
      - run:
          name: Test Code
          command: docker run app:build python3 -m pytest

  build-job-bq2sftp:
    docker:
      - image: << pipeline.parameters.git-image >>
@@ -213,6 +229,7 @@ jobs:
          name: Build Docker image
          command: docker build -t app:build jobs/experiments-monitoring-data-export/


  build-job-extensions:
    docker:
      - image: << pipeline.parameters.git-image >>
@@ -402,6 +419,20 @@ workflows:
    jobs:
      - build-docker-etl

  job-ads-incrementality-dap-collector:
    jobs:
      - build-job-ads-incrementality-dap-collector
      - gcp-gcr/build-and-push-image:
          context: data-eng-airflow-gcr
          docker-context: jobs/ads-incrementality-dap-collector/
          path: jobs/ads-incrementality-dap-collector/
          image: ads-incrementality-dap-collector_docker_etl
          requires:
            - build-job-ads-incrementality-dap-collector
          filters:
            branches:
              only: main

  job-bq2sftp:
    jobs:
      - build-job-bq2sftp
@@ -533,7 +564,6 @@ workflows:
            branches:
              only: main


  job-extensions:
    jobs:
      - build-job-extensions
13 changes: 13 additions & 0 deletions jobs/ads-incrementality-dap-collector/.dockerignore
@@ -0,0 +1,13 @@
.cache/
ci_job.yaml
ci_workflow.yaml
public_key_to_hpke_config.py
dev_run_docker.sh
dev_runbook.md
.DS_Store
example_config.json
*.pyc
.pytest_cache/
.python-version
__pycache__/
venv/
2 changes: 2 additions & 0 deletions jobs/ads-incrementality-dap-collector/.flake8
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 120
5 changes: 5 additions & 0 deletions jobs/ads-incrementality-dap-collector/.gitignore
@@ -0,0 +1,5 @@
.DS_Store
*.pyc
__pycache__/
venv/
.python-version
38 changes: 38 additions & 0 deletions jobs/ads-incrementality-dap-collector/Dockerfile
@@ -0,0 +1,38 @@
FROM python:3.12
LABEL maintainer="Glenda Leonard <[email protected]>"
ARG HOME="/janus_build"
WORKDIR ${HOME}

RUN apt update && apt --yes install curl

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH=$HOME/.cargo/bin:$PATH

# build the CLI tool
RUN git clone --depth 1 https://github.com/divviup/janus.git --branch '0.7.69'
RUN cd janus && cargo build -r -p janus_tools --bin collect

######### next stage

FROM python:3.12
LABEL maintainer="Glenda Leonard <[email protected]>"
# https://github.com/mozilla-services/Dockerflow/blob/master/docs/building-container.md
ARG USER_ID="10001"
ARG GROUP_ID="app"
ARG HOME="/app"
WORKDIR ${HOME}

RUN groupadd --gid ${USER_ID} ${GROUP_ID} && \
    useradd --create-home --uid ${USER_ID} --gid ${GROUP_ID} --home-dir ${HOME} ${GROUP_ID}
# Copy the janus `collect` CLI binary from the build stage
COPY --from=0 /janus_build/janus/target/release/collect ./

# Drop root and change ownership of the application folder to the user
RUN chown -R ${USER_ID}:${GROUP_ID} ${HOME}
USER ${USER_ID}
ADD ./requirements.txt .
RUN pip install --upgrade pip
RUN pip install -r requirements.txt

ADD . .
131 changes: 131 additions & 0 deletions jobs/ads-incrementality-dap-collector/README.md
@@ -0,0 +1,131 @@
# Ads Incrementality DAP collector

## Background

Incrementality is a way to measure the effectiveness of our ads in a general, aggregated, privacy-preserving way --
without knowing anything about specific users.

Incrementality works by dividing clients into various Nimbus experiment branches that vary how/whether an ad is shown.
Separately, a [DAP](https://docs.divviup.org/) task is configured to store the metrics for each experiment branch in a
different DAP bucket.

Firefox is instrumented with [DAP telemetry functionality](https://github.com/mozilla-firefox/firefox/tree/main/toolkit/components/telemetry/dap), which allows it to send metrics and reports into the correct DAP buckets as configured in the experiment.

This job then collects the per-branch metrics from DAP (using bucket info from the experiment's data) and writes them
to BQ.

## Overview

This job is driven by a config file stored in a GCS bucket. Tell the job where to find the config file by passing the
`gcp_project` and `gcs_config_bucket` parameters. See `example_config.json` for how to structure this file.

The config file specifies the incrementality experiments that are currently running, some config and credentials from DAP,
and where in BQ to write the incrementality results.

The job reads each experiment's data from Nimbus, then fetches per-branch results from DAP, and finally assembles the
results into rows and writes them to BQ, roughly as sketched below.
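
At a high level, the control flow looks roughly like this sketch. All helper names here are hypothetical, for
illustration only -- they are not the job's actual API:

```python
from typing import Any


def fetch_nimbus_experiment(api_url: str, slug: str) -> dict[str, Any]:
    """Fetch experiment metadata (branches, DAP task info) from Nimbus."""
    raise NotImplementedError


def collect_dap_results(dap_config: dict[str, Any], branch: dict[str, Any]) -> dict[str, Any]:
    """Run a DAP collection for one experiment branch's bucket."""
    raise NotImplementedError


def write_rows_to_bq(bq_config: dict[str, Any], rows: list[dict[str, Any]]) -> None:
    """Insert the assembled result rows into the configured BigQuery table."""
    raise NotImplementedError


def run(config: dict[str, Any]) -> None:
    rows = []
    for experiment in config["nimbus"]["experiments"]:
        info = fetch_nimbus_experiment(config["nimbus"]["api_url"], experiment["slug"])
        for branch in info["branches"]:
            result = collect_dap_results(config["dap"], branch)
            rows.append({"experiment_slug": experiment["slug"], **result})
    write_rows_to_bq(config["bq"], rows)
```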

## Configuration

The three recognized top-level keys are `bq`, `dap`, and `nimbus`.

#### bq

Everything the job needs to connect to BigQuery.

- `project`: GCP project
- `namespace`: BQ namespace for ads incrementality
- `table`: BQ table where incrementality results go

#### dap

Everything the job needs to connect to DAP.

- `auth_token`: Token defined in the collector credentials, used to authenticate to the leader
- `hpke_private_key`: Private key defined in the collector credentials, used to decrypt shares from the leader
and helper
- `hpke_config`: base64url-encoded version of the public key defined in the collector credentials
- `batch_start`: Start of the collection interval, as the number of seconds since the Unix epoch


#### nimbus

Everything the job needs to connect to Nimbus.

- `api_url`: API URL for fetching the Nimbus experiment info
- `experiments`: List of incrementality experiment configs

##### experiment config list

The experiments that the job should collect results for.

- `slug`: Experiment slug
- `batch_duration`: Optional. Duration of the collection batch interval, in seconds.
  Defaults to 7 days (604800 seconds) if not specified.
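
Putting these together, a config file following this structure might look like the sketch below. All values are
illustrative placeholders (the URL, project, and table names are invented, and the DAP credentials are redacted);
see `example_config.json` for the canonical structure:

```json
{
  "bq": {
    "project": "my-gcp-project",
    "namespace": "ads_incrementality",
    "table": "incrementality_results"
  },
  "dap": {
    "auth_token": "REDACTED",
    "hpke_private_key": "REDACTED",
    "hpke_config": "base64url-encoded-public-key",
    "batch_start": 1735689600
  },
  "nimbus": {
    "api_url": "https://experimenter.example.com/api/v6/experiments",
    "experiments": [
      {"slug": "my-incrementality-experiment", "batch_duration": 604800},
      {"slug": "another-experiment"}
    ]
  }
}
```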

## Usage

This script is intended to be run in a Docker container.

It requires some environment variables that hold DAP credentials, which the job looks for when it starts up. A dev
script, `dev_run_docker.sh`, is included for convenience to build and run the job locally; it also documents those
variables.

Once the environment variables are set up, run the job with:

```sh
./dev_run_docker.sh
```

To just build the Docker image, use:

```sh
docker build -t ads_incrementality_dap_collector .
```

To run outside of Docker, install dependencies with:

```sh
pip install -r requirements.txt
```

Run the script with:

```sh
python3 -m python_template_job.main
```

## Testing

Run tests with:

```sh
python3 -m pytest
```

## Linting and formatting

`flake8` and `black` are included for code linting and formatting:

```sh
pytest --black --flake8
```

or

```sh
flake8 .
```

or

```sh
black .
```

or

```sh
black --diff .
```
@@ -0,0 +1,72 @@
from datetime import datetime

from google.cloud import bigquery

DAP_LEADER = "https://dap-09-3.api.divviup.org"
VDAF = "histogram"
PROCESS_TIMEOUT = 1200 # 20 mins

CONFIG_FILE_NAME = "config.json" # See example_config.json for the contents and structure of the job config file.
LOG_FILE_NAME = f"{datetime.now()}-ads-incrementality-dap-collector.log"

DEFAULT_BATCH_DURATION = 604800  # 7 days, in seconds
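
# Worked example (illustrative, not part of the original file): with the default
# batch duration, a batch starting at Unix time 1735689600 (2025-01-01 00:00:00 UTC)
# covers 604800 seconds, i.e. the seven days 2025-01-01 through 2025-01-07, with
# both the collection_start and collection_end dates inclusive.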

COLLECTOR_RESULTS_SCHEMA = [
    bigquery.SchemaField(
        "collection_start",
        "DATE",
        mode="REQUIRED",
        description="Start date of the collected time window, inclusive.",
    ),
    bigquery.SchemaField(
        "collection_end",
        "DATE",
        mode="REQUIRED",
        description="End date of the collected time window, inclusive.",
    ),
    bigquery.SchemaField(
        "country_codes",
        "JSON",
        mode="NULLABLE",
        description="List of 2-char country codes for the experiment",
    ),
    bigquery.SchemaField(
        "experiment_slug",
        "STRING",
        mode="REQUIRED",
        description="Slug indicating the experiment.",
    ),
    bigquery.SchemaField(
        "experiment_branch",
        "STRING",
        mode="REQUIRED",
        description="The experiment branch this data is associated with.",
    ),
    bigquery.SchemaField(
        "advertiser",
        "STRING",
        mode="REQUIRED",
        description="Advertiser associated with this experiment.",
    ),
    bigquery.SchemaField(
        "metric",
        "STRING",
        mode="REQUIRED",
        description="Metric collected for this experiment.",
    ),
    bigquery.SchemaField(
        name="value",
        field_type="RECORD",
        mode="REQUIRED",
        fields=[
            bigquery.SchemaField("count", "INT64", mode="NULLABLE"),
            bigquery.SchemaField("histogram", "JSON", mode="NULLABLE"),
        ],
    ),
    bigquery.SchemaField(
        "created_at",
        "TIMESTAMP",
        mode="REQUIRED",
        description="Timestamp for when this row was written.",
    ),
]

Review comment on `created_at`: Update to created_timestamp ?
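
For reference, a row matching this schema, as it might be passed to BigQuery's `insert_rows_json`, could look like
the sketch below. All values are invented for illustration; `created_at` is serialized as a string (per the commit
history above), and the JSON-typed columns are shown as JSON-encoded strings, one common convention for streaming
inserts:

```python
example_row = {
    "collection_start": "2025-01-01",
    "collection_end": "2025-01-07",
    "country_codes": '["US", "CA"]',  # JSON column, passed as a JSON-encoded string
    "experiment_slug": "my-incrementality-experiment",
    "experiment_branch": "treatment",
    "advertiser": "example-advertiser",
    "metric": "ad_click",
    "value": {"count": 12345, "histogram": None},  # RECORD: one of count/histogram is set
    "created_at": "2025-01-08T00:00:00Z",  # TIMESTAMP, serialized as a string
}
```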