Skip to content

Commit 258a13c

Browse files
committed
Merge branch 'dev' into documentation_update
2 parents e212af3 + f192d4f commit 258a13c

File tree

55 files changed

+692
-372
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+692
-372
lines changed

.github/workflows/regression_tests.yml

Lines changed: 16 additions & 16 deletions
Large diffs are not rendered by default.

.github/workflows/regression_tests_variants.yml

Lines changed: 0 additions & 85 deletions
This file was deleted.

docker/build_docker_images.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ do
1313
esac
1414
done
1515

16+
# Artifact repository
17+
# Artifact Registry hosts are "<location>-docker.pkg.dev"; the GCP region is
# spelled "europe-west4" (no hyphen before the digit).
ARTIFACT_REPO="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo"
18+
1619
if [[ -z ${GIT_BRANCH+x} ]]
1720
then
1821
GIT_BRANCH='main' # Set default argument
@@ -22,9 +25,9 @@ for FRAMEWORK in "jax" "pytorch" "both"
2225
do
2326
IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}"
2427
DOCKER_BUILD_COMMAND="docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH"
25-
DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
26-
DOCKER_PUSH_COMMAND="docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
27-
DOCKER_PULL_COMMAND="docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
28+
DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME $ARTIFACT_REPO/$IMAGE_NAME"
29+
DOCKER_PUSH_COMMAND="docker push $ARTIFACT_REPO/$IMAGE_NAME"
30+
DOCKER_PULL_COMMAND="docker pull $ARTIFACT_REPO/$IMAGE_NAME"
2831

2932
echo "On branch: ${GIT_BRANCH}"
3033
echo $DOCKER_BUILD_COMMAND

docker/scripts/cloud-init.cfg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ write_files:
4040
ExecStartPre=mount --bind /var/lib/nvidia /var/lib/nvidia
4141
ExecStartPre=mount -o remountexec /var/lib/nvidia
4242
# Register the credential helper for the legacy host and for europe-west4-docker.pkg.dev, which this unit pulls its image from.
ExecStartPre=/usr/bin/docker-credential-gcr configure-docker --registries us-central1-docker.pkg.dev,europe-west4-docker.pkg.dev
43-
ExecStartPre=/usr/bin/docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest
44-
ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest -b true
43+
ExecStartPre=/usr/bin/docker pull europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/base_image:latest
44+
ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/base_image:latest -b true
4545
StandardOutput=journal+console
4646
StandardError=journal+console
4747

docker/scripts/startup.sh

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ RSYNC_DATA="true"
5151
OVERWRITE="false"
5252
SAVE_CHECKPOINTS="true"
5353
TUNING_RULESET="external"
54+
ROOT_DATA_BUCKET="algoperf-data"
55+
LOGS_BUCKET="algoperf-runs"
5456

5557
# Pass flag
5658
while [ "$1" != "" ]; do
@@ -136,6 +138,14 @@ while [ "$1" != "" ]; do
136138
shift
137139
ADDITIONAL_REQUIREMENTS_PATH=$1
138140
;;
141+
--data_bucket)
142+
shift
143+
ROOT_DATA_BUCKET=$1
144+
;;
145+
--logs_bucket)
146+
shift
147+
LOGS_BUCKET=$1
148+
;;
139149
*)
140150
usage
141151
exit 1
@@ -179,11 +189,11 @@ VALID_WORKLOADS=("criteo1tb" "imagenet_resnet" "imagenet_resnet_silu" "imagenet_
179189
VALID_RULESETS=("self" "external")
180190

181191
# Set data and experiment paths
182-
ROOT_DATA_BUCKET="gs://mlcommons-data"
183192
ROOT_DATA_DIR="${HOME_DIR}/data"
193+
ROOT_DATA_BUCKET="gs://${ROOT_DATA_BUCKET}"
184194

185-
EXPERIMENT_BUCKET="gs://mlcommons-runs"
186195
EXPERIMENT_DIR="${HOME_DIR}/experiment_runs"
196+
EXPERIMENT_LOGS_BUCKET="gs://${LOGS_BUCKET}"
187197

188198
if [[ -n ${DATASET+x} ]]; then
189199
if [[ ! " ${VALID_DATASETS[@]} " =~ " $DATASET " ]]; then
@@ -283,7 +293,6 @@ if [[ ! -z ${SUBMISSION_PATH+x} ]]; then
283293
--workload=${WORKLOAD} \
284294
--submission_path=${SUBMISSION_PATH} \
285295
--data_dir=${DATA_DIR} \
286-
--num_tuning_trials=1 \
287296
--experiment_dir=${EXPERIMENT_DIR} \
288297
--experiment_name=${EXPERIMENT_NAME} \
289298
--overwrite=${OVERWRITE} \
@@ -313,8 +322,8 @@ if [[ ! -z ${SUBMISSION_PATH+x} ]]; then
313322
RETURN_CODE=$?
314323

315324
if [[ $INTERNAL_CONTRIBUTOR_MODE == "true" ]]; then
316-
/google-cloud-sdk/bin/gsutil -m cp -r ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK} ${EXPERIMENT_BUCKET}/${EXPERIMENT_NAME}/
317-
/google-cloud-sdk/bin/gsutil -m cp ${LOG_FILE} ${EXPERIMENT_BUCKET}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK}/
325+
/google-cloud-sdk/bin/gsutil -m cp -r ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK} ${EXPERIMENT_LOGS_BUCKET}/${EXPERIMENT_NAME}/
326+
/google-cloud-sdk/bin/gsutil -m cp ${LOG_FILE} ${EXPERIMENT_LOGS_BUCKET}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK}/
318327
fi
319328

320329
fi

docs/CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ gcloud auth configure-docker $ARTIFACT_REGISTRY_URL
8888
To pull the latest prebuilt image:
8989

9090
```bash
91-
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/<image_name>
91+
docker pull europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/<image_name>
9292
```
9393

9494
The naming convention for `image_name` is `algoperf_<framework>_<branch>`.
@@ -102,7 +102,7 @@ Currently maintained images on the repository are:
102102
- `algoperf_both_dev`
103103

104104
To reference the pulled image you will have to use the full `image_path`, e.g.
105-
`us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main`.
105+
`europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_jax_main`.
106106

107107
### Trigger Rebuild and Push of Maintained Images
108108

prize_qualification_baselines/self_tuning/pytorch_nadamw_full_budget.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Submission file for an NAdamW optimizer with warmup+cosine LR in PyTorch."""
22

3+
import collections
34
import math
45
from typing import Any, Dict, Iterator, List, Optional, Tuple
56

@@ -24,6 +25,7 @@
2425
"weight_decay": 0.08121616522670176,
2526
"warmup_factor": 0.02
2627
}
28+
HPARAMS = collections.namedtuple('Hyperparameters', HPARAMS.keys())(**HPARAMS)
2729

2830

2931
# Modified from github.com/pytorch/pytorch/blob/v1.12.1/torch/optim/adamw.py.

prize_qualification_baselines/self_tuning/pytorch_nadamw_target_setting.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Submission file for an NAdamW optimizer with warmup+cosine LR in PyTorch."""
22

3+
import collections
34
import math
45
from typing import Any, Dict, Iterator, List, Optional, Tuple
56

@@ -24,6 +25,7 @@
2425
"weight_decay": 0.08121616522670176,
2526
"warmup_factor": 0.02
2627
}
28+
HPARAMS = collections.namedtuple('Hyperparameters', HPARAMS.keys())(**HPARAMS)
2729

2830

2931
# Modified from github.com/pytorch/pytorch/blob/v1.12.1/torch/optim/adamw.py.

0 commit comments

Comments
 (0)