Skip to content

Commit 25a4fb0

Browse files
Merge branch 'kubeflow:master' into support_kai
2 parents c341262 + b71a690 commit 25a4fb0

File tree

415 files changed

+1729
-39957
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

415 files changed

+1729
-39957
lines changed

.github/workflows/template-publish-image/action.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ runs:
8181
with:
8282
images: ${{ inputs.image }}
8383
tags: |
84+
type=ref,event=tag
8485
type=raw,latest
8586
type=sha
8687

.github/workflows/test-e2e.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ jobs:
4242
pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
4343
4444
echo "Install Kubeflow SDK"
45-
pip install ./sdk
45+
pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python
4646
4747
- name: Setup cluster
4848
run: |

.gitignore

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@ artifacts
2020
__pycache__/
2121
*.egg-info/
2222

23-
# OpenAPI Generator CLI JAR file
24-
hack/python-sdk/openapi-generator-cli.jar
25-
2623
# Coverage
2724
cover.out
2825

.pre-commit-config.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,5 @@ exclude: |
2626
(?x)^(
2727
docs/images/.*|
2828
pkg/client/.*|
29-
sdk/kubeflow/trainer/__init__.py|
30-
sdk/kubeflow/trainer/api/__init__.py|
31-
sdk/kubeflow/trainer/models/.*|
3229
api/python_api/kubeflow_trainer_api/models/.*|
33-
sdk/docs/.*
3430
)$

CHANGELOG.md

Lines changed: 297 additions & 0 deletions
Large diffs are not rendered by default.

CONTRIBUTING.md

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ Note for Lima the link is to the Adopters, which supports several different cont
2121
The Kubeflow Trainer project includes a Makefile with several helpful commands to streamline your development workflow:
2222

2323
```sh
24-
# Generate manifests, APIs and SDK
24+
# Generate manifests and APIs.
2525
make generate
2626
```
2727

@@ -215,24 +215,6 @@ kubectl logs -n kubeflow -l training.kubeflow.org/job-name=pytorch-simple --foll
215215

216216
### SDK Development
217217

218-
To generate Python SDK for the operator, run:
218+
Changes to the Kubeflow Trainer Python SDK can be made in the https://github.com/kubeflow/sdk repo.
219219

220-
```sh
221-
./hack/python-sdk/gen-sdk.sh
222-
```
223-
224-
This command will re-generate the api and model files together with the documentation and model tests.
225-
The following files/folders in `sdk/python` are auto-generated and should not be modified directly:
226-
227-
```
228-
sdk/python/docs
229-
sdk/python/kubeflow/training/models
230-
sdk/python/kubeflow/training/*.py
231-
sdk/python/test/*.py
232-
```
233-
234-
The Training Operator client and public APIs are located here:
235-
236-
```
237-
sdk/python/kubeflow/training/api
238-
```
220+
The Trainer SDK can be found at https://github.com/kubeflow/sdk/tree/main/python/kubeflow/trainer.

Makefile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ HELM_CHART_TESTING_VERSION ?= v3.12.0
2929
HELM_DOCS_VERSION ?= v1.14.2
3030
YQ_VERSION ?= v4.45.1
3131

32+
# Container runtime (docker or podman)
33+
CONTAINER_RUNTIME ?=
34+
3235
# Tool binaries
3336
GINKGO ?= $(LOCALBIN)/ginkgo
3437
ENVTEST ?= $(LOCALBIN)/setup-envtest
@@ -117,13 +120,11 @@ manifests: controller-gen ## Generate manifests.
117120
output:rbac:artifacts:config=manifests/base/rbac \
118121
output:webhook:artifacts:config=manifests/base/webhook
119122

120-
## TODO (kramaranya): Remove gen-sdk.sh when moving SDK
121123
.PHONY: generate
122-
generate: go-mod-download manifests ## Generate APIs and SDK.
124+
generate: go-mod-download manifests ## Generate APIs.
123125
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.go.txt" paths="./pkg/apis/..."
124126
hack/update-codegen.sh
125-
hack/python-api/gen-api.sh
126-
hack/python-sdk/gen-sdk.sh
127+
CONTAINER_RUNTIME=$(CONTAINER_RUNTIME) hack/python-api/gen-api.sh
127128

128129
.PHONY: go-mod-download
129130
go-mod-download: ## Run go mod download to download modules.
@@ -160,7 +161,6 @@ test-integration: ginkgo envtest jobset-operator-crd scheduler-plugins-crd ## Ru
160161
test-python: ## Run Python unit test.
161162
pip install pytest
162163
pip install -r ./cmd/initializers/dataset/requirements.txt
163-
pip install ./sdk
164164

165165
PYTHONPATH=$(PROJECT_DIR) pytest ./pkg/initializers/dataset
166166
PYTHONPATH=$(PROJECT_DIR) pytest ./pkg/initializers/model

OWNERS

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
approvers:
22
- andreyvelich
3+
- Electronic-Waste
34
- gaocegege
45
- Jeffwan
56
- johnugeorge
67
- tenzen-y
78
- terrytangyuan
89
reviewers:
9-
- Electronic-Waste
10+
- astefanutti
1011
- jinchihe
1112
- kuizhiqing
1213
emeritus_approvers:

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
[![Build Status](https://github.com/kubeflow/trainer/actions/workflows/test-go.yaml/badge.svg?branch=master)](https://github.com/kubeflow/trainer/actions/workflows/test-go.yaml?branch=master)
44
[![Coverage Status](https://coveralls.io/repos/github/kubeflow/trainer/badge.svg?branch=master)](https://coveralls.io/github/kubeflow/trainer?branch=master)
55
[![Go Report Card](https://goreportcard.com/badge/github.com/kubeflow/trainer)](https://goreportcard.com/report/github.com/kubeflow/trainer)
6+
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10435/badge)](https://www.bestpractices.dev/projects/10435)
67

78
<h1 align="center">
89
<img src="./docs/images/trainer-logo.svg" alt="logo" width="200">
@@ -19,8 +20,9 @@ You can integrate other ML libraries such as [HuggingFace](https://huggingface.c
1920
[DeepSpeed](https://github.com/microsoft/DeepSpeed), or [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
2021
with Kubeflow Trainer to orchestrate their ML training on Kubernetes.
2122

22-
Kubeflow Trainer allows you effortlessly develop your LLMs with the Kubeflow Python SDK and
23-
build Kubernetes-native Training Runtimes with Kubernetes Custom Resources APIs.
23+
Kubeflow Trainer enables you to effortlessly develop your LLMs with the
24+
[Kubeflow Python SDK](https://github.com/kubeflow/sdk/), and build Kubernetes-native Training
25+
Runtimes using Kubernetes Custom Resource APIs.
2426

2527
<h1 align="center">
2628
<img src="./docs/images/trainer-tech-stack.drawio.svg" alt="logo" width="500">

api/openapi-spec/swagger.json

Lines changed: 4 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -13155,32 +13155,14 @@
1315513155
}
1315613156
},
1315713157
"trainer.v1alpha1.ContainerOverride": {
13158-
"description": "ContainerOverride represents parameters that can be overridden using PodSpecOverrides. Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.",
13158+
"description": "ContainerOverride represents parameters that can be overridden using PodSpecOverrides.",
1315913159
"type": "object",
1316013160
"required": [
1316113161
"name"
1316213162
],
1316313163
"properties": {
13164-
"args": {
13165-
"description": "Arguments to the entrypoint for the training container.",
13166-
"type": "array",
13167-
"items": {
13168-
"type": "string",
13169-
"default": ""
13170-
},
13171-
"x-kubernetes-list-type": "atomic"
13172-
},
13173-
"command": {
13174-
"description": "Entrypoint commands for the training container.",
13175-
"type": "array",
13176-
"items": {
13177-
"type": "string",
13178-
"default": ""
13179-
},
13180-
"x-kubernetes-list-type": "atomic"
13181-
},
1318213164
"env": {
13183-
"description": "List of environment variables to set in the container. These values will be merged with the TrainingRuntime's environments.",
13165+
"description": "List of environment variables to set in the container. These values will be merged with the TrainingRuntime's environments. These values can't be set for container with the name: `node`, `dataset-initializer`, or `model-initializer`. For those containers the envs can only be set via Trainer or Initializer APIs.",
1318413166
"type": "array",
1318513167
"items": {
1318613168
"default": {},
@@ -13195,19 +13177,6 @@
1319513177
],
1319613178
"x-kubernetes-list-type": "map"
1319713179
},
13198-
"envFrom": {
13199-
"description": "List of sources to populate environment variables in the container. These values will be merged with the TrainingRuntime's environments.",
13200-
"type": "array",
13201-
"items": {
13202-
"default": {},
13203-
"allOf": [
13204-
{
13205-
"$ref": "#/components/schemas/io.k8s.api.core.v1.EnvFromSource"
13206-
}
13207-
]
13208-
},
13209-
"x-kubernetes-list-type": "atomic"
13210-
},
1321113180
"name": {
1321213181
"description": "Name for the container. TrainingRuntime must have this container.",
1321313182
"type": "string",
@@ -13544,7 +13513,7 @@
1354413513
"x-kubernetes-list-type": "map"
1354513514
},
1354613515
"nodeSelector": {
13547-
"description": "Override for the node selector to place Pod on the specific mode.",
13516+
"description": "Override for the node selector to place Pod on the specific node.",
1354813517
"type": "object",
1354913518
"additionalProperties": {
1355013519
"type": "string",
@@ -13582,7 +13551,7 @@
1358213551
"x-kubernetes-list-type": "atomic"
1358313552
},
1358413553
"volumes": {
13585-
"description": "Overrides for the Pod volume configuration.",
13554+
"description": "Overrides for the Pod volume configurations.",
1358613555
"type": "array",
1358713556
"items": {
1358813557
"default": {},

0 commit comments

Comments
 (0)