Skip to content

Commit 9ad65a2

Browse files
authored
version bump (#4)
Signed-off-by: vsoch <[email protected]> Co-authored-by: vsoch <[email protected]>
1 parent 6a5bef0 commit 9ad65a2

File tree

8 files changed

+238
-11
lines changed

8 files changed

+238
-11
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ and **Merged pull requests**. Critical items to know are:
1414
The versions coincide with releases on pip. Only major versions will be released as tags on Github.
1515

1616
## [0.0.x](https://github.com/converged-computing/flux-metrics-api/tree/main) (0.0.x)
17+
- On the fly metric (from a custom file) support, and job queue counts (0.0.11)
1718
- Support for certificates for uvicorn and change default port to 8443 (0.0.1)
1819
- Skeleton release (0.0.0)

README.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ $ flux-metrics-api start
5656
$ flux-metrics-api start --port 9000 --host 127.0.0.1
5757
```
5858

59+
#### SSL
60+
5961
If you want ssl (port 443) you can provide the path to a certificate and keyfile:
6062

6163
```bash
@@ -68,6 +70,52 @@ An example of a full command we might run from within a pod:
6870
$ flux-metrics-api start --port 8443 --ssl-certfile /etc/certs/tls.crt --ssl-keyfile /etc/certs/tls.key --namespace flux-operator --service-name custom-metrics-apiserver
6971
```
7072

73+
#### On the fly custom metrics!
74+
75+
If you want to provide custom metrics, you can write a function in an external file that we will read and add to the server.
76+
As a general rule:
77+
78+
- The name of the function will be the name of the custom metric
79+
- You can expect the only argument to be the flux handle
80+
- You'll need to do imports within your function to get them in scope
81+
82+
This likely can be improved upon, but is a start for now! We provide an [example file](example/custom-metrics.py). As an example:
83+
84+
```bash
85+
$ flux-metrics-api start --custom-metric ./example/custom-metrics.py
86+
```
87+
88+
And then test it:
89+
90+
```bash
91+
$ curl -s http://localhost:8443/apis/custom.metrics.k8s.io/v1beta2/namespaces/flux-operator/metrics/my_custom_metric_name | jq
92+
```
93+
```console
94+
{
95+
"items": [
96+
{
97+
"metric": {
98+
"name": "my_custom_metric_name"
99+
},
100+
"value": 4,
101+
"timestamp": "2023-06-01T01:39:08+00:00",
102+
"windowSeconds": 0,
103+
"describedObject": {
104+
"kind": "Service",
105+
"namespace": "flux-operator",
106+
"name": "custom-metrics-apiserver",
107+
"apiVersion": "v1beta2"
108+
}
109+
}
110+
],
111+
"apiVersion": "custom.metrics.k8s.io/v1beta2",
112+
"kind": "MetricValueList",
113+
"metadata": {
114+
"selfLink": "/apis/custom.metrics.k8s.io/v1beta2"
115+
}
116+
}
117+
```
118+
71119
See `--help` to see other options available.
72120

73121
### Endpoints
@@ -113,6 +161,14 @@ The following metrics are supported:
113161
- **node_free_count**: number of nodes free in the MiniCluster
114162
- **node_cores_free_count**: number of node cores free in the MiniCluster
115163
- **node_cores_up_count**: number of node cores up in the MiniCluster
164+
- **job_queue_state_new_count**: number of new jobs in the queue
165+
- **job_queue_state_depend_count**: number of jobs in the queue in state "depend"
166+
- **job_queue_state_priority_count**: number of jobs in the queue in state "priority"
167+
- **job_queue_state_sched_count**: number of jobs in the queue in state "sched"
168+
- **job_queue_state_run_count**: number of jobs in the queue in state "run"
169+
- **job_queue_state_cleanup_count**: number of jobs in the queue in state "cleanup"
170+
- **job_queue_state_inactive_count**: number of jobs in the queue in state "inactive"
171+
116172

117173
### Docker
118174

example/custom-metrics.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2023 Lawrence Livermore National Security, LLC and other
2+
# HPCIC DevTools Developers. See the top-level COPYRIGHT file for details.
3+
#
4+
# SPDX-License-Identifier: (MIT)
5+
6+
# This is an example of a custom metrics file you can provide on the command line, e.g.,
7+
# flux-metrics-api start --custom-metric ./custom-metrics.py
8+
9+
# The default format for a custom metric is the following:
10+
11+
12+
def my_custom_metric_name(handle):
    """
    Example custom metric: the number of free cores.

    Every custom metric is called with the active flux handle as its
    only argument.

    - The function name becomes the metric name.
    - Anything the function needs must be imported inside of it.
    """
    # Imports are done here (and not at the top of the file) so that
    # they are in scope when the server calls the function.
    import flux.resource

    # Ask flux for the resource listing and report the free core count
    return flux.resource.list.resource_list(handle).get().free.ncores

flux_metrics_api/metrics.py

Lines changed: 109 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,13 @@
44
# SPDX-License-Identifier: (MIT)
55

66
import collections
7+
import importlib.util
8+
import inspect
9+
import os
10+
import shutil
11+
import sys
712

13+
import flux_metrics_api.utils as utils
814
from flux_metrics_api.logger import logger
915

1016
try:
@@ -57,29 +63,125 @@ def node_free_count():
5763
return len(listing.free.nodelist)
5864

5965

60-
def update_queue_metrics():
66+
def get_queue_metrics():
    """
    Get counts of jobs in the queue, keyed by lowercase state name

    See https://github.com/flux-framework/flux-core/blob/master/src/common/libjob/job.h#L45-L53
    for identifiers.

    Returns a dict with one entry for each of "new", "depend", "priority",
    "sched", "run", "cleanup" and "inactive"; states without jobs are 0.
    """
    jobs = flux.job.job_list(handle)
    listing = jobs.get()

    # Count the jobs found in each (integer) state
    counter = collections.Counter(job["state"] for job in listing["jobs"])

    # Every state we expose a metric for starts at zero, so callers can
    # always index the result by state name
    states = ("new", "depend", "priority", "sched", "run", "cleanup", "inactive")
    counts = dict.fromkeys(states, 0)

    for stateint, count in counter.items():
        # NOTE(review): statetostr appears to return upper case names
        # (e.g. "RUN") by default - confirm against the installed flux.
        # Normalizing to lowercase keeps the keys matching either way.
        state = flux.job.info.statetostr(stateint).lower()
        counts[state] = counts.get(state, 0) + count
    return counts
100+
101+
102+
# Queue states
103+
104+
105+
def job_queue_state_new_count():
    """
    Number of jobs in the queue in state "new"
    """
    queue_counts = get_queue_metrics()
    return queue_counts["new"]
107+
108+
109+
def job_queue_state_depend_count():
    """
    Number of jobs in the queue in state "depend"
    """
    queue_counts = get_queue_metrics()
    return queue_counts["depend"]
111+
112+
113+
def job_queue_state_priority_count():
    """
    Number of jobs in the queue in state "priority"
    """
    queue_counts = get_queue_metrics()
    return queue_counts["priority"]
115+
76116

117+
def job_queue_state_sched_count():
    """
    Number of jobs in the queue in state "sched"
    """
    queue_counts = get_queue_metrics()
    return queue_counts["sched"]
77119

78-
# Organize metrics by name so we can eventually support export of custom set (if needed)
120+
121+
def job_queue_state_run_count():
    """
    Number of jobs in the queue in state "run"
    """
    queue_counts = get_queue_metrics()
    return queue_counts["run"]
123+
124+
125+
def job_queue_state_cleanup_count():
    """
    Number of jobs in the queue in state "cleanup"
    """
    queue_counts = get_queue_metrics()
    return queue_counts["cleanup"]
127+
128+
129+
def job_queue_state_inactive_count():
    """
    Number of jobs in the queue in state "inactive"
    """
    queue_counts = get_queue_metrics()
    return queue_counts["inactive"]
131+
132+
133+
def add_custom_metrics(metric_file):
    """
    Add custom metrics to the server

    The file is copied to a temporary directory, imported as a module, and
    every function in it whose name does not start with an underscore is
    added to custom_metrics under the name of the function.

    Parameters:
      metric_file: path to a Python file with custom metric functions.

    Exits (sys.exit) if the file does not exist, or if a discovered
    function takes no arguments (it needs to accept the flux handle).
    """
    if not os.path.isfile(metric_file):
        sys.exit(f"Custom metrics file {metric_file} does not exist.")

    tmpdir = utils.get_tmpdir()
    try:
        # Copy our metrics file there and import it as a module
        custom_metrics_file = os.path.join(tmpdir, "custom_metrics.py")
        shutil.copyfile(metric_file, custom_metrics_file)
        spec = importlib.util.spec_from_file_location(
            "custom_metrics", custom_metrics_file
        )
        cm = importlib.util.module_from_spec(spec)

        # Register the module under the same name its functions report as
        # their __module__, so lookups through sys.modules can find it
        sys.modules[spec.name] = cm
        spec.loader.exec_module(cm)

        # Discover the names, and add the functions!
        for contender in dir(cm):
            if contender.startswith("_"):
                continue
            func = getattr(cm, contender)

            # We only care about functions
            if not inspect.isfunction(func):
                continue

            # Must have at least one argument (the handle)
            # We could be more strict here, but this is probably OK
            if not inspect.signature(func).parameters:
                sys.exit(f"{contender} is not a valid function - has no arguments")
            print(f"Adding custom function {contender} to metrics.")
            custom_metrics[contender] = func
    finally:
        # Cleanup, also when the import or the validation above fails
        shutil.rmtree(tmpdir)
167+
168+
169+
# Organize metrics by name
79170
metrics = {
171+
# Node resources
80172
"node_cores_free_count": node_core_free_count,
81173
"node_cores_up_count": node_core_up_count,
82174
"node_free_count": node_free_count,
83175
"node_up_count": node_up_count,
84-
# TODO add shared function to get queue stats
176+
# Queue states
177+
"job_queue_state_new_count": job_queue_state_new_count,
178+
"job_queue_state_depend_count": job_queue_state_depend_count,
179+
"job_queue_state_priority_count": job_queue_state_priority_count,
180+
"job_queue_state_sched_count": job_queue_state_sched_count,
181+
"job_queue_state_run_count": job_queue_state_run_count,
182+
"job_queue_state_cleanup_count": job_queue_state_cleanup_count,
183+
"job_queue_state_inactive_count": job_queue_state_inactive_count,
85184
}
185+
186+
# Custom metrics defined by the user (have the handle provided)
187+
custom_metrics = {}

flux_metrics_api/routes.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import flux_metrics_api.defaults as defaults
1313
import flux_metrics_api.types as types
1414
import flux_metrics_api.version as version
15-
from flux_metrics_api.metrics import metrics
15+
from flux_metrics_api.metrics import custom_metrics, handle, metrics
1616

1717
schemas = APISpecSchemaGenerator(
1818
APISpec(
@@ -55,7 +55,7 @@ def get_metric(request):
5555
# TODO we don't do anything with namespace currently, we assume we won't
5656
# be able to hit this if running in the wrong one
5757
# Unknown metric
58-
if metric_name not in metrics:
58+
if metric_name not in metrics and metric_name not in custom_metrics:
5959
print(f"Unknown metric requested {metric_name}")
6060
return JSONResponse(
6161
{"detail": "This metric is not known to the server."}, status_code=404
@@ -65,7 +65,10 @@ def get_metric(request):
6565
metric = types.new_identifier(metric_name)
6666

6767
# Get the value from Flux, assemble into listing
68-
value = metrics[metric_name]()
68+
if metric_name in custom_metrics:
69+
value = custom_metrics[metric_name](handle)
70+
else:
71+
value = metrics[metric_name]()
6972
metric_value = types.new_metric(metric, value=value)
7073

7174
# Give the endpoint for the service as metadata

flux_metrics_api/server.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import flux_metrics_api
1616
import flux_metrics_api.defaults as defaults
17+
import flux_metrics_api.metrics as metrics
1718
from flux_metrics_api.logger import setup_logger
1819
from flux_metrics_api.routes import routes
1920

@@ -73,6 +74,7 @@ def get_parser():
7374
start.add_argument(
7475
"--service-name", help="Service name the metrics service is running from"
7576
)
77+
start.add_argument("--custom-metric", help="A Python file with custom metrics")
7678
start.add_argument(
7779
"--api-path",
7880
dest="api_path",
@@ -111,6 +113,9 @@ def start(args):
111113
if args.ssl_certfile and not args.ssl_keyfile:
112114
sys.exit("A --ssl-certfile was provided without a --ssl-keyfile.")
113115

116+
# The user wants to add a file with custom metrics
117+
if args.custom_metric:
118+
metrics.add_custom_metrics(args.custom_metric)
114119
app = Starlette(debug=args.debug, routes=routes)
115120
uvicorn.run(
116121
app,

flux_metrics_api/utils.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
#
44
# SPDX-License-Identifier: (MIT)
55

6+
import os
7+
import tempfile
8+
from contextlib import contextmanager
9+
610

711
def read_file(path):
812
"""
@@ -11,3 +15,34 @@ def read_file(path):
1115
with open(path, "r") as fd:
1216
content = fd.read()
1317
return content
18+
19+
20+
@contextmanager
def workdir(dirname):
    """
    Temporarily change the working directory, e.g.,

    with workdir(name):
        # do stuff

    The directory we started in is restored when the context exits,
    including when the body raises.
    """
    previous = os.getcwd()
    os.chdir(dirname)
    try:
        yield
    finally:
        # Always go back to where we started
        os.chdir(previous)
34+
35+
36+
def get_tmpdir(tmpdir=None, prefix="", create=True):
    """
    Get a temporary directory for an operation.

    Parameters:
      tmpdir: parent directory for the new directory, defaults to
        tempfile.gettempdir()
      prefix: start of the directory name, defaults to
        "flux-metrics-api-tmp"; a "." and a random suffix are appended
      create: create the directory (default), otherwise only return a
        path that is not created

    Returns the path of the temporary directory.
    """
    import uuid

    parent = tmpdir or tempfile.gettempdir()
    prefix = prefix or "flux-metrics-api-tmp"

    if create:
        # mkdtemp chooses a unique name and creates the directory in one
        # step, so we need neither a private tempfile helper nor a
        # check-then-create that could race with another process
        return tempfile.mkdtemp(prefix=f"{prefix}.", dir=parent)

    # Only a candidate path was asked for, nothing is created
    return os.path.join(parent, f"{prefix}.{uuid.uuid4().hex[:8]}")

flux_metrics_api/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44
# SPDX-License-Identifier: (MIT)
55

6-
__version__ = "0.0.1"
6+
__version__ = "0.0.11"
77
AUTHOR = "Vanessa Sochat"
88
99
NAME = "flux-metrics-api"

0 commit comments

Comments
 (0)