Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<div align="center">
<h1 align="center">Kubernetes Resource Recommendations Based on Historical Data</h1>
<h2 align="center">Get recommendations based on your existing data in Prometheus/Coralogix/Thanos/Mimir and more!</h2>
<p align="center">
<p align="center">
<a href="#installation"><strong>Installation</strong></a>
.
<a href="#how-krr-works"><strong>How KRR works</strong></a>
Expand Down Expand Up @@ -206,7 +206,7 @@ Apart from running KRR as a CLI tool you can also run KRR inside your cluster. W

<img src="./images/ui_recommendation.png">

You can also run KRR in-cluster as a Kubernetes Job, if you don't want to view results easily in a <a href="https://platform.robusta.dev/signup/?benefits=krr&utm_source=github&utm_medium=krr-readme&utm_content=in-cluster-ui">UI</a>.
You can also run KRR in-cluster as a Kubernetes Job, if you don't want to view results easily in a <a href="https://platform.robusta.dev/signup/?benefits=krr&utm_source=github&utm_medium=krr-readme&utm_content=in-cluster-ui">UI</a>.

```
kubectl apply -f https://raw.githubusercontent.com/robusta-dev/krr/refs/heads/main/docs/krr-in-cluster/krr-in-cluster-job.yaml
Expand Down Expand Up @@ -405,6 +405,22 @@ Refer to `krr simple --help`, and look at the flags `--prometheus-url`, `--prome
If you need help, contact us on Slack, email, or by opening a GitHub issue.
</details>

<details>
<summary>VCluster</summary>

KRR supports VCluster when Prometheus runs outside of the VCluster (on the physical cluster or in a centralized setup). Because VCluster renames pods on the physical cluster, you need to provide:

- `vcluster-namespace`: The namespace on the physical cluster where the VCluster is deployed
- `vcluster-name`: The name of your VCluster (set during VCluster deployment)

Other parameters, such as the namespace selector and pod selector, work as expected.

```sh
krr simple --vcluster-name my-vcluster-name --vcluster-namespace my-vcluster-namespace
```

</details>

<details>
<summary>Debug mode</summary>
If you want to see additional debug logs:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ tzlocal==5.2 ; python_version >= "3.9" and python_full_version < "3.13"
urllib3==1.26.19 ; python_version >= "3.9" and python_full_version < "3.13"
websocket-client==1.7.0 ; python_version >= "3.9" and python_full_version < "3.13"
zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13"
tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
41 changes: 41 additions & 0 deletions robusta_krr/core/integrations/prometheus/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import asyncio
import datetime
import enum
import hashlib
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
from typing import Any, Optional, TypedDict
Expand Down Expand Up @@ -259,3 +260,43 @@ def combine_batches(self, results: list[PodsTimeData]) -> PodsTimeData:
"""

return reduce(lambda x, y: x | y, results, {})

## Vcluster
def get_vcluster_pod_real_name(self, pod_name: str, pod_namespace: str) -> str:
    """
    Returns the pod name on the (host) cluster, which is different from the pod name in the VCluster.
    When not in a VCluster (``settings.vcluster_name`` is None), just returns the pod name as is.

    The host name is built as ``<pod_name>-x-<pod_namespace>-x-<vcluster_name>``. When that
    string is longer than 63 characters it is shortened to its first 52 characters, a ``-``,
    and the first 10 hex digits of the SHA-256 of the full (untruncated) string.

    Args:
        pod_name (string): The pod name in the cluster krr connected to
        pod_namespace (string): The pod namespace in the cluster krr connected to

    Returns:
        string: the pod name in the host cluster.
    """

    # Not running against a VCluster: the name Prometheus sees is the name krr sees.
    if settings.vcluster_name is None:
        return pod_name

    host_pod_name = f"{pod_name}-x-{pod_namespace}-x-{settings.vcluster_name}"

    # 63 is presumably the Kubernetes name-length limit that vcluster enforces -- TODO confirm.
    if len(host_pod_name) <= 63:
        return host_pod_name

    # The digest is taken over the full name, before truncation.
    digest = hashlib.sha256(host_pod_name.encode()).hexdigest()
    shortened = f"{host_pod_name[:52]}-{digest[:10]}"

    # vcluster's own name shortening collapses ".-" into "-" so that a truncation landing
    # right after a dot does not leave ".-" in the name; mirror it, otherwise the computed
    # name would not match the pod that actually exists on the host cluster.
    return shortened.replace(".-", "-")

def get_pod_namespace(self, pod_namespace: str) -> str:
    """
    Resolve the namespace to use when querying metrics for a pod.

    If ``settings.vcluster_namespace`` is set, that value is returned and the
    ``pod_namespace`` argument is ignored; otherwise ``pod_namespace`` is returned unchanged.

    Args:
        pod_namespace (string): The pod namespace in the cluster krr connected to

    Returns:
        string: the pod namespace in the host cluster.
    """

    host_namespace = settings.vcluster_namespace
    return pod_namespace if host_namespace is None else host_namespace
50 changes: 31 additions & 19 deletions robusta_krr/core/integrations/prometheus/metrics/cpu.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from robusta_krr.core.models.objects import K8sObjectData

from .base import PrometheusMetric, QueryType
import logging


logger = logging.getLogger("krr")

class CPULoader(PrometheusMetric):
"""
A metric loader for loading CPU usage metrics.
Expand All @@ -11,20 +13,24 @@ class CPULoader(PrometheusMetric):
query_type: QueryType = QueryType.QueryRange

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max(
rate(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}[{step}]
)
) by (container, pod, job)
"""
rate(
container_cpu_usage_seconds_total{{
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}[{step}]
)
) by (container, pod, job)
"""
logger.debug(f"{prom_query}")

return prom_query
Comment on lines +16 to +33
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Escape pod names for regex and anchor matches; minor logging nit

Unescaped pod names in a regex matcher can overmatch (e.g., dots in names). Anchor the regex to avoid partial matches and escape pod names. Also, avoid unnecessary f-string in logger.debug.

Apply this diff:

-from .base import PrometheusMetric, QueryType
+from .base import PrometheusMetric, QueryType
 import logging
+import re
@@
-        pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+        pods_selector = "|".join(
+            re.escape(self.get_vcluster_pod_real_name(pod.name, object.namespace))
+            for pod in object.pods
+        )
+        pods_selector = f"^({pods_selector})$" if pods_selector else r"^$"
@@
-                            pod=~"{pods_selector}",
+                            pod=~"{pods_selector}",
@@
-        logger.debug(f"{prom_query}")
+        logger.debug(prom_query)
@@
-            pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+            pods_selector = "|".join(
+                re.escape(self.get_vcluster_pod_real_name(pod.name, object.namespace))
+                for pod in object.pods
+            )
+            pods_selector = f"^({pods_selector})$" if pods_selector else r"^$"
@@
-                                pod=~"{pods_selector}",
+                                pod=~"{pods_selector}",
@@
-            logger.debug(f"{prom_query}")
+            logger.debug(prom_query)
@@
-        pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
+        pods_selector = "|".join(
+            re.escape(self.get_vcluster_pod_real_name(pod.name, object.namespace))
+            for pod in object.pods
+        )
+        pods_selector = f"^({pods_selector})$" if pods_selector else r"^$"
@@
-                        pod=~"{pods_selector}",
+                        pod=~"{pods_selector}",
@@
-        logger.debug(f"{prom_query}")
+        logger.debug(prom_query)

Notes:

  • Using re.escape ensures literal matching of pod names in the regex.
  • Anchoring with ^(...)$ prevents partial matches.
  • When no pods exist, '^$' ensures no match rather than a potentially surprising catch-all.

Also applies to: 46-67, 77-95



def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:
Expand All @@ -37,15 +43,16 @@ def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:

class PercentileCPULoader(PrometheusMetric):
def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
quantile_over_time(
{round(percentile / 100, 2)},
max(
rate(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
Expand All @@ -55,6 +62,8 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query

return PercentileCPULoader

Expand All @@ -65,13 +74,14 @@ class CPUAmountLoader(PrometheusMetric):
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
count_over_time(
max(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
Expand All @@ -80,3 +90,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query
43 changes: 28 additions & 15 deletions robusta_krr/core/integrations/prometheus/metrics/memory.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from robusta_krr.core.models.objects import K8sObjectData

from .base import PrometheusMetric, QueryType
import logging

logger = logging.getLogger("krr")

class MemoryLoader(PrometheusMetric):
"""
Expand All @@ -11,18 +13,21 @@ class MemoryLoader(PrometheusMetric):
query_type: QueryType = QueryType.QueryRange

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
}}
) by (container, pod, job)
"""
logger.debug(f"{prom_query}")
return prom_query


class MaxMemoryLoader(PrometheusMetric):
Expand All @@ -31,13 +36,14 @@ class MaxMemoryLoader(PrometheusMetric):
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max_over_time(
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
Expand All @@ -46,21 +52,23 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""

logger.debug(f"{prom_query}")
return prom_query

class MemoryAmountLoader(PrometheusMetric):
"""
A metric loader for loading memory points count.
"""

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
count_over_time(
max(
container_memory_working_set_bytes{{
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
Expand All @@ -69,7 +77,9 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""

logger.debug(f"{prom_query}")
return prom_query

# TODO: Need to battle test if this one is correct.
class MaxOOMKilledMemoryLoader(PrometheusMetric):
"""
Expand All @@ -79,15 +89,16 @@ class MaxOOMKilledMemoryLoader(PrometheusMetric):
warning_on_no_data = False

def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
pods_selector = "|".join(self.get_vcluster_pod_real_name(pod.name, object.namespace) for pod in object.pods)
pods_namespace = self.get_pod_namespace(object.namespace)
cluster_label = self.get_prometheus_cluster_label()
return f"""
prom_query = f"""
max_over_time(
max(
max(
kube_pod_container_resource_limits{{
resource="memory",
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
Expand All @@ -97,7 +108,7 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
max(
kube_pod_container_status_last_terminated_reason{{
reason="OOMKilled",
namespace="{object.namespace}",
namespace="{pods_namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
Expand All @@ -107,3 +118,5 @@ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
[{duration}:{step}]
)
"""
logger.debug(f"{prom_query}")
return prom_query
4 changes: 4 additions & 0 deletions robusta_krr/core/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ class Config(pd.BaseSettings):
inside_cluster: bool = False
_logging_console: Optional[Console] = pd.PrivateAttr(None)

# vcluster settings
vcluster_name: Optional[str] = pd.Field(None)
vcluster_namespace: Optional[str] = pd.Field(None)

def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)

Expand Down
16 changes: 15 additions & 1 deletion robusta_krr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,18 @@ def run_strategy(
help="A list of sinks to send the scan to",
rich_help_panel="Publish Scan Settings",
),
vcluster_namespace: str = typer.Option(
None,
"--vcluster-namespace",
help="The vcluster namespace on physical cluster",
rich_help_panel="VCluster Settings",
),
vcluster_name: str = typer.Option(
None,
"--vcluster-name",
help="The vcluster name on physical cluster",
rich_help_panel="VCluster Settings",
),
Comment on lines +323 to +334
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Make vcluster CLI options Optional[str] and clarify help about paired usage

Both options default to None but are annotated as str. Align to Optional[str] and clarify help that both must be provided together when using VCluster to avoid misconfiguration.

Apply this diff:

-                vcluster_namespace: str = typer.Option(
+                vcluster_namespace: Optional[str] = typer.Option(
                     None,
                     "--vcluster-namespace",
-                    help="The vcluster namespace on physical cluster",
+                    help="The vcluster namespace on the physical cluster (required when using --vcluster-name).",
                     rich_help_panel="VCluster Settings",
                 ),
-                vcluster_name: str = typer.Option(
+                vcluster_name: Optional[str] = typer.Option(
                     None,
                     "--vcluster-name",
-                    help="The vcluster name on physical cluster",
+                    help="The vcluster name on the physical cluster (required when using --vcluster-namespace).",
                     rich_help_panel="VCluster Settings",
                 ),

As a follow-up, add a paired-args validation right before constructing Config to prevent partial configuration:

# Enforce that either both vcluster_* are provided or neither
if bool(vcluster_name) ^ bool(vcluster_namespace):
    raise click.BadOptionUsage(
        "--vcluster-name/--vcluster-namespace",
        "Both --vcluster-name and --vcluster-namespace must be provided together when using VCluster."
    )
🤖 Prompt for AI Agents
robusta_krr/main.py around lines 323 to 334, the CLI options vcluster_namespace
and vcluster_name are typed as str but default to None and lack clear paired-use
help; change their type annotations to Optional[str] and update their help text
to state that both must be provided together when using VCluster, and add a
validation immediately before constructing Config that checks for a partial set
(XOR) of these two values and raises click.BadOptionUsage with a message that
both --vcluster-name and --vcluster-namespace must be provided together.

**strategy_args,
) -> None:
f"""Run KRR using the `{_strategy_name}` strategy"""
Expand Down Expand Up @@ -373,7 +385,9 @@ def run_strategy(
start_time=start_time,
scan_id=scan_id,
named_sinks=named_sinks,
)
vcluster_namespace=vcluster_namespace,
vcluster_name=vcluster_name,
)
Config.set_config(config)
except ValidationError as e:
logger.exception("Error occured while parsing arguments")
Expand Down