-
Notifications
You must be signed in to change notification settings - Fork 294
[Application Runtime] Support setting probes for the application runtime sidecars #9043
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Application Runtime] Support setting probes for the application runtime sidecars #9043
Conversation
17e66cb to
a3f2f2f
Compare
TomerShor
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
An initial review
| _K8S_CLASSES = { | ||
| "V1HTTPGetAction": V1HTTPGetAction, | ||
| "V1ExecAction": V1ExecAction, | ||
| "V1TCPSocketAction": V1TCPSocketAction, | ||
| "V1GRPCAction": V1GRPCAction, | ||
| "V1Probe": V1Probe, | ||
| } | ||
| _SCHEMA_CACHE: dict[str, dict] = {} | ||
|
|
||
|
|
||
| def _get_schema(class_name: str) -> dict: | ||
| """Get and cache Kubernetes class schema.""" | ||
| if class_name in _SCHEMA_CACHE: | ||
| return _SCHEMA_CACHE[class_name] | ||
|
|
||
| k8s_class = _K8S_CLASSES.get(class_name) | ||
| if not k8s_class: | ||
| raise ValueError(f"Unknown Kubernetes class: {class_name}") | ||
|
|
||
| schema = getattr(k8s_class, "openapi_types", None) or getattr( | ||
| k8s_class, "swagger_types", None | ||
| ) | ||
| if not schema: | ||
| raise RuntimeError(f"No schema found for {class_name}") | ||
|
|
||
| _SCHEMA_CACHE[class_name] = schema | ||
| return schema | ||
|
|
||
|
|
||
| def _validate_schema( | ||
| data: dict, schema: dict, schema_class_name: str | ||
| ) -> tuple[bool, str]: | ||
| """Recursively validate dict against Kubernetes API schema.""" | ||
| _type_checkers = { | ||
| "str": lambda v: isinstance(v, str), | ||
| "int": lambda v: isinstance(v, int) | ||
| or (isinstance(v, float) and v.is_integer()), | ||
| "bool": lambda v: isinstance(v, bool), | ||
| "datetime": lambda v: isinstance(v, str | dict), | ||
| "date": lambda v: isinstance(v, str | dict), | ||
| "object": lambda v: isinstance(v, str | dict), | ||
| } | ||
|
|
||
| def _is_compatible(field_value: typing.Any, expected_field_type: str) -> bool: | ||
| if field_value is None: | ||
| return True | ||
| if expected_field_type in _type_checkers: | ||
| return _type_checkers[expected_field_type](field_value) | ||
| return False | ||
|
|
||
| for key, value in data.items(): | ||
| if key not in schema: | ||
| return False, f"{schema_class_name}: unknown field '{key}'" | ||
|
|
||
| expected_type = schema[key] | ||
|
|
||
| # Handle nested Kubernetes classes | ||
| if isinstance(expected_type, str) and expected_type in _K8S_CLASSES: | ||
| if value is None: | ||
| continue | ||
| if not isinstance(value, dict): | ||
| return ( | ||
| False, | ||
| f"{schema_class_name}.{key}: expected dict, got {type(value).__name__}", | ||
| ) | ||
| try: | ||
| nested_schema = _get_schema(expected_type) | ||
| is_valid, error = _validate_schema(value, nested_schema, expected_type) | ||
| if not is_valid: | ||
| return False, error | ||
| except (RuntimeError, ValueError) as e: | ||
| return False, f"{schema_class_name}.{key}: {str(e)}" | ||
|
|
||
| elif not _is_compatible(value, expected_type): | ||
| return ( | ||
| False, | ||
| f"{schema_class_name}.{key}: invalid type {type(value).__name__}", | ||
| ) | ||
|
|
||
| return True, "" | ||
|
|
||
|
|
||
| def _validate_http_probe( | ||
| http_get: dict, probe_type: str, sidecar_name: str | ||
| ) -> str | None: | ||
| """Validate HTTP probe configuration. | ||
| Only path and port are mandatory if httpGet is configured. | ||
| :param http_get: The httpGet configuration dict from the probe | ||
| :param probe_type: The type of probe (readinessProbe, livenessProbe, startupProbe) | ||
| :param sidecar_name: The name of the sidecar container | ||
| :return: Error message string if validation fails, None if validation passes | ||
| """ | ||
| if not isinstance(http_get, dict): | ||
| return f"{probe_type} in sidecar '{sidecar_name}': httpGet must be a dict" | ||
| if not http_get.get("path"): | ||
| return f"{probe_type} in sidecar '{sidecar_name}': httpGet.path is required" | ||
| if not http_get.get("port"): | ||
| return f"{probe_type} in sidecar '{sidecar_name}': httpGet.port is required" | ||
| if not isinstance(http_get.get("port"), int | str): | ||
| return ( | ||
| f"{probe_type} in sidecar '{sidecar_name}': httpGet.port must be int or str" | ||
| ) | ||
| scheme = http_get.get("scheme") | ||
| if scheme and scheme not in ["HTTP", "HTTPS"]: | ||
| return f"{probe_type} in sidecar '{sidecar_name}': httpGet.scheme must be HTTP or HTTPS" | ||
| return None | ||
|
|
||
|
|
||
| def _validate_sidecar_probes(sidecars: list[dict]) -> None: | ||
| """Validate probe configurations in sidecars against Kubernetes V1Probe schema.""" | ||
| # Minimum values are based on Kubernetes probe configuration documentation: | ||
| # https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#configure-probes | ||
| timing_fields = { | ||
| PROBE_TIMING_INITIAL_DELAY_SECONDS: ( | ||
| lambda v: isinstance(v, int) and v >= 0, | ||
| "non-negative integer (minimum 0)", | ||
| ), | ||
| PROBE_TIMING_PERIOD_SECONDS: ( | ||
| lambda v: isinstance(v, int) and v >= 1, | ||
| "integer >= 1 (minimum 1)", | ||
| ), | ||
| PROBE_TIMING_TIMEOUT_SECONDS: ( | ||
| lambda v: isinstance(v, int) and v >= 1, | ||
| "integer >= 1 (minimum 1)", | ||
| ), | ||
| PROBE_TIMING_FAILURE_THRESHOLD: ( | ||
| lambda v: isinstance(v, int) and v >= 1, | ||
| "integer >= 1 (minimum 1)", | ||
| ), | ||
| PROBE_TIMING_SUCCESS_THRESHOLD: ( | ||
| lambda v: isinstance(v, int) and v >= 1, | ||
| "integer >= 1 (minimum 1)", | ||
| ), | ||
| } | ||
|
|
||
| try: | ||
| probe_schema = _get_schema("V1Probe") | ||
| except (RuntimeError, ValueError) as e: | ||
| raise mlrun.errors.MLRunBadRequestError(f"Failed to load probe schema: {e}") | ||
|
|
||
| for sidecar in sidecars: | ||
| sidecar_name = sidecar.get("name", "unknown") | ||
|
|
||
| for probe_type in PROBE_KEYS: | ||
| probe = sidecar.get(probe_type) | ||
| if not probe: | ||
| continue | ||
|
|
||
| # Schema validation | ||
| is_valid, error = _validate_schema(probe, probe_schema, "V1Probe") | ||
| if not is_valid: | ||
| raise mlrun.errors.MLRunBadRequestError( | ||
| f"Invalid {probe_type} in sidecar '{sidecar_name}': {error}" | ||
| ) | ||
|
|
||
| # HTTP probe validation - only path and port are mandatory if httpGet is configured | ||
| http_get = probe.get("httpGet") | ||
| if http_get: | ||
| error = _validate_http_probe(http_get, probe_type, sidecar_name) | ||
| if error: | ||
| raise mlrun.errors.MLRunBadRequestError(error) | ||
|
|
||
| # Timing parameters validation | ||
| for field, (validator, desc) in timing_fields.items(): | ||
| if field in probe and not validator(probe[field]): | ||
| raise mlrun.errors.MLRunBadRequestError( | ||
| f"{probe_type} in sidecar '{sidecar_name}': {field} must be a {desc} integer" | ||
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- Private functions should be below Public functions
- This entire section shouldn't be in this file at all. While relevant for nuclio, it is a very kubernetes thing, so I would put it in
mlrun/k8s_utils.py.
Generally, the endpoints package should be more of a high level transport layer. It should call the validations but not necessarily implement all of them.
I see that this file doesn't fully adhere to this convention, but it doesn't mean we should continue this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also - why is this needed? was this a requirement?
When gets the request from MLRun, it unmarshals the probe configuration into the Probe Go type, so if it doesn't adhere to it - the request will fail.
So are do we really need to do it in MLRun first?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@TomerShor After re-reading the requirements, I realized I missed an important part of the validation definition:
plus provide the user with an option to supply a dictionary with any parameter that is part of the probe configuration (in which case we will not validate the dictionary and it’s up to the user to make sure the parameters are correct).
Given this, the validation I added is not needed on the MLRun server side, so I’m going to remove it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed here - CR comments 1 - Avoid non required validations
870c3d6 to
fb81c79
Compare
|
Check out this pull request on See visual diffs & provide feedback on Jupyter Notebooks. Powered by ReviewNB |
b05db2a to
fac0095
Compare
rokatyy
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
some inline comments
| try: | ||
| type = ProbeType(type.value if hasattr(type, "value") else type) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can also be done like here:
mlrun/mlrun/common/schemas/partition.py
Lines 25 to 27 in 7728707
| @classmethod | |
| def is_valid(cls, value: str) -> bool: | |
| return value in cls._value2member_map_ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed here - CR 1- fixing Kate comments
| if not isinstance(type, ProbeType): | ||
| try: | ||
| type = ProbeType(type.value if hasattr(type, "value") else type) | ||
| except (ValueError, AttributeError): | ||
| raise ValueError( | ||
| f"Invalid probe type: {type}. Must be one of: {[p.value for p in ProbeType]}" | ||
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed here - CR 1- fixing Kate comments
| value is used, even if it was modified after set_probe() was called | ||
| """ | ||
| internal_port = None | ||
| if hasattr(self.spec, "internal_application_port"): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it is defined as a property
mlrun/mlrun/runtimes/nuclio/application/application.py
Lines 196 to 198 in 435559a
| @property | |
| def internal_application_port(self): | |
| return self._internal_application_port |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed here - CR 1- fixing Kate comments
| http_get["port"] = internal_port | ||
| sidecar[probe_key] = probe_config |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add a debug log with an enriched port value
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed here - CR 1- fixing Kate comments
| raise ValueError( | ||
| f"Cannot enrich {probe_type.value} probe: HTTP probe requires a port, " | ||
| "but internal_application_port is not set. " | ||
| "Please set the internal_application_port or provide http_port in set_probe()." |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just fyi - technically it should not be none, because we always enrich it here
mlrun/mlrun/runtimes/nuclio/application/application.py
Lines 162 to 165 in a3f2f2f
| self.internal_application_port = ( | |
| internal_application_port | |
| or mlrun.mlconf.function.application.default_sidecar_internal_port | |
| ) |
_internal_application_port to None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@rokatyy Is it possible that mlrun.mlconf.function.application.default_sidecar_internal_port is None?
If not than no problem I will remove this check
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@weilerN not really
| # Override timing parameters from explicit arguments | ||
| if initial_delay_seconds is not None: | ||
| probe_config[ProbeTimeConfig.INITIAL_DELAY_SECONDS.value] = ( | ||
| initial_delay_seconds | ||
| ) | ||
| if period_seconds is not None: | ||
| probe_config[ProbeTimeConfig.PERIOD_SECONDS.value] = period_seconds | ||
| if failure_threshold is not None: | ||
| probe_config[ProbeTimeConfig.FAILURE_THRESHOLD.value] = failure_threshold | ||
| if timeout_seconds is not None: | ||
| probe_config[ProbeTimeConfig.TIMEOUT_SECONDS.value] = timeout_seconds |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Override timing parameters from explicit arguments | |
| if initial_delay_seconds is not None: | |
| probe_config[ProbeTimeConfig.INITIAL_DELAY_SECONDS.value] = ( | |
| initial_delay_seconds | |
| ) | |
| if period_seconds is not None: | |
| probe_config[ProbeTimeConfig.PERIOD_SECONDS.value] = period_seconds | |
| if failure_threshold is not None: | |
| probe_config[ProbeTimeConfig.FAILURE_THRESHOLD.value] = failure_threshold | |
| if timeout_seconds is not None: | |
| probe_config[ProbeTimeConfig.TIMEOUT_SECONDS.value] = timeout_seconds | |
| probe_config.update({ | |
| config.value: value | |
| for config, value in { | |
| ProbeTimeConfig.INITIAL_DELAY_SECONDS: initial_delay_seconds, | |
| ProbeTimeConfig.PERIOD_SECONDS: period_seconds, | |
| ProbeTimeConfig.FAILURE_THRESHOLD: failure_threshold, | |
| ProbeTimeConfig.TIMEOUT_SECONDS: timeout_seconds, | |
| }.items() | |
| if value is not None | |
| }) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed here - CR 1- fixing Kate comments
daf226c to
2d5ccf0
Compare
2bbacfc to
a92461e
Compare
| "\n", | ||
| "### Setting probes\n", | ||
| "\n", | ||
| "The `set_probe()` method allows you to configure probes.\n" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a note that this overrides any existing probe configuration
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mlrun/common/runtimes/constants.py
Outdated
| auth_session = f"{_env_prefix}AUTH_SESSION" | ||
|
|
||
|
|
||
| # Kubernetes probe type keys |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Kubernetes probe type keys | |
| # Kubernetes probe types |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Store probe configuration in the sidecar | ||
| sidecar = self._set_sidecar(self._get_sidecar_name()) | ||
| sidecar[type.key] = probe_config |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What if I don't pass ANY argument? all default to None, so will this basically delete the probe configuration, or should we deny this usage?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- If no arguments are provided (i.e.
application.set_probes()), the execution will fail becausetypeis a mandatory parameter. - Good question. I’ll raise this with Product. Currently, calling
application.set_probes(type="readiness")creates a probes sidecar with empty values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed here - CR 3 - validate no empty request in client side
TomerShor
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM !
great work
rokatyy
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
83159bb to
6ceef05
Compare
|
@weilerN pull latest development, the CI failures should be fixed |
d041bb1 to
e4f83a4
Compare
e4f83a4 to
772dce5
Compare
### 📝 Description This change adds backend validation for sidecar probe configurations during Nuclio function deployment. Before deploying the function, the API now validates that each configured sidecar probe (`readinessProbe`, `livenessProbe`, `startupProbe`) defines exactly one Kubernetes health-check mechanism (`httpGet`, `exec`, `tcpSocket`, or `grpc`). Configurations that are missing a health check or define multiple checks are rejected with a clear 400 Bad Request error. This validation is intentionally minimal and aligned with the Kubernetes V1Probe schema, ensuring obvious misconfigurations are caught early while avoiding deep field-level validation. --- ### 🛠️ Changes Made - Added backend validation for sidecar probes before deployment. --- ### ✅ Checklist - [ ] I updated the documentation (if applicable) - [x] I have tested the changes in this PR - [x] I confirmed whether my changes are covered by system tests - [x] If yes, I ran all relevant system tests and ensured they passed before submitting this PR - [ ] I updated existing system tests and/or added new ones if needed to cover my changes - [ ] If I introduced a deprecation: - [ ] I followed the [Deprecation Guidelines](./DEPRECATION.md) - [ ] I updated the relevant Jira ticket for documentation --- ### 🧪 Testing - Dev test: tested both negative and positive flows - Added unit tests --- ### 🔗 References - Ticket link: https://iguazio.atlassian.net/browse/ML-11788 - Design docs links: - External links: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#configure-probes --- ### 🚨 Breaking Changes? - [ ] Yes (explain below) - [x] No <!-- If yes, describe what needs to be changed downstream: --> --- ### 🔍️ Additional Notes - This PR follows up on #9043
📝 Description
Add first-class support for configuring Kubernetes probes (readiness, liveness, startup) on the ApplicationRuntime sidecar.
This includes a clear API for setting and deleting probes, validation of probe types, and automatic enrichment of HTTP probe ports from the internal application port at deploy time.
🛠️ Changes Made
ProbeTypeandProbeTimeConfigenums for consistent probe configuration keys.set_probe()anddelete_probe()APIs toApplicationRuntimefor managing sidecar probes.internal_application_portjust before deployment.✅ Checklist
🧪 Testing
Set a
sleepas part of healthcheck function:created application runtime and deployment failed (unhealthy):
Warning Unhealthy 3m1s (x867 over 138m) kubelet Readiness probe failed: Get "http://10.233.67.57:5000/internal": context deadline exceeded (Client.Timeout exceeded while awaiting headers)🔗 References
🚨 Breaking Changes?
🔍️ Additional Notes