[Workflows] Auth secret token mounting for Workflows (#9151)

elbamit · web-flow · commit 66d39c52d845 · 2026-01-05T11:51:02.000+02:00
### 📝 Description   Add support for IG4 (Iguazio v4) authentication on workflows by mounting the auth token secret on the Argo pods. This PR moves the core logic of `enrich_and_validate_auth_token_name` out of the launcher to a more common place so that it can also be used by workflows, which do not go through the launcher/runtime handler. --- ### 🛠️ Changes Made  - Move the core logic of `enrich_and_validate_auth_token_name` from the launcher to `mlrun.auth.utils` - Create a helper function, `resolve_auth_token_secret_name`, for pipelines; it resolves the token name and then extracts the secret name. - Rename `replace_kfp_plaintext_secret_env_vars_with_secret_refs` to `process_kfp_workflow_secret_references` and add the `auth_secret_name` param, so that the secret gets mounted on the Argo pods during `_enrich_kfp_workflow_yaml_credentials` --- ### ✅ Checklist - [ ] I updated the documentation (if applicable) - [x] I have tested the changes in this PR - [ ] I confirmed whether my changes are covered by system tests - [ ] If yes, I ran all relevant system tests and ensured they passed before submitting this PR - [ ] I updated existing system tests and/or added new ones if needed to cover my changes - [ ] If I introduced a deprecation: - [ ] I followed the [Deprecation Guidelines](./DEPRECATION.md) - [ ] I updated the relevant Jira ticket for documentation --- ### 🧪 Testing   Unit tests - `test_resolve_auth_secret_name` - `test_enrich_and_validate_auth_token_name` --- ### 🔗 References - Ticket link: https://iguazio.atlassian.net/browse/ML-11588 - Design docs links: - External links: --- ### 🚨 Breaking Changes? - [ ] Yes (explain below) - [ ] No  --- ### 🔍️ Additional Notes
diff --git a/mlrun/auth/utils.py b/mlrun/auth/utils.py
@@ -17,6 +17,7 @@
 
 import yaml
 
+import mlrun.common.constants
 import mlrun.common.schemas
 import mlrun.utils.helpers
 from mlrun.config import config as mlconf
diff --git a/mlrun/common/constants.py b/mlrun/common/constants.py
@@ -40,7 +40,7 @@
 
 MLRUN_JOB_AUTH_SECRET_PATH = "/var/mlrun-secrets/auth"
 MLRUN_JOB_AUTH_SECRET_FILE = ".igz.yml"
-MLRUN_JOB_AUTH_DEFAULT_TOKEN_NAME = "default"
+MLRUN_RUNTIME_AUTH_DEFAULT_TOKEN_NAME = "default"
 
 
 class MLRunInternalLabels:
diff --git a/pipeline-adapters/mlrun-pipelines-kfp-common/pyproject.toml b/pipeline-adapters/mlrun-pipelines-kfp-common/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mlrun-pipelines-kfp-common"
-version = "0.6.0"
+version = "0.6.1"
 description = "MLRun Pipelines adapter package for providing KFP common functionality"
 readme = "README.md"
 requires-python = ">=3.11, <3.12"
diff --git a/pipeline-adapters/mlrun-pipelines-kfp-common/src/mlrun_pipelines/common/ops.py b/pipeline-adapters/mlrun-pipelines-kfp-common/src/mlrun_pipelines/common/ops.py
@@ -730,11 +730,12 @@ def _enrich_gpu_limits(function, task):
         task.container.add_resource_limit(resource_name, resource_value)
 
 
-def replace_kfp_plaintext_secret_env_vars_with_secret_refs(
+def process_kfp_workflow_secret_references(
     byte_buffer: bytes,
     content_type: str,
     env_var_names: list[str],
     secrets_store: "SecretsStore",
+    auth_secret_name: typing.Optional[str] = None,
 ) -> bytes:
     if content_type.endswith(
         "zip"
@@ -744,13 +745,15 @@ def replace_kfp_plaintext_secret_env_vars_with_secret_refs(
             byte_buffer=byte_buffer,
             env_var_names=env_var_names,
             secrets_store=secrets_store,
+            auth_secret_name=auth_secret_name,
         )
         return modified_zip_bytes
     elif content_type.endswith(("yaml", "plain")):
         modified_yaml_bytes = _enrich_kfp_workflow_yaml_credentials(
             yaml_bytes=byte_buffer,
             env_var_names=env_var_names,
             secrets_store=secrets_store,
+            auth_secret_name=auth_secret_name,
         )
         return modified_yaml_bytes
     else:
@@ -761,11 +764,12 @@ def _enrich_kfp_workflow_credentials_in_subprocess(
     byte_buffer: bytes,
     env_var_names: list[str],
     secrets_store: "SecretsStore",
+    auth_secret_name: typing.Optional[str] = None,
 ) -> bytes:
     queue = multiprocessing.Queue()
     process = multiprocessing.Process(
         target=_enrich_wrapper,
-        args=(queue, byte_buffer, env_var_names, secrets_store),
+        args=(queue, byte_buffer, env_var_names, secrets_store, auth_secret_name),
     )
     process.start()
     result = queue.get()
@@ -778,11 +782,13 @@ def _enrich_wrapper(
     byte_buffer: bytes,
     env_var_names: list[str],
     secrets_store: "SecretsStore",
+    auth_secret_name: typing.Optional[str] = None,
 ):
     result = _enrich_kfp_workflow_zip_credentials(
         byte_buffer=byte_buffer,
         env_var_names=env_var_names,
         secrets_store=secrets_store,
+        auth_secret_name=auth_secret_name,
     )
     queue.put(result)
 
@@ -791,6 +797,7 @@ def _enrich_kfp_workflow_zip_credentials(
     byte_buffer: bytes,
     env_var_names: list[str],
     secrets_store: "SecretsStore",
+    auth_secret_name: typing.Optional[str] = None,
 ) -> bytes:
     in_memory_zip = io.BytesIO(byte_buffer)
     with zipfile.ZipFile(in_memory_zip, "r") as zip_read:
@@ -806,6 +813,7 @@ def _enrich_kfp_workflow_zip_credentials(
                 yaml_bytes=file_data,
                 env_var_names=env_var_names,
                 secrets_store=secrets_store,
+                auth_secret_name=auth_secret_name,
             )
             files_data[file_name] = modified_yaml
 
@@ -821,13 +829,16 @@ def _enrich_kfp_workflow_yaml_credentials(
     yaml_bytes: bytes,
     env_var_names: list[str],
     secrets_store: "SecretsStore",
+    auth_secret_name: typing.Optional[str] = None,
 ) -> bytes:
     """
     Modifies the given workflow YAML to add secret environment variables to container specifications.
     The function checks if the workflow uses Argo Workflows or Tekton Pipelines and injects the
     environment variables accordingly.
     """
     workflow_dict = yaml.safe_load(yaml_bytes)
+    workflow_dict = add_auth_mount_to_argo_pods(workflow_dict, auth_secret_name)
+
     # Determine the KFP version by checking the 'apiVersion' field
     api_version = (
         workflow_dict.get("api_version") or workflow_dict.get("apiVersion", "").lower()
@@ -867,6 +878,41 @@ def _enrich_kfp_workflow_yaml_credentials(
         )
 
 
+def add_auth_mount_to_argo_pods(
+    workflow_dict: dict, auth_secret_name: typing.Optional[str] = None
+) -> dict:
+    if auth_secret_name:
+        volume = {
+            "name": "secret",
+            "secret": {
+                "secretName": auth_secret_name,
+                "items": [
+                    {
+                        "key": "tokensFile",
+                        "path": mlrun.common.constants.MLRUN_JOB_AUTH_SECRET_FILE,
+                    }
+                ],
+            },
+        }
+        volume_mount = {
+            "name": "secret",
+            "mountPath": mlrun.common.constants.MLRUN_JOB_AUTH_SECRET_PATH,
+        }
+
+        for template in workflow_dict["spec"]["templates"]:
+            # Skip DAG-only templates
+            if "container" not in template:
+                continue
+
+            # Add volumes to the template
+            template.setdefault("volumes", []).append(volume)
+
+            # Add volumeMounts to the container
+            template["container"].setdefault("volumeMounts", []).append(volume_mount)
+
+    return workflow_dict
+
+
 def _replace_secret_envs_in_argocd_template(
     env_var_names: list[str],
     container: dict,
diff --git a/server/py/services/api/api/endpoints/pipelines.py b/server/py/services/api/api/endpoints/pipelines.py
@@ -487,6 +487,7 @@ async def _create_pipeline(
         content_type,
         data,
         arguments,
+        auth_info,
     )
 
     return {
diff --git a/server/py/services/api/crud/pipelines.py b/server/py/services/api/crud/pipelines.py
@@ -25,6 +25,7 @@
 import sqlalchemy.orm
 
 import mlrun
+import mlrun.auth.utils
 import mlrun.common.constants as mlrun_constants
 import mlrun.common.formatters
 import mlrun.common.helpers
@@ -47,9 +48,10 @@
 
 import framework.api.utils
 import framework.utils.singletons.db
+import framework.utils.singletons.k8s
 import services.api.crud
+import services.api.utils.helpers
 from services.api.crud.workflows import RerunRunner
-from services.api.utils.helpers import resolve_client_default_kfp_image
 
 
 class Pipelines(
@@ -432,7 +434,7 @@ def rerun_pipeline_via_runner(
                                    - status:  `"running"`
                                    - run_id:  the new MLRun-run UID for the RerunRunner job
         """
-        client_image = resolve_client_default_kfp_image(
+        client_image = services.api.utils.helpers.resolve_client_default_kfp_image(
             project,
             workflow_spec=None,
             client_version=client_version,
@@ -601,6 +603,7 @@ def create_pipeline(
         content_type: str,
         data: bytes,
         arguments: typing.Optional[dict] = None,
+        auth_info: typing.Optional[mlrun.common.schemas.AuthInfo] = None,
     ):
         if arguments is None:
             arguments = {}
@@ -616,11 +619,21 @@ def create_pipeline(
         mlrun.utils.logger.debug(
             "Writing pipeline to temp file", content_type=content_type
         )
-        data = mlrun_pipelines.common.ops.replace_kfp_plaintext_secret_env_vars_with_secret_refs(
+
+        # TODO In ML-11600, pass the token name from the request
+        provided_token_name = None
+        # Workflows do not go through launcher/runtime handler
+        # So enrichment, validation and secret retrieval need to be done here
+        auth_secret_name = services.api.utils.helpers.resolve_auth_token_secret_name(
+            provided_token_name=provided_token_name, username=auth_info.username
+        )
+
+        data = mlrun_pipelines.common.ops.process_kfp_workflow_secret_references(
             byte_buffer=data,
             content_type=content_type,
             env_var_names=["MLRUN_AUTH_SESSION", "V3IO_ACCESS_KEY"],
             secrets_store=services.api.crud.Secrets(),
+            auth_secret_name=auth_secret_name,
         )
         pipeline_file = tempfile.NamedTemporaryFile(suffix=content_type)
         with open(pipeline_file.name, "wb") as fp:
diff --git a/server/py/services/api/launcher.py b/server/py/services/api/launcher.py
@@ -18,6 +18,7 @@
 
 from dependency_injector import containers, providers
 
+import mlrun.auth.utils
 import mlrun.common.constants as mlrun_constants
 import mlrun.common.runtimes.constants
 import mlrun.common.schemas.schedule
@@ -674,36 +675,24 @@ def _validate_retry(runtime_kind: str, retry: Optional["mlrun.model.Retry"]):
                     f"must be less than {staleness_threshold_seconds} seconds, got {max_delay} seconds"
                 )
 
-    # TODO In ML-11600, implement token name resolution and validation + tests
     def enrich_and_validate_auth_token_name(
         self, object: Union[mlrun.run.RunObject, mlrun.runtimes.RemoteRuntime]
     ):
-        if mlrun.mlconf.is_iguazio_v4_mode():
-            if object.spec.auth is None:
-                object.spec.auth = {}
-
-            # Get the provided token name, if any
-            provided_token_name = object.spec.auth.get("token_name")
-
-            # Resolve token name and raise error only if token is explicitly provided by the user
-            # in ML-11600, we will implement a proper resolution logic that checks all secret tokens
-            # of the user and finds a valid one if no token name is provided
-            raise_error_on_failure = bool(provided_token_name)
-            token_name = (
-                provided_token_name
-                or mlrun.common.constants.MLRUN_JOB_AUTH_DEFAULT_TOKEN_NAME
-            )
-            self._validate_token_name(
-                token_name, raise_error_on_failure=raise_error_on_failure
-            )
-
-            object.spec.auth["token_name"] = token_name
+        if object.spec.auth is None:
+            object.spec.auth = {}
+
+        # Get the provided token name, if any
+        provided_token_name = object.spec.auth.get("token_name")
+
+        # In ML-11600, we will implement a proper resolution logic that checks all secret tokens
+        # of the user and finds a valid one if no token name is provided
+        # If token name not provided, use default
+        token_name = (
+            provided_token_name
+            or mlrun.common.constants.MLRUN_RUNTIME_AUTH_DEFAULT_TOKEN_NAME
+        )
 
-    # TODO implement validation in ML-11600
-    def _validate_token_name(
-        self, token_name: str, raise_error_on_failure: bool = False
-    ):
-        pass
+        object.spec.auth["token_name"] = token_name
 
 
 # Once this file is imported it will set the container server side launcher
diff --git a/server/py/services/api/tests/unit/api/test_utils.py b/server/py/services/api/tests/unit/api/test_utils.py
@@ -34,6 +34,7 @@
 import mlrun.runtimes.mounts
 import mlrun.runtimes.pod
 import mlrun.utils
+from mlrun.common.types import AuthenticationMode
 from server.py.framework.api.utils import (
     _generate_function_and_task_from_submit_run_body,
 )
@@ -2043,3 +2044,57 @@ def test_setenv_from_the_project_secret(secret_name, expect_exception, kind):
         else:
             # Should not raise
             framework.api.utils.validate_function_env_vars(function)
+
+
+@pytest.mark.parametrize(
+    "provided_token, secret_name, expected_secret_name, expected_token_name",
+    [
+        # no token provided → default token name is looked up; secret exists
+        (
+            None,
+            "secret-1",
+            "secret-1",
+            mlrun.common.constants.MLRUN_RUNTIME_AUTH_DEFAULT_TOKEN_NAME,
+        ),
+        # explicit token name is looked up as-is; secret exists
+        ("custom-token", "secret-2", "secret-2", "custom-token"),
+        # no token provided → default token name is looked up; secret missing
+        (
+            None,
+            None,
+            None,
+            mlrun.common.constants.MLRUN_RUNTIME_AUTH_DEFAULT_TOKEN_NAME,
+        ),
+        # explicit token name is looked up as-is; secret missing
+        ("custom-token", None, None, "custom-token"),
+    ],
+)
+def test_resolve_auth_secret_name(
+    monkeypatch, provided_token, secret_name, expected_secret_name, expected_token_name
+):
+    mlrun.mlconf.httpdb.authentication.mode = AuthenticationMode.IGUAZIO_V4
+    # Stub the user-token secret the k8s helper returns; None models a missing secret
+    secret = None
+    if secret_name:
+        secret = unittest.mock.Mock()
+        secret.metadata.name = secret_name
+
+    k8s_helper = unittest.mock.Mock()
+    k8s_helper._get_user_token_secret.return_value = secret
+    # Make get_k8s_helper() hand back the stub while this test runs
+    monkeypatch.setattr(
+        "framework.utils.singletons.k8s.get_k8s_helper",
+        lambda: k8s_helper,
+    )
+
+    result = services.api.utils.helpers.resolve_auth_token_secret_name(
+        provided_token, "test-user"
+    )
+    # Expect the stubbed secret's name, or None when no secret was returned
+    assert result == expected_secret_name
+
+    # Verify the lookup used the expected token name (default or provided)
+    k8s_helper._get_user_token_secret.assert_called_once_with(
+        username="test-user",
+        token_name=expected_token_name,
+    )
diff --git a/server/py/services/api/tests/unit/test_launcher.py b/server/py/services/api/tests/unit/test_launcher.py
@@ -468,3 +468,30 @@ def test_launcher_skips_aborted_or_deleted_run(monkeypatch):
     # Validate result
     assert run.status.state == mlrun.common.runtimes.constants.RunStates.aborted
     assert not runtime_handler_mock.called
+
+
+@pytest.mark.parametrize(
+    "initial_auth, expected_token",
+    [
+        # spec.auth is None → default token name is filled in
+        (None, mlrun.common.constants.MLRUN_RUNTIME_AUTH_DEFAULT_TOKEN_NAME),
+        # spec.auth exists but has no token_name → default token name is filled in
+        ({}, mlrun.common.constants.MLRUN_RUNTIME_AUTH_DEFAULT_TOKEN_NAME),
+        # explicit token_name → preserved as provided
+        ({"token_name": "custom-token"}, "custom-token"),
+    ],
+)
+def test_enrich_and_validate_auth_token_name(
+    initial_auth,
+    expected_token,
+):
+    launcher = services.api.launcher.ServerSideLauncher(
+        auth_info=mlrun.common.schemas.AuthInfo()
+    )
+    run = mlrun.run.RunObject(
+        spec=mlrun.model.RunSpec(auth=initial_auth),
+    )
+    # Enrichment mutates the run in place; the result is read back from spec.auth
+    launcher.enrich_and_validate_auth_token_name(run)
+
+    assert run.spec.auth["token_name"] == expected_token
diff --git a/server/py/services/api/utils/helpers.py b/server/py/services/api/utils/helpers.py

Original file line number	Diff line number	Diff line change
`@@ -487,6 +487,7 @@ async def _create_pipeline(`
`487`	`487`	`content_type,`
`488`	`488`	`data,`
`489`	`489`	`arguments,`
	`490`	`+ auth_info,`
`490`	`491`	`)`
`491`	`492`
`492`	`493`	`return {`