Skip to content

Timed out after retries #383

@mwojtyczka

Description

@mwojtyczka

The tool failed while creating backup groups, even though retries are in place. The retry loop kept re-attempting a non-retryable failure ("Group ... already exists") until the 20-minute timeout expired:

`TimeoutError: Timed out after 0:20:00

DatabricksError Traceback (most recent call last)
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/sdk/retries.py:29, in retried.&lt;locals&gt;.decorator.&lt;locals&gt;.wrapper(*args, **kwargs)
28 try:
---> 29 return func(*args, **kwargs)
30 except Exception as err:

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/mixins/hardening.py:57, in rate_limited.&lt;locals&gt;.decorator.&lt;locals&gt;.wrapper(*args, **kwargs)
56 rate_limiter.throttle()
---> 57 return func(*args, **kwargs)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/workspace_access/groups.py:106, in GroupManager._get_or_create_backup_group(self, source_group_name, source_group)
105 logger.info(f"Creating backup group {backup_group_name}")
--> 106 backup_group = self._ws.groups.create(
107 display_name=backup_group_name,
108 meta=source_group.meta,
109 entitlements=source_group.entitlements,
110 roles=source_group.roles,
111 members=source_group.members,
112 )
113 self._workspace_groups.append(backup_group)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/sdk/service/iam.py:1716, in GroupsAPI.create(self, display_name, entitlements, external_id, groups, id, members, meta, roles)
1715 headers = {'Accept': 'application/json', 'Content-Type': 'application/json', }
-> 1716 res = self._api.do('POST', '/api/2.0/preview/scim/v2/Groups', body=body, headers=headers)
1717 return Group.from_dict(res)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/sdk/core.py:1061, in ApiClient.do(self, method, path, query, headers, body, raw, files, data)
1059 retryable = retried(timeout=timedelta(seconds=self._retry_timeout_seconds),
1060 is_retryable=self._is_retryable)
-> 1061 return retryable(self._perform)(method,
1062 path,
1063 query=query,
1064 headers=headers,
1065 body=body,
1066 raw=raw,
1067 files=files,
1068 data=data)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/sdk/retries.py:47, in retried.&lt;locals&gt;.decorator.&lt;locals&gt;.wrapper(*args, **kwargs)
45 if retry_reason is None:
46 # raise if exception is not retryable
---> 47 raise err
49 logger.debug(f'Retrying: {retry_reason} (sleeping ~{sleep}s)')

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/sdk/retries.py:29, in retried.&lt;locals&gt;.decorator.&lt;locals&gt;.wrapper(*args, **kwargs)
28 try:
---> 29 return func(*args, **kwargs)
30 except Exception as err:

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/sdk/core.py:1150, in ApiClient._perform(self, method, path, query, headers, body, raw, files, data)
1149 payload = response.json()
-> 1150 raise self._make_nicer_error(response=response, **payload) from None
1151 if raw:

DatabricksError: None Group with name db-temp-idm2bcd_dssi03prod_crin13_read already exists.

The above exception was the direct cause of the following exception:

TimeoutError Traceback (most recent call last)
File ~/.ipykernel/1030/command--1-1764370551:18
15 entry = [ep for ep in metadata.distribution("databricks_labs_ucx").entry_points if ep.name == "runtime"]
16 if entry:
17 # Load and execute the entrypoint, assumes no parameters
---> 18 entry[0].load()()
19 else:
20 import databricks_labs_ucx

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/runtime.py:215, in main()
214 def main():
--> 215 trigger(*sys.argv)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/framework/tasks.py:93, in trigger(*argv)
90 cfg = WorkspaceConfig.from_file(Path(args["config"]))
91 logging.getLogger("databricks").setLevel(cfg.log_level)
---> 93 current_task.fn(cfg)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/runtime.py:192, in migrate_permissions(cfg)
168 """As we embark on the complex journey of migrating from Hive Metastore to the Databricks Unity Catalog,
169 a crucial phase in this transition involves the careful management of permissions.
170 This intricate process entails several key steps: first, applying permissions to designated backup groups;
(...)
189
190 See interactive tutorial here."""
191 toolkit = GroupMigrationToolkit(cfg)
--> 192 toolkit.prepare_environment()
193 if toolkit.has_groups():
194 toolkit.apply_permissions_to_backup_groups()

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/workspace_access/migration.py:121, in GroupMigrationToolkit.prepare_environment(self)
120 def prepare_environment(self):
--> 121 self._group_manager.prepare_groups_in_environment()

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/workspace_access/groups.py:221, in GroupManager.prepare_groups_in_environment(self)
218 valid_group_names = list(ws_group_names.intersection(ac_group_names))
219 logger.info(f"Found {len(valid_group_names)} workspace groups that have corresponding account groups")
--> 221 self._set_migration_groups(valid_group_names)
222 logger.info("Environment prepared successfully")

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/workspace_access/groups.py:127, in GroupManager._set_migration_groups(self, groups_names)
124 backup_group = self._get_or_create_backup_group(source_group_name=name, source_group=ws_group)
125 return MigrationGroupInfo(workspace=ws_group, backup=backup_group, account=acc_group)
--> 127 collected_groups = ThreadedExecution.gather(
128 "get group info", [partial(get_group_info, group_name) for group_name in groups_names]
129 )
130 for g in collected_groups:
131 self._migration_state.add(g)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/framework/parallel.py:48, in ThreadedExecution.gather(cls, name, tasks)
45 @classmethod
46 def gather(cls, name: str, tasks: list[ExecutableFunction]) -> list[ExecutableResult]:
47 reporter = ProgressReporter(len(tasks), f"{name}: ")
---> 48 return cls(tasks, num_threads=4, progress_reporter=reporter).run()

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/framework/parallel.py:63, in ThreadedExecution.run(self)
60 results = concurrent.futures.wait(self._futures, return_when=ALL_COMPLETED)
62 logger.debug("Collecting the results from threaded execution")
---> 63 collected = [future.result() for future in results.done]
64 return collected

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/framework/parallel.py:63, in &lt;listcomp&gt;(.0)
60 results = concurrent.futures.wait(self._futures, return_when=ALL_COMPLETED)
62 logger.debug("Collecting the results from threaded execution")
---> 63 collected = [future.result() for future in results.done]
64 return collected

File /usr/lib/python3.10/concurrent/futures/_base.py:451, in Future.result(self, timeout)
449 raise CancelledError()
450 elif self._state == FINISHED:
--> 451 return self.__get_result()
453 self._condition.wait(timeout)
455 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:

File /usr/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
406 self = None

File /usr/lib/python3.10/concurrent/futures/thread.py:58, in _WorkItem.run(self)
55 return
57 try:
---> 58 result = self.fn(*self.args, **self.kwargs)
59 except BaseException as exc:
60 self.future.set_exception(exc)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/labs/ucx/workspace_access/groups.py:124, in GroupManager._set_migration_groups.&lt;locals&gt;.get_group_info(name)
122 acc_group = self._get_group(name, "account")
123 assert acc_group, f"Group {name} not found on the account level"
--> 124 backup_group = self._get_or_create_backup_group(source_group_name=name, source_group=ws_group)
125 return MigrationGroupInfo(workspace=ws_group, backup=backup_group, account=acc_group)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/databricks/sdk/retries.py:52, in retried.&lt;locals&gt;.decorator.&lt;locals&gt;.wrapper(*args, **kwargs)
50 time.sleep(sleep + random())
51 attempt += 1
---> 52 raise TimeoutError(f'Timed out after {timeout}') from last_err

TimeoutError: Timed out after 0:20:00`

Last log entries (two separate entries, run together in the original paste):
INFO [d.l.ucx.workspace_access.groups] Backup group db-temp-idm2bcd_dssi03prod_zme4_dev successfully created
22:58 INFO [d.l.ucx.framework.parallel] get group info: 1084/1084, rps: 0.050/sec

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

Status

No status

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions