Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions src/databricks/labs/ucx/framework/crawlers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import dataclasses
import logging
import os
import pkgutil
from abc import ABC, abstractmethod
from collections.abc import Iterator
from typing import ClassVar
Expand Down Expand Up @@ -244,3 +245,30 @@ def _snapshot(self, fetcher, loader) -> list[any]:
def _append_records(self, items):
logger.debug(f"[{self._full_name}] found {len(items)} new records for {self._table}")
self._backend.save_table(self._full_name, items, self._klass, mode="append")


class SchemaDeployer:
    """Deploys the UCX inventory database: schema, tables, and views.

    Tables are created from dataclass definitions via the SQL backend; views
    are loaded from SQL files packaged inside the given module and templated
    with the inventory schema name before execution.
    """

    def __init__(self, sql_backend: SqlBackend, inventory_schema: str, mod: object):
        # NOTE: the original annotated `mod` as `any`, which is the builtin
        # function, not a type; `object` (or `types.ModuleType`) is intended.
        self._sql_backend = sql_backend
        self._inventory_schema = inventory_schema
        # Package/module whose packaged data files hold the view SQL (see _load).
        self._module = mod

    def deploy_schema(self):
        """Create the inventory database in the Hive metastore if it is missing."""
        logger.info(f"Ensuring {self._inventory_schema} database exists")
        self._sql_backend.execute(f"CREATE SCHEMA IF NOT EXISTS hive_metastore.{self._inventory_schema}")

    def deploy_table(self, name: str, klass: type):
        """Create table `name` in the inventory schema from dataclass `klass` if missing."""
        logger.info(f"Ensuring {self._inventory_schema}.{name} table exists")
        self._sql_backend.create_table(f"hive_metastore.{self._inventory_schema}.{name}", klass)

    def deploy_view(self, name: str, relative_filename: str):
        """Create or replace view `name` from a SQL file shipped inside the package."""
        query = self._load(relative_filename)
        logger.info(f"Ensuring {self._inventory_schema}.{name} view matches {relative_filename} contents")
        ddl = f"CREATE OR REPLACE VIEW hive_metastore.{self._inventory_schema}.{name} AS {query}"
        self._sql_backend.execute(ddl)

    def _load(self, relative_filename: str) -> str:
        # Read the packaged SQL file and substitute the $inventory placeholder
        # with the fully-qualified schema name so the query targets this install.
        data = pkgutil.get_data(self._module.__name__, relative_filename)
        sql = data.decode("utf-8")
        sql = sql.replace("$inventory", f"hive_metastore.{self._inventory_schema}")
        return sql
5 changes: 4 additions & 1 deletion src/databricks/labs/ucx/framework/dashboards.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def _table_viz_args(
condensed=True,
with_row_number=False,
description: str | None = None,
search_by: str | None = None,
) -> dict:
return {
"type": "TABLE",
Expand All @@ -326,7 +327,9 @@ def _table_viz_args(
"condensed": condensed,
"withRowNumber": with_row_number,
"version": 2,
"columns": [VizColumn(name=x, title=x).as_dict() for x in columns.split(",")],
"columns": [
VizColumn(name=x, title=x, allowSearch=x == search_by).as_dict() for x in columns.split(",")
],
},
}

Expand Down
27 changes: 16 additions & 11 deletions src/databricks/labs/ucx/hive_metastore/grants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
from collections import defaultdict
from collections.abc import Iterator
from dataclasses import dataclass
from functools import partial

Expand Down Expand Up @@ -198,7 +197,7 @@ def _grants(
view: str | None = None,
any_file: bool = False,
anonymous_function: bool = False,
) -> Iterator[Grant]:
) -> list[Grant]:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great! I just found this as well in #477 - nice this is fixed.

"""
Fetches and yields grant information for the specified database objects.

Expand Down Expand Up @@ -239,23 +238,29 @@ def _grants(
anonymous_function=anonymous_function,
)
try:
grants = []
object_type_normalization = {"SCHEMA": "DATABASE", "CATALOG$": "CATALOG"}
for row in self._fetch(f"SHOW GRANTS ON {on_type} {key}"):
(principal, action_type, object_type, _) = row
if object_type in object_type_normalization:
object_type = object_type_normalization[object_type]
if on_type != object_type:
continue
yield Grant(
principal=principal,
action_type=action_type,
table=table,
view=view,
database=database,
catalog=catalog,
any_file=any_file,
anonymous_function=anonymous_function,
# we have to return concrete list, as with yield we're executing
# everything on the main thread.
grants.append(
Grant(
principal=principal,
action_type=action_type,
table=table,
view=view,
database=database,
catalog=catalog,
any_file=any_file,
anonymous_function=anonymous_function,
)
)
return grants
except Exception as e:
# TODO: https://github.com/databrickslabs/ucx/issues/406
logger.error(f"Couldn't fetch grants for object {on_type} {key}: {e}")
Expand Down
8 changes: 8 additions & 0 deletions src/databricks/labs/ucx/hive_metastore/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,14 @@ def sql_alter_from(self, catalog):
)


@dataclass
class TableError:
    """Failure record captured while crawling Hive metastore tables.

    Persisted to the `table_failures` inventory table (see install.py's
    deploy_schema, which registers this dataclass under that name).
    """

    catalog: str
    database: str
    # NOTE(review): annotated `str` but defaulting to None — effectively
    # `str | None`. Confirm the table-creation framework maps these to
    # nullable columns before tightening the annotations.
    name: str = None
    error: str = None


class TablesCrawler(CrawlerBase):
def __init__(self, backend: SqlBackend, schema):
"""
Expand Down
59 changes: 56 additions & 3 deletions src/databricks/labs/ucx/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,30 @@
from databricks.sdk.service.workspace import ImportFormat

from databricks.labs.ucx.__about__ import __version__
from databricks.labs.ucx.assessment.crawlers import (
AzureServicePrincipalInfo,
ClusterInfo,
GlobalInitScriptInfo,
JobInfo,
PipelineInfo,
)
from databricks.labs.ucx.config import GroupsConfig, WorkspaceConfig
from databricks.labs.ucx.framework.crawlers import (
SchemaDeployer,
SqlBackend,
StatementExecutionBackend,
)
from databricks.labs.ucx.framework.dashboards import DashboardFromFiles
from databricks.labs.ucx.framework.install_state import InstallState
from databricks.labs.ucx.framework.tasks import _TASKS, Task
from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocation
from databricks.labs.ucx.hive_metastore.grants import Grant
from databricks.labs.ucx.hive_metastore.hms_lineage import HiveMetastoreLineageEnabler
from databricks.labs.ucx.hive_metastore.mounts import Mount
from databricks.labs.ucx.hive_metastore.tables import Table, TableError
from databricks.labs.ucx.runtime import main
from databricks.labs.ucx.workspace_access.base import Permissions
from databricks.labs.ucx.workspace_access.generic import WorkspaceObjectInfo

TAG_STEP = "step"
TAG_APP = "App"
Expand Down Expand Up @@ -89,12 +107,36 @@
logger = logging.getLogger(__name__)


def deploy_schema(sql_backend: SqlBackend, inventory_schema: str):
    """Create the UCX inventory schema plus every inventory table and view.

    Table DDL comes from the registered dataclasses; view DDL comes from SQL
    files packaged under `databricks.labs.ucx`.
    """
    from databricks.labs import ucx

    deployer = SchemaDeployer(sql_backend, inventory_schema, ucx)
    deployer.deploy_schema()
    # Table name -> dataclass describing its columns; dict preserves
    # insertion order, so tables are created in the same order as before.
    inventory_tables = {
        "azure_service_principals": AzureServicePrincipalInfo,
        "clusters": ClusterInfo,
        "global_init_scripts": GlobalInitScriptInfo,
        "jobs": JobInfo,
        "pipelines": PipelineInfo,
        "external_locations": ExternalLocation,
        "mounts": Mount,
        "grants": Grant,
        "tables": Table,
        "table_failures": TableError,
        "workspace_objects": WorkspaceObjectInfo,
        "permissions": Permissions,
    }
    for table_name, klass in inventory_tables.items():
        deployer.deploy_table(table_name, klass)
    deployer.deploy_view("objects", "queries/views/objects.sql")
    deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql")


class WorkspaceInstaller:
def __init__(self, ws: WorkspaceClient, *, prefix: str = "ucx", promtps: bool = True):
def __init__(
self, ws: WorkspaceClient, *, prefix: str = "ucx", promtps: bool = True, sql_backend: SqlBackend = None
):
if "DATABRICKS_RUNTIME_VERSION" in os.environ:
msg = "WorkspaceInstaller is not supposed to be executed in Databricks Runtime"
raise SystemExit(msg)
self._ws = ws
self._sql_backend = sql_backend
self._prefix = prefix
self._prompts = promtps
self._this_file = Path(__file__)
Expand All @@ -111,10 +153,16 @@ def _run_configured(self):
self._install_spark_config_for_hms_lineage()
self._create_dashboards()
self._create_jobs()
self._create_database()
readme = f'{self.notebook_link(f"{self._install_folder}/README.py")}'
msg = f"Installation completed successfully! Please refer to the {readme} notebook for next steps."
logger.info(msg)

def _create_database(self):
    """Deploy the inventory schema, building a SQL backend if none was injected."""
    backend = self._sql_backend
    if backend is None:
        # Normal install path: no backend was passed to the constructor,
        # so execute statements against the configured SQL warehouse.
        backend = StatementExecutionBackend(self._ws, self._current_config.warehouse_id)
        self._sql_backend = backend
    deploy_schema(backend, self._current_config.inventory_database)

def _install_spark_config_for_hms_lineage(self):
hms_lineage = HiveMetastoreLineageEnabler(ws=self._ws)
logger.info(
Expand Down Expand Up @@ -157,10 +205,15 @@ def _install_spark_config_for_hms_lineage(self):

@staticmethod
def run_for_config(
ws: WorkspaceClient, config: WorkspaceConfig, *, prefix="ucx", override_clusters: dict[str, str] | None = None
ws: WorkspaceClient,
config: WorkspaceConfig,
*,
prefix="ucx",
override_clusters: dict[str, str] | None = None,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would we need this ? Looks like it's only useful for testing purposes, just override the cluster directly in the test instead ?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's for a faster devloop - see 4453b4c & c297074

sql_backend: SqlBackend = None,
) -> "WorkspaceInstaller":
logger.info(f"Installing UCX v{__version__} on {ws.config.host}")
workspace_installer = WorkspaceInstaller(ws, prefix=prefix, promtps=False)
workspace_installer = WorkspaceInstaller(ws, prefix=prefix, promtps=False, sql_backend=sql_backend)
logger.info(f"Installing UCX v{workspace_installer._version} on {ws.config.host}")
workspace_installer._config = config
workspace_installer._write_config()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- viz type=table, name=Azure Service Principals, columns=application_id,secret_scope,secret_key,tenant_id,storage_account
-- widget title=Azure Service Principals, col=0, row=49, size_x=6, size_y=8
-- widget title=Azure Service Principals, row=0, col=0, size_x=3, size_y=8
select
application_id,
if(secret_scope = '', "NA", secret_scope) secret_scope,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- viz type=counter, name=Workspace UC readiness, counter_label=UC readiness, value_column=readiness
-- widget row=0, col=0, size_x=1, size_y=3
-- Share of inventoried objects with no recorded failures, rendered as a
-- percentage string (e.g. "87.5%"). `failures` appears to hold a JSON-encoded
-- array, so '[]' marks a clean object — confirm against the objects view.
WITH raw AS (
  SELECT object_type, object_id, IF(failures == '[]', 1, 0) AS ready FROM $inventory.objects
)
SELECT CONCAT(ROUND(SUM(ready) / COUNT(*) * 100, 1), '%') AS readiness FROM raw
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=counter, name=Total Database Count, counter_label=Total Databases, value_column=count_total_databases
-- widget col=0, row=0, size_x=1, size_y=3
-- widget row=0, col=1, size_x=1, size_y=3
SELECT COUNT(DISTINCT `database`) AS count_total_databases
FROM $inventory.tables
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- viz type=counter, name=Storage Locations, counter_label=Storage Locations, value_column=count_total
-- widget row=0, col=4, size_x=1, size_y=3
-- Total number of rows in the external_locations inventory table.
SELECT count(*) AS count_total
FROM $inventory.external_locations
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=counter, name=Total Table Count, counter_label=Total Tables, value_column=count_total_tables
-- widget col=1, row=0, size_x=1, size_y=3
-- widget row=0, col=3, size_x=1, size_y=3
SELECT count(*) AS count_total_tables
FROM $inventory.tables
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- viz type=table, name=Object Type Readiness, columns=object_type,readiness
-- widget title=Readiness, row=1, col=0, size_x=1, size_y=8
-- Per-object-type readiness: percentage of objects of each type with an
-- empty failures list ('[]'). Sorted by the readiness string, so ordering
-- is lexicographic on the formatted percentage, not numeric.
WITH raw AS (
  SELECT object_type, object_id, IF(failures == '[]', 1, 0) AS ready FROM $inventory.objects
)
SELECT object_type, CONCAT(ROUND(SUM(ready) / COUNT(*) * 100, 1), '%') AS readiness
FROM raw
GROUP BY object_type
ORDER BY readiness DESC
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- viz type=table, name=Failure Summary, search_by=failure, columns=failure,count
-- widget title=Failure Summary, row=1, col=1, size_x=2, size_y=8
-- Flattens every failure message across all objects (failures is a
-- JSON-encoded array<string>) and counts occurrences of each distinct message.
WITH raw AS (
  SELECT EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure FROM $inventory.objects WHERE failures <> '[]'
)
SELECT failure, COUNT(*) AS count FROM raw GROUP BY failure
ORDER BY count DESC, failure DESC
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
-- viz type=table, name=Table Types, columns=database,name,type,format,table_view,storage,is_delta,location
-- widget title=Table Types, col=0, row=3, size_x=6, size_y=6
SELECT `database`,
name,
-- viz type=table, name=Table Types, search_by=name, columns=name,type,format,storage,is_delta,location
-- widget title=Table Types, row=1, col=3, size_x=3, size_y=8
SELECT CONCAT(`database`, '.', name) AS name,
object_type AS type,
UPPER(table_format) AS format,
IF(object_type IN ("MANAGED", "EXTERNAL"), "TABLE", "VIEW") AS table_view,
CASE
WHEN STARTSWITH(location, "dbfs:/mnt") THEN "DBFS MOUNT"
WHEN STARTSWITH(location, "/dbfs/mnt") THEN "DBFS MOUNT"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
-- viz type=table, name=Database Summary, search_by=database, columns=database,upgrade,tables,views,dbfs_root,delta_tables,total_grants,granted_principals,database_grants,table_grants,service_principal_grants,user_grants,group_grants
-- widget title=Database Summary, col=0, row=2, size_x=6, size_y=8
-- One row per database: table/view composition, a recommended upgrade
-- strategy, and grant statistics joined from the grant_detail view.
WITH table_stats AS (
  SELECT
    `database`,
    object_type,
    UPPER(table_format) AS `format`,
    `location`,
    IF(object_type IN ("MANAGED", "EXTERNAL"), 1, 0) AS is_table,
    IF(object_type = "VIEW", 1, 0) AS is_view,
    -- DBFS root = dbfs paths that are not mount points.
    CASE
      WHEN STARTSWITH(location, "dbfs:/") AND NOT STARTSWITH(location, "dbfs:/mnt") THEN 1
      WHEN STARTSWITH(location, "/dbfs/") AND NOT STARTSWITH(location, "/dbfs/mnt") THEN 1
      ELSE 0
    END AS is_dbfs_root,
    -- wasb/adl URI schemes are treated as unsupported storage.
    CASE
      WHEN STARTSWITH(location, "wasb") THEN 1
      WHEN STARTSWITH(location, "adl") THEN 1
      ELSE 0
    END AS is_unsupported,
    -- NOTE(review): `format` here refers to the alias defined two lines up
    -- (lateral column alias) — confirm the target runtime supports this.
    IF(format = "DELTA", 1, 0) AS is_delta
  FROM $inventory.tables
), database_stats AS (
  SELECT `database`,
    -- First matching branch wins; thresholds (.3 / .7) are heuristics.
    CASE
      WHEN SUM(is_table) = 0 AND SUM(is_view) > 0 THEN "View Migration Required"
      WHEN SUM(is_dbfs_root)/SUM(is_table) > .3 THEN "Asset Replication Required"
      WHEN SUM(is_delta)/SUM(is_table) < .7 THEN "Some Non Delta Assets"
      WHEN SUM(is_unsupported)/SUM(is_table) > .7 THEN "Storage Migration Required"
      ELSE "In Place Sync"
    END AS upgrade,
    SUM(is_table) AS tables,
    SUM(is_view) AS views,
    SUM(is_unsupported) AS unsupported,
    SUM(is_dbfs_root) AS dbfs_root,
    SUM(is_delta) AS delta_tables
  FROM table_stats
  GROUP BY `database`
), grant_stats AS (
  SELECT
    `database`,
    COUNT(*) AS total_grants,
    COUNT(DISTINCT principal) AS granted_principals,
    SUM(IF(object_type == 'DATABASE', 1, 0)) AS database_grants,
    SUM(IF(object_type == 'TABLE', 1, 0)) AS table_grants,
    SUM(IF(principal_type == 'service-principal', 1, 0)) AS service_principal_grants,
    SUM(IF(principal_type == 'user', 1, 0)) AS user_grants,
    SUM(IF(principal_type == 'group', 1, 0)) AS group_grants
  FROM $inventory.grant_detail
  GROUP BY `database`
)
-- FULL JOIN keeps databases that have only grants or only tables.
SELECT * FROM database_stats FULL JOIN grant_stats USING (`database`)
ORDER BY tables DESC
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=table, name=External Locations, columns=location
-- widget title=External Locations, col=0, row=17, size_x=3, size_y=8
-- widget title=External Locations, row=3, col=0, size_x=3, size_y=8
SELECT location
FROM $inventory.external_locations
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- viz type=table, name=Mount Points, columns=name,source
-- widget title=Mount Points, col=3, row=17, size_x=3, size_y=8
-- widget title=Mount Points, row=3, col=3, size_x=3, size_y=8
SELECT name,
source
FROM $inventory.mounts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- viz type=table, name=Clusters, columns=failure,cluster_name,cluster_id,creator
-- widget title=Incompatible clusters, row=4, col=0, size_x=3, size_y=8
-- One row per (cluster, failure) pair: failures is flattened from a
-- JSON-encoded array<string>. Job clusters ("job-" prefix) are excluded.
SELECT EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure,
       cluster_id,
       cluster_name,
       creator
FROM $inventory.clusters
WHERE NOT STARTSWITH(cluster_name, "job-")
ORDER BY cluster_id DESC
10 changes: 10 additions & 0 deletions src/databricks/labs/ucx/queries/assessment/main/04_3_jobs.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- viz type=table, name=Jobs, columns=failure,job_id,job_name,creator
-- widget title=Incompatible jobs, row=4, col=3, size_x=3, size_y=8
-- One row per (job, failure) pair; failures is flattened from a JSON-encoded
-- array<string>. UCX's own workflows (names starting with "[UCX]") are excluded.
SELECT
    EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure,
    job_id,
    job_name,
    creator
FROM $inventory.jobs
WHERE job_name not like '[UCX]%'
ORDER BY job_id DESC
Loading