Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions src/databricks/labs/ucx/framework/crawlers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import dataclasses
import logging
import os
import pkgutil
from abc import ABC, abstractmethod
from collections.abc import Iterator
from typing import ClassVar
Expand Down Expand Up @@ -244,3 +245,30 @@ def _snapshot(self, fetcher, loader) -> list[any]:
def _append_records(self, items):
logger.debug(f"[{self._full_name}] found {len(items)} new records for {self._table}")
self._backend.save_table(self._full_name, items, self._klass, mode="append")


class SchemaDeployer:
    """Deploys the UCX inventory database: schema, tables, and views.

    Tables are created from dataclass definitions via the SQL backend; views
    are loaded from SQL files packaged inside the given module and templated
    with the inventory schema name before execution.
    """

    def __init__(self, sql_backend: SqlBackend, inventory_schema: str, mod: object):
        # NOTE: the original annotated `mod` as `any`, which is the builtin
        # function, not a type; `object` (or `types.ModuleType`) is intended.
        self._sql_backend = sql_backend
        self._inventory_schema = inventory_schema
        # Package/module whose packaged data files hold the view SQL (see _load).
        self._module = mod

    def deploy_schema(self):
        """Create the inventory database in the Hive metastore if it is missing."""
        logger.info(f"Ensuring {self._inventory_schema} database exists")
        self._sql_backend.execute(f"CREATE SCHEMA IF NOT EXISTS hive_metastore.{self._inventory_schema}")

    def deploy_table(self, name: str, klass: type):
        """Create table `name` in the inventory schema from dataclass `klass` if missing."""
        logger.info(f"Ensuring {self._inventory_schema}.{name} table exists")
        self._sql_backend.create_table(f"hive_metastore.{self._inventory_schema}.{name}", klass)

    def deploy_view(self, name: str, relative_filename: str):
        """Create or replace view `name` from a SQL file shipped inside the package."""
        query = self._load(relative_filename)
        logger.info(f"Ensuring {self._inventory_schema}.{name} view matches {relative_filename} contents")
        ddl = f"CREATE OR REPLACE VIEW hive_metastore.{self._inventory_schema}.{name} AS {query}"
        self._sql_backend.execute(ddl)

    def _load(self, relative_filename: str) -> str:
        # Read the packaged SQL file and substitute the $inventory placeholder
        # with the fully-qualified schema name so the query targets this install.
        data = pkgutil.get_data(self._module.__name__, relative_filename)
        sql = data.decode("utf-8")
        sql = sql.replace("$inventory", f"hive_metastore.{self._inventory_schema}")
        return sql
5 changes: 4 additions & 1 deletion src/databricks/labs/ucx/framework/dashboards.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def _table_viz_args(
condensed=True,
with_row_number=False,
description: str | None = None,
search_by: str | None = None,
) -> dict:
return {
"type": "TABLE",
Expand All @@ -326,7 +327,9 @@ def _table_viz_args(
"condensed": condensed,
"withRowNumber": with_row_number,
"version": 2,
"columns": [VizColumn(name=x, title=x).as_dict() for x in columns.split(",")],
"columns": [
VizColumn(name=x, title=x, allowSearch=x == search_by).as_dict() for x in columns.split(",")
],
},
}

Expand Down
27 changes: 16 additions & 11 deletions src/databricks/labs/ucx/hive_metastore/grants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
from collections import defaultdict
from collections.abc import Iterator
from dataclasses import dataclass
from functools import partial

Expand Down Expand Up @@ -198,7 +197,7 @@ def _grants(
view: str | None = None,
any_file: bool = False,
anonymous_function: bool = False,
) -> Iterator[Grant]:
) -> list[Grant]:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great! I just found this as well in #477 - nice this is fixed.

"""
Fetches and yields grant information for the specified database objects.

Expand Down Expand Up @@ -239,23 +238,29 @@ def _grants(
anonymous_function=anonymous_function,
)
try:
grants = []
object_type_normalization = {"SCHEMA": "DATABASE", "CATALOG$": "CATALOG"}
for row in self._fetch(f"SHOW GRANTS ON {on_type} {key}"):
(principal, action_type, object_type, _) = row
if object_type in object_type_normalization:
object_type = object_type_normalization[object_type]
if on_type != object_type:
continue
yield Grant(
principal=principal,
action_type=action_type,
table=table,
view=view,
database=database,
catalog=catalog,
any_file=any_file,
anonymous_function=anonymous_function,
# we have to return concrete list, as with yield we're executing
# everything on the main thread.
grants.append(
Grant(
principal=principal,
action_type=action_type,
table=table,
view=view,
database=database,
catalog=catalog,
any_file=any_file,
anonymous_function=anonymous_function,
)
)
return grants
except Exception as e:
# TODO: https://github.com/databrickslabs/ucx/issues/406
logger.error(f"Couldn't fetch grants for object {on_type} {key}: {e}")
Expand Down
8 changes: 8 additions & 0 deletions src/databricks/labs/ucx/hive_metastore/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,14 @@ def sql_alter_from(self, catalog):
)


@dataclass
class TableError:
    """Failure record captured while crawling Hive metastore tables.

    Persisted to the `table_failures` inventory table (see install.py's
    deploy_schema, which registers this dataclass under that name).
    """

    catalog: str
    database: str
    # NOTE(review): annotated `str` but defaulting to None — effectively
    # `str | None`. Confirm the table-creation framework maps these to
    # nullable columns before tightening the annotations.
    name: str = None
    error: str = None


class TablesCrawler(CrawlerBase):
def __init__(self, backend: SqlBackend, schema):
"""
Expand Down
59 changes: 56 additions & 3 deletions src/databricks/labs/ucx/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,30 @@
from databricks.sdk.service.workspace import ImportFormat

from databricks.labs.ucx.__about__ import __version__
from databricks.labs.ucx.assessment.crawlers import (
AzureServicePrincipalInfo,
ClusterInfo,
GlobalInitScriptInfo,
JobInfo,
PipelineInfo,
)
from databricks.labs.ucx.config import GroupsConfig, WorkspaceConfig
from databricks.labs.ucx.framework.crawlers import (
SchemaDeployer,
SqlBackend,
StatementExecutionBackend,
)
from databricks.labs.ucx.framework.dashboards import DashboardFromFiles
from databricks.labs.ucx.framework.install_state import InstallState
from databricks.labs.ucx.framework.tasks import _TASKS, Task
from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocation
from databricks.labs.ucx.hive_metastore.grants import Grant
from databricks.labs.ucx.hive_metastore.hms_lineage import HiveMetastoreLineageEnabler
from databricks.labs.ucx.hive_metastore.mounts import Mount
from databricks.labs.ucx.hive_metastore.tables import Table, TableError
from databricks.labs.ucx.runtime import main
from databricks.labs.ucx.workspace_access.base import Permissions
from databricks.labs.ucx.workspace_access.generic import WorkspaceObjectInfo

TAG_STEP = "step"
TAG_APP = "App"
Expand Down Expand Up @@ -89,12 +107,36 @@
logger = logging.getLogger(__name__)


def deploy_schema(sql_backend: SqlBackend, inventory_schema: str):
    """Create the UCX inventory schema plus every inventory table and view.

    Table DDL comes from the registered dataclasses; view DDL comes from SQL
    files packaged under `databricks.labs.ucx`.
    """
    from databricks.labs import ucx

    deployer = SchemaDeployer(sql_backend, inventory_schema, ucx)
    deployer.deploy_schema()
    # Table name -> dataclass describing its columns; dict preserves
    # insertion order, so tables are created in the same order as before.
    inventory_tables = {
        "azure_service_principals": AzureServicePrincipalInfo,
        "clusters": ClusterInfo,
        "global_init_scripts": GlobalInitScriptInfo,
        "jobs": JobInfo,
        "pipelines": PipelineInfo,
        "external_locations": ExternalLocation,
        "mounts": Mount,
        "grants": Grant,
        "tables": Table,
        "table_failures": TableError,
        "workspace_objects": WorkspaceObjectInfo,
        "permissions": Permissions,
    }
    for table_name, klass in inventory_tables.items():
        deployer.deploy_table(table_name, klass)
    deployer.deploy_view("objects", "queries/views/objects.sql")
    deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql")


class WorkspaceInstaller:
def __init__(self, ws: WorkspaceClient, *, prefix: str = "ucx", promtps: bool = True):
def __init__(
self, ws: WorkspaceClient, *, prefix: str = "ucx", promtps: bool = True, sql_backend: SqlBackend = None
):
if "DATABRICKS_RUNTIME_VERSION" in os.environ:
msg = "WorkspaceInstaller is not supposed to be executed in Databricks Runtime"
raise SystemExit(msg)
self._ws = ws
self._sql_backend = sql_backend
self._prefix = prefix
self._prompts = promtps
self._this_file = Path(__file__)
Expand All @@ -111,10 +153,16 @@ def _run_configured(self):
self._install_spark_config_for_hms_lineage()
self._create_dashboards()
self._create_jobs()
self._create_database()
readme = f'{self.notebook_link(f"{self._install_folder}/README.py")}'
msg = f"Installation completed successfully! Please refer to the {readme} notebook for next steps."
logger.info(msg)

def _create_database(self):
    """Deploy the inventory schema, building a SQL backend if none was injected."""
    backend = self._sql_backend
    if backend is None:
        # Normal install path: no backend was passed to the constructor,
        # so execute statements against the configured SQL warehouse.
        backend = StatementExecutionBackend(self._ws, self._current_config.warehouse_id)
        self._sql_backend = backend
    deploy_schema(backend, self._current_config.inventory_database)

def _install_spark_config_for_hms_lineage(self):
hms_lineage = HiveMetastoreLineageEnabler(ws=self._ws)
logger.info(
Expand Down Expand Up @@ -157,10 +205,15 @@ def _install_spark_config_for_hms_lineage(self):

@staticmethod
def run_for_config(
ws: WorkspaceClient, config: WorkspaceConfig, *, prefix="ucx", override_clusters: dict[str, str] | None = None
ws: WorkspaceClient,
config: WorkspaceConfig,
*,
prefix="ucx",
override_clusters: dict[str, str] | None = None,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would we need this ? Looks like it's only useful for testing purposes, just override the cluster directly in the test instead ?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's for a faster devloop - see 4453b4c & c297074

sql_backend: SqlBackend = None,
) -> "WorkspaceInstaller":
logger.info(f"Installing UCX v{__version__} on {ws.config.host}")
workspace_installer = WorkspaceInstaller(ws, prefix=prefix, promtps=False)
workspace_installer = WorkspaceInstaller(ws, prefix=prefix, promtps=False, sql_backend=sql_backend)
logger.info(f"Installing UCX v{workspace_installer._version} on {ws.config.host}")
workspace_installer._config = config
workspace_installer._write_config()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- viz type=table, name=Azure Service Principals, columns=application_id,secret_scope,secret_key,tenant_id,storage_account
-- widget title=Azure Service Principals, col=0, row=49, size_x=6, size_y=8
-- widget title=Azure Service Principals, row=0, col=0, size_x=3, size_y=8
select
application_id,
if(secret_scope = '', "NA", secret_scope) secret_scope,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- viz type=counter, name=Workspace UC readiness, counter_label=UC readiness, value_column=readiness
-- widget row=0, col=0, size_x=1, size_y=3
-- Share of inventoried objects with no recorded failures, rendered as a
-- percentage string (e.g. "87.5%"). `failures` appears to hold a JSON-encoded
-- array, so '[]' marks a clean object — confirm against the objects view.
WITH raw AS (
  SELECT object_type, object_id, IF(failures == '[]', 1, 0) AS ready FROM $inventory.objects
)
SELECT CONCAT(ROUND(SUM(ready) / COUNT(*) * 100, 1), '%') AS readiness FROM raw
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=counter, name=Total Database Count, counter_label=Total Databases, value_column=count_total_databases
-- widget col=0, row=0, size_x=1, size_y=3
-- widget row=0, col=1, size_x=1, size_y=3
SELECT COUNT(DISTINCT `database`) AS count_total_databases
FROM $inventory.tables
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- viz type=counter, name=Storage Locations, counter_label=Storage Locations, value_column=count_total
-- widget row=0, col=4, size_x=1, size_y=3
-- Total number of rows in the external_locations inventory table.
SELECT count(*) AS count_total
FROM $inventory.external_locations
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=counter, name=Total Table Count, counter_label=Total Tables, value_column=count_total_tables
-- widget col=1, row=0, size_x=1, size_y=3
-- widget row=0, col=3, size_x=1, size_y=3
SELECT count(*) AS count_total_tables
FROM $inventory.tables
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- viz type=table, name=Object Type Readiness, columns=object_type,readiness
-- widget title=Readiness, row=1, col=0, size_x=1, size_y=8
-- Per-object-type readiness: percentage of objects of each type with an
-- empty failures list ('[]'). Sorted by the readiness string, so ordering
-- is lexicographic on the formatted percentage, not numeric.
WITH raw AS (
  SELECT object_type, object_id, IF(failures == '[]', 1, 0) AS ready FROM $inventory.objects
)
SELECT object_type, CONCAT(ROUND(SUM(ready) / COUNT(*) * 100, 1), '%') AS readiness
FROM raw
GROUP BY object_type
ORDER BY readiness DESC
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- viz type=table, name=Failure Summary, search_by=failure, columns=failure,count
-- widget title=Failure Summary, row=1, col=1, size_x=2, size_y=8
-- Flattens every failure message across all objects (failures is a
-- JSON-encoded array<string>) and counts occurrences of each distinct message.
WITH raw AS (
  SELECT EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure FROM $inventory.objects WHERE failures <> '[]'
)
SELECT failure, COUNT(*) AS count FROM raw GROUP BY failure
ORDER BY count DESC, failure DESC
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
-- viz type=table, name=Table Types, columns=database,name,type,format,table_view,storage,is_delta,location
-- widget title=Table Types, col=0, row=3, size_x=6, size_y=6
SELECT `database`,
name,
-- viz type=table, name=Table Types, search_by=name, columns=name,type,format,storage,is_delta,location
-- widget title=Table Types, row=1, col=3, size_x=3, size_y=8
SELECT CONCAT(`database`, '.', name) AS name,
object_type AS type,
UPPER(table_format) AS format,
IF(object_type IN ("MANAGED", "EXTERNAL"), "TABLE", "VIEW") AS table_view,
CASE
WHEN STARTSWITH(location, "dbfs:/mnt") THEN "DBFS MOUNT"
WHEN STARTSWITH(location, "/dbfs/mnt") THEN "DBFS MOUNT"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
-- viz type=table, name=Database Summary, search_by=database, columns=database,upgrade,tables,views,dbfs_root,delta_tables,total_grants,granted_principals,database_grants,table_grants,service_principal_grants,user_grants,group_grants
-- widget title=Database Summary, col=0, row=2, size_x=6, size_y=8
-- One row per database: table/view composition, a recommended upgrade
-- strategy, and grant statistics joined from the grant_detail view.
WITH table_stats AS (
  SELECT
    `database`,
    object_type,
    UPPER(table_format) AS `format`,
    `location`,
    IF(object_type IN ("MANAGED", "EXTERNAL"), 1, 0) AS is_table,
    IF(object_type = "VIEW", 1, 0) AS is_view,
    -- DBFS root = dbfs paths that are not mount points.
    CASE
      WHEN STARTSWITH(location, "dbfs:/") AND NOT STARTSWITH(location, "dbfs:/mnt") THEN 1
      WHEN STARTSWITH(location, "/dbfs/") AND NOT STARTSWITH(location, "/dbfs/mnt") THEN 1
      ELSE 0
    END AS is_dbfs_root,
    -- wasb/adl URI schemes are treated as unsupported storage.
    CASE
      WHEN STARTSWITH(location, "wasb") THEN 1
      WHEN STARTSWITH(location, "adl") THEN 1
      ELSE 0
    END AS is_unsupported,
    -- NOTE(review): `format` here refers to the alias defined two lines up
    -- (lateral column alias) — confirm the target runtime supports this.
    IF(format = "DELTA", 1, 0) AS is_delta
  FROM $inventory.tables
), database_stats AS (
  SELECT `database`,
    -- First matching branch wins; thresholds (.3 / .7) are heuristics.
    CASE
      WHEN SUM(is_table) = 0 AND SUM(is_view) > 0 THEN "View Migration Required"
      WHEN SUM(is_dbfs_root)/SUM(is_table) > .3 THEN "Asset Replication Required"
      WHEN SUM(is_delta)/SUM(is_table) < .7 THEN "Some Non Delta Assets"
      WHEN SUM(is_unsupported)/SUM(is_table) > .7 THEN "Storage Migration Required"
      ELSE "In Place Sync"
    END AS upgrade,
    SUM(is_table) AS tables,
    SUM(is_view) AS views,
    SUM(is_unsupported) AS unsupported,
    SUM(is_dbfs_root) AS dbfs_root,
    SUM(is_delta) AS delta_tables
  FROM table_stats
  GROUP BY `database`
), grant_stats AS (
  SELECT
    `database`,
    COUNT(*) AS total_grants,
    COUNT(DISTINCT principal) AS granted_principals,
    SUM(IF(object_type == 'DATABASE', 1, 0)) AS database_grants,
    SUM(IF(object_type == 'TABLE', 1, 0)) AS table_grants,
    SUM(IF(principal_type == 'service-principal', 1, 0)) AS service_principal_grants,
    SUM(IF(principal_type == 'user', 1, 0)) AS user_grants,
    SUM(IF(principal_type == 'group', 1, 0)) AS group_grants
  FROM $inventory.grant_detail
  GROUP BY `database`
)
-- FULL JOIN keeps databases that have only grants or only tables.
SELECT * FROM database_stats FULL JOIN grant_stats USING (`database`)
ORDER BY tables DESC
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- viz type=table, name=External Locations, columns=location
-- widget title=External Locations, col=0, row=17, size_x=3, size_y=8
-- widget title=External Locations, row=3, col=0, size_x=3, size_y=8
SELECT location
FROM $inventory.external_locations
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- viz type=table, name=Mount Points, columns=name,source
-- widget title=Mount Points, col=3, row=17, size_x=3, size_y=8
-- widget title=Mount Points, row=3, col=3, size_x=3, size_y=8
SELECT name,
source
FROM $inventory.mounts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- viz type=table, name=Clusters, columns=failure,cluster_name,cluster_id,creator
-- widget title=Incompatible clusters, row=4, col=0, size_x=3, size_y=8
-- One row per (cluster, failure) pair: failures is flattened from a
-- JSON-encoded array<string>. Job clusters ("job-" prefix) are excluded.
SELECT EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure,
       cluster_id,
       cluster_name,
       creator
FROM $inventory.clusters
WHERE NOT STARTSWITH(cluster_name, "job-")
ORDER BY cluster_id DESC
10 changes: 10 additions & 0 deletions src/databricks/labs/ucx/queries/assessment/main/04_3_jobs.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- viz type=table, name=Jobs, columns=failure,job_id,job_name,creator
-- widget title=Incompatible jobs, row=4, col=3, size_x=3, size_y=8
-- One row per (job, failure) pair; failures is flattened from a JSON-encoded
-- array<string>. UCX's own workflows (names starting with "[UCX]") are excluded.
SELECT
    EXPLODE(FROM_JSON(failures, 'array<string>')) AS failure,
    job_id,
    job_name,
    creator
FROM $inventory.jobs
WHERE job_name not like '[UCX]%'
ORDER BY job_id DESC
Loading