Skip to content

Commit 5828b80

Browse files
alxtkr77 (Alex Toker) and gtopper
authored
[Model Monitoring] Fix TimescaleDB endpoint count to query correct tables (ML-11624) (#9164)
## Summary Fixes bug where Model Monitoring App shows "Endpoints: 0" when using TimescaleDB but correctly shows endpoints with V3IO. **Root cause**: `count_processed_model_endpoints` was querying the PREDICTIONS table, which doesn't have an `application_name` column, instead of METRICS and APP_RESULTS tables which have this column. ## Changes Made - Move `count_processed_model_endpoints` from `TimescaleDBPredictionsQueries` to `TimescaleDBConnector` - Use SQL UNION to efficiently count endpoints from both METRICS and APP_RESULTS tables in a single query - Store `_tables` and `_pre_aggregate_manager` as instance variables in connector for cross-table operations - Remove unused `_count_with_application_join` and `_count_simple` methods - Update tests to use `connector` fixture directly ## Testing - All 14 TimescaleDB aggregation tests pass - Tested endpoint counting with: - No data (returns empty dict) - Data in METRICS only - Data in APP_RESULTS only - Data in BOTH tables (UNION behavior - counts unique endpoints) - Time range filtering ## Checklist - [x] Code formatted with `ruff format` - [x] Code passes `ruff check` linting - [x] Unit tests pass - [x] No secrets in diff ## Reference - Jira: [ML-11624](https://iguazio.atlassian.net/browse/ML-11624) [ML-11624]: https://iguazio.atlassian.net/browse/ML-11624?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ --------- Co-authored-by: Alex Toker <[email protected]> Co-authored-by: Gal Topper <[email protected]>
1 parent 67ec873 commit 5828b80

File tree

3 files changed

+193
-213
lines changed

3 files changed

+193
-213
lines changed

mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py

Lines changed: 0 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,6 @@
2424
import mlrun.model_monitoring.db.tsdb.timescaledb.timescaledb_schema as timescaledb_schema
2525
import mlrun.utils
2626
from mlrun.common.schemas.model_monitoring.model_endpoints import _MetricPoint
27-
from mlrun.model_monitoring.db.tsdb.timescaledb.timescaledb_connection import (
28-
Statement,
29-
)
3027
from mlrun.model_monitoring.db.tsdb.timescaledb.utils.timescaledb_dataframe_processor import (
3128
TimescaleDBDataFrameProcessor,
3229
)
@@ -377,113 +374,3 @@ def build_raw_query():
377374
column_mapping_rules=column_mapping_rules,
378375
debug_name="avg_latency",
379376
)
380-
381-
def count_processed_model_endpoints(
382-
self,
383-
start: Optional[datetime] = None,
384-
end: Optional[datetime] = None,
385-
application_names: Optional[Union[str, list[str]]] = None,
386-
) -> dict[str, int]:
387-
"""
388-
Optimized count with application filtering using JOIN approach.
389-
390-
This implementation:
391-
1. Uses JOIN when application filtering is needed (most performant)
392-
2. Falls back to simple query when no filtering (fastest for that case)
393-
3. Leverages TimescaleDB's chunk exclusion and parallel processing
394-
4. Can utilize pre-aggregates when available
395-
"""
396-
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
397-
start, end = self._pre_aggregate_manager.get_start_end(start, end)
398-
399-
predictions_table = self.tables[mm_schemas.TimescaleDBTables.PREDICTIONS]
400-
401-
if application_names:
402-
# Ensure application_names is a list
403-
if isinstance(application_names, str):
404-
application_names = [application_names]
405-
406-
result = {}
407-
408-
# For each application, call the existing JOIN method and wrap result in dict
409-
for app_name in application_names:
410-
# Use existing _count_with_application_join but extract count for single app
411-
count = self._count_with_application_join(
412-
predictions_table,
413-
start,
414-
end,
415-
[app_name], # Pass as list to existing method
416-
)
417-
result[app_name] = count
418-
419-
return result
420-
else:
421-
# Use existing simple count method and wrap result
422-
total_count = self._count_simple(predictions_table, start, end)
423-
return {"total": total_count} if total_count > 0 else {}
424-
425-
def _count_with_application_join(
426-
self,
427-
predictions_table,
428-
start: datetime,
429-
end: datetime,
430-
application_names: Union[str, list[str]],
431-
) -> int:
432-
"""
433-
Use JOIN with metrics table for application filtering.
434-
435-
Performance characteristics:
436-
- Leverages indexes on both tables
437-
- TimescaleDB optimizes time-based JOINs
438-
- Chunk exclusion works on both sides
439-
- DISTINCT applied after filtering
440-
"""
441-
metrics_table = self.tables[mm_schemas.TimescaleDBTables.METRICS]
442-
443-
# Normalize application_names to list for consistent handling
444-
if isinstance(application_names, str):
445-
app_names_list = [application_names]
446-
else:
447-
app_names_list = list(application_names)
448-
449-
# Build parameterized query with proper placeholders
450-
app_placeholders = ", ".join(["%s"] * len(app_names_list))
451-
452-
query_sql = f"""
453-
SELECT COUNT(DISTINCT p.{mm_schemas.WriterEvent.ENDPOINT_ID}) AS endpoint_count
454-
FROM {predictions_table.full_name()} p
455-
INNER JOIN {metrics_table.full_name()} m
456-
ON p.{mm_schemas.WriterEvent.ENDPOINT_ID} = m.{mm_schemas.WriterEvent.ENDPOINT_ID}
457-
AND m.{metrics_table.time_column} >= %s
458-
AND m.{metrics_table.time_column} <= %s
459-
WHERE p.{predictions_table.time_column} >= %s
460-
AND p.{predictions_table.time_column} <= %s
461-
AND m.{mm_schemas.WriterEvent.APPLICATION_NAME} IN ({app_placeholders})
462-
"""
463-
464-
# Parameters: [start, end, start, end] + application_names_list
465-
params = [start, end, start, end] + app_names_list
466-
467-
stmt = Statement(query_sql, params)
468-
result = self._connection.run(query=stmt)
469-
470-
return result.data[0][0] if result and result.data else 0
471-
472-
def _count_simple(self, predictions_table, start: datetime, end: datetime) -> int:
473-
"""
474-
Simple count without application filtering.
475-
476-
Uses the schema's query builder for consistency and potential pre-aggregate usage.
477-
"""
478-
columns = [
479-
f"COUNT(DISTINCT {mm_schemas.WriterEvent.ENDPOINT_ID}) AS endpoint_count"
480-
]
481-
482-
query = predictions_table._get_records_query(
483-
start=start,
484-
end=end,
485-
columns_to_filter=columns,
486-
)
487-
488-
result = self._connection.run(query=query)
489-
return result.data[0][0] if result and result.data else 0

mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py

Lines changed: 78 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
TimescaleDBResultsQueries,
3939
)
4040
from mlrun.model_monitoring.db.tsdb.timescaledb.timescaledb_connection import (
41+
Statement,
4142
TimescaleDBConnection,
4243
)
4344
from mlrun.model_monitoring.db.tsdb.timescaledb.timescaledb_operations import (
@@ -109,27 +110,27 @@ def __init__(
109110
)
110111

111112
# Create shared components needed by query classes
112-
tables = timescaledb_schema.create_table_schemas(project)
113-
pre_aggregate_manager = PreAggregateManager(pre_aggregate_config)
113+
self._tables = timescaledb_schema.create_table_schemas(project)
114+
self._pre_aggregate_manager = PreAggregateManager(pre_aggregate_config)
114115

115116
# Create specialized query handlers with proper initialization
116117
self._metrics_queries = TimescaleDBMetricsQueries(
117118
project=project,
118119
connection=self._connection,
119-
pre_aggregate_manager=pre_aggregate_manager,
120-
tables=tables,
120+
pre_aggregate_manager=self._pre_aggregate_manager,
121+
tables=self._tables,
121122
)
122123
self._predictions_queries = TimescaleDBPredictionsQueries(
123124
project=project,
124125
connection=self._connection,
125-
pre_aggregate_manager=pre_aggregate_manager,
126-
tables=tables,
126+
pre_aggregate_manager=self._pre_aggregate_manager,
127+
tables=self._tables,
127128
)
128129
self._results_queries = TimescaleDBResultsQueries(
129130
connection=self._connection,
130131
project=project,
131-
pre_aggregate_manager=pre_aggregate_manager,
132-
tables=tables,
132+
pre_aggregate_manager=self._pre_aggregate_manager,
133+
tables=self._tables,
133134
)
134135

135136
# Create operations and stream handlers
@@ -396,10 +397,75 @@ def get_last_request(self, *args, **kwargs):
396397
def get_avg_latency(self, *args, **kwargs):
397398
return self._predictions_queries.get_avg_latency(*args, **kwargs)
398399

399-
def count_processed_model_endpoints(self, *args, **kwargs):
400-
return self._predictions_queries.count_processed_model_endpoints(
401-
*args, **kwargs
402-
)
400+
def count_processed_model_endpoints(
401+
self,
402+
start: Optional[datetime.datetime] = None,
403+
end: Optional[datetime.datetime] = None,
404+
application_names: Optional[list[str] | str] = None,
405+
) -> dict[str, int]:
406+
"""
407+
Count unique endpoints per application from METRICS and APP_RESULTS tables.
408+
409+
Uses SQL UNION to efficiently count endpoints that have data in EITHER table.
410+
411+
:param start: Start time for the query (default: last 24 hours)
412+
:param end: End time for the query (default: current time)
413+
:param application_names: Filter by specific application names
414+
:return: Dictionary mapping application_name to endpoint count
415+
"""
416+
# Set default time range
417+
start = start or (mlrun.utils.datetime_now() - datetime.timedelta(hours=24))
418+
start, end = self._pre_aggregate_manager.get_start_end(start, end)
419+
420+
metrics_table = self._tables[mm_schemas.TimescaleDBTables.METRICS]
421+
app_results_table = self._tables[mm_schemas.TimescaleDBTables.APP_RESULTS]
422+
time_column = mm_schemas.WriterEvent.END_INFER_TIME
423+
app_column = mm_schemas.WriterEvent.APPLICATION_NAME
424+
endpoint_column = mm_schemas.WriterEvent.ENDPOINT_ID
425+
426+
# Build application filter and params
427+
app_filter_metrics = ""
428+
app_filter_results = ""
429+
430+
if application_names:
431+
if isinstance(application_names, str):
432+
application_names = [application_names]
433+
app_names_list = list(application_names)
434+
app_placeholders = ", ".join(["%s"] * len(app_names_list))
435+
app_filter_metrics = f"AND {app_column} IN ({app_placeholders})"
436+
app_filter_results = f"AND {app_column} IN ({app_placeholders})"
437+
# Params: metrics (start, end, apps), app_results (start, end, apps)
438+
params = [start, end] + app_names_list + [start, end] + app_names_list
439+
else:
440+
params = [start, end, start, end]
441+
442+
# Use UNION to combine endpoints from both METRICS and APP_RESULTS tables
443+
query_sql = f"""
444+
SELECT {app_column}, COUNT(DISTINCT {endpoint_column}) as endpoint_count
445+
FROM (
446+
SELECT DISTINCT {app_column}, {endpoint_column}
447+
FROM {metrics_table.full_name()}
448+
WHERE {time_column} >= %s AND {time_column} <= %s
449+
{app_filter_metrics}
450+
451+
UNION
452+
453+
SELECT DISTINCT {app_column}, {endpoint_column}
454+
FROM {app_results_table.full_name()}
455+
WHERE {time_column} >= %s AND {time_column} <= %s
456+
{app_filter_results}
457+
) combined
458+
GROUP BY {app_column}
459+
"""
460+
461+
stmt = Statement(query_sql, params)
462+
result = self._connection.run(query=stmt)
463+
464+
if not result or not result.data:
465+
return {}
466+
467+
# Convert result to dict: {application_name: count}
468+
return {row[0]: row[1] for row in result.data}
403469

404470
def get_drift_status(self, *args, **kwargs):
405471
return self._results_queries.get_drift_status(*args, **kwargs)

0 commit comments

Comments (0)