[MLOS-459] Support enriched evalmetric event submission (#7503)

gsvigruha · web-flow · commit 4ed95938bd60 · 2026-02-13T11:32:40.000-05:00
* Add reasoning, assessment and metadata

* more guards

* nit

* fix syntax

* some unit tests

* more tests

* fix lint

* undefined

* address comments

* partial revert

* revert metadata

* pass / fail

* fix test

* fix test

* json

* token

* doh

* fix message

* fix doc

* fixes

* fix
diff --git a/index.d.ts b/index.d.ts
@@ -3243,13 +3243,13 @@ declare namespace tracer {
       /**
-       * The type of evaluation metric, one of 'categorical', 'score', or 'boolean'
+       * The type of evaluation metric, one of 'categorical', 'score', 'boolean', or 'json'
        */
-      metricType: 'categorical' | 'score' | 'boolean',
+      metricType: 'categorical' | 'score' | 'boolean' | 'json',
 
       /**
        * The value of the evaluation metric.
-       * Must be string for 'categorical' metrics, number for 'score' metrics, and boolean for 'boolean' metrics.
+       * Must be string for 'categorical' metrics, number for 'score' metrics, boolean for 'boolean' metrics and a JSON object for 'json' metrics.
        */
-      value: string | number | boolean,
+      value: string | number | boolean | { [key: string]: any },
 
       /**
        * An object of string key-value pairs to tag the evaluation metric with.
@@ -3265,6 +3265,21 @@ declare namespace tracer {
        * The timestamp in milliseconds when the evaluation metric result was generated.
        */
       timestampMs?: number
+
+      /**
+       * Reasoning for the evaluation result.
+       */
+      reasoning?: string,
+
+      /**
+       * Whether the evaluation passed or failed. Valid values are 'pass' and 'fail'.
+       */
+      assessment?: 'pass' | 'fail',
+
+      /**
+       * Arbitrary JSON data associated with the evaluation.
+       */
+      metadata?: { [key: string]: any }
     }
 
     interface Document {
diff --git a/packages/dd-trace/src/llmobs/sdk.js b/packages/dd-trace/src/llmobs/sdk.js
@@ -359,15 +359,15 @@ class LLMObs extends NoopLLMObs {
         throw new Error('timestampMs must be a non-negative integer. Evaluation metric data will not be sent')
       }
 
-      const { label, value, tags } = options
+      const { label, value, tags, reasoning, assessment, metadata } = options
       const metricType = options.metricType?.toLowerCase()
       if (!label) {
         err = 'invalid_metric_label'
         throw new Error('label must be the specified name of the evaluation metric')
       }
-      if (!metricType || !['categorical', 'score', 'boolean'].includes(metricType)) {
+      if (!metricType || !['categorical', 'score', 'boolean', 'json'].includes(metricType)) {
         err = 'invalid_metric_type'
-        throw new Error('metricType must be one of "categorical" or "score"')
+        throw new Error('metricType must be one of "categorical", "score", "boolean" or "json"')
       }
       if (metricType === 'categorical' && typeof value !== 'string') {
         err = 'invalid_metric_value'
@@ -381,6 +381,22 @@ class LLMObs extends NoopLLMObs {
         err = 'invalid_metric_value'
         throw new Error('value must be a boolean for a boolean metric')
       }
+      if (metricType === 'json' && !(typeof value === 'object' && value != null && !Array.isArray(value))) {
+        err = 'invalid_metric_value'
+        throw new Error('value must be a JSON object for a json metric')
+      }
+      if (assessment != null && assessment !== 'pass' && assessment !== 'fail') {
+        err = 'invalid_assessment'
+        throw new Error('assessment must be pass or fail')
+      }
+      if (reasoning != null && typeof reasoning !== 'string') {
+        err = 'invalid_reasoning'
+        throw new Error('reasoning must be a string')
+      }
+      if (metadata != null && (typeof metadata !== 'object' || Array.isArray(metadata))) {
+        err = 'invalid_metadata'
+        throw new Error('metadata must be a JSON object')
+      }
 
       const evaluationTags = {
         'ddtrace.version': tracerVersion,
@@ -425,6 +441,15 @@ class LLMObs extends NoopLLMObs {
         timestamp_ms: timestampMs,
         tags: Object.entries(evaluationTags).map(([key, value]) => `${key}:${value}`),
       }
+      if (reasoning != null) {
+        payload.reasoning = reasoning
+      }
+      if (metadata != null) {
+        payload.metadata = metadata
+      }
+      if (assessment != null) {
+        payload.assessment = assessment
+      }
       const currentStore = storage.getStore()
       const routing = currentStore?.routingContext
       evalMetricAppendCh.publish({ payload, routing })
diff --git a/packages/dd-trace/test/llmobs/sdk/index.spec.js b/packages/dd-trace/test/llmobs/sdk/index.spec.js
@@ -1268,6 +1268,101 @@ describe('sdk', () => {
       }), { message: 'value must be a boolean for a boolean metric' })
     })
 
+    it('submits a json evaluation metric', () => {
+      llmobs.submitEvaluation(spanCtx, {
+        label: 'has_toxicity',
+        metricType: 'json',
+        value: { f1: 0.8, recall: 1, precision: 0.5 },
+        timestampMs: 1234,
+      })
+
+      const evalMetric = LLMObsEvalMetricsWriter.prototype.append.getCall(0).args[0]
+
+      assert.deepStrictEqual(evalMetric, {
+        join_on: {
+          span: {
+            span_id: '5678',
+            trace_id: '1234',
+          },
+        },
+        label: 'has_toxicity',
+        metric_type: 'json',
+        ml_app: 'mlApp',
+        json_value: { f1: 0.8, recall: 1, precision: 0.5 },
+        timestamp_ms: 1234,
+        tags: [`ddtrace.version:${tracerVersion}`, 'ml_app:mlApp'],
+      })
+    })
+
+    it('throws an error when submitting a non-JSON object json evaluation metric', () => {
+      assert.throws(() => llmobs.submitEvaluation(spanCtx, {
+        label: 'has_toxicity',
+        metricType: 'json',
+        value: 'it is super toxic!',
+      }), { message: 'value must be a JSON object for a json metric' })
+    })
+
+    it('submits an enriched evaluation metric', () => {
+      llmobs.submitEvaluation(spanCtx, {
+        mlApp: 'test',
+        timestampMs: 1234,
+        label: 'toxic',
+        metricType: 'score',
+        value: 0.6,
+        reasoning: 'this input is toxic',
+        assessment: 'fail',
+        metadata: { some: 'details' },
+        tags: {
+          host: 'localhost',
+        },
+      })
+
+      assert.deepStrictEqual(LLMObsEvalMetricsWriter.prototype.append.getCall(0).args[0], {
+        join_on: {
+          span: {
+            span_id: spanCtx.spanId,
+            trace_id: spanCtx.traceId,
+          },
+        },
+        ml_app: 'test',
+        timestamp_ms: 1234,
+        label: 'toxic',
+        metric_type: 'score',
+        score_value: 0.6,
+        tags: [`ddtrace.version:${tracerVersion}`, 'ml_app:test', 'host:localhost'],
+        reasoning: 'this input is toxic',
+        assessment: 'fail',
+        metadata: { some: 'details' },
+      })
+    })
+
+    it('throws an error when submitting a non-string reasoning', () => {
+      assert.throws(() => llmobs.submitEvaluation(spanCtx, {
+        label: 'has_toxicity',
+        metricType: 'boolean',
+        value: true,
+        reasoning: 1,
+      }), { message: 'reasoning must be a string' })
+    })
+
+    it('throws an error when submitting a non pass/fail assessment', () => {
+      assert.throws(() => llmobs.submitEvaluation(spanCtx, {
+        label: 'has_toxicity',
+        metricType: 'boolean',
+        value: true,
+        assessment: 'correct',
+      }), { message: 'assessment must be pass or fail' })
+    })
+
+    it('throws an error when submitting a non-JSON object metadata', () => {
+      assert.throws(() => llmobs.submitEvaluation(spanCtx, {
+        label: 'has_toxicity',
+        metricType: 'boolean',
+        value: true,
+        metadata: 'some metadata',
+      }), { message: 'metadata must be a JSON object' })
+    })
+
     describe('with DD_TRACE_OTEL_ENABLED set', () => {
       before(() => {
         process.env.DD_TRACE_OTEL_ENABLED = 'true'