Skip to content

Commit 4ed9593

Browse files
authored
[MLOS-459] Support enriched evalmetric event submission (#7503)
* Add reasoning, assessment and metadata * more guards * nit * fix syntax * some unit tests * more tests * fix lint * undefined * address comments * partial revert * revert metadata * pass / fail * fix test * fix test * json * token * doh * fix message * fix doc * fixes * fix
1 parent 70b9dba commit 4ed9593

File tree

3 files changed

+141
-6
lines changed

3 files changed

+141
-6
lines changed

index.d.ts

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3243,13 +3243,13 @@ declare namespace tracer {
32433243
/**
32443244
* The type of evaluation metric, one of 'categorical', 'score', 'boolean', or 'json'
32453245
*/
3246-
metricType: 'categorical' | 'score' | 'boolean',
3246+
metricType: 'categorical' | 'score' | 'boolean' | 'json',
32473247

32483248
/**
32493249
* The value of the evaluation metric.
3250-
* Must be string for 'categorical' metrics, number for 'score' metrics, and boolean for 'boolean' metrics.
3250+
* Must be a string for 'categorical' metrics, a number for 'score' metrics, a boolean for 'boolean' metrics, and a JSON object for 'json' metrics.
32513251
*/
3252-
value: string | number | boolean,
3252+
value: string | number | boolean | { [key: string]: any },
32533253

32543254
/**
32553255
* An object of string key-value pairs to tag the evaluation metric with.
@@ -3265,6 +3265,21 @@ declare namespace tracer {
32653265
* The timestamp in milliseconds when the evaluation metric result was generated.
32663266
*/
32673267
timestampMs?: number
3268+
3269+
/**
3270+
* Reasoning for the evaluation result.
3271+
*/
3272+
reasoning?: string,
3273+
3274+
/**
3275+
* Whether the evaluation passed or failed. Valid values are 'pass' and 'fail'.
3276+
*/
3277+
assessment?: 'pass' | 'fail',
3278+
3279+
/**
3280+
* Arbitrary JSON data associated with the evaluation.
3281+
*/
3282+
metadata?: { [key: string]: any }
32683283
}
32693284

32703285
interface Document {

packages/dd-trace/src/llmobs/sdk.js

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -359,15 +359,15 @@ class LLMObs extends NoopLLMObs {
359359
throw new Error('timestampMs must be a non-negative integer. Evaluation metric data will not be sent')
360360
}
361361

362-
const { label, value, tags } = options
362+
const { label, value, tags, reasoning, assessment, metadata } = options
363363
const metricType = options.metricType?.toLowerCase()
364364
if (!label) {
365365
err = 'invalid_metric_label'
366366
throw new Error('label must be the specified name of the evaluation metric')
367367
}
368-
if (!metricType || !['categorical', 'score', 'boolean'].includes(metricType)) {
368+
if (!metricType || !['categorical', 'score', 'boolean', 'json'].includes(metricType)) {
369369
err = 'invalid_metric_type'
370-
throw new Error('metricType must be one of "categorical" or "score"')
370+
throw new Error('metricType must be one of "categorical", "score", "boolean" or "json"')
371371
}
372372
if (metricType === 'categorical' && typeof value !== 'string') {
373373
err = 'invalid_metric_value'
@@ -381,6 +381,22 @@ class LLMObs extends NoopLLMObs {
381381
err = 'invalid_metric_value'
382382
throw new Error('value must be a boolean for a boolean metric')
383383
}
384+
if (metricType === 'json' && !(typeof value === 'object' && value != null && !Array.isArray(value))) {
385+
err = 'invalid_metric_value'
386+
throw new Error('value must be a JSON object for a json metric')
387+
}
388+
if (assessment != null && assessment !== 'pass' && assessment !== 'fail') {
389+
err = 'invalid_assessment'
390+
throw new Error('assessment must be pass or fail')
391+
}
392+
if (reasoning != null && typeof reasoning !== 'string') {
393+
err = 'invalid_reasoning'
394+
throw new Error('reasoning must be a string')
395+
}
396+
if (metadata != null && (typeof metadata !== 'object' || Array.isArray(metadata))) {
397+
err = 'invalid_metadata'
398+
throw new Error('metadata must be a JSON object')
399+
}
384400

385401
const evaluationTags = {
386402
'ddtrace.version': tracerVersion,
@@ -425,6 +441,15 @@ class LLMObs extends NoopLLMObs {
425441
timestamp_ms: timestampMs,
426442
tags: Object.entries(evaluationTags).map(([key, value]) => `${key}:${value}`),
427443
}
444+
if (reasoning != null) {
445+
payload.reasoning = reasoning
446+
}
447+
if (metadata != null) {
448+
payload.metadata = metadata
449+
}
450+
if (assessment != null) {
451+
payload.assessment = assessment
452+
}
428453
const currentStore = storage.getStore()
429454
const routing = currentStore?.routingContext
430455
evalMetricAppendCh.publish({ payload, routing })

packages/dd-trace/test/llmobs/sdk/index.spec.js

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1268,6 +1268,101 @@ describe('sdk', () => {
12681268
}), { message: 'value must be a boolean for a boolean metric' })
12691269
})
12701270

1271+
it('submits a json evaluation metric', () => {
1272+
llmobs.submitEvaluation(spanCtx, {
1273+
label: 'has_toxicity',
1274+
metricType: 'json',
1275+
value: { f1: 0.8, recall: 1, precision: 0.5 },
1276+
timestampMs: 1234,
1277+
})
1278+
1279+
const evalMetric = LLMObsEvalMetricsWriter.prototype.append.getCall(0).args[0]
1280+
1281+
assert.deepStrictEqual(evalMetric, {
1282+
join_on: {
1283+
span: {
1284+
span_id: '5678',
1285+
trace_id: '1234',
1286+
},
1287+
},
1288+
label: 'has_toxicity',
1289+
metric_type: 'json',
1290+
ml_app: 'mlApp',
1291+
json_value: { f1: 0.8, recall: 1, precision: 0.5 },
1292+
timestamp_ms: 1234,
1293+
tags: [`ddtrace.version:${tracerVersion}`, 'ml_app:mlApp'],
1294+
})
1295+
})
1296+
1297+
it('throws an error when submitting a non-JSON object json evaluation metric', () => {
1298+
assert.throws(() => llmobs.submitEvaluation(spanCtx, {
1299+
label: 'has_toxicity',
1300+
metricType: 'json',
1301+
value: 'it is super toxic!',
1302+
}), { message: 'value must be a JSON object for a json metric' })
1303+
})
1304+
1305+
it('submits an enriched evaluation metric', () => {
1306+
llmobs.submitEvaluation(spanCtx, {
1307+
mlApp: 'test',
1308+
timestampMs: 1234,
1309+
label: 'toxic',
1310+
metricType: 'score',
1311+
value: 0.6,
1312+
reasoning: 'this input is toxic',
1313+
assessment: 'fail',
1314+
metadata: { some: 'details' },
1315+
tags: {
1316+
host: 'localhost',
1317+
},
1318+
})
1319+
1320+
assert.deepStrictEqual(LLMObsEvalMetricsWriter.prototype.append.getCall(0).args[0], {
1321+
join_on: {
1322+
span: {
1323+
span_id: spanCtx.spanId,
1324+
trace_id: spanCtx.traceId,
1325+
},
1326+
},
1327+
ml_app: 'test',
1328+
timestamp_ms: 1234,
1329+
label: 'toxic',
1330+
metric_type: 'score',
1331+
score_value: 0.6,
1332+
tags: [`ddtrace.version:${tracerVersion}`, 'ml_app:test', 'host:localhost'],
1333+
reasoning: 'this input is toxic',
1334+
assessment: 'fail',
1335+
metadata: { some: 'details' },
1336+
})
1337+
})
1338+
1339+
it('throws an error when submitting a non-string reasoning', () => {
1340+
assert.throws(() => llmobs.submitEvaluation(spanCtx, {
1341+
label: 'has_toxicity',
1342+
metricType: 'boolean',
1343+
value: true,
1344+
reasoning: 1,
1345+
}), { message: 'reasoning must be a string' })
1346+
})
1347+
1348+
it('throws an error when submitting a non pass/fail assessment', () => {
1349+
assert.throws(() => llmobs.submitEvaluation(spanCtx, {
1350+
label: 'has_toxicity',
1351+
metricType: 'boolean',
1352+
value: true,
1353+
assessment: 'correct',
1354+
}), { message: 'assessment must be pass or fail' })
1355+
})
1356+
1357+
it('throws an error when submitting an non JSON object metadata', () => {
1358+
assert.throws(() => llmobs.submitEvaluation(spanCtx, {
1359+
label: 'has_toxicity',
1360+
metricType: 'boolean',
1361+
value: true,
1362+
metadata: 'some metadata',
1363+
}), { message: 'metadata must be a JSON object' })
1364+
})
1365+
12711366
describe('with DD_TRACE_OTEL_ENABLED set', () => {
12721367
before(() => {
12731368
process.env.DD_TRACE_OTEL_ENABLED = 'true'

0 commit comments

Comments
 (0)