Skip to content

Commit d07fee9

Browse files
authored
Added tests for llmobs (#9395)
1 parent 962f2c5 commit d07fee9

2 files changed

Lines changed: 296 additions & 2 deletions

File tree

dd-trace-api/src/main/java/datadog/trace/api/llmobs/LLMObs.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ public static LLMObsSpan startWorkflowSpan(
4646
return SPAN_FACTORY.startWorkflowSpan(spanName, mlApp, sessionId);
4747
}
4848

49-
public LLMObsSpan startEmbeddingSpan(
49+
public static LLMObsSpan startEmbeddingSpan(
5050
String spanName,
5151
@Nullable String mlApp,
5252
@Nullable String modelProvider,
@@ -55,7 +55,7 @@ public LLMObsSpan startEmbeddingSpan(
5555
return SPAN_FACTORY.startEmbeddingSpan(spanName, mlApp, modelProvider, modelName, sessionId);
5656
}
5757

58-
public LLMObsSpan startRetrievalSpan(
58+
public static LLMObsSpan startRetrievalSpan(
5959
String spanName, @Nullable String mlApp, @Nullable String sessionId) {
6060
return SPAN_FACTORY.startRetrievalSpan(spanName, mlApp, sessionId);
6161
}
Lines changed: 294 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
package datadog.trace.api.llmobs
2+
3+
import datadog.trace.api.llmobs.noop.NoOpLLMObsSpan
4+
import datadog.trace.api.llmobs.noop.NoOpLLMObsSpanFactory
5+
import datadog.trace.api.llmobs.noop.NoOpLLMObsEvalProcessor
6+
import datadog.trace.test.util.DDSpecification
7+
import spock.lang.Shared
8+
import java.lang.reflect.Field
9+
10+
class LLMObsTest extends DDSpecification {
11+
12+
// Value read from LLMObs.SPAN_FACTORY in setupSpec(); written back in cleanupSpec()
@Shared def originalSpanFactory
// Value read from LLMObs.EVAL_PROCESSOR in setupSpec(); written back in cleanupSpec()
@Shared def originalEvalProcessor
16+
17+
// Runs once before the first feature: snapshots the two static LLMObs fields this
// spec overwrites so cleanupSpec() can put them back.
def setupSpec() {
  originalEvalProcessor = getStaticField("EVAL_PROCESSOR")
  originalSpanFactory = getStaticField("SPAN_FACTORY")
}
22+
23+
// Runs once after the last feature: writes the values captured in setupSpec()
// back into the static LLMObs fields.
def cleanupSpec() {
  setStaticField("EVAL_PROCESSOR", originalEvalProcessor)
  setStaticField("SPAN_FACTORY", originalSpanFactory)
}
28+
29+
// Runs after every feature: reinstalls the NoOp singletons so a mock installed by
// one feature is not seen by the next one.
def cleanup() {
  setStaticField("EVAL_PROCESSOR", NoOpLLMObsEvalProcessor.INSTANCE)
  setStaticField("SPAN_FACTORY", NoOpLLMObsSpanFactory.INSTANCE)
}
34+
35+
// Reflectively assigns `value` to the static field of LLMObs named `fieldName`.
// The field is made accessible first, so non-public fields can be written.
private static void setStaticField(String fieldName, Object value) {
  Field target = LLMObs.class.getDeclaredField(fieldName)
  target.setAccessible(true)
  // null receiver: the field is static, so no instance is involved
  target.set(null, value)
}
40+
41+
// Reflectively reads the static field of LLMObs named `fieldName` and returns its
// current value. The field is made accessible first, so non-public fields can be read.
private static Object getStaticField(String fieldName) {
  Field source = LLMObs.class.getDeclaredField(fieldName)
  source.setAccessible(true)
  // null receiver: the field is static, so no instance is involved
  def current = source.get(null)
  return current
}
46+
47+
// ToolCall.from(...) must expose each of its four inputs through the matching property.
def "test ToolCall creation and getters"() {
  given:
  def expectedArgs = [location: "New York", unit: "celsius"]

  when:
  def call = LLMObs.ToolCall.from("get_weather", "function", "tool-123", expectedArgs)

  then:
  with(call) {
    name == "get_weather"
    type == "function"
    toolId == "tool-123"
    arguments == expectedArgs
  }
}
60+
61+
// A null arguments map is accepted by ToolCall.from(...) and read back as null.
def "test ToolCall with null arguments"() {
  when:
  def call = LLMObs.ToolCall.from("get_weather", "function", "tool-123", null)

  then:
  with(call) {
    name == "get_weather"
    type == "function"
    toolId == "tool-123"
    arguments == null
  }
}
71+
72+
// LLMMessage.from(role, content, toolCalls) must keep the role, the content and the
// single tool call it was given.
def "test LLMMessage creation with toolCalls"() {
  given:
  def weatherCall = LLMObs.ToolCall.from("get_weather", "function", "tool-123", [location: "Paris"])
  def expectedCalls = [weatherCall]

  when:
  def msg = LLMObs.LLMMessage.from("assistant", "Let me check the weather", expectedCalls)

  then:
  msg.role == "assistant"
  msg.content == "Let me check the weather"
  msg.toolCalls == expectedCalls
  msg.toolCalls.size() == 1

  and:
  // the only element still carries the values it was built from
  with(msg.toolCalls[0]) {
    name == "get_weather"
    type == "function"
    toolId == "tool-123"
    arguments == [location: "Paris"]
  }
}
90+
91+
// The two-argument LLMMessage.from(role, content) overload yields a message whose
// toolCalls property is null.
def "test LLMMessage creation without toolCalls"() {
  when:
  def msg = LLMObs.LLMMessage.from("user", "What's the weather like?")

  then:
  with(msg) {
    role == "user"
    content == "What's the weather like?"
    toolCalls == null
  }
}
100+
101+
// A message built with two tool calls must expose both, in the order supplied.
def "test LLMMessage with multiple toolCalls"() {
  given:
  def weatherCall = LLMObs.ToolCall.from("get_weather", "function", "tool-1", [location: "New York"])
  def stockCall = LLMObs.ToolCall.from("get_stock_price", "function", "tool-2", [symbol: "AAPL"])
  def expectedCalls = [weatherCall, stockCall]

  when:
  def msg = LLMObs.LLMMessage.from("assistant", "I'll help you with both requests", expectedCalls)

  then:
  msg.role == "assistant"
  msg.content == "I'll help you with both requests"

  and:
  msg.toolCalls == expectedCalls
  msg.toolCalls.size() == 2
  msg.toolCalls[0].name == "get_weather"
  msg.toolCalls[1].name == "get_stock_price"
}
118+
119+
// With no custom factory installed by this feature, every start*Span entry point
// must hand back the shared NoOpLLMObsSpan instance.
def "test default NoOp span factory behavior"() {
  when:
  // one call per span kind, issued in the same order as the LLMObs entry points below
  def spans = [
    LLMObs.startLLMSpan("test", "gpt-4", "openai", "app", "session"),
    LLMObs.startAgentSpan("test", "app", "session"),
    LLMObs.startToolSpan("test", "app", "session"),
    LLMObs.startTaskSpan("test", "app", "session"),
    LLMObs.startWorkflowSpan("test", "app", "session"),
    LLMObs.startEmbeddingSpan("test", "app", "openai", "model", "session"),
    LLMObs.startRetrievalSpan("test", "app", "session")
  ]

  then:
  spans.every { it == NoOpLLMObsSpan.INSTANCE }
}
138+
139+
// Passing null for the optional arguments must still yield the NoOp span from
// every start*Span entry point.
def "test span creation with null optional parameters"() {
  when:
  // one call per span kind, each with its optional arguments set to null
  def spans = [
    LLMObs.startLLMSpan("test", "gpt-4", "openai", null, null),
    LLMObs.startAgentSpan("test", null, null),
    LLMObs.startToolSpan("test", null, null),
    LLMObs.startTaskSpan("test", null, null),
    LLMObs.startWorkflowSpan("test", null, null),
    LLMObs.startEmbeddingSpan("test", null, null, null, null),
    LLMObs.startRetrievalSpan("test", null, null)
  ]

  then:
  spans.every { it == NoOpLLMObsSpan.INSTANCE }
}
158+
159+
// Exercises the four SubmitEvaluation call shapes used in this spec (numeric or
// string value, with or without an application name) and expects none of them to throw.
def "test default NoOp evaluation processor behavior"() {
  given:
  def noopSpan = NoOpLLMObsSpan.INSTANCE

  when:
  LLMObs.SubmitEvaluation(noopSpan, "label", 0.5, [:])
  LLMObs.SubmitEvaluation(noopSpan, "label", 0.5, "app", [:])
  LLMObs.SubmitEvaluation(noopSpan, "label", "value", [:])
  LLMObs.SubmitEvaluation(noopSpan, "label", "value", "app", [:])

  then:
  noExceptionThrown()
}
170+
171+
// Submits numeric scores of 0.0, 1.0 and two in-between values, the last one with
// an application name, and expects no exception.
def "test evaluation submission with various score values"() {
  given:
  def evalTags = [category: "test", version: "1.0"]
  def noopSpan = NoOpLLMObsSpan.INSTANCE

  when:
  LLMObs.SubmitEvaluation(noopSpan, "accuracy", 0.0, evalTags)
  LLMObs.SubmitEvaluation(noopSpan, "precision", 1.0, evalTags)
  LLMObs.SubmitEvaluation(noopSpan, "recall", 0.85, evalTags)
  LLMObs.SubmitEvaluation(noopSpan, "f1_score", 0.92, "myapp", evalTags)

  then:
  noExceptionThrown()
}
185+
186+
// Submits string-valued evaluations, one of them with an application name, and
// expects no exception.
def "test evaluation submission with categorical values"() {
  given:
  def evalTags = [evaluator: "human", context: "production"]
  def noopSpan = NoOpLLMObsSpan.INSTANCE

  when:
  LLMObs.SubmitEvaluation(noopSpan, "quality", "excellent", evalTags)
  LLMObs.SubmitEvaluation(noopSpan, "relevance", "poor", evalTags)
  LLMObs.SubmitEvaluation(noopSpan, "toxicity", "safe", "content-app", evalTags)

  then:
  noExceptionThrown()
}
199+
200+
// An empty tag map is accepted for both a numeric and a string-valued evaluation.
def "test evaluation submission with empty tags"() {
  given:
  def noTags = [:]
  def noopSpan = NoOpLLMObsSpan.INSTANCE

  when:
  LLMObs.SubmitEvaluation(noopSpan, "score", 0.75, noTags)
  LLMObs.SubmitEvaluation(noopSpan, "category", "good", noTags)

  then:
  noExceptionThrown()
}
212+
213+
// Installs a mocked span factory and evaluation processor on LLMObs, then checks that
// every static entry point forwards its arguments unchanged to the installed
// collaborator and returns exactly what that collaborator produced.
def "test span creation with custom factory returns actual spans"() {
  given:
  LLMObs.LLMObsSpanFactory factory = Mock()
  LLMObs.LLMObsEvalProcessor evaluator = Mock()

  and:
  // one distinct mock per span kind, so a mixed-up return value cannot go unnoticed
  LLMObsSpan llmMock = Mock()
  LLMObsSpan agentMock = Mock()
  LLMObsSpan toolMock = Mock()
  LLMObsSpan taskMock = Mock()
  LLMObsSpan workflowMock = Mock()
  LLMObsSpan embeddingMock = Mock()
  LLMObsSpan retrievalMock = Mock()

  and:
  // swap the mocks in through reflection; cleanup() puts the NoOp singletons back
  setStaticField("SPAN_FACTORY", factory)
  setStaticField("EVAL_PROCESSOR", evaluator)

  when:
  def llmResult = LLMObs.startLLMSpan("chat-completion", "gpt-4", "openai", "my-app", "session-1")
  def agentResult = LLMObs.startAgentSpan("agent-task", "my-app", "session-1")
  def toolResult = LLMObs.startToolSpan("weather-tool", "my-app", "session-1")
  def taskResult = LLMObs.startTaskSpan("summarize-task", "my-app", "session-1")
  def workflowResult = LLMObs.startWorkflowSpan("data-workflow", "my-app", "session-1")
  def embeddingResult = LLMObs.startEmbeddingSpan("text-embed", "my-app", "openai", "text-embedding-ada-002", "session-1")
  def retrievalResult = LLMObs.startRetrievalSpan("document-retrieval", "my-app", "session-1")

  // evaluation submission: one numeric score, one string value with an app name
  LLMObs.SubmitEvaluation(llmMock, "accuracy", 0.95, [test: "value"])
  LLMObs.SubmitEvaluation(agentMock, "quality", "excellent", "eval-app", [reviewer: "human"])

  then:
  // each factory method is hit exactly once with the exact arguments, and is stubbed
  // to return its dedicated mock
  1 * factory.startLLMSpan("chat-completion", "gpt-4", "openai", "my-app", "session-1") >> llmMock
  1 * factory.startAgentSpan("agent-task", "my-app", "session-1") >> agentMock
  1 * factory.startToolSpan("weather-tool", "my-app", "session-1") >> toolMock
  1 * factory.startTaskSpan("summarize-task", "my-app", "session-1") >> taskMock
  1 * factory.startWorkflowSpan("data-workflow", "my-app", "session-1") >> workflowMock
  1 * factory.startEmbeddingSpan("text-embed", "my-app", "openai", "text-embedding-ada-002", "session-1") >> embeddingMock
  1 * factory.startRetrievalSpan("document-retrieval", "my-app", "session-1") >> retrievalMock

  and:
  // both evaluation submissions reach the installed processor
  1 * evaluator.SubmitEvaluation(llmMock, "accuracy", 0.95, [test: "value"])
  1 * evaluator.SubmitEvaluation(agentMock, "quality", "excellent", "eval-app", [reviewer: "human"])

  and:
  // callers receive the factory's span
  llmResult == llmMock
  agentResult == agentMock
  toolResult == toolMock
  taskResult == taskMock
  workflowResult == workflowMock
  embeddingResult == embeddingMock
  retrievalResult == retrievalMock

  and:
  // none of the results is the NoOp singleton
  llmResult != NoOpLLMObsSpan.INSTANCE
  agentResult != NoOpLLMObsSpan.INSTANCE
  toolResult != NoOpLLMObsSpan.INSTANCE
  taskResult != NoOpLLMObsSpan.INSTANCE
  workflowResult != NoOpLLMObsSpan.INSTANCE
  embeddingResult != NoOpLLMObsSpan.INSTANCE
  retrievalResult != NoOpLLMObsSpan.INSTANCE
}
275+
276+
// With a mocked factory installed, null optional arguments must be forwarded to the
// factory as null, and the factory's span must be returned to the caller.
def "test span creation with null parameters using custom factory"() {
  given:
  LLMObs.LLMObsSpanFactory factory = Mock()
  LLMObsSpan sharedSpan = Mock()

  and:
  // swap the mock in through reflection; cleanup() puts the NoOp factory back
  setStaticField("SPAN_FACTORY", factory)

  when:
  def llmResult = LLMObs.startLLMSpan("test-span", "gpt-4", "openai", null, null)
  def embeddingResult = LLMObs.startEmbeddingSpan("embed-span", null, null, null, null)

  then:
  1 * factory.startLLMSpan("test-span", "gpt-4", "openai", null, null) >> sharedSpan
  1 * factory.startEmbeddingSpan("embed-span", null, null, null, null) >> sharedSpan

  and:
  llmResult == sharedSpan
  embeddingResult == sharedSpan
}
294+
}

0 commit comments

Comments
 (0)