|
| 1 | +package datadog.trace.api.llmobs |
| 2 | + |
| 3 | +import datadog.trace.api.llmobs.noop.NoOpLLMObsSpan |
| 4 | +import datadog.trace.api.llmobs.noop.NoOpLLMObsSpanFactory |
| 5 | +import datadog.trace.api.llmobs.noop.NoOpLLMObsEvalProcessor |
| 6 | +import datadog.trace.test.util.DDSpecification |
| 7 | +import spock.lang.Shared |
| 8 | +import java.lang.reflect.Field |
| 9 | + |
/**
 * Spock specification for the static {@link LLMObs} entry points and its nested
 * {@code ToolCall} / {@code LLMMessage} value types.
 *
 * <p>The spec swaps the {@code SPAN_FACTORY} and {@code EVAL_PROCESSOR} static fields of
 * {@link LLMObs} through reflection. The values found before the first feature runs are put
 * back once the whole spec is done, and the no-op implementations are re-installed after every
 * feature so that a mock installed by one feature is not seen by the next one.
 */
class LLMObsTest extends DDSpecification {

  /** Name of the static field on {@link LLMObs} holding the span factory. */
  private static final String SPAN_FACTORY_FIELD = "SPAN_FACTORY"

  /** Name of the static field on {@link LLMObs} holding the evaluation processor. */
  private static final String EVAL_PROCESSOR_FIELD = "EVAL_PROCESSOR"

  @Shared
  def savedSpanFactory
  @Shared
  def savedEvalProcessor

  /** Captures whatever is installed on {@link LLMObs} before any feature runs. */
  def setupSpec() {
    savedSpanFactory = readStatic(SPAN_FACTORY_FIELD)
    savedEvalProcessor = readStatic(EVAL_PROCESSOR_FIELD)
  }

  /** Puts back the values captured in {@code setupSpec()}. */
  def cleanupSpec() {
    writeStatic(SPAN_FACTORY_FIELD, savedSpanFactory)
    writeStatic(EVAL_PROCESSOR_FIELD, savedEvalProcessor)
  }

  /** Re-installs the no-op implementations after each feature. */
  def cleanup() {
    writeStatic(SPAN_FACTORY_FIELD, NoOpLLMObsSpanFactory.INSTANCE)
    writeStatic(EVAL_PROCESSOR_FIELD, NoOpLLMObsEvalProcessor.INSTANCE)
  }

  /** Looks up a declared field of {@link LLMObs} by name and makes it accessible. */
  private static Field staticField(String fieldName) {
    Field handle = LLMObs.class.getDeclaredField(fieldName)
    handle.setAccessible(true)
    return handle
  }

  /** Assigns {@code value} to the named static field of {@link LLMObs}. */
  private static void writeStatic(String fieldName, Object value) {
    staticField(fieldName).set(null, value)
  }

  /** Reads the current value of the named static field of {@link LLMObs}. */
  private static Object readStatic(String fieldName) {
    return staticField(fieldName).get(null)
  }

  def "test ToolCall creation and getters"() {
    given:
    def expectedArgs = [location: "New York", unit: "celsius"]

    when:
    def call = LLMObs.ToolCall.from("get_weather", "function", "tool-123", expectedArgs)

    then:
    with(call) {
      name == "get_weather"
      type == "function"
      toolId == "tool-123"
      arguments == expectedArgs
    }
  }

  def "test ToolCall with null arguments"() {
    when:
    def call = LLMObs.ToolCall.from("get_weather", "function", "tool-123", null)

    then:
    with(call) {
      name == "get_weather"
      type == "function"
      toolId == "tool-123"
      arguments == null
    }
  }

  def "test LLMMessage creation with toolCalls"() {
    given:
    def expectedCalls = [
      LLMObs.ToolCall.from("get_weather", "function", "tool-123", [location: "Paris"])
    ]

    when:
    def msg = LLMObs.LLMMessage.from("assistant", "Let me check the weather", expectedCalls)

    then:
    with(msg) {
      role == "assistant"
      content == "Let me check the weather"
      toolCalls == expectedCalls
      toolCalls.size() == 1
    }

    and:
    with(msg.toolCalls[0]) {
      name == "get_weather"
      type == "function"
      toolId == "tool-123"
      arguments == [location: "Paris"]
    }
  }

  def "test LLMMessage creation without toolCalls"() {
    when:
    def msg = LLMObs.LLMMessage.from("user", "What's the weather like?")

    then:
    with(msg) {
      role == "user"
      content == "What's the weather like?"
      toolCalls == null
    }
  }

  def "test LLMMessage with multiple toolCalls"() {
    given:
    def expectedCalls = [
      LLMObs.ToolCall.from("get_weather", "function", "tool-1", [location: "New York"]),
      LLMObs.ToolCall.from("get_stock_price", "function", "tool-2", [symbol: "AAPL"])
    ]

    when:
    def msg = LLMObs.LLMMessage.from("assistant", "I'll help you with both requests", expectedCalls)

    then:
    with(msg) {
      role == "assistant"
      content == "I'll help you with both requests"
      toolCalls == expectedCalls
      toolCalls.size() == 2
      toolCalls[0].name == "get_weather"
      toolCalls[1].name == "get_stock_price"
    }
  }

  def "test default NoOp span factory behavior"() {
    when:
    // One span of every kind, in the order: llm, agent, tool, task, workflow, embedding, retrieval
    def started = [
      LLMObs.startLLMSpan("test", "gpt-4", "openai", "app", "session"),
      LLMObs.startAgentSpan("test", "app", "session"),
      LLMObs.startToolSpan("test", "app", "session"),
      LLMObs.startTaskSpan("test", "app", "session"),
      LLMObs.startWorkflowSpan("test", "app", "session"),
      LLMObs.startEmbeddingSpan("test", "app", "openai", "model", "session"),
      LLMObs.startRetrievalSpan("test", "app", "session")
    ]

    then:
    started.size() == 7
    started.every { it == NoOpLLMObsSpan.INSTANCE }
  }

  def "test span creation with null optional parameters"() {
    when:
    // Same seven span kinds as above, with every optional argument left null
    def started = [
      LLMObs.startLLMSpan("test", "gpt-4", "openai", null, null),
      LLMObs.startAgentSpan("test", null, null),
      LLMObs.startToolSpan("test", null, null),
      LLMObs.startTaskSpan("test", null, null),
      LLMObs.startWorkflowSpan("test", null, null),
      LLMObs.startEmbeddingSpan("test", null, null, null, null),
      LLMObs.startRetrievalSpan("test", null, null)
    ]

    then:
    started.size() == 7
    started.every { it == NoOpLLMObsSpan.INSTANCE }
  }

  def "test default NoOp evaluation processor behavior"() {
    given:
    def noopSpan = NoOpLLMObsSpan.INSTANCE

    when:
    // None of the four overloads may throw while the no-op processor is installed
    LLMObs.SubmitEvaluation(noopSpan, "label", 0.5, [:])
    LLMObs.SubmitEvaluation(noopSpan, "label", 0.5, "app", [:])
    LLMObs.SubmitEvaluation(noopSpan, "label", "value", [:])
    LLMObs.SubmitEvaluation(noopSpan, "label", "value", "app", [:])

    then:
    noExceptionThrown()
  }

  def "test evaluation submission with various score values"() {
    given:
    def noopSpan = NoOpLLMObsSpan.INSTANCE
    def scoreTags = [category: "test", version: "1.0"]

    when:
    LLMObs.SubmitEvaluation(noopSpan, "accuracy", 0.0, scoreTags)
    LLMObs.SubmitEvaluation(noopSpan, "precision", 1.0, scoreTags)
    LLMObs.SubmitEvaluation(noopSpan, "recall", 0.85, scoreTags)
    LLMObs.SubmitEvaluation(noopSpan, "f1_score", 0.92, "myapp", scoreTags)

    then:
    noExceptionThrown()
  }

  def "test evaluation submission with categorical values"() {
    given:
    def noopSpan = NoOpLLMObsSpan.INSTANCE
    def categoryTags = [evaluator: "human", context: "production"]

    when:
    LLMObs.SubmitEvaluation(noopSpan, "quality", "excellent", categoryTags)
    LLMObs.SubmitEvaluation(noopSpan, "relevance", "poor", categoryTags)
    LLMObs.SubmitEvaluation(noopSpan, "toxicity", "safe", "content-app", categoryTags)

    then:
    noExceptionThrown()
  }

  def "test evaluation submission with empty tags"() {
    given:
    def noopSpan = NoOpLLMObsSpan.INSTANCE
    def noTags = [:]

    when:
    LLMObs.SubmitEvaluation(noopSpan, "score", 0.75, noTags)
    LLMObs.SubmitEvaluation(noopSpan, "category", "good", noTags)

    then:
    noExceptionThrown()
  }

  def "test span creation with custom factory returns actual spans"() {
    given:
    def factory = Mock(LLMObs.LLMObsSpanFactory)
    def evaluator = Mock(LLMObs.LLMObsEvalProcessor)

    def llmMock = Mock(LLMObsSpan)
    def agentMock = Mock(LLMObsSpan)
    def toolMock = Mock(LLMObsSpan)
    def taskMock = Mock(LLMObsSpan)
    def workflowMock = Mock(LLMObsSpan)
    def embeddingMock = Mock(LLMObsSpan)
    def retrievalMock = Mock(LLMObsSpan)
    def expectedSpans = [
      llmMock,
      agentMock,
      toolMock,
      taskMock,
      workflowMock,
      embeddingMock,
      retrievalMock
    ]

    // Install the mocks in place of the defaults
    writeStatic(SPAN_FACTORY_FIELD, factory)
    writeStatic(EVAL_PROCESSOR_FIELD, evaluator)

    when:
    def createdSpans = [
      LLMObs.startLLMSpan("chat-completion", "gpt-4", "openai", "my-app", "session-1"),
      LLMObs.startAgentSpan("agent-task", "my-app", "session-1"),
      LLMObs.startToolSpan("weather-tool", "my-app", "session-1"),
      LLMObs.startTaskSpan("summarize-task", "my-app", "session-1"),
      LLMObs.startWorkflowSpan("data-workflow", "my-app", "session-1"),
      LLMObs.startEmbeddingSpan("text-embed", "my-app", "openai", "text-embedding-ada-002", "session-1"),
      LLMObs.startRetrievalSpan("document-retrieval", "my-app", "session-1")
    ]

    // Evaluations go through the installed processor as well
    LLMObs.SubmitEvaluation(llmMock, "accuracy", 0.95, [test: "value"])
    LLMObs.SubmitEvaluation(agentMock, "quality", "excellent", "eval-app", [reviewer: "human"])

    then: "every factory method is invoked exactly once with the arguments given to LLMObs"
    1 * factory.startLLMSpan("chat-completion", "gpt-4", "openai", "my-app", "session-1") >> llmMock
    1 * factory.startAgentSpan("agent-task", "my-app", "session-1") >> agentMock
    1 * factory.startToolSpan("weather-tool", "my-app", "session-1") >> toolMock
    1 * factory.startTaskSpan("summarize-task", "my-app", "session-1") >> taskMock
    1 * factory.startWorkflowSpan("data-workflow", "my-app", "session-1") >> workflowMock
    1 * factory.startEmbeddingSpan("text-embed", "my-app", "openai", "text-embedding-ada-002", "session-1") >> embeddingMock
    1 * factory.startRetrievalSpan("document-retrieval", "my-app", "session-1") >> retrievalMock

    and: "both evaluations reach the processor"
    1 * evaluator.SubmitEvaluation(llmMock, "accuracy", 0.95, [test: "value"])
    1 * evaluator.SubmitEvaluation(agentMock, "quality", "excellent", "eval-app", [reviewer: "human"])

    and: "the spans handed back are the ones produced by the factory"
    createdSpans == expectedSpans

    and: "none of them is the no-op span"
    createdSpans.every { it != NoOpLLMObsSpan.INSTANCE }
  }

  def "test span creation with null parameters using custom factory"() {
    given:
    def factory = Mock(LLMObs.LLMObsSpanFactory)
    def stubbedSpan = Mock(LLMObsSpan)

    writeStatic(SPAN_FACTORY_FIELD, factory)

    when:
    def llmResult = LLMObs.startLLMSpan("test-span", "gpt-4", "openai", null, null)
    def embeddingResult = LLMObs.startEmbeddingSpan("embed-span", null, null, null, null)

    then: "null arguments are passed to the factory unchanged"
    1 * factory.startLLMSpan("test-span", "gpt-4", "openai", null, null) >> stubbedSpan
    1 * factory.startEmbeddingSpan("embed-span", null, null, null, null) >> stubbedSpan

    and:
    llmResult == stubbedSpan
    embeddingResult == stubbedSpan
  }
}
0 commit comments