Skip to content

Commit bcf39ba

Browse files
committed
Autoparser - full single commit squish
1 parent abb9f3c commit bcf39ba

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+12902
-10814
lines changed

common/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,11 @@ add_library(${TARGET} STATIC
4747
arg.cpp
4848
arg.h
4949
base64.hpp
50-
chat-parser.cpp
51-
chat-parser.h
52-
chat-parser-xml-toolcall.h
53-
chat-parser-xml-toolcall.cpp
50+
chat-auto-parser-generator.cpp
51+
chat-auto-parser-helpers.cpp
52+
chat-auto-parser.h
53+
chat-diff-analyzer.cpp
54+
chat-diff-analyzer.h
5455
chat-peg-parser.cpp
5556
chat-peg-parser.h
5657
chat.cpp

common/chat-auto-parser-generator.cpp

Lines changed: 411 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 348 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,348 @@
1+
#include "chat-auto-parser-helpers.h"
2+
3+
#include "chat-auto-parser.h"
4+
#include "chat-diff-analyzer.h"
5+
#include "chat.h"
6+
#include "log.h"
7+
#include "nlohmann/json.hpp"
8+
9+
#include <cctype>
10+
#include <numeric>
11+
12+
using json = nlohmann::ordered_json;
13+
14+
// Strips whitespace (as classified by std::isspace) from both ends of `str`.
// Returns an empty string when `str` is empty or consists solely of whitespace.
std::string trim_whitespace(const std::string & str) {
    // std::isspace requires a value representable as unsigned char, hence the cast.
    const auto is_ws = [](char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; };

    const size_t len = str.length();

    // Index of the first character that survives.
    size_t first = 0;
    for (; first < len && is_ws(str[first]); ++first) {}
    if (first == len) {
        return std::string();
    }

    // Index of the last character that survives; it can never move below `first`.
    size_t last = len - 1;
    for (; last > first && is_ws(str[last]); --last) {}

    return str.substr(first, last - first + 1);
}
31+
32+
// Returns `str` without its leading whitespace (as classified by std::isspace).
// Trailing whitespace is left untouched.
std::string trim_leading_whitespace(const std::string & str) {
    size_t skip = 0;
    for (const char c : str) {
        if (!std::isspace(static_cast<unsigned char>(c))) {
            break;
        }
        ++skip;
    }
    // skip == str.length() for an all-whitespace input, which yields an empty string.
    return str.substr(skip);
}
40+
41+
// Returns `str` without its trailing whitespace (as classified by std::isspace).
// Leading whitespace is left untouched; an empty or all-whitespace input yields an empty string.
std::string trim_trailing_whitespace(const std::string & str) {
    // `keep` is the number of leading characters to retain (one past the last non-space).
    size_t keep = str.length();
    while (keep > 0 && std::isspace(static_cast<unsigned char>(str[keep - 1]))) {
        --keep;
    }
    return str.substr(0, keep);
}
58+
59+
// Returns `str` with every trailing '\n' removed. Only the line-feed character is stripped;
// other whitespace (including '\r') is preserved.
std::string trim_trailing_newlines(const std::string & str) {
    const size_t last_kept = str.find_last_not_of('\n');
    if (last_kept == std::string::npos) {
        // Empty input, or nothing but newlines.
        return std::string();
    }
    return str.substr(0, last_kept + 1);
}
67+
68+
// Number of leading characters that `left` and `right` have in common.
static size_t common_prefix_len(const std::string & left, const std::string & right) {
    const size_t limit = std::min(left.length(), right.length());
    for (size_t i = 0; i < limit; ++i) {
        if (left[i] != right[i]) {
            return i;
        }
    }
    // The shorter string is entirely a prefix of the longer one.
    return limit;
}
76+
77+
// Number of trailing characters that `left` and `right` have in common.
static size_t common_suffix_len(const std::string & left, const std::string & right) {
    const size_t limit = std::min(left.length(), right.length());

    // Walk both strings backwards in lockstep; `matched < limit` is checked first so the
    // reverse iterators are never dereferenced past the start of the shorter string.
    auto   l_it    = left.rbegin();
    auto   r_it    = right.rbegin();
    size_t matched = 0;
    while (matched < limit && *l_it == *r_it) {
        ++l_it;
        ++r_it;
        ++matched;
    }
    return matched;
}
85+
86+
// Splits `left` and `right` into a shared prefix, a shared suffix and the two differing middles.
//
// Both inputs are tokenised with segmentize_markers(). Whole segments are matched from both ends
// first; afterwards, and only where the boundary segments on both sides are TEXT, the remaining
// text is additionally compared character by character.
//
// Returns a diff_split with:
//   prefix / suffix - content found at the start / end of both inputs
//   left / right    - what remains of each input between prefix and suffix
// If one input yields no segments, the other input is returned whole in its own field.
// If no difference remains, prefix is set to `left` and the other fields are empty.
diff_split calculate_diff_split(const std::string & left, const std::string & right) {
    diff_split result;

    auto left_seg  = segmentize_markers(left);
    auto right_seg = segmentize_markers(right);

    if (left_seg.empty()) {
        result.right = right;
        return result;
    }
    if (right_seg.empty()) {
        result.left = left;
        return result;
    }

    // [start, end] is an inclusive window over the segments not yet assigned to prefix/suffix.
    // `end() - 1` is used instead of `--end()`: decrementing the temporary does not compile
    // when the vector iterator is a plain pointer.
    auto left_start  = left_seg.begin();
    auto left_end    = left_seg.end() - 1;
    auto right_start = right_seg.begin();
    auto right_end   = right_seg.end() - 1;

    // True while both windows still hold more than one segment.
    auto test = [&] () {
        return left_start != left_end && right_start != right_end;
    };

    // Set once the single remaining segment of a window has been moved into prefix/suffix,
    // i.e. the window is logically empty even though start == end still points at it.
    bool left_fully_consumed  = false;
    bool right_fully_consumed = false;

    while (test()) {
        bool advanced = false;
        if (*left_start == *right_start) {
            result.prefix.append(left_start->value);
            left_start++;
            right_start++;
            advanced = true;
        }
        if (*left_end == *right_end) {
            result.suffix = left_end->value + result.suffix;
            // The prefix step above may have moved start onto end; in that case the window
            // cannot shrink further and is flagged as consumed instead.
            if (left_start != left_end) {
                left_end--;
            } else {
                left_fully_consumed = true;
            }
            if (right_start != right_end) {
                right_end--;
            } else {
                right_fully_consumed = true;
            }
            advanced = true;
        }
        if (!advanced) {
            break;
        }
    }

    // One window is down to a single segment: try to match it against either end of the other.
    // NOTE(review): if the loop already consumed that last segment as suffix (flag set above),
    // the branches below compare the same segment again and can add it to prefix/suffix a
    // second time - confirm whether this is intended.
    if (left_start == left_end && right_start != right_end) {
        if (*left_start == *right_end) {
            result.suffix = right_end->value + result.suffix;
            right_end--;
            left_fully_consumed = true;
        } else if (*left_start == *right_start) {
            result.prefix.append(right_start->value);
            right_start++;
            left_fully_consumed = true;
        }
    } else if (right_start == right_end && left_start != left_end) {
        if (*left_end == *right_start) {
            result.suffix = left_end->value + result.suffix;
            left_end--;
            right_fully_consumed = true;
        } else if (*left_start == *right_start) {
            result.prefix.append(left_start->value);
            left_start++;
            right_fully_consumed = true;
        }
    } else if (left_start == left_end && right_start == right_end && *left_start == *right_start && left_start->type == segment_type::MARKER) {
        result.prefix.append(right_start->value);
        left_fully_consumed  = true;
        right_fully_consumed = true;
    }

    // Character-level matching is only attempted when both boundary segments are TEXT.
    const bool can_have_text_suffix = left_end->type == segment_type::TEXT && right_end->type == segment_type::TEXT;
    const bool can_have_text_prefix = right_start->type == segment_type::TEXT && left_start->type == segment_type::TEXT;

    // Concatenates the values of the segments in [first, last).
    // A plain loop replaces std::accumulate with a `std::string &` reducer: since C++20
    // std::accumulate passes the accumulator as an rvalue, which such a reducer cannot accept.
    auto join_values = [](auto first, auto last) {
        std::string joined;
        for (; first != last; ++first) {
            joined += first->value;
        }
        return joined;
    };

    // A consumed window contributes nothing; otherwise the inclusive end is part of the remainder.
    const std::string remainder_left  = join_values(left_start,  left_fully_consumed  ? left_end  : left_end + 1);
    const std::string remainder_right = join_values(right_start, right_fully_consumed ? right_end : right_end + 1);

    const size_t suffix_len = can_have_text_suffix ? common_suffix_len(remainder_left, remainder_right) : 0;
    // avoid overlaps between prefix and suffix
    const size_t prefix_len = can_have_text_prefix ? common_prefix_len(remainder_left.substr(0, remainder_left.size() - suffix_len),
                                                                       remainder_right.substr(0, remainder_right.size() - suffix_len)) : 0;

    result.prefix.append(remainder_left.substr(0, prefix_len));
    result.suffix = remainder_left.substr(remainder_left.length() - suffix_len, suffix_len) + result.suffix;
    result.left   = remainder_left.substr(prefix_len, remainder_left.length() - prefix_len - suffix_len);
    result.right  = remainder_right.substr(prefix_len, remainder_right.length() - prefix_len - suffix_len);

    if (result.left.empty() && result.right.empty()) {
        // degenerate case, no diff: pick prefix = all as representation
        result.prefix = left;
        result.suffix = "";
    }
    return result;
}
192+
193+
// Determines the longest common prefix of `left` and `right`, then returns the part of `full`
// that precedes the first occurrence of that prefix. Yields an empty string when `left` and
// `right` share no leading characters, or when the shared prefix does not occur in `full`.
std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right) {
    const size_t limit = std::min(left.length(), right.length());

    // Length of the prefix shared by `left` and `right`.
    size_t shared = 0;
    for (; shared < limit; ++shared) {
        if (left[shared] != right[shared]) {
            break;
        }
    }
    if (shared == 0) {
        return std::string();
    }

    const size_t hit = full.find(left.substr(0, shared));
    return hit == std::string::npos ? std::string() : full.substr(0, hit);
}
219+
220+
// Determines the longest common suffix of `left` and `right`, then returns the part of `full`
// that follows the last occurrence of that suffix. Yields an empty string when `left` and
// `right` share no trailing characters, or when the shared suffix does not occur in `full`.
std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right) {
    const size_t limit = std::min(left.length(), right.length());

    // Length of the suffix shared by `left` and `right`, counted from the back of both.
    size_t shared = 0;
    auto   l_it   = left.rbegin();
    auto   r_it   = right.rbegin();
    while (shared < limit && *l_it == *r_it) {
        ++l_it;
        ++r_it;
        ++shared;
    }
    if (shared == 0) {
        return std::string();
    }

    const std::string tail = left.substr(left.length() - shared);
    const size_t      hit  = full.rfind(tail);
    if (hit == std::string::npos) {
        return std::string();
    }
    return full.substr(hit + shared);
}
249+
250+
// TODO: segmentize will treat a JSON array inside tags as a tag: <calls>[{ "fun": { ... } }]</calls> will be three markers
251+
//       Not too worried about this since it hasn't turned out to be a problem anywhere yet, but noting it here in case it becomes one.
252+
// Might have to put some restrictions on tag contents as well (like "no { }")
253+
std::vector<segment> segmentize_markers(const std::string & text) {
254+
std::vector<segment> retval;
255+
bool in_marker = false;
256+
char marker_opener = '\0';
257+
258+
auto is_marker_opener = [](char c) -> bool { return c == '<' || c == '['; };
259+
auto is_marker_closer = [](char op, char c) -> bool { return (op == '<' && c == '>') || (op == '[' && c == ']'); };
260+
261+
size_t last_border = 0;
262+
263+
for (size_t cur_pos = 0; cur_pos < text.length(); cur_pos++) {
264+
if (!in_marker && is_marker_opener(text[cur_pos])) {
265+
if (last_border < cur_pos) {
266+
retval.push_back(segment(segment_type::TEXT, text.substr(last_border, cur_pos - last_border)));
267+
}
268+
last_border = cur_pos;
269+
in_marker = true;
270+
marker_opener = text[cur_pos];
271+
} else if (in_marker && is_marker_closer(marker_opener, text[cur_pos])) {
272+
// no need to check because last_border will always be smaller
273+
retval.push_back(segment(segment_type::MARKER, text.substr(last_border, cur_pos - last_border + 1)));
274+
last_border = cur_pos + 1;
275+
in_marker = false;
276+
marker_opener = '\0';
277+
}
278+
}
279+
if (last_border < text.length()) {
280+
retval.push_back(segment(segment_type::TEXT, text.substr(last_border)));
281+
}
282+
return retval;
283+
}
284+
285+
std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments) {
286+
std::vector<segment> result;
287+
for (const auto & seg : segments) {
288+
if (!trim_whitespace(seg.value).empty()) {
289+
result.push_back(seg);
290+
}
291+
}
292+
return result;
293+
}
294+
295+
namespace autoparser {
296+
297+
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
298+
templates_params tmpl_params;
299+
tmpl_params.messages = params.messages;
300+
tmpl_params.tools = params.tools;
301+
tmpl_params.add_generation_prompt = params.add_generation_prompt;
302+
tmpl_params.enable_thinking = params.enable_thinking;
303+
304+
if (params.extra_context) {
305+
tmpl_params.extra_context = *params.extra_context;
306+
}
307+
tmpl_params.extra_context["enable_thinking"] = params.enable_thinking;
308+
309+
try {
310+
return common_chat_template_direct_apply(tmpl, tmpl_params);
311+
} catch (const std::exception & e) {
312+
LOG_DBG("Template application failed: %s\n", e.what());
313+
return "";
314+
}
315+
}
316+
317+
std::optional<compare_variants_result> compare_variants(
318+
const common_chat_template & tmpl,
319+
const template_params & params_A,
320+
const std::function<void(template_params &)> & params_modifier) {
321+
// Create variant B by copying A
322+
template_params params_B = params_A;
323+
324+
// Apply modifier to create variant B
325+
if (params_modifier) {
326+
params_modifier(params_B);
327+
}
328+
329+
// Apply template to both variants
330+
std::string output_A = apply_template(tmpl, params_A);
331+
std::string output_B = apply_template(tmpl, params_B);
332+
333+
// Check for template application failures
334+
if (output_A.empty() || output_B.empty()) {
335+
return std::nullopt;
336+
}
337+
338+
// Calculate diff and return result with both outputs
339+
compare_variants_result result;
340+
result.diff = calculate_diff_split(output_A, output_B);
341+
result.output_A = output_A;
342+
result.output_B = output_B;
343+
344+
return result;
345+
}
346+
347+
} // namespace autoparser
348+

0 commit comments

Comments
 (0)