Skip to content

Commit c4265a5

Browse files
authored
fix: preserve Telegram word boundaries when rechunking HTML (openclaw#47274)
* fix: preserve Telegram chunk word boundaries
* fix: address Telegram chunking review feedback
* fix: preserve Telegram retry separators
* fix: preserve Telegram chunking boundaries (openclaw#47274)
1 parent 26e0a3e commit c4265a5

File tree

3 files changed

+242
-6
lines changed

3 files changed

+242
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
4444
- Email/webhook wrapping: sanitize sender and subject metadata before external-content wrapping so metadata fields cannot break the wrapper structure. Thanks @vincentkoc.
4545
- Node/startup: remove leftover debug `console.log("node host PATH: ...")` that printed the resolved PATH on every `openclaw node run` invocation. (#46411)
4646
- Telegram/message send: forward `--force-document` through the `sendPayload` path as well as `sendMedia`, so Telegram payload sends with `channelData` keep uploading images as documents instead of silently falling back to compressed photo sends. (#47119) Thanks @thepagent.
47+
- Telegram/message chunking: preserve spaces, paragraph separators, and word boundaries when HTML overflow rechunking splits formatted replies. (#47274)
4748

4849
## 2026.3.13
4950

extensions/telegram/src/format.ts

Lines changed: 212 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,146 @@ function sliceLinkSpans(
512512
});
513513
}
514514

515+
/**
 * Builds a new MarkdownIR restricted to the `[start, end)` range of `ir`.
 *
 * The text is cut with `String.prototype.slice`; style and link spans are
 * delegated to `sliceStyleSpans` / `sliceLinkSpans` with the same bounds.
 * NOTE(review): presumably those helpers clip and rebase span offsets so they
 * are relative to the sliced text — confirm against their definitions.
 *
 * @param ir - Source intermediate representation; not modified here.
 * @param start - Inclusive start offset into `ir.text`.
 * @param end - Exclusive end offset into `ir.text`.
 * @returns A fresh MarkdownIR object for the requested range.
 */
function sliceMarkdownIR(ir: MarkdownIR, start: number, end: number): MarkdownIR {
  const { text, styles, links } = ir;
  const slicedText = text.slice(start, end);
  const slicedStyles = sliceStyleSpans(styles, start, end);
  const slicedLinks = sliceLinkSpans(links, start, end);
  return { text: slicedText, styles: slicedStyles, links: slicedLinks };
}
522+
523+
/**
 * Re-joins style spans that overlap or touch and carry the same style.
 *
 * Each incoming span is compared with the most recently emitted span of the
 * same style, not merely with the tail of the output array. That matters when
 * several styles cross a chunk boundary at once (for example bold + italic):
 * the concatenated span list then interleaves styles, and a tail-only
 * comparison would leave every style split in two.
 *
 * Two spans are merged only when their ranges really overlap or touch
 * (`span.start <= open.end && span.end >= open.start`), so an out-of-order
 * span that lies entirely before the tracked one is kept instead of being
 * swallowed.
 *
 * @param styles - Style spans to merge; neither the array nor its elements
 *   are mutated.
 * @returns A new array of copied spans, in order of first appearance.
 */
function mergeAdjacentStyleSpans(styles: MarkdownIR["styles"]): MarkdownIR["styles"] {
  type StyleSpan = MarkdownIR["styles"][number];

  const merged: MarkdownIR["styles"] = [];
  // Latest emitted span per style; this is the only candidate a new span of that style may extend.
  const latestByStyle = new Map<StyleSpan["style"], StyleSpan>();

  for (const span of styles) {
    const open = latestByStyle.get(span.style);
    if (open && span.start <= open.end && span.end >= open.start) {
      open.start = Math.min(open.start, span.start);
      open.end = Math.max(open.end, span.end);
      continue;
    }
    // Copy before storing so later in-place extension never touches the caller's objects.
    const copy = { ...span };
    merged.push(copy);
    latestByStyle.set(span.style, copy);
  }

  return merged;
}
535+
536+
/**
 * Collapses consecutive link spans that point at the same `href` and whose
 * ranges overlap or touch.
 *
 * Only the span most recently placed in the result is considered as a merge
 * target. A span is folded into it when the hrefs are strictly equal and the
 * new span starts at or before the target's end; the target's end is then
 * extended to the larger of the two ends.
 *
 * @param links - Link spans to merge; the input array and its elements are
 *   left untouched because every emitted span is a shallow copy.
 * @returns A new array of merged link spans.
 */
function mergeAdjacentLinkSpans(links: MarkdownIR["links"]): MarkdownIR["links"] {
  const result: MarkdownIR["links"] = [];
  for (const current of links) {
    const tail = result[result.length - 1];
    if (tail !== undefined && tail.href === current.href && current.start <= tail.end) {
      tail.end = Math.max(tail.end, current.end);
    } else {
      result.push({ ...current });
    }
  }
  return result;
}
548+
549+
/**
 * Concatenates two MarkdownIR chunks into one.
 *
 * The right-hand chunk's style and link spans are shifted by the length of
 * the left-hand text so they address the combined string. The combined span
 * lists are then passed through `mergeAdjacentStyleSpans` /
 * `mergeAdjacentLinkSpans`.
 *
 * @param left - Chunk whose text comes first.
 * @param right - Chunk appended after `left`.
 * @returns A new MarkdownIR covering `left.text + right.text`.
 */
function mergeMarkdownIRChunks(left: MarkdownIR, right: MarkdownIR): MarkdownIR {
  const shift = left.text.length;

  const shiftedStyles = right.styles.map((span) => ({
    ...span,
    start: span.start + shift,
    end: span.end + shift,
  }));
  const shiftedLinks = right.links.map((link) => ({
    ...link,
    start: link.start + shift,
    end: link.end + shift,
  }));

  const text = left.text + right.text;
  const styles = mergeAdjacentStyleSpans([...left.styles, ...shiftedStyles]);
  const links = mergeAdjacentLinkSpans([...left.links, ...shiftedLinks]);
  return { text, styles, links };
}
571+
572+
/**
 * Renders one MarkdownIR chunk to HTML: `renderTelegramHtml` produces the
 * markup and `wrapFileReferencesInHtml` post-processes it. The returned
 * string is what callers in this file measure against the HTML length limit.
 *
 * @param ir - Chunk to render.
 * @returns The HTML string after both passes.
 */
function renderTelegramChunkHtml(ir: MarkdownIR): string {
  const html = renderTelegramHtml(ir);
  return wrapFileReferencesInHtml(html);
}
575+
576+
/**
 * Chooses where the chunk that begins at `start` should end, so that at most
 * `limit` characters are taken and, where possible, the cut lands on a
 * newline or word boundary instead of in the middle of a word.
 *
 * Candidate breaks inside the window `[start, start + limit)` are ranked:
 *   1. after the last newline outside parentheses,
 *   2. at the last other whitespace outside parentheses,
 *   3. after the last newline anywhere,
 *   4. at the last other whitespace anywhere,
 *   5. otherwise a hard cut at the end of the window.
 *
 * Whitespace seen before the first non-whitespace character of the window is
 * never used as a break. Parenthesis depth is tracked only within the window:
 * "(" increases it, ")" decreases it when it is positive.
 *
 * `limit` is normalised to an integer of at least 1 (NaN counts as 1), so the
 * result is always an integer strictly greater than `start` when `start` is
 * inside the text. Without this, a zero, negative or fractional limit would
 * hand back an index that stalls or corrupts a cursor-driven split loop.
 *
 * @param text - Full text being chunked.
 * @param start - Offset at which the current chunk begins.
 * @param limit - Maximum number of characters the chunk may contain.
 * @returns Exclusive end offset of the chunk; `text.length` when the rest of
 *   the text fits inside the window.
 */
function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: number): number {
  // Always make progress: take at least one whole character per chunk.
  const budget = Number.isNaN(limit) ? 1 : Math.max(1, Math.floor(limit));
  const maxEnd = Math.min(text.length, start + budget);
  if (maxEnd >= text.length) {
    return text.length;
  }

  // Hoisted so the pattern is not rebuilt for every character.
  const whitespacePattern = /\s/;
  const isWhitespace = (value: string | undefined): boolean =>
    value !== undefined && whitespacePattern.test(value);

  let lastOutsideParenNewlineBreak = -1;
  let lastOutsideParenWhitespaceBreak = -1;
  let lastOutsideParenWhitespaceRunStart = -1;
  let lastAnyNewlineBreak = -1;
  let lastAnyWhitespaceBreak = -1;
  let lastAnyWhitespaceRunStart = -1;
  let parenDepth = 0;
  let sawNonWhitespace = false;

  for (let index = start; index < maxEnd; index += 1) {
    const char = text[index];
    if (char === "(") {
      sawNonWhitespace = true;
      parenDepth += 1;
      continue;
    }
    if (char === ")" && parenDepth > 0) {
      sawNonWhitespace = true;
      parenDepth -= 1;
      continue;
    }
    if (!isWhitespace(char)) {
      sawNonWhitespace = true;
      continue;
    }
    // Leading whitespace of the window is not a usable break point.
    if (!sawNonWhitespace) {
      continue;
    }
    if (char === "\n") {
      // Newline breaks land just after the newline, keeping it in this chunk.
      lastAnyNewlineBreak = index + 1;
      if (parenDepth === 0) {
        lastOutsideParenNewlineBreak = index + 1;
      }
      continue;
    }
    // Remember where the current whitespace run began so the whole run can be
    // pushed to the next chunk if it continues past the chosen break.
    const whitespaceRunStart =
      index === start || !isWhitespace(text[index - 1]) ? index : lastAnyWhitespaceRunStart;
    lastAnyWhitespaceBreak = index + 1;
    lastAnyWhitespaceRunStart = whitespaceRunStart;
    if (parenDepth === 0) {
      lastOutsideParenWhitespaceBreak = index + 1;
      lastOutsideParenWhitespaceRunStart = whitespaceRunStart;
    }
  }

  // Break after the whitespace, unless the character at the break is itself
  // whitespace: then cut at the start of the run instead (provided that
  // still advances past `start`).
  const resolveWhitespaceBreak = (breakIndex: number, runStart: number): number => {
    if (breakIndex <= start || runStart <= start) {
      return breakIndex;
    }
    return isWhitespace(text[breakIndex]) ? runStart : breakIndex;
  };

  if (lastOutsideParenNewlineBreak > start) {
    return lastOutsideParenNewlineBreak;
  }
  if (lastOutsideParenWhitespaceBreak > start) {
    return resolveWhitespaceBreak(
      lastOutsideParenWhitespaceBreak,
      lastOutsideParenWhitespaceRunStart,
    );
  }
  if (lastAnyNewlineBreak > start) {
    return lastAnyNewlineBreak;
  }
  if (lastAnyWhitespaceBreak > start) {
    return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
  }
  // No boundary found inside the window: hard cut.
  return maxEnd;
}
654+
515655
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
516656
if (!ir.text) {
517657
return [];
@@ -523,7 +663,7 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
523663
const chunks: MarkdownIR[] = [];
524664
let cursor = 0;
525665
while (cursor < ir.text.length) {
526-
const end = Math.min(ir.text.length, cursor + normalizedLimit);
666+
const end = findMarkdownIRPreservedSplitIndex(ir.text, cursor, normalizedLimit);
527667
chunks.push({
528668
text: ir.text.slice(cursor, end),
529669
styles: sliceStyleSpans(ir.styles, cursor, end),
@@ -534,32 +674,98 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
534674
return chunks;
535675
}
536676

677+
/**
 * Removes whitespace-only chunks from a chunk list by folding their text into
 * a neighbouring chunk, as long as the neighbour's rendered HTML still fits
 * within `limit`.
 *
 * For each whitespace-only chunk the following is tried, in order:
 *   1. append it to the previously emitted chunk;
 *   2. prepend it to the following chunk;
 *   3. split it, giving the longest possible prefix to the previous chunk and
 *      the remainder to the following chunk, such that both still fit.
 * If none of these succeeds, the whitespace-only chunk is not added to the
 * output.
 *
 * The input array is not modified: forward merges are written into a private
 * shallow copy.
 *
 * @param chunks - Chunks in document order.
 * @param limit - Maximum allowed length of a chunk's rendered HTML.
 * @returns A new array of chunks with whitespace-only entries folded away.
 */
function coalesceWhitespaceOnlyMarkdownIRChunks(chunks: MarkdownIR[], limit: number): MarkdownIR[] {
  // Forward merges overwrite the slot after the current one; do that on a copy
  // so the caller's array keeps its original contents.
  const queue = [...chunks];
  const coalesced: MarkdownIR[] = [];

  const fitsLimit = (candidate: MarkdownIR): boolean =>
    renderTelegramChunkHtml(candidate).length <= limit;

  for (let index = 0; index < queue.length; index += 1) {
    const chunk = queue[index];
    if (!chunk) {
      continue;
    }
    if (chunk.text.trim().length > 0) {
      coalesced.push(chunk);
      continue;
    }

    const prev = coalesced.at(-1);
    const next = queue[index + 1];
    const chunkLength = chunk.text.length;

    if (prev) {
      const mergedPrev = mergeMarkdownIRChunks(prev, chunk);
      if (fitsLimit(mergedPrev)) {
        coalesced[coalesced.length - 1] = mergedPrev;
        continue;
      }
    }

    if (next) {
      const mergedNext = mergeMarkdownIRChunks(chunk, next);
      if (fitsLimit(mergedNext)) {
        queue[index + 1] = mergedNext;
        continue;
      }
    }

    if (prev && next) {
      // Neither neighbour can take the whole run: try splitting it, starting
      // with the longest prefix given to the previous chunk.
      for (let prefixLength = chunkLength - 1; prefixLength >= 1; prefixLength -= 1) {
        const mergedPrev = mergeMarkdownIRChunks(prev, sliceMarkdownIR(chunk, 0, prefixLength));
        if (!fitsLimit(mergedPrev)) {
          continue;
        }
        const mergedNext = mergeMarkdownIRChunks(
          sliceMarkdownIR(chunk, prefixLength, chunkLength),
          next,
        );
        if (fitsLimit(mergedNext)) {
          coalesced[coalesced.length - 1] = mergedPrev;
          queue[index + 1] = mergedNext;
          break;
        }
      }
    }
    // Falling through here means the whitespace-only chunk was either
    // distributed by the split above or could not be placed and is left out.
  }

  return coalesced;
}
739+
537740
function renderTelegramChunksWithinHtmlLimit(
538741
ir: MarkdownIR,
539742
limit: number,
540743
): TelegramFormattedChunk[] {
541744
const normalizedLimit = Math.max(1, Math.floor(limit));
542745
const pending = chunkMarkdownIR(ir, normalizedLimit);
543-
const rendered: TelegramFormattedChunk[] = [];
746+
const finalized: MarkdownIR[] = [];
544747
while (pending.length > 0) {
545748
const chunk = pending.shift();
546749
if (!chunk) {
547750
continue;
548751
}
549-
const html = wrapFileReferencesInHtml(renderTelegramHtml(chunk));
752+
const html = renderTelegramChunkHtml(chunk);
550753
if (html.length <= normalizedLimit || chunk.text.length <= 1) {
551-
rendered.push({ html, text: chunk.text });
754+
finalized.push(chunk);
552755
continue;
553756
}
554757
const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit, html.length);
555758
if (split.length <= 1) {
556759
// Worst-case safety: avoid retry loops, deliver the chunk as-is.
557-
rendered.push({ html, text: chunk.text });
760+
finalized.push(chunk);
558761
continue;
559762
}
560763
pending.unshift(...split);
561764
}
562-
return rendered;
765+
return coalesceWhitespaceOnlyMarkdownIRChunks(finalized, normalizedLimit).map((chunk) => ({
766+
html: renderTelegramChunkHtml(chunk),
767+
text: chunk.text,
768+
}));
563769
}
564770

565771
export function markdownToTelegramChunks(

extensions/telegram/src/format.wrap-md.test.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,35 @@ describe("markdownToTelegramChunks - file reference wrapping", () => {
174174
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
175175
expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
176176
});
177+
178+
it("prefers word boundaries when html-limit retry splits formatted prose", () => {
179+
const input = "**Which of these**";
180+
const chunks = markdownToTelegramChunks(input, 16);
181+
expect(chunks.map((chunk) => chunk.text)).toEqual(["Which of ", "these"]);
182+
expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true);
183+
});
184+
185+
it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => {
186+
const input = "**foo (bar baz qux quux**";
187+
const chunks = markdownToTelegramChunks(input, 20);
188+
expect(chunks.map((chunk) => chunk.text)).toEqual(["foo", "(bar baz qux ", "quux"]);
189+
expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true);
190+
});
191+
192+
it("does not emit whitespace-only chunks during html-limit retry splitting", () => {
193+
const input = "**ab <<**";
194+
const chunks = markdownToTelegramChunks(input, 11);
195+
expect(chunks.map((chunk) => chunk.text).join("")).toBe("ab <<");
196+
expect(chunks.every((chunk) => chunk.text.trim().length > 0)).toBe(true);
197+
expect(chunks.every((chunk) => chunk.html.length <= 11)).toBe(true);
198+
});
199+
200+
it("preserves paragraph separators when retry chunking produces whitespace-only spans", () => {
201+
const input = "ab\n\n<<";
202+
const chunks = markdownToTelegramChunks(input, 6);
203+
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
204+
expect(chunks.every((chunk) => chunk.html.length <= 6)).toBe(true);
205+
});
177206
});
178207

179208
describe("edge cases", () => {

0 commit comments

Comments
 (0)