Skip to content

Commit 4dc55ea

Browse files
lml2468 and Takhoffman authored
fix(feishu): chunk large documents for write/append to avoid API 400 errors (#14402)
fix(feishu): chunk large documents for write/append to avoid API 400 errors (#14402)

* fix(feishu): chunk large documents for write/append to avoid API 400 errors

  The Feishu API limits documentBlockChildren.create to 50 blocks per request, and document.convert has content size limits for large markdown. Previously, writeDoc and appendDoc would send the entire content in a single API call, causing HTTP 400 errors for long documents.

  This commit adds:
  - splitMarkdownByHeadings(): splits markdown at # or ## headings
  - chunkedConvertMarkdown(): converts each chunk independently
  - chunkedInsertBlocks(): batches blocks into groups of ≤50

  Both writeDoc and appendDoc now use the chunked helpers while preserving backward compatibility for small documents. Image processing correctly receives all inserted blocks across batches.

* fix(feishu): skip heading detection inside fenced code blocks

  Addresses review feedback: splitMarkdownByHeadings() now tracks fenced code blocks (``` or ~~~) and skips heading-based splitting when inside one, preventing corruption of code block content.

* Feishu/Docx: add convert fallback chunking + tests

---------

Co-authored-by: lml2468 <[email protected]>
Co-authored-by: Tak Hoffman <[email protected]>
1 parent 27882dc commit 4dc55ea

File tree

3 files changed

+271
-7
lines changed

3 files changed

+271
-7
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Docs: https://docs.openclaw.ai
3333
- Feishu/Local media sends: propagate `mediaLocalRoots` through Feishu outbound media sending into `loadWebMedia` so local path attachments work with post-CVE local-root enforcement. (#27884) Thanks @joelnishanth.
3434
- Feishu/Group sender allowlist fallback: add global `channels.feishu.groupSenderAllowFrom` sender authorization for group chats, with per-group `groups.<id>.allowFrom` precedence and regression coverage for allow/block/precedence behavior. (#29174) Thanks @1MoreBuild.
3535
- Feishu/Docx append/write ordering: insert converted Docx blocks sequentially (single-block creates) so Feishu append/write preserves markdown block order instead of returning shuffled sections in asynchronous batch inserts. (#26172, #26022) Thanks @echoVic.
36+
- Feishu/Docx convert fallback chunking: recursively split oversized markdown chunks (including long no-heading sections) when `document.convert` hits content limits, while keeping fenced-code-aware split boundaries whenever possible. (#14402) Thanks @lml2468.
3637
- Feishu/Inbound media regression coverage: add explicit tests for message resource type mapping (`image` stays `image`, non-image maps to `file`) to prevent reintroducing unsupported Feishu `type=audio` fetches. (#16311, #8746) Thanks @Yaxuan42.
3738
- Feishu/API quota controls: add `typingIndicator` and `resolveSenderNames` config flags (top-level and per-account) so operators can disable typing reactions and sender-name lookup requests while keeping default behavior unchanged. (#10513) Thanks @BigUncle.
3839
- Security/Feishu webhook ingress: bound unauthenticated webhook rate-limit state with stale-window pruning and a hard key cap to prevent unbounded pre-auth memory growth from rotating source keys. (#26050) Thanks @bmendonca3.

extensions/feishu/src/docx.test.ts

Lines changed: 130 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ describe("feishu_doc image fetch hardening", () => {
147147
const result = await feishuDocTool.execute("tool-call", {
148148
action: "append",
149149
doc_token: "doc_1",
150-
content: "## H1\ntext\n## H2",
150+
content: "plain text body",
151151
});
152152

153153
// Verify sequential insertion: one call per block
@@ -163,6 +163,135 @@ describe("feishu_doc image fetch hardening", () => {
163163
expect(result.details.blocks_added).toBe(3);
164164
});
165165

166+
it("falls back to size-based convert chunking for long no-heading markdown", async () => {
167+
let successChunkCount = 0;
168+
convertMock.mockImplementation(async ({ data }) => {
169+
const content = data.content as string;
170+
if (content.length > 280) {
171+
return { code: 999, msg: "content too large" };
172+
}
173+
successChunkCount++;
174+
const blockId = `b_${successChunkCount}`;
175+
return {
176+
code: 0,
177+
data: {
178+
blocks: [{ block_type: 2, block_id: blockId }],
179+
first_level_block_ids: [blockId],
180+
},
181+
};
182+
});
183+
184+
blockChildrenCreateMock.mockImplementation(async ({ data }) => ({
185+
code: 0,
186+
data: { children: data.children },
187+
}));
188+
189+
const registerTool = vi.fn();
190+
registerFeishuDocTools({
191+
config: {
192+
channels: {
193+
feishu: { appId: "app_id", appSecret: "app_secret" },
194+
},
195+
} as any,
196+
logger: { debug: vi.fn(), info: vi.fn() } as any,
197+
registerTool,
198+
} as any);
199+
200+
const feishuDocTool = registerTool.mock.calls
201+
.map((call) => call[0])
202+
.map((tool) => (typeof tool === "function" ? tool({}) : tool))
203+
.find((tool) => tool.name === "feishu_doc");
204+
expect(feishuDocTool).toBeDefined();
205+
206+
const longMarkdown = Array.from(
207+
{ length: 120 },
208+
(_, i) => `line ${i} with enough content to trigger fallback chunking`,
209+
).join("\n");
210+
211+
const result = await feishuDocTool.execute("tool-call", {
212+
action: "append",
213+
doc_token: "doc_1",
214+
content: longMarkdown,
215+
});
216+
217+
expect(convertMock.mock.calls.length).toBeGreaterThan(1);
218+
expect(successChunkCount).toBeGreaterThan(1);
219+
expect(result.details.blocks_added).toBe(successChunkCount);
220+
});
221+
222+
it("keeps fenced code blocks balanced when size fallback split is needed", async () => {
223+
const convertedChunks: string[] = [];
224+
let successChunkCount = 0;
225+
let failFirstConvert = true;
226+
convertMock.mockImplementation(async ({ data }) => {
227+
const content = data.content as string;
228+
convertedChunks.push(content);
229+
if (failFirstConvert) {
230+
failFirstConvert = false;
231+
return { code: 999, msg: "content too large" };
232+
}
233+
successChunkCount++;
234+
const blockId = `c_${successChunkCount}`;
235+
return {
236+
code: 0,
237+
data: {
238+
blocks: [{ block_type: 2, block_id: blockId }],
239+
first_level_block_ids: [blockId],
240+
},
241+
};
242+
});
243+
244+
blockChildrenCreateMock.mockImplementation(async ({ data }) => ({
245+
code: 0,
246+
data: { children: data.children },
247+
}));
248+
249+
const registerTool = vi.fn();
250+
registerFeishuDocTools({
251+
config: {
252+
channels: {
253+
feishu: { appId: "app_id", appSecret: "app_secret" },
254+
},
255+
} as any,
256+
logger: { debug: vi.fn(), info: vi.fn() } as any,
257+
registerTool,
258+
} as any);
259+
260+
const feishuDocTool = registerTool.mock.calls
261+
.map((call) => call[0])
262+
.map((tool) => (typeof tool === "function" ? tool({}) : tool))
263+
.find((tool) => tool.name === "feishu_doc");
264+
expect(feishuDocTool).toBeDefined();
265+
266+
const fencedMarkdown = [
267+
"## Section",
268+
"```ts",
269+
"const alpha = 1;",
270+
"const beta = 2;",
271+
"const gamma = alpha + beta;",
272+
"console.log(gamma);",
273+
"```",
274+
"",
275+
"Tail paragraph one with enough text to exceed API limits when combined. ".repeat(8),
276+
"Tail paragraph two with enough text to exceed API limits when combined. ".repeat(8),
277+
"Tail paragraph three with enough text to exceed API limits when combined. ".repeat(8),
278+
].join("\n");
279+
280+
const result = await feishuDocTool.execute("tool-call", {
281+
action: "append",
282+
doc_token: "doc_1",
283+
content: fencedMarkdown,
284+
});
285+
286+
expect(convertMock.mock.calls.length).toBeGreaterThan(1);
287+
expect(successChunkCount).toBeGreaterThan(1);
288+
for (const chunk of convertedChunks) {
289+
const fenceCount = chunk.match(/```/g)?.length ?? 0;
290+
expect(fenceCount % 2).toBe(0);
291+
}
292+
expect(result.details.blocks_added).toBe(successChunkCount);
293+
});
294+
166295
it("skips image upload when markdown image URL is blocked", async () => {
167296
const consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
168297
fetchRemoteMediaMock.mockRejectedValueOnce(

extensions/feishu/src/docx.ts

Lines changed: 140 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ function cleanBlocksForInsert(blocks: any[]): { cleaned: any[]; skipped: string[
8585

8686
// ============ Core Functions ============
8787

88+
/** Max blocks per documentBlockChildren.create request */
89+
const MAX_BLOCKS_PER_INSERT = 50;
90+
const MAX_CONVERT_RETRY_DEPTH = 8;
91+
8892
async function convertMarkdown(client: Lark.Client, markdown: string) {
8993
const res = await client.docx.document.convert({
9094
data: { content_type: "markdown", content: markdown },
@@ -143,6 +147,138 @@ async function insertBlocks(
143147
return { children: allInserted, skipped };
144148
}
145149

150+
/** Split markdown into chunks at top-level headings (# or ##) to stay within API content limits */
151+
function splitMarkdownByHeadings(markdown: string): string[] {
152+
const lines = markdown.split("\n");
153+
const chunks: string[] = [];
154+
let current: string[] = [];
155+
let inFencedBlock = false;
156+
157+
for (const line of lines) {
158+
if (/^(`{3,}|~{3,})/.test(line)) {
159+
inFencedBlock = !inFencedBlock;
160+
}
161+
if (!inFencedBlock && /^#{1,2}\s/.test(line) && current.length > 0) {
162+
chunks.push(current.join("\n"));
163+
current = [];
164+
}
165+
current.push(line);
166+
}
167+
if (current.length > 0) {
168+
chunks.push(current.join("\n"));
169+
}
170+
return chunks;
171+
}
172+
173+
/** Split markdown by size, preferring to break outside fenced code blocks when possible */
174+
function splitMarkdownBySize(markdown: string, maxChars: number): string[] {
175+
if (markdown.length <= maxChars) {
176+
return [markdown];
177+
}
178+
179+
const lines = markdown.split("\n");
180+
const chunks: string[] = [];
181+
let current: string[] = [];
182+
let currentLength = 0;
183+
let inFencedBlock = false;
184+
185+
for (const line of lines) {
186+
if (/^(`{3,}|~{3,})/.test(line)) {
187+
inFencedBlock = !inFencedBlock;
188+
}
189+
190+
const lineLength = line.length + 1;
191+
const wouldExceed = currentLength + lineLength > maxChars;
192+
if (current.length > 0 && wouldExceed && !inFencedBlock) {
193+
chunks.push(current.join("\n"));
194+
current = [];
195+
currentLength = 0;
196+
}
197+
198+
current.push(line);
199+
currentLength += lineLength;
200+
}
201+
202+
if (current.length > 0) {
203+
chunks.push(current.join("\n"));
204+
}
205+
206+
if (chunks.length > 1) {
207+
return chunks;
208+
}
209+
210+
// Degenerate case: no safe boundary outside fenced content.
211+
const midpoint = Math.floor(lines.length / 2);
212+
if (midpoint <= 0 || midpoint >= lines.length) {
213+
return [markdown];
214+
}
215+
return [lines.slice(0, midpoint).join("\n"), lines.slice(midpoint).join("\n")];
216+
}
217+
218+
async function convertMarkdownWithFallback(client: Lark.Client, markdown: string, depth = 0) {
219+
try {
220+
return await convertMarkdown(client, markdown);
221+
} catch (error) {
222+
if (depth >= MAX_CONVERT_RETRY_DEPTH || markdown.length < 2) {
223+
throw error;
224+
}
225+
226+
const splitTarget = Math.max(256, Math.floor(markdown.length / 2));
227+
const chunks = splitMarkdownBySize(markdown, splitTarget);
228+
if (chunks.length <= 1) {
229+
throw error;
230+
}
231+
232+
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK block types
233+
const blocks: any[] = [];
234+
const firstLevelBlockIds: string[] = [];
235+
236+
for (const chunk of chunks) {
237+
const converted = await convertMarkdownWithFallback(client, chunk, depth + 1);
238+
blocks.push(...converted.blocks);
239+
firstLevelBlockIds.push(...converted.firstLevelBlockIds);
240+
}
241+
242+
return { blocks, firstLevelBlockIds };
243+
}
244+
}
245+
246+
/** Convert markdown in chunks to avoid document.convert content size limits */
247+
async function chunkedConvertMarkdown(client: Lark.Client, markdown: string) {
248+
const chunks = splitMarkdownByHeadings(markdown);
249+
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK block types
250+
const allBlocks: any[] = [];
251+
for (const chunk of chunks) {
252+
const { blocks, firstLevelBlockIds } = await convertMarkdownWithFallback(client, chunk);
253+
const sorted = sortBlocksByFirstLevel(blocks, firstLevelBlockIds);
254+
allBlocks.push(...sorted);
255+
}
256+
return allBlocks;
257+
}
258+
259+
/** Insert blocks in batches of MAX_BLOCKS_PER_INSERT to avoid API 400 errors */
260+
/* eslint-disable @typescript-eslint/no-explicit-any -- SDK block types */
261+
async function chunkedInsertBlocks(
262+
client: Lark.Client,
263+
docToken: string,
264+
blocks: any[],
265+
parentBlockId?: string,
266+
): Promise<{ children: any[]; skipped: string[] }> {
267+
/* eslint-enable @typescript-eslint/no-explicit-any */
268+
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK block types
269+
const allChildren: any[] = [];
270+
const allSkipped: string[] = [];
271+
272+
for (let i = 0; i < blocks.length; i += MAX_BLOCKS_PER_INSERT) {
273+
const batch = blocks.slice(i, i + MAX_BLOCKS_PER_INSERT);
274+
const { children, skipped } = await insertBlocks(client, docToken, batch, parentBlockId);
275+
allChildren.push(...children);
276+
allSkipped.push(...skipped);
277+
}
278+
279+
return { children: allChildren, skipped: allSkipped };
280+
}
281+
146282
async function clearDocumentContent(client: Lark.Client, docToken: string) {
147283
const existing = await client.docx.documentBlock.list({
148284
path: { document_id: docToken },
@@ -499,13 +635,12 @@ async function createDoc(
499635
async function writeDoc(client: Lark.Client, docToken: string, markdown: string, maxBytes: number) {
500636
const deleted = await clearDocumentContent(client, docToken);
501637

502-
const { blocks, firstLevelBlockIds } = await convertMarkdown(client, markdown);
638+
const blocks = await chunkedConvertMarkdown(client, markdown);
503639
if (blocks.length === 0) {
504640
return { success: true, blocks_deleted: deleted, blocks_added: 0, images_processed: 0 };
505641
}
506-
const sortedBlocks = sortBlocksByFirstLevel(blocks, firstLevelBlockIds);
507642

508-
const { children: inserted, skipped } = await insertBlocks(client, docToken, sortedBlocks);
643+
const { children: inserted, skipped } = await chunkedInsertBlocks(client, docToken, blocks);
509644
const imagesProcessed = await processImages(client, docToken, markdown, inserted, maxBytes);
510645

511646
return {
@@ -525,13 +660,12 @@ async function appendDoc(
525660
markdown: string,
526661
maxBytes: number,
527662
) {
528-
const { blocks, firstLevelBlockIds } = await convertMarkdown(client, markdown);
663+
const blocks = await chunkedConvertMarkdown(client, markdown);
529664
if (blocks.length === 0) {
530665
throw new Error("Content is empty");
531666
}
532-
const sortedBlocks = sortBlocksByFirstLevel(blocks, firstLevelBlockIds);
533667

534-
const { children: inserted, skipped } = await insertBlocks(client, docToken, sortedBlocks);
668+
const { children: inserted, skipped } = await chunkedInsertBlocks(client, docToken, blocks);
535669
const imagesProcessed = await processImages(client, docToken, markdown, inserted, maxBytes);
536670

537671
return {

0 commit comments

Comments
 (0)