Skip to content
/ core Public

Commit 2095087

Browse files
authored
feat: replace Algolia with local CJK search (#2621)
* feat: replace Algolia with local CJK search
* feat: add search highlight metadata
* Schedule search index rebuild
* refactor: share rich text extraction for search indexing
1 parent 1d3da45 commit 2095087

36 files changed

+1666
-1277
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ MX Space Core is a headless CMS server built with **NestJS**, **MongoDB**, and *
2626
| **AI Workflow** | Summary generation, multi-language translation, comment moderation, writing assistance, streaming responses |
2727
| **LLM Providers** | OpenAI, OpenAI-compatible, Anthropic, OpenRouter |
2828
| **Real-time** | WebSocket via Socket.IO with Redis adapter for multi-instance broadcast |
29-
| **Distribution** | RSS/Atom feeds, sitemap, Algolia search, aggregate API |
29+
| **Distribution** | RSS/Atom feeds, sitemap, local search, aggregate API |
3030
| **Auth** | JWT sessions, passkeys, OAuth, API keys (via better-auth) |
3131
| **Deployment** | Docker (multi-arch), PM2, standalone binary |
3232

apps/core/package.json

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,6 @@
5252
"changelog": true
5353
},
5454
"dependencies": {
55-
"@algolia/client-search": "^5.49.1",
5655
"@antfu/install-pkg": "1.1.0",
5756
"@anthropic-ai/sdk": "^0.78.0",
5857
"@babel/core": "7.29.0",
@@ -90,7 +89,6 @@
9089
"@typegoose/auto-increment": "^5.0.0",
9190
"@typegoose/typegoose": "^13.2.0",
9291
"@types/jsonwebtoken": "9.0.10",
93-
"algoliasearch": "5.49.1",
9492
"axios": "^1.13.3",
9593
"axios-retry": "4.5.0",
9694
"bcryptjs": "^3.0.3",

apps/core/src/constants/db.constant.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ export const PROJECT_COLLECTION_NAME = 'projects'
2424
export const READER_COLLECTION_NAME = 'readers'
2525
export const RECENTLY_COLLECTION_NAME = 'recentlies'
2626
export const SAY_COLLECTION_NAME = 'says'
27+
export const SEARCH_DOCUMENT_COLLECTION_NAME = 'search_documents'
2728
export const SERVERLESS_LOG_COLLECTION_NAME = 'serverless_logs'
2829
export const SERVERLESS_STORAGE_COLLECTION_NAME = 'serverless_storages'
2930
export const SLUG_TRACKER_COLLECTION_NAME = 'slug_trackers'

apps/core/src/constants/error-code.constant.ts

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -51,8 +51,6 @@ export enum ErrorCodeEnum {
5151
// biz - disabled/not enabled (400/403)
5252
LinkDisabled = 13000,
5353
SubpathLinkDisabled = 13001,
54-
AlgoliaNotEnabled = 13002,
55-
AlgoliaNotConfigured = 13003,
5654
BackupNotEnabled = 13004,
5755
SubscribeNotEnabled = 13005,
5856
PasswordLoginDisabled = 13006,
@@ -204,8 +202,6 @@ export const ErrorCode = Object.freeze<Record<ErrorCodeEnum, [string, number]>>(
204202
'管理员当前禁用了子路径友链申请',
205203
422,
206204
],
207-
[ErrorCodeEnum.AlgoliaNotEnabled]: ['Algolia 未开启', 400],
208-
[ErrorCodeEnum.AlgoliaNotConfigured]: ['Algolia 未配置', 400],
209205
[ErrorCodeEnum.BackupNotEnabled]: ['请先在设置中开启备份功能', 400],
210206
[ErrorCodeEnum.SubscribeNotEnabled]: ['订阅功能未开启', 400],
211207
[ErrorCodeEnum.PasswordLoginDisabled]: ['密码登录已禁用', 400],

apps/core/src/constants/event-bus.constant.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
export enum EventBusEvents {
22
EmailInit = 'email.init',
3-
PushSearch = 'search.push',
43
TokenExpired = 'token.expired',
54
CleanAggregateCache = 'cache.aggregate',
65
SystemException = 'system.exception',

apps/core/src/migration/history.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ import v10_0_5 from './version/v10.0.5'
3333
import v10_1_0 from './version/v10.1.0'
3434
import v10_4_1 from './version/v10.4.1'
3535
import v10_4_2 from './version/v10.4.2'
36+
import v10_4_3 from './version/v10.4.3'
3637

3738
export default [
3839
v200Alpha1,
@@ -70,4 +71,5 @@ export default [
7071
v10_1_0,
7172
v10_4_1,
7273
v10_4_2,
74+
v10_4_3,
7375
]
Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,88 @@
1+
import type { Db } from 'mongodb'
2+
3+
import {
4+
NOTE_COLLECTION_NAME,
5+
PAGE_COLLECTION_NAME,
6+
POST_COLLECTION_NAME,
7+
SEARCH_DOCUMENT_COLLECTION_NAME,
8+
} from '~/constants/db.constant'
9+
import { buildSearchDocument } from '~/modules/search/search-document.util'
10+
11+
import { defineMigration } from '../helper'
12+
13+
export default defineMigration(
14+
'v10.4.3-search-index-initial-rebuild',
15+
async (db: Db) => {
16+
const [posts, pages, notes] = await Promise.all([
17+
db
18+
.collection(POST_COLLECTION_NAME)
19+
.find(
20+
{},
21+
{
22+
projection: {
23+
title: 1,
24+
text: 1,
25+
content: 1,
26+
contentFormat: 1,
27+
slug: 1,
28+
created: 1,
29+
modified: 1,
30+
isPublished: 1,
31+
},
32+
},
33+
)
34+
.toArray(),
35+
db
36+
.collection(PAGE_COLLECTION_NAME)
37+
.find(
38+
{},
39+
{
40+
projection: {
41+
title: 1,
42+
text: 1,
43+
content: 1,
44+
contentFormat: 1,
45+
slug: 1,
46+
created: 1,
47+
modified: 1,
48+
},
49+
},
50+
)
51+
.toArray(),
52+
db
53+
.collection(NOTE_COLLECTION_NAME)
54+
.find(
55+
{},
56+
{
57+
projection: {
58+
title: 1,
59+
text: 1,
60+
content: 1,
61+
contentFormat: 1,
62+
slug: 1,
63+
nid: 1,
64+
created: 1,
65+
modified: 1,
66+
isPublished: 1,
67+
publicAt: 1,
68+
password: 1,
69+
},
70+
},
71+
)
72+
.toArray(),
73+
])
74+
75+
const documents = [
76+
...posts.map((doc) => buildSearchDocument('post', doc)),
77+
...pages.map((doc) => buildSearchDocument('page', doc)),
78+
...notes.map((doc) => buildSearchDocument('note', doc)),
79+
]
80+
81+
const collection = db.collection(SEARCH_DOCUMENT_COLLECTION_NAME)
82+
await collection.deleteMany({})
83+
84+
if (documents.length) {
85+
await collection.insertMany(documents, { ordered: false })
86+
}
87+
},
88+
)

apps/core/src/modules/ai/ai-translation/lexical-translation-parser.ts

Lines changed: 14 additions & 167 deletions
Original file line number | Diff line number | Diff line change
@@ -1,77 +1,20 @@
11
// Lexical translation parser: extract translatable segments from serialized JSON.
22
// Uses blacklist-based skipping + generalized nested editor detection.
33

4-
import {
5-
CodeBlockNode,
6-
CodeSnippetNode,
7-
EmbedNode,
8-
ExcalidrawNode,
9-
FootnoteNode,
10-
GalleryNode,
11-
ImageNode,
12-
KaTeXBlockNode,
13-
KaTeXInlineNode,
14-
LinkCardNode,
15-
MentionNode,
16-
MermaidNode,
17-
VideoNode,
18-
} from '@haklex/rich-headless'
19-
204
import {
215
BLOCK_ID_STATE_KEY,
226
NODE_STATE_KEY,
237
} from '~/constants/lexical.constant'
8+
import {
9+
isNestedLexicalEditorState,
10+
KNOWN_LEXICAL_STRUCTURAL_PROPS,
11+
LEXICAL_CONTEXT_EXCALIDRAW_TYPE,
12+
LEXICAL_CONTEXT_SKIP_BLOCKS,
13+
LEXICAL_CONTEXT_SKIP_INLINE,
14+
} from '~/utils/content.util'
2415

2516
const FORMAT_CODE = 16
2617

27-
const EXCALIDRAW_TYPE = ExcalidrawNode.getType()
28-
29-
const SKIP_BLOCKS = new Set([
30-
'code',
31-
CodeBlockNode.getType(),
32-
CodeSnippetNode.getType(),
33-
'code-highlight',
34-
ImageNode.getType(),
35-
VideoNode.getType(),
36-
GalleryNode.getType(),
37-
LinkCardNode.getType(),
38-
KaTeXBlockNode.getType(),
39-
MermaidNode.getType(),
40-
EmbedNode.getType(),
41-
'horizontalrule',
42-
'component',
43-
])
44-
45-
const SKIP_INLINE = new Set([
46-
KaTeXInlineNode.getType(),
47-
MentionNode.getType(),
48-
FootnoteNode.getType(),
49-
])
50-
51-
const KNOWN_STRUCTURAL_PROPS = new Set([
52-
'children',
53-
'type',
54-
'version',
55-
'direction',
56-
'format',
57-
'indent',
58-
'style',
59-
'detail',
60-
'mode',
61-
'text',
62-
'tag',
63-
'listType',
64-
'start',
65-
'value',
66-
'url',
67-
'rel',
68-
'target',
69-
'colSpan',
70-
'headerState',
71-
'width',
72-
NODE_STATE_KEY,
73-
])
74-
7518
export interface TranslationSegment {
7619
id: string
7720
text: string
@@ -142,28 +85,6 @@ function extractExcalidrawTexts(
14285
}
14386
}
14487

145-
function extractExcalidrawTextForContext(node: any): string {
146-
if (!node.snapshot || typeof node.snapshot !== 'string') return ''
147-
try {
148-
const parsed = JSON.parse(node.snapshot)
149-
if (!parsed.store) return ''
150-
const texts: string[] = []
151-
for (const value of Object.values(parsed.store)) {
152-
const shape = value as any
153-
if (
154-
shape?.props?.text &&
155-
typeof shape.props.text === 'string' &&
156-
shape.props.text.trim()
157-
) {
158-
texts.push(shape.props.text)
159-
}
160-
}
161-
return texts.join('\n')
162-
} catch {
163-
return ''
164-
}
165-
}
166-
16788
function walkNode(
16889
node: any,
16990
segments: TranslationSegment[],
@@ -174,13 +95,13 @@ function walkNode(
17495
if (!node) return
17596

17697
// Handle excalidraw: extract text from shapes within snapshot
177-
if (node.type === EXCALIDRAW_TYPE) {
98+
if (node.type === LEXICAL_CONTEXT_EXCALIDRAW_TYPE) {
17899
extractExcalidrawTexts(node, propertySegments, counter, ctx)
179100
return
180101
}
181102

182-
if (SKIP_BLOCKS.has(node.type)) return
183-
if (SKIP_INLINE.has(node.type)) return
103+
if (LEXICAL_CONTEXT_SKIP_BLOCKS.has(node.type)) return
104+
if (LEXICAL_CONTEXT_SKIP_INLINE.has(node.type)) return
184105

185106
// Special translatable properties
186107
if (
@@ -263,17 +184,11 @@ function scanNestedEditorStates(
263184
ctx: BlockContext,
264185
): void {
265186
for (const [propName, propValue] of Object.entries(node)) {
266-
if (KNOWN_STRUCTURAL_PROPS.has(propName)) continue
187+
if (KNOWN_LEXICAL_STRUCTURAL_PROPS.has(propName)) continue
267188

268189
// Single nested editor state: { root: { children: [...] } }
269-
if (
270-
propValue &&
271-
typeof propValue === 'object' &&
272-
!Array.isArray(propValue) &&
273-
(propValue as any).root &&
274-
Array.isArray((propValue as any).root.children)
275-
) {
276-
for (const child of (propValue as any).root.children) {
190+
if (isNestedLexicalEditorState(propValue)) {
191+
for (const child of propValue.root.children) {
277192
walkNode(child, segments, propertySegments, counter, ctx)
278193
}
279194
continue
@@ -282,12 +197,7 @@ function scanNestedEditorStates(
282197
// Array of nested editor states
283198
if (Array.isArray(propValue)) {
284199
for (const item of propValue) {
285-
if (
286-
item &&
287-
typeof item === 'object' &&
288-
item.root &&
289-
Array.isArray(item.root.children)
290-
) {
200+
if (isNestedLexicalEditorState(item)) {
291201
for (const child of item.root.children) {
292202
walkNode(child, segments, propertySegments, counter, ctx)
293203
}
@@ -297,69 +207,6 @@ function scanNestedEditorStates(
297207
}
298208
}
299209

300-
// ── Document context extraction ──
301-
302-
const BLOCK_TYPES = new Set([
303-
'listitem',
304-
'tablecell',
305-
'tablerow',
306-
'details',
307-
'list',
308-
'table',
309-
'root',
310-
])
311-
312-
function extractBlockText(node: any): string {
313-
if (!node) return ''
314-
if (node.type === EXCALIDRAW_TYPE)
315-
return extractExcalidrawTextForContext(node)
316-
if (SKIP_BLOCKS.has(node.type)) return ''
317-
if (SKIP_INLINE.has(node.type)) return ''
318-
if (node.type === 'text') return node.text ?? ''
319-
if (node.type === 'linebreak') return '\n'
320-
321-
const parts: string[] = []
322-
323-
if (Array.isArray(node.children)) {
324-
const sep = BLOCK_TYPES.has(node.type) ? '\n' : ''
325-
const joined = node.children.map(extractBlockText).filter(Boolean).join(sep)
326-
if (joined) parts.push(joined)
327-
}
328-
329-
// Nested editor states (same generic scan)
330-
for (const [propName, propValue] of Object.entries(node)) {
331-
if (KNOWN_STRUCTURAL_PROPS.has(propName)) continue
332-
if (
333-
propValue &&
334-
typeof propValue === 'object' &&
335-
!Array.isArray(propValue) &&
336-
(propValue as any).root &&
337-
Array.isArray((propValue as any).root.children)
338-
) {
339-
const nested = (propValue as any).root.children
340-
.map(extractBlockText)
341-
.filter(Boolean)
342-
if (nested.length) parts.push(nested.join('\n'))
343-
}
344-
if (Array.isArray(propValue)) {
345-
for (const item of propValue) {
346-
if (item?.root && Array.isArray(item.root.children)) {
347-
const nested = item.root.children
348-
.map(extractBlockText)
349-
.filter(Boolean)
350-
if (nested.length) parts.push(nested.join('\n'))
351-
}
352-
}
353-
}
354-
}
355-
356-
return parts.join('\n')
357-
}
358-
359-
export function extractDocumentContext(rootChildren: any[]): string {
360-
return rootChildren.map(extractBlockText).filter(Boolean).join('\n\n')
361-
}
362-
363210
// ── Parser ──
364211

365212
function readBlockId(node: any): string | null {

0 commit comments

Comments (0)