/**
 * One piece of a source document, paired with the metadata that records
 * where it came from.
 */
interface Chunk {
  /** Identifier assigned to this chunk. */
  id: string;

  /** The chunk's text content. */
  text: string;

  /** Origin and position information for the chunk. */
  metadata: ChunkMetadata;
}
6
/**
 * Provenance and ordering information attached to a chunk.
 */
interface ChunkMetadata {
  /** Identifier of the document the chunk was cut from. */
  documentId: string;

  /** Where the document came from. */
  source: string;

  /** Heading of the section the chunk belongs to, when one is known. */
  section?: string;

  /** Zero-based position of the chunk within its sequence. */
  chunkIndex: number;

  /** Number of chunks in the sequence this chunk belongs to. */
  totalChunks: number;
}
14
/**
 * Splits text into chunks whose estimated size stays within a token limit.
 *
 * The text is divided on progressively finer separators (paragraph, line,
 * sentence, word) and neighbouring pieces are merged greedily while they
 * still fit. Each chunk after the first is prefixed with the trailing words
 * of the previous piece, worth up to `overlap` estimated tokens. The split
 * budget is reduced by the overlap so that a chunk, overlap included, still
 * fits within `maxTokens`.
 *
 * Token counts are a heuristic (whitespace-separated words x 1.33), not the
 * output of a real tokenizer.
 */
class RecursiveChunker {
  /** Separators tried in order, from coarsest to finest. */
  private readonly separators = ['\n\n', '\n', '. ', ' '];

  /** Heuristic ratio of tokens to whitespace-separated words. */
  private readonly tokensPerWord = 1.33;

  /**
   * @param maxTokens Upper bound on the estimated tokens of an emitted chunk.
   * @param overlap Estimated tokens of context repeated from the previous
   *   chunk. Clamped to the range [0, maxTokens / 2] so the budget left for
   *   new text stays positive.
   */
  constructor(
    private maxTokens: number = 512,
    private overlap: number = 50,
  ) {}

  /**
   * Chunks `text` and attaches `metadata` to every chunk.
   *
   * Whitespace-only pieces are dropped before numbering, so `chunkIndex`
   * values are contiguous and `totalChunks` equals the returned length.
   *
   * @param text The text to split.
   * @param metadata Metadata copied onto every chunk.
   * @returns The chunks in document order; empty when `text` has no content.
   */
  chunk(text: string, metadata: Omit<ChunkMetadata, 'chunkIndex' | 'totalChunks'>): Chunk[] {
    const overlapTokens = this.effectiveOverlap();
    // Reserve room for the overlap so the final chunk still fits maxTokens.
    const budget = this.maxTokens - overlapTokens;

    const pieces = this.split(text, this.separators, budget)
      .map((piece) => piece.trim())
      .filter((piece) => piece.length > 0);

    const texts = this.applyOverlap(pieces, overlapTokens);
    const total = texts.length;

    return texts.map((chunkText, index) => ({
      id: crypto.randomUUID(),
      text: chunkText,
      metadata: { ...metadata, chunkIndex: index, totalChunks: total },
    }));
  }

  /** Returns the configured overlap clamped to [0, maxTokens / 2]. */
  private effectiveOverlap(): number {
    const requested = Number.isFinite(this.overlap) ? Math.floor(this.overlap) : 0;
    const ceiling = Math.floor(this.maxTokens / 2);
    return Math.max(0, Math.min(requested, ceiling));
  }

  /** Prefixes every piece after the first with the tail of its predecessor. */
  private applyOverlap(pieces: string[], overlapTokens: number): string[] {
    const overlapWords = this.wordCapacity(overlapTokens);
    if (overlapWords === 0) return pieces;

    return pieces.map((piece, index) => {
      if (index === 0) return piece;
      // The tail comes from the un-overlapped piece so context never compounds.
      const previous = pieces[index - 1] ?? '';
      const tail = this.wordsOf(previous).slice(-overlapWords).join(' ');
      return tail ? `${tail} ${piece}` : piece;
    });
  }

  /**
   * Splits `text` on the first separator, merging adjacent parts while the
   * merged text fits `limit`, and recursing with the finer separators into
   * any part that is too large on its own.
   *
   * A separator that falls exactly on a chunk boundary is not kept in
   * either neighbouring chunk.
   */
  private split(text: string, separators: string[], limit: number): string[] {
    const [sep, ...finer] = separators;
    if (sep === undefined) {
      // No separators left: hard-split by words rather than emit oversized text.
      return this.tokenCount(text) > limit ? this.splitByWords(text, limit) : [text];
    }

    const chunks: string[] = [];
    let current = '';

    for (const part of text.split(sep)) {
      const candidate = current ? current + sep + part : part;

      if (this.tokenCount(candidate) <= limit) {
        current = candidate;
        continue;
      }

      if (current) chunks.push(current);

      if (this.tokenCount(part) > limit) {
        chunks.push(...this.split(part, finer, limit));
        current = '';
      } else {
        current = part;
      }
    }

    if (current) chunks.push(current);
    return chunks;
  }

  /**
   * Last-resort split: groups whitespace-separated words into runs that fit
   * `limit`, joined by single spaces. A run always holds at least one word,
   * so a single word may still exceed a very small limit.
   */
  private splitByWords(text: string, limit: number): string[] {
    const words = this.wordsOf(text);
    const perChunk = Math.max(1, this.wordCapacity(limit));
    const groups: string[] = [];
    for (let start = 0; start < words.length; start += perChunk) {
      groups.push(words.slice(start, start + perChunk).join(' '));
    }
    return groups;
  }

  /** Largest word count whose token estimate does not exceed `tokenLimit`. */
  private wordCapacity(tokenLimit: number): number {
    if (!(tokenLimit > 0)) return 0;
    let words = Math.floor(tokenLimit / this.tokensPerWord);
    // Guard against rounding putting the estimate just over the limit.
    while (words > 0 && this.tokensForWords(words) > tokenLimit) words -= 1;
    return words;
  }

  /** Token estimate for a given number of words. */
  private tokensForWords(words: number): number {
    return Math.ceil(words * this.tokensPerWord);
  }

  /** Whitespace-separated words of `text`; empty for blank text. */
  private wordsOf(text: string): string[] {
    const trimmed = text.trim();
    return trimmed === '' ? [] : trimmed.split(/\s+/);
  }

  /** Estimated token count of `text`; blank text counts as zero. */
  private tokenCount(text: string): number {
    return this.tokensForWords(this.wordsOf(text).length);
  }
}
66
/**
 * Chunks a Markdown-style document one section at a time.
 *
 * The document is divided at level 1-3 ATX headings (`#`, `##`, `###`).
 * Each section's body is chunked by a `RecursiveChunker`, and every
 * resulting chunk is prefixed with `Section: <heading>` and tagged with the
 * heading in its metadata.
 *
 * NOTE: the heading prefix is added after the body has been sized, so it is
 * not counted against the `maxTokens` limit given to the inner chunker.
 */
class SectionAwareChunker {
  private recursive: RecursiveChunker;

  /**
   * @param maxTokens Token limit handed to the inner `RecursiveChunker`.
   */
  constructor(maxTokens: number = 512) {
    this.recursive = new RecursiveChunker(maxTokens);
  }

  /**
   * Chunks `text` section by section.
   *
   * `chunkIndex` and `totalChunks` are renumbered across the whole document,
   * so they are unique and consistent for a given `documentId` instead of
   * restarting at zero in every section.
   *
   * @param text The document to split.
   * @param metadata Metadata copied onto every chunk; `section` is filled in
   *   from the heading the chunk falls under.
   * @returns All chunks in document order; empty when `text` has no content.
   */
  chunk(text: string, metadata: Omit<ChunkMetadata, 'chunkIndex' | 'totalChunks' | 'section'>): Chunk[] {
    const prefixed: Chunk[] = [];

    for (const section of this.extractSections(text)) {
      const prefix = `Section: ${section.header}\n\n`;
      const sectionChunks = this.recursive.chunk(section.content, {
        ...metadata,
        section: section.header,
      });

      for (const chunk of sectionChunks) {
        // Copy rather than mutate the objects returned by the inner chunker.
        prefixed.push({ ...chunk, text: prefix + chunk.text });
      }
    }

    const total = prefixed.length;
    return prefixed.map((chunk, index) => ({
      ...chunk,
      metadata: { ...chunk.metadata, chunkIndex: index, totalChunks: total },
    }));
  }

  /**
   * Divides `text` into sections at level 1-3 headings.
   *
   * Text before the first heading is filed under "Introduction". Sections
   * with no body are omitted. Both LF and CRLF line endings are accepted,
   * and `#` lines inside fenced code blocks are kept as content.
   */
  private extractSections(text: string): Array<{ header: string; content: string }> {
    const sections: Array<{ header: string; content: string }> = [];
    let header = 'Introduction';
    let body: string[] = [];
    // Marker that opened the fenced code block we are inside, if any.
    let openFence: string | null = null;

    const flush = (): void => {
      const content = body.join('\n').trim();
      if (content) sections.push({ header, content });
      body = [];
    };

    // Split on CRLF as well as LF: a trailing '\r' would stop the heading
    // pattern from matching, because '.' does not match line terminators.
    for (const line of text.split(/\r?\n/)) {
      const marker = line.match(/^ {0,3}(`{3,}|~{3,})/)?.[1];
      if (marker !== undefined) {
        if (openFence === null) {
          openFence = marker;
        } else if (
          marker[0] === openFence[0] &&
          marker.length >= openFence.length &&
          line.trim() === marker
        ) {
          openFence = null;
        }
        body.push(line);
        continue;
      }

      const title =
        openFence === null ? line.match(/^#{1,3}\s+(.+)$/)?.[1]?.trim() : undefined;

      if (title) {
        flush();
        header = title;
      } else {
        body.push(line);
      }
    }

    flush();
    return sections;
  }
}
119