Skip to content

Commit 1bbd77f

Browse files
committed
Improve space detection logic in PDF text extraction
1 parent 4ed6b17 commit 1bbd77f

File tree

1 file changed

+16
-4
lines changed

1 file changed

+16
-4
lines changed

Diff for: src/utils/pdf.ts

+16-4
Original file line number | Diff line number | Diff line change
@@ -141,11 +141,23 @@ export async function extractTextFromPDF(
141141
const prevEndX = prevItem.transform[4] + (prevItem.width ?? 0);
142142
const currentStartX = item.transform[4];
143143
const space = currentStartX - prevEndX;
144-
145-
if (space > ((item.width ?? 0) * 0.3)) {
146-
lineText += ' ' + item.str;
144+
145+
// Get average character width as fallback
146+
const avgCharWidth = (item.width ?? 0) / Math.max(1, item.str.length);
147+
148+
// Multiple conditions for space detection
149+
const needsSpace =
150+
// Primary check: significant gap between items
151+
space > Math.max(avgCharWidth * 0.3, 2) ||
152+
// Secondary check: natural word boundary
153+
(!/^\W/.test(item.str) && !/\W$/.test(prevItem.str)) ||
154+
// Tertiary check: items are far enough apart relative to their size
155+
(space > ((prevItem.width ?? 0) * 0.25));
156+
157+
if (needsSpace) {
158+
lineText += ' ' + item.str;
147159
} else {
148-
lineText += item.str;
160+
lineText += item.str;
149161
}
150162
}
151163
prevItem = item;

0 commit comments

Comments
 (0)