1 file changed: +16 -4 lines changed
Columns: original file line number | diff line number | diff line change
@@ -141,11 +141,23 @@ export async function extractTextFromPDF(
141
141
const prevEndX = prevItem . transform [ 4 ] + ( prevItem . width ?? 0 ) ;
142
142
const currentStartX = item . transform [ 4 ] ;
143
143
const space = currentStartX - prevEndX ;
144
-
145
- if ( space > ( ( item . width ?? 0 ) * 0.3 ) ) {
146
- lineText += ' ' + item . str ;
144
+
145
+ // Get average character width as fallback
146
+ const avgCharWidth = ( item . width ?? 0 ) / Math . max ( 1 , item . str . length ) ;
147
+
148
+ // Multiple conditions for space detection
149
+ const needsSpace =
150
+ // Primary check: significant gap between items
151
+ space > Math . max ( avgCharWidth * 0.3 , 2 ) ||
152
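+ // Secondary check: the current item does not start with, and the previous item does not end with, a non-word character (applies regardless of the measured gap)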
153
+ ( ! / ^ \W / . test ( item . str ) && ! / \W $ / . test ( prevItem . str ) ) ||
154
+ // Tertiary check: items are far enough apart relative to their size
155
+ ( space > ( ( prevItem . width ?? 0 ) * 0.25 ) ) ;
156
+
157
+ if ( needsSpace ) {
158
+ lineText += ' ' + item . str ;
147
159
} else {
148
- lineText += item . str ;
160
+ lineText += item . str ;
149
161
}
150
162
}
151
163
prevItem = item ;
You can’t perform that action at this time.
0 commit comments