@@ -8,7 +8,7 @@ source .venv/bin/activate
8
8
pip install pypdf pymupdf pdfplumber PyPDF2 pdfminer.six
9
9
10
10
# Create directories for each parser
11
- mkdir -p pypdf pymupdf pdfjs pdfplumber pdfreader pypdf2 pdfminer pdf-parse-new
11
+ mkdir -p pypdf pymupdf pdfjs pdfplumber pdfreader pypdf2 pdfminer pdf-parse-new unpdf
12
12
13
13
# Create Python files for each parser
14
14
cat > pypdf/pypdf_parser.py << EOL
@@ -161,10 +161,10 @@ PyPDF2
161
161
pdfminer.six
162
162
EOL
163
163
164
- # Set up PDF.js (TypeScript/Node.js implementation)
164
+ # Set up pdfjs (TypeScript/Node.js implementation using pdfjs-dist)
165
165
# Initialize the pdfjs Node.js project and install its dependencies.
cd pdfjs
npm init -y
npm install pdfjs-dist @types/node typescript ts-node
# The values must not carry leading whitespace: a "type" of " module" is not
# the "module" value Node.js recognises, so the package would not be treated
# as an ES module.
# NOTE(review): --experimental-specifier-resolution appears to have been
# removed in newer Node.js releases; confirm against the targeted version.
npm pkg set type="module" scripts.build="tsc" scripts.start="node --experimental-specifier-resolution=node --loader ts-node/esm src/pdfjs_parser.ts"
169
169
170
170
cat > tsconfig.json << EOL
@@ -186,44 +186,44 @@ EOL
186
186
187
187
mkdir -p src
188
188
cat > src/pdfjs_parser.ts << EOL
189
- import fs from "fs";
190
-
191
- async function importPdfParse() {
192
- if (!module.parent) {
193
- // Temporarily set module.parent to mimic being required by another module
194
- module.parent = module;
195
- }
196
- const pdfParse = await import("pdf-parse");
197
- module.parent = undefined; // Reset module.parent after import if needed
198
- return pdfParse;
199
- }
189
+ import fs from 'fs';
190
+ import * as pdfjsLib from 'pdfjs-dist';
200
191
201
192
/**
 * Extracts the text of every page of a PDF using pdfjs-dist.
 *
 * The text items of each page are joined with single spaces, the pages are
 * joined with newlines, and the combined result is trimmed.
 *
 * @param pdfPath - Path of the PDF file to read from disk.
 * @returns The concatenated text of all pages.
 */
async function extractText(pdfPath: string): Promise<string> {
  // The file contents are handed to getDocument as a Uint8Array.
  const data = new Uint8Array(fs.readFileSync(pdfPath));
  const doc = await pdfjsLib.getDocument({ data }).promise;

  const pages: string[] = [];
  // pdf.js page numbers are 1-based.
  for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
    const page = await doc.getPage(pageNumber);
    const content = await page.getTextContent();

    // Only items that actually carry a str field contribute text; this
    // narrows the item union without resorting to an any-typed callback.
    const strings: string[] = [];
    for (const item of content.items) {
      if ('str' in item) {
        strings.push(item.str);
      }
    }
    pages.push(strings.join(' '));
  }

  return pages.join('\n').trim();
}
207
207
208
208
/**
 * Command-line entry point for the pdfjs parser.
 *
 * Expects the PDF path as the first CLI argument. When -j is present among
 * the arguments, the text is printed as a JSON object with a single text
 * field; otherwise the raw text is printed.
 *
 * Exits with status 1 when no PDF path is given, and sets a non-zero exit
 * code when extraction fails so that calling scripts can detect the failure.
 */
async function main(): Promise<void> {
  if (process.argv.length < 3) {
    console.log('Usage: npm run start -- <pdf_file> [-j]');
    process.exit(1);
  }

  const pdfPath = process.argv[2];
  const jsonOutput = process.argv.includes('-j');

  try {
    const text = await extractText(pdfPath);
    console.log(jsonOutput ? JSON.stringify({ text }) : text);
  } catch (error: unknown) {
    console.error('Error:', error);
    // Previously a failed extraction still exited with status 0; report the
    // failure through the exit code without cutting off pending output.
    process.exitCode = 1;
  }
}
228
228
229
229
main();
374
374
375
375
cd ..
376
376
377
+ # Set up unpdf (TypeScript/Node.js implementation)
378
# Initialize the unpdf Node.js project and install its dependencies.
cd unpdf
npm init -y
npm install unpdf @types/node typescript ts-node
# The values must not carry leading whitespace: a "type" of " module" is not
# the "module" value Node.js recognises, so the package would not be treated
# as an ES module.
# NOTE(review): --experimental-specifier-resolution appears to have been
# removed in newer Node.js releases; confirm against the targeted version.
npm pkg set type="module" scripts.build="tsc" scripts.start="node --experimental-specifier-resolution=node --loader ts-node/esm src/unpdf_parser.ts"
382
+
383
+ cat > tsconfig.json << EOL
384
+ {
385
+ "compilerOptions": {
386
+ "target": "ES2022",
387
+ "module": "ESNext",
388
+ "moduleResolution": "node",
389
+ "esModuleInterop": true,
390
+ "strict": true,
391
+ "outDir": "./dist"
392
+ },
393
+ "include": ["src/**/*"],
394
+ "ts-node": {
395
+ "esm": true
396
+ }
397
+ }
398
+ EOL
399
+
400
+ mkdir -p src
401
+ cat > src/unpdf_parser.ts << EOL
402
+ import fs from "fs";
403
+ import { extractText } from "unpdf";
404
+
405
/**
 * Extracts the text of a PDF using unpdf.
 *
 * @param pdfPath - Path of the PDF file to read from disk.
 * @returns The text of all pages merged into one trimmed string.
 */
async function extractPdfText(pdfPath: string): Promise<string> {
  // Pass the file contents as a Uint8Array, as the unpdf documentation does,
  // rather than as a Node.js Buffer.
  const data = new Uint8Array(fs.readFileSync(pdfPath));

  // extractText resolves to an object ({ totalPages, text }), not a string.
  // With mergePages enabled, text is a single string covering all pages
  // instead of one entry per page.
  const { text } = await extractText(data, { mergePages: true });
  return text.trim();
}
410
+
411
/**
 * Command-line entry point for the unpdf parser.
 *
 * Expects the PDF path as the first CLI argument. When -j is present among
 * the arguments, the text is printed as a JSON object with a single text
 * field; otherwise the raw text is printed.
 *
 * Exits with status 1 when no PDF path is given, and sets a non-zero exit
 * code when extraction fails so that calling scripts can detect the failure.
 */
async function main(): Promise<void> {
  if (process.argv.length < 3) {
    console.log('Usage: npm run start -- <pdf_file> [-j]');
    process.exit(1);
  }

  const pdfPath = process.argv[2];
  const jsonOutput = process.argv.includes('-j');

  try {
    const text = await extractPdfText(pdfPath);
    console.log(jsonOutput ? JSON.stringify({ text }) : text);
  } catch (error: unknown) {
    console.error('Error:', error);
    // Previously a failed extraction still exited with status 0; report the
    // failure through the exit code without cutting off pending output.
    process.exitCode = 1;
  }
}
431
+
432
+ main();
433
+ EOL
434
+
435
+ cd ..
436
+
377
437
# Create a README file
378
438
cat > README.md << EOL
379
439
# PDF Parser Comparison
@@ -383,12 +443,13 @@ This project compares different PDF parsing libraries for text extraction accura
383
443
## Libraries included:
384
444
1. PyPDF (Python)
385
445
2. PyMuPDF (Python)
386
- 3. PDF.js (TypeScript/Node.js)
446
+ 3. PDF.js (TypeScript/Node.js using pdfjs-dist)
387
447
4. pdfplumber (Python)
388
448
5. pdfreader (TypeScript/Node.js)
389
449
6. PyPDF2 (Python)
390
450
7. pdfminer.six (Python)
391
451
8. pdf-parse-new (TypeScript/Node.js)
452
+ 9. unpdf (TypeScript/Node.js)
392
453
393
454
## Usage:
394
455
For Python parsers, run:
@@ -414,4 +475,4 @@ cd pdfjs && npm run start -- ../sample.pdf -j
414
475
All parsers handle multipage PDFs and concatenate the text from all pages into a single output.
415
476
EOL
416
477
417
- echo " Setup complete. See README.md for usage instructions."
478
+ echo " Setup complete. See README.md for usage instructions."
0 commit comments