Skip to content

Commit 943d563

Browse files
committed
Add unpdf
Fixes #4
1 parent 996492b commit 943d563

7 files changed

+1055
-36
lines changed

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@ This project compares different PDF parsing libraries for text extraction accura
55
## Libraries included:
66
1. PyPDF (Python)
77
2. PyMuPDF (Python)
8-
3. PDF.js (TypeScript/Node.js)
8+
3. PDF.js (TypeScript/Node.js using pdfjs-dist)
99
4. pdfplumber (Python)
1010
5. pdfreader (TypeScript/Node.js)
1111
6. PyPDF2 (Python)
1212
7. pdfminer.six (Python)
1313
8. pdf-parse-new (TypeScript/Node.js)
14+
9. unpdf (TypeScript/Node.js)
1415

1516
## Usage:
1617
For Python parsers, run:

comprehensive-pdf-parser-setup.sh

+96-35
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ source .venv/bin/activate
88
pip install pypdf pymupdf pdfplumber PyPDF2 pdfminer.six
99

1010
# Create directories for each parser
11-
mkdir -p pypdf pymupdf pdfjs pdfplumber pdfreader pypdf2 pdfminer pdf-parse-new
11+
mkdir -p pypdf pymupdf pdfjs pdfplumber pdfreader pypdf2 pdfminer pdf-parse-new unpdf
1212

1313
# Create Python files for each parser
1414
cat > pypdf/pypdf_parser.py << EOL
@@ -161,10 +161,10 @@ PyPDF2
161161
pdfminer.six
162162
EOL
163163

164-
# Set up PDF.js (TypeScript/Node.js implementation)
164+
# Set up pdfjs (TypeScript/Node.js implementation using pdfjs-dist)
165165
cd pdfjs
166166
npm init -y
167-
npm install pdf-parse @types/node typescript ts-node @types/pdf-parse
167+
npm install pdfjs-dist @types/node typescript ts-node
168168
npm pkg set type="module" scripts.build="tsc" scripts.start="node --experimental-specifier-resolution=node --loader ts-node/esm src/pdfjs_parser.ts"
169169

170170
cat > tsconfig.json << EOL
@@ -186,44 +186,44 @@ EOL
186186

187187
mkdir -p src
188188
cat > src/pdfjs_parser.ts << EOL
189-
import fs from "fs";
190-
191-
async function importPdfParse() {
192-
if (!module.parent) {
193-
// Temporarily set module.parent to mimic being required by another module
194-
module.parent = module;
195-
}
196-
const pdfParse = await import("pdf-parse");
197-
module.parent = undefined; // Reset module.parent after import if needed
198-
return pdfParse;
199-
}
189+
import fs from 'fs';
190+
import * as pdfjsLib from 'pdfjs-dist';
200191
201192
async function extractText(pdfPath: string): Promise<string> {
202-
const dataBuffer = fs.readFileSync(pdfPath);
203-
const pdfParse = await importPdfParse();
204-
const data = await pdfParse.default(dataBuffer);
205-
return data.text.trim();
193+
const data = new Uint8Array(fs.readFileSync(pdfPath));
194+
const loadingTask = pdfjsLib.getDocument({ data });
195+
const doc = await loadingTask.promise;
196+
197+
let fullText = '';
198+
for (let i = 1; i <= doc.numPages; i++) {
199+
const page = await doc.getPage(i);
200+
const content = await page.getTextContent();
201+
const strings = content.items.map((item: any) => item.str);
202+
fullText += strings.join(' ') + '\n';
203+
}
204+
205+
return fullText.trim();
206206
}
207207
208208
async function main() {
209-
if (process.argv.length < 3) {
210-
console.log('Usage: npm run start -- <pdf_file> [-j]');
211-
process.exit(1);
212-
}
209+
if (process.argv.length < 3) {
210+
console.log('Usage: npm run start -- <pdf_file> [-j]');
211+
process.exit(1);
212+
}
213213
214-
const pdfPath = process.argv[2];
215-
const jsonOutput = process.argv.includes('-j');
214+
const pdfPath = process.argv[2];
215+
const jsonOutput = process.argv.includes('-j');
216216
217-
try {
218-
const text = await extractText(pdfPath);
219-
if (jsonOutput) {
220-
console.log(JSON.stringify({ text }));
221-
} else {
222-
console.log(text);
223-
}
224-
} catch (error) {
225-
console.error('Error:', error);
217+
try {
218+
const text = await extractText(pdfPath);
219+
if (jsonOutput) {
220+
console.log(JSON.stringify({ text }));
221+
} else {
222+
console.log(text);
226223
}
224+
} catch (error) {
225+
console.error('Error:', error);
226+
}
227227
}
228228
229229
main();
@@ -374,6 +374,66 @@ EOL
374374

375375
cd ..
376376

377+
# Set up unpdf (TypeScript/Node.js implementation)
378+
cd unpdf
379+
npm init -y
380+
npm install unpdf @types/node typescript ts-node
381+
npm pkg set type="module" scripts.build="tsc" scripts.start="node --experimental-specifier-resolution=node --loader ts-node/esm src/unpdf_parser.ts"
382+
383+
cat > tsconfig.json << EOL
384+
{
385+
"compilerOptions": {
386+
"target": "ES2022",
387+
"module": "ESNext",
388+
"moduleResolution": "node",
389+
"esModuleInterop": true,
390+
"strict": true,
391+
"outDir": "./dist"
392+
},
393+
"include": ["src/**/*"],
394+
"ts-node": {
395+
"esm": true
396+
}
397+
}
398+
EOL
399+
400+
mkdir -p src
401+
cat > src/unpdf_parser.ts << EOL
402+
import fs from "fs";
403+
import { extractText } from "unpdf";
404+
405+
/**
 * Extract the text of every page of a PDF file using unpdf.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns The text of all pages as one string, with leading and trailing
 *   whitespace removed.
 * @throws If the file cannot be read or unpdf fails to parse it.
 */
async function extractPdfText(pdfPath: string): Promise<string> {
  // Hand the library a plain Uint8Array rather than a Node Buffer, the same
  // way the pdfjs-dist parser in this project does.
  const data = new Uint8Array(fs.readFileSync(pdfPath));

  // extractText resolves to an object ({ totalPages, text }), not a bare
  // string, so the text has to be unwrapped before it can be trimmed.
  const { text } = await extractText(data, { mergePages: true });

  // Depending on the unpdf version, text is either one merged string or an
  // array holding one string per page; normalise both shapes to a string.
  const merged = Array.isArray(text) ? text.join('\n') : text;
  return merged.trim();
}
410+
411+
/**
 * Command-line entry point.
 *
 * Reads the PDF path from the first CLI argument, extracts its text and
 * prints it to stdout, either raw or wrapped as JSON ({ "text": ... }) when
 * the -j flag is present anywhere in the arguments. Prints a usage message
 * and exits with status 1 when no PDF path is given. Extraction errors are
 * logged to stderr and not rethrown.
 */
async function main() {
  const cliArgs = process.argv;

  // argv[0] is the node binary and argv[1] the script, so the PDF path is argv[2].
  if (cliArgs.length < 3) {
    console.log('Usage: npm run start -- <pdf_file> [-j]');
    process.exit(1);
  }

  const pdfPath = cliArgs[2];
  const jsonOutput = cliArgs.includes('-j');

  try {
    const text = await extractPdfText(pdfPath);
    console.log(jsonOutput ? JSON.stringify({ text }) : text);
  } catch (error) {
    console.error('Error:', error);
  }
}
431+
432+
main();
433+
EOL
434+
435+
cd ..
436+
377437
# Create a README file
378438
cat > README.md << EOL
379439
# PDF Parser Comparison
@@ -383,12 +443,13 @@ This project compares different PDF parsing libraries for text extraction accura
383443
## Libraries included:
384444
1. PyPDF (Python)
385445
2. PyMuPDF (Python)
386-
3. PDF.js (TypeScript/Node.js)
446+
3. PDF.js (TypeScript/Node.js using pdfjs-dist)
387447
4. pdfplumber (Python)
388448
5. pdfreader (TypeScript/Node.js)
389449
6. PyPDF2 (Python)
390450
7. pdfminer.six (Python)
391451
8. pdf-parse-new (TypeScript/Node.js)
452+
9. unpdf (TypeScript/Node.js)
392453
393454
## Usage:
394455
For Python parsers, run:
@@ -414,4 +475,4 @@ cd pdfjs && npm run start -- ../sample.pdf -j
414475
All parsers handle multipage PDFs and concatenate the text from all pages into a single output.
415476
EOL
416477

417-
echo "Setup complete. See README.md for usage instructions."
478+
echo "Setup complete. See README.md for usage instructions."

run-all-pdf-parsers.sh

+4
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,8 @@ cd "$SCRIPT_DIR/pdf-parse-new"
4949
run_parser "pdf-parse-new" "npm run start --"
5050
cd "$SCRIPT_DIR"
5151

52+
cd "$SCRIPT_DIR/unpdf"
53+
run_parser "unpdf" "npm run start --"
54+
cd "$SCRIPT_DIR"
55+
5256
echo "All parsing complete. Results are in the '$RESULTS_DIR' directory."

0 commit comments

Comments
 (0)