Skip to content

Commit 84c05e3

Browse files
Add URL2PDF, Substack; remove Amazon (they are blocking this now)
1 parent 6dbf8f0 commit 84c05e3

File tree

15 files changed

+1431
-73
lines changed

15 files changed

+1431
-73
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.DS_Store

README.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ This is a collection of scripts I've written that may or may not be useful to my
44

55
# List of scripts
66
* [new-machine-setup](new-machine-setup) is a guide for new machine setup and a `~/.zshrc` template
7-
* [amazon_seller_feedback](amazon_seller_feedback) is a script to get bad reviews of Amazon sellers
87
* [macos-window-resizer](macos-window-resizer) is a script to resize windows for OBS capture on macOS
9-
* [github-unsubscribe-script](github-unsubscribe-script) is a script that unsubscribes you from all notifications
8+
* [github-unsubscribe-script](github-unsubscribe-script) is a script that unsubscribes you from all GitHub notifications
9+
* [download-urls-as-pdf](download-urls-as-pdf) is a script that downloads a set of webpages as PDFs
10+
* [get-all-substack-urls-for-user](get-all-substack-urls-for-user) is a script that downloads information on all of a user's articles on Substack

amazon_seller_feedback/.python-version

-1
This file was deleted.

amazon_seller_feedback/README.md

-29
This file was deleted.

amazon_seller_feedback/amazon_seller_feedback.py

-40
This file was deleted.

amazon_seller_feedback/requirements.txt

-1
This file was deleted.

download-urls-as-pdf/.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
node_modules
2+
package-lock.json
3+
example.pdf

download-urls-as-pdf/README.md

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Download URLs as PDF
2+
3+
Note: This script scrolls down each page before downloading a PDF to ensure all assets are loaded (e.g. lazily loaded images on a page). This significantly increases page save times.
4+
5+
## Setup
6+
7+
### Node.js
8+
1. Install nvm: https://github.com/nvm-sh/nvm#installing-and-updating
9+
1. Install Node.js: `nvm install --lts`
10+
1. Install dependencies: `cd` to this directory and run `npm install`
11+
12+
### Input JSON
13+
1. Create a JSON file that has the following form:
14+
```json
15+
[
16+
{
17+
"url": "https://www.example.com",
18+
"title": "file-name-of-pdf",
19+
"folder": "name-of-folder"
20+
},
21+
...
22+
]
23+
```
24+
1. Create all the folders specified in the JSON file
25+
1. Add the filepath to the value of the `JSON_FILE_PATH` variable in `index.js`
26+
27+
## Usage
28+
1. `node index.js`
29+
1. [Optional] Log in to any websites if the content is behind a login
30+
1. Press any key to continue
31+
1. Output will be in the folder you specified in the JSON with the filename `${title}.pdf`. You must create the folders before running this script.
32+
33+
34+
## Testing
35+
1. Set the value of the `JSON_FILE_PATH` variable in `index.js` to `test-input.json`
36+
1. Run the script: `node index.js`
37+
1. Observe a file called `example.pdf` that has the contents of the example.com webpage in it.

download-urls-as-pdf/index.js

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import puppeteer from 'puppeteer';
2+
import * as fs from 'fs';
3+
import * as path from 'path';
4+
5+
// Expected JSON file form (keys must be double-quoted for JSON.parse, and
// each URL needs a scheme such as https:// so the browser can navigate to it):
// [
//   {
//     "url": "https://www.example.com",
//     "title": "file-name-of-pdf",
//     "folder": "name-of-folder"
//   },
//   ...
// ]

// Path to the JSON file that lists the pages to download.
const JSON_FILE_PATH = "test-input.json";

// Browser window / viewport size in pixels. Kept as strings because they are
// interpolated into the `--window-size` launch flag and the PDF options; the
// viewport setup parses them to integers.
const PAGE_WIDTH = "1200";
const PAGE_HEIGHT = "800";
18+
19+
/**
 * Runs the whole download job.
 *
 * Launches a visible (non-headless) browser, waits for a key press so the
 * user can first log in to any sites, then reads the page list from
 * `JSON_FILE_PATH` and, for each entry, navigates to `url`, scrolls through
 * the page and saves it as `<folder>/<title>.pdf`.
 *
 * Exits the process with code 1 when `JSON_FILE_PATH` is empty, and with
 * code 0 once every page has been saved. Any error thrown while processing
 * propagates to the caller after the browser has been closed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (JSON_FILE_PATH === "") {
    console.log("Add the filepath to the value of the `JSON_FILE_PATH` variable in `index.js`");
    process.exit(1);
  }

  console.log("Launching browser...");
  const browser = await puppeteer.launch({
    headless: false,
    args: [`--window-size=${PAGE_WIDTH},${PAGE_HEIGHT}`],
    defaultViewport: {
      width: Number.parseInt(PAGE_WIDTH, 10),
      height: Number.parseInt(PAGE_HEIGHT, 10),
    },
  });

  try {
    // Reuse the first tab of the freshly opened browser.
    const [page] = await browser.pages();

    // In case what you need to download is behind a login form
    console.log("Login in the browser and then press any key to continue...");
    await keypress();
    console.log("Key pressed.");

    const webpages = JSON.parse(fs.readFileSync(JSON_FILE_PATH, "utf8"));
    for (const webpage of webpages) {
      console.log(`Downloading ${webpage.url}`);

      // Navigate to the page and wait until network traffic has settled.
      // (`networkIdleTimeout` is not a navigation option in Puppeteer 19, so
      // it is no longer passed.)
      await page.goto(webpage.url, { waitUntil: 'networkidle2' });

      // Scroll through the page to ensure lazily loaded content is present.
      await autoScroll(page);

      // Create the destination folder if needed; an empty folder means the
      // current working directory.
      if (webpage.folder) {
        fs.mkdirSync(webpage.folder, { recursive: true });
      }

      // NOTE(review): Puppeteer documents that `format` takes priority over
      // `width`/`height` — confirm which page size is actually intended.
      const pdfConfig = {
        path: path.join(webpage.folder, `${webpage.title}.pdf`), // Saves file to this location
        format: 'A4',
        width: `${PAGE_WIDTH}px`,
        height: `${PAGE_HEIGHT}px`,
      };
      await page.pdf(pdfConfig);
    }
  } finally {
    // Always shut the browser down, even when a page fails to load or save.
    await browser.close();
  }

  console.log('Done.');
  process.exit(0);
}
65+
66+
67+
/**
 * Resolves once the next chunk of data arrives on stdin.
 *
 * When stdin is a TTY it is switched to raw mode while waiting, so a single
 * key press (without Enter) is enough, and switched back afterwards. When
 * stdin is not a TTY (piped or redirected input) raw mode is unavailable and
 * is skipped rather than throwing.
 *
 * @returns {Promise<void>}
 */
const keypress = async () => {
  const { stdin } = process;
  // `setRawMode` only exists on TTY streams.
  const canUseRawMode = Boolean(stdin.isTTY) && typeof stdin.setRawMode === 'function';

  if (canUseRawMode) {
    stdin.setRawMode(true);
  }

  return new Promise((resolve) => {
    stdin.once('data', () => {
      if (canUseRawMode) {
        stdin.setRawMode(false);
      }
      // Attaching a 'data' listener put stdin into flowing mode; pause it so
      // it does not keep consuming input after the key press.
      stdin.pause();
      resolve();
    });
  });
};
74+
75+
/**
 * Scrolls the given page from top to bottom in small steps so that lazily
 * loaded content gets a chance to load before the page is captured.
 *
 * The scrolling runs inside the page via `page.evaluate`, so the callback
 * must stay self-contained: it only uses the values passed to it as arguments.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)` (a Puppeteer page).
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per step.
 * @param {number} [options.intervalMs=100] - Delay between steps in milliseconds.
 * @returns {Promise<void>} Resolves once the bottom of the page has been reached.
 */
async function autoScroll(page, { distance = 100, intervalMs = 100 } = {}) {
  await page.evaluate(
    async (step, delay) => {
      await new Promise((resolve) => {
        let scrolled = 0;
        const timer = setInterval(() => {
          // Re-read the height on every tick: it can grow as content loads.
          const { scrollHeight } = document.body;
          window.scrollBy(0, step);
          scrolled += step;

          if (scrolled >= scrollHeight - window.innerHeight) {
            clearInterval(timer);
            resolve();
          }
        }, delay);
      });
    },
    distance,
    intervalMs,
  );
}
93+
94+
// Start the script. Report any failure and exit with a non-zero status
// instead of leaving the returned promise's rejection unhandled.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});

download-urls-as-pdf/package.json

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"name": "download-urls-as-pdf",
3+
"version": "1.0.0",
4+
"description": "",
5+
"main": "index.js",
6+
"type": "module",
7+
"scripts": {
8+
"test": "echo \"Error: no test specified\" && exit 1"
9+
},
10+
"author": "",
11+
"license": "ISC",
12+
"dependencies": {
13+
"puppeteer": "^19.6.2"
14+
}
15+
}

download-urls-as-pdf/test-input.json

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[
2+
{
3+
"url": "https://www.example.com",
4+
"title": "example",
5+
"folder": ""
6+
}
7+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
results.json
+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Get all Substack URLs for a user
2+
3+
Note: this was originally written in early 2023 and depends on an internal Substack API, which may change at any time.
4+
5+
## Setup
6+
1. Install Go: `brew install go`
7+
8+
## Usage
9+
1. Enter the base URL of the Substack newsletter as the value of the `BASE_URL` variable in `main.go`
10+
1. Run `go run main.go`

0 commit comments

Comments
 (0)