Skip to content

Commit 2922115

Browse files
committed
Add scraping & generation
0 parents  commit 2922115

File tree

14 files changed

+2056
-0
lines changed

14 files changed

+2056
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
node_modules
2+
cache
3+
static/index.html

package-lock.json

Lines changed: 1736 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"dependencies": {
3+
"ejs": "^3.1.6",
4+
"jsdom": "^19.0.0",
5+
"node-fetch": "^2.6.6",
6+
"ts-node": "^10.4.0"
7+
},
8+
"devDependencies": {
9+
"@types/ejs": "^3.1.0",
10+
"@types/jsdom": "^16.2.14",
11+
"@types/node-fetch": "^2.5.12"
12+
}
13+
}

scrape.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import fs from 'fs';
2+
import path from 'path'
3+
import fetch from 'node-fetch'
4+
import { JSDOM } from 'jsdom'
5+
import ejs from 'ejs';
6+
import { Event, Post } from './types';
7+
8+
9+
/**
 * Returns the body of `url` as text, using `cache/<slug>` as an on-disk cache.
 *
 * When the cache file already exists its contents are returned and no request
 * is made. Otherwise the URL is downloaded, the body is written to the cache
 * file, and the body is returned.
 *
 * @param url - Address to download on a cache miss.
 * @param slug - Path of the cached copy, relative to the `cache` directory.
 * @returns The cached or freshly downloaded body.
 * @throws Error when the server answers with a non-2xx status; nothing is
 *   written to the cache in that case.
 */
const cacheOrFetch = async (url: string, slug: string): Promise<string> => {
  const cachePath = path.join('cache', slug);
  if (fs.existsSync(cachePath)) {
    const cached = await fs.promises.readFile(cachePath);
    return cached.toString();
  }

  const response = await fetch(url);
  if (!response.ok) {
    // Caching an error page would make every later run parse it as if it were valid.
    throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
  }
  const body = await response.text();

  // The cache directory may not exist yet (e.g. on a fresh checkout), so create it on demand.
  await fs.promises.mkdir(path.dirname(cachePath), { recursive: true });
  await fs.promises.writeFile(cachePath, body);
  return body;
};
16+
17+
18+
/**
 * Maps one raw post entity from the scraped page data onto a `Post`.
 *
 * Only the first embedded author and the first embedded featured media entry
 * are used.
 *
 * @param raw - Raw post object; read fields are `link`, `title.rendered`,
 *   `date`, `excerpt.rendered`, `_embedded.authors` and
 *   `_embedded['wp:featuredmedia']`.
 * @returns The post with `datetime` expressed in milliseconds since the epoch.
 */
const parsePost = (raw: any): Post => {
  const embedded = raw._embedded;
  const firstAuthor = embedded.authors[0];
  const featuredMedia = embedded['wp:featuredmedia'][0];

  const author = {
    url: firstAuthor.link,
    name: firstAuthor.name,
  };
  const image = {
    url: featuredMedia.media_details.sizes.full.source_url,
    alt: featuredMedia.title.rendered,
  };
  const publishedAt = new Date(raw.date);

  return {
    url: raw.link,
    title: raw.title.rendered,
    author,
    image,
    datetime: publishedAt.valueOf(),
    excerpt: raw.excerpt.rendered,
  };
};
36+
37+
/**
 * Maps one raw event entity from the scraped page data onto an `Event`.
 *
 * @param raw - Raw event object; read fields are `link`, `title.rendered`,
 *   `date` and the optional `venue` (`city`, `country`).
 * @returns The event. `location` is the non-empty venue parts joined with
 *   ", ", or `undefined` when the event has no venue.
 */
const parseEvent = (raw: any): Event => {
  const venue = raw.venue;

  let location: string | undefined;
  if (venue) {
    // Drop whichever of city/country is missing so no stray separator is left behind.
    const parts = [venue.city, venue.country].filter(part => Boolean(part));
    location = parts.join(', ');
  }

  return {
    url: raw.link,
    title: raw.title.rendered,
    datetime: new Date(raw.date).valueOf(),
    location,
  };
};
45+
46+
/**
 * Formats a date as `<short time> <timezone> <month name> <day>, <year>`,
 * e.g. "10:30 AM PST January 5, 2022" in an en-US environment.
 *
 * All parts are rendered in the process's local time zone and default locale.
 *
 * @param date - The instant to format.
 * @returns The formatted string.
 */
const timeAndDate = (date: Date) => {
  const time = date.toLocaleTimeString('default', { timeStyle: 'short' });

  // Take the abbreviation from the formatter's own parts instead of splitting
  // the rendered string on ", ", which only works for locales that happen to
  // put the zone after a comma.
  const timezone =
    new Intl.DateTimeFormat('default', { timeZoneName: 'short' })
      .formatToParts(date)
      .find(part => part.type === 'timeZoneName')?.value ?? '';

  const month = date.toLocaleDateString(undefined, { month: 'long' });

  // A space separates the timezone from the month; without it they run together ("PSTJanuary").
  return `${time} ${timezone} ${month} ${date.getDate()}, ${date.getFullYear()}`;
};
52+
53+
/**
 * Formats a date as `<abbreviated month> <day of month>` using the default
 * locale and the local time zone.
 *
 * @param date - The date to format.
 * @returns The formatted string.
 */
const monthAndDay = (date: Date) => {
  const shortMonth = date.toLocaleString('default', { month: 'short' });
  const dayOfMonth = date.getDate();
  return `${shortMonth} ${dayOfMonth}`;
};
56+
57+
// Build pipeline: load the TechCrunch homepage (from cache when available),
// pull the page data out of its inline script, and render it through
// templates/index.ejs into static/index.html.
cacheOrFetch('https://techcrunch.com/', 'index.html')
  .then(homepage => {
    const dom = new JSDOM(homepage, { url: 'https://techcrunch.com/' });

    const assignment = dom.window.document.querySelector('script#tc-app-js-extra')?.textContent;
    if (assignment == null) {
      // Fail with a message that names the missing element rather than a bare TypeError.
      throw new Error('Could not find script#tc-app-js-extra in the fetched page');
    }

    // Keep everything after the first " = " and drop the final character
    // (presumably a `name = {...};` assignment whose trailing ";" is removed — confirm against the page).
    const json = assignment.split(' = ').slice(1).join(' = ').trim().slice(0, -1);
    const data = JSON.parse(json);

    const featured = data.feature_islands.homepage.map(parsePost);
    const posts = data.entities.posts.map(parsePost);
    const events = data.entities.events.map(parseEvent);

    return ejs
      .renderFile(path.join('templates', 'index.ejs'), {
        timeAndDate,
        monthAndDay,
        featured,
        posts,
        events,
      })
      .then(rendered => fs.promises.writeFile(path.join('static', 'index.html'), rendered));
  })
  .catch(error => {
    console.error(error);
    // Report the failure to the caller (shell, CI) instead of exiting with status 0.
    process.exitCode = 1;
  });

static/assets/facebook.svg

Lines changed: 4 additions & 0 deletions
Loading

static/assets/instagram.svg

Lines changed: 4 additions & 0 deletions
Loading

static/assets/linkedin.svg

Lines changed: 4 additions & 0 deletions
Loading

static/assets/logo.svg

Lines changed: 15 additions & 0 deletions
Loading

static/assets/magnifying-glass.svg

Lines changed: 7 additions & 0 deletions
Loading

static/assets/twitter.svg

Lines changed: 4 additions & 0 deletions
Loading

static/assets/youtube.svg

Lines changed: 4 additions & 0 deletions
Loading

templates/index.ejs

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
4+
<head>
5+
<meta charset="UTF-8">
6+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
7+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
8+
<title>TechCrunch - Startup and Technology news</title>
9+
</head>
10+
11+
<body>
12+
<header>
  <%# Primary site navigation: logo, login, search, section links and the "More" menu. %>
  <nav>
    <ul>
      <li><a href="https://techcrunch.com/"><object data="assets/logo.svg" title="Tech Crunch logo" type="image/svg+xml"></object></a></li>
      <li><a href="https://oidc.techcrunch.com/login/?prompt=login&amp;dest=https%3A%2F%2Ftechcrunch.com%2F">Login</a></li>
      <li>
        <details>
          <summary>Search</summary>

          <form action="https://search.techcrunch.com/search">
            <input type="search" placeholder="Search" name="p">
            <input type="image" src="assets/magnifying-glass.svg" alt="magnifying glass" />
          </form>
        </details>
      </li>
      <li><a href="https://techcrunch.com/startups/">Startups</a></li>
      <li><a href="https://techcrunch.com/techcrunchplus/">TechCrunch+</a></li>
      <li><a href="https://techcrunch.com/pages/podcasts/">Audio</a></li>
      <li><a href="https://techcrunch.com/newsletters/">Newsletters</a></li>
      <li><a href="https://techcrunch.com/video/">Videos</a></li>
      <li><a href="https://techcrunch.com/pages/advertisement-events-calendar">Advertise</a></li>
      <li><a href="https://techcrunch.com/events/">Events</a></li>
      <li>
        <details>
          <summary>More</summary>

          <%# The logo link is a plain anchor: an <li> is only valid inside a list, not directly inside <details>. %>
          <a href="https://techcrunch.com/"><img src="assets/logo.svg" alt="Tech Crunch logo" /></a>
          <span>More TechCrunch</span>
          <ul>
            <li><a href="https://techcrunch.com/startup-battlefield/">Startup Battlefield</a></li>
            <li><a href="https://techcrunch.com/sponsored/">Sponsored Content</a></li>
            <%# NOTE(review): "Include" points at the Startups page; confirm the intended target. %>
            <li><a href="https://techcrunch.com/startups/">Include</a></li>
            <li><a href="https://crunchbase.com/">Crunchbase</a></li>
            <li><a href="https://www.crunchboard.com/">Crunchboard</a></li>
            <li><a href="https://techcrunch.com/pages/contact-us/">Contact Us</a></li>
          </ul>
        </details>
      </li>
    </ul>
  </nav>
</header>
53+
<main>
  <%# Featured stories: the first entry is the lead story, the rest are listed as minis. %>
  <section>
    <% if (featured.length > 0) { %>
    <section>
      <h1><a href="<%= featured[0].url %>"><%- featured[0].title %></a></h1>
      <cite><a href="<%= featured[0].author.url %>"><%- featured[0].author.name %></a></cite>
      <img src="<%= featured[0].image.url %>" alt="<%= featured[0].image.alt %>" />
    </section>
    <% } %>
    <section>
      <ul>
        <% featured.slice(1).forEach(mini => { %>
        <li>
          <h3><a href="<%= mini.url %>"><%- mini.title %></a></h3>
          <cite><a href="<%= mini.author.url %>"><%- mini.author.name %></a></cite>
        </li>
        <% }); %>
      </ul>
    </section>
  </section>
  <%# Latest posts, interleaved with the newsletter sign-up (after 9 posts) and the events list (after 11). %>
  <section>
    <h2>The Latest</h2>
    <ul>
      <% posts.slice(0, 9).forEach(post => { %>
      <%- include('post', { post }); %>
      <% }); %>
    </ul>
    <section>
      <h2>Sign up for Newsletters</h2>
      <a href="https://link.techcrunch.com/join/134/signup-all-newsletters">See all newsletters</a>
      <form>
        <label><input type="checkbox" value="Readership" />Daily</label>
        <label><input type="checkbox" value="Readership-wkend" />Week in Review</label>
        <label><input type="checkbox" value="Readership-startups" />Startups Weekly</label>
        <label><input type="checkbox" value="Readership-event-updates" />Event Updates</label>
        <label><input type="checkbox" value="Readership-sponsorship" />Advertising Updates</label>
        <label><input type="checkbox" value="Extra Crunch Announcements" />TechCrunch+ Announcements</label>
        <label><input type="checkbox" value="Extra Crunch Events" />TechCrunch+ Events</label>
        <label><input type="checkbox" value="Extra Crunch Daily Newsletter" />TechCrunch+ Roundup</label>
        <input type="email" placeholder="Email" required />
        <button type="submit">Subscribe</button>
      </form>
    </section>
    <ul>
      <% posts.slice(9, 11).forEach(post => { %>
      <%- include('post', { post }); %>
      <% }); %>
    </ul>
    <section>
      <h2>Where we'll be next</h2>
      <ul>
        <% events.forEach(event => { %>
        <li>
          <h3><a href="<%= event.url %>"><%- event.title %></a></h3>
          <time datetime="<%= new Date(event.datetime).toISOString() %>"><%- monthAndDay(new Date(event.datetime)) %> </time>
          <% if (event.location) { %>
          <address><%- event.location %></address>
          <% } %>
        </li>
        <% }); %>
      </ul>
    </section>
    <ul>
      <% posts.slice(11).forEach(post => { %>
      <%- include('post', { post }); %>
      <% }); %>
    </ul>
  </section>
</main>
120+
<footer>
  <%# Footer navigation: about, legal and international links, followed by social profiles. %>
  <nav>
    <h3>About</h3>
    <ul>
      <li><a href="https://techcrunch.com/">TechCrunch</a></li>
      <li><a href="https://techcrunch.com/pages/about-techcrunch">Staff</a></li>
      <li><a href="https://techcrunch.com/pages/contact-us/">Contact Us</a></li>
      <li><a href="https://techcrunch.com/pages/advertisement-events-calendar/">Advertise</a></li>
    </ul>

    <h3>Legal</h3>
    <ul>
      <li><a href="https://legal.yahoo.com/us/en/yahoo/privacy/index.html">Privacy Policy</a></li>
      <li><a href="https://legal.yahoo.com/us/en/yahoo/terms/otos/index.html">Terms of Service</a></li>
      <li><a href="https://techcrunch.com/pages/extra-crunch-terms-of-service/">TechCrunch+ Terms</a></li>
      <li><a href="https://guce.techcrunch.com/privacy-dashboard?locale=en-US">Privacy Dashboard</a></li>
      <li><a href="https://techcrunch.com/pages/code-of-conduct/">Code of Conduct</a></li>
      <li><a href="https://legal.yahoo.com/us/en/yahoo/privacy/adinfo/index.html">About Our Ads</a></li>
      <li><a href="https://techcrunch.com/">International</a></li>
    </ul>

    <h3>International</h3>
    <ul>
      <li><a href="http://jp.techcrunch.com/">Japan</a></li>
    </ul>

    <%# Social profiles are linked over HTTPS so visitors are not sent through an insecure hop. %>
    <ul aria-label="Social Medias">
      <li><a href="https://www.facebook.com/techcrunch"><img src="assets/facebook.svg" alt="Facebook logo" />Facebook</a></li>
      <li><a href="https://twitter.com/techcrunch"><img src="assets/twitter.svg" alt="Twitter logo" />Twitter</a></li>
      <li><a href="https://www.youtube.com/user/techcrunch"><img src="assets/youtube.svg" alt="YouTube logo" />YouTube</a></li>
      <li><a href="https://instagram.com/techcrunch"><img src="assets/instagram.svg" alt="Instagram logo" />Instagram</a></li>
      <li><a href="https://www.linkedin.com/company/techcrunch"><img src="assets/linkedin.svg" alt="LinkedIn logo" />LinkedIn</a></li>
    </ul>
  </nav>

  <span>© 2022 Yahoo. All rights reserved. Powered by <a href="https://wpvip.com/">WordPress VIP</a>.</span>
</footer>
157+
</body>
158+
159+
</html>

templates/post.ejs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<%# One post as a list item. Uses the `post` local and the `timeAndDate` helper; optional post fields are rendered only when present. %>
<li>
  <h3><a href="<%= post.url %>"><%- post.title %></a></h3>
  <cite><a href="<%= post.author.url %>"><%- post.author.name %></a></cite>
  <% if (post.datetime !== undefined) { %>
  <time datetime="<%= new Date(post.datetime).toISOString() %>"><%- timeAndDate(new Date(post.datetime)) %> </time>
  <% } %>
  <% if (post.excerpt) { %>
  <p><a href="<%= post.url %>"><%- post.excerpt %></a></p>
  <% } %>
  <% if (post.image) { %>
  <a href="<%= post.url %>"><img src="<%= post.image.url %>" alt="<%= post.image.alt %>" /></a>
  <% } %>
</li>

types.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/** Byline information for the person credited on a post. */
export interface Author {
  /** Link to the author's page. */
  url: string;
  /** Author's display name. */
  name: string;
}
5+
6+
/** An image reference used as a post illustration. */
export interface Image {
  /** Address of the image file. */
  url: string;
  /** Alternative text for the image, when available. */
  alt?: string;
}
10+
11+
/** An article as consumed by the page templates. */
export interface Post {
  /** Link to the article. */
  url: string;
  /** Headline; the templates emit it unescaped. */
  title: string;
  /** Credited author. */
  author: Author;
  /** Illustration for the article, when it has one. */
  image?: Image;
  /** Publication time as milliseconds since the Unix epoch. */
  datetime?: number;
  /** Short summary; the templates emit it unescaped. */
  excerpt?: string;
}
19+
20+
/** An upcoming event as consumed by the page templates. */
export interface Event {
  /** Link to the event page. */
  url: string;
  /** Event name; the templates emit it unescaped. */
  title: string;
  /** Event time as milliseconds since the Unix epoch. */
  datetime: number;
  /** Human-readable place (e.g. "City, Country"), when a venue is known. */
  location?: string;
}

0 commit comments

Comments
 (0)