Skip to content

Commit d339513

Browse files
authored
feat: add walker (#1)
1 parent b2a2959 commit d339513

File tree

6 files changed

+329
-159
lines changed

6 files changed

+329
-159
lines changed

Diff for: README.md

+27
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,30 @@ for await (const node of parseHTMLStream(reader)) {
4747
```
4848

4949
This code snippet showcases how to iterate through the DOM Nodes in a streaming fashion, offering a practical approach for processing HTML streams in real-time.
50+
51+
## Walker example
52+
53+
If you prefer to control how you move around the HTML tree of the stream yourself, you can use the following function:
54+
55+
```ts
56+
import htmlStreamWalker from "parse-html-stream/walker";
57+
58+
// ...
59+
60+
const reader = res.body.getReader();
61+
const walker = await htmlStreamWalker(reader);
62+
63+
// Root node
64+
const rootNode = walker.rootNode
65+
66+
// Gets the firstChild, taking the stream chunks into account
67+
const child = await walker.firstChild(rootNode);
68+
69+
// Gets the nextSibling, taking the stream chunks into account
70+
const brother = await walker.nextSibling(rootNode);
71+
72+
// You can do it with every HTML node:
73+
const childOfBrother = await walker.firstChild(brother);
74+
```
75+
76+
The stream is processed as you walk through the tree: whenever the walker does not find a `firstChild` or `nextSibling` and the stream has not yet finished, it requests another chunk. This way you can walk through the tree while the stream is still being received.

Diff for: package.json

+16-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "parse-html-stream",
3-
"version": "0.2.0",
3+
"version": "0.3.0",
44
"module": "./build/index.js",
55
"type": "module",
66
"main": "./build/index.js",
@@ -12,14 +12,27 @@
1212
},
1313
"files": [
1414
"build",
15-
"index.d.ts"
15+
"index.d.ts",
16+
"walker.d.ts"
1617
],
18+
"exports": {
19+
".": {
20+
"import": "./build/index.js",
21+
"require": "./build/index.js",
22+
"types": "./index.d.ts"
23+
},
24+
"./walker": {
25+
"import": "./build/walker/index.js",
26+
"require": "./build/walker/index.js",
27+
"types": "./walker.d.ts"
28+
}
29+
},
1730
"repository": {
1831
"type": "git",
1932
"url": "https://github.com/aralroca/parse-html-stream.git"
2033
},
2134
"scripts": {
22-
"build": "bun build --minify --outdir=build src/index.ts",
35+
"build": "bun build --minify --outdir=build src/index.ts src/walker/index.ts",
2336
"test": "bun test"
2437
},
2538
"devDependencies": {

Diff for: src/index.test.ts

+152-156
Original file line numberDiff line numberDiff line change
@@ -6,162 +6,158 @@ const dom = new JSDOM("<!DOCTYPE html><html><body></body></html>");
66
global.document = dom.window.document;
77
global.window = dom.window;
88

9-
describe("utils", () => {
10-
describe("rpc", () => {
11-
describe("parse-html-stream", () => {
12-
it("should handle an empty HTML stream", async () => {
13-
const stream = new ReadableStream({
14-
start(controller) {
15-
controller.close();
16-
},
17-
});
18-
19-
const reader = stream.getReader();
20-
const nodes = [];
21-
22-
const parseHTMLStream = await import(".").then((m) => m.default);
23-
24-
for await (const node of parseHTMLStream(reader)) {
25-
nodes.push(node);
26-
}
27-
28-
expect(nodes).toEqual([]);
29-
});
30-
31-
it("should transform a stream of HTML into a stream of nodes", async () => {
32-
const encoder = new TextEncoder();
33-
const stream = new ReadableStream({
34-
start(controller) {
35-
controller.enqueue(encoder.encode("<html>"));
36-
controller.enqueue(encoder.encode("<head />"));
37-
controller.enqueue(encoder.encode("<body>"));
38-
controller.enqueue(encoder.encode('<div class="foo">Bar</div>'));
39-
controller.enqueue(encoder.encode("</body>"));
40-
controller.enqueue(encoder.encode("</html>"));
41-
controller.close();
42-
},
43-
});
44-
45-
const reader = stream.getReader();
46-
const nodeNames = [];
47-
48-
const parseHTMLStream = await import(".").then((m) => m.default);
49-
50-
for await (const node of parseHTMLStream(reader)) {
51-
nodeNames.push(node?.nodeName);
52-
}
53-
54-
expect(nodeNames).toEqual(["HTML", "HEAD", "BODY", "DIV", "#text"]);
55-
});
56-
57-
it("should work with comments", async () => {
58-
const encoder = new TextEncoder();
59-
const stream = new ReadableStream({
60-
start(controller) {
61-
controller.enqueue(encoder.encode("<html>"));
62-
controller.enqueue(encoder.encode("<head />"));
63-
controller.enqueue(encoder.encode("<body>"));
64-
controller.enqueue(
65-
encoder.encode('<div class="foo"><!-- comment -->Bar</div>'),
66-
);
67-
controller.enqueue(encoder.encode("</body>"));
68-
controller.enqueue(encoder.encode("</html>"));
69-
controller.close();
70-
},
71-
});
72-
73-
const reader = stream.getReader();
74-
const nodeNames = [];
75-
76-
const parseHTMLStream = await import(".").then((m) => m.default);
77-
78-
for await (const node of parseHTMLStream(reader)) {
79-
nodeNames.push(node?.nodeName);
80-
}
81-
82-
expect(nodeNames).toEqual([
83-
"HTML",
84-
"HEAD",
85-
"BODY",
86-
"DIV",
87-
"#comment",
88-
"#text",
89-
]);
90-
});
91-
92-
it("should be possible to read the attributes of a node HTMLElement", async () => {
93-
const encoder = new TextEncoder();
94-
const stream = new ReadableStream({
95-
start(controller) {
96-
controller.enqueue(encoder.encode('<div class="foo">Bar</div>'));
97-
controller.close();
98-
},
99-
});
100-
101-
const reader = stream.getReader();
102-
const nodes: Node[] = [];
103-
104-
const parseHTMLStream = await import(".").then((m) => m.default);
105-
106-
for await (const node of parseHTMLStream(reader)) {
107-
nodes.push(node);
108-
}
109-
110-
expect(nodes).toHaveLength(5);
111-
expect(nodes[0]?.nodeName).toBe("HTML");
112-
expect(nodes[1]?.nodeName).toBe("HEAD");
113-
expect(nodes[2]?.nodeName).toBe("BODY");
114-
expect(nodes[3]?.nodeName).toBe("DIV");
115-
expect(nodes[4]?.nodeName).toBe("#text");
116-
expect((nodes[3] as HTMLElement).getAttribute("class")).toBe("foo");
117-
});
118-
119-
it("should work with very nested HTML", async () => {
120-
const encoder = new TextEncoder();
121-
const stream = new ReadableStream({
122-
start(controller) {
123-
controller.enqueue(encoder.encode("<html>"));
124-
controller.enqueue(encoder.encode("<head />"));
125-
controller.enqueue(encoder.encode("<body>"));
126-
controller.enqueue(encoder.encode('<div class="foo">'));
127-
controller.enqueue(encoder.encode('<div class="bar">'));
128-
controller.enqueue(encoder.encode('<div class="baz">'));
129-
controller.enqueue(encoder.encode('<div class="qux">'));
130-
controller.enqueue(encoder.encode("Hello"));
131-
controller.enqueue(encoder.encode("</div>"));
132-
controller.enqueue(encoder.encode("</div>"));
133-
controller.enqueue(encoder.encode("</div>"));
134-
controller.enqueue(encoder.encode("</div>"));
135-
controller.enqueue(encoder.encode("</body>"));
136-
controller.enqueue(encoder.encode("</html>"));
137-
controller.close();
138-
},
139-
});
140-
141-
const reader = stream.getReader();
142-
const nodes = [];
143-
144-
const parseHTMLStream = await import(".").then((m) => m.default);
145-
146-
for await (const node of parseHTMLStream(reader)) {
147-
nodes.push(node);
148-
}
149-
150-
expect(nodes).toHaveLength(8);
151-
expect(nodes[0]?.nodeName).toBe("HTML");
152-
expect(nodes[1]?.nodeName).toBe("HEAD");
153-
expect(nodes[2]?.nodeName).toBe("BODY");
154-
expect(nodes[3]?.nodeName).toBe("DIV");
155-
expect((nodes[3] as HTMLElement).classList.contains("foo")).toBeTrue();
156-
expect(nodes[4]?.nodeName).toBe("DIV");
157-
expect((nodes[4] as HTMLElement).classList.contains("bar")).toBeTrue();
158-
expect(nodes[5]?.nodeName).toBe("DIV");
159-
expect((nodes[5] as HTMLElement).classList.contains("baz")).toBeTrue();
160-
expect(nodes[6]?.nodeName).toBe("DIV");
161-
expect((nodes[6] as HTMLElement).classList.contains("qux")).toBeTrue();
162-
expect(nodes[7]?.nodeName).toBe("#text");
163-
expect(nodes[7]?.textContent).toBe("Hello");
164-
});
9+
describe("parse-html-stream", () => {
10+
it("should handle an empty HTML stream", async () => {
11+
const stream = new ReadableStream({
12+
start(controller) {
13+
controller.close();
14+
},
16515
});
16+
17+
const reader = stream.getReader();
18+
const nodes = [];
19+
20+
const parseHTMLStream = await import(".").then((m) => m.default);
21+
22+
for await (const node of parseHTMLStream(reader)) {
23+
nodes.push(node);
24+
}
25+
26+
expect(nodes).toEqual([]);
27+
});
28+
29+
it("should transform a stream of HTML into a stream of nodes", async () => {
30+
const encoder = new TextEncoder();
31+
const stream = new ReadableStream({
32+
start(controller) {
33+
controller.enqueue(encoder.encode("<html>"));
34+
controller.enqueue(encoder.encode("<head />"));
35+
controller.enqueue(encoder.encode("<body>"));
36+
controller.enqueue(encoder.encode('<div class="foo">Bar</div>'));
37+
controller.enqueue(encoder.encode("</body>"));
38+
controller.enqueue(encoder.encode("</html>"));
39+
controller.close();
40+
},
41+
});
42+
43+
const reader = stream.getReader();
44+
const nodeNames = [];
45+
46+
const parseHTMLStream = await import(".").then((m) => m.default);
47+
48+
for await (const node of parseHTMLStream(reader)) {
49+
nodeNames.push(node?.nodeName);
50+
}
51+
52+
expect(nodeNames).toEqual(["HTML", "HEAD", "BODY", "DIV", "#text"]);
53+
});
54+
55+
it("should work with comments", async () => {
56+
const encoder = new TextEncoder();
57+
const stream = new ReadableStream({
58+
start(controller) {
59+
controller.enqueue(encoder.encode("<html>"));
60+
controller.enqueue(encoder.encode("<head />"));
61+
controller.enqueue(encoder.encode("<body>"));
62+
controller.enqueue(
63+
encoder.encode('<div class="foo"><!-- comment -->Bar</div>'),
64+
);
65+
controller.enqueue(encoder.encode("</body>"));
66+
controller.enqueue(encoder.encode("</html>"));
67+
controller.close();
68+
},
69+
});
70+
71+
const reader = stream.getReader();
72+
const nodeNames = [];
73+
74+
const parseHTMLStream = await import(".").then((m) => m.default);
75+
76+
for await (const node of parseHTMLStream(reader)) {
77+
nodeNames.push(node?.nodeName);
78+
}
79+
80+
expect(nodeNames).toEqual([
81+
"HTML",
82+
"HEAD",
83+
"BODY",
84+
"DIV",
85+
"#comment",
86+
"#text",
87+
]);
88+
});
89+
90+
it("should be possible to read the attributes of a node HTMLElement", async () => {
91+
const encoder = new TextEncoder();
92+
const stream = new ReadableStream({
93+
start(controller) {
94+
controller.enqueue(encoder.encode('<div class="foo">Bar</div>'));
95+
controller.close();
96+
},
97+
});
98+
99+
const reader = stream.getReader();
100+
const nodes: Node[] = [];
101+
102+
const parseHTMLStream = await import(".").then((m) => m.default);
103+
104+
for await (const node of parseHTMLStream(reader)) {
105+
nodes.push(node);
106+
}
107+
108+
expect(nodes).toHaveLength(5);
109+
expect(nodes[0]?.nodeName).toBe("HTML");
110+
expect(nodes[1]?.nodeName).toBe("HEAD");
111+
expect(nodes[2]?.nodeName).toBe("BODY");
112+
expect(nodes[3]?.nodeName).toBe("DIV");
113+
expect(nodes[4]?.nodeName).toBe("#text");
114+
expect((nodes[3] as HTMLElement).getAttribute("class")).toBe("foo");
115+
});
116+
117+
it("should work with very nested HTML", async () => {
118+
const encoder = new TextEncoder();
119+
const stream = new ReadableStream({
120+
start(controller) {
121+
controller.enqueue(encoder.encode("<html>"));
122+
controller.enqueue(encoder.encode("<head />"));
123+
controller.enqueue(encoder.encode("<body>"));
124+
controller.enqueue(encoder.encode('<div class="foo">'));
125+
controller.enqueue(encoder.encode('<div class="bar">'));
126+
controller.enqueue(encoder.encode('<div class="baz">'));
127+
controller.enqueue(encoder.encode('<div class="qux">'));
128+
controller.enqueue(encoder.encode("Hello"));
129+
controller.enqueue(encoder.encode("</div>"));
130+
controller.enqueue(encoder.encode("</div>"));
131+
controller.enqueue(encoder.encode("</div>"));
132+
controller.enqueue(encoder.encode("</div>"));
133+
controller.enqueue(encoder.encode("</body>"));
134+
controller.enqueue(encoder.encode("</html>"));
135+
controller.close();
136+
},
137+
});
138+
139+
const reader = stream.getReader();
140+
const nodes = [];
141+
142+
const parseHTMLStream = await import(".").then((m) => m.default);
143+
144+
for await (const node of parseHTMLStream(reader)) {
145+
nodes.push(node);
146+
}
147+
148+
expect(nodes).toHaveLength(8);
149+
expect(nodes[0]?.nodeName).toBe("HTML");
150+
expect(nodes[1]?.nodeName).toBe("HEAD");
151+
expect(nodes[2]?.nodeName).toBe("BODY");
152+
expect(nodes[3]?.nodeName).toBe("DIV");
153+
expect((nodes[3] as HTMLElement).classList.contains("foo")).toBeTrue();
154+
expect(nodes[4]?.nodeName).toBe("DIV");
155+
expect((nodes[4] as HTMLElement).classList.contains("bar")).toBeTrue();
156+
expect(nodes[5]?.nodeName).toBe("DIV");
157+
expect((nodes[5] as HTMLElement).classList.contains("baz")).toBeTrue();
158+
expect(nodes[6]?.nodeName).toBe("DIV");
159+
expect((nodes[6] as HTMLElement).classList.contains("qux")).toBeTrue();
160+
expect(nodes[7]?.nodeName).toBe("#text");
161+
expect(nodes[7]?.textContent).toBe("Hello");
166162
});
167163
});

0 commit comments

Comments
 (0)