Skip to content

Commit a07e62c

Browse files
AlexTate and gijsk authored
Preserve MathJax tags (#958)
* Prevent MathJax nodes from being identified as 'unlikely candidates', and prevent <mjx-math> tags from being removed due to attribute aria-hidden="true"

* Revert changes to _isProbablyVisible() and isNodeVisible()

* Update test case to include the MathJax tags which are produced once client-side rendering is complete. The previous test case only used the static HTML received from the server. Unfortunately, after htmltidy2 processes the page it is determined to be "unreaderable" though the appropriate JSDOM tests run and pass. Alternatively, if htmltidy2 is skipped, JSDOMParser produces a slew of errors. Perhaps this will do for now...

* Adding support for file:// URLs. This is useful when the test case contains dynamic content as it allows the dev to save a copy of the rendered DOM to disk and use the resulting file as input to generate-testcase. Alternatively one could use JSDOM's {runScripts: "dangerously", resources: "usable"} options, but in my case these fell short and caused MathJax to crash due to missing localStorage implementation in JSDOM. Perhaps my approach will be useful to others...

* Use url fileURLToPath to handle file urls

---------

Co-authored-by: Gijs Kruitbosch <[email protected]>
1 parent 59689fa commit a07e62c

File tree

7 files changed

+5235
-35
lines changed

7 files changed

+5235
-35
lines changed

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ You can run it using:
4545
$ node test/generate-testcase.js slug https://example.com/article
4646

4747
Replacing `slug` with the identifier the test should use, and providing a URL
48-
to an actual article on which the test should be based.
48+
to an actual article on which the test should be based. If your test case involves dynamic content, you can save the page to disk and pass the file as a `file://` URL.
4949

5050
On macOS, you may need to make the `tidy` binary executable before that script will succeed. If you see an `EACCES` error when running that script, try:
5151

Readability-readerable.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ var REGEXPS = {
2424
// Readability.js. Please keep both copies in sync.
2525
unlikelyCandidates:
2626
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
27-
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
27+
okMaybeItsACandidate: /and|article|body|column|content|main|mathjax|shadow/i,
2828
};
2929

3030
function isNodeVisible(node) {

Readability.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,8 @@ Readability.prototype = {
139139
// Readability-readerable.js. Please keep both copies in sync.
140140
unlikelyCandidates:
141141
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
142-
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
142+
okMaybeItsACandidate:
143+
/and|article|body|column|content|main|mathjax|shadow/i,
143144

144145
positive:
145146
/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,

test/generate-testcase.js

Lines changed: 49 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ var fs = require("fs");
77
var JSDOM = require("jsdom").JSDOM;
88
var prettyPrint = require("./utils").prettyPrint;
99
var http = require("http");
10-
var urlparse = require("url").parse;
10+
let { parse: urlparse, fileURLToPath } = require("url");
1111
var htmltidy = require("htmltidy2").tidy;
1212

1313
var { Readability, isProbablyReaderable } = require("../index");
@@ -28,18 +28,9 @@ function generateTestcase(slug) {
2828
var sourceFile = path.join(destRoot, "source.html");
2929
fs.exists(sourceFile, function (exists) {
3030
if (exists) {
31-
fs.readFile(
32-
sourceFile,
33-
{ encoding: "utf-8" },
34-
function (readFileErr, data) {
35-
if (readFileErr) {
36-
console.error("Source existed but couldn't be read?");
37-
process.exit(1);
38-
return;
39-
}
40-
onResponseReceived(null, data, destRoot);
41-
}
42-
);
31+
fetchLocalSource(sourceFile, function (data) {
32+
onResponseReceived(null, data, destRoot);
33+
});
4334
} else {
4435
fetchSource(argURL, function (fetchErr, data) {
4536
onResponseReceived(fetchErr, data, destRoot);
@@ -60,29 +51,54 @@ function fetchSource(url, callbackFn) {
6051
process.exit(1);
6152
return;
6253
}
63-
var client = http;
64-
if (url.indexOf("https") == 0) {
65-
client = require("https");
66-
}
67-
var options = urlparse(url);
68-
options.headers = { "User-Agent": FFX_UA };
69-
70-
client.get(options, function (response) {
71-
if (debug) {
72-
console.log("STATUS:", response.statusCode);
73-
console.log("HEADERS:", JSON.stringify(response.headers));
54+
if (url.indexOf("http") == 0) {
55+
var client = http;
56+
if (url.indexOf("https") == 0) {
57+
client = require("https");
7458
}
75-
response.setEncoding("utf-8");
76-
var rv = "";
77-
response.on("data", function (chunk) {
78-
rv += chunk;
79-
});
80-
response.on("end", function () {
59+
var options = urlparse(url);
60+
options.headers = { "User-Agent": FFX_UA };
61+
62+
client.get(options, function (response) {
8163
if (debug) {
82-
console.log("End received");
64+
console.log("STATUS:", response.statusCode);
65+
console.log("HEADERS:", JSON.stringify(response.headers));
66+
}
67+
response.setEncoding("utf-8");
68+
var rv = "";
69+
response.on("data", function (chunk) {
70+
rv += chunk;
71+
});
72+
response.on("end", function () {
73+
if (debug) {
74+
console.log("End received");
75+
}
76+
sanitizeSource(rv, callbackFn);
77+
});
78+
});
79+
} else if (url.indexOf("file://") == 0) {
80+
sourceFile = fileURLToPath(url);
81+
fs.exists(sourceFile, function (exists) {
82+
if (exists) {
83+
fetchLocalSource(sourceFile, function (data) {
84+
sanitizeSource(data, callbackFn);
85+
});
86+
} else {
87+
console.error("File doesn't exist!");
88+
process.exit(1);
8389
}
84-
sanitizeSource(rv, callbackFn);
8590
});
91+
}
92+
}
93+
94+
function fetchLocalSource(sourceFile, callbackFn) {
95+
fs.readFile(sourceFile, { encoding: "utf-8" }, function (readFileErr, data) {
96+
if (readFileErr) {
97+
console.error("Source existed but couldn't be read?");
98+
process.exit(1);
99+
return;
100+
}
101+
callbackFn(data);
86102
});
87103
}
88104

@@ -94,6 +110,7 @@ function sanitizeSource(html, callbackFn) {
94110
"indent-spaces": 4,
95111
"numeric-entities": true,
96112
"output-xhtml": true,
113+
"custom-tags": "blocklevel",
97114
wrap: 0,
98115
},
99116
callbackFn
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"title": "MathJax v3 with MathML input and HTML output",
3+
"byline": null,
4+
"dir": null,
5+
"lang": "en",
6+
"excerpt": "When",
7+
"siteName": null,
8+
"publishedTime": null,
9+
"readerable": false
10+
}

0 commit comments

Comments
 (0)