Skip to content

Commit 5265662

Browse files
author
Eric Nordlund
committed
Fix linkcheck anchor encoding issues
- Enhanced AnchorCheckParser to handle multiple anchor variations - Added comprehensive test coverage for encoded anchors - Fixed false 'Anchor not found' errors for URLs with encoded characters - Maintains full backward compatibility - All linting checks pass
1 parent 2b7e3ad commit 5265662

File tree

5 files changed

+289
-9
lines changed

5 files changed

+289
-9
lines changed

CHANGES.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ Bugs fixed
3737
Patch by Alicia Garcia-Raboso.
3838
* #13528: Add tilde ``~`` prefix support for :rst:role:`py:deco`.
3939
Patch by Shengyu Zhang and Adam Turner.
40+
* linkcheck: Fix false "Anchor not found" errors for valid URLs with encoded
41+
characters in fragment identifiers.
4042

4143
Testing
4244
-------

sphinx/builders/linkcheck.py

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from html.parser import HTMLParser
1212
from queue import PriorityQueue, Queue
1313
from threading import Thread
14-
from typing import TYPE_CHECKING, NamedTuple, cast
14+
from typing import TYPE_CHECKING, Any, NamedTuple, cast
1515
from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse
1616

1717
from docutils import nodes
@@ -485,6 +485,9 @@ def _retrieval_methods(
485485

486486
def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
487487
req_url, delimiter, anchor = uri.partition('#')
488+
original_encoded_anchor = (
489+
anchor # Store the original encoded anchor before decoding
490+
)
488491
if delimiter and anchor:
489492
for rex in self.anchors_ignore:
490493
if rex.match(anchor):
@@ -536,7 +539,9 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
536539
) as response:
537540
if anchor and self.check_anchors and response.ok:
538541
try:
539-
found = contains_anchor(response, anchor)
542+
found = contains_anchor(
543+
response, anchor, original_encoded_anchor
544+
)
540545
except UnicodeDecodeError:
541546
return (
542547
_Status.IGNORED,
@@ -686,11 +691,13 @@ def _get_request_headers(
686691
return {}
687692

688693

689-
def contains_anchor(response: Response, anchor: str) -> bool:
694+
def contains_anchor(
695+
response: Response, anchor: str, original_encoded_anchor: str = ''
696+
) -> bool:
690697
"""Determine if an anchor is contained within an HTTP response."""
691-
parser = AnchorCheckParser(anchor)
692-
# Read file in chunks. If we find a matching anchor, we break
693-
# the loop early in hopes not to have to download the whole thing.
698+
parser = AnchorCheckParser(anchor, original_encoded_anchor)
699+
# Read file in chunks. If we find a matching anchor, we break the loop early
700+
# to avoid downloading the entire response body.
694701
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
695702
if isinstance(chunk, bytes):
696703
# requests failed to decode, manually try to decode it
@@ -706,15 +713,37 @@ def contains_anchor(response: Response, anchor: str) -> bool:
706713
class AnchorCheckParser(HTMLParser):
707714
"""Specialised HTML parser that looks for a specific anchor."""
708715

709-
def __init__(self, search_anchor: str) -> None:
716+
def __init__(self, search_anchor: str, original_encoded_anchor: str = '') -> None:
717+
"""Initialize the parser with multiple anchor variations.
718+
719+
Args:
720+
search_anchor: The decoded anchor to search for
721+
(e.g., "standard-input/output-stdio")
722+
original_encoded_anchor: The original encoded anchor
723+
(e.g., "standard-input%2Foutput-stdio")
724+
"""
710725
super().__init__()
711726

712-
self.search_anchor = search_anchor
727+
# Create variations of the anchor to check
728+
self.search_variations = {
729+
search_anchor, # decoded (current behavior)
730+
}
731+
732+
# Add the original encoded version if provided
733+
if original_encoded_anchor:
734+
self.search_variations.add(original_encoded_anchor)
735+
736+
# Add a re-encoded version if the decoded anchor contains characters
737+
# that would be encoded
738+
if search_anchor != quote(search_anchor, safe=''):
739+
self.search_variations.add(quote(search_anchor, safe=''))
740+
713741
self.found = False
714742

715743
def handle_starttag(self, tag: Any, attrs: Any) -> None:
716744
for key, value in attrs:
717-
if key in {'id', 'name'} and value == self.search_anchor:
745+
# Check if the attribute value matches any of our variations
746+
if key in {'id', 'name'} and value in self.search_variations:
718747
self.found = True
719748
break
720749

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
root_doc = 'encoded_anchors'
2+
exclude_patterns = ['_build']
3+
linkcheck_anchors = True
4+
linkcheck_timeout = 0.25
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""Test the AnchorCheckParser class."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
from unittest import mock
7+
8+
from sphinx.builders.linkcheck import AnchorCheckParser, contains_anchor
9+
10+
if TYPE_CHECKING:
11+
from typing import Any
12+
13+
14+
def test_anchor_check_parser_basic() -> None:
15+
"""Test basic anchor matching functionality."""
16+
parser = AnchorCheckParser('test-anchor')
17+
parser.feed('<html><body><div id="test-anchor">Test</div></body></html>')
18+
assert parser.found is True
19+
20+
parser = AnchorCheckParser('non-existent')
21+
parser.feed('<html><body><div id="test-anchor">Test</div></body></html>')
22+
assert parser.found is False
23+
24+
25+
def test_anchor_check_parser_with_encoded_anchors() -> None:
26+
"""Test anchor matching with encoded characters."""
27+
# Test with encoded slash
28+
parser = AnchorCheckParser(
29+
'standard-input/output-stdio', 'standard-input%2Foutput-stdio'
30+
)
31+
parser.feed(
32+
'<html><body><div id="standard-input%2Foutput-stdio">Test</div></body></html>'
33+
)
34+
assert parser.found is True
35+
36+
# Test with plus sign
37+
parser = AnchorCheckParser('encoded+anchor', 'encoded%2Banchor')
38+
parser.feed('<html><body><div id="encoded%2Banchor">Test</div></body></html>')
39+
assert parser.found is True
40+
41+
# Test with space
42+
parser = AnchorCheckParser('encoded space', 'encoded%20space')
43+
parser.feed('<html><body><div id="encoded%20space">Test</div></body></html>')
44+
assert parser.found is True
45+
46+
47+
def test_contains_anchor_with_encoded_characters() -> None:
48+
"""Test the contains_anchor function with encoded characters."""
49+
mock_response = mock.MagicMock()
50+
51+
# Setup a response that returns HTML with encoded anchors
52+
def mock_iter_content(chunk_size: Any = None, decode_unicode: Any = None) -> Any:
53+
content = '<html><body><div id="standard-input%2Foutput-stdio">Test</div></body></html>'
54+
yield content
55+
56+
mock_response.iter_content = mock_iter_content
57+
58+
# Test with original encoded anchor
59+
assert (
60+
contains_anchor(
61+
mock_response,
62+
'standard-input/output-stdio',
63+
'standard-input%2Foutput-stdio',
64+
)
65+
is True
66+
)
67+
68+
# Test with decoded anchor only
69+
mock_response2 = mock.MagicMock()
70+
mock_response2.iter_content = mock_iter_content
71+
assert contains_anchor(mock_response2, 'standard-input/output-stdio') is True
72+
73+
# Test with non-existent anchor
74+
mock_response3 = mock.MagicMock()
75+
mock_response3.iter_content = mock_iter_content
76+
assert contains_anchor(mock_response3, 'non-existent-anchor') is False
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
"""Test the linkcheck builder's ability to handle encoded anchors."""
2+
3+
from __future__ import annotations
4+
5+
import json
6+
import re
7+
from http.server import BaseHTTPRequestHandler
8+
from typing import TYPE_CHECKING
9+
10+
import pytest
11+
12+
from tests.utils import serve_application
13+
14+
if TYPE_CHECKING:
15+
from collections.abc import Iterable
16+
from typing import Any
17+
18+
from sphinx.testing.util import SphinxTestApp
19+
20+
21+
class EncodedAnchorsHandler(BaseHTTPRequestHandler):
22+
protocol_version = 'HTTP/1.1'
23+
24+
def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
25+
"""Split content into chunks of a maximum size."""
26+
27+
def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
28+
"""Encode a bytestring into a format suitable for HTTP chunked-transfer."""
29+
yield f'{len(chunk):X}'.encode('ascii')
30+
yield b'\r\n'
31+
yield chunk
32+
yield b'\r\n'
33+
34+
buffer = b''
35+
for char in content:
36+
buffer += char.encode('utf-8')
37+
if len(buffer) >= max_chunk_size:
38+
chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:]
39+
yield from _encode_chunk(chunk)
40+
41+
# Flush remaining bytes, if any
42+
if buffer:
43+
yield from _encode_chunk(buffer)
44+
45+
# Emit a final empty chunk to close the stream
46+
yield from _encode_chunk(b'')
47+
48+
def _send_chunked(self, content: str) -> bool:
49+
"""Send content in chunks."""
50+
for chunk in self._chunk_content(content, max_chunk_size=20):
51+
try:
52+
self.wfile.write(chunk)
53+
except (BrokenPipeError, ConnectionResetError) as e:
54+
self.log_message(str(e))
55+
return False
56+
return True
57+
58+
def do_HEAD(self) -> None:
59+
"""Handle HEAD requests."""
60+
print(f'HEAD request for path: {self.path}')
61+
if self.path in {'/standard-encoded-anchors', '/various-encoded-chars'}:
62+
self.send_response(200, 'OK')
63+
else:
64+
self.send_response(404, 'Not Found')
65+
self.end_headers()
66+
67+
def do_GET(self) -> None:
68+
"""Serve test pages with encoded anchors."""
69+
if self.path == '/standard-encoded-anchors':
70+
self.send_response(200, 'OK')
71+
# Note the ID has an encoded forward slash (%2F)
72+
content = """
73+
<!DOCTYPE html>
74+
<html>
75+
<head><title>Encoded Anchors Test</title></head>
76+
<body>
77+
<h1 id="standard-input%2Foutput-stdio">Standard I/O</h1>
78+
<h2 id="encoded%2Banchor">Encoded Plus</h2>
79+
</body>
80+
</html>
81+
"""
82+
elif self.path == '/various-encoded-chars':
83+
self.send_response(200, 'OK')
84+
content = """
85+
<!DOCTYPE html>
86+
<html>
87+
<head><title>Various Encoded Characters</title></head>
88+
<body>
89+
<h1 id="encoded%21exclamation">Encoded Exclamation</h1>
90+
<h2 id="encoded%23hash">Encoded Hash</h2>
91+
<h3 id="encoded%25percent">Encoded Percent</h3>
92+
<h4 id="encoded%26ampersand">Encoded Ampersand</h4>
93+
<h5 id="encoded%3Fquestion">Encoded Question</h5>
94+
<h6 id="encoded%40at">Encoded At</h6>
95+
</body>
96+
</html>
97+
"""
98+
else:
99+
self.send_response(404, 'Not Found')
100+
content = 'not found\n'
101+
self.send_header('Transfer-Encoding', 'chunked')
102+
self.end_headers()
103+
self._send_chunked(content)
104+
105+
106+
@pytest.mark.sphinx(
107+
'linkcheck',
108+
testroot='linkcheck-encoded-anchors',
109+
freshenv=True,
110+
)
111+
def test_encoded_anchors_handling(app: SphinxTestApp, tmp_path: Any) -> None:
112+
"""Test that linkcheck correctly handles URLs with encoded anchors."""
113+
with serve_application(app, EncodedAnchorsHandler) as address:
114+
# Create test file with encoded anchor links using the server address
115+
(app.srcdir / 'encoded_anchors.rst').write_text(
116+
f"""
117+
Encoded Anchors Test
118+
====================
119+
120+
Links with encoded anchors:
121+
122+
* `Standard I/O <http://{address}/standard-encoded-anchors#standard-input/output-stdio>`_
123+
* `Encoded Plus <http://{address}/standard-encoded-anchors#encoded+anchor>`_
124+
* `Encoded Exclamation <http://{address}/various-encoded-chars#encoded!exclamation>`_
125+
* `Encoded Hash <http://{address}/various-encoded-chars#encoded#hash>`_
126+
* `Encoded Percent <http://{address}/various-encoded-chars#encoded%percent>`_
127+
* `Encoded Ampersand <http://{address}/various-encoded-chars#encoded&ampersand>`_
128+
* `Encoded Question <http://{address}/various-encoded-chars#encoded?question>`_
129+
* `Encoded At <http://{address}/various-encoded-chars#encoded@at>`_
130+
""",
131+
encoding='utf-8',
132+
)
133+
134+
app.build()
135+
136+
# Parse the JSON output to check the results
137+
content = (app.outdir / 'output.json').read_text(encoding='utf8')
138+
data = [json.loads(record) for record in content.splitlines()]
139+
140+
# Filter for our encoded anchor URLs
141+
encoded_anchor_results = [
142+
record
143+
for record in data
144+
if any(
145+
x in record['uri']
146+
for x in ['standard-encoded-anchors#', 'various-encoded-chars#']
147+
)
148+
]
149+
150+
# All links with encoded anchors should be working
151+
assert all(record['status'] == 'working' for record in encoded_anchor_results)
152+
153+
# Verify specific links
154+
uri_pattern = re.compile(
155+
f'http://{re.escape(address)}/standard-encoded-anchors#standard-input/output-stdio'
156+
)
157+
stdio_link = next(
158+
record for record in encoded_anchor_results if uri_pattern.match(record['uri'])
159+
)
160+
assert stdio_link['status'] == 'working'
161+
162+
# Check for encoded plus link
163+
plus_pattern = re.compile(
164+
f'http://{re.escape(address)}/standard-encoded-anchors#encoded\\+anchor'
165+
)
166+
plus_link = next(
167+
record for record in encoded_anchor_results if plus_pattern.match(record['uri'])
168+
)
169+
assert plus_link['status'] == 'working'

0 commit comments

Comments
 (0)