-
-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Fix linkcheck anchor encoding issue #13621
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,7 @@ | |
from html.parser import HTMLParser | ||
from queue import PriorityQueue, Queue | ||
from threading import Thread | ||
from typing import TYPE_CHECKING, NamedTuple, cast | ||
from typing import TYPE_CHECKING, Any, NamedTuple, cast | ||
from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse | ||
|
||
from docutils import nodes | ||
|
@@ -485,6 +485,9 @@ def _retrieval_methods( | |
|
||
def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: | ||
req_url, delimiter, anchor = uri.partition('#') | ||
original_encoded_anchor = ( | ||
anchor # Store the original encoded anchor before decoding | ||
) | ||
if delimiter and anchor: | ||
for rex in self.anchors_ignore: | ||
if rex.match(anchor): | ||
|
@@ -536,7 +539,9 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: | |
) as response: | ||
if anchor and self.check_anchors and response.ok: | ||
try: | ||
found = contains_anchor(response, anchor) | ||
found = contains_anchor( | ||
response, anchor, original_encoded_anchor | ||
) | ||
except UnicodeDecodeError: | ||
return ( | ||
_Status.IGNORED, | ||
|
@@ -686,11 +691,13 @@ def _get_request_headers( | |
return {} | ||
|
||
|
||
def contains_anchor(response: Response, anchor: str) -> bool: | ||
def contains_anchor( | ||
response: Response, anchor: str, original_encoded_anchor: str = '' | ||
) -> bool: | ||
"""Determine if an anchor is contained within an HTTP response.""" | ||
parser = AnchorCheckParser(anchor) | ||
# Read file in chunks. If we find a matching anchor, we break | ||
# the loop early in hopes not to have to download the whole thing. | ||
parser = AnchorCheckParser(anchor, original_encoded_anchor) | ||
# Read file in chunks. If we find a matching anchor, we break the loop early | ||
# to avoid downloading the entire response body. | ||
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True): | ||
if isinstance(chunk, bytes): | ||
# requests failed to decode, manually try to decode it | ||
|
@@ -706,15 +713,37 @@ def contains_anchor(response: Response, anchor: str) -> bool: | |
class AnchorCheckParser(HTMLParser): | ||
"""Specialised HTML parser that looks for a specific anchor.""" | ||
|
||
def __init__(self, search_anchor: str) -> None: | ||
def __init__(self, search_anchor: str, original_encoded_anchor: str = '') -> None: | ||
"""Initialize the parser with multiple anchor variations. | ||
|
||
Args: | ||
search_anchor: The decoded anchor to search for | ||
(e.g., "standard-input/output-stdio") | ||
original_encoded_anchor: The original encoded anchor | ||
(e.g., "standard-input%2Foutput-stdio") | ||
""" | ||
super().__init__() | ||
|
||
self.search_anchor = search_anchor | ||
# Create variations of the anchor to check | ||
self.search_variations = { | ||
search_anchor, # decoded (current behavior) | ||
} | ||
Comment on lines
+728
to
+730
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd recommend that we try to search the anchor name variations in the order listed in the WHATWG HTML spec for To do that, I think we may need to use a different datastructure than Python's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My mistake; we only use the |
||
|
||
# Add the original encoded version if provided | ||
if original_encoded_anchor: | ||
self.search_variations.add(original_encoded_anchor) | ||
|
||
# Add a re-encoded version if the decoded anchor contains characters | ||
# that would be encoded | ||
if search_anchor != quote(search_anchor, safe=''): | ||
self.search_variations.add(quote(search_anchor, safe='')) | ||
Comment on lines
+732
to
+739
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be nice to extract the anchor-to-search-variations code into a helper function/method; doing so would hopefully also be convenient to write unit tests around its behaviour. |
||
|
||
self.found = False | ||
|
||
def handle_starttag(self, tag: Any, attrs: Any) -> None: | ||
for key, value in attrs: | ||
if key in {'id', 'name'} and value == self.search_anchor: | ||
# Check if the attribute value matches any of our variations | ||
if key in {'id', 'name'} and value in self.search_variations: | ||
self.found = True | ||
break | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
root_doc = 'encoded_anchors' | ||
exclude_patterns = ['_build'] | ||
linkcheck_anchors = True | ||
linkcheck_timeout = 0.25 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
"""Test the AnchorCheckParser class.""" | ||
|
||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
from unittest import mock | ||
|
||
from sphinx.builders.linkcheck import AnchorCheckParser, contains_anchor | ||
|
||
if TYPE_CHECKING: | ||
from typing import Any | ||
|
||
|
||
def test_anchor_check_parser_basic() -> None: | ||
"""Test basic anchor matching functionality.""" | ||
parser = AnchorCheckParser('test-anchor') | ||
parser.feed('<html><body><div id="test-anchor">Test</div></body></html>') | ||
assert parser.found is True | ||
|
||
parser = AnchorCheckParser('non-existent') | ||
parser.feed('<html><body><div id="test-anchor">Test</div></body></html>') | ||
assert parser.found is False | ||
|
||
|
||
def test_anchor_check_parser_with_encoded_anchors() -> None: | ||
"""Test anchor matching with encoded characters.""" | ||
# Test with encoded slash | ||
parser = AnchorCheckParser( | ||
'standard-input/output-stdio', 'standard-input%2Foutput-stdio' | ||
) | ||
parser.feed( | ||
'<html><body><div id="standard-input%2Foutput-stdio">Test</div></body></html>' | ||
) | ||
assert parser.found is True | ||
|
||
# Test with plus sign | ||
parser = AnchorCheckParser('encoded+anchor', 'encoded%2Banchor') | ||
parser.feed('<html><body><div id="encoded%2Banchor">Test</div></body></html>') | ||
assert parser.found is True | ||
|
||
# Test with space | ||
parser = AnchorCheckParser('encoded space', 'encoded%20space') | ||
parser.feed('<html><body><div id="encoded%20space">Test</div></body></html>') | ||
assert parser.found is True | ||
|
||
|
||
def test_contains_anchor_with_encoded_characters() -> None: | ||
"""Test the contains_anchor function with encoded characters.""" | ||
mock_response = mock.MagicMock() | ||
|
||
# Setup a response that returns HTML with encoded anchors | ||
def mock_iter_content(chunk_size: Any = None, decode_unicode: Any = None) -> Any: | ||
content = '<html><body><div id="standard-input%2Foutput-stdio">Test</div></body></html>' | ||
yield content | ||
|
||
mock_response.iter_content = mock_iter_content | ||
|
||
# Test with original encoded anchor | ||
assert ( | ||
contains_anchor( | ||
mock_response, | ||
'standard-input/output-stdio', | ||
'standard-input%2Foutput-stdio', | ||
) | ||
is True | ||
) | ||
|
||
# Test with decoded anchor only | ||
mock_response2 = mock.MagicMock() | ||
mock_response2.iter_content = mock_iter_content | ||
assert contains_anchor(mock_response2, 'standard-input/output-stdio') is True | ||
|
||
# Test with non-existent anchor | ||
mock_response3 = mock.MagicMock() | ||
mock_response3.iter_content = mock_iter_content | ||
assert contains_anchor(mock_response3, 'non-existent-anchor') is False |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
"""Test the linkcheck builder's ability to handle encoded anchors.""" | ||
|
||
from __future__ import annotations | ||
|
||
import json | ||
import re | ||
from http.server import BaseHTTPRequestHandler | ||
from typing import TYPE_CHECKING | ||
|
||
import pytest | ||
|
||
from tests.utils import serve_application | ||
|
||
if TYPE_CHECKING: | ||
from collections.abc import Iterable | ||
from typing import Any | ||
|
||
from sphinx.testing.util import SphinxTestApp | ||
|
||
|
||
class EncodedAnchorsHandler(BaseHTTPRequestHandler): | ||
protocol_version = 'HTTP/1.1' | ||
|
||
def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]: | ||
"""Split content into chunks of a maximum size.""" | ||
|
||
def _encode_chunk(chunk: bytes) -> Iterable[bytes]: | ||
"""Encode a bytestring into a format suitable for HTTP chunked-transfer.""" | ||
yield f'{len(chunk):X}'.encode('ascii') | ||
yield b'\r\n' | ||
yield chunk | ||
yield b'\r\n' | ||
|
||
buffer = b'' | ||
for char in content: | ||
buffer += char.encode('utf-8') | ||
if len(buffer) >= max_chunk_size: | ||
chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:] | ||
yield from _encode_chunk(chunk) | ||
|
||
# Flush remaining bytes, if any | ||
if buffer: | ||
yield from _encode_chunk(buffer) | ||
|
||
# Emit a final empty chunk to close the stream | ||
yield from _encode_chunk(b'') | ||
|
||
def _send_chunked(self, content: str) -> bool: | ||
"""Send content in chunks.""" | ||
for chunk in self._chunk_content(content, max_chunk_size=20): | ||
try: | ||
self.wfile.write(chunk) | ||
except (BrokenPipeError, ConnectionResetError) as e: | ||
self.log_message(str(e)) | ||
return False | ||
return True | ||
|
||
def do_HEAD(self) -> None: | ||
"""Handle HEAD requests.""" | ||
print(f'HEAD request for path: {self.path}') | ||
if self.path in {'/standard-encoded-anchors', '/various-encoded-chars'}: | ||
self.send_response(200, 'OK') | ||
else: | ||
self.send_response(404, 'Not Found') | ||
self.end_headers() | ||
|
||
def do_GET(self) -> None: | ||
"""Serve test pages with encoded anchors.""" | ||
if self.path == '/standard-encoded-anchors': | ||
self.send_response(200, 'OK') | ||
# Note the ID has an encoded forward slash (%2F) | ||
content = """ | ||
<!DOCTYPE html> | ||
<html> | ||
<head><title>Encoded Anchors Test</title></head> | ||
<body> | ||
<h1 id="standard-input%2Foutput-stdio">Standard I/O</h1> | ||
<h2 id="encoded%2Banchor">Encoded Plus</h2> | ||
</body> | ||
</html> | ||
""" | ||
elif self.path == '/various-encoded-chars': | ||
self.send_response(200, 'OK') | ||
content = """ | ||
<!DOCTYPE html> | ||
<html> | ||
<head><title>Various Encoded Characters</title></head> | ||
<body> | ||
<h1 id="encoded%21exclamation">Encoded Exclamation</h1> | ||
<h2 id="encoded%23hash">Encoded Hash</h2> | ||
<h3 id="encoded%25percent">Encoded Percent</h3> | ||
<h4 id="encoded%26ampersand">Encoded Ampersand</h4> | ||
<h5 id="encoded%3Fquestion">Encoded Question</h5> | ||
<h6 id="encoded%40at">Encoded At</h6> | ||
</body> | ||
</html> | ||
""" | ||
else: | ||
self.send_response(404, 'Not Found') | ||
content = 'not found\n' | ||
self.send_header('Transfer-Encoding', 'chunked') | ||
self.end_headers() | ||
self._send_chunked(content) | ||
|
||
|
||
@pytest.mark.sphinx( | ||
'linkcheck', | ||
testroot='linkcheck-encoded-anchors', | ||
freshenv=True, | ||
) | ||
def test_encoded_anchors_handling(app: SphinxTestApp, tmp_path: Any) -> None: | ||
"""Test that linkcheck correctly handles URLs with encoded anchors.""" | ||
with serve_application(app, EncodedAnchorsHandler) as address: | ||
# Create test file with encoded anchor links using the server address | ||
(app.srcdir / 'encoded_anchors.rst').write_text( | ||
f""" | ||
Encoded Anchors Test | ||
==================== | ||
|
||
Links with encoded anchors: | ||
|
||
* `Standard I/O <http://{address}/standard-encoded-anchors#standard-input/output-stdio>`_ | ||
* `Encoded Plus <http://{address}/standard-encoded-anchors#encoded+anchor>`_ | ||
* `Encoded Exclamation <http://{address}/various-encoded-chars#encoded!exclamation>`_ | ||
* `Encoded Hash <http://{address}/various-encoded-chars#encoded#hash>`_ | ||
* `Encoded Percent <http://{address}/various-encoded-chars#encoded%percent>`_ | ||
* `Encoded Ampersand <http://{address}/various-encoded-chars#encoded&ersand>`_ | ||
* `Encoded Question <http://{address}/various-encoded-chars#encoded?question>`_ | ||
* `Encoded At <http://{address}/various-encoded-chars#encoded@at>`_ | ||
""", | ||
encoding='utf-8', | ||
) | ||
|
||
app.build() | ||
|
||
# Parse the JSON output to check the results | ||
content = (app.outdir / 'output.json').read_text(encoding='utf8') | ||
data = [json.loads(record) for record in content.splitlines()] | ||
|
||
# Filter for our encoded anchor URLs | ||
encoded_anchor_results = [ | ||
record | ||
for record in data | ||
if any( | ||
x in record['uri'] | ||
for x in ['standard-encoded-anchors#', 'various-encoded-chars#'] | ||
) | ||
] | ||
|
||
# All links with encoded anchors should be working | ||
assert all(record['status'] == 'working' for record in encoded_anchor_results) | ||
|
||
# Verify specific links | ||
uri_pattern = re.compile( | ||
f'http://{re.escape(address)}/standard-encoded-anchors#standard-input/output-stdio' | ||
) | ||
stdio_link = next( | ||
record for record in encoded_anchor_results if uri_pattern.match(record['uri']) | ||
) | ||
assert stdio_link['status'] == 'working' | ||
|
||
# Check for encoded plus link | ||
plus_pattern = re.compile( | ||
f'http://{re.escape(address)}/standard-encoded-anchors#encoded\\+anchor' | ||
) | ||
plus_link = next( | ||
record for record in encoded_anchor_results if plus_pattern.match(record['uri']) | ||
) | ||
assert plus_link['status'] == 'working' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A function call that contains the same value encoded in two different ways is, to me, something of an undesirable code smell. It's probably not the first item to focus on, but it would be nice if this initializer could be updated to accept only one form of the anchor (I'd probably recommend the original value received before any encoding-related operations have been attempted).