Skip to content

Fix linkcheck anchor encoding issue #13621

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ Bugs fixed
* #13619: LaTeX: possible duplicated footnotes in PDF from object signatures
(typically if :confval:`latex_show_urls` ``= 'footnote'``).
Patch by Jean-François B.
* #13621: linkcheck: Fix false "Anchor not found" errors for valid URLs with encoded
characters in fragment identifiers.
Patch by Eric Nordlund.

Testing
-------
47 changes: 38 additions & 9 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from html.parser import HTMLParser
from queue import PriorityQueue, Queue
from threading import Thread
from typing import TYPE_CHECKING, NamedTuple, cast
from typing import TYPE_CHECKING, Any, NamedTuple, cast
from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse

from docutils import nodes
Expand Down Expand Up @@ -485,6 +485,9 @@ def _retrieval_methods(

def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
req_url, delimiter, anchor = uri.partition('#')
original_encoded_anchor = (
anchor # Store the original encoded anchor before decoding
)
if delimiter and anchor:
for rex in self.anchors_ignore:
if rex.match(anchor):
Expand Down Expand Up @@ -536,7 +539,9 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
) as response:
if anchor and self.check_anchors and response.ok:
try:
found = contains_anchor(response, anchor)
found = contains_anchor(
response, anchor, original_encoded_anchor
)
except UnicodeDecodeError:
return (
_Status.IGNORED,
Expand Down Expand Up @@ -686,11 +691,13 @@ def _get_request_headers(
return {}


def contains_anchor(response: Response, anchor: str) -> bool:
def contains_anchor(
response: Response, anchor: str, original_encoded_anchor: str = ''
) -> bool:
"""Determine if an anchor is contained within an HTTP response."""
parser = AnchorCheckParser(anchor)
# Read file in chunks. If we find a matching anchor, we break
# the loop early in hopes not to have to download the whole thing.
parser = AnchorCheckParser(anchor, original_encoded_anchor)
# Read file in chunks. If we find a matching anchor, we break the loop early
# to avoid downloading the entire response body.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
if isinstance(chunk, bytes):
# requests failed to decode, manually try to decode it
Expand All @@ -706,15 +713,37 @@ def contains_anchor(response: Response, anchor: str) -> bool:
class AnchorCheckParser(HTMLParser):
"""Specialised HTML parser that looks for a specific anchor."""

def __init__(self, search_anchor: str) -> None:
def __init__(self, search_anchor: str, original_encoded_anchor: str = '') -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A function call that contains the same value encoded in two different ways is, to me, something of an undesirable code smell. It's probably not the first item to focus on, but it would be nice if this initializer could be updated to accept only one form of the anchor (I'd probably recommend the original value received before any encoding-related operations have been attempted).

"""Initialize the parser with multiple anchor variations.

Args:
search_anchor: The decoded anchor to search for
(e.g., "standard-input/output-stdio")
original_encoded_anchor: The original encoded anchor
(e.g., "standard-input%2Foutput-stdio")
"""
super().__init__()

self.search_anchor = search_anchor
# Create variations of the anchor to check
self.search_variations = {
search_anchor, # decoded (current behavior)
}
Comment on lines +728 to +730
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd recommend that we try to search the anchor name variations in the order listed in the WHATWG HTML spec for Scrolling to a fragment: https://html.spec.whatwg.org/commit-snapshots/4467ddf3235732614a0202729d6ec0e04c33b597/#scrolling-to-a-fragment

To do that, I think we may need to use a different datastructure than Python's set, because it doesn't necessarily iterate over elements in the order they were added/inserted.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My mistake; we only use the search_variations set attribute for presence-checking as we progress our way through each HTML document. Iteration order does not matter. So: I think set is fine, and we do not need to worry about the order that we add items -- provided that the resulting collection is equivalent to the anchor names a browser could check for.


# Add the original encoded version if provided
if original_encoded_anchor:
self.search_variations.add(original_encoded_anchor)

# Add a re-encoded version if the decoded anchor contains characters
# that would be encoded
if search_anchor != quote(search_anchor, safe=''):
self.search_variations.add(quote(search_anchor, safe=''))
Comment on lines +732 to +739
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to extract the anchor-to-search-variations code into a helper function/method; doing so would hopefully also be convenient to write unit tests around its behaviour.


self.found = False

def handle_starttag(self, tag: Any, attrs: Any) -> None:
for key, value in attrs:
if key in {'id', 'name'} and value == self.search_anchor:
# Check if the attribute value matches any of our variations
if key in {'id', 'name'} and value in self.search_variations:
self.found = True
break

Expand Down
4 changes: 4 additions & 0 deletions tests/roots/test-linkcheck-encoded-anchors/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
root_doc = 'encoded_anchors'
exclude_patterns = ['_build']
linkcheck_anchors = True
linkcheck_timeout = 0.25
76 changes: 76 additions & 0 deletions tests/test_builders/test_anchor_check_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""Test the AnchorCheckParser class."""

from __future__ import annotations

from typing import TYPE_CHECKING
from unittest import mock

from sphinx.builders.linkcheck import AnchorCheckParser, contains_anchor

if TYPE_CHECKING:
from typing import Any


def test_anchor_check_parser_basic() -> None:
"""Test basic anchor matching functionality."""
parser = AnchorCheckParser('test-anchor')
parser.feed('<html><body><div id="test-anchor">Test</div></body></html>')
assert parser.found is True

parser = AnchorCheckParser('non-existent')
parser.feed('<html><body><div id="test-anchor">Test</div></body></html>')
assert parser.found is False


def test_anchor_check_parser_with_encoded_anchors() -> None:
"""Test anchor matching with encoded characters."""
# Test with encoded slash
parser = AnchorCheckParser(
'standard-input/output-stdio', 'standard-input%2Foutput-stdio'
)
parser.feed(
'<html><body><div id="standard-input%2Foutput-stdio">Test</div></body></html>'
)
assert parser.found is True

# Test with plus sign
parser = AnchorCheckParser('encoded+anchor', 'encoded%2Banchor')
parser.feed('<html><body><div id="encoded%2Banchor">Test</div></body></html>')
assert parser.found is True

# Test with space
parser = AnchorCheckParser('encoded space', 'encoded%20space')
parser.feed('<html><body><div id="encoded%20space">Test</div></body></html>')
assert parser.found is True


def test_contains_anchor_with_encoded_characters() -> None:
"""Test the contains_anchor function with encoded characters."""
mock_response = mock.MagicMock()

# Setup a response that returns HTML with encoded anchors
def mock_iter_content(chunk_size: Any = None, decode_unicode: Any = None) -> Any:
content = '<html><body><div id="standard-input%2Foutput-stdio">Test</div></body></html>'
yield content

mock_response.iter_content = mock_iter_content

# Test with original encoded anchor
assert (
contains_anchor(
mock_response,
'standard-input/output-stdio',
'standard-input%2Foutput-stdio',
)
is True
)

# Test with decoded anchor only
mock_response2 = mock.MagicMock()
mock_response2.iter_content = mock_iter_content
assert contains_anchor(mock_response2, 'standard-input/output-stdio') is True

# Test with non-existent anchor
mock_response3 = mock.MagicMock()
mock_response3.iter_content = mock_iter_content
assert contains_anchor(mock_response3, 'non-existent-anchor') is False
169 changes: 169 additions & 0 deletions tests/test_builders/test_build_linkcheck_encoded_anchors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
"""Test the linkcheck builder's ability to handle encoded anchors."""

from __future__ import annotations

import json
import re
from http.server import BaseHTTPRequestHandler
from typing import TYPE_CHECKING

import pytest

from tests.utils import serve_application

if TYPE_CHECKING:
from collections.abc import Iterable
from typing import Any

from sphinx.testing.util import SphinxTestApp


class EncodedAnchorsHandler(BaseHTTPRequestHandler):
protocol_version = 'HTTP/1.1'

def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
"""Split content into chunks of a maximum size."""

def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
"""Encode a bytestring into a format suitable for HTTP chunked-transfer."""
yield f'{len(chunk):X}'.encode('ascii')
yield b'\r\n'
yield chunk
yield b'\r\n'

buffer = b''
for char in content:
buffer += char.encode('utf-8')
if len(buffer) >= max_chunk_size:
chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:]
yield from _encode_chunk(chunk)

# Flush remaining bytes, if any
if buffer:
yield from _encode_chunk(buffer)

# Emit a final empty chunk to close the stream
yield from _encode_chunk(b'')

def _send_chunked(self, content: str) -> bool:
"""Send content in chunks."""
for chunk in self._chunk_content(content, max_chunk_size=20):
try:
self.wfile.write(chunk)
except (BrokenPipeError, ConnectionResetError) as e:
self.log_message(str(e))
return False
return True

def do_HEAD(self) -> None:
"""Handle HEAD requests."""
print(f'HEAD request for path: {self.path}')
if self.path in {'/standard-encoded-anchors', '/various-encoded-chars'}:
self.send_response(200, 'OK')
else:
self.send_response(404, 'Not Found')
self.end_headers()

def do_GET(self) -> None:
"""Serve test pages with encoded anchors."""
if self.path == '/standard-encoded-anchors':
self.send_response(200, 'OK')
# Note the ID has an encoded forward slash (%2F)
content = """
<!DOCTYPE html>
<html>
<head><title>Encoded Anchors Test</title></head>
<body>
<h1 id="standard-input%2Foutput-stdio">Standard I/O</h1>
<h2 id="encoded%2Banchor">Encoded Plus</h2>
</body>
</html>
"""
elif self.path == '/various-encoded-chars':
self.send_response(200, 'OK')
content = """
<!DOCTYPE html>
<html>
<head><title>Various Encoded Characters</title></head>
<body>
<h1 id="encoded%21exclamation">Encoded Exclamation</h1>
<h2 id="encoded%23hash">Encoded Hash</h2>
<h3 id="encoded%25percent">Encoded Percent</h3>
<h4 id="encoded%26ampersand">Encoded Ampersand</h4>
<h5 id="encoded%3Fquestion">Encoded Question</h5>
<h6 id="encoded%40at">Encoded At</h6>
</body>
</html>
"""
else:
self.send_response(404, 'Not Found')
content = 'not found\n'
self.send_header('Transfer-Encoding', 'chunked')
self.end_headers()
self._send_chunked(content)


@pytest.mark.sphinx(
'linkcheck',
testroot='linkcheck-encoded-anchors',
freshenv=True,
)
def test_encoded_anchors_handling(app: SphinxTestApp, tmp_path: Any) -> None:
"""Test that linkcheck correctly handles URLs with encoded anchors."""
with serve_application(app, EncodedAnchorsHandler) as address:
# Create test file with encoded anchor links using the server address
(app.srcdir / 'encoded_anchors.rst').write_text(
f"""
Encoded Anchors Test
====================

Links with encoded anchors:

* `Standard I/O <http://{address}/standard-encoded-anchors#standard-input/output-stdio>`_
* `Encoded Plus <http://{address}/standard-encoded-anchors#encoded+anchor>`_
* `Encoded Exclamation <http://{address}/various-encoded-chars#encoded!exclamation>`_
* `Encoded Hash <http://{address}/various-encoded-chars#encoded#hash>`_
* `Encoded Percent <http://{address}/various-encoded-chars#encoded%percent>`_
* `Encoded Ampersand <http://{address}/various-encoded-chars#encoded&ampersand>`_
* `Encoded Question <http://{address}/various-encoded-chars#encoded?question>`_
* `Encoded At <http://{address}/various-encoded-chars#encoded@at>`_
""",
encoding='utf-8',
)

app.build()

# Parse the JSON output to check the results
content = (app.outdir / 'output.json').read_text(encoding='utf8')
data = [json.loads(record) for record in content.splitlines()]

# Filter for our encoded anchor URLs
encoded_anchor_results = [
record
for record in data
if any(
x in record['uri']
for x in ['standard-encoded-anchors#', 'various-encoded-chars#']
)
]

# All links with encoded anchors should be working
assert all(record['status'] == 'working' for record in encoded_anchor_results)

# Verify specific links
uri_pattern = re.compile(
f'http://{re.escape(address)}/standard-encoded-anchors#standard-input/output-stdio'
)
stdio_link = next(
record for record in encoded_anchor_results if uri_pattern.match(record['uri'])
)
assert stdio_link['status'] == 'working'

# Check for encoded plus link
plus_pattern = re.compile(
f'http://{re.escape(address)}/standard-encoded-anchors#encoded\\+anchor'
)
plus_link = next(
record for record in encoded_anchor_results if plus_pattern.match(record['uri'])
)
assert plus_link['status'] == 'working'
Loading