|
| 1 | +"""Test the linkcheck builder's ability to handle encoded anchors.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import json |
| 6 | +import re |
| 7 | +from http.server import BaseHTTPRequestHandler |
| 8 | +from typing import TYPE_CHECKING |
| 9 | + |
| 10 | +import pytest |
| 11 | + |
| 12 | +from tests.utils import serve_application |
| 13 | + |
| 14 | +if TYPE_CHECKING: |
| 15 | + from collections.abc import Iterable |
| 16 | + from typing import Any |
| 17 | + |
| 18 | + from sphinx.testing.util import SphinxTestApp |
| 19 | + |
| 20 | + |
class EncodedAnchorsHandler(BaseHTTPRequestHandler):
    """Serve HTML pages whose element IDs contain percent-encoded characters.

    GET responses are sent with ``Transfer-Encoding: chunked`` in deliberately
    small chunks; HEAD responses carry only a status line and headers.
    """

    protocol_version = 'HTTP/1.1'

    def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
        """Yield *content* framed for HTTP chunked transfer-encoding.

        The content is encoded as UTF-8 once and split into consecutive slices
        of at most *max_chunk_size* bytes.  Every slice is emitted as four
        pieces (hexadecimal size, CRLF, payload, CRLF), followed by a final
        zero-length chunk that terminates the stream.

        :param content: The text to send.
        :param max_chunk_size: Upper bound, in bytes, on each chunk's payload.
        :raises ValueError: If *max_chunk_size* is less than one (raised when
            iteration starts, since this is a generator).
        """
        if max_chunk_size < 1:
            msg = 'max_chunk_size must be a positive integer'
            raise ValueError(msg)

        def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
            """Encode a bytestring into a format suitable for HTTP chunked-transfer."""
            yield f'{len(chunk):X}'.encode('ascii')
            yield b'\r\n'
            yield chunk
            yield b'\r\n'

        # Slice the encoded payload rather than growing a buffer per character:
        # no chunk can exceed the limit, even when the limit is smaller than a
        # multi-byte character.
        payload = content.encode('utf-8')
        for start in range(0, len(payload), max_chunk_size):
            yield from _encode_chunk(payload[start : start + max_chunk_size])

        # Emit a final empty chunk to close the stream
        yield from _encode_chunk(b'')

    def _send_chunked(self, content: str) -> bool:
        """Write *content* to the client in chunks of at most 20 bytes.

        :returns: ``True`` if everything was written, ``False`` if the client
            went away (broken pipe or connection reset) part-way through.
        """
        try:
            for piece in self._chunk_content(content, max_chunk_size=20):
                self.wfile.write(piece)
        except (BrokenPipeError, ConnectionResetError) as exc:
            # Pass the error as an argument: log_message() treats its first
            # parameter as a %-format string.
            self.log_message('%s', exc)
            return False
        return True

    def do_HEAD(self) -> None:
        """Handle HEAD requests: 200 for the two known pages, 404 otherwise."""
        self.log_message('HEAD request for path: %s', self.path)
        if self.path in {'/standard-encoded-anchors', '/various-encoded-chars'}:
            self.send_response(200, 'OK')
        else:
            self.send_response(404, 'Not Found')
        self.end_headers()

    def do_GET(self) -> None:
        """Serve test pages with encoded anchors."""
        if self.path == '/standard-encoded-anchors':
            self.send_response(200, 'OK')
            # Note the ID has an encoded forward slash (%2F)
            content = """
            <!DOCTYPE html>
            <html>
            <head><title>Encoded Anchors Test</title></head>
            <body>
            <h1 id="standard-input%2Foutput-stdio">Standard I/O</h1>
            <h2 id="encoded%2Banchor">Encoded Plus</h2>
            </body>
            </html>
            """
        elif self.path == '/various-encoded-chars':
            self.send_response(200, 'OK')
            content = """
            <!DOCTYPE html>
            <html>
            <head><title>Various Encoded Characters</title></head>
            <body>
            <h1 id="encoded%21exclamation">Encoded Exclamation</h1>
            <h2 id="encoded%23hash">Encoded Hash</h2>
            <h3 id="encoded%25percent">Encoded Percent</h3>
            <h4 id="encoded%26ampersand">Encoded Ampersand</h4>
            <h5 id="encoded%3Fquestion">Encoded Question</h5>
            <h6 id="encoded%40at">Encoded At</h6>
            </body>
            </html>
            """
        else:
            self.send_response(404, 'Not Found')
            content = 'not found\n'
        self.send_header('Transfer-Encoding', 'chunked')
        self.end_headers()
        if not self._send_chunked(content):
            # The client disconnected mid-response; do not wait for another
            # request on this persistent (HTTP/1.1) connection.
            self.close_connection = True
| 104 | + |
| 105 | + |
@pytest.mark.sphinx(
    'linkcheck',
    testroot='linkcheck-encoded-anchors',
    freshenv=True,
)
def test_encoded_anchors_handling(app: SphinxTestApp, tmp_path: Any) -> None:
    """Test that linkcheck correctly handles URLs with encoded anchors.

    A document is written whose links use literal characters in their
    fragments (``/``, ``+``, ``!``, ...) while the served pages declare the
    matching element IDs in percent-encoded form.  After the build, every
    such link recorded in ``output.json`` must have the status ``working``.
    """
    with serve_application(app, EncodedAnchorsHandler) as address:
        # Create test file with encoded anchor links using the server address
        (app.srcdir / 'encoded_anchors.rst').write_text(
            f"""
Encoded Anchors Test
====================

Links with encoded anchors:

* `Standard I/O <http://{address}/standard-encoded-anchors#standard-input/output-stdio>`_
* `Encoded Plus <http://{address}/standard-encoded-anchors#encoded+anchor>`_
* `Encoded Exclamation <http://{address}/various-encoded-chars#encoded!exclamation>`_
* `Encoded Hash <http://{address}/various-encoded-chars#encoded#hash>`_
* `Encoded Percent <http://{address}/various-encoded-chars#encoded%percent>`_
* `Encoded Ampersand <http://{address}/various-encoded-chars#encoded&ampersand>`_
* `Encoded Question <http://{address}/various-encoded-chars#encoded?question>`_
* `Encoded At <http://{address}/various-encoded-chars#encoded@at>`_
""",
            encoding='utf-8',
        )

        app.build()

    # Parse the JSON output (one JSON record per line) to check the results
    content = (app.outdir / 'output.json').read_text(encoding='utf-8')
    data = [json.loads(record) for record in content.splitlines()]

    # Filter for our encoded anchor URLs
    encoded_anchor_results = [
        record
        for record in data
        if any(
            x in record['uri']
            for x in ['standard-encoded-anchors#', 'various-encoded-chars#']
        )
    ]

    # Guard against a vacuous pass: all() is true for an empty list
    assert encoded_anchor_results, 'no encoded-anchor links found in output.json'

    # All links with encoded anchors should be working
    assert all(record['status'] == 'working' for record in encoded_anchor_results)

    # Verify specific links
    uri_pattern = re.compile(
        f'http://{re.escape(address)}/standard-encoded-anchors#standard-input/output-stdio'
    )
    stdio_link = next(
        (
            record
            for record in encoded_anchor_results
            if uri_pattern.match(record['uri'])
        ),
        None,
    )
    assert stdio_link is not None, 'standard I/O link missing from output.json'
    assert stdio_link['status'] == 'working'

    # Check for encoded plus link
    plus_pattern = re.compile(
        f'http://{re.escape(address)}/standard-encoded-anchors#encoded\\+anchor'
    )
    plus_link = next(
        (
            record
            for record in encoded_anchor_results
            if plus_pattern.match(record['uri'])
        ),
        None,
    )
    assert plus_link is not None, 'encoded plus link missing from output.json'
    assert plus_link['status'] == 'working'
0 commit comments