Skip to content

Removes django-html-sanitizer django-braces and bleach #598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions events/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
from django.conf import settings
from django_summernote.widgets import SummernoteInplaceWidget
from django.utils.translation import gettext_lazy as _

from sanitizer.forms import SanitizedCharField

from .html_sanitizer import sanitize_html
from .models import Event, EventParticipation
from .mixins import CrispyFormMixin, ReadOnlyFieldsMixin

Expand All @@ -21,11 +19,11 @@

class EventForm(CrispyFormMixin):

description = SanitizedCharField(
allowed_tags=settings.ALLOWED_HTML_TAGS_INPUT,
allowed_attributes=settings.ALLOWED_HTML_ATTRIBUTES_INPUT,
allowed_styles=settings.ALLOWED_HTML_STYLES_INPUT,
strip=False, widget=SummernoteInplaceWidget())
description = forms.CharField(
widget=SummernoteInplaceWidget(),
required=False,
label=_("Descripción")
)

start_at = forms.SplitDateTimeField(
required=True,
Expand Down Expand Up @@ -72,15 +70,30 @@ class Meta:
'has_sponsors': HAS_SPONSORS_HELP_TEXT,
}

def clean_description(self):
"""
Clean and sanitize the 'description' field using the allowed tags/attrs/styles
from Django settings.
"""
raw_description = self.cleaned_data.get('description', '')
safe_html = sanitize_html(
html=raw_description,
allowed_tags=settings.ALLOWED_HTML_TAGS_INPUT,
allowed_attrs=settings.ALLOWED_HTML_ATTRIBUTES_INPUT,
allowed_styles=settings.ALLOWED_HTML_STYLES_INPUT
)
return safe_html

def clean(self):
cleaned_data = super().clean()
start_at = cleaned_data.get('start_at')
end_at = cleaned_data.get('end_at')
if start_at is not None and end_at is not None:
if start_at > end_at:
msg = 'La fecha de inicio es menor a la fecha de finalizacion'
self._errors['start_at'] = [_(msg)]
self._errors['end_at'] = [_(msg)]

if start_at and end_at and start_at > end_at:
msg = 'La fecha de inicio es menor a la fecha de finalizacion'
self._errors['start_at'] = [_(msg)]
self._errors['end_at'] = [_(msg)]

return cleaned_data

def save(self, *args, **kwargs):
Expand Down
107 changes: 107 additions & 0 deletions events/html_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from bs4 import BeautifulSoup

from urllib.parse import urlparse

from django.conf import settings


def is_safe_url(url: str, attr: str) -> bool:
"""
Check if the URL uses an allowed scheme for the given attribute.

Args:
url (str): The URL to validate.
attr (str): The attribute name (e.g., 'href', 'src').

Returns:
bool: True if the URL is safe, False otherwise.
"""
parsed = urlparse(url)
if parsed.scheme == "":
# URLs without a scheme are considered safe (relative URLs)
return True
allowed_schemes = settings.ALLOWED_URL_SCHEMES.get(attr, [])
return parsed.scheme.lower() in allowed_schemes


def remove_disallowed_tags(soup: BeautifulSoup, allowed_tags: list[str]) -> None:
"""
Remove any tags that are not present in allowed_tags.
"""
for tag in soup.find_all():
if tag.name not in allowed_tags:
tag.decompose()


def filter_style_attribute(style_value: str, allowed_styles: list[str]) -> str:
"""
Take a CSS style string (e.g. "color: red; font-weight: bold;") and
return a sanitized version containing only the allowed properties.
Returns an empty string if no allowed properties remain.
"""
filtered_style_pairs = []
for prop_pair in style_value.split(";"):
prop_pair = prop_pair.strip()
if not prop_pair:
continue
if ":" not in prop_pair:
continue

prop_name, prop_value = prop_pair.split(":", 1)
prop_name = prop_name.strip().lower()
prop_value = prop_value.strip()

if prop_name in allowed_styles:
filtered_style_pairs.append(f"{prop_name}: {prop_value}")

return "; ".join(filtered_style_pairs)


def filter_attributes(
soup: BeautifulSoup, allowed_attrs: list[str], allowed_styles: list[str]
) -> None:
"""
For each remaining (allowed) tag in the soup, remove any attribute
not in allowed_attrs. If the attribute is 'style', filter out
disallowed CSS properties.
"""
for tag in soup.find_all():
for attr_name in list(tag.attrs):
# If attribute name is not allowed, remove it
if attr_name not in allowed_attrs:
del tag.attrs[attr_name]
# If it's a style attribute, apply style filtering
elif attr_name == "style":
style_value = tag.attrs["style"]
safe_style = filter_style_attribute(style_value, allowed_styles)
if safe_style:
tag.attrs["style"] = safe_style
else:
# If no allowed properties remain, remove the style attribute
del tag.attrs["style"]
elif attr_name in settings.ALLOWED_URL_SCHEMES:
# Validate URL schemes for attributes like 'href' and 'src'
url = tag.attrs[attr_name]
if not is_safe_url(url, attr_name):
del tag.attrs[attr_name]


def sanitize_html(
html: str,
allowed_tags: list[str],
allowed_attrs: list[str],
allowed_styles: list[str],
) -> str:
"""
Main function that orchestrates the sanitization process.
"""
soup = BeautifulSoup(html, "html.parser")

# 1. Remove disallowed tags entirely
remove_disallowed_tags(soup, allowed_tags)

# 2. Filter attributes (including style) on the remaining tags
filter_attributes(soup, allowed_attrs, allowed_styles)

# Return the resulting sanitized HTML
return str(soup)
182 changes: 182 additions & 0 deletions events/tests/test_html_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
from django.test import TestCase, override_settings
from events.html_sanitizer import sanitize_html
from django.conf import settings


class SanitizeHTMLTests(TestCase):
def setUp(self):
# Define allowed tags, attributes, and styles from settings
self.allowed_tags = settings.ALLOWED_HTML_TAGS_INPUT
self.allowed_attrs = settings.ALLOWED_HTML_ATTRIBUTES_INPUT
self.allowed_styles = settings.ALLOWED_HTML_STYLES_INPUT

def test_allowed_tags_preserved(self):
"""Ensure that allowed tags are preserved in the output."""
input_html = "<p>This is a <strong>test</strong> paragraph.</p>"
expected_output = "<p>This is a <strong>test</strong> paragraph.</p>"
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_disallowed_tags_removed(self):
"""Ensure that disallowed tags are removed from the output."""
input_html = "<p>This is a <script>alert('XSS');</script> test.</p>"
expected_output = "<p>This is a test.</p>"
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_allowed_attributes_preserved(self):
"""Ensure that allowed attributes are preserved."""
input_html = '<a href="https://example.com" title="Example">Link</a>'
expected_output = '<a href="https://example.com" title="Example">Link</a>'
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_disallowed_attributes_removed(self):
"""Ensure that disallowed attributes are removed."""
input_html = '<a href="https://example.com" onclick="alert(\'XSS\')">Link</a>'
expected_output = '<a href="https://example.com">Link</a>'
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_allowed_styles_preserved(self):
"""Ensure that allowed CSS styles are preserved."""
input_html = '<p style="color: red; font-weight: bold;">Styled text</p>'
expected_output = '<p style="color: red; font-weight: bold">Styled text</p>'
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_malformed_html(self):
"""Ensure that malformed HTML is handled gracefully."""
input_html = "<p>This is <b>bold<i> and italic</p>"
expected_output = "<p>This is <b>bold<i> and italic</i></b></p>"
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_escape_entities(self):
"""Ensure that HTML entities are preserved."""
input_html = "5 &lt; 10 &amp; 10 &gt; 5"
expected_output = "5 &lt; 10 &amp; 10 &gt; 5" # If using convert_charrefs=False
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_href_sanitization(self):
"""Ensure that href attributes do not contain 'javascript:'."""
input_html = "<a href=\"javascript:alert('XSS')\">Bad Link</a>"
expected_output = "<a>Bad Link</a>" # 'href' removed due to unsafe scheme
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_style_with_malicious_content(self):
"""Ensure that style attributes do not contain malicious content."""
input_html = (
"<p style=\"color: red; background-image: url('javascript:alert(1)');\">Test</p>"
)
expected_output = '<p style="color: red">Test</p>'

sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_completely_malicious_input(self):
"""Ensure that completely malicious input is sanitized appropriately."""
input_html = '<img src="x" onerror="alert(1)" /><script>alert("XSS")</script>'
expected_output = '<img src="x"/>'

sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_preserve_allowed_and_remove_disallowed_mixed(self):
"""Ensure that allowed and disallowed elements are correctly handled when mixed."""
input_html = """
<p>This is a <strong>strong</strong> and <em>emphasized</em> text.</p>
<script>alert('XSS');</script>
<a href="https://example.com" onclick="stealCookies()">Example Link</a>
<div style="color: green; font-size: 12px;">Div content</div>
"""
expected_output = """
<p>This is a <strong>strong</strong> and <em>emphasized</em> text.</p>

<a href="https://example.com">Example Link</a>

<div style="color: green; font-size: 12px">Div content</div>
""".strip()

sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)

# Normalize spaces:
sanitized = " ".join(sanitized.split())
expected_output = " ".join(expected_output.split())

self.assertEqual(sanitized, expected_output)

def test_allow_links_with_safe_href(self):
"""Ensure that links with safe href attributes are preserved."""
input_html = '<a href="https://example.com" title="Example">Visit Example</a>'
expected_output = (
'<a href="https://example.com" title="Example">Visit Example</a>'
)
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_allow_empty_style_attribute(self):
"""Ensure that empty style attributes are removed."""
input_html = '<p style="">No styles</p>'
expected_output = "<p>No styles</p>"
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

def test_preserve_text_only(self):
"""Ensure that plain text without HTML remains unchanged."""
input_html = "Just plain text without HTML."
expected_output = "Just plain text without HTML."
sanitized = sanitize_html(
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
)
self.assertEqual(sanitized, expected_output)

@override_settings(
ALLOWED_HTML_TAGS_INPUT=["p", "a"],
ALLOWED_HTML_ATTRIBUTES_INPUT=["href"],
ALLOWED_HTML_STYLES_INPUT=[],
)
def test_dynamic_allowed_tags(self):
"""Ensure that sanitizer uses dynamically overridden settings."""
input_html = (
"<p>Paragraph with "
'<a href="https://example.com" title="Example">link</a> '
"and <b>bold</b>.</p>"
)
# With settings overridden, 'a' allows 'href' only, 'b' is disallowed
expected = '<p>Paragraph with <a href="https://example.com">link</a> and .</p>'
sanitized = sanitize_html(
html=input_html,
allowed_tags=settings.ALLOWED_HTML_TAGS_INPUT,
allowed_attrs=settings.ALLOWED_HTML_ATTRIBUTES_INPUT,
allowed_styles=settings.ALLOWED_HTML_STYLES_INPUT,
)
self.assertEqual(sanitized, expected)
4 changes: 1 addition & 3 deletions events/tests/test_views.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import bleach

from django.test import TestCase, Client
from django.urls import reverse

Expand Down Expand Up @@ -89,7 +87,7 @@ def test_html_sanitizer_in_description_field(self):
self.assertEqual(response.status_code, 302)
self.assertEqual(Event.objects.filter(name='PyDay San Rafael').count(), 1)
event = Event.objects.filter(name='PyDay San Rafael').get()
self.assertEqual(event.description, bleach.clean('an <script>evil()</script> example'))
self.assertEqual(event.description, ('an example'))

def test_events_view_delete(self):
event = EventFactory(owner=self.user)
Expand Down
Loading
Loading