public inbox for git-commits@fedoraproject.org
help / color / mirror / Atom feed
To: git-commits@fedoraproject.org
Subject: [rpms/python-beautifulsoup4] f44: 4.15.0
Date: Mon, 29 Jun 2026 10:19:57 GMT [thread overview]
Message-ID: <178272839734.1.1243970675572525330.rpms-python-beautifulsoup4-f8c94d3ffbce@fedoraproject.org> (raw)
A new commit has been pushed.
Repo : rpms/python-beautifulsoup4
Branch : f44
Commit : f8c94d3ffbce42a90000c2f098c23ea9949bf636
Author : Terje Røsten <terjeros@gmail.com>
Date : 2026-06-29T12:14:59+02:00
Stats : +33/-251 in 5 file(s)
URL : https://src.fedoraproject.org/rpms/python-beautifulsoup4/c/f8c94d3ffbce42a90000c2f098c23ea9949bf636?branch=f44
Log:
4.15.0
---
diff --git a/0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch b/0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch
deleted file mode 100644
index aeae1ff..0000000
--- a/0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch
+++ /dev/null
@@ -1,162 +0,0 @@
-From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001
-From: Leonard Richardson <leonardr@segfault.org>
-Date: Mon, 8 Dec 2025 19:34:16 -0500
-Subject: [PATCH] * Change the html.parser tree builder's code for handling
- numeric character references, to avoid a crash when using Python versions
- that include the fix to Python issue https://bugs.python.org/issue13633
- (e.g. Python 3.11.13). [bug=2134393]
-
----
- CHANGELOG | 5 +++
- bs4/builder/_htmlparser.py | 78 +++++++++++++++++++++++++++++-------
- bs4/tests/test_htmlparser.py | 17 ++++++++
- 3 files changed, 86 insertions(+), 14 deletions(-)
-
-diff --git a/CHANGELOG b/CHANGELOG
-index f61b7e9..606e9f5 100644
---- a/CHANGELOG
-+++ b/CHANGELOG
-@@ -1,5 +1,10 @@
- = Unreleased
-
-+* Change the html.parser tree builder's code for handling numeric
-+ character references, to avoid a crash when using Python versions
-+ that include the fix to Python issue https://bugs.python.org/issue13633
-+ (e.g. Python 3.11.13). [bug=2134393]
-+
- * Skip the lxml tree builder's test_surrogate_in_character_reference test
- if the libxml2 version is less than 2.13.0. Prior versions of libxml2
- don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
-diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
-index 165a3d8..ead800f 100644
---- a/bs4/builder/_htmlparser.py
-+++ b/bs4/builder/_htmlparser.py
-@@ -10,6 +10,7 @@ __all__ = [
- ]
-
- from html.parser import HTMLParser
-+import re
-
- from typing import (
- Any,
-@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
- """Handle some textual data that shows up between tags."""
- self.soup.handle_data(data)
-
-+ _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)")
-+ _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)")
-+
-+ @classmethod
-+ def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]:
-+ """Convert a numeric character reference into an actual character.
-+
-+ :param name: The number of the character reference, as
-+ obtained by html.parser
-+
-+ :return: A 3-tuple (dereferenced, replacement_added,
-+ extra_data). `dereferenced` is the dereferenced character
-+ reference, or the empty string if there was no
-+ reference. `replacement_added` is True if the reference
-+ could only be dereferenced by replacing content with U+FFFD
-+ REPLACEMENT CHARACTER. `extra_data` is a portion of data
-+ following the character reference, which was deemed to be
-+ normal data and not part of the reference at all.
-+ """
-+ dereferenced:str = ""
-+ replacement_added:bool = False
-+ extra_data:str = ""
-+
-+ base:int = 10
-+ reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
-+ if name.startswith("x") or name.startswith("X"):
-+ # Hex reference
-+ name = name[1:]
-+ base = 16
-+ reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA
-+
-+ real_name:Optional[int] = None
-+ try:
-+ real_name = int(name, base)
-+ except ValueError:
-+ # This is either bad data that starts with what looks like
-+ # a numeric character reference, or a real numeric
-+ # reference that wasn't terminated by a semicolon.
-+ #
-+ # The fix to https://bugs.python.org/issue13633 made it
-+ # our responsibility to handle the extra data.
-+ #
-+ # To preserve the old behavior, we extract the numeric
-+ # portion of the incoming "reference" and treat that as a
-+ # numeric reference. All subsequent data will be processed
-+ # as string data.
-+ match = reg.search(name)
-+ if match is not None:
-+ real_name = int(match.groups()[0], base)
-+ extra_data = match.groups()[1]
-+
-+ if real_name is None:
-+ dereferenced = ""
-+ extra_data = name
-+ else:
-+ dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
-+ return dereferenced, replacement_added, extra_data
-+
- def handle_charref(self, name: str) -> None:
- """Handle a numeric character reference by converting it to the
- corresponding Unicode character and treating it as textual
-@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
-
- :param name: Character number, possibly in hexadecimal.
- """
-- # TODO: This was originally a workaround for a bug in
-- # HTMLParser. (http://bugs.python.org/issue13633) The bug has
-- # been fixed, but removing this code still makes some
-- # Beautiful Soup tests fail. This needs investigation.
-- real_name:int
-- if name.startswith("x"):
-- real_name = int(name.lstrip("x"), 16)
-- elif name.startswith("X"):
-- real_name = int(name.lstrip("X"), 16)
-- else:
-- real_name = int(name)
--
-- data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
-+ dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name)
- if replacement_added:
- self.soup.contains_replacement_characters = True
-- self.handle_data(data)
-+ if dereferenced is not None:
-+ self.handle_data(dereferenced)
-+ if extra_data is not None:
-+ self.handle_data(extra_data)
-
- def handle_entityref(self, name: str) -> None:
- """Handle a named entity reference by converting it to the
-diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
-index 0086a9d..cb85b53 100644
---- a/bs4/tests/test_htmlparser.py
-+++ b/bs4/tests/test_htmlparser.py
-@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
- # Since we do the replacement ourselves, we can set contains_replacement_characters appropriately.
- # lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER.
- assert soup.contains_replacement_characters == True
-+
-+class TestBeautifulSoupHTMLParser:
-+ def test_dereference_numeric_character_reference(self):
-+ m = BeautifulSoupHTMLParser._dereference_numeric_character_reference
-+ assert m("64") == ("@", False, "")
-+ assert m("x64") == ("d", False, "")
-+ assert m("X64") == ("d", False, "")
-+ assert m("64andsomeextra") == ("@", False, "andsomeextra")
-+ assert m("") == ("", False, "")
-+ assert m("00whee") == ("�", True, "whee")
-+ assert m("xfffdthatsit") == ("�", False, "thatsit")
-+ assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra")
-+ assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric")
-+
-+ # These are almost certainly wrong but at least it doesn't crash.
-+ assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra")
-+ assert m("xffffffffffffffffffffffbeep") == ("�", True, "p")
---
-2.52.0
-
diff --git a/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch b/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
deleted file mode 100644
index 415e736..0000000
--- a/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
+++ /dev/null
@@ -1,83 +0,0 @@
-From ec4a722af07341c4aa3fe604b077a1f773c6fdd2 Mon Sep 17 00:00:00 2001
-From: Leonard Richardson <leonardr@segfault.org>
-Date: Sun, 7 Dec 2025 13:10:42 -0500
-Subject: [PATCH] * Skip the lxml tree builder's
- test_surrogate_in_character_reference test if the libxml2 version is less
- than 2.13.0. Prior versions of libxml2 don't issue the REPLACEMENT
- CHARACTER we're expecting. [bug=2134346]
-
----
- CHANGELOG | 6 ++++++
- bs4/tests/test_lxml.py | 14 +++++++++++++-
- tox.ini | 2 +-
- 3 files changed, 20 insertions(+), 2 deletions(-)
-
-diff --git a/CHANGELOG b/CHANGELOG
-index 544f128..f61b7e9 100644
---- a/CHANGELOG
-+++ b/CHANGELOG
-@@ -1,3 +1,9 @@
-+= Unreleased
-+
-+* Skip the lxml tree builder's test_surrogate_in_character_reference test
-+ if the libxml2 version is less than 2.13.0. Prior versions of libxml2
-+ don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
-+
- = 4.14.3 (20251130)
-
- * When using one of the lxml tree builders, you can pass in
-diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
-index 0b69956..aa82143 100644
---- a/bs4/tests/test_lxml.py
-+++ b/bs4/tests/test_lxml.py
-@@ -7,6 +7,7 @@ from . import LXML_PRESENT, LXML_VERSION
-
- if LXML_PRESENT:
- from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML
-+ from lxml import etree
-
- from bs4 import (
- BeautifulStoneSoup,
-@@ -47,7 +48,6 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
-
- # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
- # test if an old version of lxml is installed.
--
- @pytest.mark.skipif(
- not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
- reason="Skipping doctype test for old version of lxml to avoid segfault.",
-@@ -57,6 +57,18 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
- doctype = soup.contents[0]
- assert "" == doctype.strip()
-
-+ # This is a copy of the HTMLTreeBuilderSmokeTest implementation.
-+ # For lxml only, we need to skip the test if the libxml2 version doesn't
-+ # have the fix from https://gitlab.gnome.org/GNOME/libxml2/-/commit/4dcc2d743eb83b8aaec0d91660d615fdb024dad0. That means any pre-2.13 version.
-+ @pytest.mark.skipif(
-+ "etree.LIBXML_VERSION < (2, 14, 0)",
-+ reason="libxml version doesn't issue REPLACEMENT CHARACTER",
-+ )
-+ def test_surrogate_in_character_reference(self):
-+ # These character references are invalid and should be replaced with REPLACEMENT CHARACTER.
-+ soup = self.soup("<html><body>��</body></html>")
-+ assert soup.body.contents == ['��']
-+
- def test_beautifulstonesoup_is_xml_parser(self):
- # Make sure that the deprecated BSS class uses an xml builder
- # if one is installed.
-diff --git a/tox.ini b/tox.ini
-index c53e4d8..c60c3e7 100644
---- a/tox.ini
-+++ b/tox.ini
-@@ -2,7 +2,7 @@
- # encoding autodetection libraries: cchardet, chardet, and charset-normalizer
- [tox]
- env_list =
-- py{37, 38, 39, 310, 311, 312, 313},bare,docs
-+ py{37, 38, 39, 310, 311, 312, 313, 314},bare,docs
- minversion = 3.28.0
- skip_missing_interpreters = true
-
---
-2.52.0
-
diff --git a/0001-libxml2-seems-to-have-reverted-changes-done-in-2.13..patch b/0001-libxml2-seems-to-have-reverted-changes-done-in-2.13..patch
new file mode 100644
index 0000000..24e03f5
--- /dev/null
+++ b/0001-libxml2-seems-to-have-reverted-changes-done-in-2.13..patch
@@ -0,0 +1,25 @@
+From 09b7a9f419df84f2913b0f087957abd8a47133da Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Terje=20R=C3=B8sten?= <terjeros@gmail.com>
+Date: Mon, 29 Jun 2026 12:05:12 +0200
+Subject: [PATCH] libxml2 seems to have reverted changes done in 2.13.0 version
+
+---
+ bs4/tests/test_lxml.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
+index 128b391..60c6b85 100644
+--- a/bs4/tests/test_lxml.py
++++ b/bs4/tests/test_lxml.py
+@@ -61,7 +61,7 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
+ # For lxml only, we need to skip the test if the libxml2 version doesn't
+ # have the fix from https://gitlab.gnome.org/GNOME/libxml2/-/commit/4dcc2d743eb83b8aaec0d91660d615fdb024dad0. That means any pre-2.13 version.
+ @pytest.mark.skipif(
+- "not LXML_PRESENT or etree.LIBXML_VERSION < (2, 13, 0)",
++ "not LXML_PRESENT or etree.LIBXML_VERSION < (2, 14, 0)",
+ reason="libxml version doesn't issue REPLACEMENT CHARACTER",
+ )
+ def test_surrogate_in_character_reference(self):
+--
+2.54.0
+
diff --git a/python-beautifulsoup4.spec b/python-beautifulsoup4.spec
index 4fa87c5..60181e6 100644
--- a/python-beautifulsoup4.spec
+++ b/python-beautifulsoup4.spec
@@ -8,15 +8,14 @@
%endif
Name: python-beautifulsoup4
-Version: 4.14.3
-Release: 4%{?dist}
+Version: 4.15.0
+Release: 1%{?dist}
Summary: HTML/XML parser for quick-turnaround applications like screen-scraping
License: MIT
URL: http://www.crummy.com/software/BeautifulSoup/
Source0: https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-%{version}.tar.gz
-# Patches from upstream
-Patch0: 0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
-Patch1: 0001-Change-the-html.parser-tree-builder-s-code-for-handl.patch
+# libxml2 seems to have reverted the changes made in 2.13.0
+Patch0: 0001-libxml2-seems-to-have-reverted-changes-done-in-2.13..patch
# https://git.launchpad.net/beautifulsoup/commit/?id=9786a62726de5a8caba10021c4d4a58c8a3e9e3f
Patch11: beautifulsoup4-4.14-disable-soupsieve.patch
BuildArch: noarch
@@ -93,6 +92,9 @@ Obsoletes: python3-BeautifulSoup < 1:3.2.1-2
%{python3_sitelib}/bs4
%changelog
+* Mon Jun 29 2026 Terje Rosten <terjeros@gmail.com> - 4.15.0-1
+- 4.15.0
+
* Thu Jun 04 2026 Python Maint <python-maint@redhat.com> - 4.14.3-4
- Rebuilt for Python 3.15
diff --git a/sources b/sources
index 6c42086..66f0bef 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-SHA512 (beautifulsoup4-4.14.3.tar.gz) = 5c535534045cac033112b7c060539d6b5a748620df9d15fb4c719708cde263ac506f3734a39156681633942543dc555af51c619dae430311a8889204d8ee325e
+SHA512 (beautifulsoup4-4.15.0.tar.gz) = ccf07cac23ca52f50802b3d6271b42c1367022e2bc409b1be2924bbbd274cebce3b5afde1c1c4f253802b7a58f907cb60a29d347ae2765c33ce85bec9dcc473c
reply other threads:[~2026-06-29 10:19 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=178272839734.1.1243970675572525330.rpms-python-beautifulsoup4-f8c94d3ffbce@fedoraproject.org \
--to=git-commits@fedoraproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox