From 0869a713f21f4b2fe021d802cf18f1b1af53695f Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Mon, 1 Feb 2021 12:52:52 -0800 Subject: bpo-41748: Handles unquoted attributes with commas (GH-24072) * bpo-41748: Adds tests for unquoted attributes with comma * bpo-41748: Handles unquoted attributes with comma * bpo-41748: Addresses review comments * bpo-41748: Addresses review comments * Adds more test cases * Simplifies the regex for handling spaces * bpo-41748: Moves attributes tests under the right class * bpo-41748: Addresses review about duplicate attributes * bpo-41748: Adds NEWS.d entry for this patch (cherry picked from commit 9eb11a139fac5514d8456626806a68b3e3b7eafb) Co-authored-by: Karl Dubost --- Lib/html/parser.py | 2 +- Lib/test/test_htmlparser.py | 92 +++++++++++++--------- .../2021-01-05-21-26-29.bpo-41748.KdC0w3.rst | 2 + 3 files changed, 59 insertions(+), 37 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index de81879a63..d19684ed11 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -47,7 +47,7 @@ |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) - (?:\s*,)* # possibly followed by a comma + \s* # possibly followed by a space )?(?:\s|/(?!>))* )* )? diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 326e34290f..d0dc67d917 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -452,42 +452,6 @@ def test_illegal_declarations(self): self._run_check('', [('comment', 'spacer type="block" height="25"')]) - def test_with_unquoted_attributes(self): - # see #12008 - html = ("" - "" - "
" - "- software-and-i" - "- library
") - expected = [ - ('starttag', 'html', []), - ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]), - ('starttag', 'table', - [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]), - ('starttag', 'tr', []), - ('starttag', 'td', [('align', 'left')]), - ('starttag', 'font', [('size', '-1')]), - ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]), - ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'), - ('endtag', 'span'), ('endtag', 'a'), - ('data', '- '), ('starttag', 'a', [('href', '/1/')]), - ('starttag', 'span', [('class', 'en')]), ('data', ' library'), - ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table') - ] - self._run_check(html, expected) - - def test_comma_between_attributes(self): - self._run_check('
', [ - ('starttag', 'form', - [('action', '/xxx.php?a=1&b=2&'), - (',', None), ('method', 'post')])]) - - def test_weird_chars_in_unquoted_attribute_values(self): - self._run_check('', [ - ('starttag', 'form', - [('action', 'bogus|&#()value')])]) - def test_invalid_end_tags(self): # A collection of broken end tags.
is used as separator. # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state @@ -773,6 +737,62 @@ def test_end_tag_in_attribute_value(self): [("href", "http://www.example.org/\">;")]), ("data", "spam"), ("endtag", "a")]) + def test_with_unquoted_attributes(self): + # see #12008 + html = ("" + "" + "
" + "- software-and-i" + "- library
") + expected = [ + ('starttag', 'html', []), + ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]), + ('starttag', 'table', + [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]), + ('starttag', 'tr', []), + ('starttag', 'td', [('align', 'left')]), + ('starttag', 'font', [('size', '-1')]), + ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]), + ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'), + ('endtag', 'span'), ('endtag', 'a'), + ('data', '- '), ('starttag', 'a', [('href', '/1/')]), + ('starttag', 'span', [('class', 'en')]), ('data', ' library'), + ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table') + ] + self._run_check(html, expected) + + def test_comma_between_attributes(self): + # see bpo 41478 + # HTMLParser preserves duplicate attributes, leaving the task of + # removing duplicate attributes to a conformant html tree builder + html = ('
' # between attrs (unquoted) + '
' # between attrs (quoted) + '
' # after values (unquoted) + '
' # after values (quoted) + '
' # one comma values (quoted) + '
' # before values (unquoted) + '
' # before values (quoted) + '
' # before names + '
' # after names + ) + expected = [ + ('starttag', 'div', [('class', 'bar,baz=asd'),]), + ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]), + ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]), + ('starttag', 'div', [('class', 'bar'), (',', None), + ('baz', 'asd'), (',', None)]), + ('starttag', 'div', [('class', 'bar'), (',', None)]), + ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]), + ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]), + ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]), + ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]), + ] + self._run_check(html, expected) + + def test_weird_chars_in_unquoted_attribute_values(self): + self._run_check('', [ + ('starttag', 'form', + [('action', 'bogus|&#()value')])]) if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst new file mode 100644 index 0000000000..52efa3ac3d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst @@ -0,0 +1,2 @@ +Fix HTMLParser parsing rules for element attributes containing +commas with spaces. Patch by Karl Dubost. \ No newline at end of file -- cgit v1.2.3-65-gdbad