From df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2 Mon Sep 17 00:00:00 2001
From: Carl Friedrich Bolz-Tereick
Date: Tue, 2 Mar 2021 20:37:27 +0100
Subject: some ascii fast paths of latin-1 encoding/decoding

---
Review notes (ignored when the patch is applied; not part of the commit):

* utf8_encode_latin_1: when check_ascii() raises CheckError, the position of
  the first non-ASCII byte (e.pos) is now passed to the slow path, which
  copies the ASCII prefix with append_slice() and starts its loop there
  instead of at 0.
* encode_object: adds a shortcut that returns the stored utf8 bytes directly
  for "latin1"/"latin-1" when the object is an ASCII-only W_UnicodeObject.
  The two encoding-name comparisons are parenthesised here: `and` binds
  tighter than `or`, so without the parentheses every call with
  encoding == "latin1" took the shortcut without the isinstance/is_ascii
  checks. The index line of that file still carries the original blob ids.
* test_encode_decimal context line: the expected value is the ASCII
  character reference for U+1234; it had been entity-decoded into the
  literal character in the copy this patch was recovered from.
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy using the hunk headers and diffstat; verify context-line indentation
  against the tree before applying.

 pypy/interpreter/test/test_unicodehelper.py  | 7 +++++++
 pypy/interpreter/unicodehelper.py            | 9 +++++----
 pypy/objspace/std/test/test_unicodeobject.py | 8 ++++++++
 pypy/objspace/std/unicodeobject.py           | 3 +++
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
index 4d849cd9cf..34e08da5ac 100644
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -11,6 +11,7 @@ from rpython.rlib import rutf8
 
 from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+from pypy.interpreter.unicodehelper import utf8_encode_latin_1
 from pypy.interpreter import unicodehelper as uh
 from pypy.module._codecs.interp_codecs import CodecState
 
@@ -91,3 +92,9 @@ def test_encode_decimal(space):
     result = uh.unicode_encode_decimal(
         u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
     assert result == '12&#4660;'
+
+def test_utf8_encode_latin1_ascii_prefix():
+    utf8 = b'abcde\xc3\xa4g'
+    b = utf8_encode_latin_1(utf8, None, None)
+    assert b == b'abcde\xe4g'
+
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
index 34fde1c874..d17ccb767b 100644
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -155,14 +155,15 @@ def utf8_encode_latin_1(s, errors, errorhandler):
     try:
         rutf8.check_ascii(s)
         return s
-    except rutf8.CheckError:
-        return _utf8_encode_latin_1_slowpath(s, errors, errorhandler)
+    except rutf8.CheckError, e:
+        return _utf8_encode_latin_1_slowpath(s, e.pos, errors, errorhandler)
 
-def _utf8_encode_latin_1_slowpath(s, errors, errorhandler):
+def _utf8_encode_latin_1_slowpath(s, first_non_ascii_char, errors, errorhandler):
     size = len(s)
     result = StringBuilder(size)
     index = 0
-    pos = 0
+    result.append_slice(s, 0, first_non_ascii_char)
+    pos = first_non_ascii_char
     while pos < size:
         ch = rutf8.codepoint_at_pos(s, pos)
         if ch <= 0xFF:
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
index e8763dc496..7f88ed9721 100644
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -217,6 +217,14 @@ class TestUnicodeObject:
             uniupper, = unicodedb.toupper_full(ch)
             assert chr(uniupper) == chr(ch).upper()
 
+    def test_latin1_encode_shortcut_ascii(self, monkeypatch):
+        from rpython.rlib import rutf8
+        from pypy.objspace.std.unicodeobject import encode_object
+        monkeypatch.setattr(rutf8, "check_ascii", None)
+        w_b = encode_object(self.space, self.space.newutf8("abc", 3), "latin-1", "strict")
+        assert self.space.bytes_w(w_b) == "abc"
+
+
 class AppTestUnicodeStringStdOnly:
     def test_compares(self):
         assert u'a' == 'a'
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
index 0be4a9e55c..16edebfb03 100644
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1216,6 +1216,9 @@ def encode_object(space, w_obj, encoding, errors):
             if rutf8.has_surrogates(utf8):
                 utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
             return space.newbytes(utf8)
+        if ((encoding == "latin1" or encoding == "latin-1") and
+                isinstance(w_obj, W_UnicodeObject) and w_obj.is_ascii()):
+            return space.newbytes(w_obj._utf8)
     return encode(space, w_obj, encoding, errors)
 
 
-- 
cgit v1.2.3-65-gdbad