Merge pull request #13 from trufont/use-pep393-unicode-api

remove use of legacy Py_UNICODE; use PEP393 unicode strings
harfbuzz · Dec 13, 2018 · 395e256 · 395e256
2 parents 57ac02c + 5f7d109
commit 395e256
Show file tree

Hide file tree

Showing 6 changed files with 236 additions and 31 deletions.
diff --git a/setup.py b/setup.py
@@ -31,6 +31,7 @@
     zip_safe=False,
     setup_requires=["setuptools_scm"],
     cmake_args=cmake_args,
+    python_requires=">=3.5",
 )
 
 

diff --git a/src/uharfbuzz/_harfbuzz.pyx b/src/uharfbuzz/_harfbuzz.pyx
@@ -1,13 +1,24 @@
 #cython: language_level=3
 from charfbuzz cimport *
-from cpython.unicode cimport PyUnicode_AS_UNICODE, PyUnicode_GET_SIZE
-from libc.stdint cimport uint16_t, uint32_t
 from libc.stdlib cimport free, malloc
 from libc.string cimport const_char
 from typing import Callable, Dict, List, Tuple
 
 
-cdef bint PY_NARROW_UNICODE = sizeof(Py_UNICODE) != 4
+cdef extern from "Python.h":
+    # PEP 393 flexible string representation accessors (CPython macros).
+    bint PyUnicode_IS_READY(object u)
+    Py_ssize_t PyUnicode_GET_LENGTH(object u)
+    int PyUnicode_KIND(object u)
+    void* PyUnicode_DATA(object u)
+    ctypedef uint8_t Py_UCS1
+    ctypedef uint16_t Py_UCS2
+    # The *_DATA macros yield a pointer to the string's character buffer,
+    # not a single code unit, so they are declared with pointer return types.
+    Py_UCS1* PyUnicode_1BYTE_DATA(object u)
+    Py_UCS2* PyUnicode_2BYTE_DATA(object u)
+    Py_UCS4* PyUnicode_4BYTE_DATA(object u)
+    int PyUnicode_1BYTE_KIND
+    int PyUnicode_2BYTE_KIND
+    int PyUnicode_4BYTE_KIND
 
 
 cdef class GlyphInfo:
@@ -151,13 +162,9 @@ cdef class Buffer:
             self._hb_buffer, hb_script_from_string(cstr, -1))
 
     def add_codepoints(self, codepoints: List[int],
-                       item_offset: int = None, item_length: int = None) -> None:
+                       item_offset: int = 0, item_length: int = -1) -> None:
         cdef unsigned int size = len(codepoints)
         cdef hb_codepoint_t* hb_codepoints
-        if item_offset is None:
-            item_offset = 0
-        if item_length is None:
-            item_length = size
         if not size:
             hb_codepoints = NULL
         else:
@@ -171,40 +178,44 @@ cdef class Buffer:
             free(hb_codepoints)
 
     def add_utf8(self, text: bytes,
-                 item_offset: int = None, item_length: int = None) -> None:
-        cdef unsigned int size = len(text)
-        if item_offset is None:
-            item_offset = 0
-        if item_length is None:
-            item_length = size
-        cdef char* cstr = text
+                 item_offset: int = 0, item_length: int = -1) -> None:
         hb_buffer_add_utf8(
-            self._hb_buffer, cstr, size, item_offset, item_length)
+            self._hb_buffer, text, len(text), item_offset, item_length)
 
     def add_str(self, text: str,
-                item_offset: int = None, item_length: int = None) -> None:
-        cdef Py_UNICODE* array = PyUnicode_AS_UNICODE(text)
-        cdef Py_ssize_t size = PyUnicode_GET_SIZE(text)
-        if item_offset is None:
-            item_offset = 0
-        if item_length is None:
-            item_length = size
-        if PY_NARROW_UNICODE:
+                item_offset: int = 0, item_length: int = -1) -> None:
+        # ensure unicode string is in the "canonical" representation
+        assert PyUnicode_IS_READY(text)
+
+        cdef Py_ssize_t length = PyUnicode_GET_LENGTH(text)
+        cdef int kind = PyUnicode_KIND(text)
+
+        if kind == PyUnicode_1BYTE_KIND:
+            hb_buffer_add_latin1(
+                self._hb_buffer,
+                <uint8_t*>PyUnicode_1BYTE_DATA(text),
+                length,
+                item_offset,
+                item_length,
+            )
+        elif kind == PyUnicode_2BYTE_KIND:
             hb_buffer_add_utf16(
                 self._hb_buffer,
-                <uint16_t*>array,
-                size,
+                <uint16_t*>PyUnicode_2BYTE_DATA(text),
+                length,
                 item_offset,
                 item_length,
             )
-        else:
+        elif kind == PyUnicode_4BYTE_KIND:
             hb_buffer_add_utf32(
                 self._hb_buffer,
-                <uint32_t*>array,
-                size,
+                <uint32_t*>PyUnicode_4BYTE_DATA(text),
+                length,
                 item_offset,
                 item_length,
             )
+        else:
+            raise AssertionError(kind)
 
     def guess_segment_properties(self) -> None:
         hb_buffer_guess_segment_properties(self._hb_buffer)

diff --git a/src/uharfbuzz/charfbuzz.pxd b/src/uharfbuzz/charfbuzz.pxd
@@ -1,4 +1,4 @@
-from libc.stdint cimport uint16_t, uint32_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t
 
 
 cdef extern from "hb.h":
@@ -86,6 +86,10 @@ cdef extern from "hb.h":
         hb_buffer_t* buffer,
         const hb_codepoint_t* text, int text_length,
         unsigned int item_offset, int item_length)
+    void hb_buffer_add_latin1(
+        hb_buffer_t* buffer,
+        const uint8_t* text, int text_length,
+        unsigned int item_offset, int item_length)
     void hb_buffer_add_utf8(
         hb_buffer_t* buffer,
         const char* text, int text_length,

diff --git a/tests/data/AdobeBlank.subset.ttf b/tests/data/AdobeBlank.subset.ttf
diff --git a/tests/data/LICENSE_AdobeBlank.txt b/tests/data/LICENSE_AdobeBlank.txt
@@ -0,0 +1,92 @@
+This Font Software is licensed under the SIL Open Font License,
+Version 1.1.
+
+This license is copied below, and is also available with a FAQ at:
+http://scripts.sil.org/OFL
+
+-----------------------------------------------------------
+SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
+-----------------------------------------------------------
+
+PREAMBLE
+The goals of the Open Font License (OFL) are to stimulate worldwide
+development of collaborative font projects, to support the font
+creation efforts of academic and linguistic communities, and to
+provide a free and open framework in which fonts may be shared and
+improved in partnership with others.
+
+The OFL allows the licensed fonts to be used, studied, modified and
+redistributed freely as long as they are not sold by themselves. The
+fonts, including any derivative works, can be bundled, embedded,
+redistributed and/or sold with any software provided that any reserved
+names are not used by derivative works. The fonts and derivatives,
+however, cannot be released under any other type of license. The
+requirement for fonts to remain under this license does not apply to
+any document created using the fonts or their derivatives.
+
+DEFINITIONS
+"Font Software" refers to the set of files released by the Copyright
+Holder(s) under this license and clearly marked as such. This may
+include source files, build scripts and documentation.
+
+"Reserved Font Name" refers to any names specified as such after the
+copyright statement(s).
+
+"Original Version" refers to the collection of Font Software
+components as distributed by the Copyright Holder(s).
+
+"Modified Version" refers to any derivative made by adding to,
+deleting, or substituting -- in part or in whole -- any of the
+components of the Original Version, by changing formats or by porting
+the Font Software to a new environment.
+
+"Author" refers to any designer, engineer, programmer, technical
+writer or other person who contributed to the Font Software.
+
+PERMISSION & CONDITIONS
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Font Software, to use, study, copy, merge, embed,
+modify, redistribute, and sell modified and unmodified copies of the
+Font Software, subject to the following conditions:
+
+1) Neither the Font Software nor any of its individual components, in
+Original or Modified Versions, may be sold by itself.
+
+2) Original or Modified Versions of the Font Software may be bundled,
+redistributed and/or sold with any software, provided that each copy
+contains the above copyright notice and this license. These can be
+included either as stand-alone text files, human-readable headers or
+in the appropriate machine-readable metadata fields within text or
+binary files as long as those fields can be easily viewed by the user.
+
+3) No Modified Version of the Font Software may use the Reserved Font
+Name(s) unless explicit written permission is granted by the
+corresponding Copyright Holder. This restriction only applies to the
+primary font name as presented to the users.
+
+4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
+Software shall not be used to promote, endorse or advertise any
+Modified Version, except to acknowledge the contribution(s) of the
+Copyright Holder(s) and the Author(s) or with their explicit written
+permission.
+
+5) The Font Software, modified or unmodified, in part or in whole,
+must be distributed entirely under this license, and must not be
+distributed under any other license. The requirement for fonts to
+remain under this license does not apply to any document created using
+the Font Software.
+
+TERMINATION
+This license becomes null and void if any of the above conditions are
+not met.
+
+DISCLAIMER
+THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
+DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
+OTHER DEALINGS IN THE FONT SOFTWARE.
diff --git a/tests/test_uharfbuzz.py b/tests/test_uharfbuzz.py
@@ -1,10 +1,107 @@
 import uharfbuzz as hb
+from pathlib import Path
+import pytest
 
 
-class TestBuffer:
+TESTDATA = Path(__file__).parent / "data"
+ADOBE_BLANK_TTF = (TESTDATA / "AdobeBlank.subset.ttf").read_bytes()
+
+
+@pytest.fixture
+def blankfont():
+    """Return a subset of AdobeBlank.ttf containing the following glyphs/characters:
+    [
+        {gid=0, name=".notdef"},
+        {gid=1, name="a", code=0x61},
+        {gid=2, name="b", code=0x62},
+        {gid=3, name="c", code=0x63},
+        {gid=4, name="d", code=0x64},
+        {gid=5, name="e", code=0x65},
+        {gid=6, name="ccedilla", code=0xE7},
+        {gid=7, name="uni0431", code=0x0431},  # CYRILLIC SMALL LETTER BE
+        {gid=8, name="u1F4A9", code=0x1F4A9},  # PILE OF POO
+    ]
+    """
+    face = hb.Face(ADOBE_BLANK_TTF)
+    font = hb.Font(face)
+    upem = face.upem
+    font.scale = (upem, upem)
+    hb.ot_font_set_funcs(font)
+    return font
 
+
+class TestBuffer:
     def test_init(self):
         buf = hb.Buffer()
 
     def test_create(self):
         buf = hb.Buffer.create()
+
+    @pytest.mark.parametrize(
+        "string, expected",
+        [
+            ("abcde", [(0x61, 0), (0x62, 1), (0x63, 2), (0x64, 3), (0x65, 4)]),
+            ("abçde", [(0x61, 0), (0x62, 1), (0xE7, 2), (0x64, 3), (0x65, 4)]),
+            ("aбcde", [(0x61, 0), (0x431, 1), (0x63, 2), (0x64, 3), (0x65, 4)]),
+            ("abc💩e", [(0x61, 0), (0x62, 1), (0x63, 2), (0x1F4A9, 3), (0x65, 4)]),
+        ],
+        ids=["ascii", "latin1", "ucs2", "ucs4"],
+    )
+    def test_add_str(self, string, expected):
+        buf = hb.Buffer()
+        buf.add_str(string)
+        infos = [(g.codepoint, g.cluster) for g in buf.glyph_infos]
+        assert infos == expected
+
+    def test_add_utf8(self):
+        buf = hb.Buffer()
+        buf.add_utf8("aбç💩e".encode("utf-8"))
+        infos = [(g.codepoint, g.cluster) for g in buf.glyph_infos]
+        assert infos == [(0x61, 0), (0x431, 1), (0xE7, 3), (0x1F4A9, 5), (0x65, 9)]
+
+    def test_add_codepoints(self):
+        buf = hb.Buffer()
+        buf.add_codepoints([0x61, 0x431, 0xE7, 0x1F4A9, 0x65])
+        infos = [(g.codepoint, g.cluster) for g in buf.glyph_infos]
+        assert infos == [(0x61, 0), (0x431, 1), (0xE7, 2), (0x1F4A9, 3), (0x65, 4)]
+
+    def test_guess_set_segment_properties(self):
+        buf = hb.Buffer()
+        buf.add_str("הארץ")
+
+        buf.guess_segment_properties()
+
+        assert buf.direction == "rtl"
+        assert buf.script == "Hebr"
+        # the guessed language seems to be locale specific
+        # assert buf.language == "en-us"
+        assert buf.language
+
+        buf.direction = "ltr"
+        assert buf.direction == "ltr"
+
+        buf.script = "Latn"
+        assert buf.script == "Latn"
+
+        buf.language = "he-il"
+        assert buf.language == "he-il"
+
+
+class TestShape:
+    @pytest.mark.parametrize(
+        "string, expected",
+        [
+            ("abcde", [(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)]),
+            ("abçde", [(1, 0), (2, 1), (6, 2), (4, 3), (5, 4)]),
+            ("aбcde", [(1, 0), (7, 1), (3, 2), (4, 3), (5, 4)]),
+            ("abc💩e", [(1, 0), (2, 1), (3, 2), (8, 3), (5, 4)]),
+        ],
+        ids=["ascii", "latin1", "ucs2", "ucs4"],
+    )
+    def test_gid_and_cluster_no_features(self, blankfont, string, expected):
+        buf = hb.Buffer()
+        buf.add_str(string)
+        buf.guess_segment_properties()
+        hb.shape(blankfont, buf)
+        infos = [(g.codepoint, g.cluster) for g in buf.glyph_infos]
+        assert infos == expected