diff --git a/include/rapidcheck/detail/Unicode.h b/include/rapidcheck/detail/Unicode.h
new file mode 100644
index 00000000..8f5795e8
--- /dev/null
+++ b/include/rapidcheck/detail/Unicode.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <string>
+
+#include "rapidcheck/detail/BitStream.h"
+
+namespace rc {
+namespace detail {
+
+/// By using a bitstream this function will return a single
+/// Unicode codepoint, with lower values having a higher chance
+/// to appear than the higher ones. Most results will be
+/// within the basic multilingual plane, though
+/// any valid Unicode codepoint may be generated.
+/// Never returns zero or a surrogate (U+D800..U+DFFF).
+template<typename T, typename RandomType>
+T generateCodePoint(rc::detail::BitStream<RandomType>& stream);
+
+/// Converts a codepoint into a string containing the utf8
+/// encoding of passed codepoint. Returns an empty `T` for
+/// values above U+10FFFF.
+template<typename T, typename Y>
+T makeCharacterUtf8(Y codepoint);
+
+} // namespace detail
+} // namespace rc
+
+
+#include "Unicode.hpp"
diff --git a/include/rapidcheck/detail/Unicode.hpp b/include/rapidcheck/detail/Unicode.hpp
new file mode 100644
index 00000000..93707cd0
--- /dev/null
+++ b/include/rapidcheck/detail/Unicode.hpp
@@ -0,0 +1,101 @@
+#pragma once
+
+namespace rc {
+namespace detail {
+
+template<typename T, typename RandomType>
+T generateCodePoint(rc::detail::BitStream<RandomType>& stream)
+{
+  static_assert(sizeof(T) >= 3,
+    "Code points can only be stored in types at least three bytes large.");
+
+  // Note, this algorithm is designed to provide
+  // good values for UTF8 encoding but can be
+  // used to generate any Unicode character
+  int maxBytes = 1;
+
+  // Each set bit read from the stream allows one more UTF-8 byte,
+  // so shorter encodings (lower codepoints) are more likely.
+  while (maxBytes < 4)
+  {
+    const bool increase = stream.template next<bool>();
+    if (!increase)
+    {
+      break;
+    }
+    maxBytes += 1;
+  }
+  int noBits;
+  switch (maxBytes)
+  {
+  case 1:
+    noBits = 7;
+    break;
+  case 2:
+    noBits = 11;
+    break;
+  case 3:
+    noBits = 16;
+    break;
+  default:
+    noBits = 20;
+    // Actually 21, but the first bit
+    // needs to be specially handled
+    // to not exceed the valid
+    // value range for codepoints
+    const bool highestBit = stream.template next<bool>();
+    if (highestBit)
+    {
+      return static_cast<T>(0x100000 | stream.template next<T>(16));
+    }
+
+  }
+
+  // Reject NUL and the UTF-16 surrogate range (U+D800..U+DFFF):
+  // surrogates are not Unicode scalar values and must not be
+  // encoded in UTF-8.
+  T codepoint;
+  do
+  {
+    codepoint = stream.template next<T>(noBits);
+  } while (codepoint == 0 || (codepoint >= 0xD800 && codepoint <= 0xDFFF));
+  return codepoint;
+}
+
+template<typename T, typename Y>
+T makeCharacterUtf8(Y codepoint)
+{
+  using ValType = typename T::value_type;
+  if (codepoint <= 0x7F)
+  {
+    return{ static_cast<ValType>(codepoint) };
+  }
+  else if (codepoint <= 0x7FF)
+  {
+    return{
+      static_cast<ValType>(0b11000000 | ((codepoint >> (6)) & 0b00011111)),
+      static_cast<ValType>(0b10000000 | ((codepoint) & 0b00111111))
+    };
+  }
+  else if (codepoint <= 0xFFFF)
+  {
+    return{
+      static_cast<ValType>(0b11100000 | ((codepoint >> (6 + 6)) & 0b00001111)),
+      static_cast<ValType>(0b10000000 | ((codepoint >> (6)) & 0b00111111)),
+      static_cast<ValType>(0b10000000 | ((codepoint) & 0b00111111))
+    };
+  }
+  else if (codepoint <= 0x10FFFF)
+  {
+    return{
+      static_cast<ValType>(0b11110000 | ((codepoint >> (6+6+6)) & 0b00000111)),
+      static_cast<ValType>(0b10000000 | ((codepoint >> (6+6)) & 0b00111111)),
+      static_cast<ValType>(0b10000000 | ((codepoint >> (6)) & 0b00111111)),
+      static_cast<ValType>(0b10000000 | ((codepoint) & 0b00111111))
+    };
+  }
+  return T();
+}
+
+} // namespace detail
+} // namespace rc
diff --git a/include/rapidcheck/gen/Text.h b/include/rapidcheck/gen/Text.h
index 1ab33a66..cb375a8e 100644
--- a/include/rapidcheck/gen/Text.h
+++ b/include/rapidcheck/gen/Text.h
@@ -10,6 +10,20 @@ namespace gen {
 template <typename T>
 Gen<T> character();
 
+/// Generator of Unicode Codepoint values. It has a higher chance
+/// of generating lower value codepoints.
+template <typename T>
+Gen<T> unicodeCodepoint();
+
+/// Generator of a container of Unicode Codepoint values.
+template <typename Container>
+Gen<Container> unicodeCodepoints();
+
+/// Generator of Unicode text characters, encoded in utf8.
+/// Will return them in a string of variable length.
+template <typename String, typename T = char32_t>
+Gen<String> characterUtf8();
+
 /// Generator of strings. Essentially equivalent to
 /// `gen::container<String>(gen::character<typename String::value_type>())` but
 /// a lot faster. If you need to use a custom character generator, use
@@ -17,6 +31,12 @@ Gen<T> character();
 template <typename String>
 Gen<String> string();
 
+/// Generator of strings, as `gen::string<String>()`
+/// but will be filled with utf8 encoded Unicode
+template <typename String>
+Gen<String> stringUtf8();
+
+
 } // namespace gen
 } // namespace rc
 
diff --git a/include/rapidcheck/gen/Text.hpp b/include/rapidcheck/gen/Text.hpp
index a34bf8d5..ef381fb7 100644
--- a/include/rapidcheck/gen/Text.hpp
+++ b/include/rapidcheck/gen/Text.hpp
@@ -5,6 +5,8 @@
 #include "rapidcheck/detail/BitStream.h"
 #include "rapidcheck/gen/Container.h"
 
+#include "rapidcheck/detail/Unicode.h"
+
 namespace rc {
 namespace gen {
 namespace detail {
@@ -12,6 +14,9 @@ namespace detail {
 template <typename String>
 class StringGen;
 
+template <typename Container>
+class ContainerCodepointGen;
+
 template <typename T, typename... Args>
 class StringGen<std::basic_string<T, Args...>> {
 public:
@@ -41,6 +46,32 @@ class StringGen<std::basic_string<T, Args...>> {
   }
 };
 
+template <typename T, typename... Args>
+class ContainerCodepointGen<std::vector<T, Args...>> {
+public:
+  using Container = std::vector<T, Args...>;
+
+  Shrinkable<Container> operator()(const Random &random, int size) const {
+    auto stream = rc::detail::bitStreamOf(random);
+    Container str;
+    auto length = stream.next<std::size_t>() % (size + 1);
+    str.reserve(length);
+
+    for (std::size_t i = 0; i < length; i++) {
+      str.push_back(rc::detail::generateCodePoint<T>(stream));
+    }
+
+    return shrinkable::shrinkRecur(
+        std::move(str),
+        [](const Container &s) {
+          return seq::concat(shrink::removeChunks(s),
+                             shrink::eachElement(s, &shrink::unicodeCodepoint<T>));
+        });
+  }
+};
+
+
+
 template <typename T, typename... Args>
 struct DefaultArbitrary<std::basic_string<T, Args...>> {
   static Gen<std::basic_string<T, Args...>> arbitrary() {
@@ -64,11 +95,52 @@ Gen<T> character() {
   };
 }
 
+
+template <typename T>
+Gen<T> unicodeCodepoint() {
+  return [](const Random &random, int size) {
+    auto stream = ::rc::detail::bitStreamOf(random);
+
+    return shrinkable::shrinkRecur(rc::detail::generateCodePoint<T>(stream),
+      &shrink::unicodeCodepoint<T>);
+  };
+}
+
+template <typename Container>
+Gen<Container> unicodeCodepoints()
+{
+  return detail::ContainerCodepointGen<Container>();
+}
+
+template <typename String, typename T>
+Gen<String> characterUtf8() {
+  return map(unicodeCodepoint<T>(), [](T codepoint)
+  {
+    return rc::detail::makeCharacterUtf8<String>(codepoint);
+  });
+}
+
+
 template <typename String>
 Gen<String> string() {
   return detail::StringGen<String>();
 }
 
+template <typename String>
+Gen<String> stringUtf8() {
+  return map(unicodeCodepoints<std::vector<char32_t>>(), [](const std::vector<char32_t>& codepoints)
+  {
+    String str;
+    for (const auto& cp : codepoints)
+    {
+      str += rc::detail::makeCharacterUtf8<String>(cp);
+    }
+    return str;
+  });
+}
+
+
+
 
 } // namespace gen
 } // namespace rc
diff --git a/include/rapidcheck/shrink/Shrink.h b/include/rapidcheck/shrink/Shrink.h
index fd770a18..aae70f11 100644
--- a/include/rapidcheck/shrink/Shrink.h
+++ b/include/rapidcheck/shrink/Shrink.h
@@ -2,6 +2,8 @@
 
 #include "rapidcheck/Seq.h"
 
+#include <vector>
+
 namespace rc {
 namespace shrink {
 
@@ -50,6 +52,10 @@ inline Seq<bool> boolean(bool value);
 template <typename T>
 Seq<T> character(T value);
 
+/// Shrinks a unicode codepoint
+template <typename T>
+Seq<T> unicodeCodepoint(T value);
+
 } // namespace shrink
 } // namespace rc
 
diff --git a/include/rapidcheck/shrink/Shrink.hpp b/include/rapidcheck/shrink/Shrink.hpp
index b81eae50..6d84be84 100644
--- a/include/rapidcheck/shrink/Shrink.hpp
+++ b/include/rapidcheck/shrink/Shrink.hpp
@@ -198,5 +198,17 @@ Seq<T> character(T value) {
   return seq::takeWhile(std::move(shrinks), [=](T x) { return x != value; });
 }
 
+/// Shrinks a unicode codepoint
+template <typename T>
+Seq<T> unicodeCodepoint(T value){
+  auto shrinks = seq::cast<T>(seq::concat(
+    seq::fromContainer(std::vector<T>({'a', 'b', 'c'})),
+    seq::fromContainer(std::vector<T>({'A', 'B', 'C',
+      '1', '2', '3', ' ', '\n' })),
+    seq::filter(towards(value, static_cast<T>(1)), [](const T& val)
+    { return val != static_cast<T>(0) && (val < 0xD800 || val > 0xDFFF); } )));
+
+  return seq::takeWhile(std::move(shrinks), [=](T x) { return x != value; });
+}
 } // namespace shrink
 } // namespace rc
diff --git a/src/gen/Text.cpp b/src/gen/Text.cpp
index bfb733cc..68448e1d 100644
--- a/src/gen/Text.cpp
+++ b/src/gen/Text.cpp
@@ -1,5 +1,7 @@
 #include "rapidcheck/gen/Text.h"
 
+#include "rapidcheck/detail/Unicode.h"
+
 template rc::Gen<std::string> rc::gen::string<std::string>();
 template rc::Gen<std::wstring> rc::gen::string<std::wstring>();
 template struct rc::Arbitrary<std::string>;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f27c275a..254ad4f4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -43,6 +43,7 @@ add_executable(rapidcheck_tests
   detail/TestMetadataTests.cpp
   detail/TestParamsTests.cpp
   detail/TestingTests.cpp
+  detail/UnicodeTests.cpp
   detail/VariantTests.cpp
   fn/CommonTests.cpp
   gen/BuildTests.cpp
diff --git a/test/detail/UnicodeTests.cpp b/test/detail/UnicodeTests.cpp
new file mode 100644
index 00000000..d40a019a
--- /dev/null
+++ b/test/detail/UnicodeTests.cpp
@@ -0,0 +1,87 @@
+#include <catch.hpp>
+#include <rapidcheck/catch.h>
+
+#include "rapidcheck/gen/Text.h"
+
+#include "util/GenUtils.h"
+#include "util/ShrinkableUtils.h"
+
+#include "util/Util.h"
+#include "util/Meta.h"
+
+using namespace rc;
+using namespace rc::test;
+
+TEST_CASE("gen::unicodeCodepoint") {
+  prop("never generates null characters and always within range of valid unicode",
+    [](const GenParams &params) {
+
+    const auto gen = gen::unicodeCodepoint<char32_t>();
+    onAnyPath(
+      gen(params.random, params.size),
+      [](const Shrinkable<char32_t> &value, const Shrinkable<char32_t> &shrink) {
+      RC_ASSERT(shrink.value() > 0u && shrink.value() <= 0x10FFFFu);
+      RC_ASSERT(shrink.value() < 0xD800u || shrink.value() > 0xDFFFu);
+    });
+  });
+
+  prop("first shrink is always 'a')",
+    [](const GenParams &params) {
+    const auto gen = gen::unicodeCodepoint<char32_t>();
+    const auto shrinkable = gen(params.random, params.size);
+    RC_PRE(shrinkable.value() != 'a');
+    RC_ASSERT(shrinkable.shrinks().next()->value() == 'a');
+  });
+}
+
+TEST_CASE("detail::makeCharacterUtf8") {
+
+  // Test known values
+  auto char1 = rc::detail::makeCharacterUtf8<std::string>(0x00C4u);
+  REQUIRE(char1.size() == 2u);
+  REQUIRE(char1[0] == static_cast<char>(0xC3u));
+  REQUIRE(char1[1] == static_cast<char>(0x84u));
+  auto char2 = rc::detail::makeCharacterUtf8<std::string>(0x0034u);
+  REQUIRE(char2[0] == static_cast<char>(0x34u));
+  REQUIRE(char2.size() == 1u);
+
+  prop("Only results in valid utf8 characters",
+    [](const GenParams &params) {
+    const auto gen = gen::unicodeCodepoint<char32_t>();
+
+    onAnyPath(
+      gen(params.random, params.size),
+      [](const Shrinkable<char32_t> &value, const Shrinkable<char32_t> &shrink) {
+      auto utf8 = rc::detail::makeCharacterUtf8<std::string>(shrink.value());
+
+      RC_ASSERT(utf8.size() > 0u);
+      size_t expectedBytes = 1;
+      if ((utf8[0] & 0b10000000) == 0)
+      {
+        expectedBytes = 1;
+      }
+      else if ((utf8[0] & 0b11100000) == 0b11000000)
+      {
+        expectedBytes = 2;
+      }
+      else if ((utf8[0] & 0b11110000) == 0b11100000)
+      {
+        expectedBytes = 3;
+      }
+      else if ((utf8[0] & 0b11111000) == 0b11110000)
+      {
+        expectedBytes = 4;
+      }
+      else
+      {
+        RC_ASSERT(false);
+      }
+      RC_ASSERT(utf8[0] != 0u);
+      RC_ASSERT(utf8.size() == expectedBytes);
+      for (size_t i = 1; i < expectedBytes; ++i)
+      {
+        RC_ASSERT((utf8[i] & 0b11000000) == 0b10000000);
+      }
+    });
+  });
+}
diff --git a/test/gen/TextTests.cpp b/test/gen/TextTests.cpp
index 72fa4cf9..8452f853 100644
--- a/test/gen/TextTests.cpp
+++ b/test/gen/TextTests.cpp
@@ -87,6 +87,114 @@ struct StringProperties {
   }
 };
 
+struct StringUtf8Properties {
+  template <typename T>
+  static void exec() {
+    templatedProp<T>("length is at most four times to size",
+                     [](const GenParams &params) {
+                       const auto gen = gen::stringUtf8<T>();
+                       const auto shrinkable = gen(params.random, params.size);
+                       RC_ASSERT(shrinkable.value().size() <=
+                                 static_cast<std::size_t>(params.size) * 4);
+                     });
+
+    templatedProp<T>("first shrink is empty",
+                     [](const GenParams &params) {
+                       const auto gen = gen::stringUtf8<T>();
+                       const auto shrinkable = gen(params.random, params.size);
+                       RC_PRE(!shrinkable.value().empty());
+                       RC_ASSERT(shrinkable.shrinks().next()->value().empty());
+                     });
+
+    templatedProp<T>(
+        "the size of each shrink is the same or smaller than the original",
+        [](const GenParams &params) {
+          onAnyPath(
+              gen::stringUtf8<T>()(params.random, params.size),
+              [](const Shrinkable<T> &value, const Shrinkable<T> &shrink) {
+                RC_ASSERT(containerSize(shrink.value()) <=
+                          containerSize(value.value()));
+              });
+        });
+
+    templatedProp<T>("none of the shrinks equal the original value",
+                     [](const GenParams &params) {
+                       onAnyPath(gen::stringUtf8<T>()(params.random, params.size),
+                                 [](const Shrinkable<T> &value,
+                                    const Shrinkable<T> &shrink) {
+                                   RC_ASSERT(value.value() != shrink.value());
+                                 });
+                     });
+  }
+};
+
+struct CodepointContainerProperties {
+  template <typename T>
+  static void exec() {
+    templatedProp<T>("length is at most size",
+                     [](const GenParams &params) {
+                       const auto gen = gen::unicodeCodepoints<T>();
+                       const auto shrinkable = gen(params.random, params.size);
+                       RC_ASSERT(shrinkable.value().size() <=
+                                 static_cast<std::size_t>(params.size));
+                     });
+
+    templatedProp<T>(
+        "finds minimum where container must be longer than a certain length",
+        [](const Random &random) {
+          const auto n = *gen::inRange<std::size_t>(0, 10);
+          const auto size = *gen::inRange<int>(50, 100);
+          const auto result =
+              searchGen(random,
+                        size,
+                        gen::unicodeCodepoints<T>(),
+                        [=](const T &x) { return x.size() >= n; });
+          T expected(n, 'a');
+          RC_ASSERT(result == expected);
+        });
+
+    templatedProp<T>("first shrink is empty",
+                     [](const GenParams &params) {
+                       const auto gen = gen::unicodeCodepoints<T>();
+                       const auto shrinkable = gen(params.random, params.size);
+                       RC_PRE(!shrinkable.value().empty());
+                       RC_ASSERT(shrinkable.shrinks().next()->value().empty());
+                     });
+
+    templatedProp<T>(
+        "the size of each shrink is the same or smaller than the original",
+        [](const GenParams &params) {
+          onAnyPath(
+              gen::unicodeCodepoints<T>()(params.random, params.size),
+              [](const Shrinkable<T> &value, const Shrinkable<T> &shrink) {
+                RC_ASSERT(containerSize(shrink.value()) <=
+                          containerSize(value.value()));
+              });
+        });
+
+    templatedProp<T>("none of the shrinks equal the original value",
+                     [](const GenParams &params) {
+                       onAnyPath(gen::unicodeCodepoints<T>()(params.random, params.size),
+                                 [](const Shrinkable<T> &value,
+                                    const Shrinkable<T> &shrink) {
+                                   RC_ASSERT(value.value() != shrink.value());
+                                 });
+                     });
+  }
+};
+
+
 TEST_CASE("gen::string") {
   forEachType<StringProperties, std::string, std::wstring>();
 }
+
+TEST_CASE("gen::stringUtf8") {
+  forEachType<StringUtf8Properties, std::string>();
+}
+
+TEST_CASE("gen::unicodeCodepoints") {
+  forEachType<CodepointContainerProperties, std::vector<char32_t>, std::vector<std::uint32_t>>();
+}
+
+
+