-
Notifications
You must be signed in to change notification settings - Fork 173
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added unicode generators #119
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#pragma once | ||
|
||
#include <string> | ||
|
||
#include "rapidcheck/detail/BitStream.h" | ||
|
||
namespace rc { | ||
namespace detail { | ||
|
||
/// By using a bitstream this function will return a single | ||
/// Unicode codepoint, with lower values having a higher chance | ||
/// to appear than the higher ones. Most results will be | ||
/// within the basic multilingual plane, though | ||
/// any valid Unicode codepoint may be generated. | ||
template<typename T, typename RandomType> | ||
T generateCodePoint(rc::detail::BitStream<RandomType>& stream); | ||
|
||
/// Converts a codepoint into a string containing the utf8 | ||
/// encoding of passed codepoint. | ||
template<typename T, typename Y> | ||
T makeCharacterUtf8(Y codepoint); | ||
|
||
} // namespace detail | ||
} // namespace rc | ||
|
||
|
||
#include "Unicode.hpp" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#pragma once | ||
|
||
namespace rc { | ||
namespace detail { | ||
|
||
template<typename T, typename RandomType> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any particular reason this needs to be templated on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason it is templated on T is that it seemed to be the existing convention. Though I guess it is fairly unnecessary when it just generated an integer type, those tend to be fairly each to assign to each other. So yes, this function probably would be better off as a non-template. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not Unicode expert but according to Wikipedia, "Each UCS character is abstractly represented by a code point, which is an integer between 0 and 1,114,111" meaning a code point should fit into Another option would be to "remap" Unicode space so that alphabetic chars end up in the lower codepoints (i.e. range There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, 21 bits is currently all that is needed for Unicode. The algorithm currently favors codepoints with a low value by a significant margin (roughly 50%), though still about 12% chance to pick one of the very high values. It does not take the size attribute into account for individual characters, just full strings. But perhaps it should? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current extended ASCII string generator does not take size into account for individual characters but I think it should. So this one might just as well be designed to do so from the get go. |
||
T generateCodePoint(rc::detail::BitStream<RandomType>& stream) | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. RapidCheck coding guidelines require braces on the same line. There is a |
||
static_assert(sizeof T >= 3, | ||
"Code points can only be stored in types at leeast three bytes large."); | ||
|
||
// Note, this algorithm is designed to provide | ||
// good values for UTF8 encoding but can be | ||
// used to generate any Unicode character | ||
int maxBytes = 1; | ||
|
||
T codepoint; | ||
while (maxBytes < 4) | ||
{ | ||
bool increase = stream.next<bool>(); | ||
if (!increase) | ||
{ | ||
break; | ||
} | ||
maxBytes += 1; | ||
} | ||
int noBits; | ||
switch (maxBytes) | ||
{ | ||
case 1: | ||
noBits = 7; | ||
break; | ||
case 2: | ||
noBits = 11; | ||
break; | ||
case 3: | ||
noBits = 16; | ||
break; | ||
default: | ||
noBits = 20; | ||
// Actually 21, put the first bit | ||
// needs to be specially handled | ||
// to not exceed the valid | ||
// value range for codepoints | ||
bool highestBit = stream.next<bool>(); | ||
if (highestBit) | ||
{ | ||
return 0x100000 | stream.next<T>(16); | ||
} | ||
|
||
} | ||
|
||
do | ||
{ | ||
codepoint = stream.next<T>(noBits); | ||
} while (codepoint == 0); | ||
return codepoint; | ||
} | ||
|
||
/// Encodes a single code point as UTF-8.
///
/// The number of bytes produced depends on the magnitude of `codepoint`:
/// one byte up to 0x7F, two up to 0x7FF, three up to 0xFFFF and four up to
/// 0x10FFFF. Values above 0x10FFFF yield a default-constructed `T`. No check
/// for the surrogate range is made here; such values are encoded like any
/// other three-byte code point.
///
/// NOTE(review): feedback on this change suggested that this probably does
/// not need to be a template either, since UTF-8 is always a sequence of
/// 8-bit units, and asked that allocations be avoided. The template interface
/// is kept here for compatibility.
///
/// @tparam T        String-like type constructible from a braced list of
///                  `T::value_type`.
/// @tparam Y        Integral type of the code point.
/// @param codepoint The code point to encode.
/// @return The UTF-8 encoding, or `T()` when `codepoint` is above 0x10FFFF.
template <typename T, typename Y>
T makeCharacterUtf8(Y codepoint) {
  using ValType = typename T::value_type;

  // Builds a continuation byte (10xxxxxx) from the six bits of `codepoint`
  // that start at bit position `shift`.
  const auto tail = [codepoint](int shift) {
    return static_cast<ValType>(0x80 | ((codepoint >> shift) & 0x3F));
  };

  if (codepoint <= 0x7F) {
    // 0xxxxxxx
    return {static_cast<ValType>(codepoint)};
  }
  if (codepoint <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    return {static_cast<ValType>(0xC0 | ((codepoint >> 6) & 0x1F)), tail(0)};
  }
  if (codepoint <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    return {static_cast<ValType>(0xE0 | ((codepoint >> 12) & 0x0F)),
            tail(6),
            tail(0)};
  }
  if (codepoint <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return {static_cast<ValType>(0xF0 | ((codepoint >> 18) & 0x07)),
            tail(12),
            tail(6),
            tail(0)};
  }
  return T();
}
|
||
} // namespace detail | ||
} // namespace rc |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,13 +5,18 @@ | |
#include "rapidcheck/detail/BitStream.h" | ||
#include "rapidcheck/gen/Container.h" | ||
|
||
#include "rapidcheck/detail/Unicode.h" | ||
|
||
namespace rc { | ||
namespace gen { | ||
namespace detail { | ||
|
||
template <typename String> | ||
class StringGen; | ||
|
||
template <typename Container> | ||
class ContainerCodepointGen; | ||
|
||
template <typename T, typename... Args> | ||
class StringGen<std::basic_string<T, Args...>> { | ||
public: | ||
|
@@ -41,6 +46,32 @@ class StringGen<std::basic_string<T, Args...>> { | |
} | ||
}; | ||
|
||
template <typename T, typename... Args> | ||
class ContainerCodepointGen<std::vector<T, Args...>> { | ||
public: | ||
using Container = std::vector<T, Args...>; | ||
|
||
Shrinkable<Container> operator()(const Random &random, int size) const { | ||
auto stream = rc::detail::bitStreamOf(random); | ||
Container str; | ||
auto length = stream.next<std::size_t>() % (size + 1); | ||
str.reserve(length); | ||
|
||
for (std::size_t i = 0; i < length; i++) { | ||
str.push_back(rc::detail::generateCodePoint<T>(stream)); | ||
} | ||
|
||
return shrinkable::shrinkRecur( | ||
std::move(str), | ||
[](const Container &s) { | ||
return seq::concat(shrink::removeChunks(s), | ||
shrink::eachElement(s, &shrink::unicodeCodepoint<T>)); | ||
}); | ||
} | ||
}; | ||
|
||
|
||
|
||
template <typename T, typename... Args> | ||
struct DefaultArbitrary<std::basic_string<T, Args...>> { | ||
static Gen<std::basic_string<T, Args...>> arbitrary() { | ||
|
@@ -64,11 +95,52 @@ Gen<T> character() { | |
}; | ||
} | ||
|
||
|
||
template <typename T> | ||
Gen<T> unicodeCodepoint() { | ||
return [](const Random &random, int size) { | ||
auto stream = ::rc::detail::bitStreamOf(random); | ||
|
||
return shrinkable::shrinkRecur(rc::detail::generateCodePoint<T>(stream), | ||
&shrink::unicodeCodepoint<T>); | ||
}; | ||
} | ||
|
||
template <typename Container> | ||
Gen<Container> unicodeCodepoints() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the spirit of non-templatization, I think you could go for `std::vectorstd::uint32_t>´ all the way down here. That way, we can spare users the compile time cost of template instantiation. |
||
{ | ||
return detail::ContainerCodepointGen<Container>(); | ||
} | ||
|
||
template <typename String> | ||
Gen<String> characterUtf8() { | ||
return map(unicodeCodepoint(), [](T codepoint) | ||
{ | ||
return rc::detail::makeCharacterUtf8<String>(codepoint); | ||
}); | ||
} | ||
|
||
|
||
template <typename String> | ||
Gen<String> string() { | ||
return detail::StringGen<String>(); | ||
} | ||
|
||
template <typename String> | ||
Gen<String> stringUtf8() { | ||
return map(unicodeCodepoints<std::vector<uint32_t>>(), [](const std::vector<uint32_t>& codepoints) | ||
{ | ||
String str; | ||
for (const auto& cp : codepoints) | ||
{ | ||
str += rc::detail::makeCharacterUtf8<String>(cp); | ||
} | ||
return std::move(str); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This |
||
}); | ||
} | ||
|
||
|
||
|
||
} // namespace gen | ||
} // namespace rc | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unnecessary qualification, this is in
rc::detail
already.