Skip to content

Custom character encoding example

Tom Honermann edited this page Jul 3, 2017 · 1 revision

Custom character encoding example

Adding support for your own custom character encoding is fairly straightforward. To do so, you'll need to define an encoding class that performs the encoding and decoding operations appropriate for your encoding. You may also need to define a character set class if none of the provided character sets is appropriate for your encoding. For stateful encodings, you'll also need to define your own encoding state and encoding state transition classes. The following code outlines an example.

// Example character set: gives the set a name and supplies the code point
// that is substituted when errors occur.
class my_character_set {
public:
  // Must be 'char', 'wchar_t', or an unsigned integral type.
  using code_point_type = unsigned short;

  // Returns the name of this character set.
  static auto get_name() noexcept -> const char* {
    return "my_character_set";
  }

  // Returns the code point value to substitute when errors occur.
  static auto get_substitution_code_point() noexcept -> code_point_type {
    constexpr code_point_type substitution = 0xfffd;
    return substitution;
  }
};

// State and state transition types for the example encoding.  The '...' bodies
// are placeholders to be replaced with the members your encoding requires.
struct my_encoding_state {             // For stateful encodings, otherwise use trivial_encoding_state.
  ...
};
struct my_encoding_state_transition {  // For stateful encodings, otherwise use trivial_encoding_state_transition.
  ...
};

// Example encoding class.  It declares the types associated with the encoding,
// the minimum and maximum code unit counts, the initial state, and the static
// encode and decode operations.  Text written as a comment inside a condition
// or noexcept specifier (for example '/* noexcept criteria */') is a
// placeholder that must be replaced with real code.
class my_encoding {
public:
  using state_type = my_encoding_state;
  using state_transition_type = my_encoding_state_transition;
  using character_type = std::experimental::text::character<my_character_set>;
  using code_unit_type = unsigned char;  // 'char', 'wchar_t', or an unsigned integral type.

  static constexpr int min_code_units = 1;  // 'min_code_units' must match 'max_code_units' for
  static constexpr int max_code_units = 2;  // an encoding to be a random access encoding.

  // Returns the state that applies at the start of an encoded sequence.
  static const state_type& initial_state() noexcept {
    static const state_type state{};  // Stateful encodings must return an instance of 'state_type'
    return state;                     // appropriate for the beginning of an encoded code unit sequence.
  }

  // Encodes the state transition 'stt' by writing code units to 'out'.
  //   state               - encoding state.
  //   out                 - output iterator that receives the code units.
  //   stt                 - the requested state transition.
  //   encoded_code_units  - reset to 0 on entry.
  // Returns an 'encode_status' value; this outline always reports
  // 'invalid_state_transition'.
  template<std::experimental::text::CodeUnitOutputIterator<code_unit_type> CUIT>
    static std::experimental::text::encode_status
    encode_state_transition(state_type &state,
                            CUIT &out,
                            const state_transition_type &stt,
                            int &encoded_code_units)
    noexcept( /* noexcept criteria */ )
    {
      encoded_code_units = 0;
      // For stateful encodings, encode the code units for the requested state
      // transition and return 'encode_status::no_error'.  Otherwise, return
      // 'encode_status::invalid_state_transition'.
      return std::experimental::text::encode_status::invalid_state_transition;
    }

  // Encodes the character 'c' by writing its code unit sequence to 'out'.
  //   state               - encoding state; updated as necessary.
  //   out                 - output iterator that receives the code units.
  //   c                   - the character to encode.
  //   encoded_code_units  - reset to 0 on entry, then incremented as each
  //                         code unit is written.
  // Returns 'encode_status::invalid_character' if 'c' is not valid for this
  // encoding, otherwise 'encode_status::no_error'.
  template<std::experimental::text::CodeUnitOutputIterator<code_unit_type> CUIT>
    static std::experimental::text::encode_status
    encode(state_type &state,
           CUIT &out,
           character_type c,
           int &encoded_code_units)
    noexcept( /* noexcept criteria */ )
    {
      encoded_code_units = 0;
      if ( /* c is not a valid character for this encoding */ ) {
        return std::experimental::text::encode_status::invalid_character;
      }
      // Encode the code unit sequence for 'c' by writing to 'out' updating 'state' as necessary.
      // Increment 'encoded_code_units' as each code unit is written respecting exception safety.
      return std::experimental::text::encode_status::no_error;
    }

  // Decodes exactly one state transition or character from the code units
  // starting at 'in_next', without reading beyond 'in_end'.
  //   state               - encoding state; updated when a state transition is decoded.
  //   in_next             - iterator to the next code unit to read.
  //   in_end              - sentinel marking the end of the code unit sequence.
  //   c                   - receives the decoded character on success.
  //   decoded_code_units  - reset to 0 on entry, then incremented as each
  //                         code unit is read.
  // Returns a 'decode_status' value.  The enumerators are qualified with
  // 'decode_status::' to match the 'encode_status::' usage above.
  template<std::experimental::text::CodeUnitIterator CUIT, typename CUST>
    requires std::experimental::ranges::ForwardIterator<CUIT>()
          && std::experimental::ranges::Convertible<
               std::experimental::ranges::value_type_t<CUIT>, code_unit_type>()
          && std::experimental::ranges::Sentinel<CUST, CUIT>()
    static std::experimental::text::decode_status
    decode(state_type &state,
           CUIT &in_next,
           CUST in_end,
           character_type &c,
           int &decoded_code_units)
    noexcept( /* noexcept criteria */ )
    {
      decoded_code_units = 0;
      // Decode the code unit sequence being careful not to read beyond 'in_end'.
      // Attempt to decode exactly one state transition or character.
      // Increment 'decoded_code_units' as each code unit is read respecting exception safety.
      if ( /* 'in_next' == 'in_end' before a complete code unit sequence is decoded */ ) {
        return std::experimental::text::decode_status::underflow;
      }
      if ( /* an invalid code unit sequence is read */ ) {
        return std::experimental::text::decode_status::invalid_code_unit_sequence;
      }
      if ( /* a state transition is decoded */ ) {
        // Update 'state' as necessary.
        return std::experimental::text::decode_status::no_character;
      }
      // Update 'c' and return 'decode_status::no_error'.
      return std::experimental::text::decode_status::no_error;
    }

  // rdecode need only be implemented to support bidirectional encodings.
  // Its parameters and return values follow the same outline as 'decode'.
  template<std::experimental::text::CodeUnitIterator CUIT, typename CUST>
    requires std::experimental::ranges::ForwardIterator<CUIT>()
          && std::experimental::ranges::Convertible<
               std::experimental::ranges::value_type_t<CUIT>, code_unit_type>()
          && std::experimental::ranges::Sentinel<CUST, CUIT>()
    static std::experimental::text::decode_status
    rdecode(state_type &state,
            CUIT &in_next,
            CUST in_end,
            character_type &c,
            int &decoded_code_units)
    noexcept( /* noexcept criteria */ )
    {
      decoded_code_units = 0;
      // Decode the code unit sequence being careful not to read beyond 'in_end'.
      // Attempt to decode exactly one state transition or character.
      // Increment 'decoded_code_units' as each code unit is read respecting exception safety.
      if ( /* 'in_next' == 'in_end' before a complete code unit sequence is decoded */ ) {
        return std::experimental::text::decode_status::underflow;
      }
      if ( /* an invalid code unit sequence is read */ ) {
        return std::experimental::text::decode_status::invalid_code_unit_sequence;
      }
      if ( /* a state transition is decoded */ ) {
        // Update 'state' as necessary.
        return std::experimental::text::decode_status::no_character;
      }
      // Update 'c' and return 'decode_status::no_error'.
      return std::experimental::text::decode_status::no_error;
    }
};

With the above in place, text views and output text iterators can be constructed just as for any provided encoding:

// Constructing a text_view over an array of code units:
unsigned char s[] = { ... };
auto tv = std::experimental::text::make_text_view<my_encoding>(s);

// Constructing an otext_iterator that appends the code units it writes to a vector:
std::vector<unsigned char> v;
auto oti = std::experimental::text::make_otext_iterator<my_encoding>(
             std::back_inserter(v));
Clone this wiki locally