Skip to content

Commit

Permalink
Implement legacy escaping of invalid UTF-8 (#81)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsnet authored Dec 21, 2024
1 parent 0580b24 commit 764075c
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 9 deletions.
2 changes: 1 addition & 1 deletion arshal_default.go
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,7 @@ func makeStructArshaler(t reflect.Type) *arshaler {

// Append the token to the output and to the state machine.
n0 := len(b) // offset before calling AppendQuote
if !xe.Flags.Get(jsonflags.EscapeForHTML | jsonflags.EscapeForJS) {
if !xe.Flags.Get(jsonflags.EscapeForHTML | jsonflags.EscapeForJS | jsonflags.EscapeInvalidUTF8) {
b = append(b, f.quotedName...)
} else {
b, _ = jsonwire.AppendQuote(b, f.name, &xe.Flags)
Expand Down
2 changes: 2 additions & 0 deletions internal/jsonflags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ const (
AllowInvalidUTF8 |
EscapeForHTML |
EscapeForJS |
EscapeInvalidUTF8 |
Deterministic |
FormatNilMapAsNull |
FormatNilSliceAsNull |
Expand Down Expand Up @@ -87,6 +88,7 @@ const (
CanonicalizeNumbers // encode only; for internal use by jsontext.Value.Canonicalize
EscapeForHTML // encode only
EscapeForJS // encode only
EscapeInvalidUTF8 // encode only; only exposed in v1
Multiline // encode only
SpaceAfterColon // encode only
SpaceAfterComma // encode only
Expand Down
8 changes: 6 additions & 2 deletions internal/jsonwire/encode.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,11 @@ func AppendQuote[Bytes ~[]byte | ~string](dst []byte, src Bytes, flags *jsonflag
case r == utf8.RuneError && rn == 1:
hasInvalidUTF8 = true
dst = append(dst, src[i:n]...)
dst = append(dst, "\ufffd"...)
if flags.Get(jsonflags.EscapeInvalidUTF8) {
dst = append(dst, `\ufffd`...)
} else {
dst = append(dst, "\ufffd"...)
}
n += rn
i = n
case (r == '\u2028' || r == '\u2029') && flags.Get(jsonflags.EscapeForJS):
Expand Down Expand Up @@ -150,7 +154,7 @@ func ReformatString(dst, src []byte, flags *jsonflags.Flags) ([]byte, int, error
if err != nil {
return dst, n, err
}
isCanonical := !flags.Get(jsonflags.EscapeForHTML | jsonflags.EscapeForJS)
isCanonical := !flags.Get(jsonflags.EscapeForHTML | jsonflags.EscapeForJS | jsonflags.EscapeInvalidUTF8)
if flags.Get(jsonflags.PreserveRawStrings) || (isCanonical && valFlags.IsCanonical()) {
dst = append(dst, src[:n]...) // copy the string verbatim
return dst, n, nil
Expand Down
1 change: 1 addition & 0 deletions jsontext/value.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ func (v *Value) reformat(canonical, multiline bool, prefix, indent string) error
eo.Flags.Set(jsonflags.PreserveRawStrings | 0) // per RFC 8785, section 3.2.2.2
eo.Flags.Set(jsonflags.EscapeForHTML | 0) // per RFC 8785, section 3.2.2.2
eo.Flags.Set(jsonflags.EscapeForJS | 0) // per RFC 8785, section 3.2.2.2
eo.Flags.Set(jsonflags.EscapeInvalidUTF8 | 0) // per RFC 8785, section 3.2.2.2
eo.Flags.Set(jsonflags.Multiline | 0) // per RFC 8785, section 3.2.1
} else {
if s := strings.TrimLeft(prefix, " \t"); len(s) > 0 {
Expand Down
6 changes: 0 additions & 6 deletions v1/failing.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
TestMarshalInvalidUTF8
TestMarshalInvalidUTF8/#00
TestMarshalInvalidUTF8/#02
TestMarshalInvalidUTF8/#03
TestMarshalInvalidUTF8/#04
TestMarshalInvalidUTF8/#05
TestUnmarshal
TestUnmarshal/#106
TestUnmarshal/#107
Expand Down
18 changes: 18 additions & 0 deletions v1/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type Options = jsonopts.Options
// DefaultOptionsV1 is the full set of all options that define v1 semantics.
// It is equivalent to the following boolean options being set to true:
//
// - [EscapeInvalidUTF8]
// - [FormatByteArrayAsArray]
// - [FormatTimeDurationAsNanosecond]
// - [IgnoreStructErrors]
Expand Down Expand Up @@ -64,6 +65,22 @@ func DefaultOptionsV1() Options {
return &jsonopts.DefaultOptionsV1
}

// EscapeInvalidUTF8 controls how bytes of invalid UTF-8 found within
// a JSON string are written when encoding. When enabled, each such byte
// is emitted as the six-character escape sequence \ufffd (the hexadecimal
// escape of the Unicode replacement character). When disabled, the
// Unicode replacement character itself is emitted verbatim, unescaped.
// This option has no effect if [jsontext.AllowInvalidUTF8] is false.
//
// This only affects encoding and is ignored when decoding.
// The v1 default is true.
func EscapeInvalidUTF8(v bool) Options {
	// The low bit carries the boolean value of the flag being set.
	if !v {
		return jsonflags.EscapeInvalidUTF8 | 0
	}
	return jsonflags.EscapeInvalidUTF8 | 1
}

// FormatByteArrayAsArray specifies that a [N]byte array is formatted
// by default as a JSON array of byte values in contrast to v2 default
// of using a JSON string with the base64 encoding of the value.
Expand Down Expand Up @@ -172,6 +189,7 @@ func RejectFloatOverflow(v bool) Options {
// [InvalidUnmarshalError], or [UnmarshalTypeError] instead of the
// [jsonv2.SemanticError] or [jsontext.SyntacticError].
//
// This affects either marshaling or unmarshaling.
// The v1 default is true.
func ReportLegacyErrorValues(v bool) Options {
if v {
Expand Down

0 comments on commit 764075c

Please sign in to comment.