Skip to content

Commit 60d06e3

Browse files
committed
jsontext: proof-of-concept ReadStringAsStream
This is an idea for making it possible to have explicit control over what happens when reading strings. For example, an implementation could choose to decode on the fly or discard large strings but still keep going without using a large amount of memory. We include a somewhat involved example demonstrating some possibilities. Signed-off-by: Roger Peppe <[email protected]>
1 parent d3c622f commit 60d06e3

File tree

8 files changed

+385
-25
lines changed

8 files changed

+385
-25
lines changed

bench_test.go

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,13 @@ import (
2929
var benchVersion = cmp.Or(os.Getenv("BENCHMARK_VERSION"), "v2")
3030

3131
var jsonFuncs = func() (funcs struct {
32-
marshal func(any) ([]byte, error)
33-
unmarshal func([]byte, any) error
34-
encodeValue func(w io.Writer, b []byte) error
35-
encodeTokens func(w io.Writer, toks []jsontext.Token) error
36-
decodeValue func(r io.Reader) error
37-
decodeTokens func(r io.Reader) error
32+
marshal func(any) ([]byte, error)
33+
unmarshal func([]byte, any) error
34+
encodeValue func(w io.Writer, b []byte) error
35+
encodeTokens func(w io.Writer, toks []jsontext.Token) error
36+
decodeValue func(r io.Reader) error
37+
decodeTokens func(r io.Reader) error
38+
decodeTokensStream func(r io.Reader) error
3839
}) {
3940
ignoreEOF := func(err error) error {
4041
if err == io.EOF {
@@ -62,6 +63,8 @@ var jsonFuncs = func() (funcs struct {
6263
}
6364
}
6465
}
66+
funcs.decodeTokensStream = funcs.decodeTokens
67+
6568
case "v1in2":
6669
funcs.marshal = jsonv1in2.Marshal
6770
funcs.unmarshal = jsonv1in2.Unmarshal
@@ -80,6 +83,7 @@ var jsonFuncs = func() (funcs struct {
8083
}
8184
}
8285
}
86+
funcs.decodeTokensStream = funcs.decodeTokens
8387
case "v2":
8488
funcs.marshal = func(v any) ([]byte, error) { return jsonv2.Marshal(v) }
8589
funcs.unmarshal = func(b []byte, v any) error { return jsonv2.Unmarshal(b, v) }
@@ -107,6 +111,22 @@ var jsonFuncs = func() (funcs struct {
107111
}
108112
}
109113
}
114+
funcs.decodeTokensStream = func(r io.Reader) error {
115+
d := jsontext.NewDecoder(r, jsontext.AllowDuplicateNames(true))
116+
for {
117+
if d.PeekKind() == '"' {
118+
for _, err := range d.ReadStringAsSeq() {
119+
if err != nil {
120+
return err
121+
}
122+
}
123+
return nil
124+
}
125+
if _, err := d.ReadToken(); err != nil {
126+
return ignoreEOF(err)
127+
}
128+
}
129+
}
110130
default:
111131
panic("unknown version: " + benchVersion)
112132
}
@@ -413,11 +433,14 @@ func runAllTestdata(tb testing.TB) {
413433
})
414434
}
415435
}
416-
417436
tokens := mustDecodeTokens(tb, td.Data())
418437
buffer := make([]byte, 0, 2*len(td.Data()))
419438
for _, codeName := range []string{"Encode", "Decode"} {
420-
for _, typeName := range []string{"Token", "Value"} {
439+
for _, typeName := range []string{"Token", "TokenStream", "Value"} {
440+
if codeName == "Encode" && typeName == "TokenStream" {
441+
// no token stream for encoding.
442+
continue
443+
}
421444
for _, modeName := range []string{"Streaming", "Buffered"} {
422445
name := path.Join(td.Name, codeName, typeName, modeName)
423446
runTestOrBench(tb, name, int64(td.Size), func(tb testing.TB) {
@@ -531,6 +554,10 @@ func runDecode(t testing.TB, typeName, modeName string, buffer, data []byte, tok
531554
if err := jsonFuncs.decodeTokens(r); err != nil {
532555
t.Fatalf("Decoder.ReadToken error: %v", err)
533556
}
557+
case "TokenStream":
558+
if err := jsonFuncs.decodeTokensStream(r); err != nil {
559+
t.Fatalf("Decoder.ReadTokenStream error: %v", err)
560+
}
534561
case "Value":
535562
if err := jsonFuncs.decodeValue(r); err != nil {
536563
t.Fatalf("Decoder.ReadValue error: %v", err)
@@ -555,7 +582,7 @@ func BenchmarkSlowStreamingDecode(b *testing.B) { runAllSlowStreamingDecode(
555582

556583
func runAllSlowStreamingDecode(tb testing.TB) {
557584
for _, td := range slowStreamingDecodeTestdata {
558-
for _, typeName := range []string{"Token", "Value"} {
585+
for _, typeName := range []string{"Token", "TokenStream", "Value"} {
559586
name := path.Join(td.name, typeName)
560587
runTestOrBench(tb, name, len64(td.data), func(tb testing.TB) {
561588
runSlowStreamingDecode(tb, typeName, td.data)
@@ -570,6 +597,10 @@ func runAllSlowStreamingDecode(tb testing.TB) {
570597
func runSlowStreamingDecode(t testing.TB, typeName string, data []byte) {
571598
r := iotest.OneByteReader(bytes.NewReader(data))
572599
switch typeName {
600+
case "TokenStream":
601+
if err := jsonFuncs.decodeTokensStream(r); err != nil {
602+
t.Fatalf("Decoder.ReadToken error: %v", err)
603+
}
573604
case "Token":
574605
if err := jsonFuncs.decodeTokens(r); err != nil {
575606
t.Fatalf("Decoder.ReadToken error: %v", err)

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@
44
module github.com/go-json-experiment/json
55

66
go 1.24
7+
8+
require github.com/rogpeppe/ioseq v0.0.0-20250405133649-1cc137273d25

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
github.com/rogpeppe/ioseq v0.0.0-20250405133649-1cc137273d25 h1:HikF06tRfzHScFVzW7N1v+BDqcv4e34b5SbEMZnKP10=
2+
github.com/rogpeppe/ioseq v0.0.0-20250405133649-1cc137273d25/go.mod h1:Rs8CeC6rysLMoBy6MSLofmizxZm7lg52K8Wu2AYHVpA=

internal/jsonwire/decode.go

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -93,16 +93,28 @@ func ConsumeLiteral(b []byte, lit string) (n int, err error) {
9393
// non-zero then we know that the string would be encoded the same way
9494
// under both v1 or v2 escape semantics.
9595
func ConsumeSimpleString(b []byte) (n int) {
96+
return ConsumeSimpleStringPartial(b, true)
97+
}
98+
99+
// ConsumeSimpleStringPartial is like ConsumeSimpleString but can
100+
// also work when not at the start of a string, indicated by atStart.
101+
func ConsumeSimpleStringPartial(b []byte, atStart bool) (n int) {
96102
// NOTE: The arguments and logic are kept simple to keep this inlinable.
97-
if len(b) > 0 && b[0] == '"' {
98-
n++
99-
for len(b) > n && b[n] < utf8.RuneSelf && escapeASCII[b[n]] == 0 {
100-
n++
101-
}
102-
if uint(len(b)) > uint(n) && b[n] == '"' {
103-
n++
104-
return n
103+
if len(b) == 0 {
104+
return 0
105+
}
106+
if atStart {
107+
if b[0] != '"' {
108+
return 0
105109
}
110+
n++
111+
}
112+
for len(b) > n && b[n] < utf8.RuneSelf && escapeASCII[b[n]] == 0 {
113+
n++
114+
}
115+
if uint(len(b)) > uint(n) && b[n] == '"' {
116+
n++
117+
return n
106118
}
107119
return 0
108120
}
@@ -116,12 +128,18 @@ func ConsumeString(flags *ValueFlags, b []byte, validateUTF8 bool) (n int, err e
116128
return ConsumeStringResumable(flags, b, 0, validateUTF8)
117129
}
118130

119-
// ConsumeStringResumable is identical to consumeString but supports resuming
131+
// ConsumeStringResumable is identical to ConsumeString but supports resuming
120132
// from a previous call that returned io.ErrUnexpectedEOF.
121133
func ConsumeStringResumable(flags *ValueFlags, b []byte, resumeOffset int, validateUTF8 bool) (n int, err error) {
134+
return ConsumeStringResumablePartial(flags, b, resumeOffset, true, validateUTF8)
135+
}
136+
137+
// ConsumeStringResumablePartial is like ConsumeStringResumable but also supports
138+
// starting midway through the string, indicated by atStart.
139+
func ConsumeStringResumablePartial(flags *ValueFlags, b []byte, resumeOffset int, atStart bool, validateUTF8 bool) (n int, err error) {
122140
// Consume the leading double quote.
123141
switch {
124-
case resumeOffset > 0:
142+
case !atStart:
125143
n = resumeOffset // already handled the leading quote
126144
case uint(len(b)) == 0:
127145
return n, io.ErrUnexpectedEOF
@@ -248,18 +266,28 @@ func ConsumeStringResumable(flags *ValueFlags, b []byte, resumeOffset int, valid
248266
return n, io.ErrUnexpectedEOF
249267
}
250268

251-
// AppendUnquote appends the unescaped form of a JSON string in src to dst.
269+
// AppendUnquote appends the unescaped form of a JSON string in src to dst.
252270
// Any invalid UTF-8 within the string will be replaced with utf8.RuneError,
253271
// but the error will be specified as having encountered such an error.
254272
// The input must be an entire JSON string with no surrounding whitespace.
273+
//
274+
// It returns [io.ErrUnexpectedEOF] if the source terminates before
275+
// the final quote.
255276
func AppendUnquote[Bytes ~[]byte | ~string](dst []byte, src Bytes) (v []byte, err error) {
277+
return AppendUnquotePartial(dst, src, true)
278+
}
279+
280+
// AppendUnquotePartial is like AppendUnquote except that the string
281+
// need not start at the beginning (atStart indicates whether it does).
282+
func AppendUnquotePartial[Bytes ~[]byte | ~string](dst []byte, src Bytes, atStart bool) (v []byte, err error) {
256283
dst = slices.Grow(dst, len(src))
257284

258285
// Consume the leading double quote.
259286
var i, n int
260287
switch {
261288
case uint(len(src)) == 0:
262289
return dst, io.ErrUnexpectedEOF
290+
case !atStart:
263291
case src[0] == '"':
264292
i, n = 1, 1
265293
default:
@@ -409,11 +437,26 @@ func hasEscapedUTF16Prefix[Bytes ~[]byte | ~string](b Bytes, lowerSurrogateHalf
409437
// Otherwise, a new buffer is allocated for the output.
410438
// It assumes the input is valid.
411439
func UnquoteMayCopy(b []byte, isVerbatim bool) []byte {
440+
return UnquoteMayCopyPartial(b, true, isVerbatim)
441+
}
442+
443+
// UnquoteMayCopyPartial is like UnquoteMayCopy except that
444+
// it can also be used to unquote a string that's not at the start.
445+
// The atStart argument indicates whether that's the case.
446+
func UnquoteMayCopyPartial(b []byte, atStart bool, isVerbatim bool) []byte {
412447
// NOTE: The arguments and logic are kept simple to keep this inlinable.
413448
if isVerbatim {
414-
return b[len(`"`) : len(b)-len(`"`)]
449+
if atStart {
450+
b = b[len(`"`):]
451+
}
452+
// When unquoting a streaming string, the final quote may be
453+
// omitted.
454+
if len(b) > 0 && b[len(b)-1] == '"' {
455+
b = b[:len(b)-len(`"`)]
456+
}
457+
return b
415458
}
416-
b, _ = AppendUnquote(nil, b)
459+
b, _ = AppendUnquotePartial(nil, b, atStart)
417460
return b
418461
}
419462

internal/jsonwire/encode.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,19 @@ func NeedEscape[Bytes ~[]byte | ~string](src Bytes) bool {
6161
// If no escape flags are set, then the shortest representable form is used,
6262
// which is also the canonical form for strings (RFC 8785, section 3.2.2.2).
6363
func AppendQuote[Bytes ~[]byte | ~string](dst []byte, src Bytes, flags *jsonflags.Flags) ([]byte, error) {
64-
var i, n int
65-
var hasInvalidUTF8 bool
6664
dst = slices.Grow(dst, len(`"`)+len(src)+len(`"`))
6765
dst = append(dst, '"')
66+
dst, err := AppendQuotePartial(dst, src, flags)
67+
dst = append(dst, '"')
68+
return dst, err
69+
}
70+
71+
// AppendQuotePartial is like AppendQuote except that it can also be used to encode content
72+
// within a string. If entire is false, then neither the starting or ending quotes will be appended.
73+
func AppendQuotePartial[Bytes ~[]byte | ~string](dst []byte, src Bytes, flags *jsonflags.Flags) ([]byte, error) {
74+
dst = slices.Grow(dst, len(src))
75+
var i, n int
76+
var hasInvalidUTF8 bool
6877
for uint(len(src)) > uint(n) {
6978
if c := src[n]; c < utf8.RuneSelf {
7079
// Handle single-byte ASCII.
@@ -104,7 +113,6 @@ func AppendQuote[Bytes ~[]byte | ~string](dst []byte, src Bytes, flags *jsonflag
104113
}
105114
}
106115
dst = append(dst, src[i:n]...)
107-
dst = append(dst, '"')
108116
if hasInvalidUTF8 && !flags.Get(jsonflags.AllowInvalidUTF8) {
109117
return dst, ErrInvalidUTF8
110118
}

jsontext/decode.go

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ package jsontext
77
import (
88
"bytes"
99
"errors"
10+
"fmt"
1011
"io"
12+
"iter"
1113

1214
"github.com/go-json-experiment/json/internal/jsonflags"
1315
"github.com/go-json-experiment/json/internal/jsonopts"
@@ -459,6 +461,101 @@ func (d *decoderState) SkipUntil(depth int, length int64) error {
459461
func (d *Decoder) ReadToken() (Token, error) {
460462
return d.s.ReadToken()
461463
}
464+
465+
// ReadStringAsSeq reads a string token as a sequence of
466+
// byte slices that are available as the contents of the string
467+
// are accumulated. If the next token is not a string, the sequence
468+
// immediately terminates with an error.
469+
//
470+
// The byte slices must not escape the iterator.
471+
//
472+
// It also returns an error if the AllowDuplicateNames option
473+
// is not set and the string is an object key.
474+
//
475+
// If the iterator terminates before completion, no
476+
// more tokens may be read from the decoder.
477+
// Any attempt to read tokens with the decoder while the iteration
478+
// is in progress will result in an error.
479+
func (dec *Decoder) ReadStringAsSeq() iter.Seq2[[]byte, error] {
480+
return dec.s.ReadStringAsSeq()
481+
}
482+
483+
var errNotString = fmt.Errorf("cannot stream non-string")
484+
var errCannotStreamObjectKeyWithoutAllowDuplicateNames = fmt.Errorf("cannot stream object key without enabling duplicate names")
485+
var errStringStreamAborted = fmt.Errorf("stream aborted")
486+
487+
func (d *decoderState) ReadStringAsSeq() iter.Seq2[[]byte, error] {
488+
return func(yield func([]byte, error) bool) {
489+
if err := d.readStringAsSeq(yield); err != nil {
490+
yield(nil, err)
491+
}
492+
}
493+
}
494+
495+
func (d *decoderState) readStringAsSeq(yield func([]byte, error) bool) error {
496+
if d.PeekKind() != '"' {
497+
err := d.peekErr
498+
if err == nil {
499+
err = errNotString
500+
}
501+
return err
502+
}
503+
if d.Tokens.Last.NeedObjectName() && !d.Flags.Get(jsonflags.AllowDuplicateNames) {
504+
return errCannotStreamObjectKeyWithoutAllowDuplicateNames
505+
}
506+
pos := d.peekPos
507+
d.peekPos = 0
508+
if err := d.Tokens.appendString(); err != nil {
509+
return wrapSyntacticError(d, err, pos, +1) // report position at start of string
510+
}
511+
var flags jsonwire.ValueFlags
512+
if n := jsonwire.ConsumeSimpleString(d.buf[pos:]); n > 0 {
513+
if data := jsonwire.UnquoteMayCopy(d.buf[pos:pos+n], true); len(data) > 0 {
514+
yield(data, nil)
515+
}
516+
return nil
517+
}
518+
atStart := true
519+
for {
520+
n, err := jsonwire.ConsumeStringResumablePartial(&flags, d.buf[pos:], 0, atStart, true)
521+
if err == io.ErrUnexpectedEOF {
522+
pos += n
523+
if n > 0 {
524+
if data := jsonwire.UnquoteMayCopyPartial(d.buf[pos-n:pos], atStart, flags.IsVerbatim()); len(data) > 0 {
525+
if !yield(data, nil) {
526+
d.peekPos = -1
527+
d.peekErr = errStringStreamAborted
528+
return nil
529+
}
530+
} else {
531+
}
532+
} else {
533+
}
534+
// Discard previous string content.
535+
d.prevStart, d.prevEnd = pos, pos
536+
absPos := d.baseOffset + int64(pos)
537+
err := d.fetch()
538+
pos = int(absPos - d.baseOffset)
539+
if err != nil {
540+
return err
541+
}
542+
atStart = false
543+
continue
544+
}
545+
pos += n
546+
// No need to set errStringStreamAborted because we know that
547+
// we're in a good state to proceed and we're not going to call yield
548+
// again.
549+
if data := jsonwire.UnquoteMayCopyPartial(d.buf[pos-n:pos], atStart, flags.IsVerbatim()); len(data) > 0 {
550+
yield(data, err)
551+
}
552+
553+
// Discard previous string content.
554+
d.prevStart, d.prevEnd = pos, pos
555+
return err
556+
}
557+
}
558+
462559
func (d *decoderState) ReadToken() (Token, error) {
463560
// Determine the next kind.
464561
var err error

0 commit comments

Comments
 (0)