Skip to content

Multi-lined envelope reading corruption caused by direct reference into bufio.Reader's internal buffer rollover. #214

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 33 additions & 4 deletions extensions/omniv21/fileformat/flatfile/fixedlength/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ import (
)

type line struct {
lineNum int // 1-based
b []byte
lineNum int // 1-based
b []byte // either a copy of a line content or a direct ref into bufio.Reader.
copied bool // see notes in reader.readLine()
}

type reader struct {
Expand Down Expand Up @@ -148,9 +149,37 @@ func (r *reader) readAndMatchHeaderFooterBasedEnvelope(
}

func (r *reader) readLine() error {
// https://github.com/jf-tech/omniparser/issues/213
//
// If we're dealing with multi-lined envelope (either by rows or by header/footer), this
// readLine() will be called several times, thus whatever ios.ByteReadLine, which uses
// bufio.Reader underneath, returns in a previous call may be potentially be invalidated due to
// bufio.Reader's internal buf rollover. If we read the previous line directly, it would cause
// corruption.
//
// To fix the problem the easiest solution would be simply copying the return []byte from
// ios.ByteReadLine every single time. But for files with single-line envelope, which are the
// vast majority cases, this copy becomes unnecessary and burdensome on gc. So the trick is to
// has a flag on reader.linesBuf's last element to tell if it contains a reference into the
// bufio.Reader's internal buffer, or it's a copy. Every time before we call bufio.Reader read,
// we check reader.liensBuf's last element flag, if it is not a copy, then we will turn it into
// a copy.
//
// This way, we optimize for the vast majority cases without
// needing allocations, and avoid any potential corruptions in the multi-lined envelope cases.
linesBufLen := len(r.linesBuf)
if linesBufLen > 0 && !r.linesBuf[linesBufLen-1].copied {
cp := make([]byte, len(r.linesBuf[linesBufLen-1].b))
copy(cp, r.linesBuf[linesBufLen-1].b)
r.linesBuf[linesBufLen-1].b = cp
r.linesBuf[linesBufLen-1].copied = true
}
for {
// note1: ios.ByteReadLine returns a ln with trailing '\n' (and/or '\r') dropped.
// note2: ios.ByteReadLine won't return io.EOF if ln returned isn't empty.
// note1: ios.ByteReadLine returns a line with trailing '\n' (and/or '\r') dropped.
// note2: ios.ByteReadLine won't return io.EOF if line returned isn't empty.
// note3: ios.ByteReadLine's returned []byte is merely pointing into the bufio.Reader's
// internal buffer, thus the content will be invalided if ios.ByteReadLine is called
// again. Caution!
b, err := ios.ByteReadLine(r.r)
switch {
case err == io.EOF:
Expand Down
31 changes: 31 additions & 0 deletions extensions/omniv21/fileformat/flatfile/fixedlength/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,37 @@ func TestReadLine(t *testing.T) {
assert.Equal(t, line{lineNum: 45, b: []byte("a new line")}, r.linesBuf[2])
}

func TestReadLineMultiCalls(t *testing.T) {
// See more details about the bug: https://github.com/jf-tech/omniparser/issues/213
r := &reader{inputName: "test-input"}
// construct a two-line input, and total length of the two line input is longer than
// the bufio.Reader's internal buffer. Note bufio.Reader has a minReadBufferSize at 16.
r.r = bufio.NewReaderSize(strings.NewReader("0123456789\nabcdefghijklmnopq\n!@#"), 16)
// First read should be completely normal and fine.
err := r.readLine()
assert.NoError(t, err)
assert.Equal(t, 1, r.linesRead)
assert.Equal(t, 1, len(r.linesBuf))
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: false}, r.linesBuf[0])
// Now the second read, if we didn't do any copying of the first line result, the first line
// would have become corrupted since the bufio.Reader's tiny internal buffer gets overwritten
// by the new line data. With the fix, both lines in the linesBuf should be without corruption.
err = r.readLine()
assert.NoError(t, err)
assert.Equal(t, 2, r.linesRead)
assert.Equal(t, 2, len(r.linesBuf))
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: true}, r.linesBuf[0])
assert.Equal(t, line{lineNum: 2, b: []byte("abcdefghijklmnopq"), copied: false}, r.linesBuf[1])
// 3rd line reading
err = r.readLine()
assert.NoError(t, err)
assert.Equal(t, 3, r.linesRead)
assert.Equal(t, 3, len(r.linesBuf))
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: true}, r.linesBuf[0])
assert.Equal(t, line{lineNum: 2, b: []byte("abcdefghijklmnopq"), copied: true}, r.linesBuf[1])
assert.Equal(t, line{lineNum: 3, b: []byte("!@#"), copied: false}, r.linesBuf[2])
}

func TestLinesToNode(t *testing.T) {
for _, test := range []struct {
name string
Expand Down
8 changes: 4 additions & 4 deletions extensions/omniv21/samples/fixedlength2/fixedlength_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,22 +108,22 @@ func Test4_Nested(t *testing.T) {
tests[test4_Nested].doTest(t)
}

// Benchmark1_Single_Row-8 25898 45728 ns/op 28169 B/op 647 allocs/op
// Benchmark1_Single_Row-8 25951 45776 ns/op 28213 B/op 645 allocs/op
func Benchmark1_Single_Row(b *testing.B) {
tests[test1_Single_Row].doBenchmark(b)
}

// Benchmark2_Multi_Rows-8 15338 78273 ns/op 30179 B/op 618 allocs/op
// Benchmark2_Multi_Rows-8 18583 64240 ns/op 34056 B/op 636 allocs/op
func Benchmark2_Multi_Rows(b *testing.B) {
tests[test2_Multi_Rows].doBenchmark(b)
}

// Benchmark3_Header_Footer-8 6390 178301 ns/op 76571 B/op 1501 allocs/op
// Benchmark3_Header_Footer-8 7306 158797 ns/op 78666 B/op 1587 allocs/op
func Benchmark3_Header_Footer(b *testing.B) {
tests[test3_Header_Footer].doBenchmark(b)
}

// Benchmark4_Nested-8 10000 107948 ns/op 78986 B/op 1535 allocs/op
// Benchmark4_Nested-8 10000 107955 ns/op 80929 B/op 1515 allocs/op
func Benchmark4_Nested(b *testing.B) {
tests[test4_Nested].doBenchmark(b)
}