diff --git a/extensions/omniv21/fileformat/flatfile/fixedlength/reader.go b/extensions/omniv21/fileformat/flatfile/fixedlength/reader.go index 40bdd56..5279a43 100644 --- a/extensions/omniv21/fileformat/flatfile/fixedlength/reader.go +++ b/extensions/omniv21/fileformat/flatfile/fixedlength/reader.go @@ -14,8 +14,9 @@ import ( ) type line struct { - lineNum int // 1-based - b []byte + lineNum int // 1-based + b []byte // either a copy of a line content or a direct ref into bufio.Reader. + copied bool // see notes in reader.readLine() } type reader struct { @@ -148,9 +149,37 @@ func (r *reader) readAndMatchHeaderFooterBasedEnvelope( } func (r *reader) readLine() error { + // https://github.com/jf-tech/omniparser/issues/213 + // + // If we're dealing with multi-lined envelope (either by rows or by header/footer), this + // readLine() will be called several times, thus whatever ios.ByteReadLine, which uses + // bufio.Reader underneath, returns in a previous call may be potentially be invalidated due to + // bufio.Reader's internal buf rollover. If we read the previous line directly, it would cause + // corruption. + // + // To fix the problem the easiest solution would be simply copying the return []byte from + // ios.ByteReadLine every single time. But for files with single-line envelope, which are the + // vast majority cases, this copy becomes unnecessary and burdensome on gc. So the trick is to + // has a flag on reader.linesBuf's last element to tell if it contains a reference into the + // bufio.Reader's internal buffer, or it's a copy. Every time before we call bufio.Reader read, + // we check reader.liensBuf's last element flag, if it is not a copy, then we will turn it into + // a copy. + // + // This way, we optimize for the vast majority cases without + // needing allocations, and avoid any potential corruptions in the multi-lined envelope cases. + linesBufLen := len(r.linesBuf) + if linesBufLen > 0 && !r.linesBuf[linesBufLen-1].copied { + cp := make([]byte, len(r.linesBuf[linesBufLen-1].b)) + copy(cp, r.linesBuf[linesBufLen-1].b) + r.linesBuf[linesBufLen-1].b = cp + r.linesBuf[linesBufLen-1].copied = true + } for { - // note1: ios.ByteReadLine returns a ln with trailing '\n' (and/or '\r') dropped. - // note2: ios.ByteReadLine won't return io.EOF if ln returned isn't empty. + // note1: ios.ByteReadLine returns a line with trailing '\n' (and/or '\r') dropped. + // note2: ios.ByteReadLine won't return io.EOF if line returned isn't empty. + // note3: ios.ByteReadLine's returned []byte is merely pointing into the bufio.Reader's + // internal buffer, thus the content will be invalided if ios.ByteReadLine is called + // again. Caution! b, err := ios.ByteReadLine(r.r) switch { case err == io.EOF: diff --git a/extensions/omniv21/fileformat/flatfile/fixedlength/reader_test.go b/extensions/omniv21/fileformat/flatfile/fixedlength/reader_test.go index 062edf1..4639c6c 100644 --- a/extensions/omniv21/fileformat/flatfile/fixedlength/reader_test.go +++ b/extensions/omniv21/fileformat/flatfile/fixedlength/reader_test.go @@ -412,6 +412,37 @@ func TestReadLine(t *testing.T) { assert.Equal(t, line{lineNum: 45, b: []byte("a new line")}, r.linesBuf[2]) } +func TestReadLineMultiCalls(t *testing.T) { + // See more details about the bug: https://github.com/jf-tech/omniparser/issues/213 + r := &reader{inputName: "test-input"} + // construct a two-line input, and total length of the two line input is longer than + // the bufio.Reader's internal buffer. Note bufio.Reader has a minReadBufferSize at 16. + r.r = bufio.NewReaderSize(strings.NewReader("0123456789\nabcdefghijklmnopq\n!@#"), 16) + // First read should be completely normal and fine. + err := r.readLine() + assert.NoError(t, err) + assert.Equal(t, 1, r.linesRead) + assert.Equal(t, 1, len(r.linesBuf)) + assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: false}, r.linesBuf[0]) + // Now the second read, if we didn't do any copying of the first line result, the first line + // would have become corrupted since the bufio.Reader's tiny internal buffer gets overwritten + // by the new line data. With the fix, both lines in the linesBuf should be without corruption. + err = r.readLine() + assert.NoError(t, err) + assert.Equal(t, 2, r.linesRead) + assert.Equal(t, 2, len(r.linesBuf)) + assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: true}, r.linesBuf[0]) + assert.Equal(t, line{lineNum: 2, b: []byte("abcdefghijklmnopq"), copied: false}, r.linesBuf[1]) + // 3rd line reading + err = r.readLine() + assert.NoError(t, err) + assert.Equal(t, 3, r.linesRead) + assert.Equal(t, 3, len(r.linesBuf)) + assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: true}, r.linesBuf[0]) + assert.Equal(t, line{lineNum: 2, b: []byte("abcdefghijklmnopq"), copied: true}, r.linesBuf[1]) + assert.Equal(t, line{lineNum: 3, b: []byte("!@#"), copied: false}, r.linesBuf[2]) +} + func TestLinesToNode(t *testing.T) { for _, test := range []struct { name string diff --git a/extensions/omniv21/samples/fixedlength2/fixedlength_test.go b/extensions/omniv21/samples/fixedlength2/fixedlength_test.go index f2f16fd..81da1e0 100644 --- a/extensions/omniv21/samples/fixedlength2/fixedlength_test.go +++ b/extensions/omniv21/samples/fixedlength2/fixedlength_test.go @@ -108,22 +108,22 @@ func Test4_Nested(t *testing.T) { tests[test4_Nested].doTest(t) } -// Benchmark1_Single_Row-8 25898 45728 ns/op 28169 B/op 647 allocs/op +// Benchmark1_Single_Row-8 25951 45776 ns/op 28213 B/op 645 allocs/op func Benchmark1_Single_Row(b *testing.B) { tests[test1_Single_Row].doBenchmark(b) } -// Benchmark2_Multi_Rows-8 15338 78273 ns/op 30179 B/op 618 allocs/op +// Benchmark2_Multi_Rows-8 18583 64240 ns/op 34056 B/op 636 allocs/op func Benchmark2_Multi_Rows(b *testing.B) { tests[test2_Multi_Rows].doBenchmark(b) } -// Benchmark3_Header_Footer-8 6390 178301 ns/op 76571 B/op 1501 allocs/op +// Benchmark3_Header_Footer-8 7306 158797 ns/op 78666 B/op 1587 allocs/op func Benchmark3_Header_Footer(b *testing.B) { tests[test3_Header_Footer].doBenchmark(b) } -// Benchmark4_Nested-8 10000 107948 ns/op 78986 B/op 1535 allocs/op +// Benchmark4_Nested-8 10000 107955 ns/op 80929 B/op 1515 allocs/op func Benchmark4_Nested(b *testing.B) { tests[test4_Nested].doBenchmark(b) }