dispatch/vendor/github.com/blevesearch/segment/segment_test.go
2016-03-01 01:51:26 +01:00

242 lines
6.5 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package segment
import (
"bufio"
"bytes"
"errors"
"io"
"strings"
"testing"
)
// Tests borrowed from Scanner to test Segmenter
// slowReader is a reader that returns only a few bytes at a time, to test the incremental
// reads in Scanner.Scan.
type slowReader struct {
max int
buf io.Reader
}
func (sr *slowReader) Read(p []byte) (n int, err error) {
if len(p) > sr.max {
p = p[0:sr.max]
}
return sr.buf.Read(p)
}
// genLine writes to buf a predictable but non-trivial line of text of length
// n, including the terminal newline and an occasional carriage return.
// If addNewline is false, the \r and \n are not emitted.
func genLine(buf *bytes.Buffer, lineNum, n int, addNewline bool) {
buf.Reset()
doCR := lineNum%5 == 0
if doCR {
n--
}
for i := 0; i < n-1; i++ { // Stop early for \n.
c := 'a' + byte(lineNum+i)
if c == '\n' || c == '\r' { // Don't confuse us.
c = 'N'
}
buf.WriteByte(c)
}
if addNewline {
if doCR {
buf.WriteByte('\r')
}
buf.WriteByte('\n')
}
return
}
func wrapSplitFuncAsSegmentFuncForTesting(splitFunc bufio.SplitFunc) SegmentFunc {
return func(data []byte, atEOF bool) (advance int, token []byte, typ int, err error) {
typ = 0
advance, token, err = splitFunc(data, atEOF)
return
}
}
// Test that the line segmenter errors out on a long line.
func TestSegmentTooLong(t *testing.T) {
const smallMaxTokenSize = 256 // Much smaller for more efficient testing.
// Build a buffer of lots of line lengths up to but not exceeding smallMaxTokenSize.
tmp := new(bytes.Buffer)
buf := new(bytes.Buffer)
lineNum := 0
j := 0
for i := 0; i < 2*smallMaxTokenSize; i++ {
genLine(tmp, lineNum, j, true)
j++
buf.Write(tmp.Bytes())
lineNum++
}
s := NewSegmenter(&slowReader{3, buf})
// change to line segmenter for testing
s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(bufio.ScanLines))
s.MaxTokenSize(smallMaxTokenSize)
j = 0
for lineNum := 0; s.Segment(); lineNum++ {
genLine(tmp, lineNum, j, false)
if j < smallMaxTokenSize {
j++
} else {
j--
}
line := tmp.Bytes()
if !bytes.Equal(s.Bytes(), line) {
t.Errorf("%d: bad line: %d %d\n%.100q\n%.100q\n", lineNum, len(s.Bytes()), len(line), s.Bytes(), line)
}
}
err := s.Err()
if err != ErrTooLong {
t.Fatalf("expected ErrTooLong; got %s", err)
}
}
var testError = errors.New("testError")
// Test the correct error is returned when the split function errors out.
func TestSegmentError(t *testing.T) {
// Create a split function that delivers a little data, then a predictable error.
numSplits := 0
const okCount = 7
errorSplit := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
if atEOF {
panic("didn't get enough data")
}
if numSplits >= okCount {
return 0, nil, testError
}
numSplits++
return 1, data[0:1], nil
}
// Read the data.
const text = "abcdefghijklmnopqrstuvwxyz"
buf := strings.NewReader(text)
s := NewSegmenter(&slowReader{1, buf})
// change to line segmenter for testing
s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(errorSplit))
var i int
for i = 0; s.Segment(); i++ {
if len(s.Bytes()) != 1 || text[i] != s.Bytes()[0] {
t.Errorf("#%d: expected %q got %q", i, text[i], s.Bytes()[0])
}
}
// Check correct termination location and error.
if i != okCount {
t.Errorf("unexpected termination; expected %d tokens got %d", okCount, i)
}
err := s.Err()
if err != testError {
t.Fatalf("expected %q got %v", testError, err)
}
}
// Test that Scan finishes if we have endless empty reads.
type endlessZeros struct{}
func (endlessZeros) Read(p []byte) (int, error) {
return 0, nil
}
func TestBadReader(t *testing.T) {
scanner := NewSegmenter(endlessZeros{})
for scanner.Segment() {
t.Fatal("read should fail")
}
err := scanner.Err()
if err != io.ErrNoProgress {
t.Errorf("unexpected error: %v", err)
}
}
func TestSegmentAdvanceNegativeError(t *testing.T) {
errorSplit := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
if atEOF {
panic("didn't get enough data")
}
return -1, data[0:1], nil
}
// Read the data.
const text = "abcdefghijklmnopqrstuvwxyz"
buf := strings.NewReader(text)
s := NewSegmenter(&slowReader{1, buf})
// change to line segmenter for testing
s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(errorSplit))
s.Segment()
err := s.Err()
if err != ErrNegativeAdvance {
t.Fatalf("expected %q got %v", testError, err)
}
}
func TestSegmentAdvanceTooFarError(t *testing.T) {
errorSplit := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
if atEOF {
panic("didn't get enough data")
}
return len(data) + 10, data[0:1], nil
}
// Read the data.
const text = "abcdefghijklmnopqrstuvwxyz"
buf := strings.NewReader(text)
s := NewSegmenter(&slowReader{1, buf})
// change to line segmenter for testing
s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(errorSplit))
s.Segment()
err := s.Err()
if err != ErrAdvanceTooFar {
t.Fatalf("expected %q got %v", testError, err)
}
}
func TestSegmentLongTokens(t *testing.T) {
// Read the data.
text := bytes.Repeat([]byte("abcdefghijklmnop"), 257)
buf := strings.NewReader(string(text))
s := NewSegmenter(&slowReader{1, buf})
// change to line segmenter for testing
s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(bufio.ScanLines))
for s.Segment() {
line := s.Bytes()
if !bytes.Equal(text, line) {
t.Errorf("expected %s, got %s", text, line)
}
}
err := s.Err()
if err != nil {
t.Fatalf("unexpected error; got %s", err)
}
}
func TestSegmentLongTokensDontDouble(t *testing.T) {
// Read the data.
text := bytes.Repeat([]byte("abcdefghijklmnop"), 257)
buf := strings.NewReader(string(text))
s := NewSegmenter(&slowReader{1, buf})
// change to line segmenter for testing
s.SetSegmenter(wrapSplitFuncAsSegmentFuncForTesting(bufio.ScanLines))
s.MaxTokenSize(6144)
for s.Segment() {
line := s.Bytes()
if !bytes.Equal(text, line) {
t.Errorf("expected %s, got %s", text, line)
}
}
err := s.Err()
if err != nil {
t.Fatalf("unexpected error; got %s", err)
}
}