// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package upsidedown

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"reflect"

	"github.com/blevesearch/bleve/size"
	"github.com/golang/protobuf/proto"
)

var reflectStaticSizeTermFrequencyRow int
var reflectStaticSizeTermVector int

func init() {
	var tfr TermFrequencyRow
	reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size())
	var tv TermVector
	reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size())
}

const ByteSeparator byte = 0xff

type UpsideDownCouchRowStream chan UpsideDownCouchRow

type UpsideDownCouchRow interface {
	KeySize() int
	KeyTo([]byte) (int, error)
	Key() []byte
	Value() []byte
	ValueSize() int
	ValueTo([]byte) (int, error)
}

func ParseFromKeyValue(key, value []byte) (UpsideDownCouchRow, error) {
	if len(key) > 0 {
		switch key[0] {
		case 'v':
			return NewVersionRowKV(key, value)
		case 'f':
			return NewFieldRowKV(key, value)
		case 'd':
			return NewDictionaryRowKV(key, value)
		case 't':
			return NewTermFrequencyRowKV(key, value)
		case 'b':
			return NewBackIndexRowKV(key, value)
		case 's':
			return NewStoredRowKV(key, value)
		case 'i':
			return NewInternalRowKV(key, value)
		}
		return nil, fmt.Errorf("Unknown field type '%s'", string(key[0]))
	}
	return nil, fmt.Errorf("Invalid empty key")
}
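
// The first byte of every key identifies the row type, so a raw key/value
// pair read back from the store can be turned into the right row without any
// extra metadata. A minimal sketch of that round trip (values chosen only for
// illustration):
//
//	row := NewFieldRow(0, "title")          // 'f' row for field 0
//	parsed, err := ParseFromKeyValue(row.Key(), row.Value())
//	if err == nil {
//		fmt.Println(parsed.(*FieldRow)) // Field: 0 Name: title
//	}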

// VERSION

type VersionRow struct {
	version uint8
}

func (v *VersionRow) Key() []byte {
	return []byte{'v'}
}

func (v *VersionRow) KeySize() int {
	return 1
}

func (v *VersionRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'v'
	return 1, nil
}

func (v *VersionRow) Value() []byte {
	return []byte{byte(v.version)}
}

func (v *VersionRow) ValueSize() int {
	return 1
}

func (v *VersionRow) ValueTo(buf []byte) (int, error) {
	buf[0] = v.version
	return 1, nil
}

func (v *VersionRow) String() string {
	return fmt.Sprintf("Version: %d", v.version)
}

func NewVersionRow(version uint8) *VersionRow {
	return &VersionRow{
		version: version,
	}
}

func NewVersionRowKV(key, value []byte) (*VersionRow, error) {
	rv := VersionRow{}
	buf := bytes.NewBuffer(value)
	err := binary.Read(buf, binary.LittleEndian, &rv.version)
	if err != nil {
		return nil, err
	}
	return &rv, nil
}
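
// On disk a version row is simply the key 'v' with a one-byte value holding
// the index format version. A brief sketch:
//
//	vr := NewVersionRow(5)
//	// vr.Key()   -> []byte{'v'}
//	// vr.Value() -> []byte{0x05}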

// INTERNAL STORAGE

type InternalRow struct {
	key []byte
	val []byte
}

func (i *InternalRow) Key() []byte {
	buf := make([]byte, i.KeySize())
	size, _ := i.KeyTo(buf)
	return buf[:size]
}

func (i *InternalRow) KeySize() int {
	return len(i.key) + 1
}

func (i *InternalRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'i'
	actual := copy(buf[1:], i.key)
	return 1 + actual, nil
}

func (i *InternalRow) Value() []byte {
	return i.val
}

func (i *InternalRow) ValueSize() int {
	return len(i.val)
}

func (i *InternalRow) ValueTo(buf []byte) (int, error) {
	actual := copy(buf, i.val)
	return actual, nil
}

func (i *InternalRow) String() string {
	return fmt.Sprintf("InternalStore - Key: %s (% x) Val: %s (% x)", i.key, i.key, i.val, i.val)
}

func NewInternalRow(key, val []byte) *InternalRow {
	return &InternalRow{
		key: key,
		val: val,
	}
}

func NewInternalRowKV(key, value []byte) (*InternalRow, error) {
	rv := InternalRow{}
	rv.key = key[1:]
	rv.val = value
	return &rv, nil
}
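
// An internal row stores arbitrary key/value pairs under the 'i' prefix; the
// user key follows the prefix byte directly and the value is stored verbatim.
// A small illustration (the key and value contents here are made up):
//
//	ir := NewInternalRow([]byte("mapping"), []byte("{}"))
//	// ir.Key()   -> 'i', 'm', 'a', 'p', 'p', 'i', 'n', 'g'
//	// ir.Value() -> []byte("{}")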

// FIELD definition

type FieldRow struct {
	index uint16
	name  string
}

func (f *FieldRow) Key() []byte {
	buf := make([]byte, f.KeySize())
	size, _ := f.KeyTo(buf)
	return buf[:size]
}

func (f *FieldRow) KeySize() int {
	return 3
}

func (f *FieldRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'f'
	binary.LittleEndian.PutUint16(buf[1:3], f.index)
	return 3, nil
}

func (f *FieldRow) Value() []byte {
	return append([]byte(f.name), ByteSeparator)
}

func (f *FieldRow) ValueSize() int {
	return len(f.name) + 1
}

func (f *FieldRow) ValueTo(buf []byte) (int, error) {
	size := copy(buf, f.name)
	buf[size] = ByteSeparator
	return size + 1, nil
}

func (f *FieldRow) String() string {
	return fmt.Sprintf("Field: %d Name: %s", f.index, f.name)
}

func NewFieldRow(index uint16, name string) *FieldRow {
	return &FieldRow{
		index: index,
		name:  name,
	}
}

func NewFieldRowKV(key, value []byte) (*FieldRow, error) {
	rv := FieldRow{}

	buf := bytes.NewBuffer(key)
	_, err := buf.ReadByte() // type
	if err != nil {
		return nil, err
	}
	err = binary.Read(buf, binary.LittleEndian, &rv.index)
	if err != nil {
		return nil, err
	}

	buf = bytes.NewBuffer(value)
	rv.name, err = buf.ReadString(ByteSeparator)
	if err != nil {
		return nil, err
	}
	rv.name = rv.name[:len(rv.name)-1] // trim off separator byte

	return &rv, nil
}
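
// A field row maps a field's numeric index to its name. The key is always
// three bytes ('f' plus the little-endian uint16 index) and the value is the
// name terminated by ByteSeparator. Sketch:
//
//	fr := NewFieldRow(1, "body")
//	// fr.Key()   -> []byte{'f', 0x01, 0x00}
//	// fr.Value() -> []byte{'b', 'o', 'd', 'y', 0xff}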

// DICTIONARY

const DictionaryRowMaxValueSize = binary.MaxVarintLen64

type DictionaryRow struct {
	term  []byte
	count uint64
	field uint16
}

func (dr *DictionaryRow) Key() []byte {
	buf := make([]byte, dr.KeySize())
	size, _ := dr.KeyTo(buf)
	return buf[:size]
}

func (dr *DictionaryRow) KeySize() int {
	return dictionaryRowKeySize(dr.term)
}

func dictionaryRowKeySize(term []byte) int {
	return len(term) + 3
}

func (dr *DictionaryRow) KeyTo(buf []byte) (int, error) {
	return dictionaryRowKeyTo(buf, dr.field, dr.term), nil
}

func dictionaryRowKeyTo(buf []byte, field uint16, term []byte) int {
	buf[0] = 'd'
	binary.LittleEndian.PutUint16(buf[1:3], field)
	size := copy(buf[3:], term)
	return size + 3
}

func (dr *DictionaryRow) Value() []byte {
	buf := make([]byte, dr.ValueSize())
	size, _ := dr.ValueTo(buf)
	return buf[:size]
}

func (dr *DictionaryRow) ValueSize() int {
	return DictionaryRowMaxValueSize
}

func (dr *DictionaryRow) ValueTo(buf []byte) (int, error) {
	used := binary.PutUvarint(buf, dr.count)
	return used, nil
}

func (dr *DictionaryRow) String() string {
	return fmt.Sprintf("Dictionary Term: `%s` Field: %d Count: %d ", string(dr.term), dr.field, dr.count)
}

func NewDictionaryRow(term []byte, field uint16, count uint64) *DictionaryRow {
	return &DictionaryRow{
		term:  term,
		field: field,
		count: count,
	}
}

func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) {
	rv, err := NewDictionaryRowK(key)
	if err != nil {
		return nil, err
	}

	err = rv.parseDictionaryV(value)
	if err != nil {
		return nil, err
	}
	return rv, nil
}

func NewDictionaryRowK(key []byte) (*DictionaryRow, error) {
	rv := &DictionaryRow{}
	err := rv.parseDictionaryK(key)
	if err != nil {
		return nil, err
	}
	return rv, nil
}

func (dr *DictionaryRow) parseDictionaryK(key []byte) error {
	dr.field = binary.LittleEndian.Uint16(key[1:3])
	if dr.term != nil {
		dr.term = dr.term[:0]
	}
	dr.term = append(dr.term, key[3:]...)
	return nil
}

func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
	count, err := dictionaryRowParseV(value)
	if err != nil {
		return err
	}
	dr.count = count
	return nil
}

func dictionaryRowParseV(value []byte) (uint64, error) {
	count, nread := binary.Uvarint(value)
	if nread <= 0 {
		return 0, fmt.Errorf("DictionaryRow parse Uvarint error, nread: %d", nread)
	}
	return count, nil
}
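
// A dictionary row records how many documents use a term in a field. The key
// is 'd', the little-endian field index, then the raw term bytes; the value
// is just the count as a uvarint. Sketch:
//
//	dr := NewDictionaryRow([]byte("beer"), 0, 3)
//	// dr.Key()   -> 'd', 0x00, 0x00, 'b', 'e', 'e', 'r'
//	// dr.Value() -> []byte{0x03}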

// TERM FIELD FREQUENCY

type TermVector struct {
	field          uint16
	arrayPositions []uint64
	pos            uint64
	start          uint64
	end            uint64
}

func (tv *TermVector) Size() int {
	return reflectStaticSizeTermVector + size.SizeOfPtr +
		len(tv.arrayPositions)*size.SizeOfUint64
}

func (tv *TermVector) String() string {
	return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
}

type TermFrequencyRow struct {
	term    []byte
	doc     []byte
	freq    uint64
	vectors []*TermVector
	norm    float32
	field   uint16
}

func (tfr *TermFrequencyRow) Size() int {
	sizeInBytes := reflectStaticSizeTermFrequencyRow +
		len(tfr.term) +
		len(tfr.doc)

	for _, entry := range tfr.vectors {
		sizeInBytes += entry.Size()
	}

	return sizeInBytes
}

func (tfr *TermFrequencyRow) Term() []byte {
	return tfr.term
}

func (tfr *TermFrequencyRow) Freq() uint64 {
	return tfr.freq
}

func (tfr *TermFrequencyRow) ScanPrefixForField() []byte {
	buf := make([]byte, 3)
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
	return buf
}

func (tfr *TermFrequencyRow) ScanPrefixForFieldTermPrefix() []byte {
	buf := make([]byte, 3+len(tfr.term))
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
	copy(buf[3:], tfr.term)
	return buf
}

func (tfr *TermFrequencyRow) ScanPrefixForFieldTerm() []byte {
	buf := make([]byte, 3+len(tfr.term)+1)
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
	termLen := copy(buf[3:], tfr.term)
	buf[3+termLen] = ByteSeparator
	return buf
}

func (tfr *TermFrequencyRow) Key() []byte {
	buf := make([]byte, tfr.KeySize())
	size, _ := tfr.KeyTo(buf)
	return buf[:size]
}

func (tfr *TermFrequencyRow) KeySize() int {
	return termFrequencyRowKeySize(tfr.term, tfr.doc)
}

func termFrequencyRowKeySize(term, doc []byte) int {
	return 3 + len(term) + 1 + len(doc)
}

func (tfr *TermFrequencyRow) KeyTo(buf []byte) (int, error) {
	return termFrequencyRowKeyTo(buf, tfr.field, tfr.term, tfr.doc), nil
}

func termFrequencyRowKeyTo(buf []byte, field uint16, term, doc []byte) int {
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], field)
	termLen := copy(buf[3:], term)
	buf[3+termLen] = ByteSeparator
	docLen := copy(buf[3+termLen+1:], doc)
	return 3 + termLen + 1 + docLen
}
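
// A term frequency key is laid out as 't', the little-endian field index, the
// term bytes, ByteSeparator, then the document id. Because the separator
// follows the term, a prefix scan over ScanPrefixForFieldTerm finds every
// posting for that exact term. Sketch:
//
//	tfr := NewTermFrequencyRow([]byte("beer"), 0, []byte("doc1"), 3, 1.0)
//	// tfr.Key() -> 't', 0x00, 0x00, 'b', 'e', 'e', 'r', 0xff, 'd', 'o', 'c', '1'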

func (tfr *TermFrequencyRow) KeyAppendTo(buf []byte) ([]byte, error) {
	keySize := tfr.KeySize()
	if cap(buf) < keySize {
		buf = make([]byte, keySize)
	}
	actualSize, err := tfr.KeyTo(buf[0:keySize])
	return buf[0:actualSize], err
}

func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
	dr := NewDictionaryRow(tfr.term, tfr.field, 0)
	return dr.Key()
}

func (tfr *TermFrequencyRow) DictionaryRowKeySize() int {
	dr := NewDictionaryRow(tfr.term, tfr.field, 0)
	return dr.KeySize()
}

func (tfr *TermFrequencyRow) DictionaryRowKeyTo(buf []byte) (int, error) {
	dr := NewDictionaryRow(tfr.term, tfr.field, 0)
	return dr.KeyTo(buf)
}

func (tfr *TermFrequencyRow) Value() []byte {
	buf := make([]byte, tfr.ValueSize())
	size, _ := tfr.ValueTo(buf)
	return buf[:size]
}

func (tfr *TermFrequencyRow) ValueSize() int {
	bufLen := binary.MaxVarintLen64 + binary.MaxVarintLen64
	for _, vector := range tfr.vectors {
		bufLen += (binary.MaxVarintLen64 * 4) + (1+len(vector.arrayPositions))*binary.MaxVarintLen64
	}
	return bufLen
}

func (tfr *TermFrequencyRow) ValueTo(buf []byte) (int, error) {
	used := binary.PutUvarint(buf[:binary.MaxVarintLen64], tfr.freq)

	normuint32 := math.Float32bits(tfr.norm)
	newbuf := buf[used : used+binary.MaxVarintLen64]
	used += binary.PutUvarint(newbuf, uint64(normuint32))

	for _, vector := range tfr.vectors {
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(vector.field))
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.pos)
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.start)
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.end)
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(len(vector.arrayPositions)))
		for _, arrayPosition := range vector.arrayPositions {
			used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], arrayPosition)
		}
	}
	return used, nil
}
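
// The value side is a sequence of uvarints: the frequency, the norm (stored
// as the raw IEEE 754 bits of the float32), and then, for each term vector,
// its field, pos, start, end, the number of array positions, and the array
// positions themselves. ValueSize deliberately over-allocates one
// MaxVarintLen64 per component; ValueTo reports how many bytes were actually
// used and Value trims the buffer to that length. A brief sketch:
//
//	tfr := NewTermFrequencyRow([]byte("beer"), 0, []byte("doc1"), 3, 1.0)
//	// tfr.Value() -> uvarint(3), uvarint(math.Float32bits(1.0)), no vectors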

func (tfr *TermFrequencyRow) String() string {
	return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
}

func InitTermFrequencyRow(tfr *TermFrequencyRow, term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
	tfr.term = term
	tfr.field = field
	tfr.doc = docID
	tfr.freq = freq
	tfr.norm = norm
	return tfr
}

func NewTermFrequencyRow(term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
	return &TermFrequencyRow{
		term:  term,
		field: field,
		doc:   docID,
		freq:  freq,
		norm:  norm,
	}
}

func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docID []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
	return &TermFrequencyRow{
		term:    term,
		field:   field,
		doc:     docID,
		freq:    freq,
		norm:    norm,
		vectors: vectors,
	}
}

func NewTermFrequencyRowK(key []byte) (*TermFrequencyRow, error) {
	rv := &TermFrequencyRow{}
	err := rv.parseK(key)
	if err != nil {
		return nil, err
	}
	return rv, nil
}

func (tfr *TermFrequencyRow) parseK(key []byte) error {
	keyLen := len(key)
	if keyLen < 3 {
		return fmt.Errorf("invalid term frequency key, no valid field")
	}
	tfr.field = binary.LittleEndian.Uint16(key[1:3])

	termEndPos := bytes.IndexByte(key[3:], ByteSeparator)
	if termEndPos < 0 {
		return fmt.Errorf("invalid term frequency key, no byte separator terminating term")
	}
	tfr.term = key[3 : 3+termEndPos]

	docLen := keyLen - (3 + termEndPos + 1)
	if docLen < 1 {
		return fmt.Errorf("invalid term frequency key, empty docid")
	}
	tfr.doc = key[3+termEndPos+1:]

	return nil
}

func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
	tfr.doc = key[3+len(term)+1:]
	if len(tfr.doc) <= 0 {
		return fmt.Errorf("invalid term frequency key, empty docid")
	}

	return nil
}

func (tfr *TermFrequencyRow) parseV(value []byte, includeTermVectors bool) error {
	var bytesRead int
	tfr.freq, bytesRead = binary.Uvarint(value)
	if bytesRead <= 0 {
		return fmt.Errorf("invalid term frequency value, invalid frequency")
	}
	currOffset := bytesRead

	var norm uint64
	norm, bytesRead = binary.Uvarint(value[currOffset:])
	if bytesRead <= 0 {
		return fmt.Errorf("invalid term frequency value, no norm")
	}
	currOffset += bytesRead

	tfr.norm = math.Float32frombits(uint32(norm))

	tfr.vectors = nil
	if !includeTermVectors {
		return nil
	}

	var field uint64
	field, bytesRead = binary.Uvarint(value[currOffset:])
	for bytesRead > 0 {
		currOffset += bytesRead
		tv := TermVector{}
		tv.field = uint16(field)
		// at this point we expect at least one term vector
		if tfr.vectors == nil {
			tfr.vectors = make([]*TermVector, 0)
		}

		tv.pos, bytesRead = binary.Uvarint(value[currOffset:])
		if bytesRead <= 0 {
			return fmt.Errorf("invalid term frequency value, vector contains no position")
		}
		currOffset += bytesRead

		tv.start, bytesRead = binary.Uvarint(value[currOffset:])
		if bytesRead <= 0 {
			return fmt.Errorf("invalid term frequency value, vector contains no start")
		}
		currOffset += bytesRead

		tv.end, bytesRead = binary.Uvarint(value[currOffset:])
		if bytesRead <= 0 {
			return fmt.Errorf("invalid term frequency value, vector contains no end")
		}
		currOffset += bytesRead

		var arrayPositionsLen uint64 = 0
		arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:])
		if bytesRead <= 0 {
			return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen")
		}
		currOffset += bytesRead

		if arrayPositionsLen > 0 {
			tv.arrayPositions = make([]uint64, arrayPositionsLen)
			for i := 0; uint64(i) < arrayPositionsLen; i++ {
				tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:])
				if bytesRead <= 0 {
					return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i)
				}
				currOffset += bytesRead
			}
		}

		tfr.vectors = append(tfr.vectors, &tv)
		// try to read next record (may not exist)
		field, bytesRead = binary.Uvarint(value[currOffset:])
	}
	if len(value[currOffset:]) > 0 && bytesRead <= 0 {
		return fmt.Errorf("invalid term frequency value, vector field invalid")
	}

	return nil
}

func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) {
	rv, err := NewTermFrequencyRowK(key)
	if err != nil {
		return nil, err
	}

	err = rv.parseV(value, true)
	if err != nil {
		return nil, err
	}
	return rv, nil
}
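
// Encoding and decoding are symmetric, so a row built with term vectors can
// be serialized and recovered from its raw key/value bytes. A minimal sketch:
//
//	tv := &TermVector{field: 0, pos: 1, start: 0, end: 4}
//	in := NewTermFrequencyRowWithTermVectors([]byte("beer"), 0, []byte("doc1"), 1, 1.0, []*TermVector{tv})
//	out, err := NewTermFrequencyRowKV(in.Key(), in.Value())
//	// err == nil, and out carries the same term, doc, freq, norm and vectors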

type BackIndexRow struct {
	doc           []byte
	termsEntries  []*BackIndexTermsEntry
	storedEntries []*BackIndexStoreEntry
}

func (br *BackIndexRow) AllTermKeys() [][]byte {
	if br == nil {
		return nil
	}
	rv := make([][]byte, 0, len(br.termsEntries)) // FIXME this underestimates severely
	for _, termsEntry := range br.termsEntries {
		for i := range termsEntry.Terms {
			termRow := NewTermFrequencyRow([]byte(termsEntry.Terms[i]), uint16(termsEntry.GetField()), br.doc, 0, 0)
			rv = append(rv, termRow.Key())
		}
	}
	return rv
}

func (br *BackIndexRow) AllStoredKeys() [][]byte {
	if br == nil {
		return nil
	}
	rv := make([][]byte, len(br.storedEntries))
	for i, storedEntry := range br.storedEntries {
		storedRow := NewStoredRow(br.doc, uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
		rv[i] = storedRow.Key()
	}
	return rv
}

func (br *BackIndexRow) Key() []byte {
	buf := make([]byte, br.KeySize())
	size, _ := br.KeyTo(buf)
	return buf[:size]
}

func (br *BackIndexRow) KeySize() int {
	return len(br.doc) + 1
}

func (br *BackIndexRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'b'
	used := copy(buf[1:], br.doc)
	return used + 1, nil
}

func (br *BackIndexRow) Value() []byte {
	buf := make([]byte, br.ValueSize())
	size, _ := br.ValueTo(buf)
	return buf[:size]
}

func (br *BackIndexRow) ValueSize() int {
	birv := &BackIndexRowValue{
		TermsEntries:  br.termsEntries,
		StoredEntries: br.storedEntries,
	}
	return birv.Size()
}

func (br *BackIndexRow) ValueTo(buf []byte) (int, error) {
	birv := &BackIndexRowValue{
		TermsEntries:  br.termsEntries,
		StoredEntries: br.storedEntries,
	}
	return birv.MarshalTo(buf)
}

func (br *BackIndexRow) String() string {
	return fmt.Sprintf("Backindex DocId: `%s` Terms Entries: %v, Stored Entries: %v", string(br.doc), br.termsEntries, br.storedEntries)
}

func NewBackIndexRow(docID []byte, entries []*BackIndexTermsEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
	return &BackIndexRow{
		doc:           docID,
		termsEntries:  entries,
		storedEntries: storedFields,
	}
}

func NewBackIndexRowKV(key, value []byte) (*BackIndexRow, error) {
	rv := BackIndexRow{}

	buf := bytes.NewBuffer(key)
	_, err := buf.ReadByte() // type
	if err != nil {
		return nil, err
	}

	rv.doc, err = buf.ReadBytes(ByteSeparator)
	if err == io.EOF && len(rv.doc) < 1 {
		err = fmt.Errorf("invalid doc length 0 - % x", key)
	}
	if err != nil && err != io.EOF {
		return nil, err
	} else if err == nil {
		rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte
	}

	var birv BackIndexRowValue
	err = proto.Unmarshal(value, &birv)
	if err != nil {
		return nil, err
	}
	rv.termsEntries = birv.TermsEntries
	rv.storedEntries = birv.StoredEntries

	return &rv, nil
}
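
// The back index row is the inverse mapping for a document: keyed by 'b' plus
// the doc id, its protobuf value lists every field/term pair and every stored
// entry recorded for that document, and AllTermKeys/AllStoredKeys rebuild the
// corresponding 't' and 's' keys from it. A hedged usage sketch:
//
//	br, err := NewBackIndexRowKV(key, value)
//	if err == nil {
//		for _, k := range br.AllTermKeys() {
//			// each k is the key of a TermFrequencyRow for this doc
//			_ = k
//		}
//	}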

// STORED

type StoredRow struct {
	doc            []byte
	field          uint16
	arrayPositions []uint64
	typ            byte
	value          []byte
}

func (s *StoredRow) Key() []byte {
	buf := make([]byte, s.KeySize())
	size, _ := s.KeyTo(buf)
	return buf[0:size]
}

func (s *StoredRow) KeySize() int {
	return 1 + len(s.doc) + 1 + 2 + (binary.MaxVarintLen64 * len(s.arrayPositions))
}

func (s *StoredRow) KeyTo(buf []byte) (int, error) {
	docLen := len(s.doc)
	buf[0] = 's'
	copy(buf[1:], s.doc)
	buf[1+docLen] = ByteSeparator
	binary.LittleEndian.PutUint16(buf[1+docLen+1:], s.field)
	bytesUsed := 1 + docLen + 1 + 2
	for _, arrayPosition := range s.arrayPositions {
		varbytes := binary.PutUvarint(buf[bytesUsed:], arrayPosition)
		bytesUsed += varbytes
	}
	return bytesUsed, nil
}

func (s *StoredRow) Value() []byte {
	buf := make([]byte, s.ValueSize())
	size, _ := s.ValueTo(buf)
	return buf[:size]
}

func (s *StoredRow) ValueSize() int {
	return len(s.value) + 1
}

func (s *StoredRow) ValueTo(buf []byte) (int, error) {
	buf[0] = s.typ
	used := copy(buf[1:], s.value)
	return used + 1, nil
}

func (s *StoredRow) String() string {
	return fmt.Sprintf("Document: %s Field %d, Array Positions: %v, Type: %s Value: %s", s.doc, s.field, s.arrayPositions, string(s.typ), s.value)
}

func (s *StoredRow) ScanPrefixForDoc() []byte {
	docLen := len(s.doc)
	buf := make([]byte, 1+docLen+1)
	buf[0] = 's'
	copy(buf[1:], s.doc)
	buf[1+docLen] = ByteSeparator
	return buf
}
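
// A stored row key is 's', the doc id, ByteSeparator, the little-endian field
// index, then one uvarint per array position; the value is a one-byte type
// tag followed by the stored bytes. Sketch:
//
//	sr := NewStoredRow([]byte("doc1"), 0, nil, 't', []byte("hello"))
//	// sr.Key()   -> 's', 'd', 'o', 'c', '1', 0xff, 0x00, 0x00
//	// sr.Value() -> 't', 'h', 'e', 'l', 'l', 'o'
//	// sr.ScanPrefixForDoc() covers every stored field for "doc1"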

func NewStoredRow(docID []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
	return &StoredRow{
		doc:            docID,
		field:          field,
		arrayPositions: arrayPositions,
		typ:            typ,
		value:          value,
	}
}

func NewStoredRowK(key []byte) (*StoredRow, error) {
	rv := StoredRow{}

	buf := bytes.NewBuffer(key)
	_, err := buf.ReadByte() // type
	if err != nil {
		return nil, err
	}

	rv.doc, err = buf.ReadBytes(ByteSeparator)
	if len(rv.doc) < 2 { // 1 for min doc id length, 1 for separator
		err = fmt.Errorf("invalid doc length 0")
		return nil, err
	}

	rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte

	err = binary.Read(buf, binary.LittleEndian, &rv.field)
	if err != nil {
		return nil, err
	}

	rv.arrayPositions = make([]uint64, 0)
	nextArrayPos, err := binary.ReadUvarint(buf)
	for err == nil {
		rv.arrayPositions = append(rv.arrayPositions, nextArrayPos)
		nextArrayPos, err = binary.ReadUvarint(buf)
	}
	return &rv, nil
}

func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
	rv, err := NewStoredRowK(key)
	if err != nil {
		return nil, err
	}
	rv.typ = value[0]
	rv.value = value[1:]
	return rv, nil
}

type backIndexFieldTermVisitor func(field uint32, term []byte)

// visitBackIndexRow is designed to process a protobuf encoded
// value, without creating unnecessary garbage. Instead values are passed
// to a callback, inspected first, and only copied if necessary.
// Due to the fact that this borrows from generated code, it must be manually
// updated if the protobuf definition changes.
//
// This code originates from:
// func (m *BackIndexRowValue) Unmarshal(data []byte) error
// the sections which create garbage or parse uninteresting sections
// have been commented out. This was done by design to allow for easier
// merging in the future if that original function is regenerated
func visitBackIndexRow(data []byte, callback backIndexFieldTermVisitor) error {
	l := len(data)
	iNdEx := 0
	for iNdEx < l {
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := data[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		switch fieldNum {
		case 1:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field TermsEntries", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				msglen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + msglen
			if msglen < 0 {
				return ErrInvalidLengthUpsidedown
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			// don't parse term entries
			// m.TermsEntries = append(m.TermsEntries, &BackIndexTermsEntry{})
			// if err := m.TermsEntries[len(m.TermsEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
			// 	return err
			// }
			// instead, inspect them
			if err := visitBackIndexRowFieldTerms(data[iNdEx:postIndex], callback); err != nil {
				return err
			}
			iNdEx = postIndex
		case 2:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field StoredEntries", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				msglen |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + msglen
			if msglen < 0 {
				return ErrInvalidLengthUpsidedown
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			// don't parse stored entries
			// m.StoredEntries = append(m.StoredEntries, &BackIndexStoreEntry{})
			// if err := m.StoredEntries[len(m.StoredEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
			// 	return err
			// }
			iNdEx = postIndex
		default:
			var sizeOfWire int
			for {
				sizeOfWire++
				wire >>= 7
				if wire == 0 {
					break
				}
			}
			iNdEx -= sizeOfWire
			skippy, err := skipUpsidedown(data[iNdEx:])
			if err != nil {
				return err
			}
			if skippy < 0 {
				return ErrInvalidLengthUpsidedown
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			// don't track unrecognized data
			// m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
			iNdEx += skippy
		}
	}

	return nil
}
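
// The visitor avoids allocating BackIndexTermsEntry values when only the
// field/term pairs are needed. A minimal sketch of how a caller might use it
// (the callback body here is illustrative only):
//
//	err := visitBackIndexRow(backIndexValue, func(field uint32, term []byte) {
//		// term aliases the underlying buffer; copy it if it must outlive the visit
//		fmt.Printf("field %d term %s\n", field, term)
//	})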

// visitBackIndexRowFieldTerms is designed to process a protobuf encoded
// sub-value within the BackIndexRowValue, without creating unnecessary garbage.
// Instead values are passed to a callback, inspected first, and only copied if
// necessary. Due to the fact that this borrows from generated code, it must
// be manually updated if the protobuf definition changes.
//
// This code originates from:
// func (m *BackIndexTermsEntry) Unmarshal(data []byte) error {
// the sections which create garbage or parse uninteresting sections
// have been commented out. This was done by design to allow for easier
// merging in the future if that original function is regenerated
func visitBackIndexRowFieldTerms(data []byte, callback backIndexFieldTermVisitor) error {
	var theField uint32

	var hasFields [1]uint64
	l := len(data)
	iNdEx := 0
	for iNdEx < l {
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := data[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		switch fieldNum {
		case 1:
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Field", wireType)
			}
			var v uint32
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				v |= (uint32(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			// m.Field = &v
			theField = v
			hasFields[0] |= uint64(0x00000001)
		case 2:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Terms", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := data[iNdEx]
				iNdEx++
				stringLen |= (uint64(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			postIndex := iNdEx + int(stringLen)
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			// m.Terms = append(m.Terms, string(data[iNdEx:postIndex]))
			callback(theField, data[iNdEx:postIndex])
			iNdEx = postIndex
		default:
			var sizeOfWire int
			for {
				sizeOfWire++
				wire >>= 7
				if wire == 0 {
					break
				}
			}
			iNdEx -= sizeOfWire
			skippy, err := skipUpsidedown(data[iNdEx:])
			if err != nil {
				return err
			}
			if skippy < 0 {
				return ErrInvalidLengthUpsidedown
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			// m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
			iNdEx += skippy
		}
	}
	// if hasFields[0]&uint64(0x00000001) == 0 {
	// 	return new(github_com_golang_protobuf_proto.RequiredNotSetError)
	// }

	return nil
}
|