Update server dependencies

This commit is contained in:
Ken-Håvard Lieng 2018-05-04 23:39:27 +02:00
parent fb8fec38ff
commit de36fe682a
883 changed files with 147940 additions and 68404 deletions

View file

@ -1,6 +1,6 @@
# ![bleve](docs/bleve.png) bleve
[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/blevesearch/bleve/badge.png?branch=master)](https://coveralls.io/r/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve)
[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/github/blevesearch/bleve/badge.svg?branch=master)](https://coveralls.io/github/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve)
[![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![codebeat](https://codebeat.co/badges/38a7cbc9-9cf5-41c0-a315-0746178230f4)](https://codebeat.co/projects/github-com-blevesearch-bleve)
[![Go Report Card](https://goreportcard.com/badge/blevesearch/bleve)](https://goreportcard.com/report/blevesearch/bleve)

View file

@ -14,6 +14,22 @@
package analysis
import (
"reflect"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeTokenLocation int
var reflectStaticSizeTokenFreq int
func init() {
var tl TokenLocation
reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size())
var tf TokenFreq
reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size())
}
// TokenLocation represents one occurrence of a term at a particular location in
// a field. Start, End and Position have the same meaning as in analysis.Token.
// Field and ArrayPositions identify the field value in the source document.
@ -26,6 +42,12 @@ type TokenLocation struct {
Position int
}
func (tl *TokenLocation) Size() int {
rv := reflectStaticSizeTokenLocation
rv += len(tl.ArrayPositions) * size.SizeOfUint64
return rv
}
// TokenFreq represents all the occurrences of a term in all fields of a
// document.
type TokenFreq struct {
@ -34,6 +56,15 @@ type TokenFreq struct {
frequency int
}
func (tf *TokenFreq) Size() int {
rv := reflectStaticSizeTokenFreq
rv += len(tf.Term)
for _, loc := range tf.Locations {
rv += loc.Size()
}
return rv
}
func (tf *TokenFreq) Frequency() int {
return tf.frequency
}
@ -42,6 +73,16 @@ func (tf *TokenFreq) Frequency() int {
// fields.
type TokenFrequencies map[string]*TokenFreq
func (tfs TokenFrequencies) Size() int {
rv := size.SizeOfMap
rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr)
for k, v := range tfs {
rv += len(k)
rv += v.Size()
}
return rv
}
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
// walk the new token frequencies
for tfk, tf := range other {

View file

@ -25,6 +25,9 @@ import (
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/search/highlight/highlighter/html"
// force import of scorch so its accessible by default
_ "github.com/blevesearch/bleve/index/scorch"
)
var bleveExpVar = expvar.NewMap("bleve")

View file

@ -14,13 +14,24 @@
package document
import "fmt"
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeDocument int
func init() {
var d Document
reflectStaticSizeDocument = int(reflect.TypeOf(d).Size())
}
type Document struct {
ID string `json:"id"`
Fields []Field `json:"fields"`
CompositeFields []*CompositeField
Number uint64 `json:"-"`
}
func NewDocument(id string) *Document {
@ -31,6 +42,21 @@ func NewDocument(id string) *Document {
}
}
func (d *Document) Size() int {
sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr +
len(d.ID)
for _, entry := range d.Fields {
sizeInBytes += entry.Size()
}
for _, entry := range d.CompositeFields {
sizeInBytes += entry.Size()
}
return sizeInBytes
}
func (d *Document) AddField(f Field) *Document {
switch f := f.(type) {
case *CompositeField:

View file

@ -36,4 +36,6 @@ type Field interface {
// that this field represents - this is a common metric for tracking
// the rate of indexing
NumPlainTextBytes() uint64
Size() int
}

View file

@ -16,11 +16,20 @@ package document
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/size"
)
const DefaultBooleanIndexingOptions = StoreField | IndexField
var reflectStaticSizeBooleanField int
func init() {
var f BooleanField
reflectStaticSizeBooleanField = int(reflect.TypeOf(f).Size())
}
const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues
type BooleanField struct {
name string
@ -30,6 +39,13 @@ type BooleanField struct {
numPlainTextBytes uint64
}
func (b *BooleanField) Size() int {
return reflectStaticSizeBooleanField + size.SizeOfPtr +
len(b.name) +
len(b.arrayPositions)*size.SizeOfUint64 +
len(b.value)
}
func (b *BooleanField) Name() string {
return b.name
}

View file

@ -15,9 +15,19 @@
package document
import (
"reflect"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeCompositeField int
func init() {
var cf CompositeField
reflectStaticSizeCompositeField = int(reflect.TypeOf(cf).Size())
}
const DefaultCompositeIndexingOptions = IndexField
type CompositeField struct {
@ -54,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl
return rv
}
func (c *CompositeField) Size() int {
sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr +
len(c.name)
for k, _ := range c.includedFields {
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
}
for k, _ := range c.excludedFields {
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
}
return sizeInBytes
}
func (c *CompositeField) Name() string {
return c.name
}

View file

@ -17,13 +17,22 @@ package document
import (
"fmt"
"math"
"reflect"
"time"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/size"
)
const DefaultDateTimeIndexingOptions = StoreField | IndexField
var reflectStaticSizeDateTimeField int
func init() {
var f DateTimeField
reflectStaticSizeDateTimeField = int(reflect.TypeOf(f).Size())
}
const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues
const DefaultDateTimePrecisionStep uint = 4
var MinTimeRepresentable = time.Unix(0, math.MinInt64)
@ -37,6 +46,12 @@ type DateTimeField struct {
numPlainTextBytes uint64
}
func (n *DateTimeField) Size() int {
return reflectStaticSizeDateTimeField + size.SizeOfPtr +
len(n.name) +
len(n.arrayPositions)*size.SizeOfUint64
}
func (n *DateTimeField) Name() string {
return n.name
}

View file

@ -16,12 +16,21 @@ package document
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/geo"
"github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeGeoPointField int
func init() {
var f GeoPointField
reflectStaticSizeGeoPointField = int(reflect.TypeOf(f).Size())
}
var GeoPrecisionStep uint = 9
type GeoPointField struct {
@ -32,6 +41,12 @@ type GeoPointField struct {
numPlainTextBytes uint64
}
func (n *GeoPointField) Size() int {
return reflectStaticSizeGeoPointField + size.SizeOfPtr +
len(n.name) +
len(n.arrayPositions)*size.SizeOfUint64
}
func (n *GeoPointField) Name() string {
return n.name
}

View file

@ -16,12 +16,21 @@ package document
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/size"
)
const DefaultNumericIndexingOptions = StoreField | IndexField
var reflectStaticSizeNumericField int
func init() {
var f NumericField
reflectStaticSizeNumericField = int(reflect.TypeOf(f).Size())
}
const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues
const DefaultPrecisionStep uint = 4
@ -33,6 +42,12 @@ type NumericField struct {
numPlainTextBytes uint64
}
func (n *NumericField) Size() int {
return reflectStaticSizeNumericField + size.SizeOfPtr +
len(n.name) +
len(n.arrayPositions)*size.SizeOfPtr
}
func (n *NumericField) Name() string {
return n.name
}

View file

@ -16,11 +16,20 @@ package document
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/size"
)
const DefaultTextIndexingOptions = IndexField
var reflectStaticSizeTextField int
func init() {
var f TextField
reflectStaticSizeTextField = int(reflect.TypeOf(f).Size())
}
const DefaultTextIndexingOptions = IndexField | DocValues
type TextField struct {
name string
@ -31,6 +40,13 @@ type TextField struct {
numPlainTextBytes uint64
}
func (t *TextField) Size() int {
return reflectStaticSizeTextField + size.SizeOfPtr +
len(t.name) +
len(t.arrayPositions)*size.SizeOfUint64 +
len(t.value)
}
func (t *TextField) Name() string {
return t.name
}

View file

@ -20,6 +20,7 @@ const (
IndexField IndexingOptions = 1 << iota
StoreField
IncludeTermVectors
DocValues
)
func (o IndexingOptions) IsIndexed() bool {
@ -34,6 +35,10 @@ func (o IndexingOptions) IncludeTermVectors() bool {
return o&IncludeTermVectors != 0
}
func (o IndexingOptions) IncludeDocValues() bool {
return o&DocValues != 0
}
func (o IndexingOptions) String() string {
rv := ""
if o.IsIndexed() {
@ -51,5 +56,11 @@ func (o IndexingOptions) String() string {
}
rv += "TV"
}
if o.IncludeDocValues() {
if rv != "" {
rv += ", "
}
rv += "DV"
}
return rv
}

View file

@ -15,6 +15,7 @@
package geo
import (
"fmt"
"math"
"github.com/blevesearch/bleve/numeric"
@ -26,6 +27,12 @@ var GeoBits uint = 32
var minLon = -180.0
var minLat = -90.0
var maxLon = 180.0
var maxLat = 90.0
var minLonRad = minLon * degreesToRadian
var minLatRad = minLat * degreesToRadian
var maxLonRad = maxLon * degreesToRadian
var maxLatRad = maxLat * degreesToRadian
var geoTolerance = 1E-6
var lonScale = float64((uint64(0x1)<<GeoBits)-1) / 360.0
var latScale = float64((uint64(0x1)<<GeoBits)-1) / 180.0
@ -91,26 +98,8 @@ func BoundingBoxContains(lon, lat, minLon, minLat, maxLon, maxLat float64) bool
compareGeo(lat, minLat) >= 0 && compareGeo(lat, maxLat) <= 0
}
// ComputeBoundingBox will compute a bounding box around the provided point
// which surrounds a circle of the provided radius (in meters).
func ComputeBoundingBox(centerLon, centerLat,
radius float64) (upperLeftLon float64, upperLeftLat float64,
lowerRightLon float64, lowerRightLat float64) {
_, tlat := pointFromLonLatBearing(centerLon, centerLat, 0, radius)
rlon, _ := pointFromLonLatBearing(centerLon, centerLat, 90, radius)
_, blat := pointFromLonLatBearing(centerLon, centerLat, 180, radius)
llon, _ := pointFromLonLatBearing(centerLon, centerLat, 270, radius)
return normalizeLon(llon), normalizeLat(tlat),
normalizeLon(rlon), normalizeLat(blat)
}
const degreesToRadian = math.Pi / 180
const radiansToDegrees = 180 / math.Pi
const flattening = 1.0 / 298.257223563
const semiMajorAxis = 6378137
const semiMinorAxis = semiMajorAxis * (1.0 - flattening)
const semiMajorAxis2 = semiMajorAxis * semiMajorAxis
const semiMinorAxis2 = semiMinorAxis * semiMinorAxis
// DegreesToRadians converts an angle in degrees to radians
func DegreesToRadians(d float64) float64 {
@ -122,84 +111,60 @@ func RadiansToDegrees(r float64) float64 {
return r * radiansToDegrees
}
// pointFromLonLatBearing starts that the provide lon,lat
// then moves in the bearing direction (in degrees)
// this move continues for the provided distance (in meters)
// The lon, lat of this destination location is returned.
func pointFromLonLatBearing(lon, lat, bearing,
dist float64) (float64, float64) {
var earthMeanRadiusMeters = 6371008.7714
alpha1 := DegreesToRadians(bearing)
cosA1 := math.Cos(alpha1)
sinA1 := math.Sin(alpha1)
tanU1 := (1 - flattening) * math.Tan(DegreesToRadians(lat))
cosU1 := 1 / math.Sqrt(1+tanU1*tanU1)
sinU1 := tanU1 * cosU1
sig1 := math.Atan2(tanU1, cosA1)
sinAlpha := cosU1 * sinA1
cosSqAlpha := 1 - sinAlpha*sinAlpha
uSq := cosSqAlpha * (semiMajorAxis2 - semiMinorAxis2) / semiMinorAxis2
A := 1 + uSq/16384*(4096+uSq*(-768+uSq*(320-175*uSq)))
B := uSq / 1024 * (256 + uSq*(-128+uSq*(74-47*uSq)))
func RectFromPointDistance(lon, lat, dist float64) (float64, float64, float64, float64, error) {
err := checkLongitude(lon)
if err != nil {
return 0, 0, 0, 0, err
}
err = checkLatitude(lat)
if err != nil {
return 0, 0, 0, 0, err
}
radLon := DegreesToRadians(lon)
radLat := DegreesToRadians(lat)
radDistance := (dist + 7e-2) / earthMeanRadiusMeters
sigma := dist / (semiMinorAxis * A)
minLatL := radLat - radDistance
maxLatL := radLat + radDistance
cos25SigmaM := math.Cos(2*sig1 + sigma)
sinSigma := math.Sin(sigma)
cosSigma := math.Cos(sigma)
deltaSigma := B * sinSigma * (cos25SigmaM + (B/4)*
(cosSigma*(-1+2*cos25SigmaM*cos25SigmaM)-(B/6)*cos25SigmaM*
(-1+4*sinSigma*sinSigma)*(-3+4*cos25SigmaM*cos25SigmaM)))
sigmaP := sigma
sigma = dist/(semiMinorAxis*A) + deltaSigma
for math.Abs(sigma-sigmaP) > 1E-12 {
cos25SigmaM = math.Cos(2*sig1 + sigma)
sinSigma = math.Sin(sigma)
cosSigma = math.Cos(sigma)
deltaSigma = B * sinSigma * (cos25SigmaM + (B/4)*
(cosSigma*(-1+2*cos25SigmaM*cos25SigmaM)-(B/6)*cos25SigmaM*
(-1+4*sinSigma*sinSigma)*(-3+4*cos25SigmaM*cos25SigmaM)))
sigmaP = sigma
sigma = dist/(semiMinorAxis*A) + deltaSigma
var minLonL, maxLonL float64
if minLatL > minLatRad && maxLatL < maxLatRad {
deltaLon := asin(sin(radDistance) / cos(radLat))
minLonL = radLon - deltaLon
if minLonL < minLonRad {
minLonL += 2 * math.Pi
}
maxLonL = radLon + deltaLon
if maxLonL > maxLonRad {
maxLonL -= 2 * math.Pi
}
} else {
// pole is inside distance
minLatL = math.Max(minLatL, minLatRad)
maxLatL = math.Min(maxLatL, maxLatRad)
minLonL = minLonRad
maxLonL = maxLonRad
}
tmp := sinU1*sinSigma - cosU1*cosSigma*cosA1
lat2 := math.Atan2(sinU1*cosSigma+cosU1*sinSigma*cosA1,
(1-flattening)*math.Sqrt(sinAlpha*sinAlpha+tmp*tmp))
lamda := math.Atan2(sinSigma*sinA1, cosU1*cosSigma-sinU1*sinSigma*cosA1)
c := flattening / 16 * cosSqAlpha * (4 + flattening*(4-3*cosSqAlpha))
lam := lamda - (1-c)*flattening*sinAlpha*
(sigma+c*sinSigma*(cos25SigmaM+c*cosSigma*(-1+2*cos25SigmaM*cos25SigmaM)))
rvlon := lon + RadiansToDegrees(lam)
rvlat := RadiansToDegrees(lat2)
return rvlon, rvlat
return RadiansToDegrees(minLonL),
RadiansToDegrees(maxLatL),
RadiansToDegrees(maxLonL),
RadiansToDegrees(minLatL),
nil
}
// normalizeLon normalizes a longitude value within the -180 to 180 range
func normalizeLon(lonDeg float64) float64 {
if lonDeg >= -180 && lonDeg <= 180 {
return lonDeg
func checkLatitude(latitude float64) error {
if math.IsNaN(latitude) || latitude < minLat || latitude > maxLat {
return fmt.Errorf("invalid latitude %f; must be between %f and %f", latitude, minLat, maxLat)
}
off := math.Mod(lonDeg+180, 360)
if off < 0 {
return 180 + off
} else if off == 0 && lonDeg > 0 {
return 180
}
return -180 + off
return nil
}
// normalizeLat normalizes a latitude value within the -90 to 90 range
func normalizeLat(latDeg float64) float64 {
if latDeg >= -90 && latDeg <= 90 {
return latDeg
func checkLongitude(longitude float64) error {
if math.IsNaN(longitude) || longitude < minLon || longitude > maxLon {
return fmt.Errorf("invalid longitude %f; must be between %f and %f", longitude, minLon, maxLon)
}
off := math.Abs(math.Mod(latDeg+90, 360))
if off <= 180 {
return off - 90
}
return (360 - off) - 90
return nil
}

View file

@ -146,6 +146,12 @@ func earthDiameter(lat float64) float64 {
return earthDiameterPerLatitude[int(index)]
}
var pio2 = math.Pi / 2
func sin(a float64) float64 {
return cos(a - pio2)
}
// cos is a sloppy math (faster) implementation of math.Cos
func cos(a float64) float64 {
if a < 0.0 {

View file

@ -15,11 +15,13 @@
package bleve
import (
"context"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/mapping"
"golang.org/x/net/context"
"github.com/blevesearch/bleve/size"
)
// A Batch groups together multiple Index and Delete
@ -31,6 +33,9 @@ import (
type Batch struct {
index Index
internal *index.Batch
lastDocSize uint64
totalSize uint64
}
// Index adds the specified index operation to the
@ -46,6 +51,30 @@ func (b *Batch) Index(id string, data interface{}) error {
return err
}
b.internal.Update(doc)
b.lastDocSize = uint64(doc.Size() +
len(id) + size.SizeOfString) // overhead from internal
b.totalSize += b.lastDocSize
return nil
}
func (b *Batch) LastDocSize() uint64 {
return b.lastDocSize
}
func (b *Batch) TotalDocsSize() uint64 {
return b.totalSize
}
// IndexAdvanced adds the specified index operation to the
// batch which skips the mapping. NOTE: the bleve Index is not updated
// until the batch is executed.
func (b *Batch) IndexAdvanced(doc *document.Document) (err error) {
if doc.ID == "" {
return ErrorEmptyID
}
b.internal.Update(doc)
return nil
}
@ -65,7 +94,7 @@ func (b *Batch) SetInternal(key, val []byte) {
b.internal.SetInternal(key, val)
}
// SetInternal adds the specified delete internal
// DeleteInternal adds the specified delete internal
// operation to the batch. NOTE: the bleve Index is
// not updated until the batch is executed.
func (b *Batch) DeleteInternal(key []byte) {
@ -99,12 +128,15 @@ func (b *Batch) Reset() {
// them.
//
// The DocumentMapping used to index a value is deduced by the following rules:
// 1) If value implements Classifier interface, resolve the mapping from Type().
// 2) If value has a string field or value at IndexMapping.TypeField.
// 1) If value implements mapping.bleveClassifier interface, resolve the mapping
// from BleveType().
// 2) If value implements mapping.Classifier interface, resolve the mapping
// from Type().
// 3) If value has a string field or value at IndexMapping.TypeField.
// (defaulting to "_type"), use it to resolve the mapping. Fields addressing
// is described below.
// 3) If IndexMapping.DefaultType is registered, return it.
// 4) Return IndexMapping.DefaultMapping.
// 4) If IndexMapping.DefaultType is registered, return it.
// 5) Return IndexMapping.DefaultMapping.
//
// Each field or nested field of the value is identified by a string path, then
// mapped to one or several FieldMappings which extract the result for analysis.

View file

@ -14,7 +14,20 @@
package index
import "github.com/blevesearch/bleve/document"
import (
"reflect"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeAnalysisResult int
func init() {
var ar AnalysisResult
reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size())
}
type IndexRow interface {
KeySize() int
@ -29,6 +42,20 @@ type IndexRow interface {
type AnalysisResult struct {
DocID string
Rows []IndexRow
// scorch
Document *document.Document
Analyzed []analysis.TokenFrequencies
Length []int
}
func (a *AnalysisResult) Size() int {
rv := reflectStaticSizeAnalysisResult
for _, analyzedI := range a.Analyzed {
rv += analyzedI.Size()
}
rv += len(a.Length) * size.SizeOfInt
return rv
}
type AnalysisWork struct {

View file

@ -18,11 +18,23 @@ import (
"bytes"
"encoding/json"
"fmt"
"reflect"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeTermFieldDoc int
var reflectStaticSizeTermFieldVector int
func init() {
var tfd TermFieldDoc
reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
var tfv TermFieldVector
reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
}
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
type Index interface {
@ -68,6 +80,8 @@ type IndexReader interface {
Document(id string) (*document.Document, error)
DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error
DocValueReader(fields []string) (DocValueReader, error)
Fields() ([]string, error)
GetInternal(key []byte) ([]byte, error)
@ -84,6 +98,18 @@ type IndexReader interface {
Close() error
}
type IndexReaderRegexp interface {
FieldDictRegexp(field string, regex []byte) (FieldDict, error)
}
type IndexReaderFuzzy interface {
FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error)
}
type IndexReaderOnly interface {
FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
}
// FieldTerms contains the terms used by a document, keyed by field
type FieldTerms map[string][]string
@ -115,6 +141,11 @@ type TermFieldVector struct {
End uint64
}
func (tfv *TermFieldVector) Size() int {
return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
}
// IndexInternalID is an opaque document identifier interal to the index impl
type IndexInternalID []byte
@ -134,14 +165,27 @@ type TermFieldDoc struct {
Vectors []*TermFieldVector
}
func (tfd *TermFieldDoc) Size() int {
sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
len(tfd.Term) + len(tfd.ID)
for _, entry := range tfd.Vectors {
sizeInBytes += entry.Size()
}
return sizeInBytes
}
// Reset allows an already allocated TermFieldDoc to be reused
func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
// remember the []byte used for the ID
id := tfd.ID
vectors := tfd.Vectors
// idiom to copy over from empty TermFieldDoc (0 allocations)
*tfd = TermFieldDoc{}
// reuse the []byte already allocated (and reset len to 0)
tfd.ID = id[:0]
tfd.Vectors = vectors[:0]
return tfd
}
@ -161,6 +205,8 @@ type TermFieldReader interface {
// Count returns the number of documents contains the term in this field.
Count() uint64
Close() error
Size() int
}
type DictEntry struct {
@ -185,6 +231,9 @@ type DocIDReader interface {
// will start there instead. If ID is greater than or equal to the end of
// the range, Next() call will return io.EOF.
Advance(ID IndexInternalID) (IndexInternalID, error)
Size() int
Close() error
}
@ -239,3 +288,23 @@ func (b *Batch) Reset() {
b.IndexOps = make(map[string]*document.Document)
b.InternalOps = make(map[string][]byte)
}
// Optimizable represents an optional interface that implementable by
// optimizable resources (e.g., TermFieldReaders, Searchers). These
// optimizable resources are provided the same OptimizableContext
// instance, so that they can coordinate via dynamic interface
// casting.
type Optimizable interface {
Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
}
type OptimizableContext interface {
// Once all the optimzable resources have been provided the same
// OptimizableContext instance, the optimization preparations are
// finished or completed via the Finish() method.
Finish() error
}
type DocValueReader interface {
VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
}

View file

@ -0,0 +1,367 @@
# scorch
## Definitions
Batch
- A collection of Documents to mutate in the index.
Document
- Has a unique identifier (arbitrary bytes).
- Is comprised of a list of fields.
Field
- Has a name (string).
- Has a type (text, number, date, geopoint).
- Has a value (depending on type).
- Can be indexed, stored, or both.
- If indexed, can be analyzed.
-m If indexed, can optionally store term vectors.
## Scope
Scorch *MUST* implement the bleve.index API without requiring any changes to this API.
Scorch *MAY* introduce new interfaces, which can be discovered to allow use of new capabilities not in the current API.
## Implementation
The scorch implementation starts with the concept of a segmented index.
A segment is simply a slice, subset, or portion of the entire index. A segmented index is one which is composed of one or more segments. Although segments are created in a particular order, knowing this ordering is not required to achieve correct semantics when querying. Because there is no ordering, this means that when searching an index, you can (and should) search all the segments concurrently.
### Internal Wrapper
In order to accommodate the existing APIs while also improving the implementation, the scorch implementation includes some wrapper functionality that must be described.
#### \_id field
In scorch, field 0 is prearranged to be named \_id. All documents have a value for this field, which is the documents external identifier. In this version the field *MUST* be both indexed AND stored. The scorch wrapper adds this field, as it will not be present in the Document from the calling bleve code.
NOTE: If a document already contains a field \_id, it will be replaced. If this is problematic, the caller must ensure such a scenario does not happen.
### Proposed Structures
```
type Segment interface {
Dictionary(field string) TermDictionary
}
type TermDictionary interface {
PostingsList(term string, excluding PostingsList) PostingsList
}
type PostingsList interface {
Next() Posting
And(other PostingsList) PostingsList
Or(other PostingsList) PostingsList
}
type Posting interface {
Number() uint64
Frequency() uint64
Norm() float64
Locations() Locations
}
type Locations interface {
Start() uint64
End() uint64
Pos() uint64
ArrayPositions() ...
}
type DeletedDocs {
}
type SegmentSnapshot struct {
segment Segment
deleted PostingsList
}
type IndexSnapshot struct {
segment []SegmentSnapshot
}
```
**What about errors?**
**What about memory mgmnt or context?**
**Postings List separate iterator to separate stateful from stateless**
### Mutating the Index
The bleve.index API has methods for directly making individual mutations (Update/Delete/SetInternal/DeleteInternal), however for this first implementation, we assume that all of these calls can simply be turned into a Batch of size 1. This may be highly inefficient, but it will be correct. This decision is made based on the fact that Couchbase FTS always uses Batches.
NOTE: As a side-effect of this decision, it should be clear that performance tuning may depend on the batch size, which may in-turn require changes in FTS.
From this point forward, only Batch mutations will be discussed.
Sequence of Operations:
1. For each document in the batch, search through all existing segments. The goal is to build up a per-segment bitset which tells us which documents in that segment are obsoleted by the addition of the new segment we're currently building. NOTE: we're not ready for this change to take effect yet, so rather than this operation mutating anything, they simply return bitsets, which we can apply later. Logically, this is something like:
```
foreach segment {
dict := segment.Dictionary("\_id")
postings := empty postings list
foreach docID {
postings = postings.Or(dict.PostingsList(docID, nil))
}
}
```
NOTE: it is illustrated above as nested for loops, but some or all of these could be concurrently. The end result is that for each segment, we have (possibly empty) bitset.
2. Also concurrent with 1, the documents in the batch are analyzed. This analysis proceeds using the existing analyzer pool.
3. (after 2 completes) Analyzed documents are fed into a function which builds a new Segment representing this information.
4. We now have everything we need to update the state of the system to include this new snapshot.
- Acquire a lock
- Create a new IndexSnapshot
- For each SegmentSnapshot in the IndexSnapshot, take the deleted PostingsList and OR it with the new postings list for this Segment. Construct a new SegmentSnapshot for the segment using this new deleted PostingsList. Append this SegmentSnapshot to the IndexSnapshot.
- Create a new SegmentSnapshot wrapping our new segment with nil deleted docs.
- Append the new SegmentSnapshot to the IndexSnapshot
- Release the lock
An ASCII art example:
```
0 - Empty Index
No segments
IndexSnapshot
segments []
deleted []
1 - Index Batch [ A B C ]
segment 0
numbers [ 1 2 3 ]
\_id [ A B C ]
IndexSnapshot
segments [ 0 ]
deleted [ nil ]
2 - Index Batch [ B' ]
segment 0 1
numbers [ 1 2 3 ] [ 1 ]
\_id [ A B C ] [ B ]
Compute bitset segment-0-deleted-by-1:
[ 0 1 0 ]
OR it with previous (nil) (call it 0-1)
[ 0 1 0 ]
IndexSnapshot
segments [ 0 1 ]
deleted [ 0-1 nil ]
3 - Index Batch [ C' ]
segment 0 1 2
numbers [ 1 2 3 ] [ 1 ] [ 1 ]
\_id [ A B C ] [ B ] [ C ]
Compute bitset segment-0-deleted-by-2:
[ 0 0 1 ]
OR it with previous ([ 0 1 0 ]) (call it 0-12)
[ 0 1 1 ]
Compute bitset segment-1-deleted-by-2:
[ 0 ]
OR it with previous (nil)
still just nil
IndexSnapshot
segments [ 0 1 2 ]
deleted [ 0-12 nil nil ]
```
**is there opportunity to stop early when doc is found in one segment**
**also, more efficient way to find bits for long lists of ids?**
### Searching
In the bleve.index API all searching starts by getting an IndexReader, which represents a snapshot of the index at a point in time.
As described in the section above, our index implementation maintains a pointer to the current IndexSnapshot. When a caller gets an IndexReader, they get a copy of this pointer, and can use it as long as they like. The IndexSnapshot contains SegmentSnapshots, which only contain pointers to immutable segments. The deleted posting lists associated with a segment change over time, but the particular deleted posting list in YOUR snapshot is immutable. This gives a stable view of the data.
#### Term Search
Term search is the only searching primitive exposed in today's bleve.index API. This ultimately could limit our ability to take advantage of the indexing improvements, but it also means it will be easier to get a first version of this working.
A term search for term T in field F will look something like this:
```
searchResultPostings = empty
foreach segment {
dict := segment.Dictionary(F)
segmentResultPostings = dict.PostingsList(T, segmentSnapshotDeleted)
// make segmentLocal numbers into global numbers, and flip bits in searchResultPostings
}
```
The searchResultPostings will be a new implementation of the TermFieldReader inteface.
As a reminder this interface is:
```
// TermFieldReader is the interface exposing the enumeration of documents
// containing a given term in a given field. Documents are returned in byte
// lexicographic order over their identifiers.
type TermFieldReader interface {
// Next returns the next document containing the term in this field, or nil
// when it reaches the end of the enumeration. The preAlloced TermFieldDoc
// is optional, and when non-nil, will be used instead of allocating memory.
Next(preAlloced *TermFieldDoc) (*TermFieldDoc, error)
// Advance resets the enumeration at specified document or its immediate
// follower.
Advance(ID IndexInternalID, preAlloced *TermFieldDoc) (*TermFieldDoc, error)
// Count returns the number of documents contains the term in this field.
Count() uint64
Close() error
}
```
At first glance this appears problematic, we have no way to return documents in order of their identifiers. But it turns out the wording of this perhaps too strong, or a bit ambiguous. Originally, this referred to the external identifiers, but with the introduction of a distinction between internal/external identifiers, returning them in order of their internal identifiers is also acceptable. **ASIDE**: the reason for this is that most callers just use Next() and literally don't care what the order is, they could be in any order and it would be fine. There is only one search that cares and that is the ConjunctionSearcher, which relies on Next/Advance having very specific semantics. Later in this document we will have a proposal to split into multiple interfaces:
- The weakest interface, only supports Next() no ordering at all.
- Ordered, supporting Advance()
- And/Or'able capable of internally efficiently doing these ops with like interfaces (if not capable then can always fall back to external walking)
But, the good news is that we don't even have to do that for our first implementation. As long as the global numbers we use for internal identifiers are consistent within this IndexSnapshot, then Next() will be ordered by ascending document number, and Advance() will still work correctly.
NOTE: there is another place where we rely on the ordering of these hits, and that is in the "\_id" sort order. Previously this was the natural order, and a NOOP for the collector, now it must be implemented by actually sorting on the "\_id" field. We probably should introduce at least a marker interface to detect this.
An ASCII art example:
```
Let's start with the IndexSnapshot we ended with earlier:
3 - Index Batch [ C' ]
segment 0 1 2
numbers [ 1 2 3 ] [ 1 ] [ 1 ]
\_id [ A B C ] [ B ] [ C ]
Compute bitset segment-0-deleted-by-2:
[ 0 0 1 ]
OR it with previous ([ 0 1 0 ]) (call it 0-12)
[ 0 1 1 ]
Compute bitset segment-1-deleted-by-2:
[ 0 0 0 ]
OR it with previous (nil)
still just nil
IndexSnapshot
segments [ 0 1 2 ]
deleted [ 0-12 nil nil ]
Now let's search for the term 'cat' in the field 'desc' and let's assume that Document C (both versions) would match it.
Concurrently:
- Segment 0
- Get Term Dictionary For Field 'desc'
- From it get Postings List for term 'cat' EXCLUDING 0-12
- raw segment matches [ 0 0 1 ] but excluding [ 0 1 1 ] gives [ 0 0 0 ]
- Segment 1
- Get Term Dictionary For Field 'desc'
- From it get Postings List for term 'cat' excluding nil
- [ 0 ]
- Segment 2
- Get Term Dictionary For Field 'desc'
- From it get Postings List for term 'cat' excluding nil
- [ 1 ]
Map local bitsets into global number space (global meaning cross-segment but still unique to this snapshot)
IndexSnapshot already should have mapping something like:
0 - Offset 0
1 - Offset 3 (because segment 0 had 3 docs)
2 - Offset 4 (becuase segment 1 had 1 doc)
This maps to search result bitset:
[ 0 0 0 0 1]
Caller would call Next() and get doc number 5 (assuming 1 based indexing for now)
Caller could then ask to get term locations, stored fields, external doc ID for document number 5. Internally in the IndexSnapshot, we can now convert that back, and realize doc number 5 comes from segment 2, 5-4=1 so we're looking for doc number 1 in segment 2. That happens to be C...
```
#### Future improvements
In the future, interfaces to detect these non-serially operating TermFieldReaders could expose their own And() and Or() up to the higher level Conjunction/Disjunction searchers. Doing this alone offers some win, but also means there would be greater burden on the Searcher code rewriting logical expressions for maximum performance.
Another related topic is that of peak memory usage. With serially operating TermFieldReaders it was necessary to start them all at the same time and operate in unison. However, with these non-serially operating TermFieldReaders we have the option of doing a few at a time, consolidating them, dispoting the intermediaries, and then doing a few more. For very complex queries with many clauses this could reduce peak memory usage.
### Memory Tracking
All segments must be able to produce two statistics, an estimate of their explicit memory usage, and their actual size on disk (if any). For in-memory segments, disk usage could be zero, and the memory usage represents the entire information content. For mmap-based disk segments, the memory could be as low as the size of tracking structure itself (say just a few pointers).
This would allow the implementation to throttle or block incoming mutations when a threshold memory usage has (or would be) exceeded.
### Persistence
Obviously, we want to support (but maybe not require) asynchronous persistence of segments. My expectation is that segments are initially built in memory. At some point they are persisted to disk. This poses some interesting challenges.
At runtime, the state of an index (it's IndexSnapshot) is not only the contents of the segments, but also the bitmasks of deleted documents. These bitmasks indirectly encode an ordering in which the segments were added. The reason is that the bitmasks encode which items have been obsoleted by other (subsequent or more future) segments. In the runtime implementation we compute bitmask deltas and then merge them at the same time we bring the new segment in. One idea is that we could take a similar approach on disk. When we persist a segment, we persist the bitmask deltas of segments known to exist at that time, and eventually these can get merged up into a base segment deleted bitmask.
This also relates to the topic rollback, addressed next...
### Rollback
One desirable property in the Couchbase ecosystem is the ability to rollback to some previous (though typically not long ago) state. One idea for keeping this property in this design is to protect some of the most recent segments from merging. Then, if necessary, they could be "undone" to reveal previous states of the system. In these scenarios "undone" has to properly undo the deleted bitmasks on the other segments. Again, the current thinking is that rather than "undo" anything, it could be work that was deferred in the first place, thus making it easier to logically undo.
Another possibly related approach would be to tie this into our existing snapshot mechanism. Perhaps simulating a slow reader (holding onto index snapshots) for some period of time, can be the mechanism to achieve the desired end goal.
### Internal Storage
The bleve.index API has support for "internal storage". The ability to store information under a separate name space.
This is not used for high volume storage, so it is tempting to think we could just put a small k/v store alongside the rest of the index. But, the reality is that this storage is used to maintain key information related to the rollback scenario. Because of this, its crucial that ordering and overwriting of key/value pairs correspond with actual segment persistence in the index. Based on this, I believe its important to put the internal key/value pairs inside the segments themselves. But, this also means that they must follow a similar "deleted" bitmask approach to obsolete values in older segments. But, this also seems to substantially increase the complexity of the solution because of the separate name space, it would appear to require its own bitmask. Further keys aren't numeric, which then implies yet another mapping from internal key to number, etc.
More thought is required here.
### Merging
The segmented index approach requires merging to prevent the number of segments from growing too large.
Recent experience with LSMs has taught us that having the correct merge strategy can make a huge difference in the overall performance of the system. In particular, a simple merge strategy which merges segments too aggressively can lead to high write amplification and unnecessarily rendering cached data useless.
A few simple principles have been identified.
- Roughly we merge multiple smaller segments into a single larger one.
- The larger a segment gets the less likely we should be to ever merge it.
- Segments with large numbers of deleted/obsoleted items are good candidates as the merge will result in a space savings.
- Segments with all items deleted/obsoleted can be dropped.
Merging of a segment should be able to proceed even if that segment is held by an ongoing snapshot, it should only delay the removal of it.

View file

@ -0,0 +1,56 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import "time"
// RegistryAsyncErrorCallbacks should be treated as read-only after
// process init()'ialization.
var RegistryAsyncErrorCallbacks = map[string]func(error){}
// RegistryEventCallbacks should be treated as read-only after
// process init()'ialization.
var RegistryEventCallbacks = map[string]func(Event){}
// Event represents the information provided in an OnEvent() callback.
type Event struct {
Kind EventKind
Scorch *Scorch
Duration time.Duration
}
// EventKind represents an event code for OnEvent() callbacks.
type EventKind int
// EventKindCloseStart is fired when a Scorch.Close() has begun.
var EventKindCloseStart = EventKind(1)
// EventKindClose is fired when a scorch index has been fully closed.
var EventKindClose = EventKind(2)
// EventKindMergerProgress is fired when the merger has completed a
// round of merge processing.
var EventKindMergerProgress = EventKind(3)
// EventKindPersisterProgress is fired when the persister has completed
// a round of persistence processing.
var EventKindPersisterProgress = EventKind(4)
// EventKindBatchIntroductionStart is fired when Batch() is invoked which
// introduces a new segment.
var EventKindBatchIntroductionStart = EventKind(5)
// EventKindBatchIntroduction is fired when Batch() completes.
var EventKindBatchIntroduction = EventKind(6)

View file

@ -0,0 +1,443 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"fmt"
"sync/atomic"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
)
type segmentIntroduction struct {
id uint64
data segment.Segment
obsoletes map[uint64]*roaring.Bitmap
ids []string
internal map[string][]byte
applied chan error
persisted chan error
}
type persistIntroduction struct {
persisted map[uint64]segment.Segment
applied notificationChan
}
type epochWatcher struct {
epoch uint64
notifyCh notificationChan
}
type snapshotReversion struct {
snapshot *IndexSnapshot
applied chan error
persisted chan error
}
func (s *Scorch) mainLoop() {
var epochWatchers []*epochWatcher
OUTER:
for {
atomic.AddUint64(&s.stats.TotIntroduceLoop, 1)
select {
case <-s.closeCh:
break OUTER
case epochWatcher := <-s.introducerNotifier:
epochWatchers = append(epochWatchers, epochWatcher)
case nextMerge := <-s.merges:
s.introduceMerge(nextMerge)
case next := <-s.introductions:
err := s.introduceSegment(next)
if err != nil {
continue OUTER
}
case persist := <-s.persists:
s.introducePersist(persist)
case revertTo := <-s.revertToSnapshots:
err := s.revertToSnapshot(revertTo)
if err != nil {
continue OUTER
}
}
var epochCurr uint64
s.rootLock.RLock()
if s.root != nil {
epochCurr = s.root.epoch
}
s.rootLock.RUnlock()
var epochWatchersNext []*epochWatcher
for _, w := range epochWatchers {
if w.epoch < epochCurr {
close(w.notifyCh)
} else {
epochWatchersNext = append(epochWatchersNext, w)
}
}
epochWatchers = epochWatchersNext
}
s.asyncTasks.Done()
}
func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1)
s.rootLock.RLock()
root := s.root
s.rootLock.RUnlock()
nsegs := len(root.segment)
// prepare new index snapshot
newSnapshot := &IndexSnapshot{
parent: s,
segment: make([]*SegmentSnapshot, 0, nsegs+1),
offsets: make([]uint64, 0, nsegs+1),
internal: make(map[string][]byte, len(root.internal)),
refs: 1,
creator: "introduceSegment",
}
// iterate through current segments
var running uint64
for i := range root.segment {
// see if optimistic work included this segment
delta, ok := next.obsoletes[root.segment[i].id]
if !ok {
var err error
delta, err = root.segment[i].segment.DocNumbers(next.ids)
if err != nil {
next.applied <- fmt.Errorf("error computing doc numbers: %v", err)
close(next.applied)
_ = newSnapshot.DecRef()
return err
}
}
newss := &SegmentSnapshot{
id: root.segment[i].id,
segment: root.segment[i].segment,
cachedDocs: root.segment[i].cachedDocs,
creator: root.segment[i].creator,
}
// apply new obsoletions
if root.segment[i].deleted == nil {
newss.deleted = delta
} else {
newss.deleted = roaring.Or(root.segment[i].deleted, delta)
}
if newss.deleted.IsEmpty() {
newss.deleted = nil
}
// check for live size before copying
if newss.LiveSize() > 0 {
newSnapshot.segment = append(newSnapshot.segment, newss)
root.segment[i].segment.AddRef()
newSnapshot.offsets = append(newSnapshot.offsets, running)
running += newss.segment.Count()
}
}
// append new segment, if any, to end of the new index snapshot
if next.data != nil {
newSegmentSnapshot := &SegmentSnapshot{
id: next.id,
segment: next.data, // take ownership of next.data's ref-count
cachedDocs: &cachedDocs{cache: nil},
creator: "introduceSegment",
}
newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot)
newSnapshot.offsets = append(newSnapshot.offsets, running)
// increment numItemsIntroduced which tracks the number of items
// queued for persistence.
atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count())
atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1)
}
// copy old values
for key, oldVal := range root.internal {
newSnapshot.internal[key] = oldVal
}
// set new values and apply deletes
for key, newVal := range next.internal {
if newVal != nil {
newSnapshot.internal[key] = newVal
} else {
delete(newSnapshot.internal, key)
}
}
newSnapshot.updateSize()
s.rootLock.Lock()
if next.persisted != nil {
s.rootPersisted = append(s.rootPersisted, next.persisted)
}
// swap in new index snapshot
newSnapshot.epoch = s.nextSnapshotEpoch
s.nextSnapshotEpoch++
rootPrev := s.root
s.root = newSnapshot
// release lock
s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
close(next.applied)
return nil
}
func (s *Scorch) introducePersist(persist *persistIntroduction) {
atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1)
s.rootLock.Lock()
root := s.root
nextSnapshotEpoch := s.nextSnapshotEpoch
s.nextSnapshotEpoch++
s.rootLock.Unlock()
newIndexSnapshot := &IndexSnapshot{
parent: s,
epoch: nextSnapshotEpoch,
segment: make([]*SegmentSnapshot, len(root.segment)),
offsets: make([]uint64, len(root.offsets)),
internal: make(map[string][]byte, len(root.internal)),
refs: 1,
creator: "introducePersist",
}
for i, segmentSnapshot := range root.segment {
// see if this segment has been replaced
if replacement, ok := persist.persisted[segmentSnapshot.id]; ok {
newSegmentSnapshot := &SegmentSnapshot{
id: segmentSnapshot.id,
segment: replacement,
deleted: segmentSnapshot.deleted,
cachedDocs: segmentSnapshot.cachedDocs,
creator: "introducePersist",
}
newIndexSnapshot.segment[i] = newSegmentSnapshot
delete(persist.persisted, segmentSnapshot.id)
// update items persisted incase of a new segment snapshot
atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count())
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
} else {
newIndexSnapshot.segment[i] = root.segment[i]
newIndexSnapshot.segment[i].segment.AddRef()
}
newIndexSnapshot.offsets[i] = root.offsets[i]
}
for k, v := range root.internal {
newIndexSnapshot.internal[k] = v
}
newIndexSnapshot.updateSize()
s.rootLock.Lock()
rootPrev := s.root
s.root = newIndexSnapshot
s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
close(persist.applied)
}
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1)
s.rootLock.RLock()
root := s.root
s.rootLock.RUnlock()
newSnapshot := &IndexSnapshot{
parent: s,
internal: root.internal,
refs: 1,
creator: "introduceMerge",
}
// iterate through current segments
newSegmentDeleted := roaring.NewBitmap()
var running uint64
for i := range root.segment {
segmentID := root.segment[i].id
if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok {
// this segment is going away, see if anything else was deleted since we started the merge
if segSnapAtMerge != nil && root.segment[i].deleted != nil {
// assume all these deletes are new
deletedSince := root.segment[i].deleted
// if we already knew about some of them, remove
if segSnapAtMerge.deleted != nil {
deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted)
}
deletedSinceItr := deletedSince.Iterator()
for deletedSinceItr.HasNext() {
oldDocNum := deletedSinceItr.Next()
newDocNum := nextMerge.oldNewDocNums[segmentID][oldDocNum]
newSegmentDeleted.Add(uint32(newDocNum))
}
}
// clean up the old segment map to figure out the
// obsolete segments wrt root in meantime, whatever
// segments left behind in old map after processing
// the root segments would be the obsolete segment set
delete(nextMerge.old, segmentID)
} else if root.segment[i].LiveSize() > 0 {
// this segment is staying
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
id: root.segment[i].id,
segment: root.segment[i].segment,
deleted: root.segment[i].deleted,
cachedDocs: root.segment[i].cachedDocs,
creator: root.segment[i].creator,
})
root.segment[i].segment.AddRef()
newSnapshot.offsets = append(newSnapshot.offsets, running)
running += root.segment[i].segment.Count()
}
}
// before the newMerge introduction, need to clean the newly
// merged segment wrt the current root segments, hence
// applying the obsolete segment contents to newly merged segment
for segID, ss := range nextMerge.old {
obsoleted := ss.DocNumbersLive()
if obsoleted != nil {
obsoletedIter := obsoleted.Iterator()
for obsoletedIter.HasNext() {
oldDocNum := obsoletedIter.Next()
newDocNum := nextMerge.oldNewDocNums[segID][oldDocNum]
newSegmentDeleted.Add(uint32(newDocNum))
}
}
}
// In case where all the docs in the newly merged segment getting
// deleted by the time we reach here, can skip the introduction.
if nextMerge.new != nil &&
nextMerge.new.Count() > newSegmentDeleted.GetCardinality() {
// put new segment at end
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
id: nextMerge.id,
segment: nextMerge.new, // take ownership for nextMerge.new's ref-count
deleted: newSegmentDeleted,
cachedDocs: &cachedDocs{cache: nil},
creator: "introduceMerge",
})
newSnapshot.offsets = append(newSnapshot.offsets, running)
atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1)
}
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response
newSnapshot.updateSize()
s.rootLock.Lock()
// swap in new index snapshot
newSnapshot.epoch = s.nextSnapshotEpoch
s.nextSnapshotEpoch++
rootPrev := s.root
s.root = newSnapshot
// release lock
s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
// notify requester that we incorporated this
nextMerge.notify <- newSnapshot
close(nextMerge.notify)
}
func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1)
if revertTo.snapshot == nil {
err := fmt.Errorf("Cannot revert to a nil snapshot")
revertTo.applied <- err
return err
}
// acquire lock
s.rootLock.Lock()
// prepare a new index snapshot, based on next snapshot
newSnapshot := &IndexSnapshot{
parent: s,
segment: make([]*SegmentSnapshot, len(revertTo.snapshot.segment)),
offsets: revertTo.snapshot.offsets,
internal: revertTo.snapshot.internal,
epoch: s.nextSnapshotEpoch,
refs: 1,
creator: "revertToSnapshot",
}
s.nextSnapshotEpoch++
// iterate through segments
for i, segmentSnapshot := range revertTo.snapshot.segment {
newSnapshot.segment[i] = &SegmentSnapshot{
id: segmentSnapshot.id,
segment: segmentSnapshot.segment,
deleted: segmentSnapshot.deleted,
cachedDocs: segmentSnapshot.cachedDocs,
creator: segmentSnapshot.creator,
}
newSnapshot.segment[i].segment.AddRef()
// remove segment from ineligibleForRemoval map
filename := zapFileName(segmentSnapshot.id)
delete(s.ineligibleForRemoval, filename)
}
if revertTo.persisted != nil {
s.rootPersisted = append(s.rootPersisted, revertTo.persisted)
}
newSnapshot.updateSize()
// swap in new snapshot
rootPrev := s.root
s.root = newSnapshot
// release lock
s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
close(revertTo.applied)
return nil
}

View file

@ -0,0 +1,342 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"encoding/json"
"fmt"
"os"
"sync/atomic"
"time"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/mergeplan"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
)
func (s *Scorch) mergerLoop() {
var lastEpochMergePlanned uint64
mergePlannerOptions, err := s.parseMergePlannerOptions()
if err != nil {
s.fireAsyncError(fmt.Errorf("mergePlannerOption json parsing err: %v", err))
s.asyncTasks.Done()
return
}
OUTER:
for {
atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1)
select {
case <-s.closeCh:
break OUTER
default:
// check to see if there is a new snapshot to persist
s.rootLock.RLock()
ourSnapshot := s.root
ourSnapshot.AddRef()
atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size()))
atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch)
s.rootLock.RUnlock()
if ourSnapshot.epoch != lastEpochMergePlanned {
startTime := time.Now()
// lets get started
err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions)
if err != nil {
atomic.StoreUint64(&s.iStats.mergeEpoch, 0)
if err == ErrClosed {
// index has been closed
_ = ourSnapshot.DecRef()
break OUTER
}
s.fireAsyncError(fmt.Errorf("merging err: %v", err))
_ = ourSnapshot.DecRef()
atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1)
continue OUTER
}
lastEpochMergePlanned = ourSnapshot.epoch
s.fireEvent(EventKindMergerProgress, time.Since(startTime))
}
_ = ourSnapshot.DecRef()
// tell the persister we're waiting for changes
// first make a epochWatcher chan
ew := &epochWatcher{
epoch: lastEpochMergePlanned,
notifyCh: make(notificationChan, 1),
}
// give it to the persister
select {
case <-s.closeCh:
break OUTER
case s.persisterNotifier <- ew:
}
// now wait for persister (but also detect close)
select {
case <-s.closeCh:
break OUTER
case <-ew.notifyCh:
}
}
atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1)
}
s.asyncTasks.Done()
}
func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions,
error) {
mergePlannerOptions := mergeplan.DefaultMergePlanOptions
if v, ok := s.config["scorchMergePlanOptions"]; ok {
b, err := json.Marshal(v)
if err != nil {
return &mergePlannerOptions, err
}
err = json.Unmarshal(b, &mergePlannerOptions)
if err != nil {
return &mergePlannerOptions, err
}
err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions)
if err != nil {
return nil, err
}
}
return &mergePlannerOptions, nil
}
func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
options *mergeplan.MergePlanOptions) error {
// build list of zap segments in this snapshot
var onlyZapSnapshots []mergeplan.Segment
for _, segmentSnapshot := range ourSnapshot.segment {
if _, ok := segmentSnapshot.segment.(*zap.Segment); ok {
onlyZapSnapshots = append(onlyZapSnapshots, segmentSnapshot)
}
}
atomic.AddUint64(&s.stats.TotFileMergePlan, 1)
// give this list to the planner
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options)
if err != nil {
atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1)
return fmt.Errorf("merge planning err: %v", err)
}
if resultMergePlan == nil {
// nothing to do
atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1)
return nil
}
atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1)
atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks)))
// process tasks in serial for now
var notifications []chan *IndexSnapshot
for _, task := range resultMergePlan.Tasks {
if len(task.Segments) == 0 {
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1)
continue
}
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments)))
oldMap := make(map[uint64]*SegmentSnapshot)
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments))
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments))
for _, planSegment := range task.Segments {
if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok {
oldMap[segSnapshot.id] = segSnapshot
if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok {
if segSnapshot.LiveSize() == 0 {
atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1)
oldMap[segSnapshot.id] = nil
} else {
segmentsToMerge = append(segmentsToMerge, zapSeg)
docsToDrop = append(docsToDrop, segSnapshot.deleted)
}
}
}
}
var oldNewDocNums map[uint64][]uint64
var segment segment.Segment
if len(segmentsToMerge) > 0 {
filename := zapFileName(newSegmentID)
s.markIneligibleForRemoval(filename)
path := s.path + string(os.PathSeparator) + filename
fileMergeZapStartTime := time.Now()
atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, DefaultChunkFactor)
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1)
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes)
fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime))
atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime)
if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime {
atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime)
}
if err != nil {
s.unmarkIneligibleForRemoval(filename)
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
return fmt.Errorf("merging failed: %v", err)
}
segment, err = zap.Open(path)
if err != nil {
s.unmarkIneligibleForRemoval(filename)
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
return err
}
oldNewDocNums = make(map[uint64][]uint64)
for i, segNewDocNums := range newDocNums {
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
}
atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge)))
}
sm := &segmentMerge{
id: newSegmentID,
old: oldMap,
oldNewDocNums: oldNewDocNums,
new: segment,
notify: make(chan *IndexSnapshot, 1),
}
notifications = append(notifications, sm.notify)
// give it to the introducer
select {
case <-s.closeCh:
_ = segment.Close()
return ErrClosed
case s.merges <- sm:
atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1)
}
atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1)
}
for _, notification := range notifications {
select {
case <-s.closeCh:
return ErrClosed
case newSnapshot := <-notification:
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1)
if newSnapshot != nil {
_ = newSnapshot.DecRef()
}
}
}
return nil
}
type segmentMerge struct {
id uint64
old map[uint64]*SegmentSnapshot
oldNewDocNums map[uint64][]uint64
new segment.Segment
notify chan *IndexSnapshot
}
// perform a merging of the given SegmentBase instances into a new,
// persisted segment, and synchronously introduce that new segment
// into the root
func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int,
chunkFactor uint32) (*IndexSnapshot, uint64, error) {
atomic.AddUint64(&s.stats.TotMemMergeBeg, 1)
memMergeZapStartTime := time.Now()
atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1)
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
filename := zapFileName(newSegmentID)
path := s.path + string(os.PathSeparator) + filename
newDocNums, _, err :=
zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor)
atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1)
memMergeZapTime := uint64(time.Since(memMergeZapStartTime))
atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime)
if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime {
atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime)
}
if err != nil {
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
return nil, 0, err
}
segment, err := zap.Open(path)
if err != nil {
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
return nil, 0, err
}
// update persisted stats
atomic.AddUint64(&s.stats.TotPersistedItems, segment.Count())
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
sm := &segmentMerge{
id: newSegmentID,
old: make(map[uint64]*SegmentSnapshot),
oldNewDocNums: make(map[uint64][]uint64),
new: segment,
notify: make(chan *IndexSnapshot, 1),
}
for i, idx := range sbsIndexes {
ss := snapshot.segment[idx]
sm.old[ss.id] = ss
sm.oldNewDocNums[ss.id] = newDocNums[i]
}
select { // send to introducer
case <-s.closeCh:
_ = segment.DecRef()
return nil, 0, ErrClosed
case s.merges <- sm:
}
select { // wait for introduction to complete
case <-s.closeCh:
return nil, 0, ErrClosed
case newSnapshot := <-sm.notify:
atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs)))
atomic.AddUint64(&s.stats.TotMemMergeDone, 1)
return newSnapshot, newSegmentID, nil
}
}

View file

@ -0,0 +1,386 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package mergeplan provides a segment merge planning approach that's
// inspired by Lucene's TieredMergePolicy.java and descriptions like
// http://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html
package mergeplan
import (
"errors"
"fmt"
"math"
"sort"
"strings"
)
// A Segment represents the information that the planner needs to
// calculate segment merging.
type Segment interface {
// Unique id of the segment -- used for sorting.
Id() uint64
// Full segment size (the size before any logical deletions).
FullSize() int64
// Size of the live data of the segment; i.e., FullSize() minus
// any logical deletions.
LiveSize() int64
}
// Plan() will functionally compute a merge plan. A segment will be
// assigned to at most a single MergeTask in the output MergePlan. A
// segment not assigned to any MergeTask means the segment should
// remain unmerged.
func Plan(segments []Segment, o *MergePlanOptions) (*MergePlan, error) {
return plan(segments, o)
}
// A MergePlan is the result of the Plan() API.
//
// The planner doesnt know how or whether these tasks are executed --
// thats up to a separate merge execution system, which might execute
// these tasks concurrently or not, and which might execute all the
// tasks or not.
type MergePlan struct {
Tasks []*MergeTask
}
// A MergeTask represents several segments that should be merged
// together into a single segment.
type MergeTask struct {
Segments []Segment
}
// The MergePlanOptions is designed to be reusable between planning calls.
type MergePlanOptions struct {
// Max # segments per logarithmic tier, or max width of any
// logarithmic “step”. Smaller values mean more merging but fewer
// segments. Should be >= SegmentsPerMergeTask, else you'll have
// too much merging.
MaxSegmentsPerTier int
// Max size of any segment produced after merging. Actual
// merging, however, may produce segment sizes different than the
// planners predicted sizes.
MaxSegmentSize int64
// The growth factor for each tier in a staircase of idealized
// segments computed by CalcBudget().
TierGrowth float64
// The number of segments in any resulting MergeTask. e.g.,
// len(result.Tasks[ * ].Segments) == SegmentsPerMergeTask.
SegmentsPerMergeTask int
// Small segments are rounded up to this size, i.e., treated as
// equal (floor) size for consideration. This is to prevent lots
// of tiny segments from resulting in a long tail in the index.
FloorSegmentSize int64
// Controls how aggressively merges that reclaim more deletions
// are favored. Higher values will more aggressively target
// merges that reclaim deletions, but be careful not to go so high
// that way too much merging takes place; a value of 3.0 is
// probably nearly too high. A value of 0.0 means deletions don't
// impact merge selection.
ReclaimDeletesWeight float64
// Optional, defaults to mergeplan.CalcBudget().
CalcBudget func(totalSize int64, firstTierSize int64,
o *MergePlanOptions) (budgetNumSegments int)
// Optional, defaults to mergeplan.ScoreSegments().
ScoreSegments func(segments []Segment, o *MergePlanOptions) float64
// Optional.
Logger func(string)
}
// Returns the higher of the input or FloorSegmentSize.
func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 {
if s > o.FloorSegmentSize {
return s
}
return o.FloorSegmentSize
}
// MaxSegmentSizeLimit represents the maximum size of a segment,
// this limit comes with hit-1 optimisation/max encoding limit uint31.
const MaxSegmentSizeLimit = 1<<31 - 1
// ErrMaxSegmentSizeTooLarge is returned when the size of the segment
// exceeds the MaxSegmentSizeLimit
var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit")
// DefaultMergePlanOptions suggests the default options.
var DefaultMergePlanOptions = MergePlanOptions{
MaxSegmentsPerTier: 10,
MaxSegmentSize: 5000000,
TierGrowth: 10.0,
SegmentsPerMergeTask: 10,
FloorSegmentSize: 2000,
ReclaimDeletesWeight: 2.0,
}
// -------------------------------------------
func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
if len(segmentsIn) <= 1 {
return nil, nil
}
if o == nil {
o = &DefaultMergePlanOptions
}
segments := append([]Segment(nil), segmentsIn...) // Copy.
sort.Sort(byLiveSizeDescending(segments))
var minLiveSize int64 = math.MaxInt64
var eligibles []Segment
var eligiblesLiveSize int64
for _, segment := range segments {
if minLiveSize > segment.LiveSize() {
minLiveSize = segment.LiveSize()
}
// Only small-enough segments are eligible.
if segment.LiveSize() < o.MaxSegmentSize/2 {
eligibles = append(eligibles, segment)
eligiblesLiveSize += segment.LiveSize()
}
}
minLiveSize = o.RaiseToFloorSegmentSize(minLiveSize)
calcBudget := o.CalcBudget
if calcBudget == nil {
calcBudget = CalcBudget
}
budgetNumSegments := CalcBudget(eligiblesLiveSize, minLiveSize, o)
scoreSegments := o.ScoreSegments
if scoreSegments == nil {
scoreSegments = ScoreSegments
}
rv := &MergePlan{}
var empties []Segment
for _, eligible := range eligibles {
if eligible.LiveSize() <= 0 {
empties = append(empties, eligible)
}
}
if len(empties) > 0 {
rv.Tasks = append(rv.Tasks, &MergeTask{Segments: empties})
eligibles = removeSegments(eligibles, empties)
}
// While were over budget, keep looping, which might produce
// another MergeTask.
for len(eligibles) > 0 && (len(eligibles)+len(rv.Tasks)) > budgetNumSegments {
// Track a current best roster as we examine and score
// potential rosters of merges.
var bestRoster []Segment
var bestRosterScore float64 // Lower score is better.
for startIdx := 0; startIdx < len(eligibles); startIdx++ {
var roster []Segment
var rosterLiveSize int64
for idx := startIdx; idx < len(eligibles) && len(roster) < o.SegmentsPerMergeTask; idx++ {
eligible := eligibles[idx]
if rosterLiveSize+eligible.LiveSize() < o.MaxSegmentSize {
roster = append(roster, eligible)
rosterLiveSize += eligible.LiveSize()
}
}
if len(roster) > 0 {
rosterScore := scoreSegments(roster, o)
if len(bestRoster) <= 0 || rosterScore < bestRosterScore {
bestRoster = roster
bestRosterScore = rosterScore
}
}
}
if len(bestRoster) <= 0 {
return rv, nil
}
rv.Tasks = append(rv.Tasks, &MergeTask{Segments: bestRoster})
eligibles = removeSegments(eligibles, bestRoster)
}
return rv, nil
}
// Compute the number of segments that would be needed to cover the
// totalSize, by climbing up a logarithmically growing staircase of
// segment tiers.
func CalcBudget(totalSize int64, firstTierSize int64, o *MergePlanOptions) (
budgetNumSegments int) {
tierSize := firstTierSize
if tierSize < 1 {
tierSize = 1
}
maxSegmentsPerTier := o.MaxSegmentsPerTier
if maxSegmentsPerTier < 1 {
maxSegmentsPerTier = 1
}
tierGrowth := o.TierGrowth
if tierGrowth < 1.0 {
tierGrowth = 1.0
}
for totalSize > 0 {
segmentsInTier := float64(totalSize) / float64(tierSize)
if segmentsInTier < float64(maxSegmentsPerTier) {
budgetNumSegments += int(math.Ceil(segmentsInTier))
break
}
budgetNumSegments += maxSegmentsPerTier
totalSize -= int64(maxSegmentsPerTier) * tierSize
tierSize = int64(float64(tierSize) * tierGrowth)
}
return budgetNumSegments
}
// Of note, removeSegments() keeps the ordering of the results stable.
func removeSegments(segments []Segment, toRemove []Segment) []Segment {
rv := make([]Segment, 0, len(segments)-len(toRemove))
OUTER:
for _, segment := range segments {
for _, r := range toRemove {
if segment == r {
continue OUTER
}
}
rv = append(rv, segment)
}
return rv
}
// Smaller result score is better.
func ScoreSegments(segments []Segment, o *MergePlanOptions) float64 {
var totBeforeSize int64
var totAfterSize int64
var totAfterSizeFloored int64
for _, segment := range segments {
totBeforeSize += segment.FullSize()
totAfterSize += segment.LiveSize()
totAfterSizeFloored += o.RaiseToFloorSegmentSize(segment.LiveSize())
}
if totBeforeSize <= 0 || totAfterSize <= 0 || totAfterSizeFloored <= 0 {
return 0
}
// Roughly guess the "balance" of the segments -- whether the
// segments are about the same size.
balance :=
float64(o.RaiseToFloorSegmentSize(segments[0].LiveSize())) /
float64(totAfterSizeFloored)
// Gently favor smaller merges over bigger ones. We don't want to
// make the exponent too large else we end up with poor merges of
// small segments in order to avoid the large merges.
score := balance * math.Pow(float64(totAfterSize), 0.05)
// Strongly favor merges that reclaim deletes.
nonDelRatio := float64(totAfterSize) / float64(totBeforeSize)
score *= math.Pow(nonDelRatio, o.ReclaimDeletesWeight)
return score
}
// ------------------------------------------
// ToBarChart returns an ASCII rendering of the segments and the plan.
// The barMax is the max width of the bars in the bar chart.
func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan) string {
rv := make([]string, 0, len(segments))
var maxFullSize int64
for _, segment := range segments {
if maxFullSize < segment.FullSize() {
maxFullSize = segment.FullSize()
}
}
if maxFullSize < 0 {
maxFullSize = 1
}
for _, segment := range segments {
barFull := int(segment.FullSize())
barLive := int(segment.LiveSize())
if maxFullSize > int64(barMax) {
barFull = int(float64(barMax) * float64(barFull) / float64(maxFullSize))
barLive = int(float64(barMax) * float64(barLive) / float64(maxFullSize))
}
barKind := " "
barChar := "."
if plan != nil {
TASK_LOOP:
for taski, task := range plan.Tasks {
for _, taskSegment := range task.Segments {
if taskSegment == segment {
barKind = "*"
barChar = fmt.Sprintf("%d", taski)
break TASK_LOOP
}
}
}
}
bar :=
strings.Repeat(barChar, barLive)[0:barLive] +
strings.Repeat("x", barFull-barLive)[0:barFull-barLive]
rv = append(rv, fmt.Sprintf("%s %5d: %5d /%5d - %s %s", prefix,
segment.Id(),
segment.LiveSize(),
segment.FullSize(),
barKind, bar))
}
return strings.Join(rv, "\n")
}
// ValidateMergePlannerOptions validates the merge planner options
func ValidateMergePlannerOptions(options *MergePlanOptions) error {
if options.MaxSegmentSize > MaxSegmentSizeLimit {
return ErrMaxSegmentSizeTooLarge
}
return nil
}

View file

@ -0,0 +1,28 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mergeplan
type byLiveSizeDescending []Segment
func (a byLiveSizeDescending) Len() int { return len(a) }
func (a byLiveSizeDescending) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a byLiveSizeDescending) Less(i, j int) bool {
if a[i].LiveSize() != a[j].LiveSize() {
return a[i].LiveSize() > a[j].LiveSize()
}
return a[i].Id() < a[j].Id()
}

View file

@ -0,0 +1,93 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"fmt"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
)
func (s *IndexSnapshotTermFieldReader) Optimize(kind string, octx index.OptimizableContext) (
index.OptimizableContext, error) {
if kind != "conjunction" {
return octx, nil
}
if octx == nil {
octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
}
o, ok := octx.(*OptimizeTFRConjunction)
if !ok {
return octx, nil
}
if o.snapshot != s.snapshot {
return nil, fmt.Errorf("tried to optimize across different snapshots")
}
o.tfrs = append(o.tfrs, s)
return o, nil
}
type OptimizeTFRConjunction struct {
snapshot *IndexSnapshot
tfrs []*IndexSnapshotTermFieldReader
}
func (o *OptimizeTFRConjunction) Finish() error {
if len(o.tfrs) <= 1 {
return nil
}
for i := range o.snapshot.segment {
itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator)
if !ok || itr0.ActualBM == nil {
continue
}
itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator)
if !ok || itr1.ActualBM == nil {
continue
}
bm := roaring.And(itr0.ActualBM, itr1.ActualBM)
for _, tfr := range o.tfrs[2:] {
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
if !ok || itr.ActualBM == nil {
continue
}
bm.And(itr.ActualBM)
}
for _, tfr := range o.tfrs {
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
if ok && itr.ActualBM != nil {
itr.ActualBM = bm
itr.Actual = bm.Iterator()
}
}
}
return nil
}

View file

@ -0,0 +1,838 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"bytes"
"encoding/binary"
"fmt"
"io/ioutil"
"log"
"os"
"path/filepath"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/boltdb/bolt"
)
var DefaultChunkFactor uint32 = 1024
// Arbitrary number, need to make it configurable.
// Lower values like 10/making persister really slow
// doesn't work well as it is creating more files to
// persist for in next persist iteration and spikes the # FDs.
// Ideal value should let persister also proceed at
// an optimum pace so that the merger can skip
// many intermediate snapshots.
// This needs to be based on empirical data.
// TODO - may need to revisit this approach/value.
var epochDistance = uint64(5)
type notificationChan chan struct{}
func (s *Scorch) persisterLoop() {
defer s.asyncTasks.Done()
var persistWatchers []*epochWatcher
var lastPersistedEpoch, lastMergedEpoch uint64
var ew *epochWatcher
OUTER:
for {
atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1)
select {
case <-s.closeCh:
break OUTER
case ew = <-s.persisterNotifier:
persistWatchers = append(persistWatchers, ew)
default:
}
if ew != nil && ew.epoch > lastMergedEpoch {
lastMergedEpoch = ew.epoch
}
lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
lastMergedEpoch, persistWatchers)
var ourSnapshot *IndexSnapshot
var ourPersisted []chan error
// check to see if there is a new snapshot to persist
s.rootLock.Lock()
if s.root != nil && s.root.epoch > lastPersistedEpoch {
ourSnapshot = s.root
ourSnapshot.AddRef()
ourPersisted = s.rootPersisted
s.rootPersisted = nil
atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size()))
atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch)
}
s.rootLock.Unlock()
if ourSnapshot != nil {
startTime := time.Now()
err := s.persistSnapshot(ourSnapshot)
for _, ch := range ourPersisted {
if err != nil {
ch <- err
}
close(ch)
}
if err != nil {
atomic.StoreUint64(&s.iStats.persistEpoch, 0)
if err == ErrClosed {
// index has been closed
_ = ourSnapshot.DecRef()
break OUTER
}
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
_ = ourSnapshot.DecRef()
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
continue OUTER
}
lastPersistedEpoch = ourSnapshot.epoch
for _, ew := range persistWatchers {
close(ew.notifyCh)
}
persistWatchers = nil
_ = ourSnapshot.DecRef()
changed := false
s.rootLock.RLock()
if s.root != nil && s.root.epoch != lastPersistedEpoch {
changed = true
}
s.rootLock.RUnlock()
s.fireEvent(EventKindPersisterProgress, time.Since(startTime))
if changed {
atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1)
continue OUTER
}
}
// tell the introducer we're waiting for changes
w := &epochWatcher{
epoch: lastPersistedEpoch,
notifyCh: make(notificationChan, 1),
}
select {
case <-s.closeCh:
break OUTER
case s.introducerNotifier <- w:
}
s.removeOldData() // might as well cleanup while waiting
atomic.AddUint64(&s.stats.TotPersistLoopWait, 1)
select {
case <-s.closeCh:
break OUTER
case <-w.notifyCh:
// woken up, next loop should pick up work
atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1)
case ew = <-s.persisterNotifier:
// if the watchers are already caught up then let them wait,
// else let them continue to do the catch up
persistWatchers = append(persistWatchers, ew)
}
atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1)
}
}
func notifyMergeWatchers(lastPersistedEpoch uint64,
persistWatchers []*epochWatcher) []*epochWatcher {
var watchersNext []*epochWatcher
for _, w := range persistWatchers {
if w.epoch < lastPersistedEpoch {
close(w.notifyCh)
} else {
watchersNext = append(watchersNext, w)
}
}
return watchersNext
}
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64,
persistWatchers []*epochWatcher) (uint64, []*epochWatcher) {
// first, let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
OUTER:
// check for slow merger and await until the merger catch up
for lastPersistedEpoch > lastMergedEpoch+epochDistance {
atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1)
select {
case <-s.closeCh:
break OUTER
case ew := <-s.persisterNotifier:
persistWatchers = append(persistWatchers, ew)
lastMergedEpoch = ew.epoch
}
atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1)
// let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
}
return lastMergedEpoch, persistWatchers
}
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
persisted, err := s.persistSnapshotMaybeMerge(snapshot)
if err != nil {
return err
}
if persisted {
return nil
}
return s.persistSnapshotDirect(snapshot)
}
// DefaultMinSegmentsForInMemoryMerge represents the default number of
// in-memory zap segments that persistSnapshotMaybeMerge() needs to
// see in an IndexSnapshot before it decides to merge and persist
// those segments
var DefaultMinSegmentsForInMemoryMerge = 2
// persistSnapshotMaybeMerge examines the snapshot and might merge and
// persist the in-memory zap segments if there are enough of them
func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
bool, error) {
// collect the in-memory zap segments (SegmentBase instances)
var sbs []*zap.SegmentBase
var sbsDrops []*roaring.Bitmap
var sbsIndexes []int
for i, segmentSnapshot := range snapshot.segment {
if sb, ok := segmentSnapshot.segment.(*zap.SegmentBase); ok {
sbs = append(sbs, sb)
sbsDrops = append(sbsDrops, segmentSnapshot.deleted)
sbsIndexes = append(sbsIndexes, i)
}
}
if len(sbs) < DefaultMinSegmentsForInMemoryMerge {
return false, nil
}
newSnapshot, newSegmentID, err := s.mergeSegmentBases(
snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor)
if err != nil {
return false, err
}
if newSnapshot == nil {
return false, nil
}
defer func() {
_ = newSnapshot.DecRef()
}()
mergedSegmentIDs := map[uint64]struct{}{}
for _, idx := range sbsIndexes {
mergedSegmentIDs[snapshot.segment[idx].id] = struct{}{}
}
// construct a snapshot that's logically equivalent to the input
// snapshot, but with merged segments replaced by the new segment
equiv := &IndexSnapshot{
parent: snapshot.parent,
segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)),
internal: snapshot.internal,
epoch: snapshot.epoch,
creator: "persistSnapshotMaybeMerge",
}
// copy to the equiv the segments that weren't replaced
for _, segment := range snapshot.segment {
if _, wasMerged := mergedSegmentIDs[segment.id]; !wasMerged {
equiv.segment = append(equiv.segment, segment)
}
}
// append to the equiv the new segment
for _, segment := range newSnapshot.segment {
if segment.id == newSegmentID {
equiv.segment = append(equiv.segment, &SegmentSnapshot{
id: newSegmentID,
segment: segment.segment,
deleted: nil, // nil since merging handled deletions
})
break
}
}
err = s.persistSnapshotDirect(equiv)
if err != nil {
return false, err
}
return true, nil
}
func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
// start a write transaction
tx, err := s.rootBolt.Begin(true)
if err != nil {
return err
}
// defer rollback on error
defer func() {
if err != nil {
_ = tx.Rollback()
}
}()
snapshotsBucket, err := tx.CreateBucketIfNotExists(boltSnapshotsBucket)
if err != nil {
return err
}
newSnapshotKey := segment.EncodeUvarintAscending(nil, snapshot.epoch)
snapshotBucket, err := snapshotsBucket.CreateBucketIfNotExists(newSnapshotKey)
if err != nil {
return err
}
// persist meta values
metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey)
if err != nil {
return err
}
err = metaBucket.Put([]byte("type"), []byte(zap.Type))
if err != nil {
return err
}
buf := make([]byte, binary.MaxVarintLen32)
binary.BigEndian.PutUint32(buf, zap.Version)
err = metaBucket.Put([]byte("version"), buf)
if err != nil {
return err
}
// persist internal values
internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey)
if err != nil {
return err
}
// TODO optimize writing these in order?
for k, v := range snapshot.internal {
err = internalBucket.Put([]byte(k), v)
if err != nil {
return err
}
}
var filenames []string
newSegmentPaths := make(map[uint64]string)
// first ensure that each segment in this snapshot has been persisted
for _, segmentSnapshot := range snapshot.segment {
snapshotSegmentKey := segment.EncodeUvarintAscending(nil, segmentSnapshot.id)
snapshotSegmentBucket, err := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey)
if err != nil {
return err
}
switch seg := segmentSnapshot.segment.(type) {
case *zap.SegmentBase:
// need to persist this to disk
filename := zapFileName(segmentSnapshot.id)
path := s.path + string(os.PathSeparator) + filename
err = zap.PersistSegmentBase(seg, path)
if err != nil {
return fmt.Errorf("error persisting segment: %v", err)
}
newSegmentPaths[segmentSnapshot.id] = path
err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename))
if err != nil {
return err
}
filenames = append(filenames, filename)
case *zap.Segment:
path := seg.Path()
filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator))
err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename))
if err != nil {
return err
}
filenames = append(filenames, filename)
default:
return fmt.Errorf("unknown segment type: %T", seg)
}
// store current deleted bits
var roaringBuf bytes.Buffer
if segmentSnapshot.deleted != nil {
_, err = segmentSnapshot.deleted.WriteTo(&roaringBuf)
if err != nil {
return fmt.Errorf("error persisting roaring bytes: %v", err)
}
err = snapshotSegmentBucket.Put(boltDeletedKey, roaringBuf.Bytes())
if err != nil {
return err
}
}
}
// we need to swap in a new root only when we've persisted 1 or
// more segments -- whereby the new root would have 1-for-1
// replacements of in-memory segments with file-based segments
//
// other cases like updates to internal values only, and/or when
// there are only deletions, are already covered and persisted by
// the newly populated boltdb snapshotBucket above
if len(newSegmentPaths) > 0 {
// now try to open all the new snapshots
newSegments := make(map[uint64]segment.Segment)
defer func() {
for _, s := range newSegments {
if s != nil {
// cleanup segments that were opened but not
// swapped into the new root
_ = s.Close()
}
}
}()
for segmentID, path := range newSegmentPaths {
newSegments[segmentID], err = zap.Open(path)
if err != nil {
return fmt.Errorf("error opening new segment at %s, %v", path, err)
}
}
persist := &persistIntroduction{
persisted: newSegments,
applied: make(notificationChan),
}
select {
case <-s.closeCh:
err = ErrClosed
return err
case s.persists <- persist:
}
select {
case <-s.closeCh:
err = ErrClosed
return err
case <-persist.applied:
}
}
err = tx.Commit()
if err != nil {
return err
}
err = s.rootBolt.Sync()
if err != nil {
return err
}
// allow files to become eligible for removal after commit, such
// as file segments from snapshots that came from the merger
s.rootLock.Lock()
for _, filename := range filenames {
delete(s.ineligibleForRemoval, filename)
}
s.rootLock.Unlock()
return nil
}
func zapFileName(epoch uint64) string {
return fmt.Sprintf("%012x.zap", epoch)
}
// bolt snapshot code
var boltSnapshotsBucket = []byte{'s'}
var boltPathKey = []byte{'p'}
var boltDeletedKey = []byte{'d'}
var boltInternalKey = []byte{'i'}
var boltMetaDataKey = []byte{'m'}
func (s *Scorch) loadFromBolt() error {
return s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
foundRoot := false
c := snapshots.Cursor()
for k, _ := c.Last(); k != nil; k, _ = c.Prev() {
_, snapshotEpoch, err := segment.DecodeUvarintAscending(k)
if err != nil {
log.Printf("unable to parse segment epoch %x, continuing", k)
continue
}
if foundRoot {
s.AddEligibleForRemoval(snapshotEpoch)
continue
}
snapshot := snapshots.Bucket(k)
if snapshot == nil {
log.Printf("snapshot key, but bucket missing %x, continuing", k)
s.AddEligibleForRemoval(snapshotEpoch)
continue
}
indexSnapshot, err := s.loadSnapshot(snapshot)
if err != nil {
log.Printf("unable to load snapshot, %v, continuing", err)
s.AddEligibleForRemoval(snapshotEpoch)
continue
}
indexSnapshot.epoch = snapshotEpoch
// set the nextSegmentID
s.nextSegmentID, err = s.maxSegmentIDOnDisk()
if err != nil {
return err
}
s.nextSegmentID++
s.rootLock.Lock()
s.nextSnapshotEpoch = snapshotEpoch + 1
if s.root != nil {
_ = s.root.DecRef()
}
s.root = indexSnapshot
s.rootLock.Unlock()
foundRoot = true
}
return nil
})
}
// LoadSnapshot loads the segment with the specified epoch
// NOTE: this is currently ONLY intended to be used by the command-line tool
func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
err = s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
snapshotKey := segment.EncodeUvarintAscending(nil, epoch)
snapshot := snapshots.Bucket(snapshotKey)
if snapshot == nil {
return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch)
}
rv, err = s.loadSnapshot(snapshot)
return err
})
if err != nil {
return nil, err
}
return rv, nil
}
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
rv := &IndexSnapshot{
parent: s,
internal: make(map[string][]byte),
refs: 1,
creator: "loadSnapshot",
}
var running uint64
c := snapshot.Cursor()
for k, _ := c.First(); k != nil; k, _ = c.Next() {
if k[0] == boltInternalKey[0] {
internalBucket := snapshot.Bucket(k)
err := internalBucket.ForEach(func(key []byte, val []byte) error {
copiedVal := append([]byte(nil), val...)
rv.internal[string(key)] = copiedVal
return nil
})
if err != nil {
_ = rv.DecRef()
return nil, err
}
} else if k[0] != boltMetaDataKey[0] {
segmentBucket := snapshot.Bucket(k)
if segmentBucket == nil {
_ = rv.DecRef()
return nil, fmt.Errorf("segment key, but bucket missing % x", k)
}
segmentSnapshot, err := s.loadSegment(segmentBucket)
if err != nil {
_ = rv.DecRef()
return nil, fmt.Errorf("failed to load segment: %v", err)
}
_, segmentSnapshot.id, err = segment.DecodeUvarintAscending(k)
if err != nil {
_ = rv.DecRef()
return nil, fmt.Errorf("failed to decode segment id: %v", err)
}
rv.segment = append(rv.segment, segmentSnapshot)
rv.offsets = append(rv.offsets, running)
running += segmentSnapshot.segment.Count()
}
}
return rv, nil
}
func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, error) {
pathBytes := segmentBucket.Get(boltPathKey)
if pathBytes == nil {
return nil, fmt.Errorf("segment path missing")
}
segmentPath := s.path + string(os.PathSeparator) + string(pathBytes)
segment, err := zap.Open(segmentPath)
if err != nil {
return nil, fmt.Errorf("error opening bolt segment: %v", err)
}
rv := &SegmentSnapshot{
segment: segment,
cachedDocs: &cachedDocs{cache: nil},
}
deletedBytes := segmentBucket.Get(boltDeletedKey)
if deletedBytes != nil {
deletedBitmap := roaring.NewBitmap()
r := bytes.NewReader(deletedBytes)
_, err := deletedBitmap.ReadFrom(r)
if err != nil {
_ = segment.Close()
return nil, fmt.Errorf("error reading deleted bytes: %v", err)
}
if !deletedBitmap.IsEmpty() {
rv.deleted = deletedBitmap
}
}
return rv, nil
}
type uint64Descending []uint64
func (p uint64Descending) Len() int { return len(p) }
func (p uint64Descending) Less(i, j int) bool { return p[i] > p[j] }
func (p uint64Descending) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
func (s *Scorch) removeOldData() {
removed, err := s.removeOldBoltSnapshots()
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err))
}
if removed > 0 {
err = s.removeOldZapFiles()
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err))
}
}
}
// NumSnapshotsToKeep represents how many recent, old snapshots to
// keep around per Scorch instance. Useful for apps that require
// rollback'ability.
var NumSnapshotsToKeep = 1
// Removes enough snapshots from the rootBolt so that the
// s.eligibleForRemoval stays under the NumSnapshotsToKeep policy.
func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
persistedEpochs, err := s.RootBoltSnapshotEpochs()
if err != nil {
return 0, err
}
if len(persistedEpochs) <= s.numSnapshotsToKeep {
// we need to keep everything
return 0, nil
}
// make a map of epochs to protect from deletion
protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep)
for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] {
protectedEpochs[epoch] = struct{}{}
}
var epochsToRemove []uint64
var newEligible []uint64
s.rootLock.Lock()
for _, epoch := range s.eligibleForRemoval {
if _, ok := protectedEpochs[epoch]; ok {
// protected
newEligible = append(newEligible, epoch)
} else {
epochsToRemove = append(epochsToRemove, epoch)
}
}
s.eligibleForRemoval = newEligible
s.rootLock.Unlock()
if len(epochsToRemove) <= 0 {
return 0, nil
}
tx, err := s.rootBolt.Begin(true)
if err != nil {
return 0, err
}
defer func() {
if err == nil {
err = tx.Commit()
} else {
_ = tx.Rollback()
}
if err == nil {
err = s.rootBolt.Sync()
}
}()
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return 0, nil
}
for _, epochToRemove := range epochsToRemove {
k := segment.EncodeUvarintAscending(nil, epochToRemove)
err = snapshots.DeleteBucket(k)
if err == bolt.ErrBucketNotFound {
err = nil
}
if err == nil {
numRemoved++
}
}
return numRemoved, err
}
func (s *Scorch) maxSegmentIDOnDisk() (uint64, error) {
currFileInfos, err := ioutil.ReadDir(s.path)
if err != nil {
return 0, err
}
var rv uint64
for _, finfo := range currFileInfos {
fname := finfo.Name()
if filepath.Ext(fname) == ".zap" {
prefix := strings.TrimSuffix(fname, ".zap")
id, err2 := strconv.ParseUint(prefix, 16, 64)
if err2 != nil {
return 0, err2
}
if id > rv {
rv = id
}
}
}
return rv, err
}
// Removes any *.zap files which aren't listed in the rootBolt.
func (s *Scorch) removeOldZapFiles() error {
liveFileNames, err := s.loadZapFileNames()
if err != nil {
return err
}
currFileInfos, err := ioutil.ReadDir(s.path)
if err != nil {
return err
}
s.rootLock.RLock()
for _, finfo := range currFileInfos {
fname := finfo.Name()
if filepath.Ext(fname) == ".zap" {
if _, exists := liveFileNames[fname]; !exists && !s.ineligibleForRemoval[fname] {
err := os.Remove(s.path + string(os.PathSeparator) + fname)
if err != nil {
log.Printf("got err removing file: %s, err: %v", fname, err)
}
}
}
}
s.rootLock.RUnlock()
return nil
}
func (s *Scorch) RootBoltSnapshotEpochs() ([]uint64, error) {
var rv []uint64
err := s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
sc := snapshots.Cursor()
for sk, _ := sc.Last(); sk != nil; sk, _ = sc.Prev() {
_, snapshotEpoch, err := segment.DecodeUvarintAscending(sk)
if err != nil {
continue
}
rv = append(rv, snapshotEpoch)
}
return nil
})
return rv, err
}
// Returns the *.zap file names that are listed in the rootBolt.
func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) {
rv := map[string]struct{}{}
err := s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
sc := snapshots.Cursor()
for sk, _ := sc.First(); sk != nil; sk, _ = sc.Next() {
snapshot := snapshots.Bucket(sk)
if snapshot == nil {
continue
}
segc := snapshot.Cursor()
for segk, _ := segc.First(); segk != nil; segk, _ = segc.Next() {
if segk[0] == boltInternalKey[0] {
continue
}
segmentBucket := snapshot.Bucket(segk)
if segmentBucket == nil {
continue
}
pathBytes := segmentBucket.Get(boltPathKey)
if pathBytes == nil {
continue
}
pathString := string(pathBytes)
rv[string(pathString)] = struct{}{}
}
}
return nil
})
return rv, err
}

View file

@ -0,0 +1,573 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"sync"
"sync/atomic"
"time"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/registry"
"github.com/boltdb/bolt"
)
const Name = "scorch"
const Version uint8 = 2
var ErrClosed = fmt.Errorf("scorch closed")
type Scorch struct {
readOnly bool
version uint8
config map[string]interface{}
analysisQueue *index.AnalysisQueue
stats Stats
nextSegmentID uint64
path string
unsafeBatch bool
rootLock sync.RWMutex
root *IndexSnapshot // holds 1 ref-count on the root
rootPersisted []chan error // closed when root is persisted
nextSnapshotEpoch uint64
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
numSnapshotsToKeep int
closeCh chan struct{}
introductions chan *segmentIntroduction
persists chan *persistIntroduction
merges chan *segmentMerge
introducerNotifier chan *epochWatcher
revertToSnapshots chan *snapshotReversion
persisterNotifier chan *epochWatcher
rootBolt *bolt.DB
asyncTasks sync.WaitGroup
onEvent func(event Event)
onAsyncError func(err error)
iStats internalStats
}
type internalStats struct {
persistEpoch uint64
persistSnapshotSize uint64
mergeEpoch uint64
mergeSnapshotSize uint64
newSegBufBytesAdded uint64
newSegBufBytesRemoved uint64
analysisBytesAdded uint64
analysisBytesRemoved uint64
}
func NewScorch(storeName string,
config map[string]interface{},
analysisQueue *index.AnalysisQueue) (index.Index, error) {
rv := &Scorch{
version: Version,
config: config,
analysisQueue: analysisQueue,
nextSnapshotEpoch: 1,
closeCh: make(chan struct{}),
ineligibleForRemoval: map[string]bool{},
}
rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"}
ro, ok := config["read_only"].(bool)
if ok {
rv.readOnly = ro
}
ub, ok := config["unsafe_batch"].(bool)
if ok {
rv.unsafeBatch = ub
}
ecbName, ok := config["eventCallbackName"].(string)
if ok {
rv.onEvent = RegistryEventCallbacks[ecbName]
}
aecbName, ok := config["asyncErrorCallbackName"].(string)
if ok {
rv.onAsyncError = RegistryAsyncErrorCallbacks[aecbName]
}
return rv, nil
}
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
if s.onEvent != nil {
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
}
}
func (s *Scorch) fireAsyncError(err error) {
if s.onAsyncError != nil {
s.onAsyncError(err)
}
atomic.AddUint64(&s.stats.TotOnErrors, 1)
}
func (s *Scorch) Open() error {
err := s.openBolt()
if err != nil {
return err
}
s.asyncTasks.Add(1)
go s.mainLoop()
if !s.readOnly && s.path != "" {
s.asyncTasks.Add(1)
go s.persisterLoop()
s.asyncTasks.Add(1)
go s.mergerLoop()
}
return nil
}
func (s *Scorch) openBolt() error {
var ok bool
s.path, ok = s.config["path"].(string)
if !ok {
return fmt.Errorf("must specify path")
}
if s.path == "" {
s.unsafeBatch = true
}
var rootBoltOpt *bolt.Options
if s.readOnly {
rootBoltOpt = &bolt.Options{
ReadOnly: true,
}
} else {
if s.path != "" {
err := os.MkdirAll(s.path, 0700)
if err != nil {
return err
}
}
}
rootBoltPath := s.path + string(os.PathSeparator) + "root.bolt"
var err error
if s.path != "" {
s.rootBolt, err = bolt.Open(rootBoltPath, 0600, rootBoltOpt)
if err != nil {
return err
}
// now see if there is any existing state to load
err = s.loadFromBolt()
if err != nil {
_ = s.Close()
return err
}
}
s.introductions = make(chan *segmentIntroduction)
s.persists = make(chan *persistIntroduction)
s.merges = make(chan *segmentMerge)
s.introducerNotifier = make(chan *epochWatcher, 1)
s.revertToSnapshots = make(chan *snapshotReversion)
s.persisterNotifier = make(chan *epochWatcher, 1)
if !s.readOnly && s.path != "" {
err := s.removeOldZapFiles() // Before persister or merger create any new files.
if err != nil {
_ = s.Close()
return err
}
}
s.numSnapshotsToKeep = NumSnapshotsToKeep
if v, ok := s.config["numSnapshotsToKeep"]; ok {
var t int
if t, err = parseToInteger(v); err != nil {
return fmt.Errorf("numSnapshotsToKeep parse err: %v", err)
}
if t > 0 {
s.numSnapshotsToKeep = t
}
}
return nil
}
func (s *Scorch) Close() (err error) {
startTime := time.Now()
defer func() {
s.fireEvent(EventKindClose, time.Since(startTime))
}()
s.fireEvent(EventKindCloseStart, 0)
// signal to async tasks we want to close
close(s.closeCh)
// wait for them to close
s.asyncTasks.Wait()
// now close the root bolt
if s.rootBolt != nil {
err = s.rootBolt.Close()
s.rootLock.Lock()
if s.root != nil {
_ = s.root.DecRef()
}
s.root = nil
s.rootLock.Unlock()
}
return
}
func (s *Scorch) Update(doc *document.Document) error {
b := index.NewBatch()
b.Update(doc)
return s.Batch(b)
}
func (s *Scorch) Delete(id string) error {
b := index.NewBatch()
b.Delete(id)
return s.Batch(b)
}
// Batch applices a batch of changes to the index atomically
func (s *Scorch) Batch(batch *index.Batch) (err error) {
start := time.Now()
defer func() {
s.fireEvent(EventKindBatchIntroduction, time.Since(start))
}()
resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps))
var numUpdates uint64
var numDeletes uint64
var numPlainTextBytes uint64
var ids []string
for docID, doc := range batch.IndexOps {
if doc != nil {
// insert _id field
doc.AddField(document.NewTextFieldCustom("_id", nil, []byte(doc.ID), document.IndexField|document.StoreField, nil))
numUpdates++
numPlainTextBytes += doc.NumPlainTextBytes()
} else {
numDeletes++
}
ids = append(ids, docID)
}
// FIXME could sort ids list concurrent with analysis?
go func() {
for _, doc := range batch.IndexOps {
if doc != nil {
aw := index.NewAnalysisWork(s, doc, resultChan)
// put the work on the queue
s.analysisQueue.Queue(aw)
}
}
}()
// wait for analysis result
analysisResults := make([]*index.AnalysisResult, int(numUpdates))
var itemsDeQueued uint64
var totalAnalysisSize int
for itemsDeQueued < numUpdates {
result := <-resultChan
resultSize := result.Size()
atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize))
totalAnalysisSize += resultSize
analysisResults[itemsDeQueued] = result
itemsDeQueued++
}
close(resultChan)
defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize))
atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start)))
indexStart := time.Now()
// notify handlers that we're about to introduce a segment
s.fireEvent(EventKindBatchIntroductionStart, 0)
var newSegment segment.Segment
var bufBytes uint64
if len(analysisResults) > 0 {
newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor)
if err != nil {
return err
}
atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes)
} else {
atomic.AddUint64(&s.stats.TotBatchesEmpty, 1)
}
err = s.prepareSegment(newSegment, ids, batch.InternalOps)
if err != nil {
if newSegment != nil {
_ = newSegment.Close()
}
atomic.AddUint64(&s.stats.TotOnErrors, 1)
} else {
atomic.AddUint64(&s.stats.TotUpdates, numUpdates)
atomic.AddUint64(&s.stats.TotDeletes, numDeletes)
atomic.AddUint64(&s.stats.TotBatches, 1)
atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes)
}
atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes)
atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart)))
return err
}
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
internalOps map[string][]byte) error {
// new introduction
introduction := &segmentIntroduction{
id: atomic.AddUint64(&s.nextSegmentID, 1),
data: newSegment,
ids: ids,
obsoletes: make(map[uint64]*roaring.Bitmap),
internal: internalOps,
applied: make(chan error),
}
if !s.unsafeBatch {
introduction.persisted = make(chan error, 1)
}
// optimistically prepare obsoletes outside of rootLock
s.rootLock.RLock()
root := s.root
root.AddRef()
s.rootLock.RUnlock()
for _, seg := range root.segment {
delta, err := seg.segment.DocNumbers(ids)
if err != nil {
return err
}
introduction.obsoletes[seg.id] = delta
}
_ = root.DecRef()
introStartTime := time.Now()
s.introductions <- introduction
// block until this segment is applied
err := <-introduction.applied
if err != nil {
return err
}
if introduction.persisted != nil {
err = <-introduction.persisted
}
introTime := uint64(time.Since(introStartTime))
atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime)
if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime {
atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime)
}
return err
}
func (s *Scorch) SetInternal(key, val []byte) error {
b := index.NewBatch()
b.SetInternal(key, val)
return s.Batch(b)
}
func (s *Scorch) DeleteInternal(key []byte) error {
b := index.NewBatch()
b.DeleteInternal(key)
return s.Batch(b)
}
// Reader returns a low-level accessor on the index data. Close it to
// release associated resources.
func (s *Scorch) Reader() (index.IndexReader, error) {
return s.currentSnapshot(), nil
}
func (s *Scorch) currentSnapshot() *IndexSnapshot {
s.rootLock.RLock()
rv := s.root
rv.AddRef()
s.rootLock.RUnlock()
return rv
}
func (s *Scorch) Stats() json.Marshaler {
return &s.stats
}
func (s *Scorch) StatsMap() map[string]interface{} {
m := s.stats.ToMap()
if s.path != "" {
finfos, err := ioutil.ReadDir(s.path)
if err == nil {
var numFilesOnDisk, numBytesUsedDisk uint64
for _, finfo := range finfos {
if !finfo.IsDir() {
numBytesUsedDisk += uint64(finfo.Size())
numFilesOnDisk++
}
}
m["CurOnDiskBytes"] = numBytesUsedDisk
m["CurOnDiskFiles"] = numFilesOnDisk
}
}
// TODO: consider one day removing these backwards compatible
// names for apps using the old names
m["updates"] = m["TotUpdates"]
m["deletes"] = m["TotDeletes"]
m["batches"] = m["TotBatches"]
m["errors"] = m["TotOnErrors"]
m["analysis_time"] = m["TotAnalysisTime"]
m["index_time"] = m["TotIndexTime"]
m["term_searchers_started"] = m["TotTermSearchersStarted"]
m["term_searchers_finished"] = m["TotTermSearchersFinished"]
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"]
m["num_items_introduced"] = m["TotIntroducedItems"]
m["num_items_persisted"] = m["TotPersistedItems"]
m["num_bytes_used_disk"] = m["CurOnDiskBytes"]
m["num_files_on_disk"] = m["CurOnDiskFiles"]
m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"]
return m
}
func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult {
rv := &index.AnalysisResult{
Document: d,
Analyzed: make([]analysis.TokenFrequencies, len(d.Fields)+len(d.CompositeFields)),
Length: make([]int, len(d.Fields)+len(d.CompositeFields)),
}
for i, field := range d.Fields {
if field.Options().IsIndexed() {
fieldLength, tokenFreqs := field.Analyze()
rv.Analyzed[i] = tokenFreqs
rv.Length[i] = fieldLength
if len(d.CompositeFields) > 0 {
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
}
}
}
}
return rv
}
func (s *Scorch) Advanced() (store.KVStore, error) {
return nil, nil
}
func (s *Scorch) AddEligibleForRemoval(epoch uint64) {
s.rootLock.Lock()
if s.root == nil || s.root.epoch != epoch {
s.eligibleForRemoval = append(s.eligibleForRemoval, epoch)
}
s.rootLock.Unlock()
}
func (s *Scorch) MemoryUsed() uint64 {
indexSnapshot := s.currentSnapshot()
defer func() {
_ = indexSnapshot.Close()
}()
// Account for current root snapshot overhead
memUsed := uint64(indexSnapshot.Size())
// Account for snapshot that the persister may be working on
persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch)
persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize)
if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch {
// the snapshot that the persister is working on isn't the same as
// the current snapshot
memUsed += persistSnapshotSize
}
// Account for snapshot that the merger may be working on
mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch)
mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize)
if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch {
// the snapshot that the merger is working on isn't the same as
// the current snapshot
memUsed += mergeSnapshotSize
}
memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) -
atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved))
memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) -
atomic.LoadUint64(&s.iStats.analysisBytesRemoved))
return memUsed
}
func (s *Scorch) markIneligibleForRemoval(filename string) {
s.rootLock.Lock()
s.ineligibleForRemoval[filename] = true
s.rootLock.Unlock()
}
func (s *Scorch) unmarkIneligibleForRemoval(filename string) {
s.rootLock.Lock()
delete(s.ineligibleForRemoval, filename)
s.rootLock.Unlock()
}
func init() {
registry.RegisterIndexType(Name, NewScorch)
}
func parseToInteger(i interface{}) (int, error) {
switch v := i.(type) {
case float64:
return int(v), nil
case int:
return v, nil
default:
return 0, fmt.Errorf("expects int or float64 value")
}
}

View file

@ -0,0 +1,130 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package segment
import (
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
)
type EmptySegment struct{}
func (e *EmptySegment) Dictionary(field string) (TermDictionary, error) {
return &EmptyDictionary{}, nil
}
func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error {
return nil
}
func (e *EmptySegment) DocID(num uint64) ([]byte, error) {
return nil, nil
}
func (e *EmptySegment) Count() uint64 {
return 0
}
func (e *EmptySegment) DocNumbers([]string) (*roaring.Bitmap, error) {
r := roaring.NewBitmap()
return r, nil
}
func (e *EmptySegment) Fields() []string {
return []string{}
}
func (e *EmptySegment) Close() error {
return nil
}
func (e *EmptySegment) Size() uint64 {
return 0
}
func (e *EmptySegment) AddRef() {
}
func (e *EmptySegment) DecRef() error {
return nil
}
type EmptyDictionary struct{}
func (e *EmptyDictionary) PostingsList(term []byte,
except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) {
return &EmptyPostingsList{}, nil
}
func (e *EmptyDictionary) Iterator() DictionaryIterator {
return &EmptyDictionaryIterator{}
}
func (e *EmptyDictionary) PrefixIterator(prefix string) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
func (e *EmptyDictionary) RegexpIterator(start string) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
func (e *EmptyDictionary) FuzzyIterator(term string,
fuzziness int) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte,
includeCount bool) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
type EmptyDictionaryIterator struct{}
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
return nil, nil
}
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) {
return nil, nil
}
type EmptyPostingsList struct{}
func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool,
prealloc PostingsIterator) PostingsIterator {
return &EmptyPostingsIterator{}
}
func (e *EmptyPostingsList) Size() int {
return 0
}
func (e *EmptyPostingsList) Count() uint64 {
return 0
}
type EmptyPostingsIterator struct{}
func (e *EmptyPostingsIterator) Next() (Posting, error) {
return nil, nil
}
func (e *EmptyPostingsIterator) Size() int {
return 0
}

View file

@ -0,0 +1,94 @@
// Copyright 2014 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
// This code originated from:
// https://github.com/cockroachdb/cockroach/blob/2dd65dde5d90c157f4b93f92502ca1063b904e1d/pkg/util/encoding/encoding.go
// Modified to not use pkg/errors
package segment
import "fmt"
const (
MaxVarintSize = 9
// IntMin is chosen such that the range of int tags does not overlap the
// ascii character set that is frequently used in testing.
IntMin = 0x80 // 128
intMaxWidth = 8
intZero = IntMin + intMaxWidth // 136
intSmall = IntMax - intZero - intMaxWidth // 109
// IntMax is the maximum int tag value.
IntMax = 0xfd // 253
)
// EncodeUvarintAscending encodes the uint64 value using a variable length
// (length-prefixed) representation. The length is encoded as a single
// byte indicating the number of encoded bytes (-8) to follow. See
// EncodeVarintAscending for rationale. The encoded bytes are appended to the
// supplied buffer and the final buffer is returned.
func EncodeUvarintAscending(b []byte, v uint64) []byte {
switch {
case v <= intSmall:
return append(b, intZero+byte(v))
case v <= 0xff:
return append(b, IntMax-7, byte(v))
case v <= 0xffff:
return append(b, IntMax-6, byte(v>>8), byte(v))
case v <= 0xffffff:
return append(b, IntMax-5, byte(v>>16), byte(v>>8), byte(v))
case v <= 0xffffffff:
return append(b, IntMax-4, byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
case v <= 0xffffffffff:
return append(b, IntMax-3, byte(v>>32), byte(v>>24), byte(v>>16), byte(v>>8),
byte(v))
case v <= 0xffffffffffff:
return append(b, IntMax-2, byte(v>>40), byte(v>>32), byte(v>>24), byte(v>>16),
byte(v>>8), byte(v))
case v <= 0xffffffffffffff:
return append(b, IntMax-1, byte(v>>48), byte(v>>40), byte(v>>32), byte(v>>24),
byte(v>>16), byte(v>>8), byte(v))
default:
return append(b, IntMax, byte(v>>56), byte(v>>48), byte(v>>40), byte(v>>32),
byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
}
}
// DecodeUvarintAscending decodes a varint encoded uint64 from the input
// buffer. The remainder of the input buffer and the decoded uint64
// are returned.
func DecodeUvarintAscending(b []byte) ([]byte, uint64, error) {
if len(b) == 0 {
return nil, 0, fmt.Errorf("insufficient bytes to decode uvarint value")
}
length := int(b[0]) - intZero
b = b[1:] // skip length byte
if length <= intSmall {
return b, uint64(length), nil
}
length -= intSmall
if length < 0 || length > 8 {
return nil, 0, fmt.Errorf("invalid uvarint length of %d", length)
} else if len(b) < length {
return nil, 0, fmt.Errorf("insufficient bytes to decode uvarint value: %q", b)
}
var v uint64
// It is faster to range over the elements in a slice than to index
// into the slice on each loop iteration.
for _, t := range b[:length] {
v = (v << 8) | uint64(t)
}
return b[length:], v, nil
}

View file

@ -0,0 +1,126 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package segment
import (
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
)
// DocumentFieldValueVisitor defines a callback to be visited for each
// stored field value. The return value determines if the visitor
// should keep going. Returning true continues visiting, false stops.
type DocumentFieldValueVisitor func(field string, typ byte, value []byte, pos []uint64) bool
type Segment interface {
Dictionary(field string) (TermDictionary, error)
VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error
DocID(num uint64) ([]byte, error)
Count() uint64
DocNumbers([]string) (*roaring.Bitmap, error)
Fields() []string
Close() error
Size() int
AddRef()
DecRef() error
}
type TermDictionary interface {
PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error)
Iterator() DictionaryIterator
PrefixIterator(prefix string) DictionaryIterator
RangeIterator(start, end string) DictionaryIterator
RegexpIterator(regex string) DictionaryIterator
FuzzyIterator(term string, fuzziness int) DictionaryIterator
OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator
}
type DictionaryIterator interface {
Next() (*index.DictEntry, error)
}
type PostingsList interface {
Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator
Size() int
Count() uint64
// NOTE deferred for future work
// And(other PostingsList) PostingsList
// Or(other PostingsList) PostingsList
}
type PostingsIterator interface {
// The caller is responsible for copying whatever it needs from
// the returned Posting instance before calling Next(), as some
// implementations may return a shared instance to reduce memory
// allocations.
Next() (Posting, error)
// Advance will return the posting with the specified doc number
// or if there is no such posting, the next posting.
// Callers MUST NOT attempt to pass a docNum that is less than or
// equal to the currently visited posting doc Num.
Advance(docNum uint64) (Posting, error)
Size() int
}
type Posting interface {
Number() uint64
Frequency() uint64
Norm() float64
Locations() []Location
Size() int
}
type Location interface {
Field() string
Start() uint64
End() uint64
Pos() uint64
ArrayPositions() []uint64
Size() int
}
// DocumentFieldTermVisitable is implemented by various scorch segment
// implementations with persistence for the un inverting of the
// postings or other indexed values.
type DocumentFieldTermVisitable interface {
VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error)
// VisitableDocValueFields implementation should return
// the list of fields which are document value persisted and
// therefore visitable by the above VisitDocumentFieldTerms method.
VisitableDocValueFields() ([]string, error)
}
type DocVisitState interface {
}

View file

@ -0,0 +1,167 @@
# zap file format
The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written.
Current usage:
- mmap the entire file
- crc-32 bytes and version are in fixed position at end of the file
- reading remainder of footer could be version specific
- remainder of footer gives us:
- 3 important offsets (docValue , fields index and stored data index)
- 2 important values (number of docs and chunk factor)
- field data is processed once and memoized onto the heap so that we never have to go back to disk for it
- access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. the first bytes of that section tell us the size of data so that we know where it ends.
- access to all other indexed data follows the following pattern:
- first know the field name -> convert to id
- next navigate to term dictionary for that field
- some operations stop here and do dictionary ops
- next use dictionary to navigate to posting list for a specific term
- walk posting list
- if necessary, walk posting details as we go
- if location info is desired, consult location bitmap to see if it is there
## stored fields section
- for each document
- preparation phase:
- produce a slice of metadata bytes and data bytes
- produce these slices in field id order
- field value is appended to the data slice
- metadata slice is varint encoded with the following values for each field value
- field id (uint16)
- field type (byte)
- field value start offset in uncompressed data slice (uint64)
- field value length (uint64)
- field number of array positions (uint64)
- one additional value for each array position (uint64)
- compress the data slice using snappy
- file writing phase:
- remember the start offset for this document
- write out meta data length (varint uint64)
- write out compressed data length (varint uint64)
- write out the metadata bytes
- write out the compressed data bytes
## stored fields idx
- for each document
- write start offset (remembered from previous section) of stored data (big endian uint64)
With this index and a known document number, we have direct access to all the stored field data.
## posting details (freq/norm) section
- for each posting list
- produce a slice containing multiple consecutive chunks (each chunk is varint stream)
- produce a slice remembering offsets of where each chunk starts
- preparation phase:
- for each hit in the posting list
- if this hit is in next chunk close out encoding of last chunk and record offset start of next
- encode term frequency (uint64)
- encode norm factor (float32)
- file writing phase:
- remember start position for this posting list details
- write out number of chunks that follow (varint uint64)
- write out length of each chunk (each a varint uint64)
- write out the byte slice containing all the chunk data
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
## posting details (location) section
- for each posting list
- produce a slice containing multiple consecutive chunks (each chunk is varint stream)
- produce a slice remembering offsets of where each chunk starts
- preparation phase:
- for each hit in the posting list
- if this hit is in next chunk close out encoding of last chunk and record offset start of next
- encode field (uint16)
- encode field pos (uint64)
- encode field start (uint64)
- encode field end (uint64)
- encode number of array positions to follow (uint64)
- encode each array position (each uint64)
- file writing phase:
- remember start position for this posting list details
- write out number of chunks that follow (varint uint64)
- write out length of each chunk (each a varint uint64)
- write out the byte slice containing all the chunk data
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
## bitmaps of hits with location info
- for each posting list
- preparation phase:
- encode roaring bitmap (inidicating which hits have location details indexed) posting list to bytes (so we know the length)
- file writing phase:
- remember the start position for this bitmap
- write length of encoded roaring bitmap
- write the serialized roaring bitmap data
## postings list section
- for each posting list
- preparation phase:
- encode roaring bitmap posting list to bytes (so we know the length)
- file writing phase:
- remember the start position for this posting list
- write freq/norm details offset (remembered from previous, as varint uint64)
- write location details offset (remembered from previous, as varint uint64)
- write location bitmap offset (remembered from pervious, as varint uint64)
- write length of encoded roaring bitmap
- write the serialized roaring bitmap data
## dictionary
- for each field
- preparation phase:
- encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous)
- file writing phase:
- remember the start position of this persistDictionary
- write length of vellum data (varint uint64)
- write out vellum data
## fields section
- for each field
- file writing phase:
- remember start offset for each field
- write dictionary address (remembered from previous) (varint uint64)
- write length of field name (varint uint64)
- write field name bytes
## fields idx
- for each field
- file writing phase:
- write big endian uint64 of start offset for each field
NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size.
## fields DocValue
- for each field
- preparation phase:
- produce a slice containing multiple consecutive chunks, where each chunk is composed of a meta section followed by compressed columnar field data
- produce a slice remembering the length of each chunk
- file writing phase:
- remember the start position of this first field DocValue offset in the footer
- write out number of chunks that follow (varint uint64)
- write out length of each chunk (each a varint uint64)
- write out the byte slice containing all the chunk data
NOTE: currently the meta header inside each chunk gives clue to the location offsets and size of the data pertaining to a given docID and any
read operation leverage that meta information to extract the document specific data from the file.
## footer
- file writing phase
- write number of docs (big endian uint64)
- write stored field index location (big endian uint64)
- write field index location (big endian uint64)
- write field docValue location (big endian uint64)
- write out chunk factor (big endian uint32)
- write out version (big endian uint32)
- write out file CRC of everything preceding this (big endian uint32)

View file

@ -0,0 +1,149 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bufio"
"math"
"os"
)
const Version uint32 = 11
const Type string = "zap"
const fieldNotUninverted = math.MaxUint64
// PersistSegmentBase persists SegmentBase in the zap file format.
func PersistSegmentBase(sb *SegmentBase, path string) error {
flag := os.O_RDWR | os.O_CREATE
f, err := os.OpenFile(path, flag, 0600)
if err != nil {
return err
}
cleanup := func() {
_ = f.Close()
_ = os.Remove(path)
}
br := bufio.NewWriter(f)
_, err = br.Write(sb.mem)
if err != nil {
cleanup()
return err
}
err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset,
sb.chunkFactor, sb.memCRC, br)
if err != nil {
cleanup()
return err
}
err = br.Flush()
if err != nil {
cleanup()
return err
}
err = f.Sync()
if err != nil {
cleanup()
return err
}
err = f.Close()
if err != nil {
cleanup()
return err
}
return nil
}
func persistStoredFieldValues(fieldID int,
storedFieldValues [][]byte, stf []byte, spf [][]uint64,
curr int, metaEncode varintEncoder, data []byte) (
int, []byte, error) {
for i := 0; i < len(storedFieldValues); i++ {
// encode field
_, err := metaEncode(uint64(fieldID))
if err != nil {
return 0, nil, err
}
// encode type
_, err = metaEncode(uint64(stf[i]))
if err != nil {
return 0, nil, err
}
// encode start offset
_, err = metaEncode(uint64(curr))
if err != nil {
return 0, nil, err
}
// end len
_, err = metaEncode(uint64(len(storedFieldValues[i])))
if err != nil {
return 0, nil, err
}
// encode number of array pos
_, err = metaEncode(uint64(len(spf[i])))
if err != nil {
return 0, nil, err
}
// encode all array positions
for _, pos := range spf[i] {
_, err = metaEncode(pos)
if err != nil {
return 0, nil, err
}
}
data = append(data, storedFieldValues[i]...)
curr += len(storedFieldValues[i])
}
return curr, data, nil
}
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
dictLocs []uint64) (*SegmentBase, error) {
sb := &SegmentBase{
mem: mem,
memCRC: memCRC,
chunkFactor: chunkFactor,
fieldsMap: fieldsMap,
fieldsInv: fieldsInv,
numDocs: numDocs,
storedIndexOffset: storedIndexOffset,
fieldsIndexOffset: fieldsIndexOffset,
docValueOffset: docValueOffset,
dictLocs: dictLocs,
fieldDvReaders: make(map[uint16]*docValueReader),
}
sb.updateSize()
err := sb.loadDvReaders()
if err != nil {
return nil, err
}
return sb, nil
}

View file

@ -0,0 +1,230 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"io"
"reflect"
"github.com/golang/snappy"
)
var reflectStaticSizeMetaData int
func init() {
var md MetaData
reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size())
}
var termSeparator byte = 0xff
var termSeparatorSplitSlice = []byte{termSeparator}
type chunkedContentCoder struct {
final []byte
chunkSize uint64
currChunk uint64
chunkLens []uint64
w io.Writer
progressiveWrite bool
chunkMetaBuf bytes.Buffer
chunkBuf bytes.Buffer
chunkMeta []MetaData
compressed []byte // temp buf for snappy compression
}
// MetaData represents the data information inside a
// chunk.
type MetaData struct {
DocNum uint64 // docNum of the data inside the chunk
DocDvOffset uint64 // offset of data inside the chunk for the given docid
}
// newChunkedContentCoder returns a new chunk content coder which
// packs data into chunks based on the provided chunkSize
func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64,
w io.Writer, progressiveWrite bool) *chunkedContentCoder {
total := maxDocNum/chunkSize + 1
rv := &chunkedContentCoder{
chunkSize: chunkSize,
chunkLens: make([]uint64, total),
chunkMeta: make([]MetaData, 0, total),
w: w,
progressiveWrite: progressiveWrite,
}
return rv
}
// Reset lets you reuse this chunked content coder. Buffers are reset
// and re used. You cannot change the chunk size.
func (c *chunkedContentCoder) Reset() {
c.currChunk = 0
c.final = c.final[:0]
c.chunkBuf.Reset()
c.chunkMetaBuf.Reset()
for i := range c.chunkLens {
c.chunkLens[i] = 0
}
c.chunkMeta = c.chunkMeta[:0]
}
// Close indicates you are done calling Add() this allows
// the final chunk to be encoded.
func (c *chunkedContentCoder) Close() error {
return c.flushContents()
}
func (c *chunkedContentCoder) flushContents() error {
// flush the contents, with meta information at first
buf := make([]byte, binary.MaxVarintLen64)
n := binary.PutUvarint(buf, uint64(len(c.chunkMeta)))
_, err := c.chunkMetaBuf.Write(buf[:n])
if err != nil {
return err
}
// write out the metaData slice
for _, meta := range c.chunkMeta {
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
if err != nil {
return err
}
}
// write the metadata to final data
metaData := c.chunkMetaBuf.Bytes()
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
// write the compressed data to the final data
c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
c.final = append(c.final, c.compressed...)
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))
if c.progressiveWrite {
_, err := c.w.Write(c.final)
if err != nil {
return err
}
c.final = c.final[:0]
}
return nil
}
// Add encodes the provided byte slice into the correct chunk for the provided
// doc num. You MUST call Add() with increasing docNums.
func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
chunk := docNum / c.chunkSize
if chunk != c.currChunk {
// flush out the previous chunk details
err := c.flushContents()
if err != nil {
return err
}
// clearing the chunk specific meta for next chunk
c.chunkBuf.Reset()
c.chunkMetaBuf.Reset()
c.chunkMeta = c.chunkMeta[:0]
c.currChunk = chunk
}
// get the starting offset for this doc
dvOffset := c.chunkBuf.Len()
dvSize, err := c.chunkBuf.Write(vals)
if err != nil {
return err
}
c.chunkMeta = append(c.chunkMeta, MetaData{
DocNum: docNum,
DocDvOffset: uint64(dvOffset + dvSize),
})
return nil
}
// Write commits all the encoded chunked contents to the provided writer.
//
// | ..... data ..... | chunk offsets (varints)
// | position of chunk offsets (uint64) | number of offsets (uint64) |
//
func (c *chunkedContentCoder) Write() (int, error) {
var tw int
if c.final != nil {
// write out the data section first
nw, err := c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
}
chunkOffsetsStart := uint64(tw)
if cap(c.final) < binary.MaxVarintLen64 {
c.final = make([]byte, binary.MaxVarintLen64)
} else {
c.final = c.final[0:binary.MaxVarintLen64]
}
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
// write out the chunk offsets
for _, chunkOffset := range chunkOffsets {
n := binary.PutUvarint(c.final, chunkOffset)
nw, err := c.w.Write(c.final[:n])
tw += nw
if err != nil {
return tw, err
}
}
chunkOffsetsLen := uint64(tw) - chunkOffsetsStart
c.final = c.final[0:8]
// write out the length of chunk offsets
binary.BigEndian.PutUint64(c.final, chunkOffsetsLen)
nw, err := c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
// write out the number of chunks
binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens)))
nw, err = c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
c.final = c.final[:0]
return tw, nil
}
// ReadDocValueBoundary elicits the start, end offsets from a
// metaData header slice
func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) {
var start uint64
if chunk > 0 {
start = metaHeaders[chunk-1].DocDvOffset
}
return start, metaHeaders[chunk].DocDvOffset
}

View file

@ -0,0 +1,51 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"hash/crc32"
"io"
)
// CountHashWriter is a wrapper around a Writer which counts the number of
// bytes which have been written and computes a crc32 hash
type CountHashWriter struct {
w io.Writer
crc uint32
n int
}
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
func NewCountHashWriter(w io.Writer) *CountHashWriter {
return &CountHashWriter{w: w}
}
// Write writes the provided bytes to the wrapped writer and counts the bytes
func (c *CountHashWriter) Write(b []byte) (int, error) {
n, err := c.w.Write(b)
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n])
c.n += n
return n, err
}
// Count returns the number of bytes written
func (c *CountHashWriter) Count() int {
return c.n
}
// Sum32 returns the CRC-32 hash of the content written to this writer
func (c *CountHashWriter) Sum32() uint32 {
return c.crc
}

View file

@ -0,0 +1,289 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"fmt"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum"
"github.com/couchbase/vellum/levenshtein"
"github.com/couchbase/vellum/regexp"
)
// Dictionary is the zap representation of the term dictionary
type Dictionary struct {
sb *SegmentBase
field string
fieldID uint16
fst *vellum.FST
fstReader *vellum.Reader
}
// PostingsList returns the postings list for the specified term
func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap,
prealloc segment.PostingsList) (segment.PostingsList, error) {
var preallocPL *PostingsList
pl, ok := prealloc.(*PostingsList)
if ok && pl != nil {
preallocPL = pl
}
return d.postingsList(term, except, preallocPL)
}
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
if d.fstReader == nil {
if rv == nil || rv == emptyPostingsList {
return emptyPostingsList, nil
}
return d.postingsListInit(rv, except), nil
}
postingsOffset, exists, err := d.fstReader.Get(term)
if err != nil {
return nil, fmt.Errorf("vellum err: %v", err)
}
if !exists {
if rv == nil || rv == emptyPostingsList {
return emptyPostingsList, nil
}
return d.postingsListInit(rv, except), nil
}
return d.postingsListFromOffset(postingsOffset, except, rv)
}
func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
rv = d.postingsListInit(rv, except)
err := rv.read(postingsOffset, d)
if err != nil {
return nil, err
}
return rv, nil
}
func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
if rv == nil || rv == emptyPostingsList {
rv = &PostingsList{}
} else {
postings := rv.postings
if postings != nil {
postings.Clear()
}
*rv = PostingsList{} // clear the struct
rv.postings = postings
}
rv.sb = d.sb
rv.except = except
return rv
}
// Iterator returns an iterator for this dictionary
func (d *Dictionary) Iterator() segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
itr, err := d.fst.Iterator(nil, nil)
if err == nil {
rv.itr = itr
} else if err != nil && err != vellum.ErrIteratorDone {
rv.err = err
}
}
return rv
}
// PrefixIterator returns an iterator which only visits terms having the
// the specified prefix
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
r, err := regexp.New(prefix + ".*")
if err == nil {
itr, err := d.fst.Search(r, nil, nil)
if err == nil {
rv.itr = itr
} else if err != nil && err != vellum.ErrIteratorDone {
rv.err = err
}
} else {
rv.err = err
}
}
return rv
}
// RangeIterator returns an iterator which only visits terms between the
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
// need to increment the end position to be inclusive
endBytes := []byte(end)
if endBytes[len(endBytes)-1] < 0xff {
endBytes[len(endBytes)-1]++
} else {
endBytes = append(endBytes, 0xff)
}
if d.fst != nil {
itr, err := d.fst.Iterator([]byte(start), endBytes)
if err == nil {
rv.itr = itr
} else if err != nil && err != vellum.ErrIteratorDone {
rv.err = err
}
}
return rv
}
// RegexpIterator returns an iterator which only visits terms having the
// the specified regex
func (d *Dictionary) RegexpIterator(regex string) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
r, err := regexp.New(regex)
if err == nil {
itr, err2 := d.fst.Search(r, nil, nil)
if err2 == nil {
rv.itr = itr
} else if err2 != nil && err2 != vellum.ErrIteratorDone {
rv.err = err2
}
} else {
rv.err = err
}
}
return rv
}
// FuzzyIterator returns an iterator which only visits terms having the
// the specified edit/levenshtein distance
func (d *Dictionary) FuzzyIterator(term string,
fuzziness int) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
la, err := levenshtein.New(term, fuzziness)
if err == nil {
itr, err2 := d.fst.Search(la, nil, nil)
if err2 == nil {
rv.itr = itr
} else if err2 != nil && err2 != vellum.ErrIteratorDone {
rv.err = err2
}
} else {
rv.err = err
}
}
return rv
}
func (d *Dictionary) OnlyIterator(onlyTerms [][]byte,
includeCount bool) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
omitCount: !includeCount,
}
var buf bytes.Buffer
builder, err := vellum.New(&buf, nil)
if err != nil {
rv.err = err
return rv
}
for _, term := range onlyTerms {
err = builder.Insert(term, 0)
if err != nil {
rv.err = err
return rv
}
}
err = builder.Close()
if err != nil {
rv.err = err
return rv
}
onlyFST, err := vellum.Load(buf.Bytes())
if err != nil {
rv.err = err
return rv
}
itr, err := d.fst.Search(onlyFST, nil, nil)
if err == nil {
rv.itr = itr
} else if err != nil && err != vellum.ErrIteratorDone {
rv.err = err
}
return rv
}
// DictionaryIterator is an iterator for term dictionary
type DictionaryIterator struct {
d *Dictionary
itr vellum.Iterator
err error
tmp PostingsList
entry index.DictEntry
omitCount bool
}
// Next returns the next entry in the dictionary
func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
if i.err != nil && i.err != vellum.ErrIteratorDone {
return nil, i.err
} else if i.itr == nil || i.err == vellum.ErrIteratorDone {
return nil, nil
}
term, postingsOffset := i.itr.Current()
i.entry.Term = string(term)
if !i.omitCount {
i.err = i.tmp.read(postingsOffset, i.d)
if i.err != nil {
return nil, i.err
}
i.entry.Count = i.tmp.Count()
}
i.err = i.itr.Next()
return &i.entry, nil
}

View file

@ -0,0 +1,312 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"fmt"
"math"
"reflect"
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
"github.com/golang/snappy"
)
var reflectStaticSizedocValueReader int
func init() {
var dvi docValueReader
reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size())
}
type docNumTermsVisitor func(docNum uint64, terms []byte) error
type docVisitState struct {
dvrs map[uint16]*docValueReader
segment *Segment
}
type docValueReader struct {
field string
curChunkNum uint64
chunkOffsets []uint64
dvDataLoc uint64
curChunkHeader []MetaData
curChunkData []byte // compressed data cache
uncompressed []byte // temp buf for snappy decompression
}
func (di *docValueReader) size() int {
return reflectStaticSizedocValueReader + size.SizeOfPtr +
len(di.field) +
len(di.chunkOffsets)*size.SizeOfUint64 +
len(di.curChunkHeader)*reflectStaticSizeMetaData +
len(di.curChunkData)
}
func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader {
if rv == nil {
rv = &docValueReader{}
}
rv.field = di.field
rv.curChunkNum = math.MaxUint64
rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable
rv.dvDataLoc = di.dvDataLoc
rv.curChunkHeader = nil
rv.curChunkData = nil
rv.uncompressed = nil
return rv
}
func (di *docValueReader) fieldName() string {
return di.field
}
func (di *docValueReader) curChunkNumber() uint64 {
return di.curChunkNum
}
func (s *SegmentBase) loadFieldDocValueReader(field string,
fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
// get the docValue offset for the given fields
if fieldDvLocStart == fieldNotUninverted {
return nil, fmt.Errorf("loadFieldDocValueReader: "+
"no docValues found for field: %s", field)
}
// read the number of chunks, and chunk offsets position
var numChunks, chunkOffsetsPosition uint64
if fieldDvLocEnd-fieldDvLocStart > 16 {
numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd])
// read the length of chunk offsets
chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
// acquire position of chunk offsets
chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
}
fdvIter := &docValueReader{
curChunkNum: math.MaxUint64,
field: field,
chunkOffsets: make([]uint64, int(numChunks)),
}
// read the chunk offsets
var offset uint64
for i := 0; i < int(numChunks); i++ {
loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64])
if read <= 0 {
return nil, fmt.Errorf("corrupted chunk offset during segment load")
}
fdvIter.chunkOffsets[i] = loc
offset += uint64(read)
}
// set the data offset
fdvIter.dvDataLoc = fieldDvLocStart
return fdvIter, nil
}
func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
// advance to the chunk where the docValues
// reside for the given docNum
destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
if start >= end {
di.curChunkHeader = di.curChunkHeader[:0]
di.curChunkData = nil
di.curChunkNum = chunkNumber
di.uncompressed = di.uncompressed[:0]
return nil
}
destChunkDataLoc += start
curChunkEnd += end
// read the number of docs reside in the chunk
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
if read <= 0 {
return fmt.Errorf("failed to read the chunk")
}
chunkMetaLoc := destChunkDataLoc + uint64(read)
offset := uint64(0)
di.curChunkHeader = make([]MetaData, int(numDocs))
for i := 0; i < int(numDocs); i++ {
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
}
compressedDataLoc := chunkMetaLoc + offset
dataLength := curChunkEnd - compressedDataLoc
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
di.curChunkNum = chunkNumber
di.uncompressed = di.uncompressed[:0]
return nil
}
func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error {
for i := 0; i < len(di.chunkOffsets); i++ {
err := di.loadDvChunk(uint64(i), s)
if err != nil {
return err
}
if di.curChunkData == nil || len(di.curChunkHeader) <= 0 {
continue
}
// uncompress the already loaded data
uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
if err != nil {
return err
}
di.uncompressed = uncompressed
start := uint64(0)
for _, entry := range di.curChunkHeader {
err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset])
if err != nil {
return err
}
start = entry.DocDvOffset
}
}
return nil
}
func (di *docValueReader) visitDocValues(docNum uint64,
visitor index.DocumentFieldTermVisitor) error {
// binary search the term locations for the docNum
start, end := di.getDocValueLocs(docNum)
if start == math.MaxUint64 || end == math.MaxUint64 || start == end {
return nil
}
var uncompressed []byte
var err error
// use the uncompressed copy if available
if len(di.uncompressed) > 0 {
uncompressed = di.uncompressed
} else {
// uncompress the already loaded data
uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
if err != nil {
return err
}
di.uncompressed = uncompressed
}
// pick the terms for the given docNum
uncompressed = uncompressed[start:end]
for {
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
if i < 0 {
break
}
visitor(di.field, uncompressed[0:i])
uncompressed = uncompressed[i+1:]
}
return nil
}
func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
i := sort.Search(len(di.curChunkHeader), func(i int) bool {
return di.curChunkHeader[i].DocNum >= docNum
})
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
return ReadDocValueBoundary(i, di.curChunkHeader)
}
return math.MaxUint64, math.MaxUint64
}
// VisitDocumentFieldTerms is an implementation of the
// DocumentFieldTermVisitable interface
func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) (
segment.DocVisitState, error) {
dvs, ok := dvsIn.(*docVisitState)
if !ok || dvs == nil {
dvs = &docVisitState{}
} else {
if dvs.segment != s {
dvs.segment = s
dvs.dvrs = nil
}
}
var fieldIDPlus1 uint16
if dvs.dvrs == nil {
dvs.dvrs = make(map[uint16]*docValueReader, len(fields))
for _, field := range fields {
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
continue
}
fieldID := fieldIDPlus1 - 1
if dvIter, exists := s.fieldDvReaders[fieldID]; exists &&
dvIter != nil {
dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID])
}
}
}
// find the chunkNumber where the docValues are stored
docInChunk := localDocNum / uint64(s.chunkFactor)
var dvr *docValueReader
for _, field := range fields {
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
continue
}
fieldID := fieldIDPlus1 - 1
if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
// check if the chunk is already loaded
if docInChunk != dvr.curChunkNumber() {
err := dvr.loadDvChunk(docInChunk, &s.SegmentBase)
if err != nil {
return dvs, err
}
}
_ = dvr.visitDocValues(localDocNum, visitor)
}
}
return dvs, nil
}
// VisitableDocValueFields returns the list of fields with
// persisted doc value terms ready to be visitable using the
// VisitDocumentFieldTerms method.
func (s *Segment) VisitableDocValueFields() ([]string, error) {
rv := make([]string, 0, len(s.fieldDvReaders))
for fieldID, field := range s.fieldsInv {
if dvIter, ok := s.fieldDvReaders[uint16(fieldID)]; ok &&
dvIter != nil {
rv = append(rv, field)
}
}
return rv, nil
}

View file

@ -0,0 +1,124 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"github.com/couchbase/vellum"
)
// enumerator provides an ordered traversal of multiple vellum
// iterators. Like JOIN of iterators, the enumerator produces a
// sequence of (key, iteratorIndex, value) tuples, sorted by key ASC,
// then iteratorIndex ASC, where the same key might be seen or
// repeated across multiple child iterators.
type enumerator struct {
itrs []vellum.Iterator
currKs [][]byte
currVs []uint64
lowK []byte
lowIdxs []int
lowCurr int
}
// newEnumerator returns a new enumerator over the vellum Iterators
func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) {
rv := &enumerator{
itrs: itrs,
currKs: make([][]byte, len(itrs)),
currVs: make([]uint64, len(itrs)),
lowIdxs: make([]int, 0, len(itrs)),
}
for i, itr := range rv.itrs {
rv.currKs[i], rv.currVs[i] = itr.Current()
}
rv.updateMatches()
if rv.lowK == nil {
return rv, vellum.ErrIteratorDone
}
return rv, nil
}
// updateMatches maintains the low key matches based on the currKs
func (m *enumerator) updateMatches() {
m.lowK = nil
m.lowIdxs = m.lowIdxs[:0]
m.lowCurr = 0
for i, key := range m.currKs {
if key == nil {
continue
}
cmp := bytes.Compare(key, m.lowK)
if cmp < 0 || m.lowK == nil {
// reached a new low
m.lowK = key
m.lowIdxs = m.lowIdxs[:0]
m.lowIdxs = append(m.lowIdxs, i)
} else if cmp == 0 {
m.lowIdxs = append(m.lowIdxs, i)
}
}
}
// Current returns the enumerator's current key, iterator-index, and
// value. If the enumerator is not pointing at a valid value (because
// Next returned an error previously), Current will return nil,0,0.
func (m *enumerator) Current() ([]byte, int, uint64) {
var i int
var v uint64
if m.lowCurr < len(m.lowIdxs) {
i = m.lowIdxs[m.lowCurr]
v = m.currVs[i]
}
return m.lowK, i, v
}
// Next advances the enumerator to the next key/iterator/value result,
// else vellum.ErrIteratorDone is returned.
func (m *enumerator) Next() error {
m.lowCurr += 1
if m.lowCurr >= len(m.lowIdxs) {
// move all the current low iterators forwards
for _, vi := range m.lowIdxs {
err := m.itrs[vi].Next()
if err != nil && err != vellum.ErrIteratorDone {
return err
}
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current()
}
m.updateMatches()
}
if m.lowK == nil {
return vellum.ErrIteratorDone
}
return nil
}
// Close all the underlying Iterators. The first error, if any, will
// be returned.
func (m *enumerator) Close() error {
var rv error
for _, itr := range m.itrs {
err := itr.Close()
if rv == nil {
rv = err
}
}
return rv
}

View file

@ -0,0 +1,172 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"io"
)
type chunkedIntCoder struct {
final []byte
chunkSize uint64
chunkBuf bytes.Buffer
chunkLens []uint64
currChunk uint64
buf []byte
}
// newChunkedIntCoder returns a new chunk int coder which packs data into
// chunks based on the provided chunkSize and supports up to the specified
// maxDocNum
func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder {
total := maxDocNum/chunkSize + 1
rv := &chunkedIntCoder{
chunkSize: chunkSize,
chunkLens: make([]uint64, total),
final: make([]byte, 0, 64),
}
return rv
}
// Reset lets you reuse this chunked int coder. buffers are reset and reused
// from previous use. you cannot change the chunk size or max doc num.
func (c *chunkedIntCoder) Reset() {
c.final = c.final[:0]
c.chunkBuf.Reset()
c.currChunk = 0
for i := range c.chunkLens {
c.chunkLens[i] = 0
}
}
// Add encodes the provided integers into the correct chunk for the provided
// doc num. You MUST call Add() with increasing docNums.
func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
chunk := docNum / c.chunkSize
if chunk != c.currChunk {
// starting a new chunk
c.Close()
c.chunkBuf.Reset()
c.currChunk = chunk
}
if len(c.buf) < binary.MaxVarintLen64 {
c.buf = make([]byte, binary.MaxVarintLen64)
}
for _, val := range vals {
wb := binary.PutUvarint(c.buf, val)
_, err := c.chunkBuf.Write(c.buf[:wb])
if err != nil {
return err
}
}
return nil
}
func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error {
chunk := docNum / c.chunkSize
if chunk != c.currChunk {
// starting a new chunk
c.Close()
c.chunkBuf.Reset()
c.currChunk = chunk
}
_, err := c.chunkBuf.Write(buf)
return err
}
// Close indicates you are done calling Add() this allows the final chunk
// to be encoded.
func (c *chunkedIntCoder) Close() {
encodingBytes := c.chunkBuf.Bytes()
c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
c.final = append(c.final, encodingBytes...)
c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close
}
// Write commits all the encoded chunked integers to the provided writer.
func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
bufNeeded := binary.MaxVarintLen64 * (1 + len(c.chunkLens))
if len(c.buf) < bufNeeded {
c.buf = make([]byte, bufNeeded)
}
buf := c.buf
// convert the chunk lengths into chunk offsets
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
// write out the number of chunks & each chunk offsets
n := binary.PutUvarint(buf, uint64(len(chunkOffsets)))
for _, chunkOffset := range chunkOffsets {
n += binary.PutUvarint(buf[n:], chunkOffset)
}
tw, err := w.Write(buf[:n])
if err != nil {
return tw, err
}
// write out the data
nw, err := w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
return tw, nil
}
func (c *chunkedIntCoder) FinalSize() int {
return len(c.final)
}
// modifyLengthsToEndOffsets converts the chunk length array
// to a chunk offset array. The readChunkBoundary
// will figure out the start and end of every chunk from
// these offsets. Starting offset of i'th index is stored
// in i-1'th position except for 0'th index and ending offset
// is stored at i'th index position.
// For 0'th element, starting position is always zero.
// eg:
// Lens -> 5 5 5 5 => 5 10 15 20
// Lens -> 0 5 0 5 => 0 5 5 10
// Lens -> 0 0 0 5 => 0 0 0 5
// Lens -> 5 0 0 0 => 5 5 5 5
// Lens -> 0 5 0 0 => 0 5 5 5
// Lens -> 0 0 5 0 => 0 0 5 5
func modifyLengthsToEndOffsets(lengths []uint64) []uint64 {
var runningOffset uint64
var index, i int
for i = 1; i <= len(lengths); i++ {
runningOffset += lengths[i-1]
lengths[index] = runningOffset
index++
}
return lengths
}
func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
var start uint64
if chunk > 0 {
start = offsets[chunk-1]
}
return start, offsets[chunk]
}

View file

@ -0,0 +1,801 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bufio"
"bytes"
"encoding/binary"
"fmt"
"math"
"os"
"sort"
"github.com/RoaringBitmap/roaring"
"github.com/couchbase/vellum"
"github.com/golang/snappy"
)
var DefaultFileMergerBufferSize = 1024 * 1024
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
// Merge takes a slice of zap segments and bit masks describing which
// documents may be dropped, and creates a new segment containing the
// remaining data. This new segment is built at the specified path,
// with the provided chunkFactor.
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
chunkFactor uint32) ([][]uint64, uint64, error) {
segmentBases := make([]*SegmentBase, len(segments))
for segmenti, segment := range segments {
segmentBases[segmenti] = &segment.SegmentBase
}
return MergeSegmentBases(segmentBases, drops, path, chunkFactor)
}
func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string,
chunkFactor uint32) ([][]uint64, uint64, error) {
flag := os.O_RDWR | os.O_CREATE
f, err := os.OpenFile(path, flag, 0600)
if err != nil {
return nil, 0, err
}
cleanup := func() {
_ = f.Close()
_ = os.Remove(path)
}
// buffer the output
br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize)
// wrap it for counting (tracking offsets)
cr := NewCountHashWriter(br)
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err :=
MergeToWriter(segmentBases, drops, chunkFactor, cr)
if err != nil {
cleanup()
return nil, 0, err
}
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset,
docValueOffset, chunkFactor, cr.Sum32(), cr)
if err != nil {
cleanup()
return nil, 0, err
}
err = br.Flush()
if err != nil {
cleanup()
return nil, 0, err
}
err = f.Sync()
if err != nil {
cleanup()
return nil, 0, err
}
err = f.Close()
if err != nil {
cleanup()
return nil, 0, err
}
return newDocNums, uint64(cr.Count()), nil
}
func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
chunkFactor uint32, cr *CountHashWriter) (
newDocNums [][]uint64,
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16,
err error) {
docValueOffset = uint64(fieldNotUninverted)
var fieldsSame bool
fieldsSame, fieldsInv = mergeFields(segments)
fieldsMap = mapFields(fieldsInv)
numDocs = computeNewDocCount(segments, drops)
if numDocs > 0 {
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
fieldsMap, fieldsInv, fieldsSame, numDocs, cr)
if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err
}
dictLocs, docValueOffset, err = persistMergedRest(segments, drops,
fieldsInv, fieldsMap, fieldsSame,
newDocNums, numDocs, chunkFactor, cr)
if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err
}
} else {
dictLocs = make([]uint64, len(fieldsInv))
}
fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs)
if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err
}
return newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, nil
}
// mapFields takes the fieldsInv list and returns a map of fieldName
// to fieldID+1
func mapFields(fields []string) map[string]uint16 {
rv := make(map[string]uint16, len(fields))
for i, fieldName := range fields {
rv[fieldName] = uint16(i) + 1
}
return rv
}
// computeNewDocCount determines how many documents will be in the newly
// merged segment when obsoleted docs are dropped
func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 {
var newDocCount uint64
for segI, segment := range segments {
newDocCount += segment.numDocs
if drops[segI] != nil {
newDocCount -= drops[segI].GetCardinality()
}
}
return newDocCount
}
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32,
w *CountHashWriter) ([]uint64, uint64, error) {
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
var bufLoc []uint64
var postings *PostingsList
var postItr *PostingsIterator
rv := make([]uint64, len(fieldsInv))
fieldDvLocsStart := make([]uint64, len(fieldsInv))
fieldDvLocsEnd := make([]uint64, len(fieldsInv))
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
var vellumBuf bytes.Buffer
newVellum, err := vellum.New(&vellumBuf, nil)
if err != nil {
return nil, 0, err
}
newRoaring := roaring.NewBitmap()
// for each field
for fieldID, fieldName := range fieldsInv {
// collect FST iterators from all active segments for this field
var newDocNums [][]uint64
var drops []*roaring.Bitmap
var dicts []*Dictionary
var itrs []vellum.Iterator
var segmentsInFocus []*SegmentBase
for segmentI, segment := range segments {
dict, err2 := segment.dictionary(fieldName)
if err2 != nil {
return nil, 0, err2
}
if dict != nil && dict.fst != nil {
itr, err2 := dict.fst.Iterator(nil, nil)
if err2 != nil && err2 != vellum.ErrIteratorDone {
return nil, 0, err2
}
if itr != nil {
newDocNums = append(newDocNums, newDocNumsIn[segmentI])
if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() {
drops = append(drops, dropsIn[segmentI])
} else {
drops = append(drops, nil)
}
dicts = append(dicts, dict)
itrs = append(itrs, itr)
segmentsInFocus = append(segmentsInFocus, segment)
}
}
}
var prevTerm []byte
newRoaring.Clear()
var lastDocNum, lastFreq, lastNorm uint64
// determines whether to use "1-hit" encoding optimization
// when a term appears in only 1 doc, with no loc info,
// has freq of 1, and the docNum fits into 31-bits
use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) {
if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 {
docNum := uint64(newRoaring.Minimum())
if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 {
return true, docNum, lastNorm
}
}
return false, 0, 0
}
finishTerm := func(term []byte) error {
if term == nil {
return nil
}
tfEncoder.Close()
locEncoder.Close()
postingsOffset, err := writePostings(newRoaring,
tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64)
if err != nil {
return err
}
if postingsOffset > 0 {
err = newVellum.Insert(term, postingsOffset)
if err != nil {
return err
}
}
newRoaring.Clear()
tfEncoder.Reset()
locEncoder.Reset()
lastDocNum = 0
lastFreq = 0
lastNorm = 0
return nil
}
enumerator, err := newEnumerator(itrs)
for err == nil {
term, itrI, postingsOffset := enumerator.Current()
if !bytes.Equal(prevTerm, term) {
// if the term changed, write out the info collected
// for the previous term
err2 := finishTerm(prevTerm)
if err2 != nil {
return nil, 0, err2
}
}
var err2 error
postings, err2 = dicts[itrI].postingsListFromOffset(
postingsOffset, drops[itrI], postings)
if err2 != nil {
return nil, 0, err2
}
postItr = postings.iterator(true, true, true, postItr)
if fieldsSame {
// can optimize by copying freq/norm/loc bytes directly
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
term, postItr, newDocNums[itrI], newRoaring,
tfEncoder, locEncoder)
} else {
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
fieldsMap, term, postItr, newDocNums[itrI], newRoaring,
tfEncoder, locEncoder, bufLoc)
}
if err != nil {
return nil, 0, err
}
prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem
prevTerm = append(prevTerm, term...)
err = enumerator.Next()
}
if err != nil && err != vellum.ErrIteratorDone {
return nil, 0, err
}
err = finishTerm(prevTerm)
if err != nil {
return nil, 0, err
}
dictOffset := uint64(w.Count())
err = newVellum.Close()
if err != nil {
return nil, 0, err
}
vellumData := vellumBuf.Bytes()
// write out the length of the vellum data
n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData)))
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return nil, 0, err
}
// write this vellum to disk
_, err = w.Write(vellumData)
if err != nil {
return nil, 0, err
}
rv[fieldID] = dictOffset
// get the field doc value offset (start)
fieldDvLocsStart[fieldID] = uint64(w.Count())
// update the field doc values
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true)
fdvReadersAvailable := false
var dvIterClone *docValueReader
for segmentI, segment := range segmentsInFocus {
fieldIDPlus1 := uint16(segment.fieldsMap[fieldName])
if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists &&
dvIter != nil {
fdvReadersAvailable = true
dvIterClone = dvIter.cloneInto(dvIterClone)
err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error {
if newDocNums[segmentI][docNum] == docDropped {
return nil
}
err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms)
if err != nil {
return err
}
return nil
})
if err != nil {
return nil, 0, err
}
}
}
if fdvReadersAvailable {
err = fdvEncoder.Close()
if err != nil {
return nil, 0, err
}
// persist the doc value details for this field
_, err = fdvEncoder.Write()
if err != nil {
return nil, 0, err
}
// get the field doc value offset (end)
fieldDvLocsEnd[fieldID] = uint64(w.Count())
} else {
fieldDvLocsStart[fieldID] = fieldNotUninverted
fieldDvLocsEnd[fieldID] = fieldNotUninverted
}
// reset vellum buffer and vellum builder
vellumBuf.Reset()
err = newVellum.Reset(&vellumBuf)
if err != nil {
return nil, 0, err
}
}
fieldDvLocsOffset := uint64(w.Count())
buf := bufMaxVarintLen64
for i := 0; i < len(fieldDvLocsStart); i++ {
n := binary.PutUvarint(buf, fieldDvLocsStart[i])
_, err := w.Write(buf[:n])
if err != nil {
return nil, 0, err
}
n = binary.PutUvarint(buf, fieldDvLocsEnd[i])
_, err = w.Write(buf[:n])
if err != nil {
return nil, 0, err
}
}
return rv, fieldDvLocsOffset, nil
}
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator,
newDocNums []uint64, newRoaring *roaring.Bitmap,
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) (
lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) {
next, err := postItr.Next()
for next != nil && err == nil {
hitNewDocNum := newDocNums[next.Number()]
if hitNewDocNum == docDropped {
return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum")
}
newRoaring.Add(uint32(hitNewDocNum))
nextFreq := next.Frequency()
nextNorm := uint64(math.Float32bits(float32(next.Norm())))
locs := next.Locations()
err = tfEncoder.Add(hitNewDocNum,
encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm)
if err != nil {
return 0, 0, 0, nil, err
}
if len(locs) > 0 {
numBytesLocs := 0
for _, loc := range locs {
ap := loc.ArrayPositions()
numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1),
loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap)
}
err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs))
if err != nil {
return 0, 0, 0, nil, err
}
for _, loc := range locs {
ap := loc.ArrayPositions()
if cap(bufLoc) < 5+len(ap) {
bufLoc = make([]uint64, 0, 5+len(ap))
}
args := bufLoc[0:5]
args[0] = uint64(fieldsMap[loc.Field()] - 1)
args[1] = loc.Pos()
args[2] = loc.Start()
args[3] = loc.End()
args[4] = uint64(len(ap))
args = append(args, ap...)
err = locEncoder.Add(hitNewDocNum, args...)
if err != nil {
return 0, 0, 0, nil, err
}
}
}
lastDocNum = hitNewDocNum
lastFreq = nextFreq
lastNorm = nextNorm
next, err = postItr.Next()
}
return lastDocNum, lastFreq, lastNorm, bufLoc, err
}
func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
newDocNums []uint64, newRoaring *roaring.Bitmap,
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) (
lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) {
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err :=
postItr.nextBytes()
for err == nil && len(nextFreqNormBytes) > 0 {
hitNewDocNum := newDocNums[nextDocNum]
if hitNewDocNum == docDropped {
return 0, 0, 0, fmt.Errorf("see hit with dropped doc num")
}
newRoaring.Add(uint32(hitNewDocNum))
err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes)
if err != nil {
return 0, 0, 0, err
}
if len(nextLocBytes) > 0 {
err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes)
if err != nil {
return 0, 0, 0, err
}
}
lastDocNum = hitNewDocNum
lastFreq = nextFreq
lastNorm = nextNorm
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err =
postItr.nextBytes()
}
return lastDocNum, lastFreq, lastNorm, err
}
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder,
use1HitEncoding func(uint64) (bool, uint64, uint64),
w *CountHashWriter, bufMaxVarintLen64 []byte) (
offset uint64, err error) {
termCardinality := postings.GetCardinality()
if termCardinality <= 0 {
return 0, nil
}
if use1HitEncoding != nil {
encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality)
if encodeAs1Hit {
return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil
}
}
tfOffset := uint64(w.Count())
_, err = tfEncoder.Write(w)
if err != nil {
return 0, err
}
locOffset := uint64(w.Count())
_, err = locEncoder.Write(w)
if err != nil {
return 0, err
}
postingsOffset := uint64(w.Count())
n := binary.PutUvarint(bufMaxVarintLen64, tfOffset)
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return 0, err
}
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return 0, err
}
_, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64)
if err != nil {
return 0, err
}
return postingsOffset, nil
}
type varintEncoder func(uint64) (int, error)
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
w *CountHashWriter) (uint64, [][]uint64, error) {
var rv [][]uint64 // The remapped or newDocNums for each segment.
var newDocNum uint64
var curr int
var data, compressed []byte
var metaBuf bytes.Buffer
varBuf := make([]byte, binary.MaxVarintLen64)
metaEncode := func(val uint64) (int, error) {
wb := binary.PutUvarint(varBuf, val)
return metaBuf.Write(varBuf[:wb])
}
vals := make([][][]byte, len(fieldsInv))
typs := make([][]byte, len(fieldsInv))
poss := make([][][]uint64, len(fieldsInv))
docNumOffsets := make([]uint64, newSegDocCount)
// for each segment
for segI, segment := range segments {
segNewDocNums := make([]uint64, segment.numDocs)
dropsI := drops[segI]
// optimize when the field mapping is the same across all
// segments and there are no deletions, via byte-copying
// of stored docs bytes directly to the writer
if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) {
err := segment.copyStoredDocs(newDocNum, docNumOffsets, w)
if err != nil {
return 0, nil, err
}
for i := uint64(0); i < segment.numDocs; i++ {
segNewDocNums[i] = newDocNum
newDocNum++
}
rv = append(rv, segNewDocNums)
continue
}
// for each doc num
for docNum := uint64(0); docNum < segment.numDocs; docNum++ {
// TODO: roaring's API limits docNums to 32-bits?
if dropsI != nil && dropsI.Contains(uint32(docNum)) {
segNewDocNums[docNum] = docDropped
continue
}
segNewDocNums[docNum] = newDocNum
curr = 0
metaBuf.Reset()
data = data[:0]
// collect all the data
for i := 0; i < len(fieldsInv); i++ {
vals[i] = vals[i][:0]
typs[i] = typs[i][:0]
poss[i] = poss[i][:0]
}
err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
fieldID := int(fieldsMap[field]) - 1
vals[fieldID] = append(vals[fieldID], value)
typs[fieldID] = append(typs[fieldID], typ)
poss[fieldID] = append(poss[fieldID], pos)
return true
})
if err != nil {
return 0, nil, err
}
// _id field special case optimizes ExternalID() lookups
idFieldVal := vals[uint16(0)][0]
_, err = metaEncode(uint64(len(idFieldVal)))
if err != nil {
return 0, nil, err
}
// now walk the non-"_id" fields in order
for fieldID := 1; fieldID < len(fieldsInv); fieldID++ {
storedFieldValues := vals[fieldID]
stf := typs[fieldID]
spf := poss[fieldID]
var err2 error
curr, data, err2 = persistStoredFieldValues(fieldID,
storedFieldValues, stf, spf, curr, metaEncode, data)
if err2 != nil {
return 0, nil, err2
}
}
metaBytes := metaBuf.Bytes()
compressed = snappy.Encode(compressed[:cap(compressed)], data)
// record where we're about to start writing
docNumOffsets[newDocNum] = uint64(w.Count())
// write out the meta len and compressed data len
_, err = writeUvarints(w,
uint64(len(metaBytes)),
uint64(len(idFieldVal)+len(compressed)))
if err != nil {
return 0, nil, err
}
// now write the meta
_, err = w.Write(metaBytes)
if err != nil {
return 0, nil, err
}
// now write the _id field val (counted as part of the 'compressed' data)
_, err = w.Write(idFieldVal)
if err != nil {
return 0, nil, err
}
// now write the compressed data
_, err = w.Write(compressed)
if err != nil {
return 0, nil, err
}
newDocNum++
}
rv = append(rv, segNewDocNums)
}
// return value is the start of the stored index
storedIndexOffset := uint64(w.Count())
// now write out the stored doc index
for _, docNumOffset := range docNumOffsets {
err := binary.Write(w, binary.BigEndian, docNumOffset)
if err != nil {
return 0, nil, err
}
}
return storedIndexOffset, rv, nil
}
// copyStoredDocs writes out a segment's stored doc info, optimized by
// using a single Write() call for the entire set of bytes. The
// newDocNumOffsets is filled with the new offsets for each doc.
func (s *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64,
w *CountHashWriter) error {
if s.numDocs <= 0 {
return nil
}
indexOffset0, storedOffset0, _, _, _ :=
s.getDocStoredOffsets(0) // the segment's first doc
indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN :=
s.getDocStoredOffsets(s.numDocs - 1) // the segment's last doc
storedOffset0New := uint64(w.Count())
storedBytes := s.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN]
_, err := w.Write(storedBytes)
if err != nil {
return err
}
// remap the storedOffset's for the docs into new offsets relative
// to storedOffset0New, filling the given docNumOffsetsOut array
for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 {
storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8])
storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New
newDocNumOffsets[newDocNum] = storedOffsetNew
newDocNum += 1
}
return nil
}
// mergeFields builds a unified list of fields used across all the
// input segments, and computes whether the fields are the same across
// segments (which depends on fields to be sorted in the same way
// across segments)
func mergeFields(segments []*SegmentBase) (bool, []string) {
fieldsSame := true
var segment0Fields []string
if len(segments) > 0 {
segment0Fields = segments[0].Fields()
}
fieldsExist := map[string]struct{}{}
for _, segment := range segments {
fields := segment.Fields()
for fieldi, field := range fields {
fieldsExist[field] = struct{}{}
if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field {
fieldsSame = false
}
}
}
rv := make([]string, 0, len(fieldsExist))
// ensure _id stays first
rv = append(rv, "_id")
for k := range fieldsExist {
if k != "_id" {
rv = append(rv, k)
}
}
sort.Strings(rv[1:]) // leave _id as first
return fieldsSame, rv
}

View file

@ -0,0 +1,826 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"math"
"sort"
"sync"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/couchbase/vellum"
"github.com/golang/snappy"
)
var NewSegmentBufferNumResultsBump int = 100
var NewSegmentBufferNumResultsFactor float64 = 1.0
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
// SegmentBase from analysis results
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
chunkFactor uint32) (*SegmentBase, uint64, error) {
s := interimPool.Get().(*interim)
var br bytes.Buffer
if s.lastNumDocs > 0 {
// use previous results to initialize the buf with an estimate
// size, but note that the interim instance comes from a
// global interimPool, so multiple scorch instances indexing
// different docs can lead to low quality estimates
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
NewSegmentBufferNumResultsFactor)
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
NewSegmentBufferAvgBytesPerDocFactor)
br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
}
s.results = results
s.chunkFactor = chunkFactor
s.w = NewCountHashWriter(&br)
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
err := s.convert()
if err != nil {
return nil, uint64(0), err
}
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor,
s.FieldsMap, s.FieldsInv, uint64(len(results)),
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
if err == nil && s.reset() == nil {
s.lastNumDocs = len(results)
s.lastOutSize = len(br.Bytes())
interimPool.Put(s)
}
return sb, uint64(len(br.Bytes())), err
}
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
// interim holds temporary working data used while converting from
// analysis results to a zap-encoded segment
type interim struct {
results []*index.AnalysisResult
chunkFactor uint32
w *CountHashWriter
// FieldsMap adds 1 to field id to avoid zero value issues
// name -> field id + 1
FieldsMap map[string]uint16
// FieldsInv is the inverse of FieldsMap
// field id -> name
FieldsInv []string
// Term dictionaries for each field
// field id -> term -> postings list id + 1
Dicts []map[string]uint64
// Terms for each field, where terms are sorted ascending
// field id -> []term
DictKeys [][]string
// Fields whose IncludeDocValues is true
// field id -> bool
IncludeDocValues []bool
// postings id -> bitmap of docNums
Postings []*roaring.Bitmap
// postings id -> freq/norm's, one for each docNum in postings
FreqNorms [][]interimFreqNorm
freqNormsBacking []interimFreqNorm
// postings id -> locs, one for each freq
Locs [][]interimLoc
locsBacking []interimLoc
numTermsPerPostingsList []int // key is postings list id
numLocsPerPostingsList []int // key is postings list id
builder *vellum.Builder
builderBuf bytes.Buffer
metaBuf bytes.Buffer
tmp0 []byte
tmp1 []byte
lastNumDocs int
lastOutSize int
}
func (s *interim) reset() (err error) {
s.results = nil
s.chunkFactor = 0
s.w = nil
s.FieldsMap = nil
s.FieldsInv = nil
for i := range s.Dicts {
s.Dicts[i] = nil
}
s.Dicts = s.Dicts[:0]
for i := range s.DictKeys {
s.DictKeys[i] = s.DictKeys[i][:0]
}
s.DictKeys = s.DictKeys[:0]
for i := range s.IncludeDocValues {
s.IncludeDocValues[i] = false
}
s.IncludeDocValues = s.IncludeDocValues[:0]
for _, idn := range s.Postings {
idn.Clear()
}
s.Postings = s.Postings[:0]
s.FreqNorms = s.FreqNorms[:0]
for i := range s.freqNormsBacking {
s.freqNormsBacking[i] = interimFreqNorm{}
}
s.freqNormsBacking = s.freqNormsBacking[:0]
s.Locs = s.Locs[:0]
for i := range s.locsBacking {
s.locsBacking[i] = interimLoc{}
}
s.locsBacking = s.locsBacking[:0]
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
s.builderBuf.Reset()
if s.builder != nil {
err = s.builder.Reset(&s.builderBuf)
}
s.metaBuf.Reset()
s.tmp0 = s.tmp0[:0]
s.tmp1 = s.tmp1[:0]
s.lastNumDocs = 0
s.lastOutSize = 0
return err
}
func (s *interim) grabBuf(size int) []byte {
buf := s.tmp0
if cap(buf) < size {
buf = make([]byte, size)
s.tmp0 = buf
}
return buf[0:size]
}
type interimStoredField struct {
vals [][]byte
typs []byte
arrayposs [][]uint64 // array positions
}
type interimFreqNorm struct {
freq uint64
norm float32
numLocs int
}
type interimLoc struct {
fieldID uint16
pos uint64
start uint64
end uint64
arrayposs []uint64
}
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
s.FieldsMap = map[string]uint16{}
s.getOrDefineField("_id") // _id field is fieldID 0
for _, result := range s.results {
for _, field := range result.Document.CompositeFields {
s.getOrDefineField(field.Name())
}
for _, field := range result.Document.Fields {
s.getOrDefineField(field.Name())
}
}
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
for fieldID, fieldName := range s.FieldsInv {
s.FieldsMap[fieldName] = uint16(fieldID + 1)
}
if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
} else {
s.IncludeDocValues = make([]bool, len(s.FieldsInv))
}
s.prepareDicts()
for _, dict := range s.DictKeys {
sort.Strings(dict)
}
s.processDocuments()
storedIndexOffset, err := s.writeStoredFields()
if err != nil {
return 0, 0, 0, nil, err
}
var fdvIndexOffset uint64
var dictOffsets []uint64
if len(s.results) > 0 {
fdvIndexOffset, dictOffsets, err = s.writeDicts()
if err != nil {
return 0, 0, 0, nil, err
}
} else {
dictOffsets = make([]uint64, len(s.FieldsInv))
}
fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
if err != nil {
return 0, 0, 0, nil, err
}
return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
}
func (s *interim) getOrDefineField(fieldName string) int {
fieldIDPlus1, exists := s.FieldsMap[fieldName]
if !exists {
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
s.FieldsMap[fieldName] = fieldIDPlus1
s.FieldsInv = append(s.FieldsInv, fieldName)
s.Dicts = append(s.Dicts, make(map[string]uint64))
n := len(s.DictKeys)
if n < cap(s.DictKeys) {
s.DictKeys = s.DictKeys[:n+1]
s.DictKeys[n] = s.DictKeys[n][:0]
} else {
s.DictKeys = append(s.DictKeys, []string(nil))
}
}
return int(fieldIDPlus1 - 1)
}
// fill Dicts and DictKeys from analysis results
func (s *interim) prepareDicts() {
var pidNext int
var totTFs int
var totLocs int
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
dict := s.Dicts[fieldID]
dictKeys := s.DictKeys[fieldID]
for term, tf := range tfs {
pidPlus1, exists := dict[term]
if !exists {
pidNext++
pidPlus1 = uint64(pidNext)
dict[term] = pidPlus1
dictKeys = append(dictKeys, term)
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
}
pid := pidPlus1 - 1
s.numTermsPerPostingsList[pid] += 1
s.numLocsPerPostingsList[pid] += len(tf.Locations)
totLocs += len(tf.Locations)
}
totTFs += len(tfs)
s.DictKeys[fieldID] = dictKeys
}
for _, result := range s.results {
// walk each composite field
for _, field := range result.Document.CompositeFields {
fieldID := uint16(s.getOrDefineField(field.Name()))
_, tf := field.Analyze()
visitField(fieldID, tf)
}
// walk each field
for i, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
tf := result.Analyzed[i]
visitField(fieldID, tf)
}
}
numPostingsLists := pidNext
if cap(s.Postings) >= numPostingsLists {
s.Postings = s.Postings[:numPostingsLists]
} else {
postings := make([]*roaring.Bitmap, numPostingsLists)
copy(postings, s.Postings[:cap(s.Postings)])
for i := 0; i < numPostingsLists; i++ {
if postings[i] == nil {
postings[i] = roaring.New()
}
}
s.Postings = postings
}
if cap(s.FreqNorms) >= numPostingsLists {
s.FreqNorms = s.FreqNorms[:numPostingsLists]
} else {
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
}
if cap(s.freqNormsBacking) >= totTFs {
s.freqNormsBacking = s.freqNormsBacking[:totTFs]
} else {
s.freqNormsBacking = make([]interimFreqNorm, totTFs)
}
freqNormsBacking := s.freqNormsBacking
for pid, numTerms := range s.numTermsPerPostingsList {
s.FreqNorms[pid] = freqNormsBacking[0:0]
freqNormsBacking = freqNormsBacking[numTerms:]
}
if cap(s.Locs) >= numPostingsLists {
s.Locs = s.Locs[:numPostingsLists]
} else {
s.Locs = make([][]interimLoc, numPostingsLists)
}
if cap(s.locsBacking) >= totLocs {
s.locsBacking = s.locsBacking[:totLocs]
} else {
s.locsBacking = make([]interimLoc, totLocs)
}
locsBacking := s.locsBacking
for pid, numLocs := range s.numLocsPerPostingsList {
s.Locs[pid] = locsBacking[0:0]
locsBacking = locsBacking[numLocs:]
}
}
func (s *interim) processDocuments() {
numFields := len(s.FieldsInv)
reuseFieldLens := make([]int, numFields)
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields)
for docNum, result := range s.results {
for i := 0; i < numFields; i++ { // clear these for reuse
reuseFieldLens[i] = 0
reuseFieldTFs[i] = nil
}
s.processDocument(uint64(docNum), result,
reuseFieldLens, reuseFieldTFs)
}
}
func (s *interim) processDocument(docNum uint64,
result *index.AnalysisResult,
fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
visitField := func(fieldID uint16, fieldName string,
ln int, tf analysis.TokenFrequencies) {
fieldLens[fieldID] += ln
existingFreqs := fieldTFs[fieldID]
if existingFreqs != nil {
existingFreqs.MergeAll(fieldName, tf)
} else {
fieldTFs[fieldID] = tf
}
}
// walk each composite field
for _, field := range result.Document.CompositeFields {
fieldID := uint16(s.getOrDefineField(field.Name()))
ln, tf := field.Analyze()
visitField(fieldID, field.Name(), ln, tf)
}
// walk each field
for i, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
ln := result.Length[i]
tf := result.Analyzed[i]
visitField(fieldID, field.Name(), ln, tf)
}
// now that it's been rolled up into fieldTFs, walk that
for fieldID, tfs := range fieldTFs {
dict := s.Dicts[fieldID]
norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))
for term, tf := range tfs {
pid := dict[term] - 1
bs := s.Postings[pid]
bs.Add(uint32(docNum))
s.FreqNorms[pid] = append(s.FreqNorms[pid],
interimFreqNorm{
freq: uint64(tf.Frequency()),
norm: norm,
numLocs: len(tf.Locations),
})
if len(tf.Locations) > 0 {
locs := s.Locs[pid]
for _, loc := range tf.Locations {
var locf = uint16(fieldID)
if loc.Field != "" {
locf = uint16(s.getOrDefineField(loc.Field))
}
var arrayposs []uint64
if len(loc.ArrayPositions) > 0 {
arrayposs = loc.ArrayPositions
}
locs = append(locs, interimLoc{
fieldID: locf,
pos: uint64(loc.Position),
start: uint64(loc.Start),
end: uint64(loc.End),
arrayposs: arrayposs,
})
}
s.Locs[pid] = locs
}
}
}
}
func (s *interim) writeStoredFields() (
storedIndexOffset uint64, err error) {
varBuf := make([]byte, binary.MaxVarintLen64)
metaEncode := func(val uint64) (int, error) {
wb := binary.PutUvarint(varBuf, val)
return s.metaBuf.Write(varBuf[:wb])
}
data, compressed := s.tmp0[:0], s.tmp1[:0]
defer func() { s.tmp0, s.tmp1 = data, compressed }()
// keyed by docNum
docStoredOffsets := make([]uint64, len(s.results))
// keyed by fieldID, for the current doc in the loop
docStoredFields := map[uint16]interimStoredField{}
for docNum, result := range s.results {
for fieldID := range docStoredFields { // reset for next doc
delete(docStoredFields, fieldID)
}
for _, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name()))
opts := field.Options()
if opts.IsStored() {
isf := docStoredFields[fieldID]
isf.vals = append(isf.vals, field.Value())
isf.typs = append(isf.typs, encodeFieldType(field))
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
docStoredFields[fieldID] = isf
}
if opts.IncludeDocValues() {
s.IncludeDocValues[fieldID] = true
}
}
var curr int
s.metaBuf.Reset()
data = data[:0]
// _id field special case optimizes ExternalID() lookups
idFieldVal := docStoredFields[uint16(0)].vals[0]
_, err = metaEncode(uint64(len(idFieldVal)))
if err != nil {
return 0, err
}
// handle non-"_id" fields
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
isf, exists := docStoredFields[uint16(fieldID)]
if exists {
curr, data, err = persistStoredFieldValues(
fieldID, isf.vals, isf.typs, isf.arrayposs,
curr, metaEncode, data)
if err != nil {
return 0, err
}
}
}
metaBytes := s.metaBuf.Bytes()
compressed = snappy.Encode(compressed[:cap(compressed)], data)
docStoredOffsets[docNum] = uint64(s.w.Count())
_, err := writeUvarints(s.w,
uint64(len(metaBytes)),
uint64(len(idFieldVal)+len(compressed)))
if err != nil {
return 0, err
}
_, err = s.w.Write(metaBytes)
if err != nil {
return 0, err
}
_, err = s.w.Write(idFieldVal)
if err != nil {
return 0, err
}
_, err = s.w.Write(compressed)
if err != nil {
return 0, err
}
}
storedIndexOffset = uint64(s.w.Count())
for _, docStoredOffset := range docStoredOffsets {
err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
if err != nil {
return 0, err
}
}
return storedIndexOffset, nil
}
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
dictOffsets = make([]uint64, len(s.FieldsInv))
fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))
buf := s.grabBuf(binary.MaxVarintLen64)
tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)
var docTermMap [][]byte
if s.builder == nil {
s.builder, err = vellum.New(&s.builderBuf, nil)
if err != nil {
return 0, nil, err
}
}
for fieldID, terms := range s.DictKeys {
if cap(docTermMap) < len(s.results) {
docTermMap = make([][]byte, len(s.results))
} else {
docTermMap = docTermMap[0:len(s.results)]
for docNum := range docTermMap { // reset the docTermMap
docTermMap[docNum] = docTermMap[docNum][:0]
}
}
dict := s.Dicts[fieldID]
for _, term := range terms { // terms are already sorted
pid := dict[term] - 1
postingsBS := s.Postings[pid]
freqNorms := s.FreqNorms[pid]
freqNormOffset := 0
locs := s.Locs[pid]
locOffset := 0
postingsItr := postingsBS.Iterator()
for postingsItr.HasNext() {
docNum := uint64(postingsItr.Next())
freqNorm := freqNorms[freqNormOffset]
err = tfEncoder.Add(docNum,
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
uint64(math.Float32bits(freqNorm.norm)))
if err != nil {
return 0, nil, err
}
if freqNorm.numLocs > 0 {
numBytesLocs := 0
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
numBytesLocs += totalUvarintBytes(
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
uint64(len(loc.arrayposs)), loc.arrayposs)
}
err = locEncoder.Add(docNum, uint64(numBytesLocs))
if err != nil {
return 0, nil, err
}
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
err = locEncoder.Add(docNum,
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
uint64(len(loc.arrayposs)))
if err != nil {
return 0, nil, err
}
err = locEncoder.Add(docNum, loc.arrayposs...)
if err != nil {
return 0, nil, err
}
}
locOffset += freqNorm.numLocs
}
freqNormOffset++
docTermMap[docNum] = append(
append(docTermMap[docNum], term...),
termSeparator)
}
tfEncoder.Close()
locEncoder.Close()
postingsOffset, err :=
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
if err != nil {
return 0, nil, err
}
if postingsOffset > uint64(0) {
err = s.builder.Insert([]byte(term), postingsOffset)
if err != nil {
return 0, nil, err
}
}
tfEncoder.Reset()
locEncoder.Reset()
}
err = s.builder.Close()
if err != nil {
return 0, nil, err
}
// record where this dictionary starts
dictOffsets[fieldID] = uint64(s.w.Count())
vellumData := s.builderBuf.Bytes()
// write out the length of the vellum data
n := binary.PutUvarint(buf, uint64(len(vellumData)))
_, err = s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
// write this vellum to disk
_, err = s.w.Write(vellumData)
if err != nil {
return 0, nil, err
}
// reset vellum for reuse
s.builderBuf.Reset()
err = s.builder.Reset(&s.builderBuf)
if err != nil {
return 0, nil, err
}
// write the field doc values
if s.IncludeDocValues[fieldID] {
for docNum, docTerms := range docTermMap {
if len(docTerms) > 0 {
err = fdvEncoder.Add(uint64(docNum), docTerms)
if err != nil {
return 0, nil, err
}
}
}
err = fdvEncoder.Close()
if err != nil {
return 0, nil, err
}
fdvOffsetsStart[fieldID] = uint64(s.w.Count())
_, err = fdvEncoder.Write()
if err != nil {
return 0, nil, err
}
fdvOffsetsEnd[fieldID] = uint64(s.w.Count())
fdvEncoder.Reset()
} else {
fdvOffsetsStart[fieldID] = fieldNotUninverted
fdvOffsetsEnd[fieldID] = fieldNotUninverted
}
}
fdvIndexOffset = uint64(s.w.Count())
for i := 0; i < len(fdvOffsetsStart); i++ {
n := binary.PutUvarint(buf, fdvOffsetsStart[i])
_, err := s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
_, err = s.w.Write(buf[:n])
if err != nil {
return 0, nil, err
}
}
return fdvIndexOffset, dictOffsets, nil
}
func encodeFieldType(f document.Field) byte {
fieldType := byte('x')
switch f.(type) {
case *document.TextField:
fieldType = 't'
case *document.NumericField:
fieldType = 'n'
case *document.DateTimeField:
fieldType = 'd'
case *document.BooleanField:
fieldType = 'b'
case *document.GeoPointField:
fieldType = 'g'
case *document.CompositeField:
fieldType = 'c'
}
return fieldType
}
// returns the total # of bytes needed to encode the given uint64's
// into binary.PutUVarint() encoding
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
n = numUvarintBytes(a)
n += numUvarintBytes(b)
n += numUvarintBytes(c)
n += numUvarintBytes(d)
n += numUvarintBytes(e)
for _, v := range more {
n += numUvarintBytes(v)
}
return n
}
// returns # of bytes needed to encode x in binary.PutUvarint() encoding
func numUvarintBytes(x uint64) (n int) {
for x >= 0x80 {
x >>= 7
n++
}
return n + 1
}

View file

@ -0,0 +1,790 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"math"
"reflect"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizePostingsList int
var reflectStaticSizePostingsIterator int
var reflectStaticSizePosting int
var reflectStaticSizeLocation int
func init() {
var pl PostingsList
reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size())
var pi PostingsIterator
reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size())
var p Posting
reflectStaticSizePosting = int(reflect.TypeOf(p).Size())
var l Location
reflectStaticSizeLocation = int(reflect.TypeOf(l).Size())
}
// FST or vellum value (uint64) encoding is determined by the top two
// highest-order or most significant bits...
//
// encoding : MSB
// name : 63 62 61...to...bit #0 (LSB)
// ----------+---+---+---------------------------------------------------
// general : 0 | 0 | 62-bits of postingsOffset.
// ~ : 0 | 1 | reserved for future.
// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum.
// ~ : 1 | 1 | reserved for future.
//
// Encoding "general" is able to handle all cases, where the
// postingsOffset points to more information about the postings for
// the term.
//
// Encoding "1-hit" is used to optimize a commonly seen case when a
// term has only a single hit. For example, a term in the _id field
// will have only 1 hit. The "1-hit" encoding is used for a term
// in a field when...
//
// - term vector info is disabled for that field;
// - and, the term appears in only a single doc for that field;
// - and, the term's freq is exactly 1 in that single doc for that field;
// - and, the docNum must fit into 31-bits;
//
// Otherwise, the "general" encoding is used instead.
//
// In the "1-hit" encoding, the field in that single doc may have
// other terms, which is supported in the "1-hit" encoding by the
// positive float31 norm.
const FSTValEncodingMask = uint64(0xc000000000000000)
const FSTValEncodingGeneral = uint64(0x0000000000000000)
const FSTValEncoding1Hit = uint64(0x8000000000000000)
func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 {
return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum)
}
func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) {
return (mask31Bits & v), (mask31Bits & (v >> 31))
}
const mask31Bits = uint64(0x000000007fffffff)
func under32Bits(x uint64) bool {
return x <= mask31Bits
}
const docNum1HitFinished = math.MaxUint64
// PostingsList is an in-memory represenation of a postings list
type PostingsList struct {
sb *SegmentBase
postingsOffset uint64
freqOffset uint64
locOffset uint64
postings *roaring.Bitmap
except *roaring.Bitmap
// when normBits1Hit != 0, then this postings list came from a
// 1-hit encoding, and only the docNum1Hit & normBits1Hit apply
docNum1Hit uint64
normBits1Hit uint64
}
// represents an immutable, empty postings list
var emptyPostingsList = &PostingsList{}
func (p *PostingsList) Size() int {
sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr
if p.except != nil {
sizeInBytes += int(p.except.GetSizeInBytes())
}
return sizeInBytes
}
func (p *PostingsList) OrInto(receiver *roaring.Bitmap) {
if p.normBits1Hit != 0 {
receiver.Add(uint32(p.docNum1Hit))
return
}
if p.postings != nil {
receiver.Or(p.postings)
}
}
// Iterator returns an iterator for this postings list
func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool,
prealloc segment.PostingsIterator) segment.PostingsIterator {
if p.normBits1Hit == 0 && p.postings == nil {
return emptyPostingsIterator
}
var preallocPI *PostingsIterator
pi, ok := prealloc.(*PostingsIterator)
if ok && pi != nil {
preallocPI = pi
}
if preallocPI == emptyPostingsIterator {
preallocPI = nil
}
return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI)
}
func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool,
rv *PostingsIterator) *PostingsIterator {
if rv == nil {
rv = &PostingsIterator{}
} else {
freqNormReader := rv.freqNormReader
if freqNormReader != nil {
freqNormReader.Reset([]byte(nil))
}
locReader := rv.locReader
if locReader != nil {
locReader.Reset([]byte(nil))
}
freqChunkOffsets := rv.freqChunkOffsets[:0]
locChunkOffsets := rv.locChunkOffsets[:0]
nextLocs := rv.nextLocs[:0]
nextSegmentLocs := rv.nextSegmentLocs[:0]
buf := rv.buf
*rv = PostingsIterator{} // clear the struct
rv.freqNormReader = freqNormReader
rv.locReader = locReader
rv.freqChunkOffsets = freqChunkOffsets
rv.locChunkOffsets = locChunkOffsets
rv.nextLocs = nextLocs
rv.nextSegmentLocs = nextSegmentLocs
rv.buf = buf
}
rv.postings = p
if p.normBits1Hit != 0 {
// "1-hit" encoding
rv.docNum1Hit = p.docNum1Hit
rv.normBits1Hit = p.normBits1Hit
if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) {
rv.docNum1Hit = docNum1HitFinished
}
return rv
}
// "general" encoding, check if empty
if p.postings == nil {
return rv
}
var n uint64
var read int
// prepare the freq chunk details
rv.includeFreqNorm = includeFreq || includeNorm
if rv.includeFreqNorm {
var numFreqChunks uint64
numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
if cap(rv.freqChunkOffsets) >= int(numFreqChunks) {
rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)]
} else {
rv.freqChunkOffsets = make([]uint64, int(numFreqChunks))
}
for i := 0; i < int(numFreqChunks); i++ {
rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.freqChunkStart = p.freqOffset + n
}
// prepare the loc chunk details
rv.includeLocs = includeLocs
if rv.includeLocs {
n = 0
var numLocChunks uint64
numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
if cap(rv.locChunkOffsets) >= int(numLocChunks) {
rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)]
} else {
rv.locChunkOffsets = make([]uint64, int(numLocChunks))
}
for i := 0; i < int(numLocChunks); i++ {
rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.locChunkStart = p.locOffset + n
}
rv.all = p.postings.Iterator()
if p.except != nil {
rv.ActualBM = roaring.AndNot(p.postings, p.except)
rv.Actual = rv.ActualBM.Iterator()
} else {
rv.ActualBM = p.postings
rv.Actual = p.postings.Iterator()
}
return rv
}
// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
var n uint64
if p.normBits1Hit != 0 {
n = 1
} else if p.postings != nil {
n = p.postings.GetCardinality()
}
var e uint64
if p.except != nil {
e = p.except.GetCardinality()
}
if n <= e {
return 0
}
return n - e
}
func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
rv.postingsOffset = postingsOffset
// handle "1-hit" encoding special case
if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit {
return rv.init1Hit(postingsOffset)
}
// read the location of the freq/norm details
var n uint64
var read int
rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64])
n += uint64(read)
rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
var postingsLen uint64
postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
if rv.postings == nil {
rv.postings = roaring.NewBitmap()
}
_, err := rv.postings.FromBuffer(roaringBytes)
if err != nil {
return fmt.Errorf("error loading roaring bitmap: %v", err)
}
return nil
}
func (rv *PostingsList) init1Hit(fstVal uint64) error {
docNum, normBits := FSTValDecode1Hit(fstVal)
rv.docNum1Hit = docNum
rv.normBits1Hit = normBits
return nil
}
// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
postings *PostingsList
all roaring.IntIterable
Actual roaring.IntIterable
ActualBM *roaring.Bitmap
currChunk uint32
currChunkFreqNorm []byte
currChunkLoc []byte
freqNormReader *bytes.Reader
locReader *bytes.Reader
freqChunkOffsets []uint64
freqChunkStart uint64
locChunkOffsets []uint64
locChunkStart uint64
next Posting // reused across Next() calls
nextLocs []Location // reused across Next() calls
nextSegmentLocs []segment.Location // reused across Next() calls
docNum1Hit uint64
normBits1Hit uint64
buf []byte
includeFreqNorm bool
includeLocs bool
}
var emptyPostingsIterator = &PostingsIterator{}
func (i *PostingsIterator) Size() int {
sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr +
len(i.currChunkFreqNorm) +
len(i.currChunkLoc) +
len(i.freqChunkOffsets)*size.SizeOfUint64 +
len(i.locChunkOffsets)*size.SizeOfUint64 +
i.next.Size()
for _, entry := range i.nextLocs {
sizeInBytes += entry.Size()
}
return sizeInBytes
}
func (i *PostingsIterator) loadChunk(chunk int) error {
if i.includeFreqNorm {
if chunk >= len(i.freqChunkOffsets) {
return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)",
chunk, len(i.freqChunkOffsets))
}
end, start := i.freqChunkStart, i.freqChunkStart
s, e := readChunkBoundary(chunk, i.freqChunkOffsets)
start += s
end += e
i.currChunkFreqNorm = i.postings.sb.mem[start:end]
if i.freqNormReader == nil {
i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm)
} else {
i.freqNormReader.Reset(i.currChunkFreqNorm)
}
}
if i.includeLocs {
if chunk >= len(i.locChunkOffsets) {
return fmt.Errorf("tried to load loc chunk that doesn't exist %d/(%d)",
chunk, len(i.locChunkOffsets))
}
end, start := i.locChunkStart, i.locChunkStart
s, e := readChunkBoundary(chunk, i.locChunkOffsets)
start += s
end += e
i.currChunkLoc = i.postings.sb.mem[start:end]
if i.locReader == nil {
i.locReader = bytes.NewReader(i.currChunkLoc)
} else {
i.locReader.Reset(i.currChunkLoc)
}
}
i.currChunk = uint32(chunk)
return nil
}
func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) {
if i.normBits1Hit != 0 {
return 1, i.normBits1Hit, false, nil
}
freqHasLocs, err := binary.ReadUvarint(i.freqNormReader)
if err != nil {
return 0, 0, false, fmt.Errorf("error reading frequency: %v", err)
}
freq, hasLocs := decodeFreqHasLocs(freqHasLocs)
normBits, err := binary.ReadUvarint(i.freqNormReader)
if err != nil {
return 0, 0, false, fmt.Errorf("error reading norm: %v", err)
}
return freq, normBits, hasLocs, err
}
func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 {
rv := freq << 1
if hasLocs {
rv = rv | 0x01 // 0'th LSB encodes whether there are locations
}
return rv
}
func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) {
freq := freqHasLocs >> 1
hasLocs := freqHasLocs&0x01 != 0
return freq, hasLocs
}
// readLocation processes all the integers on the stream representing a single
// location. if you care about it, pass in a non-nil location struct, and we
// will fill it. if you don't care about it, pass in nil and we safely consume
// the contents.
func (i *PostingsIterator) readLocation(l *Location) error {
// read off field
fieldID, err := binary.ReadUvarint(i.locReader)
if err != nil {
return fmt.Errorf("error reading location field: %v", err)
}
// read off pos
pos, err := binary.ReadUvarint(i.locReader)
if err != nil {
return fmt.Errorf("error reading location pos: %v", err)
}
// read off start
start, err := binary.ReadUvarint(i.locReader)
if err != nil {
return fmt.Errorf("error reading location start: %v", err)
}
// read off end
end, err := binary.ReadUvarint(i.locReader)
if err != nil {
return fmt.Errorf("error reading location end: %v", err)
}
// read off num array pos
numArrayPos, err := binary.ReadUvarint(i.locReader)
if err != nil {
return fmt.Errorf("error reading location num array pos: %v", err)
}
// group these together for less branching
if l != nil {
l.field = i.postings.sb.fieldsInv[fieldID]
l.pos = pos
l.start = start
l.end = end
if cap(l.ap) < int(numArrayPos) {
l.ap = make([]uint64, int(numArrayPos))
} else {
l.ap = l.ap[:int(numArrayPos)]
}
}
// read off array positions
for k := 0; k < int(numArrayPos); k++ {
ap, err := binary.ReadUvarint(i.locReader)
if err != nil {
return fmt.Errorf("error reading array position: %v", err)
}
if l != nil {
l.ap[k] = ap
}
}
return nil
}
// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) Next() (segment.Posting, error) {
return i.nextAtOrAfter(0)
}
// Advance returns the posting at the specified docNum or it is not present
// the next posting, or if the end is reached, nil
func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) {
return i.nextAtOrAfter(docNum)
}
// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) {
docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter)
if err != nil || !exists {
return nil, err
}
i.next = Posting{} // clear the struct
rv := &i.next
rv.docNum = docNum
if !i.includeFreqNorm {
return rv, nil
}
var normBits uint64
var hasLocs bool
rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
if err != nil {
return nil, err
}
rv.norm = math.Float32frombits(uint32(normBits))
if i.includeLocs && hasLocs {
// prepare locations into reused slices, where we assume
// rv.freq >= "number of locs", since in a composite field,
// some component fields might have their IncludeTermVector
// flags disabled while other component fields are enabled
if cap(i.nextLocs) >= int(rv.freq) {
i.nextLocs = i.nextLocs[0:rv.freq]
} else {
i.nextLocs = make([]Location, rv.freq, rv.freq*2)
}
if cap(i.nextSegmentLocs) < int(rv.freq) {
i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2)
}
rv.locs = i.nextSegmentLocs[:0]
numLocsBytes, err := binary.ReadUvarint(i.locReader)
if err != nil {
return nil, fmt.Errorf("error reading location numLocsBytes: %v", err)
}
j := 0
startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader
for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) {
err := i.readLocation(&i.nextLocs[j])
if err != nil {
return nil, err
}
rv.locs = append(rv.locs, &i.nextLocs[j])
j++
}
}
return rv, nil
}
var freqHasLocs1Hit = encodeFreqHasLocs(1, false)
// nextBytes returns the docNum and the encoded freq & loc bytes for
// the next posting
func (i *PostingsIterator) nextBytes() (
docNumOut uint64, freq uint64, normBits uint64,
bytesFreqNorm []byte, bytesLoc []byte, err error) {
docNum, exists, err := i.nextDocNumAtOrAfter(0)
if err != nil || !exists {
return 0, 0, 0, nil, nil, err
}
if i.normBits1Hit != 0 {
if i.buf == nil {
i.buf = make([]byte, binary.MaxVarintLen64*2)
}
n := binary.PutUvarint(i.buf, freqHasLocs1Hit)
n += binary.PutUvarint(i.buf[n:], i.normBits1Hit)
return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil
}
startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
var hasLocs bool
freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
if err != nil {
return 0, 0, 0, nil, nil, err
}
endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm]
if hasLocs {
startLoc := len(i.currChunkLoc) - i.locReader.Len()
numLocsBytes, err := binary.ReadUvarint(i.locReader)
if err != nil {
return 0, 0, 0, nil, nil,
fmt.Errorf("error reading location nextBytes numLocs: %v", err)
}
// skip over all the location bytes
_, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent)
if err != nil {
return 0, 0, 0, nil, nil, err
}
endLoc := len(i.currChunkLoc) - i.locReader.Len()
bytesLoc = i.currChunkLoc[startLoc:endLoc]
}
return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil
}
// nextDocNum returns the next docNum on the postings list, and also
// sets up the currChunk / loc related fields of the iterator.
func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) {
if i.normBits1Hit != 0 {
if i.docNum1Hit == docNum1HitFinished {
return 0, false, nil
}
if i.docNum1Hit < atOrAfter {
// advanced past our 1-hit
i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum
return 0, false, nil
}
docNum := i.docNum1Hit
i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum
return docNum, true, nil
}
if i.Actual == nil || !i.Actual.HasNext() {
return 0, false, nil
}
n := i.Actual.Next()
for uint64(n) < atOrAfter && i.Actual.HasNext() {
n = i.Actual.Next()
}
if uint64(n) < atOrAfter {
// couldn't find anything
return 0, false, nil
}
allN := i.all.Next()
nChunk := n / i.postings.sb.chunkFactor
allNChunk := allN / i.postings.sb.chunkFactor
// n is the next actual hit (excluding some postings), and
// allN is the next hit in the full postings, and
// if they don't match, move 'all' forwards until they do
for allN != n {
// in the same chunk, so move the freq/norm/loc decoders forward
if i.includeFreqNorm && allNChunk == nChunk {
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
err := i.loadChunk(int(nChunk))
if err != nil {
return 0, false, fmt.Errorf("error loading chunk: %v", err)
}
}
// read off freq/offsets even though we don't care about them
_, _, hasLocs, err := i.readFreqNormHasLocs()
if err != nil {
return 0, false, err
}
if i.includeLocs && hasLocs {
numLocsBytes, err := binary.ReadUvarint(i.locReader)
if err != nil {
return 0, false, fmt.Errorf("error reading location numLocsBytes: %v", err)
}
// skip over all the location bytes
_, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent)
if err != nil {
return 0, false, err
}
}
}
allN = i.all.Next()
allNChunk = allN / i.postings.sb.chunkFactor
}
if i.includeFreqNorm && (i.currChunk != nChunk || i.currChunkFreqNorm == nil) {
err := i.loadChunk(int(nChunk))
if err != nil {
return 0, false, fmt.Errorf("error loading chunk: %v", err)
}
}
return uint64(n), true, nil
}
// Posting is a single entry in a postings list
type Posting struct {
docNum uint64
freq uint64
norm float32
locs []segment.Location
}
func (p *Posting) Size() int {
sizeInBytes := reflectStaticSizePosting
for _, entry := range p.locs {
sizeInBytes += entry.Size()
}
return sizeInBytes
}
// Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 {
return p.docNum
}
// Frequency returns the frequence of occurance of this term in this doc/field
func (p *Posting) Frequency() uint64 {
return p.freq
}
// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
return float64(p.norm)
}
// Locations returns the location information for each occurance
func (p *Posting) Locations() []segment.Location {
return p.locs
}
// Location represents the location of a single occurance
type Location struct {
field string
pos uint64
start uint64
end uint64
ap []uint64
}
func (l *Location) Size() int {
return reflectStaticSizeLocation +
len(l.field) +
len(l.ap)*size.SizeOfUint64
}
// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
return l.field
}
// Start returns the start byte offset of this occurance
func (l *Location) Start() uint64 {
return l.start
}
// End returns the end byte offset of this occurance
func (l *Location) End() uint64 {
return l.end
}
// Pos returns the 1-based phrase position of this occurance
func (l *Location) Pos() uint64 {
return l.pos
}
// ArrayPositions returns the array position vector associated with this occurance
func (l *Location) ArrayPositions() []uint64 {
return l.ap
}

View file

@ -0,0 +1,43 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import "encoding/binary"
func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) {
_, storedOffset, n, metaLen, dataLen := s.getDocStoredOffsets(docNum)
meta := s.mem[storedOffset+n : storedOffset+n+metaLen]
data := s.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen]
return meta, data
}
func (s *SegmentBase) getDocStoredOffsets(docNum uint64) (
uint64, uint64, uint64, uint64, uint64) {
indexOffset := s.storedIndexOffset + (8 * docNum)
storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8])
var n uint64
metaLen, read := binary.Uvarint(s.mem[storedOffset : storedOffset+binary.MaxVarintLen64])
n += uint64(read)
dataLen, read := binary.Uvarint(s.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64])
n += uint64(read)
return indexOffset, storedOffset, n, metaLen, dataLen
}

View file

@ -0,0 +1,534 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"os"
"reflect"
"sync"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
"github.com/couchbase/vellum"
mmap "github.com/edsrzf/mmap-go"
"github.com/golang/snappy"
)
var reflectStaticSizeSegmentBase int
func init() {
var sb SegmentBase
reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size())
}
// Open returns a zap impl of a segment
func Open(path string) (segment.Segment, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
mm, err := mmap.Map(f, mmap.RDONLY, 0)
if err != nil {
// mmap failed, try to close the file
_ = f.Close()
return nil, err
}
rv := &Segment{
SegmentBase: SegmentBase{
mem: mm[0 : len(mm)-FooterSize],
fieldsMap: make(map[string]uint16),
fieldDvReaders: make(map[uint16]*docValueReader),
},
f: f,
mm: mm,
path: path,
refs: 1,
}
rv.SegmentBase.updateSize()
err = rv.loadConfig()
if err != nil {
_ = rv.Close()
return nil, err
}
err = rv.loadFields()
if err != nil {
_ = rv.Close()
return nil, err
}
err = rv.loadDvReaders()
if err != nil {
_ = rv.Close()
return nil, err
}
return rv, nil
}
// SegmentBase is a memory only, read-only implementation of the
// segment.Segment interface, using zap's data representation.
type SegmentBase struct {
mem []byte
memCRC uint32
chunkFactor uint32
fieldsMap map[string]uint16 // fieldName -> fieldID+1
fieldsInv []string // fieldID -> fieldName
numDocs uint64
storedIndexOffset uint64
fieldsIndexOffset uint64
docValueOffset uint64
dictLocs []uint64
fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field
size uint64
}
func (sb *SegmentBase) Size() int {
return int(sb.size)
}
func (sb *SegmentBase) updateSize() {
sizeInBytes := reflectStaticSizeSegmentBase +
cap(sb.mem)
// fieldsMap
for k, _ := range sb.fieldsMap {
sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16
}
// fieldsInv, dictLocs
for _, entry := range sb.fieldsInv {
sizeInBytes += len(entry) + size.SizeOfString
}
sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64
// fieldDvReaders
for _, v := range sb.fieldDvReaders {
sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr
if v != nil {
sizeInBytes += v.size()
}
}
sb.size = uint64(sizeInBytes)
}
func (sb *SegmentBase) AddRef() {}
func (sb *SegmentBase) DecRef() (err error) { return nil }
func (sb *SegmentBase) Close() (err error) { return nil }
// Segment implements a persisted segment.Segment interface, by
// embedding an mmap()'ed SegmentBase.
type Segment struct {
SegmentBase
f *os.File
mm mmap.MMap
path string
version uint32
crc uint32
m sync.Mutex // Protects the fields that follow.
refs int64
}
func (s *Segment) Size() int {
// 8 /* size of file pointer */
// 4 /* size of version -> uint32 */
// 4 /* size of crc -> uint32 */
sizeOfUints := 16
sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints
// mutex, refs -> int64
sizeInBytes += 16
// do not include the mmap'ed part
return sizeInBytes + s.SegmentBase.Size() - cap(s.mem)
}
func (s *Segment) AddRef() {
s.m.Lock()
s.refs++
s.m.Unlock()
}
func (s *Segment) DecRef() (err error) {
s.m.Lock()
s.refs--
if s.refs == 0 {
err = s.closeActual()
}
s.m.Unlock()
return err
}
func (s *Segment) loadConfig() error {
crcOffset := len(s.mm) - 4
s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4])
verOffset := crcOffset - 4
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
if s.version != Version {
return fmt.Errorf("unsupported version %d", s.version)
}
chunkOffset := verOffset - 4
s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4])
docValueOffset := chunkOffset - 8
s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8])
fieldsIndexOffset := docValueOffset - 8
s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8])
storedIndexOffset := fieldsIndexOffset - 8
s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8])
numDocsOffset := storedIndexOffset - 8
s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8])
return nil
}
func (s *SegmentBase) loadFields() error {
// NOTE for now we assume the fields index immediately preceeds
// the footer, and if this changes, need to adjust accordingly (or
// store explicit length), where s.mem was sliced from s.mm in Open().
fieldsIndexEnd := uint64(len(s.mem))
// iterate through fields index
var fieldID uint64
for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd])
n := uint64(read)
s.dictLocs = append(s.dictLocs, dictLoc)
var nameLen uint64
nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd])
n += uint64(read)
name := string(s.mem[addr+n : addr+n+nameLen])
s.fieldsInv = append(s.fieldsInv, name)
s.fieldsMap[name] = uint16(fieldID + 1)
fieldID++
}
return nil
}
// Dictionary returns the term dictionary for the specified field
func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) {
dict, err := s.dictionary(field)
if err == nil && dict == nil {
return &segment.EmptyDictionary{}, nil
}
return dict, err
}
func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
fieldIDPlus1 := sb.fieldsMap[field]
if fieldIDPlus1 > 0 {
rv = &Dictionary{
sb: sb,
field: field,
fieldID: fieldIDPlus1 - 1,
}
dictStart := sb.dictLocs[rv.fieldID]
if dictStart > 0 {
// read the length of the vellum data
vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64])
fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
if fstBytes != nil {
rv.fst, err = vellum.Load(fstBytes)
if err != nil {
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
}
rv.fstReader, err = rv.fst.Reader()
if err != nil {
return nil, fmt.Errorf("dictionary field %s vellum Reader err: %v", field, err)
}
}
}
}
return rv, nil
}
// visitDocumentCtx holds data structures that are reusable across
// multiple VisitDocument() calls to avoid memory allocations
type visitDocumentCtx struct {
buf []byte
reader bytes.Reader
arrayPos []uint64
}
var visitDocumentCtxPool = sync.Pool{
New: func() interface{} {
reuse := &visitDocumentCtx{}
return reuse
},
}
// VisitDocument invokes the DocFieldValueVistor for each stored field
// for the specified doc number
func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
// first make sure this is a valid number in this segment
if num < s.numDocs {
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
meta, compressed := s.getDocStoredMetaAndCompressed(num)
vdc.reader.Reset(meta)
// handle _id field special case
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
idFieldVal := compressed[:idFieldValLen]
keepGoing := visitor("_id", byte('t'), idFieldVal, nil)
if !keepGoing {
visitDocumentCtxPool.Put(vdc)
return nil
}
// handle non-"_id" fields
compressed = compressed[idFieldValLen:]
uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed)
if err != nil {
return err
}
for keepGoing {
field, err := binary.ReadUvarint(&vdc.reader)
if err == io.EOF {
break
}
if err != nil {
return err
}
typ, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
offset, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
l, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
numap, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
var arrayPos []uint64
if numap > 0 {
if cap(vdc.arrayPos) < int(numap) {
vdc.arrayPos = make([]uint64, numap)
}
arrayPos = vdc.arrayPos[:numap]
for i := 0; i < int(numap); i++ {
ap, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return err
}
arrayPos[i] = ap
}
}
value := uncompressed[offset : offset+l]
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
}
vdc.buf = uncompressed
visitDocumentCtxPool.Put(vdc)
}
return nil
}
// DocID returns the value of the _id field for the given docNum
func (s *SegmentBase) DocID(num uint64) ([]byte, error) {
if num >= s.numDocs {
return nil, nil
}
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
meta, compressed := s.getDocStoredMetaAndCompressed(num)
vdc.reader.Reset(meta)
// handle _id field special case
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
if err != nil {
return nil, err
}
idFieldVal := compressed[:idFieldValLen]
visitDocumentCtxPool.Put(vdc)
return idFieldVal, nil
}
// Count returns the number of documents in this segment.
func (s *SegmentBase) Count() uint64 {
return s.numDocs
}
// DocNumbers returns a bitset corresponding to the doc numbers of all the
// provided _id strings
func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
rv := roaring.New()
if len(s.fieldsMap) > 0 {
idDict, err := s.dictionary("_id")
if err != nil {
return nil, err
}
postingsList := emptyPostingsList
for _, id := range ids {
postingsList, err = idDict.postingsList([]byte(id), nil, postingsList)
if err != nil {
return nil, err
}
postingsList.OrInto(rv)
}
}
return rv, nil
}
// Fields returns the field names used in this segment
func (s *SegmentBase) Fields() []string {
return s.fieldsInv
}
// Path returns the path of this segment on disk
func (s *Segment) Path() string {
return s.path
}
// Close releases all resources associated with this segment
func (s *Segment) Close() (err error) {
return s.DecRef()
}
func (s *Segment) closeActual() (err error) {
if s.mm != nil {
err = s.mm.Unmap()
}
// try to close file even if unmap failed
if s.f != nil {
err2 := s.f.Close()
if err == nil {
// try to return first error
err = err2
}
}
return
}
// some helpers i started adding for the command-line utility
// Data returns the underlying mmaped data slice
func (s *Segment) Data() []byte {
return s.mm
}
// CRC returns the CRC value stored in the file footer
func (s *Segment) CRC() uint32 {
return s.crc
}
// Version returns the file version in the file footer
func (s *Segment) Version() uint32 {
return s.version
}
// ChunkFactor returns the chunk factor in the file footer
func (s *Segment) ChunkFactor() uint32 {
return s.chunkFactor
}
// FieldsIndexOffset returns the fields index offset in the file footer
func (s *Segment) FieldsIndexOffset() uint64 {
return s.fieldsIndexOffset
}
// StoredIndexOffset returns the stored value index offset in the file footer
func (s *Segment) StoredIndexOffset() uint64 {
return s.storedIndexOffset
}
// DocValueOffset returns the docValue offset in the file footer
func (s *Segment) DocValueOffset() uint64 {
return s.docValueOffset
}
// NumDocs returns the number of documents in the file footer
func (s *Segment) NumDocs() uint64 {
return s.numDocs
}
// DictAddr is a helper function to compute the file offset where the
// dictionary is stored for the specified field.
func (s *Segment) DictAddr(field string) (uint64, error) {
fieldIDPlus1, ok := s.fieldsMap[field]
if !ok {
return 0, fmt.Errorf("no such field '%s'", field)
}
return s.dictLocs[fieldIDPlus1-1], nil
}
func (s *SegmentBase) loadDvReaders() error {
if s.docValueOffset == fieldNotUninverted {
return nil
}
var read uint64
for fieldID, field := range s.fieldsInv {
var fieldLocStart, fieldLocEnd uint64
var n int
fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
if n <= 0 {
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID)
}
read += uint64(n)
fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
if n <= 0 {
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID)
}
read += uint64(n)
s.fieldDvReaders[uint16(fieldID)], _ = s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd)
}
return nil
}

View file

@ -0,0 +1,145 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"encoding/binary"
"io"
"github.com/RoaringBitmap/roaring"
)
// writes out the length of the roaring bitmap in bytes as varint
// then writes out the roaring bitmap itself
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
reuseBufVarint []byte) (int, error) {
buf, err := r.ToBytes()
if err != nil {
return 0, err
}
var tw int
// write out the length
n := binary.PutUvarint(reuseBufVarint, uint64(len(buf)))
nw, err := w.Write(reuseBufVarint[:n])
tw += nw
if err != nil {
return tw, err
}
// write out the roaring bytes
nw, err = w.Write(buf)
tw += nw
if err != nil {
return tw, err
}
return tw, nil
}
func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) {
var rv uint64
var fieldsOffsets []uint64
for fieldID, fieldName := range fieldsInv {
// record start of this field
fieldsOffsets = append(fieldsOffsets, uint64(w.Count()))
// write out the dict location and field name length
_, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName)))
if err != nil {
return 0, err
}
// write out the field name
_, err = w.Write([]byte(fieldName))
if err != nil {
return 0, err
}
}
// now write out the fields index
rv = uint64(w.Count())
for fieldID := range fieldsInv {
err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID])
if err != nil {
return 0, err
}
}
return rv, nil
}
// FooterSize is the size of the footer record in bytes
// crc + ver + chunk + field offset + stored offset + num docs + docValueOffset
const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8
func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
chunkFactor uint32, crcBeforeFooter uint32, writerIn io.Writer) error {
w := NewCountHashWriter(writerIn)
w.crc = crcBeforeFooter
// write out the number of docs
err := binary.Write(w, binary.BigEndian, numDocs)
if err != nil {
return err
}
// write out the stored field index location:
err = binary.Write(w, binary.BigEndian, storedIndexOffset)
if err != nil {
return err
}
// write out the field index location
err = binary.Write(w, binary.BigEndian, fieldsIndexOffset)
if err != nil {
return err
}
// write out the fieldDocValue location
err = binary.Write(w, binary.BigEndian, docValueOffset)
if err != nil {
return err
}
// write out 32-bit chunk factor
err = binary.Write(w, binary.BigEndian, chunkFactor)
if err != nil {
return err
}
// write out 32-bit version
err = binary.Write(w, binary.BigEndian, Version)
if err != nil {
return err
}
// write out CRC-32 of everything upto but not including this CRC
err = binary.Write(w, binary.BigEndian, w.crc)
if err != nil {
return err
}
return nil
}
func writeUvarints(w io.Writer, vals ...uint64) (tw int, err error) {
buf := make([]byte, binary.MaxVarintLen64)
for _, val := range vals {
n := binary.PutUvarint(buf, val)
var nw int
nw, err = w.Write(buf[:n])
tw += nw
if err != nil {
return tw, err
}
}
return tw, err
}

View file

@ -0,0 +1,638 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"bytes"
"container/heap"
"encoding/binary"
"fmt"
"reflect"
"sort"
"sync"
"sync/atomic"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
)
type asynchSegmentResult struct {
dictItr segment.DictionaryIterator
index int
docs *roaring.Bitmap
postings segment.PostingsList
err error
}
var reflectStaticSizeIndexSnapshot int
func init() {
var is interface{} = IndexSnapshot{}
reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size())
}
type IndexSnapshot struct {
parent *Scorch
segment []*SegmentSnapshot
offsets []uint64
internal map[string][]byte
epoch uint64
size uint64
creator string
m sync.Mutex // Protects the fields that follow.
refs int64
m2 sync.Mutex // Protects the fields that follow.
fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's
fieldDicts map[string][]segment.TermDictionary // keyed by field, recycled dicts
}
func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
return i.segment
}
func (i *IndexSnapshot) Internal() map[string][]byte {
return i.internal
}
func (i *IndexSnapshot) AddRef() {
i.m.Lock()
i.refs++
i.m.Unlock()
}
func (i *IndexSnapshot) DecRef() (err error) {
i.m.Lock()
i.refs--
if i.refs == 0 {
for _, s := range i.segment {
if s != nil {
err2 := s.segment.DecRef()
if err == nil {
err = err2
}
}
}
if i.parent != nil {
go i.parent.AddEligibleForRemoval(i.epoch)
}
}
i.m.Unlock()
return err
}
func (i *IndexSnapshot) Close() error {
return i.DecRef()
}
func (i *IndexSnapshot) Size() int {
return int(i.size)
}
func (i *IndexSnapshot) updateSize() {
i.size += uint64(reflectStaticSizeIndexSnapshot)
for _, s := range i.segment {
i.size += uint64(s.Size())
}
}
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) {
results := make(chan *asynchSegmentResult)
for index, segment := range i.segment {
go func(index int, segment *SegmentSnapshot) {
dict, err := segment.Dictionary(field)
if err != nil {
results <- &asynchSegmentResult{err: err}
} else {
results <- &asynchSegmentResult{dictItr: makeItr(dict)}
}
}(index, segment)
}
var err error
rv := &IndexSnapshotFieldDict{
snapshot: i,
cursors: make([]*segmentDictCursor, 0, len(i.segment)),
}
for count := 0; count < len(i.segment); count++ {
asr := <-results
if asr.err != nil && err == nil {
err = asr.err
} else {
next, err2 := asr.dictItr.Next()
if err2 != nil && err == nil {
err = err2
}
if next != nil {
rv.cursors = append(rv.cursors, &segmentDictCursor{
itr: asr.dictItr,
curr: *next,
})
}
}
}
// after ensuring we've read all items on channel
if err != nil {
return nil, err
}
// prepare heap
heap.Init(rv)
return rv, nil
}
func (i *IndexSnapshot) FieldDict(field string) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.Iterator()
})
}
func (i *IndexSnapshot) FieldDictRange(field string, startTerm []byte,
endTerm []byte) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.RangeIterator(string(startTerm), string(endTerm))
})
}
func (i *IndexSnapshot) FieldDictPrefix(field string,
termPrefix []byte) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.PrefixIterator(string(termPrefix))
})
}
func (i *IndexSnapshot) FieldDictRegexp(field string,
termRegex []byte) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.RegexpIterator(string(termRegex))
})
}
func (i *IndexSnapshot) FieldDictFuzzy(field string,
term []byte, fuzziness int) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.FuzzyIterator(string(term), fuzziness)
})
}
func (i *IndexSnapshot) FieldDictOnly(field string,
onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.OnlyIterator(onlyTerms, includeCount)
})
}
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
results := make(chan *asynchSegmentResult)
for index, segment := range i.segment {
go func(index int, segment *SegmentSnapshot) {
results <- &asynchSegmentResult{
index: index,
docs: segment.DocNumbersLive(),
}
}(index, segment)
}
return i.newDocIDReader(results)
}
func (i *IndexSnapshot) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
results := make(chan *asynchSegmentResult)
for index, segment := range i.segment {
go func(index int, segment *SegmentSnapshot) {
docs, err := segment.DocNumbers(ids)
if err != nil {
results <- &asynchSegmentResult{err: err}
} else {
results <- &asynchSegmentResult{
index: index,
docs: docs,
}
}
}(index, segment)
}
return i.newDocIDReader(results)
}
func (i *IndexSnapshot) newDocIDReader(results chan *asynchSegmentResult) (index.DocIDReader, error) {
rv := &IndexSnapshotDocIDReader{
snapshot: i,
iterators: make([]roaring.IntIterable, len(i.segment)),
}
var err error
for count := 0; count < len(i.segment); count++ {
asr := <-results
if asr.err != nil && err != nil {
err = asr.err
} else {
rv.iterators[asr.index] = asr.docs.Iterator()
}
}
if err != nil {
return nil, err
}
return rv, nil
}
func (i *IndexSnapshot) Fields() ([]string, error) {
// FIXME not making this concurrent for now as it's not used in hot path
// of any searches at the moment (just a debug aid)
fieldsMap := map[string]struct{}{}
for _, segment := range i.segment {
fields := segment.Fields()
for _, field := range fields {
fieldsMap[field] = struct{}{}
}
}
rv := make([]string, 0, len(fieldsMap))
for k := range fieldsMap {
rv = append(rv, k)
}
return rv, nil
}
func (i *IndexSnapshot) GetInternal(key []byte) ([]byte, error) {
return i.internal[string(key)], nil
}
func (i *IndexSnapshot) DocCount() (uint64, error) {
var rv uint64
for _, segment := range i.segment {
rv += segment.Count()
}
return rv, nil
}
func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) {
// FIXME could be done more efficiently directly, but reusing for simplicity
tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false)
if err != nil {
return nil, err
}
defer func() {
if cerr := tfr.Close(); err == nil && cerr != nil {
err = cerr
}
}()
next, err := tfr.Next(nil)
if err != nil {
return nil, err
}
if next == nil {
// no such doc exists
return nil, nil
}
docNum, err := docInternalToNumber(next.ID)
if err != nil {
return nil, err
}
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
rv = document.NewDocument(id)
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool {
if name == "_id" {
return true
}
switch typ {
case 't':
rv.AddField(document.NewTextField(name, pos, value))
case 'n':
rv.AddField(document.NewNumericFieldFromBytes(name, pos, value))
case 'd':
rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value))
case 'b':
rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value))
case 'g':
rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value))
}
return true
})
if err != nil {
return nil, err
}
return rv, nil
}
func (i *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (int, uint64) {
segmentIndex := sort.Search(len(i.offsets),
func(x int) bool {
return i.offsets[x] > docNum
}) - 1
localDocNum := docNum - i.offsets[segmentIndex]
return int(segmentIndex), localDocNum
}
func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
docNum, err := docInternalToNumber(id)
if err != nil {
return "", err
}
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
v, err := i.segment[segmentIndex].DocID(localDocNum)
if err != nil {
return "", err
}
if v == nil {
return "", fmt.Errorf("document number %d not found", docNum)
}
return string(v), nil
}
func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) {
// FIXME could be done more efficiently directly, but reusing for simplicity
tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false)
if err != nil {
return nil, err
}
defer func() {
if cerr := tfr.Close(); err == nil && cerr != nil {
err = cerr
}
}()
next, err := tfr.Next(nil)
if err != nil || next == nil {
return nil, err
}
return next.ID, nil
}
func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
includeNorm, includeTermVectors bool) (tfr index.TermFieldReader, err error) {
rv, dicts := i.allocTermFieldReaderDicts(field)
rv.term = term
rv.field = field
rv.snapshot = i
if rv.postings == nil {
rv.postings = make([]segment.PostingsList, len(i.segment))
}
if rv.iterators == nil {
rv.iterators = make([]segment.PostingsIterator, len(i.segment))
}
rv.segmentOffset = 0
rv.includeFreq = includeFreq
rv.includeNorm = includeNorm
rv.includeTermVectors = includeTermVectors
rv.currPosting = nil
rv.currID = rv.currID[:0]
if dicts == nil {
dicts = make([]segment.TermDictionary, len(i.segment))
for i, segment := range i.segment {
dict, err := segment.Dictionary(field)
if err != nil {
return nil, err
}
dicts[i] = dict
}
}
rv.dicts = dicts
for i := range i.segment {
pl, err := dicts[i].PostingsList(term, nil, rv.postings[i])
if err != nil {
return nil, err
}
rv.postings[i] = pl
rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i])
}
atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1))
return rv, nil
}
func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (
tfr *IndexSnapshotTermFieldReader, dicts []segment.TermDictionary) {
i.m2.Lock()
if i.fieldDicts != nil {
dicts = i.fieldDicts[field]
}
if i.fieldTFRs != nil {
tfrs := i.fieldTFRs[field]
last := len(tfrs) - 1
if last >= 0 {
rv := tfrs[last]
tfrs[last] = nil
i.fieldTFRs[field] = tfrs[:last]
i.m2.Unlock()
return rv, dicts
}
}
i.m2.Unlock()
return &IndexSnapshotTermFieldReader{}, dicts
}
func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) {
i.m2.Lock()
if i.fieldTFRs == nil {
i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{}
}
i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr)
if i.fieldDicts == nil {
i.fieldDicts = map[string][]segment.TermDictionary{}
}
i.fieldDicts[tfr.field] = tfr.dicts
i.m2.Unlock()
}
func docNumberToBytes(buf []byte, in uint64) []byte {
if len(buf) != 8 {
if cap(buf) >= 8 {
buf = buf[0:8]
} else {
buf = make([]byte, 8)
}
}
binary.BigEndian.PutUint64(buf, in)
return buf
}
func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
if len(in) != 8 {
return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in)
}
return binary.BigEndian.Uint64(in), nil
}
func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID,
fields []string, visitor index.DocumentFieldTermVisitor) error {
_, err := i.documentVisitFieldTerms(id, fields, visitor, nil)
return err
}
func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID,
fields []string, visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) (
segment.DocVisitState, error) {
docNum, err := docInternalToNumber(id)
if err != nil {
return nil, err
}
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
if segmentIndex >= len(i.segment) {
return nil, nil
}
ss := i.segment[segmentIndex]
if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok {
// get the list of doc value persisted fields
pFields, err := zaps.VisitableDocValueFields()
if err != nil {
return nil, err
}
// assort the fields for which terms look up have to
// be performed runtime
dvPendingFields := extractDvPendingFields(fields, pFields)
// all fields are doc value persisted
if len(dvPendingFields) == 0 {
return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs)
}
// concurrently trigger the runtime doc value preparations for
// pending fields as well as the visit of the persisted doc values
errCh := make(chan error, 1)
go func() {
defer close(errCh)
err := ss.cachedDocs.prepareFields(dvPendingFields, ss)
if err != nil {
errCh <- err
}
}()
// visit the requested persisted dv while the cache preparation in progress
dvs, err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs)
if err != nil {
return nil, err
}
// err out if fieldCache preparation failed
err = <-errCh
if err != nil {
return nil, err
}
visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor)
return dvs, nil
}
return dvs, prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor)
}
func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string,
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error {
err := ss.cachedDocs.prepareFields(fields, ss)
if err != nil {
return err
}
visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor)
return nil
}
func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string,
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) {
for _, field := range fields {
if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists {
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
for {
i := bytes.Index(tlist, TermSeparatorSplitSlice)
if i < 0 {
break
}
visitor(field, tlist[0:i])
tlist = tlist[i+1:]
}
}
}
}
}
func extractDvPendingFields(requestedFields, persistedFields []string) []string {
removeMap := make(map[string]struct{}, len(persistedFields))
for _, str := range persistedFields {
removeMap[str] = struct{}{}
}
rv := make([]string, 0, len(requestedFields))
for _, s := range requestedFields {
if _, ok := removeMap[s]; !ok {
rv = append(rv, s)
}
}
return rv
}
func (i *IndexSnapshot) DocValueReader(fields []string) (index.DocValueReader, error) {
return &DocValueReader{i: i, fields: fields}, nil
}
type DocValueReader struct {
i *IndexSnapshot
fields []string
dvs segment.DocVisitState
}
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
visitor index.DocumentFieldTermVisitor) (err error) {
dvr.dvs, err = dvr.i.documentVisitFieldTerms(id, dvr.fields, visitor, dvr.dvs)
return err
}
func (i *IndexSnapshot) DumpAll() chan interface{} {
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
}
func (i *IndexSnapshot) DumpDoc(id string) chan interface{} {
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
}
func (i *IndexSnapshot) DumpFields() chan interface{} {
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
}

View file

@ -0,0 +1,93 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"container/heap"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
)
type segmentDictCursor struct {
itr segment.DictionaryIterator
curr index.DictEntry
}
type IndexSnapshotFieldDict struct {
snapshot *IndexSnapshot
cursors []*segmentDictCursor
entry index.DictEntry
}
func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) }
func (i *IndexSnapshotFieldDict) Less(a, b int) bool {
return i.cursors[a].curr.Term < i.cursors[b].curr.Term
}
func (i *IndexSnapshotFieldDict) Swap(a, b int) {
i.cursors[a], i.cursors[b] = i.cursors[b], i.cursors[a]
}
func (i *IndexSnapshotFieldDict) Push(x interface{}) {
i.cursors = append(i.cursors, x.(*segmentDictCursor))
}
func (i *IndexSnapshotFieldDict) Pop() interface{} {
n := len(i.cursors)
x := i.cursors[n-1]
i.cursors = i.cursors[0 : n-1]
return x
}
func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
if len(i.cursors) <= 0 {
return nil, nil
}
i.entry = i.cursors[0].curr
next, err := i.cursors[0].itr.Next()
if err != nil {
return nil, err
}
if next == nil {
// at end of this cursor, remove it
heap.Pop(i)
} else {
// modified heap, fix it
i.cursors[0].curr = *next
heap.Fix(i, 0)
}
// look for any other entries with the exact same term
for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term {
i.entry.Count += i.cursors[0].curr.Count
next, err := i.cursors[0].itr.Next()
if err != nil {
return nil, err
}
if next == nil {
// at end of this cursor, remove it
heap.Pop(i)
} else {
// modified heap, fix it
i.cursors[0].curr = *next
heap.Fix(i, 0)
}
}
return &i.entry, nil
}
func (i *IndexSnapshotFieldDict) Close() error {
return nil
}

View file

@ -0,0 +1,80 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"bytes"
"reflect"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeIndexSnapshotDocIDReader int
func init() {
var isdr IndexSnapshotDocIDReader
reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size())
}
type IndexSnapshotDocIDReader struct {
snapshot *IndexSnapshot
iterators []roaring.IntIterable
segmentOffset int
}
func (i *IndexSnapshotDocIDReader) Size() int {
return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr
}
func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) {
for i.segmentOffset < len(i.iterators) {
if !i.iterators[i.segmentOffset].HasNext() {
i.segmentOffset++
continue
}
next := i.iterators[i.segmentOffset].Next()
// make segment number into global number by adding offset
globalOffset := i.snapshot.offsets[i.segmentOffset]
return docNumberToBytes(nil, uint64(next)+globalOffset), nil
}
return nil, nil
}
func (i *IndexSnapshotDocIDReader) Advance(ID index.IndexInternalID) (index.IndexInternalID, error) {
// FIXME do something better
next, err := i.Next()
if err != nil {
return nil, err
}
if next == nil {
return nil, nil
}
for bytes.Compare(next, ID) < 0 {
next, err = i.Next()
if err != nil {
return nil, err
}
if next == nil {
break
}
}
return next, nil
}
func (i *IndexSnapshotDocIDReader) Close() error {
return nil
}

View file

@ -0,0 +1,185 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"bytes"
"fmt"
"reflect"
"sync/atomic"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeIndexSnapshotTermFieldReader int
func init() {
var istfr IndexSnapshotTermFieldReader
reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size())
}
type IndexSnapshotTermFieldReader struct {
term []byte
field string
snapshot *IndexSnapshot
dicts []segment.TermDictionary
postings []segment.PostingsList
iterators []segment.PostingsIterator
segmentOffset int
includeFreq bool
includeNorm bool
includeTermVectors bool
currPosting segment.Posting
currID index.IndexInternalID
}
func (i *IndexSnapshotTermFieldReader) Size() int {
sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr +
len(i.term) +
len(i.field) +
len(i.currID)
for _, entry := range i.postings {
sizeInBytes += entry.Size()
}
for _, entry := range i.iterators {
sizeInBytes += entry.Size()
}
if i.currPosting != nil {
sizeInBytes += i.currPosting.Size()
}
return sizeInBytes
}
func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
rv := preAlloced
if rv == nil {
rv = &index.TermFieldDoc{}
}
// find the next hit
for i.segmentOffset < len(i.postings) {
next, err := i.iterators[i.segmentOffset].Next()
if err != nil {
return nil, err
}
if next != nil {
// make segment number into global number by adding offset
globalOffset := i.snapshot.offsets[i.segmentOffset]
nnum := next.Number()
rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset)
i.postingToTermFieldDoc(next, rv)
i.currID = rv.ID
i.currPosting = next
return rv, nil
}
i.segmentOffset++
}
return nil, nil
}
func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Posting, rv *index.TermFieldDoc) {
if i.includeFreq {
rv.Freq = next.Frequency()
}
if i.includeNorm {
rv.Norm = next.Norm()
}
if i.includeTermVectors {
locs := next.Locations()
if cap(rv.Vectors) < len(locs) {
rv.Vectors = make([]*index.TermFieldVector, len(locs))
backing := make([]index.TermFieldVector, len(locs))
for i := range backing {
rv.Vectors[i] = &backing[i]
}
}
rv.Vectors = rv.Vectors[:len(locs)]
for i, loc := range locs {
*rv.Vectors[i] = index.TermFieldVector{
Start: loc.Start(),
End: loc.End(),
Pos: loc.Pos(),
ArrayPositions: loc.ArrayPositions(),
Field: loc.Field(),
}
}
}
}
func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
// FIXME do something better
// for now, if we need to seek backwards, then restart from the beginning
if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
i2, err := i.snapshot.TermFieldReader(i.term, i.field,
i.includeFreq, i.includeNorm, i.includeTermVectors)
if err != nil {
return nil, err
}
*i = *(i2.(*IndexSnapshotTermFieldReader))
}
num, err := docInternalToNumber(ID)
if err != nil {
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
}
segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num)
if segIndex >= len(i.snapshot.segment) {
return nil, fmt.Errorf("computed segment index %d out of bounds %d",
segIndex, len(i.snapshot.segment))
}
// skip directly to the target segment
i.segmentOffset = segIndex
next, err := i.iterators[i.segmentOffset].Advance(ldocNum)
if err != nil {
return nil, err
}
if next == nil {
// we jumped directly to the segment that should have contained it
// but it wasn't there, so reuse Next() which should correctly
// get the next hit after it (we moved i.segmentOffset)
return i.Next(preAlloced)
}
if preAlloced == nil {
preAlloced = &index.TermFieldDoc{}
}
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
i.snapshot.offsets[segIndex])
i.postingToTermFieldDoc(next, preAlloced)
i.currID = preAlloced.ID
i.currPosting = next
return preAlloced, nil
}
func (i *IndexSnapshotTermFieldReader) Count() uint64 {
var rv uint64
for _, posting := range i.postings {
rv += posting.Count()
}
return rv
}
func (i *IndexSnapshotTermFieldReader) Close() error {
if i.snapshot != nil {
atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1))
i.snapshot.recycleTermFieldReader(i)
}
return nil
}

View file

@ -0,0 +1,173 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"fmt"
"log"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/boltdb/bolt"
)
type RollbackPoint struct {
epoch uint64
meta map[string][]byte
}
func (r *RollbackPoint) GetInternal(key []byte) []byte {
return r.meta[string(key)]
}
// RollbackPoints returns an array of rollback points available for
// the application to rollback to, with more recent rollback points
// (higher epochs) coming first.
func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) {
if s.rootBolt == nil {
return nil, fmt.Errorf("RollbackPoints: root is nil")
}
// start a read-only bolt transaction
tx, err := s.rootBolt.Begin(false)
if err != nil {
return nil, fmt.Errorf("RollbackPoints: failed to start" +
" read-only transaction")
}
// read-only bolt transactions to be rolled back
defer func() {
_ = tx.Rollback()
}()
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil, nil
}
rollbackPoints := []*RollbackPoint{}
c1 := snapshots.Cursor()
for k, _ := c1.Last(); k != nil; k, _ = c1.Prev() {
_, snapshotEpoch, err := segment.DecodeUvarintAscending(k)
if err != nil {
log.Printf("RollbackPoints:"+
" unable to parse segment epoch %x, continuing", k)
continue
}
snapshot := snapshots.Bucket(k)
if snapshot == nil {
log.Printf("RollbackPoints:"+
" snapshot key, but bucket missing %x, continuing", k)
continue
}
meta := map[string][]byte{}
c2 := snapshot.Cursor()
for j, _ := c2.First(); j != nil; j, _ = c2.Next() {
if j[0] == boltInternalKey[0] {
internalBucket := snapshot.Bucket(j)
err = internalBucket.ForEach(func(key []byte, val []byte) error {
copiedVal := append([]byte(nil), val...)
meta[string(key)] = copiedVal
return nil
})
if err != nil {
break
}
}
}
if err != nil {
log.Printf("RollbackPoints:"+
" failed in fetching internal data: %v", err)
continue
}
rollbackPoints = append(rollbackPoints, &RollbackPoint{
epoch: snapshotEpoch,
meta: meta,
})
}
return rollbackPoints, nil
}
// Rollback atomically and durably (if unsafeBatch is unset) brings
// the store back to the point in time as represented by the
// RollbackPoint. Rollback() should only be passed a RollbackPoint
// that came from the same store using the RollbackPoints() API.
func (s *Scorch) Rollback(to *RollbackPoint) error {
if to == nil {
return fmt.Errorf("Rollback: RollbackPoint is nil")
}
if s.rootBolt == nil {
return fmt.Errorf("Rollback: root is nil")
}
revert := &snapshotReversion{}
s.rootLock.Lock()
err := s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return fmt.Errorf("Rollback: no snapshots available")
}
pos := segment.EncodeUvarintAscending(nil, to.epoch)
snapshot := snapshots.Bucket(pos)
if snapshot == nil {
return fmt.Errorf("Rollback: snapshot not found")
}
indexSnapshot, err := s.loadSnapshot(snapshot)
if err != nil {
return fmt.Errorf("Rollback: unable to load snapshot: %v", err)
}
// add segments referenced by loaded index snapshot to the
// ineligibleForRemoval map
for _, segSnap := range indexSnapshot.segment {
filename := zapFileName(segSnap.id)
s.ineligibleForRemoval[filename] = true
}
revert.snapshot = indexSnapshot
revert.applied = make(chan error)
revert.persisted = make(chan error)
return nil
})
s.rootLock.Unlock()
if err != nil {
return err
}
// introduce the reversion
s.revertToSnapshots <- revert
// block until this snapshot is applied
err = <-revert.applied
if err != nil {
return fmt.Errorf("Rollback: failed with err: %v", err)
}
return <-revert.persisted
}

View file

@ -0,0 +1,272 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"sync"
"sync/atomic"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/size"
)
var TermSeparator byte = 0xff
var TermSeparatorSplitSlice = []byte{TermSeparator}
type SegmentDictionarySnapshot struct {
s *SegmentSnapshot
d segment.TermDictionary
}
func (s *SegmentDictionarySnapshot) PostingsList(term []byte, except *roaring.Bitmap,
prealloc segment.PostingsList) (segment.PostingsList, error) {
// TODO: if except is non-nil, perhaps need to OR it with s.s.deleted?
return s.d.PostingsList(term, s.s.deleted, prealloc)
}
func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator {
return s.d.Iterator()
}
func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator {
return s.d.PrefixIterator(prefix)
}
func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator {
return s.d.RangeIterator(start, end)
}
func (s *SegmentDictionarySnapshot) RegexpIterator(regex string) segment.DictionaryIterator {
return s.d.RegexpIterator(regex)
}
func (s *SegmentDictionarySnapshot) FuzzyIterator(term string,
fuzziness int) segment.DictionaryIterator {
return s.d.FuzzyIterator(term, fuzziness)
}
func (s *SegmentDictionarySnapshot) OnlyIterator(onlyTerms [][]byte,
includeCount bool) segment.DictionaryIterator {
return s.d.OnlyIterator(onlyTerms, includeCount)
}
type SegmentSnapshot struct {
id uint64
segment segment.Segment
deleted *roaring.Bitmap
creator string
cachedDocs *cachedDocs
}
func (s *SegmentSnapshot) Segment() segment.Segment {
return s.segment
}
func (s *SegmentSnapshot) Deleted() *roaring.Bitmap {
return s.deleted
}
func (s *SegmentSnapshot) Id() uint64 {
return s.id
}
func (s *SegmentSnapshot) FullSize() int64 {
return int64(s.segment.Count())
}
func (s SegmentSnapshot) LiveSize() int64 {
return int64(s.Count())
}
func (s *SegmentSnapshot) Close() error {
return s.segment.Close()
}
func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
return s.segment.VisitDocument(num, visitor)
}
func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) {
return s.segment.DocID(num)
}
func (s *SegmentSnapshot) Count() uint64 {
rv := s.segment.Count()
if s.deleted != nil {
rv -= s.deleted.GetCardinality()
}
return rv
}
func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) {
d, err := s.segment.Dictionary(field)
if err != nil {
return nil, err
}
return &SegmentDictionarySnapshot{
s: s,
d: d,
}, nil
}
func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
rv, err := s.segment.DocNumbers(docIDs)
if err != nil {
return nil, err
}
if s.deleted != nil {
rv.AndNot(s.deleted)
}
return rv, nil
}
// DocNumbersLive returns bitsit containing doc numbers for all live docs
func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap {
rv := roaring.NewBitmap()
rv.AddRange(0, s.segment.Count())
if s.deleted != nil {
rv.AndNot(s.deleted)
}
return rv
}
func (s *SegmentSnapshot) Fields() []string {
return s.segment.Fields()
}
func (s *SegmentSnapshot) Size() (rv int) {
rv = s.segment.Size()
if s.deleted != nil {
rv += int(s.deleted.GetSizeInBytes())
}
rv += s.cachedDocs.Size()
return
}
type cachedFieldDocs struct {
readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used.
err error // Non-nil if there was an error when preparing this cachedFieldDocs.
docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF.
size uint64
}
func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) {
defer close(cfd.readyCh)
cfd.size += uint64(size.SizeOfUint64) /* size field */
dict, err := ss.segment.Dictionary(field)
if err != nil {
cfd.err = err
return
}
var postings segment.PostingsList
var postingsItr segment.PostingsIterator
dictItr := dict.Iterator()
next, err := dictItr.Next()
for err == nil && next != nil {
var err1 error
postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings)
if err1 != nil {
cfd.err = err1
return
}
cfd.size += uint64(size.SizeOfUint64) /* map key */
postingsItr = postings.Iterator(false, false, false, postingsItr)
nextPosting, err2 := postingsItr.Next()
for err2 == nil && nextPosting != nil {
docNum := nextPosting.Number()
cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...)
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator)
cfd.size += uint64(len(next.Term) + 1) // map value
nextPosting, err2 = postingsItr.Next()
}
if err2 != nil {
cfd.err = err2
return
}
next, err = dictItr.Next()
}
if err != nil {
cfd.err = err
return
}
}
type cachedDocs struct {
m sync.Mutex // As the cache is asynchronously prepared, need a lock
cache map[string]*cachedFieldDocs // Keyed by field
size uint64
}
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {
c.m.Lock()
if c.cache == nil {
c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields()))
}
for _, field := range wantedFields {
_, exists := c.cache[field]
if !exists {
c.cache[field] = &cachedFieldDocs{
readyCh: make(chan struct{}),
docs: make(map[uint64][]byte),
}
go c.cache[field].prepareFields(field, ss)
}
}
for _, field := range wantedFields {
cachedFieldDocs := c.cache[field]
c.m.Unlock()
<-cachedFieldDocs.readyCh
if cachedFieldDocs.err != nil {
return cachedFieldDocs.err
}
c.m.Lock()
}
c.updateSizeLOCKED()
c.m.Unlock()
return nil
}
func (c *cachedDocs) Size() int {
return int(atomic.LoadUint64(&c.size))
}
func (c *cachedDocs) updateSizeLOCKED() {
sizeInBytes := 0
for k, v := range c.cache { // cachedFieldDocs
sizeInBytes += len(k)
if v != nil {
for _, entry := range v.docs { // docs
sizeInBytes += 8 /* size of uint64 */ + len(entry)
}
}
}
atomic.StoreUint64(&c.size, uint64(sizeInBytes))
}

View file

@ -0,0 +1,129 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"encoding/json"
"reflect"
"sync/atomic"
)
// Stats tracks statistics about the index, fields that are
// prefixed like CurXxxx are gauges (can go up and down),
// and fields that are prefixed like TotXxxx are monotonically
// increasing counters.
type Stats struct {
TotUpdates uint64
TotDeletes uint64
TotBatches uint64
TotBatchesEmpty uint64
TotBatchIntroTime uint64
MaxBatchIntroTime uint64
TotOnErrors uint64
TotAnalysisTime uint64
TotIndexTime uint64
TotIndexedPlainTextBytes uint64
TotTermSearchersStarted uint64
TotTermSearchersFinished uint64
TotIntroduceLoop uint64
TotIntroduceSegmentBeg uint64
TotIntroduceSegmentEnd uint64
TotIntroducePersistBeg uint64
TotIntroducePersistEnd uint64
TotIntroduceMergeBeg uint64
TotIntroduceMergeEnd uint64
TotIntroduceRevertBeg uint64
TotIntroduceRevertEnd uint64
TotIntroducedItems uint64
TotIntroducedSegmentsBatch uint64
TotIntroducedSegmentsMerge uint64
TotPersistLoopBeg uint64
TotPersistLoopErr uint64
TotPersistLoopProgress uint64
TotPersistLoopWait uint64
TotPersistLoopWaitNotified uint64
TotPersistLoopEnd uint64
TotPersistedItems uint64
TotPersistedSegments uint64
TotPersisterSlowMergerPause uint64
TotPersisterSlowMergerResume uint64
TotFileMergeLoopBeg uint64
TotFileMergeLoopErr uint64
TotFileMergeLoopEnd uint64
TotFileMergePlan uint64
TotFileMergePlanErr uint64
TotFileMergePlanNone uint64
TotFileMergePlanOk uint64
TotFileMergePlanTasks uint64
TotFileMergePlanTasksDone uint64
TotFileMergePlanTasksErr uint64
TotFileMergePlanTasksSegments uint64
TotFileMergePlanTasksSegmentsEmpty uint64
TotFileMergeSegmentsEmpty uint64
TotFileMergeSegments uint64
TotFileMergeWrittenBytes uint64
TotFileMergeZapBeg uint64
TotFileMergeZapEnd uint64
TotFileMergeZapTime uint64
MaxFileMergeZapTime uint64
TotFileMergeIntroductions uint64
TotFileMergeIntroductionsDone uint64
TotMemMergeBeg uint64
TotMemMergeErr uint64
TotMemMergeDone uint64
TotMemMergeZapBeg uint64
TotMemMergeZapEnd uint64
TotMemMergeZapTime uint64
MaxMemMergeZapTime uint64
TotMemMergeSegments uint64
}
// atomically populates the returned map
func (s *Stats) ToMap() map[string]interface{} {
m := map[string]interface{}{}
sve := reflect.ValueOf(s).Elem()
svet := sve.Type()
for i := 0; i < svet.NumField(); i++ {
svef := sve.Field(i)
if svef.CanAddr() {
svefp := svef.Addr().Interface()
m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64))
}
}
return m
}
// MarshalJSON implements json.Marshaler, and in contrast to standard
// json marshaling provides atomic safety
func (s *Stats) MarshalJSON() ([]byte, error) {
return json.Marshal(s.ToMap())
}

View file

@ -15,11 +15,20 @@
package upsidedown
import (
"reflect"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
var reflectStaticSizeIndexReader int
func init() {
var ir IndexReader
reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size())
}
type IndexReader struct {
index *UpsideDownCouch
kvreader store.KVReader
@ -201,3 +210,17 @@ func incrementBytes(in []byte) []byte {
}
return rv
}
func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) {
return &DocValueReader{i: i, fields: fields}, nil
}
type DocValueReader struct {
i *IndexReader
fields []string
}
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
visitor index.DocumentFieldTermVisitor) error {
return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor)
}

View file

@ -16,13 +16,27 @@ package upsidedown
import (
"bytes"
"reflect"
"sort"
"sync/atomic"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeUpsideDownCouchTermFieldReader int
var reflectStaticSizeUpsideDownCouchDocIDReader int
func init() {
var tfr UpsideDownCouchTermFieldReader
reflectStaticSizeUpsideDownCouchTermFieldReader =
int(reflect.TypeOf(tfr).Size())
var cdr UpsideDownCouchDocIDReader
reflectStaticSizeUpsideDownCouchDocIDReader =
int(reflect.TypeOf(cdr).Size())
}
type UpsideDownCouchTermFieldReader struct {
count uint64
indexReader *IndexReader
@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct {
includeTermVectors bool
}
func (r *UpsideDownCouchTermFieldReader) Size() int {
sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr +
len(r.term) +
r.tfrPrealloc.Size() +
len(r.keyBuf)
if r.tfrNext != nil {
sizeInBytes += r.tfrNext.Size()
}
return sizeInBytes
}
func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) {
bufNeeded := termFrequencyRowKeySize(term, nil)
if bufNeeded < dictionaryRowKeySize(term) {
@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct {
onlyMode bool
}
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
func (r *UpsideDownCouchDocIDReader) Size() int {
sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader +
reflectStaticSizeIndexReader + size.SizeOfPtr
for _, entry := range r.only {
sizeInBytes += size.SizeOfString + len(entry)
}
return sizeInBytes
}
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
startBytes := []byte{0x0}
endBytes := []byte{0xff}
@ -190,15 +227,18 @@ func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDo
}
func newUpsideDownCouchDocIDReaderOnly(indexReader *IndexReader, ids []string) (*UpsideDownCouchDocIDReader, error) {
// we don't actually own the list of ids, so if before we sort we must copy
idsCopy := make([]string, len(ids))
copy(idsCopy, ids)
// ensure ids are sorted
sort.Strings(ids)
sort.Strings(idsCopy)
startBytes := []byte{0x0}
if len(ids) > 0 {
startBytes = []byte(ids[0])
if len(idsCopy) > 0 {
startBytes = []byte(idsCopy[0])
}
endBytes := []byte{0xff}
if len(ids) > 0 {
endBytes = incrementBytes([]byte(ids[len(ids)-1]))
if len(idsCopy) > 0 {
endBytes = incrementBytes([]byte(idsCopy[len(idsCopy)-1]))
}
bisr := NewBackIndexRow(startBytes, nil, nil)
bier := NewBackIndexRow(endBytes, nil, nil)
@ -207,7 +247,7 @@ func newUpsideDownCouchDocIDReaderOnly(indexReader *IndexReader, ids []string) (
return &UpsideDownCouchDocIDReader{
indexReader: indexReader,
iterator: it,
only: ids,
only: idsCopy,
onlyMode: true,
}, nil
}

View file

@ -20,10 +20,22 @@ import (
"fmt"
"io"
"math"
"reflect"
"github.com/blevesearch/bleve/size"
"github.com/golang/protobuf/proto"
)
var reflectStaticSizeTermFrequencyRow int
var reflectStaticSizeTermVector int
func init() {
var tfr TermFrequencyRow
reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size())
var tv TermVector
reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size())
}
const ByteSeparator byte = 0xff
type UpsideDownCouchRowStream chan UpsideDownCouchRow
@ -358,6 +370,11 @@ type TermVector struct {
end uint64
}
func (tv *TermVector) Size() int {
return reflectStaticSizeTermVector + size.SizeOfPtr +
len(tv.arrayPositions)*size.SizeOfUint64
}
func (tv *TermVector) String() string {
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
}
@ -371,6 +388,18 @@ type TermFrequencyRow struct {
field uint16
}
func (tfr *TermFrequencyRow) Size() int {
sizeInBytes := reflectStaticSizeTermFrequencyRow +
len(tfr.term) +
len(tfr.doc)
for _, entry := range tfr.vectors {
sizeInBytes += entry.Size()
}
return sizeInBytes
}
func (tfr *TermFrequencyRow) Term() []byte {
return tfr.term
}

View file

@ -293,7 +293,7 @@ func (udc *UpsideDownCouch) batchRows(writer store.KVWriter, addRowsAll [][]Upsi
}
func (udc *UpsideDownCouch) Open() (err error) {
//acquire the write mutex for the duratin of Open()
// acquire the write mutex for the duration of Open()
udc.writeMutex.Lock()
defer udc.writeMutex.Unlock()
@ -837,6 +837,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
docBackIndexRowErr = err
return
}
defer func() {
if cerr := kvreader.Close(); err == nil && cerr != nil {
docBackIndexRowErr = cerr
}
}()
for docID, doc := range batch.IndexOps {
backIndexRow, err := backIndexRowForDoc(kvreader, index.IndexInternalID(docID))
@ -847,12 +852,6 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
docBackIndexRowCh <- &docBackIndexRow{docID, doc, backIndexRow}
}
err = kvreader.Close()
if err != nil {
docBackIndexRowErr = err
return
}
}()
// wait for analysis result

View file

@ -15,12 +15,11 @@
package bleve
import (
"context"
"sort"
"sync"
"time"
"golang.org/x/net/context"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"

View file

@ -15,6 +15,7 @@
package bleve
import (
"context"
"encoding/json"
"fmt"
"os"
@ -22,8 +23,6 @@ import (
"sync/atomic"
"time"
"golang.org/x/net/context"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
@ -51,6 +50,12 @@ const storePath = "store"
var mappingInternalKey = []byte("_mapping")
const SearchQueryStartCallbackKey = "_search_query_start_callback_key"
const SearchQueryEndCallbackKey = "_search_query_end_callback_key"
type SearchQueryStartCallbackFn func(size uint64) error
type SearchQueryEndCallbackFn func(size uint64) error
func indexStorePath(path string) string {
return path + string(os.PathSeparator) + storePath
}
@ -253,6 +258,24 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) {
return
}
// IndexAdvanced takes a document.Document object
// skips the mapping and indexes it.
func (i *indexImpl) IndexAdvanced(doc *document.Document) (err error) {
if doc.ID == "" {
return ErrorEmptyID
}
i.mutex.RLock()
defer i.mutex.RUnlock()
if !i.open {
return ErrorIndexClosed
}
err = i.i.Update(doc)
return
}
// Delete entries for the specified identifier from
// the index.
func (i *indexImpl) Delete(id string) (err error) {
@ -345,8 +368,70 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) {
return i.SearchInContext(context.Background(), req)
}
var documentMatchEmptySize int
var searchContextEmptySize int
var facetResultEmptySize int
var documentEmptySize int
func init() {
var dm search.DocumentMatch
documentMatchEmptySize = dm.Size()
var sc search.SearchContext
searchContextEmptySize = sc.Size()
var fr search.FacetResult
facetResultEmptySize = fr.Size()
var d document.Document
documentEmptySize = d.Size()
}
// memNeededForSearch is a helper function that returns an estimate of RAM
// needed to execute a search request.
func memNeededForSearch(req *SearchRequest,
searcher search.Searcher,
topnCollector *collector.TopNCollector) uint64 {
backingSize := req.Size + req.From + 1
if req.Size+req.From > collector.PreAllocSizeSkipCap {
backingSize = collector.PreAllocSizeSkipCap + 1
}
numDocMatches := backingSize + searcher.DocumentMatchPoolSize()
estimate := 0
// overhead, size in bytes from collector
estimate += topnCollector.Size()
// pre-allocing DocumentMatchPool
estimate += searchContextEmptySize + numDocMatches*documentMatchEmptySize
// searcher overhead
estimate += searcher.Size()
// overhead from results, lowestMatchOutsideResults
estimate += (numDocMatches + 1) * documentMatchEmptySize
// additional overhead from SearchResult
estimate += reflectStaticSizeSearchResult + reflectStaticSizeSearchStatus
// overhead from facet results
if req.Facets != nil {
estimate += len(req.Facets) * facetResultEmptySize
}
// highlighting, store
if len(req.Fields) > 0 || req.Highlight != nil {
// Size + From => number of hits
estimate += (req.Size + req.From) * documentEmptySize
}
return uint64(estimate)
}
// SearchInContext executes a search request operation within the provided
// Context. Returns a SearchResult object or an error.
// Context. Returns a SearchResult object or an error.
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) {
i.mutex.RLock()
defer i.mutex.RUnlock()
@ -411,6 +496,24 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
collector.SetFacetsBuilder(facetsBuilder)
}
memNeeded := memNeededForSearch(req, searcher, collector)
if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil {
if cbF, ok := cb.(SearchQueryStartCallbackFn); ok {
err = cbF(memNeeded)
}
}
if err != nil {
return nil, err
}
if cb := ctx.Value(SearchQueryEndCallbackKey); cb != nil {
if cbF, ok := cb.(SearchQueryEndCallbackFn); ok {
defer func() {
_ = cbF(memNeeded)
}()
}
}
err = collector.Collect(ctx, searcher, indexReader)
if err != nil {
return nil, err
@ -442,7 +545,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
doc, err := indexReader.Document(hit.ID)
if err == nil && doc != nil {
if len(req.Fields) > 0 {
for _, f := range req.Fields {
fieldsToLoad := deDuplicate(req.Fields)
for _, f := range fieldsToLoad {
for _, docF := range doc.Fields {
if f == "*" || docF.Name() == f {
var value interface{}
@ -516,9 +620,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
return &SearchResult{
Status: &SearchStatus{
Total: 1,
Failed: 0,
Successful: 1,
Errors: make(map[string]error),
},
Request: req,
Hits: hits,
@ -738,3 +840,16 @@ func (f *indexImplFieldDict) Close() error {
}
return f.indexReader.Close()
}
// helper function to remove duplicate entries from slice of strings
func deDuplicate(fields []string) []string {
entries := make(map[string]struct{})
ret := []string{}
for _, entry := range fields {
if _, exists := entries[entry]; !exists {
entries[entry] = struct{}{}
ret = append(ret, entry)
}
}
return ret
}

View file

@ -15,6 +15,7 @@
package mapping
import (
"encoding"
"encoding/json"
"fmt"
"reflect"
@ -178,6 +179,7 @@ OUTER:
continue OUTER
}
}
break
}
return current
}
@ -481,6 +483,17 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
fieldMapping := newDateTimeFieldMappingDynamic(context.im)
fieldMapping.processTime(property, pathString, path, indexes, context)
}
case encoding.TextMarshaler:
txt, err := property.MarshalText()
if err == nil && subDocMapping != nil {
// index by explicit mapping
for _, fieldMapping := range subDocMapping.Fields {
if fieldMapping.Type == "text" {
fieldMapping.processString(string(txt), pathString, path, indexes, context)
}
}
}
dm.walkDocument(property, path, indexes, context)
default:
if subDocMapping != nil {
for _, fieldMapping := range subDocMapping.Fields {
@ -491,7 +504,7 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
}
dm.walkDocument(property, path, indexes, context)
}
case reflect.Map:
case reflect.Map, reflect.Slice:
if subDocMapping != nil {
for _, fieldMapping := range subDocMapping.Fields {
if fieldMapping.Type == "geopoint" {
@ -500,6 +513,27 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
}
}
dm.walkDocument(property, path, indexes, context)
case reflect.Ptr:
if !propertyValue.IsNil() {
switch property := property.(type) {
case encoding.TextMarshaler:
txt, err := property.MarshalText()
if err == nil && subDocMapping != nil {
// index by explicit mapping
for _, fieldMapping := range subDocMapping.Fields {
if fieldMapping.Type == "text" {
fieldMapping.processString(string(txt), pathString, path, indexes, context)
}
}
} else {
dm.walkDocument(property, path, indexes, context)
}
default:
dm.walkDocument(property, path, indexes, context)
}
}
default:
dm.walkDocument(property, path, indexes, context)
}

View file

@ -26,8 +26,9 @@ import (
// control the default behavior for dynamic fields (those not explicitly mapped)
var (
IndexDynamic = true
StoreDynamic = true
IndexDynamic = true
StoreDynamic = true
DocValuesDynamic = true // TODO revisit default?
)
// A FieldMapping describes how a specific item
@ -54,6 +55,10 @@ type FieldMapping struct {
IncludeTermVectors bool `json:"include_term_vectors,omitempty"`
IncludeInAll bool `json:"include_in_all,omitempty"`
DateFormat string `json:"date_format,omitempty"`
// DocValues, if true makes the index uninverting possible for this field
// It is useful for faceting and sorting queries.
DocValues bool `json:"docvalues,omitempty"`
}
// NewTextFieldMapping returns a default field mapping for text
@ -64,6 +69,7 @@ func NewTextFieldMapping() *FieldMapping {
Index: true,
IncludeTermVectors: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -71,6 +77,7 @@ func newTextFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
rv := NewTextFieldMapping()
rv.Store = im.StoreDynamic
rv.Index = im.IndexDynamic
rv.DocValues = im.DocValuesDynamic
return rv
}
@ -81,6 +88,7 @@ func NewNumericFieldMapping() *FieldMapping {
Store: true,
Index: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -88,6 +96,7 @@ func newNumericFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
rv := NewNumericFieldMapping()
rv.Store = im.StoreDynamic
rv.Index = im.IndexDynamic
rv.DocValues = im.DocValuesDynamic
return rv
}
@ -98,6 +107,7 @@ func NewDateTimeFieldMapping() *FieldMapping {
Store: true,
Index: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -105,6 +115,7 @@ func newDateTimeFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
rv := NewDateTimeFieldMapping()
rv.Store = im.StoreDynamic
rv.Index = im.IndexDynamic
rv.DocValues = im.DocValuesDynamic
return rv
}
@ -115,6 +126,7 @@ func NewBooleanFieldMapping() *FieldMapping {
Store: true,
Index: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -122,6 +134,7 @@ func newBooleanFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
rv := NewBooleanFieldMapping()
rv.Store = im.StoreDynamic
rv.Index = im.IndexDynamic
rv.DocValues = im.DocValuesDynamic
return rv
}
@ -132,6 +145,7 @@ func NewGeoPointFieldMapping() *FieldMapping {
Store: true,
Index: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -147,6 +161,9 @@ func (fm *FieldMapping) Options() document.IndexingOptions {
if fm.IncludeTermVectors {
rv |= document.IncludeTermVectors
}
if fm.DocValues {
rv |= document.DocValues
}
return rv
}
@ -308,6 +325,11 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error {
if err != nil {
return err
}
case "docvalues":
err := json.Unmarshal(v, &fm.DocValues)
if err != nil {
return err
}
default:
invalidKeys = append(invalidKeys, k)
}

View file

@ -50,6 +50,7 @@ type IndexMappingImpl struct {
DefaultField string `json:"default_field"`
StoreDynamic bool `json:"store_dynamic"`
IndexDynamic bool `json:"index_dynamic"`
DocValuesDynamic bool `json:"docvalues_dynamic,omitempty"`
CustomAnalysis *customAnalysis `json:"analysis,omitempty"`
cache *registry.Cache
}
@ -154,6 +155,7 @@ func NewIndexMapping() *IndexMappingImpl {
DefaultField: defaultField,
IndexDynamic: IndexDynamic,
StoreDynamic: StoreDynamic,
DocValuesDynamic: DocValuesDynamic,
CustomAnalysis: newCustomAnalysis(),
cache: registry.NewCache(),
}
@ -217,6 +219,7 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
im.TypeMapping = make(map[string]*DocumentMapping)
im.StoreDynamic = StoreDynamic
im.IndexDynamic = IndexDynamic
im.DocValuesDynamic = DocValuesDynamic
var invalidKeys []string
for k, v := range tmp {
@ -271,6 +274,11 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
if err != nil {
return err
}
case "docvalues_dynamic":
err := json.Unmarshal(v, &im.DocValuesDynamic)
if err != nil {
return err
}
default:
invalidKeys = append(invalidKeys, k)
}
@ -289,7 +297,12 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
}
func (im *IndexMappingImpl) determineType(data interface{}) string {
// first see if the object implements Classifier
// first see if the object implements bleveClassifier
bleveClassifier, ok := data.(bleveClassifier)
if ok {
return bleveClassifier.BleveType()
}
// next see if the object implements Classifier
classifier, ok := data.(Classifier)
if ok {
return classifier.Type()
@ -313,7 +326,7 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}
// see if the _all field was disabled
allMapping := docMapping.documentMappingForPath("_all")
if allMapping == nil || (allMapping.Enabled != false) {
if allMapping == nil || allMapping.Enabled {
field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, walkContext.excludedFromAll, document.IndexField|document.IncludeTermVectors)
doc.AddField(field)
}
@ -334,7 +347,7 @@ func (im *IndexMappingImpl) newWalkContext(doc *document.Document, dm *DocumentM
doc: doc,
im: im,
dm: dm,
excludedFromAll: []string{},
excludedFromAll: []string{"_id"},
}
}

View file

@ -22,12 +22,21 @@ import (
"github.com/blevesearch/bleve/document"
)
// A Classifier is an interface describing any object
// which knows how to identify its own type.
// A Classifier is an interface describing any object which knows how to
// identify its own type. Alternatively, if a struct already has a Type
// field or method in conflict, one can use BleveType instead.
type Classifier interface {
Type() string
}
// A bleveClassifier is an interface describing any object which knows how
// to identify its own type. This is introduced as an alternative to the
// Classifier interface which often has naming conflicts with existing
// structures.
type bleveClassifier interface {
BleveType() string
}
var logger = log.New(ioutil.Discard, "bleve mapping ", log.LstdFlags)
// SetLog sets the logger used for logging

View file

@ -209,8 +209,8 @@ func NewGeoBoundingBoxQuery(topLeftLon, topLeftLat, bottomRightLon, bottomRightL
return query.NewGeoBoundingBoxQuery(topLeftLon, topLeftLat, bottomRightLon, bottomRightLat)
}
// NewGeoDistanceQuery creates a new Query for performing geo bounding
// box searches. The arguments describe a position and a distance. Documents
// NewGeoDistanceQuery creates a new Query for performing geo distance
// searches. The arguments describe a position and a distance. Documents
// which have an indexed geo point which is less than or equal to the provided
// distance from the given position will be returned.
func NewGeoDistanceQuery(lon, lat float64, distance string) *query.GeoDistanceQuery {

View file

@ -17,15 +17,29 @@ package bleve
import (
"encoding/json"
"fmt"
"reflect"
"time"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/datetime/optional"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/collector"
"github.com/blevesearch/bleve/search/query"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeSearchResult int
var reflectStaticSizeSearchStatus int
func init() {
var sr SearchResult
reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size())
var ss SearchStatus
reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size())
}
var cache = registry.NewCache()
const defaultDateTimeParser = optional.Name
@ -170,6 +184,16 @@ func (fr *FacetRequest) AddDateTimeRange(name string, start, end time.Time) {
fr.DateTimeRanges = append(fr.DateTimeRanges, &dateTimeRange{Name: name, Start: start, End: end})
}
// AddDateTimeRangeString adds a bucket to a field
// containing date values.
func (fr *FacetRequest) AddDateTimeRangeString(name string, start, end *string) {
if fr.DateTimeRanges == nil {
fr.DateTimeRanges = make([]*dateTimeRange, 0, 1)
}
fr.DateTimeRanges = append(fr.DateTimeRanges,
&dateTimeRange{Name: name, startString: start, endString: end})
}
// AddNumericRange adds a bucket to a field
// containing numeric values. Documents with a
// numeric value falling into this range are
@ -422,6 +446,24 @@ type SearchResult struct {
Facets search.FacetResults `json:"facets"`
}
func (sr *SearchResult) Size() int {
sizeInBytes := reflectStaticSizeSearchResult + size.SizeOfPtr +
reflectStaticSizeSearchStatus
for _, entry := range sr.Hits {
if entry != nil {
sizeInBytes += entry.Size()
}
}
for k, v := range sr.Facets {
sizeInBytes += size.SizeOfString + len(k) +
v.Size()
}
return sizeInBytes
}
func (sr *SearchResult) String() string {
rv := ""
if sr.Total > 0 {
@ -471,5 +513,51 @@ func (sr *SearchResult) Merge(other *SearchResult) {
if other.MaxScore > sr.MaxScore {
sr.MaxScore = other.MaxScore
}
if sr.Facets == nil && len(other.Facets) != 0 {
sr.Facets = other.Facets
return
}
sr.Facets.Merge(other.Facets)
}
// MemoryNeededForSearchResult is an exported helper function to determine the RAM
// needed to accommodate the results for a given search request.
func MemoryNeededForSearchResult(req *SearchRequest) uint64 {
if req == nil {
return 0
}
numDocMatches := req.Size + req.From
if req.Size+req.From > collector.PreAllocSizeSkipCap {
numDocMatches = collector.PreAllocSizeSkipCap
}
estimate := 0
// overhead from the SearchResult structure
var sr SearchResult
estimate += sr.Size()
var dm search.DocumentMatch
sizeOfDocumentMatch := dm.Size()
// overhead from results
estimate += numDocMatches * sizeOfDocumentMatch
// overhead from facet results
if req.Facets != nil {
var fr search.FacetResult
estimate += len(req.Facets) * fr.Size()
}
// highlighting, store
var d document.Document
if len(req.Fields) > 0 || req.Highlight != nil {
for i := 0; i < (req.Size + req.From); i++ {
estimate += (req.Size + req.From) * d.Size()
}
}
return uint64(estimate)
}

View file

@ -15,11 +15,10 @@
package search
import (
"context"
"time"
"github.com/blevesearch/bleve/index"
"golang.org/x/net/context"
)
type Collector interface {

View file

@ -34,11 +34,20 @@ func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap {
return rv
}
func (c *collectStoreHeap) Add(doc *search.DocumentMatch) {
func (c *collectStoreHeap) AddNotExceedingSize(doc *search.DocumentMatch,
size int) *search.DocumentMatch {
c.add(doc)
if c.Len() > size {
return c.removeLast()
}
return nil
}
func (c *collectStoreHeap) add(doc *search.DocumentMatch) {
heap.Push(c, doc)
}
func (c *collectStoreHeap) RemoveLast() *search.DocumentMatch {
func (c *collectStoreHeap) removeLast() *search.DocumentMatch {
return heap.Pop(c).(*search.DocumentMatch)
}
@ -49,17 +58,12 @@ func (c *collectStoreHeap) Final(skip int, fixup collectorFixup) (search.Documen
return make(search.DocumentMatchCollection, 0), nil
}
rv := make(search.DocumentMatchCollection, size)
for count > 0 {
count--
if count >= skip {
size--
doc := heap.Pop(c).(*search.DocumentMatch)
rv[size] = doc
err := fixup(doc)
if err != nil {
return nil, err
}
for i := size - 1; i >= 0; i-- {
doc := heap.Pop(c).(*search.DocumentMatch)
rv[i] = doc
err := fixup(doc)
if err != nil {
return nil, err
}
}
return rv, nil

View file

@ -34,7 +34,16 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList {
return rv
}
func (c *collectStoreList) Add(doc *search.DocumentMatch) {
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch,
size int) *search.DocumentMatch {
c.add(doc)
if c.len() > size {
return c.removeLast()
}
return nil
}
func (c *collectStoreList) add(doc *search.DocumentMatch) {
for e := c.results.Front(); e != nil; e = e.Next() {
curr := e.Value.(*search.DocumentMatch)
if c.compare(doc, curr) >= 0 {
@ -46,7 +55,7 @@ func (c *collectStoreList) Add(doc *search.DocumentMatch) {
c.results.PushBack(doc)
}
func (c *collectStoreList) RemoveLast() *search.DocumentMatch {
func (c *collectStoreList) removeLast() *search.DocumentMatch {
return c.results.Remove(c.results.Front()).(*search.DocumentMatch)
}
@ -73,6 +82,6 @@ func (c *collectStoreList) Final(skip int, fixup collectorFixup) (search.Documen
return search.DocumentMatchCollection{}, nil
}
func (c *collectStoreList) Len() int {
func (c *collectStoreList) len() int {
return c.results.Len()
}

View file

@ -29,7 +29,16 @@ func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice {
return rv
}
func (c *collectStoreSlice) Add(doc *search.DocumentMatch) {
func (c *collectStoreSlice) AddNotExceedingSize(doc *search.DocumentMatch,
size int) *search.DocumentMatch {
c.add(doc)
if c.len() > size {
return c.removeLast()
}
return nil
}
func (c *collectStoreSlice) add(doc *search.DocumentMatch) {
// find where to insert, starting at end (lowest)
i := len(c.slice)
for ; i > 0; i-- {
@ -44,7 +53,7 @@ func (c *collectStoreSlice) Add(doc *search.DocumentMatch) {
c.slice[i] = doc
}
func (c *collectStoreSlice) RemoveLast() *search.DocumentMatch {
func (c *collectStoreSlice) removeLast() *search.DocumentMatch {
var rv *search.DocumentMatch
rv, c.slice = c.slice[len(c.slice)-1], c.slice[:len(c.slice)-1]
return rv
@ -63,6 +72,6 @@ func (c *collectStoreSlice) Final(skip int, fixup collectorFixup) (search.Docume
return search.DocumentMatchCollection{}, nil
}
func (c *collectStoreSlice) Len() int {
func (c *collectStoreSlice) len() int {
return len(c.slice)
}

View file

@ -15,13 +15,31 @@
package collector
import (
"context"
"reflect"
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"golang.org/x/net/context"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeTopNCollector int
func init() {
var coll TopNCollector
reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size())
}
type collectorStore interface {
// Add the document, and if the new store size exceeds the provided size
// the last element is removed and returned. If the size has not been
// exceeded, nil is returned.
AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch
Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error)
}
// PreAllocSizeSkipCap will cap preallocation to this amount when
// size+skip exceeds this value
var PreAllocSizeSkipCap = 1000
@ -41,7 +59,7 @@ type TopNCollector struct {
results search.DocumentMatchCollection
facetsBuilder *search.FacetsBuilder
store *collectStoreHeap
store collectorStore
needDocIds bool
neededFields []string
@ -49,6 +67,8 @@ type TopNCollector struct {
cachedDesc []bool
lowestMatchOutsideResults *search.DocumentMatch
updateFieldVisitor index.DocumentFieldTermVisitor
dvReader index.DocValueReader
}
// CheckDoneEvery controls how frequently we check the context deadline
@ -68,9 +88,15 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector
backingSize = PreAllocSizeSkipCap + 1
}
hc.store = newStoreHeap(backingSize, func(i, j *search.DocumentMatch) int {
return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
})
if size+skip > 10 {
hc.store = newStoreHeap(backingSize, func(i, j *search.DocumentMatch) int {
return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
})
} else {
hc.store = newStoreSlice(backingSize, func(i, j *search.DocumentMatch) int {
return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
})
}
// these lookups traverse an interface, so do once up-front
if sort.RequiresDocID() {
@ -83,6 +109,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector
return hc
}
func (hc *TopNCollector) Size() int {
sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr
if hc.facetsBuilder != nil {
sizeInBytes += hc.facetsBuilder.Size()
}
for _, entry := range hc.neededFields {
sizeInBytes += len(entry) + size.SizeOfString
}
sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc)
return sizeInBytes
}
// Collect goes to the index to find the matching documents
func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
startTime := time.Now()
@ -100,6 +142,18 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
}
hc.dvReader, err = reader.DocValueReader(hc.neededFields)
if err != nil {
return err
}
hc.updateFieldVisitor = func(field string, term []byte) {
if hc.facetsBuilder != nil {
hc.facetsBuilder.UpdateVisitor(field, term)
}
hc.sort.UpdateVisitor(field, term)
}
select {
case <-ctx.Done():
return ctx.Err()
@ -184,9 +238,8 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
}
}
hc.store.Add(d)
if hc.store.Len() > hc.size+hc.skip {
removed := hc.store.RemoveLast()
removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip)
if removed != nil {
if hc.lowestMatchOutsideResults == nil {
hc.lowestMatchOutsideResults = removed
} else {
@ -209,13 +262,7 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc
hc.facetsBuilder.StartDoc()
}
err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) {
if hc.facetsBuilder != nil {
hc.facetsBuilder.UpdateVisitor(field, term)
}
hc.sort.UpdateVisitor(field, term)
})
err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor)
if hc.facetsBuilder != nil {
hc.facetsBuilder.EndDoc()
}
@ -243,6 +290,7 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
return err
}
}
doc.Complete(nil)
return nil
})
@ -274,5 +322,5 @@ func (hc *TopNCollector) FacetResults() search.FacetResults {
if hc.facetsBuilder != nil {
return hc.facetsBuilder.Results()
}
return search.FacetResults{}
return nil
}

View file

@ -17,8 +17,18 @@ package search
import (
"encoding/json"
"fmt"
"reflect"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeExplanation int
func init() {
var e Explanation
reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size())
}
type Explanation struct {
Value float64 `json:"value"`
Message string `json:"message"`
@ -32,3 +42,14 @@ func (expl *Explanation) String() string {
}
return string(js)
}
func (expl *Explanation) Size() int {
sizeInBytes := reflectStaticSizeExplanation + size.SizeOfPtr +
len(expl.Message)
for _, entry := range expl.Children {
sizeInBytes += entry.Size()
}
return sizeInBytes
}

View file

@ -15,13 +15,25 @@
package facet
import (
"reflect"
"sort"
"time"
"github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeDateTimeFacetBuilder int
var reflectStaticSizedateTimeRange int
func init() {
var dtfb DateTimeFacetBuilder
reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(dtfb).Size())
var dtr dateTimeRange
reflectStaticSizedateTimeRange = int(reflect.TypeOf(dtr).Size())
}
type dateTimeRange struct {
start time.Time
end time.Time
@ -46,6 +58,23 @@ func NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder {
}
}
func (fb *DateTimeFacetBuilder) Size() int {
sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr +
len(fb.field)
for k, _ := range fb.termsCount {
sizeInBytes += size.SizeOfString + len(k) +
size.SizeOfInt
}
for k, _ := range fb.ranges {
sizeInBytes += size.SizeOfString + len(k) +
size.SizeOfPtr + reflectStaticSizedateTimeRange
}
return sizeInBytes
}
func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) {
r := dateTimeRange{
start: start,

View file

@ -15,12 +15,24 @@
package facet
import (
"reflect"
"sort"
"github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeNumericFacetBuilder int
var reflectStaticSizenumericRange int
func init() {
var nfb NumericFacetBuilder
reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(nfb).Size())
var nr numericRange
reflectStaticSizenumericRange = int(reflect.TypeOf(nr).Size())
}
type numericRange struct {
min *float64
max *float64
@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder {
}
}
func (fb *NumericFacetBuilder) Size() int {
sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr +
len(fb.field)
for k, _ := range fb.termsCount {
sizeInBytes += size.SizeOfString + len(k) +
size.SizeOfInt
}
for k, _ := range fb.ranges {
sizeInBytes += size.SizeOfString + len(k) +
size.SizeOfPtr + reflectStaticSizenumericRange
}
return sizeInBytes
}
func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) {
r := numericRange{
min: min,

View file

@ -15,11 +15,20 @@
package facet
import (
"reflect"
"sort"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeTermsFacetBuilder int
func init() {
var tfb TermsFacetBuilder
reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(tfb).Size())
}
type TermsFacetBuilder struct {
size int
field string
@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder {
}
}
func (fb *TermsFacetBuilder) Size() int {
sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr +
len(fb.field)
for k, _ := range fb.termsCount {
sizeInBytes += size.SizeOfString + len(k) +
size.SizeOfInt
}
return sizeInBytes
}
func (fb *TermsFacetBuilder) Field() string {
return fb.field
}

View file

@ -15,11 +15,32 @@
package search
import (
"reflect"
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeFacetsBuilder int
var reflectStaticSizeFacetResult int
var reflectStaticSizeTermFacet int
var reflectStaticSizeNumericRangeFacet int
var reflectStaticSizeDateRangeFacet int
func init() {
var fb FacetsBuilder
reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(fb).Size())
var fr FacetResult
reflectStaticSizeFacetResult = int(reflect.TypeOf(fr).Size())
var tf TermFacet
reflectStaticSizeTermFacet = int(reflect.TypeOf(tf).Size())
var nrf NumericRangeFacet
reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(nrf).Size())
var drf DateRangeFacet
reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(drf).Size())
}
type FacetBuilder interface {
StartDoc()
UpdateVisitor(field string, term []byte)
@ -27,6 +48,8 @@ type FacetBuilder interface {
Result() *FacetResult
Field() string
Size() int
}
type FacetsBuilder struct {
@ -42,6 +65,21 @@ func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder {
}
}
func (fb *FacetsBuilder) Size() int {
sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr
for k, v := range fb.facets {
sizeInBytes += size.SizeOfString + len(k) +
v.Size()
}
for _, entry := range fb.fields {
sizeInBytes += size.SizeOfString + len(entry)
}
return sizeInBytes
}
func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) {
fb.facets[name] = facetBuilder
fb.fields = append(fb.fields, facetBuilder.Field())
@ -213,6 +251,14 @@ type FacetResult struct {
DateRanges DateRangeFacets `json:"date_ranges,omitempty"`
}
func (fr *FacetResult) Size() int {
return reflectStaticSizeFacetResult + size.SizeOfPtr +
len(fr.Field) +
len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) +
len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) +
len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr)
}
func (fr *FacetResult) Merge(other *FacetResult) {
fr.Total += other.Total
fr.Missing += other.Missing

View file

@ -57,15 +57,24 @@ func LevenshteinDistance(a, b string) int {
// in which case the first return val will be the max
// and the second will be true, indicating max was exceeded
func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
v, wasMax, _ := LevenshteinDistanceMaxReuseSlice(a, b, max, nil)
return v, wasMax
}
func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) {
la := len(a)
lb := len(b)
ld := int(math.Abs(float64(la - lb)))
if ld > max {
return max, true
return max, true, d
}
d := make([]int, la+1)
if cap(d) < la+1 {
d = make([]int, la+1)
}
d = d[:la+1]
var lastdiag, olddiag, temp int
for i := 1; i <= la; i++ {
@ -98,8 +107,8 @@ func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
}
// after each row if rowmin isn't less than max stop
if rowmin > max {
return max, true
return max, true, d
}
}
return d[la], false
return d[la], false, d
}

View file

@ -14,6 +14,17 @@
package search
import (
"reflect"
)
var reflectStaticSizeDocumentMatchPool int
func init() {
var dmp DocumentMatchPool
reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size())
}
// DocumentMatchPoolTooSmall is a callback function that can be executed
// when the DocumentMatchPool does not have sufficient capacity
// By default we just perform just-in-time allocation, but you could log

View file

@ -43,6 +43,10 @@ func (q *QueryStringQuery) Boost() float64 {
return q.BoostVal.Value()
}
func (q *QueryStringQuery) Parse() (Query, error) {
return parseQuerySyntax(q.Query)
}
func (q *QueryStringQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
newQuery, err := parseQuerySyntax(q.Query)
if err != nil {

View file

@ -15,13 +15,27 @@
package scorer
import (
"reflect"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeConjunctionQueryScorer int
func init() {
var cqs ConjunctionQueryScorer
reflectStaticSizeConjunctionQueryScorer = int(reflect.TypeOf(cqs).Size())
}
type ConjunctionQueryScorer struct {
options search.SearcherOptions
}
func (s *ConjunctionQueryScorer) Size() int {
return reflectStaticSizeConjunctionQueryScorer + size.SizeOfPtr
}
func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer {
return &ConjunctionQueryScorer{
options: options,
@ -35,15 +49,11 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
childrenExplanations = make([]*search.Explanation, len(constituents))
}
locations := []search.FieldTermLocationMap{}
for i, docMatch := range constituents {
sum += docMatch.Score
if s.options.Explain {
childrenExplanations[i] = docMatch.Expl
}
if docMatch.Locations != nil {
locations = append(locations, docMatch.Locations)
}
}
newScore := sum
var newExpl *search.Explanation
@ -55,11 +65,8 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
rv := constituents[0]
rv.Score = newScore
rv.Expl = newExpl
if len(locations) == 1 {
rv.Locations = locations[0]
} else if len(locations) > 1 {
rv.Locations = search.MergeLocations(locations)
}
rv.FieldTermLocations = search.MergeFieldTermLocations(
rv.FieldTermLocations, constituents[1:])
return rv
}

View file

@ -16,11 +16,20 @@ package scorer
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeConstantScorer int
func init() {
var cs ConstantScorer
reflectStaticSizeConstantScorer = int(reflect.TypeOf(cs).Size())
}
type ConstantScorer struct {
constant float64
boost float64
@ -30,6 +39,16 @@ type ConstantScorer struct {
queryWeightExplanation *search.Explanation
}
func (s *ConstantScorer) Size() int {
sizeInBytes := reflectStaticSizeConstantScorer + size.SizeOfPtr
if s.queryWeightExplanation != nil {
sizeInBytes += s.queryWeightExplanation.Size()
}
return sizeInBytes
}
func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer {
rv := ConstantScorer{
options: options,

View file

@ -16,14 +16,27 @@ package scorer
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeDisjunctionQueryScorer int
func init() {
var dqs DisjunctionQueryScorer
reflectStaticSizeDisjunctionQueryScorer = int(reflect.TypeOf(dqs).Size())
}
type DisjunctionQueryScorer struct {
options search.SearcherOptions
}
func (s *DisjunctionQueryScorer) Size() int {
return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr
}
func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer {
return &DisjunctionQueryScorer{
options: options,
@ -37,15 +50,11 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
childrenExplanations = make([]*search.Explanation, len(constituents))
}
var locations []search.FieldTermLocationMap
for i, docMatch := range constituents {
sum += docMatch.Score
if s.options.Explain {
childrenExplanations[i] = docMatch.Expl
}
if docMatch.Locations != nil {
locations = append(locations, docMatch.Locations)
}
}
var rawExpl *search.Explanation
@ -67,11 +76,8 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
rv := constituents[0]
rv.Score = newScore
rv.Expl = newExpl
if len(locations) == 1 {
rv.Locations = locations[0]
} else if len(locations) > 1 {
rv.Locations = search.MergeLocations(locations)
}
rv.FieldTermLocations = search.MergeFieldTermLocations(
rv.FieldTermLocations, constituents[1:])
return rv
}

View file

@ -17,13 +17,22 @@ package scorer
import (
"fmt"
"math"
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeTermQueryScorer int
func init() {
var tqs TermQueryScorer
reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size())
}
type TermQueryScorer struct {
queryTerm []byte
queryTerm string
queryField string
queryBoost float64
docTerm uint64
@ -36,9 +45,24 @@ type TermQueryScorer struct {
queryWeightExplanation *search.Explanation
}
func (s *TermQueryScorer) Size() int {
sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr +
len(s.queryTerm) + len(s.queryField)
if s.idfExplanation != nil {
sizeInBytes += s.idfExplanation.Size()
}
if s.queryWeightExplanation != nil {
sizeInBytes += s.queryWeightExplanation.Size()
}
return sizeInBytes
}
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer {
rv := TermQueryScorer{
queryTerm: queryTerm,
queryTerm: string(queryTerm),
queryField: queryField,
queryBoost: queryBoost,
docTerm: docTerm,
@ -82,7 +106,7 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
}
s.queryWeightExplanation = &search.Explanation{
Value: s.queryWeight,
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, string(s.queryTerm), s.queryBoost),
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost),
Children: childrenExplanations,
}
}
@ -104,7 +128,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
childrenExplanations := make([]*search.Explanation, 3)
childrenExplanations[0] = &search.Explanation{
Value: tf,
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, string(s.queryTerm), termMatch.Freq),
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
}
childrenExplanations[1] = &search.Explanation{
Value: termMatch.Norm,
@ -113,7 +137,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
childrenExplanations[2] = s.idfExplanation
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, string(s.queryTerm), termMatch.ID),
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID),
Children: childrenExplanations,
}
}
@ -127,7 +151,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
childExplanations[1] = scoreExplanation
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, string(s.queryTerm), s.queryBoost, termMatch.ID),
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID),
Children: childExplanations,
}
}
@ -140,41 +164,31 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
rv.Expl = scoreExplanation
}
if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 {
locs := make([]search.Location, len(termMatch.Vectors))
locsUsed := 0
totalPositions := 0
for _, v := range termMatch.Vectors {
totalPositions += len(v.ArrayPositions)
if len(termMatch.Vectors) > 0 {
if cap(rv.FieldTermLocations) < len(termMatch.Vectors) {
rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors))
}
positions := make(search.ArrayPositions, totalPositions)
positionsUsed := 0
rv.Locations = make(search.FieldTermLocationMap)
for _, v := range termMatch.Vectors {
tlm := rv.Locations[v.Field]
if tlm == nil {
tlm = make(search.TermLocationMap)
rv.Locations[v.Field] = tlm
}
loc := &locs[locsUsed]
locsUsed++
loc.Pos = v.Pos
loc.Start = v.Start
loc.End = v.End
var ap search.ArrayPositions
if len(v.ArrayPositions) > 0 {
loc.ArrayPositions = positions[positionsUsed : positionsUsed+len(v.ArrayPositions)]
for i, ap := range v.ArrayPositions {
loc.ArrayPositions[i] = ap
n := len(rv.FieldTermLocations)
if n < cap(rv.FieldTermLocations) { // reuse ap slice if available
ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0]
}
positionsUsed += len(v.ArrayPositions)
ap = append(ap, v.ArrayPositions...)
}
tlm[string(s.queryTerm)] = append(tlm[string(s.queryTerm)], loc)
rv.FieldTermLocations =
append(rv.FieldTermLocations, search.FieldTermLocation{
Field: v.Field,
Term: s.queryTerm,
Location: search.Location{
Pos: v.Pos,
Start: v.Start,
End: v.End,
ArrayPositions: ap,
},
})
}
}

View file

@ -16,11 +16,26 @@ package search
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeDocumentMatch int
var reflectStaticSizeSearchContext int
var reflectStaticSizeLocation int
func init() {
var dm DocumentMatch
reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size())
var sc SearchContext
reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size())
var l Location
reflectStaticSizeLocation = int(reflect.TypeOf(l).Size())
}
type ArrayPositions []uint64
func (ap ArrayPositions) Equals(other ArrayPositions) bool {
@ -36,12 +51,22 @@ func (ap ArrayPositions) Equals(other ArrayPositions) bool {
}
type Location struct {
Pos uint64 `json:"pos"`
Start uint64 `json:"start"`
End uint64 `json:"end"`
// Pos is the position of the term within the field, starting at 1
Pos uint64 `json:"pos"`
// Start and End are the byte offsets of the term in the field
Start uint64 `json:"start"`
End uint64 `json:"end"`
// ArrayPositions contains the positions of the term within any elements.
ArrayPositions ArrayPositions `json:"array_positions"`
}
func (l *Location) Size() int {
return reflectStaticSizeLocation + size.SizeOfPtr +
len(l.ArrayPositions)*size.SizeOfUint64
}
type Locations []*Location
type TermLocationMap map[string]Locations
@ -52,6 +77,12 @@ func (t TermLocationMap) AddLocation(term string, location *Location) {
type FieldTermLocationMap map[string]TermLocationMap
type FieldTermLocation struct {
Field string
Term string
Location Location
}
type FieldFragmentMap map[string][]string
type DocumentMatch struct {
@ -74,6 +105,12 @@ type DocumentMatch struct {
// used to maintain natural index order
HitNumber uint64 `json:"-"`
// used to temporarily hold field term location information during
// search processing in an efficient, recycle-friendly manner, to
// be later incorporated into the Locations map when search
// results are completed
FieldTermLocations []FieldTermLocation `json:"-"`
}
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {
@ -103,15 +140,120 @@ func (dm *DocumentMatch) Reset() *DocumentMatch {
indexInternalID := dm.IndexInternalID
// remember the []interface{} used for sort
sort := dm.Sort
// remember the FieldTermLocations backing array
ftls := dm.FieldTermLocations
for i := range ftls { // recycle the ArrayPositions of each location
ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0]
}
// idiom to copy over from empty DocumentMatch (0 allocations)
*dm = DocumentMatch{}
// reuse the []byte already allocated (and reset len to 0)
dm.IndexInternalID = indexInternalID[:0]
// reuse the []interface{} already allocated (and reset len to 0)
dm.Sort = sort[:0]
// reuse the FieldTermLocations already allocated (and reset len to 0)
dm.FieldTermLocations = ftls[:0]
return dm
}
func (dm *DocumentMatch) Size() int {
sizeInBytes := reflectStaticSizeDocumentMatch + size.SizeOfPtr +
len(dm.Index) +
len(dm.ID) +
len(dm.IndexInternalID)
if dm.Expl != nil {
sizeInBytes += dm.Expl.Size()
}
for k, v := range dm.Locations {
sizeInBytes += size.SizeOfString + len(k)
for k1, v1 := range v {
sizeInBytes += size.SizeOfString + len(k1) +
size.SizeOfSlice
for _, entry := range v1 {
sizeInBytes += entry.Size()
}
}
}
for k, v := range dm.Fragments {
sizeInBytes += size.SizeOfString + len(k) +
size.SizeOfSlice
for _, entry := range v {
sizeInBytes += size.SizeOfString + len(entry)
}
}
for _, entry := range dm.Sort {
sizeInBytes += size.SizeOfString + len(entry)
}
for k, _ := range dm.Fields {
sizeInBytes += size.SizeOfString + len(k) +
size.SizeOfPtr
}
if dm.Document != nil {
sizeInBytes += dm.Document.Size()
}
return sizeInBytes
}
// Complete performs final preparation & transformation of the
// DocumentMatch at the end of search processing, also allowing the
// caller to provide an optional preallocated locations slice
func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
// transform the FieldTermLocations slice into the Locations map
nlocs := len(dm.FieldTermLocations)
if nlocs > 0 {
if cap(prealloc) < nlocs {
prealloc = make([]Location, nlocs)
}
prealloc = prealloc[:nlocs]
var lastField string
var tlm TermLocationMap
for i, ftl := range dm.FieldTermLocations {
if lastField != ftl.Field {
lastField = ftl.Field
if dm.Locations == nil {
dm.Locations = make(FieldTermLocationMap)
}
tlm = dm.Locations[ftl.Field]
if tlm == nil {
tlm = make(TermLocationMap)
dm.Locations[ftl.Field] = tlm
}
}
loc := &prealloc[i]
*loc = ftl.Location
if len(loc.ArrayPositions) > 0 { // copy
loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...)
}
tlm[ftl.Term] = append(tlm[ftl.Term], loc)
dm.FieldTermLocations[i] = FieldTermLocation{ // recycle
Location: Location{
ArrayPositions: ftl.Location.ArrayPositions[:0],
},
}
}
}
dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle
return prealloc
}
func (dm *DocumentMatch) String() string {
return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score)
}
@ -130,6 +272,7 @@ type Searcher interface {
SetQueryNorm(float64)
Count() uint64
Min() int
Size() int
DocumentMatchPoolSize() int
}
@ -143,3 +286,18 @@ type SearcherOptions struct {
type SearchContext struct {
DocumentMatchPool *DocumentMatchPool
}
func (sc *SearchContext) Size() int {
sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr +
reflectStaticSizeDocumentMatchPool + size.SizeOfPtr
if sc.DocumentMatchPool != nil {
for _, entry := range sc.DocumentMatchPool.avail {
if entry != nil {
sizeInBytes += entry.Size()
}
}
}
return sizeInBytes
}

View file

@ -16,12 +16,21 @@ package searcher
import (
"math"
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeBooleanSearcher int
func init() {
var bs BooleanSearcher
reflectStaticSizeBooleanSearcher = int(reflect.TypeOf(bs).Size())
}
type BooleanSearcher struct {
indexReader index.IndexReader
mustSearcher search.Searcher
@ -52,6 +61,32 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc
return &rv, nil
}
func (s *BooleanSearcher) Size() int {
sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr
if s.mustSearcher != nil {
sizeInBytes += s.mustSearcher.Size()
}
if s.shouldSearcher != nil {
sizeInBytes += s.shouldSearcher.Size()
}
if s.mustNotSearcher != nil {
sizeInBytes += s.mustNotSearcher.Size()
}
sizeInBytes += s.scorer.Size()
for _, entry := range s.matches {
if entry != nil {
sizeInBytes += entry.Size()
}
}
return sizeInBytes
}
func (s *BooleanSearcher) computeQueryNorm() {
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0

View file

@ -16,13 +16,22 @@ package searcher
import (
"math"
"reflect"
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeConjunctionSearcher int
func init() {
var cs ConjunctionSearcher
reflectStaticSizeConjunctionSearcher = int(reflect.TypeOf(cs).Size())
}
type ConjunctionSearcher struct {
indexReader index.IndexReader
searchers OrderedSearcherList
@ -51,31 +60,72 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S
scorer: scorer.NewConjunctionQueryScorer(options),
}
rv.computeQueryNorm()
// attempt push-down conjunction optimization when there's >1 searchers
if len(searchers) > 1 {
var octx index.OptimizableContext
for _, searcher := range searchers {
o, ok := searcher.(index.Optimizable)
if ok {
var err error
octx, err = o.Optimize("conjunction", octx)
if err != nil {
return nil, err
}
}
}
if octx != nil {
err := octx.Finish()
if err != nil {
return nil, err
}
}
}
return &rv, nil
}
func (s *ConjunctionSearcher) Size() int {
sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr +
s.scorer.Size()
for _, entry := range s.searchers {
sizeInBytes += entry.Size()
}
for _, entry := range s.currs {
if entry != nil {
sizeInBytes += entry.Size()
}
}
return sizeInBytes
}
func (s *ConjunctionSearcher) computeQueryNorm() {
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0
for _, termSearcher := range s.searchers {
sumOfSquaredWeights += termSearcher.Weight()
for _, searcher := range s.searchers {
sumOfSquaredWeights += searcher.Weight()
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
// finally tell all the downstream searchers the norm
for _, termSearcher := range s.searchers {
termSearcher.SetQueryNorm(s.queryNorm)
for _, searcher := range s.searchers {
searcher.SetQueryNorm(s.queryNorm)
}
}
func (s *ConjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
var err error
// get all searchers pointing at their first match
for i, termSearcher := range s.searchers {
for i, searcher := range s.searchers {
if s.currs[i] != nil {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = termSearcher.Next(ctx)
s.currs[i], err = searcher.Next(ctx)
if err != nil {
return err
}
@ -160,11 +210,11 @@ OUTER:
// we know all the searchers are pointing at the same thing
// so they all need to be bumped
for i, termSearcher := range s.searchers {
for i, searcher := range s.searchers {
if s.currs[i] != rv {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = termSearcher.Next(ctx)
s.currs[i], err = searcher.Next(ctx)
if err != nil {
return nil, err
}
@ -184,6 +234,9 @@ func (s *ConjunctionSearcher) Advance(ctx *search.SearchContext, ID index.IndexI
}
}
for i := range s.searchers {
if s.currs[i] != nil && s.currs[i].IndexInternalID.Compare(ID) >= 0 {
continue
}
err := s.advanceChild(ctx, i, ID)
if err != nil {
return nil, err

View file

@ -1,4 +1,4 @@
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -16,12 +16,9 @@ package searcher
import (
"fmt"
"math"
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
)
// DisjunctionMaxClauseCount is a compile time setting that applications can
@ -29,17 +26,26 @@ import (
// error instead of exeucting searches when the size exceeds this value.
var DisjunctionMaxClauseCount = 0
type DisjunctionSearcher struct {
indexReader index.IndexReader
searchers OrderedSearcherList
numSearchers int
queryNorm float64
currs []*search.DocumentMatch
scorer *scorer.DisjunctionQueryScorer
min int
matching []*search.DocumentMatch
matchingIdxs []int
initialized bool
// DisjunctionHeapTakeover is a compile time setting that applications can
// adjust to control when the DisjunctionSearcher will switch from a simple
// slice implementation to a heap implementation.
var DisjunctionHeapTakeover = 10
func NewDisjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions) (
search.Searcher, error) {
return newDisjunctionSearcher(indexReader, qsearchers, min, options, true)
}
func newDisjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
limit bool) (search.Searcher, error) {
if len(qsearchers) > DisjunctionHeapTakeover {
return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options,
limit)
}
return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options,
limit)
}
func tooManyClauses(count int) bool {
@ -53,219 +59,3 @@ func tooManyClausesErr() error {
return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]",
DisjunctionMaxClauseCount)
}
func NewDisjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions) (
*DisjunctionSearcher, error) {
return newDisjunctionSearcher(indexReader, qsearchers, min, options,
true)
}
func newDisjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
limit bool) (
*DisjunctionSearcher, error) {
if limit && tooManyClauses(len(qsearchers)) {
return nil, tooManyClausesErr()
}
// build the downstream searchers
searchers := make(OrderedSearcherList, len(qsearchers))
for i, searcher := range qsearchers {
searchers[i] = searcher
}
// sort the searchers
sort.Sort(sort.Reverse(searchers))
// build our searcher
rv := DisjunctionSearcher{
indexReader: indexReader,
searchers: searchers,
numSearchers: len(searchers),
currs: make([]*search.DocumentMatch, len(searchers)),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
matching: make([]*search.DocumentMatch, len(searchers)),
matchingIdxs: make([]int, len(searchers)),
}
rv.computeQueryNorm()
return &rv, nil
}
func (s *DisjunctionSearcher) computeQueryNorm() {
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0
for _, termSearcher := range s.searchers {
sumOfSquaredWeights += termSearcher.Weight()
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
// finally tell all the downstream searchers the norm
for _, termSearcher := range s.searchers {
termSearcher.SetQueryNorm(s.queryNorm)
}
}
func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
var err error
// get all searchers pointing at their first match
for i, termSearcher := range s.searchers {
if s.currs[i] != nil {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = termSearcher.Next(ctx)
if err != nil {
return err
}
}
err = s.updateMatches()
if err != nil {
return err
}
s.initialized = true
return nil
}
func (s *DisjunctionSearcher) updateMatches() error {
matching := s.matching[:0]
matchingIdxs := s.matchingIdxs[:0]
for i := 0; i < len(s.currs); i++ {
curr := s.currs[i]
if curr == nil {
continue
}
if len(matching) > 0 {
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
if cmp > 0 {
continue
}
if cmp < 0 {
matching = matching[:0]
matchingIdxs = matchingIdxs[:0]
}
}
matching = append(matching, curr)
matchingIdxs = append(matchingIdxs, i)
}
s.matching = matching
s.matchingIdxs = matchingIdxs
return nil
}
func (s *DisjunctionSearcher) Weight() float64 {
var rv float64
for _, searcher := range s.searchers {
rv += searcher.Weight()
}
return rv
}
func (s *DisjunctionSearcher) SetQueryNorm(qnorm float64) {
for _, searcher := range s.searchers {
searcher.SetQueryNorm(qnorm)
}
}
func (s *DisjunctionSearcher) Next(ctx *search.SearchContext) (
*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
var err error
var rv *search.DocumentMatch
found := false
for !found && len(s.matching) > 0 {
if len(s.matching) >= s.min {
found = true
// score this match
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
}
// invoke next on all the matching searchers
for _, i := range s.matchingIdxs {
searcher := s.searchers[i]
if s.currs[i] != rv {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Next(ctx)
if err != nil {
return nil, err
}
}
err = s.updateMatches()
if err != nil {
return nil, err
}
}
return rv, nil
}
func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext,
ID index.IndexInternalID) (*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
// get all searchers pointing at their first match
var err error
for i, termSearcher := range s.searchers {
if s.currs[i] != nil {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = termSearcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
err = s.updateMatches()
if err != nil {
return nil, err
}
return s.Next(ctx)
}
func (s *DisjunctionSearcher) Count() uint64 {
// for now return a worst case
var sum uint64
for _, searcher := range s.searchers {
sum += searcher.Count()
}
return sum
}
func (s *DisjunctionSearcher) Close() (rv error) {
for _, searcher := range s.searchers {
err := searcher.Close()
if err != nil && rv == nil {
rv = err
}
}
return rv
}
func (s *DisjunctionSearcher) Min() int {
return s.min
}
func (s *DisjunctionSearcher) DocumentMatchPoolSize() int {
rv := len(s.currs)
for _, s := range s.searchers {
rv += s.DocumentMatchPoolSize()
}
return rv
}

View file

@ -0,0 +1,343 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package searcher
import (
"bytes"
"container/heap"
"math"
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeDisjunctionHeapSearcher int
var reflectStaticSizeSearcherCurr int
func init() {
var dhs DisjunctionHeapSearcher
reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size())
var sc SearcherCurr
reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size())
}
type SearcherCurr struct {
searcher search.Searcher
curr *search.DocumentMatch
}
type DisjunctionHeapSearcher struct {
indexReader index.IndexReader
numSearchers int
scorer *scorer.DisjunctionQueryScorer
min int
queryNorm float64
initialized bool
searchers []search.Searcher
heap []*SearcherCurr
matching []*search.DocumentMatch
matchingCurrs []*SearcherCurr
}
func newDisjunctionHeapSearcher(indexReader index.IndexReader,
searchers []search.Searcher, min float64, options search.SearcherOptions,
limit bool) (
*DisjunctionHeapSearcher, error) {
if limit && tooManyClauses(len(searchers)) {
return nil, tooManyClausesErr()
}
// build our searcher
rv := DisjunctionHeapSearcher{
indexReader: indexReader,
searchers: searchers,
numSearchers: len(searchers),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
matching: make([]*search.DocumentMatch, len(searchers)),
matchingCurrs: make([]*SearcherCurr, len(searchers)),
heap: make([]*SearcherCurr, 0, len(searchers)),
}
rv.computeQueryNorm()
return &rv, nil
}
func (s *DisjunctionHeapSearcher) Size() int {
sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr +
s.scorer.Size()
for _, entry := range s.searchers {
sizeInBytes += entry.Size()
}
for _, entry := range s.matching {
if entry != nil {
sizeInBytes += entry.Size()
}
}
// for matchingCurrs and heap, just use static size * len
// since searchers and document matches already counted above
sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr
sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr
return sizeInBytes
}
func (s *DisjunctionHeapSearcher) computeQueryNorm() {
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0
for _, searcher := range s.searchers {
sumOfSquaredWeights += searcher.Weight()
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
// finally tell all the downstream searchers the norm
for _, searcher := range s.searchers {
searcher.SetQueryNorm(s.queryNorm)
}
}
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error {
// alloc a single block of SearcherCurrs
block := make([]SearcherCurr, len(s.searchers))
// get all searchers pointing at their first match
for i, searcher := range s.searchers {
curr, err := searcher.Next(ctx)
if err != nil {
return err
}
if curr != nil {
block[i].searcher = searcher
block[i].curr = curr
heap.Push(s, &block[i])
}
}
err := s.updateMatches()
if err != nil {
return err
}
s.initialized = true
return nil
}
func (s *DisjunctionHeapSearcher) updateMatches() error {
matching := s.matching[:0]
matchingCurrs := s.matchingCurrs[:0]
if len(s.heap) > 0 {
// top of the heap is our next hit
next := heap.Pop(s).(*SearcherCurr)
matching = append(matching, next.curr)
matchingCurrs = append(matchingCurrs, next)
// now as long as top of heap matches, keep popping
for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 {
next = heap.Pop(s).(*SearcherCurr)
matching = append(matching, next.curr)
matchingCurrs = append(matchingCurrs, next)
}
}
s.matching = matching
s.matchingCurrs = matchingCurrs
return nil
}
func (s *DisjunctionHeapSearcher) Weight() float64 {
var rv float64
for _, searcher := range s.searchers {
rv += searcher.Weight()
}
return rv
}
func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) {
for _, searcher := range s.searchers {
searcher.SetQueryNorm(qnorm)
}
}
func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) (
*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
var rv *search.DocumentMatch
found := false
for !found && len(s.matching) > 0 {
if len(s.matching) >= s.min {
found = true
// score this match
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
}
// invoke next on all the matching searchers
for _, matchingCurr := range s.matchingCurrs {
if matchingCurr.curr != rv {
ctx.DocumentMatchPool.Put(matchingCurr.curr)
}
curr, err := matchingCurr.searcher.Next(ctx)
if err != nil {
return nil, err
}
if curr != nil {
matchingCurr.curr = curr
heap.Push(s, matchingCurr)
}
}
err := s.updateMatches()
if err != nil {
return nil, err
}
}
return rv, nil
}
func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext,
ID index.IndexInternalID) (*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
// if there is anything in matching, toss it back onto the heap
for _, matchingCurr := range s.matchingCurrs {
heap.Push(s, matchingCurr)
}
s.matching = s.matching[:0]
s.matchingCurrs = s.matchingCurrs[:0]
// find all searchers that actually need to be advanced
// advance them, using s.matchingCurrs as temp storage
for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 {
searcherCurr := heap.Pop(s).(*SearcherCurr)
ctx.DocumentMatchPool.Put(searcherCurr.curr)
curr, err := searcherCurr.searcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
if curr != nil {
searcherCurr.curr = curr
s.matchingCurrs = append(s.matchingCurrs, searcherCurr)
}
}
// now all of the searchers that we advanced have to be pushed back
for _, matchingCurr := range s.matchingCurrs {
heap.Push(s, matchingCurr)
}
// reset our temp space
s.matchingCurrs = s.matchingCurrs[:0]
err := s.updateMatches()
if err != nil {
return nil, err
}
return s.Next(ctx)
}
func (s *DisjunctionHeapSearcher) Count() uint64 {
// for now return a worst case
var sum uint64
for _, searcher := range s.searchers {
sum += searcher.Count()
}
return sum
}
func (s *DisjunctionHeapSearcher) Close() (rv error) {
for _, searcher := range s.searchers {
err := searcher.Close()
if err != nil && rv == nil {
rv = err
}
}
return rv
}
func (s *DisjunctionHeapSearcher) Min() int {
return s.min
}
func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int {
rv := len(s.searchers)
for _, s := range s.searchers {
rv += s.DocumentMatchPoolSize()
}
return rv
}
// a disjunction searcher implements the index.Optimizable interface
// but only activates on an edge case where the disjunction is a
// wrapper around a single Optimizable child searcher
func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) (
index.OptimizableContext, error) {
if len(s.searchers) == 1 {
o, ok := s.searchers[0].(index.Optimizable)
if ok {
return o.Optimize(kind, octx)
}
}
return octx, nil
}
// heap impl
func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) }
func (s *DisjunctionHeapSearcher) Less(i, j int) bool {
if s.heap[i].curr == nil {
return true
} else if s.heap[j].curr == nil {
return false
}
return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0
}
func (s *DisjunctionHeapSearcher) Swap(i, j int) {
s.heap[i], s.heap[j] = s.heap[j], s.heap[i]
}
func (s *DisjunctionHeapSearcher) Push(x interface{}) {
s.heap = append(s.heap, x.(*SearcherCurr))
}
func (s *DisjunctionHeapSearcher) Pop() interface{} {
old := s.heap
n := len(old)
x := old[n-1]
s.heap = old[0 : n-1]
return x
}

View file

@ -0,0 +1,298 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package searcher
import (
"math"
"reflect"
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeDisjunctionSliceSearcher int
func init() {
var ds DisjunctionSliceSearcher
reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size())
}
type DisjunctionSliceSearcher struct {
indexReader index.IndexReader
searchers OrderedSearcherList
numSearchers int
queryNorm float64
currs []*search.DocumentMatch
scorer *scorer.DisjunctionQueryScorer
min int
matching []*search.DocumentMatch
matchingIdxs []int
initialized bool
}
func newDisjunctionSliceSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
limit bool) (
*DisjunctionSliceSearcher, error) {
if limit && tooManyClauses(len(qsearchers)) {
return nil, tooManyClausesErr()
}
// build the downstream searchers
searchers := make(OrderedSearcherList, len(qsearchers))
for i, searcher := range qsearchers {
searchers[i] = searcher
}
// sort the searchers
sort.Sort(sort.Reverse(searchers))
// build our searcher
rv := DisjunctionSliceSearcher{
indexReader: indexReader,
searchers: searchers,
numSearchers: len(searchers),
currs: make([]*search.DocumentMatch, len(searchers)),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
matching: make([]*search.DocumentMatch, len(searchers)),
matchingIdxs: make([]int, len(searchers)),
}
rv.computeQueryNorm()
return &rv, nil
}
func (s *DisjunctionSliceSearcher) Size() int {
sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr +
s.scorer.Size()
for _, entry := range s.searchers {
sizeInBytes += entry.Size()
}
for _, entry := range s.currs {
if entry != nil {
sizeInBytes += entry.Size()
}
}
for _, entry := range s.matching {
if entry != nil {
sizeInBytes += entry.Size()
}
}
sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt
return sizeInBytes
}
func (s *DisjunctionSliceSearcher) computeQueryNorm() {
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0
for _, searcher := range s.searchers {
sumOfSquaredWeights += searcher.Weight()
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
// finally tell all the downstream searchers the norm
for _, searcher := range s.searchers {
searcher.SetQueryNorm(s.queryNorm)
}
}
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error {
var err error
// get all searchers pointing at their first match
for i, searcher := range s.searchers {
if s.currs[i] != nil {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Next(ctx)
if err != nil {
return err
}
}
err = s.updateMatches()
if err != nil {
return err
}
s.initialized = true
return nil
}
func (s *DisjunctionSliceSearcher) updateMatches() error {
matching := s.matching[:0]
matchingIdxs := s.matchingIdxs[:0]
for i := 0; i < len(s.currs); i++ {
curr := s.currs[i]
if curr == nil {
continue
}
if len(matching) > 0 {
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
if cmp > 0 {
continue
}
if cmp < 0 {
matching = matching[:0]
matchingIdxs = matchingIdxs[:0]
}
}
matching = append(matching, curr)
matchingIdxs = append(matchingIdxs, i)
}
s.matching = matching
s.matchingIdxs = matchingIdxs
return nil
}
func (s *DisjunctionSliceSearcher) Weight() float64 {
var rv float64
for _, searcher := range s.searchers {
rv += searcher.Weight()
}
return rv
}
func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) {
for _, searcher := range s.searchers {
searcher.SetQueryNorm(qnorm)
}
}
func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) (
*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
var err error
var rv *search.DocumentMatch
found := false
for !found && len(s.matching) > 0 {
if len(s.matching) >= s.min {
found = true
// score this match
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
}
// invoke next on all the matching searchers
for _, i := range s.matchingIdxs {
searcher := s.searchers[i]
if s.currs[i] != rv {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Next(ctx)
if err != nil {
return nil, err
}
}
err = s.updateMatches()
if err != nil {
return nil, err
}
}
return rv, nil
}
func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext,
ID index.IndexInternalID) (*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
// get all searchers pointing at their first match
var err error
for i, searcher := range s.searchers {
if s.currs[i] != nil {
if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
continue
}
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
err = s.updateMatches()
if err != nil {
return nil, err
}
return s.Next(ctx)
}
func (s *DisjunctionSliceSearcher) Count() uint64 {
// for now return a worst case
var sum uint64
for _, searcher := range s.searchers {
sum += searcher.Count()
}
return sum
}
func (s *DisjunctionSliceSearcher) Close() (rv error) {
for _, searcher := range s.searchers {
err := searcher.Close()
if err != nil && rv == nil {
rv = err
}
}
return rv
}
func (s *DisjunctionSliceSearcher) Min() int {
return s.min
}
func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int {
rv := len(s.currs)
for _, s := range s.searchers {
rv += s.DocumentMatchPoolSize()
}
return rv
}
// a disjunction searcher implements the index.Optimizable interface
// but only activates on an edge case where the disjunction is a
// wrapper around a single Optimizable child searcher
func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) (
index.OptimizableContext, error) {
if len(s.searchers) == 1 {
o, ok := s.searchers[0].(index.Optimizable)
if ok {
return o.Optimize(kind, octx)
}
}
return octx, nil
}

View file

@ -15,11 +15,21 @@
package searcher
import (
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeDocIDSearcher int
func init() {
var ds DocIDSearcher
reflectStaticSizeDocIDSearcher = int(reflect.TypeOf(ds).Size())
}
// DocIDSearcher returns documents matching a predefined set of identifiers.
type DocIDSearcher struct {
reader index.DocIDReader
@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64
}, nil
}
func (s *DocIDSearcher) Size() int {
return reflectStaticSizeDocIDSearcher + size.SizeOfPtr +
s.reader.Size() +
s.scorer.Size()
}
func (s *DocIDSearcher) Count() uint64 {
return uint64(s.count)
}

View file

@ -15,10 +15,20 @@
package searcher
import (
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeFilteringSearcher int
func init() {
var fs FilteringSearcher
reflectStaticSizeFilteringSearcher = int(reflect.TypeOf(fs).Size())
}
// FilterFunc defines a function which can filter documents
// returning true means keep the document
// returning false means do not keep the document
@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch
}
}
func (f *FilteringSearcher) Size() int {
return reflectStaticSizeFilteringSearcher + size.SizeOfPtr +
f.child.Size()
}
func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
next, err := f.child.Next(ctx)
for next != nil && err == nil {

View file

@ -15,13 +15,22 @@
package searcher
import (
"fmt"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
)
var MaxFuzziness = 2
func NewFuzzySearcher(indexReader index.IndexReader, term string,
prefix, fuzziness int, field string, boost float64,
options search.SearcherOptions) (search.Searcher, error) {
if fuzziness > MaxFuzziness {
return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
}
// Note: we don't byte slice the term for a prefix because of runes.
prefixTerm := ""
for i, r := range term {
@ -31,7 +40,6 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
break
}
}
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
field, prefixTerm)
if err != nil {
@ -49,6 +57,26 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
if len(prefixTerm) > 0 {
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
} else {
// in case of advanced reader implementations directly call
// the levenshtein automaton based iterator to collect the
// candidate terms
if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness)
if err != nil {
return rv, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv = append(rv, tfd.Term)
tfd, err = fieldDict.Next()
}
return rv, err
}
fieldDict, err = indexReader.FieldDict(field)
}
defer func() {
@ -58,9 +86,12 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
}()
// enumerate terms and check levenshtein distance
var reuse []int
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
ld, exceeded := search.LevenshteinDistanceMax(term, tfd.Term, fuzziness)
var ld int
var exceeded bool
ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
if !exceeded && ld <= fuzziness {
rv = append(rv, tfd.Term)
if tooManyClauses(len(rv)) {

View file

@ -24,10 +24,12 @@ import (
func NewGeoPointDistanceSearcher(indexReader index.IndexReader, centerLon,
centerLat, dist float64, field string, boost float64,
options search.SearcherOptions) (search.Searcher, error) {
// compute bounding box containing the circle
topLeftLon, topLeftLat, bottomRightLon, bottomRightLat :=
geo.ComputeBoundingBox(centerLon, centerLat, dist)
topLeftLon, topLeftLat, bottomRightLon, bottomRightLat, err :=
geo.RectFromPointDistance(centerLon, centerLat, dist)
if err != nil {
return nil, err
}
// build a searcher for the box
boxSearcher, err := boxSearcher(indexReader,

View file

@ -15,11 +15,21 @@
package searcher
import (
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeMatchAllSearcher int
func init() {
var mas MatchAllSearcher
reflectStaticSizeMatchAllSearcher = int(reflect.TypeOf(mas).Size())
}
type MatchAllSearcher struct {
indexReader index.IndexReader
reader index.DocIDReader
@ -46,6 +56,12 @@ func NewMatchAllSearcher(indexReader index.IndexReader, boost float64, options s
}, nil
}
func (s *MatchAllSearcher) Size() int {
return reflectStaticSizeMatchAllSearcher + size.SizeOfPtr +
s.reader.Size() +
s.scorer.Size()
}
func (s *MatchAllSearcher) Count() uint64 {
return s.count
}

View file

@ -15,10 +15,20 @@
package searcher
import (
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeMatchNoneSearcher int
func init() {
var mns MatchNoneSearcher
reflectStaticSizeMatchNoneSearcher = int(reflect.TypeOf(mns).Size())
}
type MatchNoneSearcher struct {
indexReader index.IndexReader
}
@ -29,6 +39,10 @@ func NewMatchNoneSearcher(indexReader index.IndexReader) (*MatchNoneSearcher, er
}, nil
}
func (s *MatchNoneSearcher) Size() int {
return reflectStaticSizeMatchNoneSearcher + size.SizeOfPtr
}
func (s *MatchNoneSearcher) Count() uint64 {
return uint64(0)
}

View file

@ -17,6 +17,7 @@ package searcher
import (
"bytes"
"math"
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/numeric"
@ -55,6 +56,17 @@ func NewNumericRangeSearcher(indexReader index.IndexReader,
// FIXME hard-coded precision, should match field declaration
termRanges := splitInt64Range(minInt64, maxInt64, 4)
terms := termRanges.Enumerate()
if len(terms) < 1 {
// cannot return MatchNoneSearcher because of interaction with
// commit f391b991c20f02681bacd197afc6d8aed444e132
return NewMultiTermSearcherBytes(indexReader, terms, field, boost, options,
true)
}
var err error
terms, err = filterCandidateTerms(indexReader, terms, field)
if err != nil {
return nil, err
}
if tooManyClauses(len(terms)) {
return nil, tooManyClausesErr()
}
@ -63,6 +75,51 @@ func NewNumericRangeSearcher(indexReader index.IndexReader,
true)
}
func filterCandidateTerms(indexReader index.IndexReader,
terms [][]byte, field string) (rv [][]byte, err error) {
if ir, ok := indexReader.(index.IndexReaderOnly); ok {
fieldDict, err := ir.FieldDictOnly(field, terms, false)
if err != nil {
return nil, err
}
// enumerate the terms (no need to check them again)
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv = append(rv, []byte(tfd.Term))
tfd, err = fieldDict.Next()
}
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
return rv, err
}
fieldDict, err := indexReader.FieldDictRange(field, terms[0], terms[len(terms)-1])
if err != nil {
return nil, err
}
// enumerate the terms and check against list of terms
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
termBytes := []byte(tfd.Term)
i := sort.Search(len(terms), func(i int) bool { return bytes.Compare(terms[i], termBytes) >= 0 })
if i < len(terms) && bytes.Compare(terms[i], termBytes) == 0 {
rv = append(rv, terms[i])
}
terms = terms[i:]
tfd, err = fieldDict.Next()
}
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
return rv, err
}
type termRange struct {
startTerm []byte
endTerm []byte

View file

@ -17,21 +17,52 @@ package searcher
import (
"fmt"
"math"
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizePhraseSearcher int
func init() {
var ps PhraseSearcher
reflectStaticSizePhraseSearcher = int(reflect.TypeOf(ps).Size())
}
type PhraseSearcher struct {
indexReader index.IndexReader
mustSearcher *ConjunctionSearcher
queryNorm float64
currMust *search.DocumentMatch
slop int
terms [][]string
path phrasePath
paths []phrasePath
locations []search.Location
initialized bool
}
func (s *PhraseSearcher) Size() int {
sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr
if s.mustSearcher != nil {
sizeInBytes += s.mustSearcher.Size()
}
if s.currMust != nil {
sizeInBytes += s.currMust.Size()
}
for _, entry := range s.terms {
sizeInBytes += size.SizeOfSlice
for _, entry1 := range entry {
sizeInBytes += size.SizeOfString + len(entry1)
}
}
return sizeInBytes
}
func NewPhraseSearcher(indexReader index.IndexReader, terms []string, field string, options search.SearcherOptions) (*PhraseSearcher, error) {
// turn flat terms []string into [][]string
mterms := make([][]string, len(terms))
@ -96,7 +127,6 @@ func NewMultiPhraseSearcher(indexReader index.IndexReader, terms [][]string, fie
// build our searcher
rv := PhraseSearcher{
indexReader: indexReader,
mustSearcher: mustSearcher,
terms: terms,
}
@ -133,6 +163,9 @@ func (s *PhraseSearcher) advanceNextMust(ctx *search.SearchContext) error {
var err error
if s.mustSearcher != nil {
if s.currMust != nil {
ctx.DocumentMatchPool.Put(s.currMust)
}
s.currMust, err = s.mustSearcher.Next(ctx)
if err != nil {
return err
@ -182,43 +215,59 @@ func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch,
// also satisfies the phase constraints. if so, it returns a DocumentMatch
// for this document, otherwise nil
func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.DocumentMatch {
rvftlm := make(search.FieldTermLocationMap, 0)
freq := 0
s.locations = s.currMust.Complete(s.locations)
locations := s.currMust.Locations
s.currMust.Locations = nil
ftls := s.currMust.FieldTermLocations
// typically we would expect there to only actually be results in
// one field, but we allow for this to not be the case
// but, we note that phrase constraints can only be satisfied within
// a single field, so we can check them each independently
for field, tlm := range s.currMust.Locations {
f, rvtlm := s.checkCurrMustMatchField(ctx, tlm)
if f > 0 {
freq += f
rvftlm[field] = rvtlm
}
for field, tlm := range locations {
ftls = s.checkCurrMustMatchField(ctx, field, tlm, ftls)
}
if freq > 0 {
if len(ftls) > 0 {
// return match
rv := s.currMust
rv.Locations = rvftlm
s.currMust = nil
rv.FieldTermLocations = ftls
return rv
}
return nil
}
// checkCurrMustMatchField is soley concerned with determining if one particular
// field within the currMust DocumentMatch Locations satisfies the phase
// constraints (possibly more than once). if so, the number of times it was
// satisfied, and these locations are returned. otherwise 0 and either
// a nil or empty TermLocationMap
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) (int, search.TermLocationMap) {
paths := findPhrasePaths(0, nil, s.terms, tlm, nil, 0)
rv := make(search.TermLocationMap, len(s.terms))
for _, p := range paths {
p.MergeInto(rv)
// checkCurrMustMatchField is soley concerned with determining if one
// particular field within the currMust DocumentMatch Locations
// satisfies the phase constraints (possibly more than once). if so,
// the matching field term locations are appended to the provided
// slice
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
field string, tlm search.TermLocationMap,
ftls []search.FieldTermLocation) []search.FieldTermLocation {
if s.path == nil {
s.path = make(phrasePath, 0, len(s.terms))
}
return len(paths), rv
s.paths = findPhrasePaths(0, nil, s.terms, tlm, s.path[:0], 0, s.paths[:0])
for _, p := range s.paths {
for _, pp := range p {
ftls = append(ftls, search.FieldTermLocation{
Field: field,
Term: pp.term,
Location: search.Location{
Pos: pp.loc.Pos,
Start: pp.loc.Start,
End: pp.loc.End,
ArrayPositions: pp.loc.ArrayPositions,
},
})
}
}
return ftls
}
type phrasePart struct {
@ -226,7 +275,11 @@ type phrasePart struct {
loc *search.Location
}
type phrasePath []*phrasePart
func (p *phrasePart) String() string {
return fmt.Sprintf("[%s %v]", p.term, p.loc)
}
type phrasePath []phrasePart
func (p phrasePath) MergeInto(in search.TermLocationMap) {
for _, pp := range p {
@ -234,24 +287,51 @@ func (p phrasePath) MergeInto(in search.TermLocationMap) {
}
}
// findPhrasePaths is a function to identify phase matches from a set of known
// term locations. the implementation is recursive, so care must be taken
// with arguments and return values.
func (p phrasePath) String() string {
rv := "["
for i, pp := range p {
if i > 0 {
rv += ", "
}
rv += pp.String()
}
rv += "]"
return rv
}
// findPhrasePaths is a function to identify phase matches from a set
// of known term locations. it recursive so care must be taken with
// arguments and return values.
//
// prev - the previous location, nil on first invocation
// phraseTerms - slice containing the phrase terms themselves
// prevPos - the previous location, 0 on first invocation
// ap - array positions of the first candidate phrase part to
// which further recursive phrase parts must match,
// nil on initial invocation or when there are no array positions
// phraseTerms - slice containing the phrase terms,
// may contain empty string as placeholder (don't care)
// tlm - the Term Location Map containing all relevant term locations
// offset - the offset from the previous that this next term must match
// p - the current path being explored (appended to in recursive calls)
// this is the primary state being built during the traversal
// remainingSlop - amount of sloppiness that's allowed, which is the
// sum of the editDistances from each matching phrase part,
// where 0 means no sloppiness allowed (all editDistances must be 0),
// decremented during recursion
// rv - the final result being appended to by all the recursive calls
//
// returns slice of paths, or nil if invocation did not find any successul paths
func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, tlm search.TermLocationMap, p phrasePath, remainingSlop int) []phrasePath {
func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string,
tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath {
// no more terms
if len(phraseTerms) < 1 {
return []phrasePath{p}
// snapshot or copy the recursively built phrasePath p and
// append it to the rv, also optimizing by checking if next
// phrasePath item in the rv (which we're about to overwrite)
// is available for reuse
var pcopy phrasePath
if len(rv) < cap(rv) {
pcopy = rv[:len(rv)+1][len(rv)][:0]
}
return append(rv, append(pcopy, p...))
}
car := phraseTerms[0]
@ -264,13 +344,13 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s
// if prevPos was 0, don't set it to 1 (as thats not a real abs pos)
nextPos = 0 // don't advance nextPos if prevPos was 0
}
return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop)
return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop, rv)
}
var rv []phrasePath
// locations for this term
for _, carTerm := range car {
locations := tlm[carTerm]
LOCATIONS_LOOP:
for _, loc := range locations {
if prevPos != 0 && !loc.ArrayPositions.Equals(ap) {
// if the array positions are wrong, can't match, try next location
@ -283,11 +363,18 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s
dist = editDistance(prevPos+1, loc.Pos)
}
// if enough slop reamining, continue recursively
// if enough slop remaining, continue recursively
if prevPos == 0 || (remainingSlop-dist) >= 0 {
// skip if we've already used this term+loc already
for _, ppart := range p {
if ppart.term == carTerm && ppart.loc == loc {
continue LOCATIONS_LOOP
}
}
// this location works, add it to the path (but not for empty term)
px := append(p, &phrasePart{term: carTerm, loc: loc})
rv = append(rv, findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist)...)
px := append(p, phrasePart{term: carTerm, loc: loc})
rv = findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist, rv)
}
}
}
@ -309,6 +396,15 @@ func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexIntern
return nil, err
}
}
if s.currMust != nil {
if s.currMust.IndexInternalID.Compare(ID) >= 0 {
return s.Next(ctx)
}
ctx.DocumentMatchPool.Put(s.currMust)
}
if s.currMust == nil {
return nil, nil
}
var err error
s.currMust, err = s.mustSearcher.Advance(ctx, ID)
if err != nil {

View file

@ -29,19 +29,40 @@ import (
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
field string, boost float64, options search.SearcherOptions) (
search.Searcher, error) {
prefixTerm, complete := pattern.LiteralPrefix()
var candidateTerms []string
if complete {
// there is no pattern
candidateTerms = []string{prefixTerm}
} else {
var err error
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
prefixTerm)
if ir, ok := indexReader.(index.IndexReaderRegexp); ok {
fieldDict, err := ir.FieldDictRegexp(field, []byte(pattern.String()))
if err != nil {
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
// enumerate the terms and check against regexp
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
candidateTerms = append(candidateTerms, tfd.Term)
tfd, err = fieldDict.Next()
}
if err != nil {
return nil, err
}
} else {
prefixTerm, complete := pattern.LiteralPrefix()
if complete {
// there is no pattern
candidateTerms = []string{prefixTerm}
} else {
var err error
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
prefixTerm)
if err != nil {
return nil, err
}
}
}
return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,

View file

@ -15,11 +15,21 @@
package searcher
import (
"reflect"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
)
var reflectStaticSizeTermSearcher int
func init() {
var ts TermSearcher
reflectStaticSizeTermSearcher = int(reflect.TypeOf(ts).Size())
}
type TermSearcher struct {
indexReader index.IndexReader
reader index.TermFieldReader
@ -63,6 +73,13 @@ func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field stri
}, nil
}
func (s *TermSearcher) Size() int {
return reflectStaticSizeTermSearcher + size.SizeOfPtr +
s.reader.Size() +
s.tfd.Size() +
s.scorer.Size()
}
func (s *TermSearcher) Count() uint64 {
return s.reader.Count()
}
@ -120,3 +137,13 @@ func (s *TermSearcher) Min() int {
func (s *TermSearcher) DocumentMatchPoolSize() int {
return 1
}
func (s *TermSearcher) Optimize(kind string, octx index.OptimizableContext) (
index.OptimizableContext, error) {
o, ok := s.reader.(index.Optimizable)
if ok {
return o.Optimize(kind, octx)
}
return octx, nil
}

View file

@ -64,6 +64,10 @@ func NewTermRangeSearcher(indexReader index.IndexReader,
if !*inclusiveMin && min != nil && string(min) == terms[0] {
terms = terms[1:]
// check again, as we might have removed only entry
if len(terms) < 1 {
return NewMatchNoneSearcher(indexReader)
}
}
// if our term list included the max, it would be the last item

View file

@ -67,8 +67,8 @@ func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) {
rvd := &SortGeoDistance{
Field: field,
Desc: descending,
lon: lon,
lat: lat,
Lon: lon,
Lat: lat,
unitMult: 1.0,
}
if distUnit, ok := input["unit"].(string); ok {
@ -251,23 +251,21 @@ func (so SortOrder) Compare(cachedScoring, cachedDesc []bool, i, j *DocumentMatc
}
func (so SortOrder) RequiresScore() bool {
rv := false
for _, soi := range so {
if soi.RequiresScoring() {
rv = true
return true
}
}
return rv
return false
}
func (so SortOrder) RequiresDocID() bool {
rv := false
for _, soi := range so {
if soi.RequiresDocID() {
rv = true
return true
}
}
return rv
return false
}
func (so SortOrder) RequiredFields() []string {
@ -279,7 +277,7 @@ func (so SortOrder) RequiredFields() []string {
}
func (so SortOrder) CacheIsScore() []bool {
var rv []bool
rv := make([]bool, 0, len(so))
for _, soi := range so {
rv = append(rv, soi.RequiresScoring())
}
@ -287,7 +285,7 @@ func (so SortOrder) CacheIsScore() []bool {
}
func (so SortOrder) CacheDescending() []bool {
var rv []bool
rv := make([]bool, 0, len(so))
for _, soi := range so {
rv = append(rv, soi.Descending())
}
@ -486,8 +484,7 @@ func (s *SortField) MarshalJSON() ([]byte, error) {
}
func (s *SortField) Copy() SearchSort {
var rv SortField
rv = *s
rv := *s
return &rv
}
@ -499,7 +496,6 @@ type SortDocID struct {
// UpdateVisitor is a no-op for SortDocID as it's value
// is not dependent on any field terms
func (s *SortDocID) UpdateVisitor(field string, term []byte) {
}
// Value returns the sort value of the DocumentMatch
@ -529,8 +525,7 @@ func (s *SortDocID) MarshalJSON() ([]byte, error) {
}
func (s *SortDocID) Copy() SearchSort {
var rv SortDocID
rv = *s
rv := *s
return &rv
}
@ -542,7 +537,6 @@ type SortScore struct {
// UpdateVisitor is a no-op for SortScore as it's value
// is not dependent on any field terms
func (s *SortScore) UpdateVisitor(field string, term []byte) {
}
// Value returns the sort value of the DocumentMatch
@ -572,8 +566,7 @@ func (s *SortScore) MarshalJSON() ([]byte, error) {
}
func (s *SortScore) Copy() SearchSort {
var rv SortScore
rv = *s
rv := *s
return &rv
}
@ -583,13 +576,12 @@ var maxDistance = string(numeric.MustNewPrefixCodedInt64(math.MaxInt64, 0))
// their distance from the specified point.
func NewSortGeoDistance(field, unit string, lon, lat float64, desc bool) (
*SortGeoDistance, error) {
rv := &SortGeoDistance{
Field: field,
Desc: desc,
Unit: unit,
lon: lon,
lat: lat,
Lon: lon,
Lat: lat,
}
var err error
rv.unitMult, err = geo.ParseDistanceUnit(unit)
@ -608,8 +600,8 @@ type SortGeoDistance struct {
Desc bool
Unit string
values []string
lon float64
lat float64
Lon float64
Lat float64
unitMult float64
}
@ -640,7 +632,7 @@ func (s *SortGeoDistance) Value(i *DocumentMatch) string {
docLon := geo.MortonUnhashLon(uint64(i64))
docLat := geo.MortonUnhashLat(uint64(i64))
dist := geo.Haversin(s.lon, s.lat, docLon, docLat)
dist := geo.Haversin(s.Lon, s.Lat, docLon, docLat)
// dist is returned in km, so convert to m
dist *= 1000
if s.unitMult != 0 {
@ -690,8 +682,8 @@ func (s *SortGeoDistance) MarshalJSON() ([]byte, error) {
"by": "geo_distance",
"field": s.Field,
"location": map[string]interface{}{
"lon": s.lon,
"lat": s.lat,
"lon": s.Lon,
"lat": s.Lat,
},
}
if s.Unit != "" {
@ -705,7 +697,6 @@ func (s *SortGeoDistance) MarshalJSON() ([]byte, error) {
}
func (s *SortGeoDistance) Copy() SearchSort {
var rv SortGeoDistance
rv = *s
rv := *s
return &rv
}

View file

@ -40,3 +40,30 @@ func MergeTermLocationMaps(rv, other TermLocationMap) TermLocationMap {
}
return rv
}
func MergeFieldTermLocations(dest []FieldTermLocation, matches []*DocumentMatch) []FieldTermLocation {
n := len(dest)
for _, dm := range matches {
n += len(dm.FieldTermLocations)
}
if cap(dest) < n {
dest = append(make([]FieldTermLocation, 0, n), dest...)
}
for _, dm := range matches {
for _, ftl := range dm.FieldTermLocations {
dest = append(dest, FieldTermLocation{
Field: ftl.Field,
Term: ftl.Term,
Location: Location{
Pos: ftl.Location.Pos,
Start: ftl.Location.Start,
End: ftl.Location.End,
ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...),
},
})
}
}
return dest
}

59
vendor/github.com/blevesearch/bleve/size/sizes.go generated vendored Normal file
View file

@ -0,0 +1,59 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package size
import (
"reflect"
)
func init() {
var b bool
SizeOfBool = int(reflect.TypeOf(b).Size())
var f32 float32
SizeOfFloat32 = int(reflect.TypeOf(f32).Size())
var f64 float64
SizeOfFloat64 = int(reflect.TypeOf(f64).Size())
var i int
SizeOfInt = int(reflect.TypeOf(i).Size())
var m map[int]int
SizeOfMap = int(reflect.TypeOf(m).Size())
var ptr *int
SizeOfPtr = int(reflect.TypeOf(ptr).Size())
var slice []int
SizeOfSlice = int(reflect.TypeOf(slice).Size())
var str string
SizeOfString = int(reflect.TypeOf(str).Size())
var u8 uint8
SizeOfUint8 = int(reflect.TypeOf(u8).Size())
var u16 uint16
SizeOfUint16 = int(reflect.TypeOf(u16).Size())
var u32 uint32
SizeOfUint32 = int(reflect.TypeOf(u32).Size())
var u64 uint64
SizeOfUint64 = int(reflect.TypeOf(u64).Size())
}
var SizeOfBool int
var SizeOfFloat32 int
var SizeOfFloat64 int
var SizeOfInt int
var SizeOfMap int
var SizeOfPtr int
var SizeOfSlice int
var SizeOfString int
var SizeOfUint8 int
var SizeOfUint16 int
var SizeOfUint32 int
var SizeOfUint64 int