Switch to bbolt

This commit is contained in:
Ken-Håvard Lieng 2020-04-23 01:06:36 +02:00
parent 360bed00f9
commit 77543e3aed
617 changed files with 68468 additions and 97867 deletions

View file

@ -3,10 +3,9 @@ sudo: false
language: go
go:
- 1.7.x
- 1.8.x
- 1.9.x
- "1.10"
- "1.12.x"
- "1.13.x"
- "1.14.x"
script:
- go get golang.org/x/tools/cmd/cover
@ -14,9 +13,10 @@ script:
- go get github.com/kisielk/errcheck
- go get -u github.com/FiloSottile/gvt
- gvt restore
- go test -v $(go list ./... | grep -v vendor/)
- go test -race -v $(go list ./... | grep -v vendor/)
- go vet $(go list ./... | grep -v vendor/)
- errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/)
- go test ./test -v -indexType scorch
- errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/);
- docs/project-code-coverage.sh
- docs/build_children.sh

View file

@ -0,0 +1,49 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/english"
)
const SnowballStemmerName = "stemmer_en_snowball"
type EnglishStemmerFilter struct {
}
func NewEnglishStemmerFilter() *EnglishStemmerFilter {
return &EnglishStemmerFilter{}
}
func (s *EnglishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
english.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func EnglishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewEnglishStemmerFilter(), nil
}
func init() {
registry.RegisterTokenFilter(SnowballStemmerName, EnglishStemmerFilterConstructor)
}

View file

@ -86,6 +86,10 @@ func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
return fieldLength, tokenFreqs
}
func (t *TextField) Analyzer() *analysis.Analyzer {
return t.analyzer
}
func (t *TextField) Value() []byte {
return t.value
}

View file

@ -33,10 +33,18 @@ var minLonRad = minLon * degreesToRadian
var minLatRad = minLat * degreesToRadian
var maxLonRad = maxLon * degreesToRadian
var maxLatRad = maxLat * degreesToRadian
var geoTolerance = 1E-6
var geoTolerance = 1e-6
var lonScale = float64((uint64(0x1)<<GeoBits)-1) / 360.0
var latScale = float64((uint64(0x1)<<GeoBits)-1) / 180.0
var geoHashMaxLength = 12
// Point represents a geo point.
type Point struct {
Lon float64 `json:"lon"`
Lat float64 `json:"lat"`
}
// MortonHash computes the morton hash value for the provided geo point
// This point is ordered as lon, lat.
func MortonHash(lon, lat float64) uint64 {
@ -168,3 +176,35 @@ func checkLongitude(longitude float64) error {
}
return nil
}
func BoundingRectangleForPolygon(polygon []Point) (
float64, float64, float64, float64, error) {
err := checkLongitude(polygon[0].Lon)
if err != nil {
return 0, 0, 0, 0, err
}
err = checkLatitude(polygon[0].Lat)
if err != nil {
return 0, 0, 0, 0, err
}
maxY, minY := polygon[0].Lat, polygon[0].Lat
maxX, minX := polygon[0].Lon, polygon[0].Lon
for i := 1; i < len(polygon); i++ {
err := checkLongitude(polygon[i].Lon)
if err != nil {
return 0, 0, 0, 0, err
}
err = checkLatitude(polygon[i].Lat)
if err != nil {
return 0, 0, 0, 0, err
}
maxY = math.Max(maxY, polygon[i].Lat)
minY = math.Min(minY, polygon[i].Lat)
maxX = math.Max(maxX, polygon[i].Lon)
minX = math.Min(minX, polygon[i].Lon)
}
return minX, maxY, maxX, minY, nil
}

111
vendor/github.com/blevesearch/bleve/geo/geohash.go generated vendored Normal file
View file

@ -0,0 +1,111 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This implementation is inspired from the geohash-js
// ref: https://github.com/davetroy/geohash-js
package geo
// encoding encapsulates an encoding defined by a given base32 alphabet.
type encoding struct {
enc string
dec [256]byte
}
// newEncoding constructs a new encoding defined by the given alphabet,
// which must be a 32-byte string.
func newEncoding(encoder string) *encoding {
e := new(encoding)
e.enc = encoder
for i := 0; i < len(e.dec); i++ {
e.dec[i] = 0xff
}
for i := 0; i < len(encoder); i++ {
e.dec[encoder[i]] = byte(i)
}
return e
}
// base32encoding with the Geohash alphabet.
var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz")
var masks = []uint64{16, 8, 4, 2, 1}
// DecodeGeoHash decodes the string geohash faster with
// higher precision. This api is in experimental phase.
func DecodeGeoHash(geoHash string) (float64, float64) {
even := true
lat := []float64{-90.0, 90.0}
lon := []float64{-180.0, 180.0}
for i := 0; i < len(geoHash); i++ {
cd := uint64(base32encoding.dec[geoHash[i]])
for j := 0; j < 5; j++ {
if even {
if cd&masks[j] > 0 {
lon[0] = (lon[0] + lon[1]) / 2
} else {
lon[1] = (lon[0] + lon[1]) / 2
}
} else {
if cd&masks[j] > 0 {
lat[0] = (lat[0] + lat[1]) / 2
} else {
lat[1] = (lat[0] + lat[1]) / 2
}
}
even = !even
}
}
return (lat[0] + lat[1]) / 2, (lon[0] + lon[1]) / 2
}
func EncodeGeoHash(lat, lon float64) string {
even := true
lats := []float64{-90.0, 90.0}
lons := []float64{-180.0, 180.0}
precision := 12
var ch, bit uint64
var geoHash string
for len(geoHash) < precision {
if even {
mid := (lons[0] + lons[1]) / 2
if lon > mid {
ch |= masks[bit]
lons[0] = mid
} else {
lons[1] = mid
}
} else {
mid := (lats[0] + lats[1]) / 2
if lat > mid {
ch |= masks[bit]
lats[0] = mid
} else {
lats[1] = mid
}
}
even = !even
if bit < 4 {
bit++
} else {
geoHash += string(base32encoding.enc[ch])
ch = 0
bit = 0
}
}
return geoHash
}

View file

@ -16,6 +16,7 @@ package geo
import (
"reflect"
"strconv"
"strings"
)
@ -24,6 +25,8 @@ import (
// Container:
// slice length 2 (GeoJSON)
// first element lon, second element lat
// string (coordinates separated by comma, or a geohash)
// first element lat, second element lon
// map[string]interface{}
// exact keys lat and lon or lng
// struct
@ -36,10 +39,14 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
var foundLon, foundLat bool
thingVal := reflect.ValueOf(thing)
if !thingVal.IsValid() {
return lon, lat, false
}
thingTyp := thingVal.Type()
// is it a slice
if thingVal.IsValid() && thingVal.Kind() == reflect.Slice {
if thingVal.Kind() == reflect.Slice {
// must be length 2
if thingVal.Len() == 2 {
first := thingVal.Index(0)
@ -55,6 +62,37 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
}
}
// is it a string
if thingVal.Kind() == reflect.String {
geoStr := thingVal.Interface().(string)
if strings.Contains(geoStr, ",") {
// geo point with coordinates split by comma
points := strings.Split(geoStr, ",")
for i, point := range points {
// trim any leading or trailing white spaces
points[i] = strings.TrimSpace(point)
}
if len(points) == 2 {
var err error
lat, err = strconv.ParseFloat(points[0], 64)
if err == nil {
foundLat = true
}
lon, err = strconv.ParseFloat(points[1], 64)
if err == nil {
foundLon = true
}
}
} else {
// geohash
if len(geoStr) <= geoHashMaxLength {
lat, lon = DecodeGeoHash(geoStr)
foundLat = true
foundLon = true
}
}
}
// is it a map
if l, ok := thing.(map[string]interface{}); ok {
if lval, ok := l["lon"]; ok {
@ -68,7 +106,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
}
// now try reflection on struct fields
if thingVal.IsValid() && thingVal.Kind() == reflect.Struct {
if thingVal.Kind() == reflect.Struct {
for i := 0; i < thingVal.NumField(); i++ {
fieldName := thingTyp.Field(i).Name
if strings.HasPrefix(strings.ToLower(fieldName), "lon") {
@ -113,6 +151,9 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
// extract numeric value (if possible) and returns a float64
func extractNumericVal(v interface{}) (float64, bool) {
val := reflect.ValueOf(v)
if !val.IsValid() {
return 0, false
}
typ := val.Type()
switch typ.Kind() {
case reflect.Float32, reflect.Float64:

25
vendor/github.com/blevesearch/bleve/go.mod generated vendored Normal file
View file

@ -0,0 +1,25 @@
module github.com/blevesearch/bleve
go 1.13
require (
github.com/RoaringBitmap/roaring v0.4.21
github.com/blevesearch/blevex v0.0.0-20190916190636-152f0fe5c040
github.com/blevesearch/go-porterstemmer v1.0.3
github.com/blevesearch/segment v0.9.0
github.com/blevesearch/snowballstem v0.9.0
github.com/blevesearch/zap/v11 v11.0.7
github.com/blevesearch/zap/v12 v12.0.7
github.com/couchbase/ghistogram v0.1.0 // indirect
github.com/couchbase/moss v0.1.0
github.com/couchbase/vellum v1.0.1
github.com/golang/protobuf v1.3.2
github.com/kljensen/snowball v0.6.0
github.com/rcrowley/go-metrics v0.0.0-20190826022208-cac0b30c2563
github.com/spf13/cobra v0.0.5
github.com/steveyen/gtreap v0.1.0
github.com/syndtr/goleveldb v1.0.0
github.com/willf/bitset v1.1.10
go.etcd.io/bbolt v1.3.4
golang.org/x/text v0.3.0
)

View file

@ -117,6 +117,26 @@ func (b *Batch) String() string {
// be re-used in the future.
func (b *Batch) Reset() {
b.internal.Reset()
b.lastDocSize = 0
b.totalSize = 0
}
func (b *Batch) Merge(o *Batch) {
if o != nil && o.internal != nil {
b.internal.Merge(o.internal)
if o.LastDocSize() > 0 {
b.lastDocSize = o.LastDocSize()
}
b.totalSize = uint64(b.internal.TotalDocSize())
}
}
func (b *Batch) SetPersistedCallback(f index.BatchCallback) {
b.internal.SetPersistedCallback(f)
}
func (b *Batch) PersistedCallback() index.BatchCallback {
return b.internal.PersistedCallback()
}
// An Index implements all the indexing and searching

View file

@ -98,18 +98,33 @@ type IndexReader interface {
Close() error
}
// The Regexp interface defines the subset of the regexp.Regexp API
// methods that are used by bleve indexes, allowing callers to pass in
// alternate implementations.
type Regexp interface {
FindStringIndex(s string) (loc []int)
LiteralPrefix() (prefix string, complete bool)
String() string
}
type IndexReaderRegexp interface {
FieldDictRegexp(field string, regex []byte) (FieldDict, error)
FieldDictRegexp(field string, regex string) (FieldDict, error)
}
type IndexReaderFuzzy interface {
FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error)
FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
}
type IndexReaderOnly interface {
FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
}
type IndexReaderContains interface {
FieldDictContains(field string) (FieldDictContains, error)
}
// FieldTerms contains the terms used by a document, keyed by field
type FieldTerms map[string][]string
@ -219,6 +234,10 @@ type FieldDict interface {
Close() error
}
type FieldDictContains interface {
Contains(key []byte) (bool, error)
}
// DocIDReader is the interface exposing enumeration of documents identifiers.
// Close the reader to release associated resources.
type DocIDReader interface {
@ -237,9 +256,12 @@ type DocIDReader interface {
Close() error
}
type BatchCallback func(error)
type Batch struct {
IndexOps map[string]*document.Document
InternalOps map[string][]byte
IndexOps map[string]*document.Document
InternalOps map[string][]byte
persistedCallback BatchCallback
}
func NewBatch() *Batch {
@ -265,6 +287,14 @@ func (b *Batch) DeleteInternal(key []byte) {
b.InternalOps[string(key)] = nil
}
func (b *Batch) SetPersistedCallback(f BatchCallback) {
b.persistedCallback = f
}
func (b *Batch) PersistedCallback() BatchCallback {
return b.persistedCallback
}
func (b *Batch) String() string {
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
for k, v := range b.IndexOps {
@ -287,6 +317,27 @@ func (b *Batch) String() string {
func (b *Batch) Reset() {
b.IndexOps = make(map[string]*document.Document)
b.InternalOps = make(map[string][]byte)
b.persistedCallback = nil
}
func (b *Batch) Merge(o *Batch) {
for k, v := range o.IndexOps {
b.IndexOps[k] = v
}
for k, v := range o.InternalOps {
b.InternalOps[k] = v
}
}
func (b *Batch) TotalDocSize() int {
var s int
for k, v := range b.IndexOps {
if v != nil {
s += v.Size() + size.SizeOfString
}
s += len(k)
}
return s
}
// Optimizable represents an optional interface that implementable by
@ -298,11 +349,19 @@ type Optimizable interface {
Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
}
// Represents a result of optimization -- see the Finish() method.
type Optimized interface{}
type OptimizableContext interface {
// Once all the optimzable resources have been provided the same
// OptimizableContext instance, the optimization preparations are
// finished or completed via the Finish() method.
Finish() error
//
// Depending on the optimization being performed, the Finish()
// method might return a non-nil Optimized instance. For example,
// the Optimized instance might represent an optimized
// TermFieldReader instance.
Finish() (Optimized, error)
}
type DocValueReader interface {

View file

@ -302,7 +302,7 @@ Map local bitsets into global number space (global meaning cross-segment but sti
IndexSnapshot already should have mapping something like:
0 - Offset 0
1 - Offset 3 (because segment 0 had 3 docs)
2 - Offset 4 (becuase segment 1 had 1 doc)
2 - Offset 4 (because segment 1 had 1 doc)
This maps to search result bitset:

View file

@ -19,6 +19,7 @@ import (
"sync/atomic"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
)
@ -29,8 +30,9 @@ type segmentIntroduction struct {
ids []string
internal map[string][]byte
applied chan error
persisted chan error
applied chan error
persisted chan error
persistedCallback index.BatchCallback
}
type persistIntroduction struct {
@ -74,11 +76,6 @@ OUTER:
case persist := <-s.persists:
s.introducePersist(persist)
case revertTo := <-s.revertToSnapshots:
err := s.revertToSnapshot(revertTo)
if err != nil {
continue OUTER
}
}
var epochCurr uint64
@ -107,8 +104,11 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
s.rootLock.RLock()
root := s.root
root.AddRef()
s.rootLock.RUnlock()
defer func() { _ = root.DecRef() }()
nsegs := len(root.segment)
// prepare new index snapshot
@ -123,6 +123,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
// iterate through current segments
var running uint64
var docsToPersistCount, memSegments, fileSegments uint64
for i := range root.segment {
// see if optimistic work included this segment
delta, ok := next.obsoletes[root.segment[i].id]
@ -161,8 +162,19 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
newSnapshot.offsets = append(newSnapshot.offsets, running)
running += newss.segment.Count()
}
if isMemorySegment(root.segment[i]) {
docsToPersistCount += root.segment[i].Count()
memSegments++
} else {
fileSegments++
}
}
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
// append new segment, if any, to end of the new index snapshot
if next.data != nil {
newSegmentSnapshot := &SegmentSnapshot{
@ -197,6 +209,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
if next.persisted != nil {
s.rootPersisted = append(s.rootPersisted, next.persisted)
}
if next.persistedCallback != nil {
s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback)
}
// swap in new index snapshot
newSnapshot.epoch = s.nextSnapshotEpoch
s.nextSnapshotEpoch++
@ -221,10 +236,13 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) {
s.rootLock.Lock()
root := s.root
root.AddRef()
nextSnapshotEpoch := s.nextSnapshotEpoch
s.nextSnapshotEpoch++
s.rootLock.Unlock()
defer func() { _ = root.DecRef() }()
newIndexSnapshot := &IndexSnapshot{
parent: s,
epoch: nextSnapshotEpoch,
@ -235,6 +253,7 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) {
creator: "introducePersist",
}
var docsToPersistCount, memSegments, fileSegments uint64
for i, segmentSnapshot := range root.segment {
// see if this segment has been replaced
if replacement, ok := persist.persisted[segmentSnapshot.id]; ok {
@ -251,9 +270,17 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) {
// update items persisted incase of a new segment snapshot
atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count())
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
fileSegments++
} else {
newIndexSnapshot.segment[i] = root.segment[i]
newIndexSnapshot.segment[i].segment.AddRef()
if isMemorySegment(root.segment[i]) {
docsToPersistCount += root.segment[i].Count()
memSegments++
} else {
fileSegments++
}
}
newIndexSnapshot.offsets[i] = root.offsets[i]
}
@ -262,6 +289,9 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) {
newIndexSnapshot.internal[k] = v
}
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
newIndexSnapshot.updateSize()
s.rootLock.Lock()
rootPrev := s.root
@ -276,14 +306,19 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) {
close(persist.applied)
}
// The introducer should definitely handle the segmentMerge.notify
// channel before exiting the introduceMerge.
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1)
s.rootLock.RLock()
root := s.root
root.AddRef()
s.rootLock.RUnlock()
defer func() { _ = root.DecRef() }()
newSnapshot := &IndexSnapshot{
parent: s,
internal: root.internal,
@ -293,7 +328,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
// iterate through current segments
newSegmentDeleted := roaring.NewBitmap()
var running uint64
var running, docsToPersistCount, memSegments, fileSegments uint64
for i := range root.segment {
segmentID := root.segment[i].id
if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok {
@ -329,7 +364,15 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
root.segment[i].segment.AddRef()
newSnapshot.offsets = append(newSnapshot.offsets, running)
running += root.segment[i].segment.Count()
if isMemorySegment(root.segment[i]) {
docsToPersistCount += root.segment[i].Count()
memSegments++
} else {
fileSegments++
}
}
}
// before the newMerge introduction, need to clean the newly
@ -360,8 +403,20 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
})
newSnapshot.offsets = append(newSnapshot.offsets, running)
atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1)
switch nextMerge.new.(type) {
case segment.PersistedSegment:
fileSegments++
default:
docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality()
memSegments++
}
}
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response
newSnapshot.updateSize()
@ -384,65 +439,11 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
close(nextMerge.notify)
}
func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1)
defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1)
if revertTo.snapshot == nil {
err := fmt.Errorf("Cannot revert to a nil snapshot")
revertTo.applied <- err
return err
func isMemorySegment(s *SegmentSnapshot) bool {
switch s.segment.(type) {
case segment.PersistedSegment:
return false
default:
return true
}
// acquire lock
s.rootLock.Lock()
// prepare a new index snapshot, based on next snapshot
newSnapshot := &IndexSnapshot{
parent: s,
segment: make([]*SegmentSnapshot, len(revertTo.snapshot.segment)),
offsets: revertTo.snapshot.offsets,
internal: revertTo.snapshot.internal,
epoch: s.nextSnapshotEpoch,
refs: 1,
creator: "revertToSnapshot",
}
s.nextSnapshotEpoch++
// iterate through segments
for i, segmentSnapshot := range revertTo.snapshot.segment {
newSnapshot.segment[i] = &SegmentSnapshot{
id: segmentSnapshot.id,
segment: segmentSnapshot.segment,
deleted: segmentSnapshot.deleted,
cachedDocs: segmentSnapshot.cachedDocs,
creator: segmentSnapshot.creator,
}
newSnapshot.segment[i].segment.AddRef()
// remove segment from ineligibleForRemoval map
filename := zapFileName(segmentSnapshot.id)
delete(s.ineligibleForRemoval, filename)
}
if revertTo.persisted != nil {
s.rootPersisted = append(s.rootPersisted, revertTo.persisted)
}
newSnapshot.updateSize()
// swap in new snapshot
rootPrev := s.root
s.root = newSnapshot
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
// release lock
s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
close(revertTo.applied)
return nil
}

View file

@ -18,13 +18,13 @@ import (
"encoding/json"
"fmt"
"os"
"strings"
"sync/atomic"
"time"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/mergeplan"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
)
func (s *Scorch) mergerLoop() {
@ -46,12 +46,12 @@ OUTER:
default:
// check to see if there is a new snapshot to persist
s.rootLock.RLock()
s.rootLock.Lock()
ourSnapshot := s.root
ourSnapshot.AddRef()
atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size()))
atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch)
s.rootLock.RUnlock()
s.rootLock.Unlock()
if ourSnapshot.epoch != lastEpochMergePlanned {
startTime := time.Now()
@ -60,7 +60,7 @@ OUTER:
err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions)
if err != nil {
atomic.StoreUint64(&s.iStats.mergeEpoch, 0)
if err == ErrClosed {
if err == segment.ErrClosed {
// index has been closed
_ = ourSnapshot.DecRef()
break OUTER
@ -130,18 +130,18 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions,
func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
options *mergeplan.MergePlanOptions) error {
// build list of zap segments in this snapshot
var onlyZapSnapshots []mergeplan.Segment
// build list of persisted segments in this snapshot
var onlyPersistedSnapshots []mergeplan.Segment
for _, segmentSnapshot := range ourSnapshot.segment {
if _, ok := segmentSnapshot.segment.(*zap.Segment); ok {
onlyZapSnapshots = append(onlyZapSnapshots, segmentSnapshot)
if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok {
onlyPersistedSnapshots = append(onlyPersistedSnapshots, segmentSnapshot)
}
}
atomic.AddUint64(&s.stats.TotFileMergePlan, 1)
// give this list to the planner
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options)
resultMergePlan, err := mergeplan.Plan(onlyPersistedSnapshots, options)
if err != nil {
atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1)
return fmt.Errorf("merge planning err: %v", err)
@ -151,13 +151,13 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1)
return nil
}
atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1)
atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks)))
// process tasks in serial for now
var notifications []chan *IndexSnapshot
var filenames []string
for _, task := range resultMergePlan.Tasks {
if len(task.Segments) == 0 {
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1)
@ -168,26 +168,32 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
oldMap := make(map[uint64]*SegmentSnapshot)
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments))
segmentsToMerge := make([]segment.Segment, 0, len(task.Segments))
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments))
for _, planSegment := range task.Segments {
if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok {
oldMap[segSnapshot.id] = segSnapshot
if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok {
if persistedSeg, ok := segSnapshot.segment.(segment.PersistedSegment); ok {
if segSnapshot.LiveSize() == 0 {
atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1)
oldMap[segSnapshot.id] = nil
} else {
segmentsToMerge = append(segmentsToMerge, zapSeg)
segmentsToMerge = append(segmentsToMerge, segSnapshot.segment)
docsToDrop = append(docsToDrop, segSnapshot.deleted)
}
// track the files getting merged for unsetting the
// removal ineligibility. This helps to unflip files
// even with fast merger, slow persister work flows.
path := persistedSeg.Path()
filenames = append(filenames,
strings.TrimPrefix(path, s.path+string(os.PathSeparator)))
}
}
}
var oldNewDocNums map[uint64][]uint64
var segment segment.Segment
var seg segment.Segment
if len(segmentsToMerge) > 0 {
filename := zapFileName(newSegmentID)
s.markIneligibleForRemoval(filename)
@ -196,9 +202,9 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
fileMergeZapStartTime := time.Now()
atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, DefaultChunkFactor)
newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path,
s.closeCh, s)
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1)
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes)
fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime))
atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime)
@ -209,10 +215,13 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
if err != nil {
s.unmarkIneligibleForRemoval(filename)
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
if err == segment.ErrClosed {
return err
}
return fmt.Errorf("merging failed: %v", err)
}
segment, err = zap.Open(path)
seg, err = s.segPlugin.Open(path)
if err != nil {
s.unmarkIneligibleForRemoval(filename)
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
@ -230,33 +239,41 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
id: newSegmentID,
old: oldMap,
oldNewDocNums: oldNewDocNums,
new: segment,
notify: make(chan *IndexSnapshot, 1),
new: seg,
notify: make(chan *IndexSnapshot),
}
notifications = append(notifications, sm.notify)
// give it to the introducer
select {
case <-s.closeCh:
_ = segment.Close()
return ErrClosed
_ = seg.Close()
return segment.ErrClosed
case s.merges <- sm:
atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1)
}
introStartTime := time.Now()
// it is safe to blockingly wait for the merge introduction
// here as the introducer is bound to handle the notify channel.
newSnapshot := <-sm.notify
introTime := uint64(time.Since(introStartTime))
atomic.AddUint64(&s.stats.TotFileMergeZapIntroductionTime, introTime)
if atomic.LoadUint64(&s.stats.MaxFileMergeZapIntroductionTime) < introTime {
atomic.StoreUint64(&s.stats.MaxFileMergeZapIntroductionTime, introTime)
}
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1)
if newSnapshot != nil {
_ = newSnapshot.DecRef()
}
atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1)
}
for _, notification := range notifications {
select {
case <-s.closeCh:
return ErrClosed
case newSnapshot := <-notification:
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1)
if newSnapshot != nil {
_ = newSnapshot.DecRef()
}
}
// once all the newly merged segment introductions are done,
// its safe to unflip the removal ineligibility for the replaced
// older segments
for _, f := range filenames {
s.unmarkIneligibleForRemoval(f)
}
return nil
@ -274,8 +291,8 @@ type segmentMerge struct {
// persisted segment, and synchronously introduce that new segment
// into the root
func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int,
chunkFactor uint32) (*IndexSnapshot, uint64, error) {
sbs []segment.Segment, sbsDrops []*roaring.Bitmap,
sbsIndexes []int) (*IndexSnapshot, uint64, error) {
atomic.AddUint64(&s.stats.TotMemMergeBeg, 1)
memMergeZapStartTime := time.Now()
@ -287,7 +304,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
path := s.path + string(os.PathSeparator) + filename
newDocNums, _, err :=
zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor)
s.segPlugin.Merge(sbs, sbsDrops, path, s.closeCh, s)
atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1)
@ -302,22 +319,22 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
return nil, 0, err
}
segment, err := zap.Open(path)
seg, err := s.segPlugin.Open(path)
if err != nil {
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
return nil, 0, err
}
// update persisted stats
atomic.AddUint64(&s.stats.TotPersistedItems, segment.Count())
atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count())
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
sm := &segmentMerge{
id: newSegmentID,
old: make(map[uint64]*SegmentSnapshot),
oldNewDocNums: make(map[uint64][]uint64),
new: segment,
notify: make(chan *IndexSnapshot, 1),
new: seg,
notify: make(chan *IndexSnapshot),
}
for i, idx := range sbsIndexes {
@ -328,17 +345,20 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
select { // send to introducer
case <-s.closeCh:
_ = segment.DecRef()
return nil, 0, ErrClosed
_ = seg.DecRef()
return nil, 0, segment.ErrClosed
case s.merges <- sm:
}
select { // wait for introduction to complete
case <-s.closeCh:
return nil, 0, ErrClosed
case newSnapshot := <-sm.notify:
// blockingly wait for the introduction to complete
newSnapshot := <-sm.notify
if newSnapshot != nil {
atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs)))
atomic.AddUint64(&s.stats.TotMemMergeDone, 1)
return newSnapshot, newSegmentID, nil
}
return newSnapshot, newSegmentID, nil
}
func (s *Scorch) ReportBytesWritten(bytesWritten uint64) {
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten)
}

View file

@ -217,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
if len(roster) > 0 {
rosterScore := scoreSegments(roster, o)
if len(bestRoster) <= 0 || rosterScore < bestRosterScore {
if len(bestRoster) == 0 || rosterScore < bestRosterScore {
bestRoster = roster
bestRosterScore = rosterScore
}
}
}
if len(bestRoster) <= 0 {
if len(bestRoster) == 0 {
return rv, nil
}

View file

@ -18,17 +18,37 @@ import (
"fmt"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/blevesearch/bleve/index/scorch/segment"
)
func (s *IndexSnapshotTermFieldReader) Optimize(kind string, octx index.OptimizableContext) (
index.OptimizableContext, error) {
if kind != "conjunction" {
return octx, nil
var OptimizeConjunction = true
var OptimizeConjunctionUnadorned = true
var OptimizeDisjunctionUnadorned = true
func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
octx index.OptimizableContext) (index.OptimizableContext, error) {
if OptimizeConjunction && kind == "conjunction" {
return s.optimizeConjunction(octx)
}
if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
return s.optimizeConjunctionUnadorned(octx)
}
if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
return s.optimizeDisjunctionUnadorned(octx)
}
return octx, nil
}
var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)
// ----------------------------------------------------------------
func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
octx index.OptimizableContext) (index.OptimizableContext, error) {
if octx == nil {
octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
}
@ -39,7 +59,7 @@ func (s *IndexSnapshotTermFieldReader) Optimize(kind string, octx index.Optimiza
}
if o.snapshot != s.snapshot {
return nil, fmt.Errorf("tried to optimize across different snapshots")
return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
}
o.tfrs = append(o.tfrs, s)
@ -53,41 +73,324 @@ type OptimizeTFRConjunction struct {
tfrs []*IndexSnapshotTermFieldReader
}
func (o *OptimizeTFRConjunction) Finish() error {
func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
if len(o.tfrs) <= 1 {
return nil
return nil, nil
}
for i := range o.snapshot.segment {
itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator)
if !ok || itr0.ActualBM == nil {
itr0, ok := o.tfrs[0].iterators[i].(segment.OptimizablePostingsIterator)
if !ok || itr0.ActualBitmap() == nil {
continue
}
itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator)
if !ok || itr1.ActualBM == nil {
itr1, ok := o.tfrs[1].iterators[i].(segment.OptimizablePostingsIterator)
if !ok || itr1.ActualBitmap() == nil {
continue
}
bm := roaring.And(itr0.ActualBM, itr1.ActualBM)
bm := roaring.And(itr0.ActualBitmap(), itr1.ActualBitmap())
for _, tfr := range o.tfrs[2:] {
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
if !ok || itr.ActualBM == nil {
itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
if !ok || itr.ActualBitmap() == nil {
continue
}
bm.And(itr.ActualBM)
bm.And(itr.ActualBitmap())
}
// in this conjunction optimization, the postings iterators
// will all share the same AND'ed together actual bitmap. The
// regular conjunction searcher machinery will still be used,
// but the underlying bitmap will be smaller.
for _, tfr := range o.tfrs {
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
if ok && itr.ActualBM != nil {
itr.ActualBM = bm
itr.Actual = bm.Iterator()
itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
if ok && itr.ActualBitmap() != nil {
itr.ReplaceActual(bm)
}
}
}
return nil
return nil, nil
}
// ----------------------------------------------------------------
// An "unadorned" conjunction optimization is appropriate when
// additional or subsidiary information like freq-norm's and
// term-vectors are not required, and instead only the internal-id's
// are needed.
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
octx index.OptimizableContext) (index.OptimizableContext, error) {
if octx == nil {
octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
}
o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
if !ok {
return nil, nil
}
if o.snapshot != s.snapshot {
return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
}
o.tfrs = append(o.tfrs, s)
return o, nil
}
type OptimizeTFRConjunctionUnadorned struct {
snapshot *IndexSnapshot
tfrs []*IndexSnapshotTermFieldReader
}
var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
var OptimizeTFRConjunctionUnadornedField = "*"
// Finish of an unadorned conjunction optimization will compute a
// termFieldReader with an "actual" bitmap that represents the
// constituent bitmaps AND'ed together. This termFieldReader cannot
// provide any freq-norm or termVector associated information.
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
if len(o.tfrs) <= 1 {
return nil, nil
}
// We use an artificial term and field because the optimized
// termFieldReader can represent multiple terms and fields.
oTFR := &IndexSnapshotTermFieldReader{
term: OptimizeTFRConjunctionUnadornedTerm,
field: OptimizeTFRConjunctionUnadornedField,
snapshot: o.snapshot,
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
segmentOffset: 0,
includeFreq: false,
includeNorm: false,
includeTermVectors: false,
}
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
OUTER:
for i := range o.snapshot.segment {
actualBMs = actualBMs[:0]
var docNum1HitLast uint64
var docNum1HitLastOk bool
for _, tfr := range o.tfrs {
if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
// An empty postings iterator means the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
if !ok {
// We only optimize postings iterators that support this operation.
return nil, nil
}
// If the postings iterator is "1-hit" optimized, then we
// can perform several optimizations up-front here.
docNum1Hit, ok := itr.DocNum1Hit()
if ok {
if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
// The docNum1Hit doesn't match the previous
// docNum1HitLast, so the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
docNum1HitLast = docNum1Hit
docNum1HitLastOk = true
continue
}
if itr.ActualBitmap() == nil {
// An empty actual bitmap means the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
// Collect the actual bitmap for more processing later.
actualBMs = append(actualBMs, itr.ActualBitmap())
}
if docNum1HitLastOk {
// We reach here if all the 1-hit optimized posting
// iterators had the same 1-hit docNum, so we can check if
// our collected actual bitmaps also have that docNum.
for _, bm := range actualBMs {
if !bm.Contains(uint32(docNum1HitLast)) {
// The docNum1Hit isn't in one of our actual
// bitmaps, so the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
}
// The actual bitmaps and docNum1Hits all contain or have
// the same 1-hit docNum, so that's our AND'ed result.
oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFrom1Hit(docNum1HitLast)
continue OUTER
}
if len(actualBMs) == 0 {
// If we've collected no actual bitmaps at this point,
// then the entire AND is empty.
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
continue OUTER
}
if len(actualBMs) == 1 {
// If we've only 1 actual bitmap, then that's our result.
oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(actualBMs[0])
continue OUTER
}
// Else, AND together our collected bitmaps as our result.
bm := roaring.And(actualBMs[0], actualBMs[1])
for _, actualBM := range actualBMs[2:] {
bm.And(actualBM)
}
oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm)
}
return oTFR, nil
}
// ----------------------------------------------------------------
// An "unadorned" disjunction optimization is appropriate when
// additional or subsidiary information like freq-norm's and
// term-vectors are not required, and instead only the internal-id's
// are needed.
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
octx index.OptimizableContext) (index.OptimizableContext, error) {
if octx == nil {
octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
}
o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
if !ok {
return nil, nil
}
if o.snapshot != s.snapshot {
return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
}
o.tfrs = append(o.tfrs, s)
return o, nil
}
type OptimizeTFRDisjunctionUnadorned struct {
snapshot *IndexSnapshot
tfrs []*IndexSnapshotTermFieldReader
}
var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
var OptimizeTFRDisjunctionUnadornedField = "*"
// Finish of an unadorned disjunction optimization will compute a
// termFieldReader with an "actual" bitmap that represents the
// constituent bitmaps OR'ed together. This termFieldReader cannot
// provide any freq-norm or termVector associated information.
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
if len(o.tfrs) <= 1 {
return nil, nil
}
for i := range o.snapshot.segment {
var cMax uint64
for _, tfr := range o.tfrs {
itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
if !ok {
return nil, nil
}
if itr.ActualBitmap() != nil {
c := itr.ActualBitmap().GetCardinality()
if cMax < c {
cMax = c
}
}
}
// Heuristic to skip the optimization if all the constituent
// bitmaps are too small, where the processing & resource
// overhead to create the OR'ed bitmap outweighs the benefit.
if cMax < OptimizeDisjunctionUnadornedMinChildCardinality {
return nil, nil
}
}
// We use an artificial term and field because the optimized
// termFieldReader can represent multiple terms and fields.
oTFR := &IndexSnapshotTermFieldReader{
term: OptimizeTFRDisjunctionUnadornedTerm,
field: OptimizeTFRDisjunctionUnadornedField,
snapshot: o.snapshot,
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
segmentOffset: 0,
includeFreq: false,
includeNorm: false,
includeTermVectors: false,
}
var docNums []uint32 // Collected docNum's from 1-hit posting lists.
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
for i := range o.snapshot.segment {
docNums = docNums[:0]
actualBMs = actualBMs[:0]
for _, tfr := range o.tfrs {
itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator)
if !ok {
return nil, nil
}
docNum, ok := itr.DocNum1Hit()
if ok {
docNums = append(docNums, uint32(docNum))
continue
}
if itr.ActualBitmap() != nil {
actualBMs = append(actualBMs, itr.ActualBitmap())
}
}
var bm *roaring.Bitmap
if len(actualBMs) > 2 {
bm = roaring.HeapOr(actualBMs...)
} else if len(actualBMs) == 2 {
bm = roaring.Or(actualBMs[0], actualBMs[1])
} else if len(actualBMs) == 1 {
bm = actualBMs[0].Clone()
}
if bm == nil {
bm = roaring.New()
}
bm.AddMany(docNums)
oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm)
}
return oTFR, nil
}

View file

@ -17,9 +17,11 @@ package scorch
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"math"
"os"
"path/filepath"
"strconv"
@ -28,23 +30,54 @@ import (
"time"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/boltdb/bolt"
bolt "go.etcd.io/bbolt"
)
var DefaultChunkFactor uint32 = 1024
// DefaultPersisterNapTimeMSec is kept to zero as this helps in direct
// persistence of segments with the default safe batch option.
// If the default safe batch option results in high number of
// files on disk, then users may initialise this configuration parameter
// with higher values so that the persister will nap a bit within it's
// work loop to favour better in-memory merging of segments to result
// in fewer segment files on disk. But that may come with an indexing
// performance overhead.
// Unsafe batch users are advised to override this to higher value
// for better performance especially with high data density.
var DefaultPersisterNapTimeMSec int = 0 // ms
// Arbitrary number, need to make it configurable.
// Lower values like 10/making persister really slow
// doesn't work well as it is creating more files to
// persist for in next persist iteration and spikes the # FDs.
// Ideal value should let persister also proceed at
// an optimum pace so that the merger can skip
// many intermediate snapshots.
// This needs to be based on empirical data.
// TODO - may need to revisit this approach/value.
var epochDistance = uint64(5)
// DefaultPersisterNapUnderNumFiles helps in controlling the pace of
// persister. At times of a slow merger progress with heavy file merging
// operations, its better to pace down the persister for letting the merger
// to catch up within a range defined by this parameter.
// Fewer files on disk (as per the merge plan) would result in keeping the
// file handle usage under limit, faster disk merger and a healthier index.
// Its been observed that such a loosely sync'ed introducer-persister-merger
// trio results in better overall performance.
var DefaultPersisterNapUnderNumFiles int = 1000
var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64
type persisterOptions struct {
// PersisterNapTimeMSec controls the wait/delay injected into
// persistence workloop to improve the chances for
// a healthier and heavier in-memory merging
PersisterNapTimeMSec int
// PersisterNapTimeMSec > 0, and the number of files is less than
// PersisterNapUnderNumFiles, then the persister will sleep
// PersisterNapTimeMSec amount of time to improve the chances for
// a healthier and heavier in-memory merging
PersisterNapUnderNumFiles int
// MemoryPressurePauseThreshold let persister to have a better leeway
// for prudently performing the memory merge of segments on a memory
// pressure situation. Here the config value is an upper threshold
// for the number of paused application threads. The default value would
// be a very high number to always favour the merging of memory segments.
MemoryPressurePauseThreshold uint64
}
type notificationChan chan struct{}
@ -54,6 +87,16 @@ func (s *Scorch) persisterLoop() {
var persistWatchers []*epochWatcher
var lastPersistedEpoch, lastMergedEpoch uint64
var ew *epochWatcher
var unpersistedCallbacks []index.BatchCallback
po, err := s.parsePersisterOptions()
if err != nil {
s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err))
s.asyncTasks.Done()
return
}
OUTER:
for {
atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1)
@ -69,10 +112,11 @@ OUTER:
lastMergedEpoch = ew.epoch
}
lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
lastMergedEpoch, persistWatchers)
lastMergedEpoch, persistWatchers, po)
var ourSnapshot *IndexSnapshot
var ourPersisted []chan error
var ourPersistedCallbacks []index.BatchCallback
// check to see if there is a new snapshot to persist
s.rootLock.Lock()
@ -81,6 +125,8 @@ OUTER:
ourSnapshot.AddRef()
ourPersisted = s.rootPersisted
s.rootPersisted = nil
ourPersistedCallbacks = s.persistedCallbacks
s.persistedCallbacks = nil
atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size()))
atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch)
}
@ -89,7 +135,7 @@ OUTER:
if ourSnapshot != nil {
startTime := time.Now()
err := s.persistSnapshot(ourSnapshot)
err := s.persistSnapshot(ourSnapshot, po)
for _, ch := range ourPersisted {
if err != nil {
ch <- err
@ -98,17 +144,34 @@ OUTER:
}
if err != nil {
atomic.StoreUint64(&s.iStats.persistEpoch, 0)
if err == ErrClosed {
if err == segment.ErrClosed {
// index has been closed
_ = ourSnapshot.DecRef()
break OUTER
}
// save this current snapshot's persistedCallbacks, to invoke during
// the retry attempt
unpersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...)
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
_ = ourSnapshot.DecRef()
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
continue OUTER
}
if unpersistedCallbacks != nil {
// in the event of this being a retry attempt for persisting a snapshot
// that had earlier failed, prepend the persistedCallbacks associated
// with earlier segment(s) to the latest persistedCallbacks
ourPersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...)
unpersistedCallbacks = nil
}
for i := range ourPersistedCallbacks {
ourPersistedCallbacks[i](err)
}
atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch)
lastPersistedEpoch = ourSnapshot.epoch
@ -179,15 +242,51 @@ func notifyMergeWatchers(lastPersistedEpoch uint64,
return watchersNext
}
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64,
persistWatchers []*epochWatcher) (uint64, []*epochWatcher) {
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64,
lastMergedEpoch uint64, persistWatchers []*epochWatcher,
po *persisterOptions) (uint64, []*epochWatcher) {
// first, let the watchers proceed if they lag behind
// First, let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
// Check the merger lag by counting the segment files on disk,
numFilesOnDisk, _, _ := s.diskFileStats(nil)
// On finding fewer files on disk, persister takes a short pause
// for sufficient in-memory segments to pile up for the next
// memory merge cum persist loop.
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) &&
po.PersisterNapTimeMSec > 0 && s.paused() == 0 {
select {
case <-s.closeCh:
case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)):
atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1)
case ew := <-s.persisterNotifier:
// unblock the merger in meantime
persistWatchers = append(persistWatchers, ew)
lastMergedEpoch = ew.epoch
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1)
}
return lastMergedEpoch, persistWatchers
}
// Finding too many files on disk could be due to two reasons.
// 1. Too many older snapshots awaiting the clean up.
// 2. The merger could be lagging behind on merging the disk files.
if numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) {
s.removeOldData()
numFilesOnDisk, _, _ = s.diskFileStats(nil)
}
// Persister pause until the merger catches up to reduce the segment
// file count under the threshold.
// But if there is memory pressure, then skip this sleep maneuvers.
OUTER:
// check for slow merger and await until the merger catch up
for lastPersistedEpoch > lastMergedEpoch+epochDistance {
for po.PersisterNapUnderNumFiles > 0 &&
numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) &&
lastMergedEpoch < lastPersistedEpoch {
atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1)
select {
@ -202,18 +301,46 @@ OUTER:
// let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
numFilesOnDisk, _, _ = s.diskFileStats(nil)
}
return lastMergedEpoch, persistWatchers
}
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
persisted, err := s.persistSnapshotMaybeMerge(snapshot)
if err != nil {
return err
func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) {
po := persisterOptions{
PersisterNapTimeMSec: DefaultPersisterNapTimeMSec,
PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles,
MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold,
}
if persisted {
return nil
if v, ok := s.config["scorchPersisterOptions"]; ok {
b, err := json.Marshal(v)
if err != nil {
return &po, err
}
err = json.Unmarshal(b, &po)
if err != nil {
return &po, err
}
}
return &po, nil
}
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot,
po *persisterOptions) error {
// Perform in-memory segment merging only when the memory pressure is
// below the configured threshold, else the persister performs the
// direct persistence of segments.
if s.paused() < po.MemoryPressurePauseThreshold {
persisted, err := s.persistSnapshotMaybeMerge(snapshot)
if err != nil {
return err
}
if persisted {
return nil
}
}
return s.persistSnapshotDirect(snapshot)
@ -230,13 +357,13 @@ var DefaultMinSegmentsForInMemoryMerge = 2
func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
bool, error) {
// collect the in-memory zap segments (SegmentBase instances)
var sbs []*zap.SegmentBase
var sbs []segment.Segment
var sbsDrops []*roaring.Bitmap
var sbsIndexes []int
for i, segmentSnapshot := range snapshot.segment {
if sb, ok := segmentSnapshot.segment.(*zap.SegmentBase); ok {
sbs = append(sbs, sb)
if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); !ok {
sbs = append(sbs, segmentSnapshot.segment)
sbsDrops = append(sbsDrops, segmentSnapshot.deleted)
sbsIndexes = append(sbsIndexes, i)
}
@ -247,7 +374,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
}
newSnapshot, newSegmentID, err := s.mergeSegmentBases(
snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor)
snapshot, sbs, sbsDrops, sbsIndexes)
if err != nil {
return false, err
}
@ -329,13 +456,13 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
if err != nil {
return err
}
err = metaBucket.Put([]byte("type"), []byte(zap.Type))
err = metaBucket.Put(boltMetaDataSegmentTypeKey, []byte(s.segPlugin.Type()))
if err != nil {
return err
}
buf := make([]byte, binary.MaxVarintLen32)
binary.BigEndian.PutUint32(buf, zap.Version)
err = metaBucket.Put([]byte("version"), buf)
binary.BigEndian.PutUint32(buf, s.segPlugin.Version())
err = metaBucket.Put(boltMetaDataSegmentVersionKey, buf)
if err != nil {
return err
}
@ -364,11 +491,19 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
return err
}
switch seg := segmentSnapshot.segment.(type) {
case *zap.SegmentBase:
case segment.PersistedSegment:
path := seg.Path()
filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator))
err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename))
if err != nil {
return err
}
filenames = append(filenames, filename)
case segment.UnpersistedSegment:
// need to persist this to disk
filename := zapFileName(segmentSnapshot.id)
path := s.path + string(os.PathSeparator) + filename
err = zap.PersistSegmentBase(seg, path)
err = seg.Persist(path)
if err != nil {
return fmt.Errorf("error persisting segment: %v", err)
}
@ -378,14 +513,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
return err
}
filenames = append(filenames, filename)
case *zap.Segment:
path := seg.Path()
filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator))
err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename))
if err != nil {
return err
}
filenames = append(filenames, filename)
default:
return fmt.Errorf("unknown segment type: %T", seg)
}
@ -423,7 +551,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
}
}()
for segmentID, path := range newSegmentPaths {
newSegments[segmentID], err = zap.Open(path)
newSegments[segmentID], err = s.segPlugin.Open(path)
if err != nil {
return fmt.Errorf("error opening new segment at %s, %v", path, err)
}
@ -436,15 +564,13 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
select {
case <-s.closeCh:
err = ErrClosed
return err
return segment.ErrClosed
case s.persists <- persist:
}
select {
case <-s.closeCh:
err = ErrClosed
return err
return segment.ErrClosed
case <-persist.applied:
}
}
@ -481,6 +607,8 @@ var boltPathKey = []byte{'p'}
var boltDeletedKey = []byte{'d'}
var boltInternalKey = []byte{'i'}
var boltMetaDataKey = []byte{'m'}
var boltMetaDataSegmentTypeKey = []byte("type")
var boltMetaDataSegmentVersionKey = []byte("version")
func (s *Scorch) loadFromBolt() error {
return s.rootBolt.View(func(tx *bolt.Tx) error {
@ -521,11 +649,14 @@ func (s *Scorch) loadFromBolt() error {
s.nextSegmentID++
s.rootLock.Lock()
s.nextSnapshotEpoch = snapshotEpoch + 1
if s.root != nil {
_ = s.root.DecRef()
}
rootPrev := s.root
s.root = indexSnapshot
s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
foundRoot = true
}
return nil
@ -562,6 +693,23 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
refs: 1,
creator: "loadSnapshot",
}
// first we look for the meta-data bucket, this will tell us
// which segment type/version was used for this snapshot
// all operations for this scorch will use this type/version
metaBucket := snapshot.Bucket(boltMetaDataKey)
if metaBucket == nil {
_ = rv.DecRef()
return nil, fmt.Errorf("meta-data bucket missing")
}
segmentType := string(metaBucket.Get(boltMetaDataSegmentTypeKey))
segmentVersion := binary.BigEndian.Uint32(
metaBucket.Get(boltMetaDataSegmentVersionKey))
err := s.loadSegmentPlugin(segmentType, segmentVersion)
if err != nil {
_ = rv.DecRef()
return nil, fmt.Errorf(
"unable to load correct segment wrapper: %v", err)
}
var running uint64
c := snapshot.Cursor()
for k, _ := c.First(); k != nil; k, _ = c.Next() {
@ -606,7 +754,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
return nil, fmt.Errorf("segment path missing")
}
segmentPath := s.path + string(os.PathSeparator) + string(pathBytes)
segment, err := zap.Open(segmentPath)
segment, err := s.segPlugin.Open(segmentPath)
if err != nil {
return nil, fmt.Errorf("error opening bolt segment: %v", err)
}
@ -643,12 +791,11 @@ func (s *Scorch) removeOldData() {
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err))
}
atomic.AddUint64(&s.stats.TotSnapshotsRemovedFromMetaStore, uint64(removed))
if removed > 0 {
err = s.removeOldZapFiles()
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err))
}
err = s.removeOldZapFiles()
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err))
}
}
@ -690,7 +837,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
s.eligibleForRemoval = newEligible
s.rootLock.Unlock()
if len(epochsToRemove) <= 0 {
if len(epochsToRemove) == 0 {
return 0, nil
}

View file

@ -28,10 +28,9 @@ import (
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/registry"
"github.com/boltdb/bolt"
bolt "go.etcd.io/bbolt"
)
const Name = "scorch"
@ -41,12 +40,14 @@ const Version uint8 = 2
var ErrClosed = fmt.Errorf("scorch closed")
type Scorch struct {
nextSegmentID uint64
stats Stats
iStats internalStats
readOnly bool
version uint8
config map[string]interface{}
analysisQueue *index.AnalysisQueue
stats Stats
nextSegmentID uint64
path string
unsafeBatch bool
@ -54,6 +55,7 @@ type Scorch struct {
rootLock sync.RWMutex
root *IndexSnapshot // holds 1 ref-count on the root
rootPersisted []chan error // closed when root is persisted
persistedCallbacks []index.BatchCallback
nextSnapshotEpoch uint64
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
@ -64,7 +66,6 @@ type Scorch struct {
persists chan *persistIntroduction
merges chan *segmentMerge
introducerNotifier chan *epochWatcher
revertToSnapshots chan *snapshotReversion
persisterNotifier chan *epochWatcher
rootBolt *bolt.DB
asyncTasks sync.WaitGroup
@ -72,7 +73,11 @@ type Scorch struct {
onEvent func(event Event)
onAsyncError func(err error)
iStats internalStats
pauseLock sync.RWMutex
pauseCount uint64
segPlugin segment.Plugin
}
type internalStats struct {
@ -96,7 +101,25 @@ func NewScorch(storeName string,
nextSnapshotEpoch: 1,
closeCh: make(chan struct{}),
ineligibleForRemoval: map[string]bool{},
segPlugin: defaultSegmentPlugin,
}
// check if the caller has requested a specific segment type/version
forcedSegmentVersion, ok := config["forceSegmentVersion"].(int)
if ok {
forcedSegmentType, ok2 := config["forceSegmentType"].(string)
if !ok2 {
return nil, fmt.Errorf(
"forceSegmentVersion set to %d, must also specify forceSegmentType", forcedSegmentVersion)
}
err := rv.loadSegmentPlugin(forcedSegmentType,
uint32(forcedSegmentVersion))
if err != nil {
return nil, err
}
}
rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"}
ro, ok := config["read_only"].(bool)
if ok {
@ -117,9 +140,30 @@ func NewScorch(storeName string,
return rv, nil
}
func (s *Scorch) paused() uint64 {
s.pauseLock.Lock()
pc := s.pauseCount
s.pauseLock.Unlock()
return pc
}
func (s *Scorch) incrPause() {
s.pauseLock.Lock()
s.pauseCount++
s.pauseLock.Unlock()
}
func (s *Scorch) decrPause() {
s.pauseLock.Lock()
s.pauseCount--
s.pauseLock.Unlock()
}
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
if s.onEvent != nil {
s.incrPause()
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
s.decrPause()
}
}
@ -189,12 +233,14 @@ func (s *Scorch) openBolt() error {
}
}
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment)))
s.introductions = make(chan *segmentIntroduction)
s.persists = make(chan *persistIntroduction)
s.merges = make(chan *segmentMerge)
s.introducerNotifier = make(chan *epochWatcher, 1)
s.revertToSnapshots = make(chan *snapshotReversion)
s.persisterNotifier = make(chan *epochWatcher, 1)
s.closeCh = make(chan struct{})
if !s.readOnly && s.path != "" {
err := s.removeOldZapFiles() // Before persister or merger create any new files.
@ -235,7 +281,10 @@ func (s *Scorch) Close() (err error) {
err = s.rootBolt.Close()
s.rootLock.Lock()
if s.root != nil {
_ = s.root.DecRef()
err2 := s.root.DecRef()
if err == nil {
err = err2
}
}
s.root = nil
s.rootLock.Unlock()
@ -284,15 +333,17 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
// FIXME could sort ids list concurrent with analysis?
go func() {
for _, doc := range batch.IndexOps {
if doc != nil {
aw := index.NewAnalysisWork(s, doc, resultChan)
// put the work on the queue
s.analysisQueue.Queue(aw)
if numUpdates > 0 {
go func() {
for _, doc := range batch.IndexOps {
if doc != nil {
aw := index.NewAnalysisWork(s, doc, resultChan)
// put the work on the queue
s.analysisQueue.Queue(aw)
}
}
}
}()
}()
}
// wait for analysis result
analysisResults := make([]*index.AnalysisResult, int(numUpdates))
@ -319,7 +370,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
var newSegment segment.Segment
var bufBytes uint64
if len(analysisResults) > 0 {
newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor)
newSegment, bufBytes, err = s.segPlugin.New(analysisResults)
if err != nil {
return err
}
@ -328,7 +379,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
atomic.AddUint64(&s.stats.TotBatchesEmpty, 1)
}
err = s.prepareSegment(newSegment, ids, batch.InternalOps)
err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback())
if err != nil {
if newSegment != nil {
_ = newSegment.Close()
@ -348,16 +399,17 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
}
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
internalOps map[string][]byte) error {
internalOps map[string][]byte, persistedCallback index.BatchCallback) error {
// new introduction
introduction := &segmentIntroduction{
id: atomic.AddUint64(&s.nextSegmentID, 1),
data: newSegment,
ids: ids,
obsoletes: make(map[uint64]*roaring.Bitmap),
internal: internalOps,
applied: make(chan error),
id: atomic.AddUint64(&s.nextSegmentID, 1),
data: newSegment,
ids: ids,
obsoletes: make(map[uint64]*roaring.Bitmap),
internal: internalOps,
applied: make(chan error),
persistedCallback: persistedCallback,
}
if !s.unsafeBatch {
@ -370,6 +422,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
root.AddRef()
s.rootLock.RUnlock()
defer func() { _ = root.DecRef() }()
for _, seg := range root.segment {
delta, err := seg.segment.DocNumbers(ids)
if err != nil {
@ -378,8 +432,6 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
introduction.obsoletes[seg.id] = delta
}
_ = root.DecRef()
introStartTime := time.Now()
s.introductions <- introduction
@ -434,24 +486,57 @@ func (s *Scorch) currentSnapshot() *IndexSnapshot {
func (s *Scorch) Stats() json.Marshaler {
return &s.stats
}
func (s *Scorch) StatsMap() map[string]interface{} {
m := s.stats.ToMap()
func (s *Scorch) diskFileStats(rootSegmentPaths map[string]struct{}) (uint64,
uint64, uint64) {
var numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot uint64
if s.path != "" {
finfos, err := ioutil.ReadDir(s.path)
if err == nil {
var numFilesOnDisk, numBytesUsedDisk uint64
for _, finfo := range finfos {
if !finfo.IsDir() {
numBytesUsedDisk += uint64(finfo.Size())
numFilesOnDisk++
if rootSegmentPaths != nil {
fname := s.path + string(os.PathSeparator) + finfo.Name()
if _, fileAtRoot := rootSegmentPaths[fname]; fileAtRoot {
numBytesOnDiskByRoot += uint64(finfo.Size())
}
}
}
}
m["CurOnDiskBytes"] = numBytesUsedDisk
m["CurOnDiskFiles"] = numFilesOnDisk
}
}
// if no root files path given, then consider all disk files.
if rootSegmentPaths == nil {
return numFilesOnDisk, numBytesUsedDisk, numBytesUsedDisk
}
return numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot
}
func (s *Scorch) rootDiskSegmentsPaths() map[string]struct{} {
rv := make(map[string]struct{}, len(s.root.segment))
for _, segmentSnapshot := range s.root.segment {
if seg, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok {
rv[seg.Path()] = struct{}{}
}
}
return rv
}
func (s *Scorch) StatsMap() map[string]interface{} {
m := s.stats.ToMap()
s.rootLock.RLock()
rootSegPaths := s.rootDiskSegmentsPaths()
m["CurFilesIneligibleForRemoval"] = uint64(len(s.ineligibleForRemoval))
s.rootLock.RUnlock()
numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot := s.diskFileStats(rootSegPaths)
m["CurOnDiskBytes"] = numBytesUsedDisk
m["CurOnDiskFiles"] = numFilesOnDisk
// TODO: consider one day removing these backwards compatible
// names for apps using the old names
@ -466,8 +551,16 @@ func (s *Scorch) StatsMap() map[string]interface{} {
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"]
m["num_items_introduced"] = m["TotIntroducedItems"]
m["num_items_persisted"] = m["TotPersistedItems"]
m["num_bytes_used_disk"] = m["CurOnDiskBytes"]
m["num_files_on_disk"] = m["CurOnDiskFiles"]
m["num_recs_to_persist"] = m["TotItemsToPersist"]
// total disk bytes found in index directory inclusive of older snapshots
m["num_bytes_used_disk"] = numBytesUsedDisk
// total disk bytes by the latest root index, exclusive of older snapshots
m["num_bytes_used_disk_by_root"] = numBytesOnDiskByRoot
m["num_files_on_disk"] = numFilesOnDisk
m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"]
m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"]
m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"]
m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"]
m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"]
return m
@ -486,7 +579,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult {
rv.Analyzed[i] = tokenFreqs
rv.Length[i] = fieldLength
if len(d.CompositeFields) > 0 {
if len(d.CompositeFields) > 0 && field.Name() != "_id" {
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)

View file

@ -17,6 +17,7 @@ package segment
import (
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/couchbase/vellum"
)
type EmptySegment struct{}
@ -80,12 +81,8 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
func (e *EmptyDictionary) RegexpIterator(start string) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
func (e *EmptyDictionary) FuzzyIterator(term string,
fuzziness int) DictionaryIterator {
func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton,
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
@ -94,14 +91,18 @@ func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte,
return &EmptyDictionaryIterator{}
}
func (e *EmptyDictionary) Contains(key []byte) (bool, error) {
return false, nil
}
type EmptyDictionaryIterator struct{}
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
return nil, nil
}
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) {
return nil, nil
func (e *EmptyDictionaryIterator) Contains(key []byte) (bool, error) {
return false, nil
}
type EmptyPostingsList struct{}
@ -125,6 +126,12 @@ func (e *EmptyPostingsIterator) Next() (Posting, error) {
return nil, nil
}
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) {
return nil, nil
}
func (e *EmptyPostingsIterator) Size() int {
return 0
}
var AnEmptyPostingsIterator = &EmptyPostingsIterator{}

View file

@ -19,7 +19,10 @@
package segment
import "fmt"
import (
"errors"
"fmt"
)
const (
MaxVarintSize = 9
@ -92,3 +95,82 @@ func DecodeUvarintAscending(b []byte) ([]byte, uint64, error) {
}
return b[length:], v, nil
}
// ------------------------------------------------------------
type MemUvarintReader struct {
C int // index of next byte to read from S
S []byte
}
func NewMemUvarintReader(s []byte) *MemUvarintReader {
return &MemUvarintReader{S: s}
}
// Len returns the number of unread bytes.
func (r *MemUvarintReader) Len() int {
n := len(r.S) - r.C
if n < 0 {
return 0
}
return n
}
var ErrMemUvarintReaderOverflow = errors.New("MemUvarintReader overflow")
// ReadUvarint reads an encoded uint64. The original code this was
// based on is at encoding/binary/ReadUvarint().
func (r *MemUvarintReader) ReadUvarint() (uint64, error) {
var x uint64
var s uint
var C = r.C
var S = r.S
for {
b := S[C]
C++
if b < 0x80 {
r.C = C
// why 63? The original code had an 'i += 1' loop var and
// checked for i > 9 || i == 9 ...; but, we no longer
// check for the i var, but instead check here for s,
// which is incremented by 7. So, 7*9 == 63.
//
// why the "extra" >= check? The normal case is that s <
// 63, so we check this single >= guard first so that we
// hit the normal, nil-error return pathway sooner.
if s >= 63 && (s > 63 || s == 63 && b > 1) {
return 0, ErrMemUvarintReaderOverflow
}
return x | uint64(b)<<s, nil
}
x |= uint64(b&0x7f) << s
s += 7
}
}
// SkipUvarint skips ahead one encoded uint64.
func (r *MemUvarintReader) SkipUvarint() {
for {
b := r.S[r.C]
r.C++
if b < 0x80 {
return
}
}
}
// SkipBytes skips a count number of bytes.
func (r *MemUvarintReader) SkipBytes(count int) {
r.C = r.C + count
}
func (r *MemUvarintReader) Reset(s []byte) {
r.C = 0
r.S = s
}

View file

@ -0,0 +1,58 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package segment
import (
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
)
// Plugin represents the essential functions required by a package to plug in
// it's segment implementation
type Plugin interface {
// Type is the name for this segment plugin
Type() string
// Version is a numeric value identifying a specific version of this type.
// When incompatible changes are made to a particular type of plugin, the
// version must be incremented.
Version() uint32
// New takes a set of AnalysisResults and turns them into a new Segment
New(results []*index.AnalysisResult) (Segment, uint64, error)
// Open attempts to open the file at the specified path and
// return the corresponding Segment
Open(path string) (Segment, error)
// Merge takes a set of Segments, and creates a new segment on disk at
// the specified path.
// Drops is a set of bitmaps (one for each segment) indicating which
// documents can be dropped from the segments during the merge.
// If the closeCh channel is closed, Merge will cease doing work at
// the next opportunity, and return an error (closed).
// StatsReporter can optionally be provided, in which case progress
// made during the merge is reported while operation continues.
// Returns:
// A slice of new document numbers (one for each input segment),
// this allows the caller to know a particular document's new
// document number in the newly merged segment.
// The number of bytes written to the new segment file.
// An error, if any occurred.
Merge(segments []Segment, drops []*roaring.Bitmap, path string,
closeCh chan struct{}, s StatsReporter) (
[][]uint64, uint64, error)
}

View file

@ -0,0 +1,75 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package segment
import (
"regexp/syntax"
"github.com/couchbase/vellum/regexp"
)
func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) {
// TODO: potential optimization where syntax.Regexp supports a Simplify() API?
parsed, err := syntax.Parse(pattern, syntax.Perl)
if err != nil {
return nil, nil, nil, err
}
re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit)
if err != nil {
return nil, nil, nil, err
}
prefix := LiteralPrefix(parsed)
if prefix != "" {
prefixBeg := []byte(prefix)
prefixEnd := IncrementBytes(prefixBeg)
return re, prefixBeg, prefixEnd, nil
}
return re, nil, nil, nil
}
// Returns the literal prefix given the parse tree for a regexp
func LiteralPrefix(s *syntax.Regexp) string {
// traverse the left-most branch in the parse tree as long as the
// node represents a concatenation
for s != nil && s.Op == syntax.OpConcat {
if len(s.Sub) < 1 {
return ""
}
s = s.Sub[0]
}
if s.Op == syntax.OpLiteral && (s.Flags&syntax.FoldCase == 0) {
return string(s.Rune)
}
return "" // no literal prefix
}
func IncrementBytes(in []byte) []byte {
rv := make([]byte, len(in))
copy(rv, in)
for i := len(rv) - 1; i >= 0; i-- {
rv[i] = rv[i] + 1
if rv[i] != 0 {
return rv // didn't overflow, so stop
}
}
return nil // overflowed
}

View file

@ -15,10 +15,15 @@
package segment
import (
"fmt"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/couchbase/vellum"
)
var ErrClosed = fmt.Errorf("index closed")
// DocumentFieldValueVisitor defines a callback to be visited for each
// stored field value. The return value determines if the visitor
// should keep going. Returning true continues visiting, false stops.
@ -45,15 +50,27 @@ type Segment interface {
DecRef() error
}
type UnpersistedSegment interface {
Segment
Persist(path string) error
}
type PersistedSegment interface {
Segment
Path() string
}
type TermDictionary interface {
PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error)
Iterator() DictionaryIterator
PrefixIterator(prefix string) DictionaryIterator
RangeIterator(start, end string) DictionaryIterator
RegexpIterator(regex string) DictionaryIterator
FuzzyIterator(term string, fuzziness int) DictionaryIterator
AutomatonIterator(a vellum.Automaton,
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator
OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator
Contains(key []byte) (bool, error)
}
type DictionaryIterator interface {
@ -89,6 +106,12 @@ type PostingsIterator interface {
Size() int
}
type OptimizablePostingsIterator interface {
ActualBitmap() *roaring.Bitmap
DocNum1Hit() (uint64, bool)
ReplaceActual(*roaring.Bitmap)
}
type Posting interface {
Number() uint64
@ -124,3 +147,7 @@ type DocumentFieldTermVisitable interface {
type DocVisitState interface {
}
type StatsReporter interface {
ReportBytesWritten(bytesWritten uint64)
}

View file

@ -0,0 +1,148 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package segment
import (
"github.com/RoaringBitmap/roaring"
"math"
"reflect"
)
var reflectStaticSizeUnadornedPostingsIteratorBitmap int
var reflectStaticSizeUnadornedPostingsIterator1Hit int
var reflectStaticSizeUnadornedPosting int
func init() {
var pib UnadornedPostingsIteratorBitmap
reflectStaticSizeUnadornedPostingsIteratorBitmap = int(reflect.TypeOf(pib).Size())
var pi1h UnadornedPostingsIterator1Hit
reflectStaticSizeUnadornedPostingsIterator1Hit = int(reflect.TypeOf(pi1h).Size())
var up UnadornedPosting
reflectStaticSizeUnadornedPosting = int(reflect.TypeOf(up).Size())
}
type UnadornedPostingsIteratorBitmap struct{
actual roaring.IntPeekable
actualBM *roaring.Bitmap
}
func (i *UnadornedPostingsIteratorBitmap) Next() (Posting, error) {
return i.nextAtOrAfter(0)
}
func (i *UnadornedPostingsIteratorBitmap) Advance(docNum uint64) (Posting, error) {
return i.nextAtOrAfter(docNum)
}
func (i *UnadornedPostingsIteratorBitmap) nextAtOrAfter(atOrAfter uint64) (Posting, error) {
docNum, exists := i.nextDocNumAtOrAfter(atOrAfter)
if !exists {
return nil, nil
}
return UnadornedPosting(docNum), nil
}
func (i *UnadornedPostingsIteratorBitmap) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool) {
if i.actual == nil || !i.actual.HasNext() {
return 0, false
}
i.actual.AdvanceIfNeeded(uint32(atOrAfter))
if !i.actual.HasNext() {
return 0, false // couldn't find anything
}
return uint64(i.actual.Next()), true
}
func (i *UnadornedPostingsIteratorBitmap) Size() int {
return reflectStaticSizeUnadornedPostingsIteratorBitmap
}
func NewUnadornedPostingsIteratorFromBitmap(bm *roaring.Bitmap) PostingsIterator {
return &UnadornedPostingsIteratorBitmap{
actualBM: bm,
actual: bm.Iterator(),
}
}
const docNum1HitFinished = math.MaxUint64
type UnadornedPostingsIterator1Hit struct{
docNum uint64
}
func (i *UnadornedPostingsIterator1Hit) Next() (Posting, error) {
return i.nextAtOrAfter(0)
}
func (i *UnadornedPostingsIterator1Hit) Advance(docNum uint64) (Posting, error) {
return i.nextAtOrAfter(docNum)
}
func (i *UnadornedPostingsIterator1Hit) nextAtOrAfter(atOrAfter uint64) (Posting, error) {
docNum, exists := i.nextDocNumAtOrAfter(atOrAfter)
if !exists {
return nil, nil
}
return UnadornedPosting(docNum), nil
}
func (i *UnadornedPostingsIterator1Hit) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool) {
if i.docNum == docNum1HitFinished {
return 0, false
}
if i.docNum < atOrAfter {
// advanced past our 1-hit
i.docNum = docNum1HitFinished // consume our 1-hit docNum
return 0, false
}
docNum := i.docNum
i.docNum = docNum1HitFinished // consume our 1-hit docNum
return docNum, true
}
func (i *UnadornedPostingsIterator1Hit) Size() int {
return reflectStaticSizeUnadornedPostingsIterator1Hit
}
func NewUnadornedPostingsIteratorFrom1Hit(docNum1Hit uint64) PostingsIterator {
return &UnadornedPostingsIterator1Hit{
docNum1Hit,
}
}
type UnadornedPosting uint64
func (p UnadornedPosting) Number() uint64 {
return uint64(p)
}
func (p UnadornedPosting) Frequency() uint64 {
return 0
}
func (p UnadornedPosting) Norm() float64 {
return 0
}
func (p UnadornedPosting) Locations() []Location {
return nil
}
func (p UnadornedPosting) Size() int {
return reflectStaticSizeUnadornedPosting
}

View file

@ -0,0 +1,77 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"fmt"
"github.com/blevesearch/bleve/index/scorch/segment"
zapv11 "github.com/blevesearch/zap/v11"
zapv12 "github.com/blevesearch/zap/v12"
)
var supportedSegmentPlugins map[string]map[uint32]segment.Plugin
var defaultSegmentPlugin segment.Plugin
func init() {
ResetPlugins()
RegisterPlugin(zapv12.Plugin(), false)
RegisterPlugin(zapv11.Plugin(), true)
}
func ResetPlugins() {
supportedSegmentPlugins = map[string]map[uint32]segment.Plugin{}
}
func RegisterPlugin(plugin segment.Plugin, makeDefault bool) {
if _, ok := supportedSegmentPlugins[plugin.Type()]; !ok {
supportedSegmentPlugins[plugin.Type()] = map[uint32]segment.Plugin{}
}
supportedSegmentPlugins[plugin.Type()][plugin.Version()] = plugin
if makeDefault {
defaultSegmentPlugin = plugin
}
}
func SupportedSegmentTypes() (rv []string) {
for k := range supportedSegmentPlugins {
rv = append(rv, k)
}
return
}
func SupportedSegmentTypeVersions(typ string) (rv []uint32) {
for k := range supportedSegmentPlugins[typ] {
rv = append(rv, k)
}
return rv
}
func (s *Scorch) loadSegmentPlugin(forcedSegmentType string,
forcedSegmentVersion uint32) error {
if versions, ok := supportedSegmentPlugins[forcedSegmentType]; ok {
if segPlugin, ok := versions[uint32(forcedSegmentVersion)]; ok {
s.segPlugin = segPlugin
return nil
}
return fmt.Errorf(
"unsupported version %d for segment type: %s, supported: %v",
forcedSegmentVersion, forcedSegmentType,
SupportedSegmentTypeVersions(forcedSegmentType))
}
return fmt.Errorf("unsupported segment type: %s, supported: %v",
forcedSegmentType, SupportedSegmentTypes())
}

View file

@ -27,9 +27,15 @@ import (
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum"
lev "github.com/couchbase/vellum/levenshtein"
)
// re usable, threadsafe levenshtein builders
var lb1, lb2 *lev.LevenshteinAutomatonBuilder
type asynchSegmentResult struct {
dict segment.TermDictionary
dictItr segment.DictionaryIterator
index int
@ -45,6 +51,15 @@ var reflectStaticSizeIndexSnapshot int
func init() {
var is interface{} = IndexSnapshot{}
reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size())
var err error
lb1, err = lev.NewLevenshteinAutomatonBuilder(1, true)
if err != nil {
panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
}
lb2, err = lev.NewLevenshteinAutomatonBuilder(2, true)
if err != nil {
panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
}
}
type IndexSnapshot struct {
@ -59,9 +74,8 @@ type IndexSnapshot struct {
m sync.Mutex // Protects the fields that follow.
refs int64
m2 sync.Mutex // Protects the fields that follow.
fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's
fieldDicts map[string][]segment.TermDictionary // keyed by field, recycled dicts
m2 sync.Mutex // Protects the fields that follow.
fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's
}
func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
@ -113,16 +127,22 @@ func (i *IndexSnapshot) updateSize() {
}
}
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) {
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string,
makeItr func(i segment.TermDictionary) segment.DictionaryIterator,
randomLookup bool) (*IndexSnapshotFieldDict, error) {
results := make(chan *asynchSegmentResult)
for index, segment := range i.segment {
go func(index int, segment *SegmentSnapshot) {
dict, err := segment.Dictionary(field)
dict, err := segment.segment.Dictionary(field)
if err != nil {
results <- &asynchSegmentResult{err: err}
} else {
results <- &asynchSegmentResult{dictItr: makeItr(dict)}
if randomLookup {
results <- &asynchSegmentResult{dict: dict}
} else {
results <- &asynchSegmentResult{dictItr: makeItr(dict)}
}
}
}(index, segment)
}
@ -137,14 +157,20 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
if asr.err != nil && err == nil {
err = asr.err
} else {
next, err2 := asr.dictItr.Next()
if err2 != nil && err == nil {
err = err2
}
if next != nil {
if !randomLookup {
next, err2 := asr.dictItr.Next()
if err2 != nil && err == nil {
err = err2
}
if next != nil {
rv.cursors = append(rv.cursors, &segmentDictCursor{
itr: asr.dictItr,
curr: *next,
})
}
} else {
rv.cursors = append(rv.cursors, &segmentDictCursor{
itr: asr.dictItr,
curr: *next,
dict: asr.dict,
})
}
}
@ -153,8 +179,11 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
if err != nil {
return nil, err
}
// prepare heap
heap.Init(rv)
if !randomLookup {
// prepare heap
heap.Init(rv)
}
return rv, nil
}
@ -162,42 +191,75 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
func (i *IndexSnapshot) FieldDict(field string) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.Iterator()
})
}, false)
}
func (i *IndexSnapshot) FieldDictRange(field string, startTerm []byte,
endTerm []byte) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.RangeIterator(string(startTerm), string(endTerm))
})
}, false)
}
func (i *IndexSnapshot) FieldDictPrefix(field string,
termPrefix []byte) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.PrefixIterator(string(termPrefix))
})
}, false)
}
func (i *IndexSnapshot) FieldDictRegexp(field string,
termRegex []byte) (index.FieldDict, error) {
termRegex string) (index.FieldDict, error) {
// TODO: potential optimization where the literal prefix represents the,
// entire regexp, allowing us to use PrefixIterator(prefixTerm)?
a, prefixBeg, prefixEnd, err := segment.ParseRegexp(termRegex)
if err != nil {
return nil, err
}
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.RegexpIterator(string(termRegex))
})
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
}, false)
}
func (i *IndexSnapshot) getLevAutomaton(term string,
fuzziness uint8) (vellum.Automaton, error) {
if fuzziness == 1 {
return lb1.BuildDfa(term, fuzziness)
} else if fuzziness == 2 {
return lb2.BuildDfa(term, fuzziness)
}
return nil, fmt.Errorf("fuzziness exceeds the max limit")
}
func (i *IndexSnapshot) FieldDictFuzzy(field string,
term []byte, fuzziness int) (index.FieldDict, error) {
term string, fuzziness int, prefix string) (index.FieldDict, error) {
a, err := i.getLevAutomaton(term, uint8(fuzziness))
if err != nil {
return nil, err
}
var prefixBeg, prefixEnd []byte
if prefix != "" {
prefixBeg = []byte(prefix)
prefixEnd = segment.IncrementBytes(prefixBeg)
}
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.FuzzyIterator(string(term), fuzziness)
})
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
}, false)
}
func (i *IndexSnapshot) FieldDictOnly(field string,
onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) {
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
return i.OnlyIterator(onlyTerms, includeCount)
})
}, false)
}
func (i *IndexSnapshot) FieldDictContains(field string) (index.FieldDictContains, error) {
return i.newIndexSnapshotFieldDict(field, nil, true)
}
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
@ -393,8 +455,8 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err
}
func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
includeNorm, includeTermVectors bool) (tfr index.TermFieldReader, err error) {
rv, dicts := i.allocTermFieldReaderDicts(field)
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
rv := i.allocTermFieldReaderDicts(field)
rv.term = term
rv.field = field
@ -412,20 +474,19 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
rv.currPosting = nil
rv.currID = rv.currID[:0]
if dicts == nil {
dicts = make([]segment.TermDictionary, len(i.segment))
if rv.dicts == nil {
rv.dicts = make([]segment.TermDictionary, len(i.segment))
for i, segment := range i.segment {
dict, err := segment.Dictionary(field)
dict, err := segment.segment.Dictionary(field)
if err != nil {
return nil, err
}
dicts[i] = dict
rv.dicts[i] = dict
}
}
rv.dicts = dicts
for i := range i.segment {
pl, err := dicts[i].PostingsList(term, nil, rv.postings[i])
for i, segment := range i.segment {
pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i])
if err != nil {
return nil, err
}
@ -436,37 +497,37 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
return rv, nil
}
func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (
tfr *IndexSnapshotTermFieldReader, dicts []segment.TermDictionary) {
func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) {
i.m2.Lock()
if i.fieldDicts != nil {
dicts = i.fieldDicts[field]
}
if i.fieldTFRs != nil {
tfrs := i.fieldTFRs[field]
last := len(tfrs) - 1
if last >= 0 {
rv := tfrs[last]
tfr = tfrs[last]
tfrs[last] = nil
i.fieldTFRs[field] = tfrs[:last]
i.m2.Unlock()
return rv, dicts
return
}
}
i.m2.Unlock()
return &IndexSnapshotTermFieldReader{}, dicts
return &IndexSnapshotTermFieldReader{}
}
func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) {
i.parent.rootLock.RLock()
obsolete := i.parent.root != i
i.parent.rootLock.RUnlock()
if obsolete {
// if we're not the current root (mutations happened), don't bother recycling
return
}
i.m2.Lock()
if i.fieldTFRs == nil {
i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{}
}
i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr)
if i.fieldDicts == nil {
i.fieldDicts = map[string][]segment.TermDictionary{}
}
i.fieldDicts[tfr.field] = tfr.dicts
i.m2.Unlock()
}
@ -636,7 +697,7 @@ func (i *IndexSnapshot) DumpFields() chan interface{} {
// subtractStrings returns set a minus elements of set b.
func subtractStrings(a, b []string) []string {
if len(b) <= 0 {
if len(b) == 0 {
return a
}

View file

@ -22,6 +22,7 @@ import (
)
type segmentDictCursor struct {
dict segment.TermDictionary
itr segment.DictionaryIterator
curr index.DictEntry
}
@ -52,7 +53,7 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} {
}
func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
if len(i.cursors) <= 0 {
if len(i.cursors) == 0 {
return nil, nil
}
i.entry = i.cursors[0].curr
@ -91,3 +92,17 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
func (i *IndexSnapshotFieldDict) Close() error {
return nil
}
func (i *IndexSnapshotFieldDict) Contains(key []byte) (bool, error) {
if len(i.cursors) == 0 {
return false, nil
}
for _, cursor := range i.cursors {
if found, _ := cursor.dict.Contains(key); found {
return true, nil
}
}
return false, nil
}

View file

@ -74,7 +74,7 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in
rv = &index.TermFieldDoc{}
}
// find the next hit
for i.segmentOffset < len(i.postings) {
for i.segmentOffset < len(i.iterators) {
next, err := i.iterators[i.segmentOffset].Next()
if err != nil {
return nil, err

View file

@ -17,9 +17,10 @@ package scorch
import (
"fmt"
"log"
"os"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/boltdb/bolt"
bolt "go.etcd.io/bbolt"
)
type RollbackPoint struct {
@ -34,13 +35,22 @@ func (r *RollbackPoint) GetInternal(key []byte) []byte {
// RollbackPoints returns an array of rollback points available for
// the application to rollback to, with more recent rollback points
// (higher epochs) coming first.
func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) {
if s.rootBolt == nil {
return nil, fmt.Errorf("RollbackPoints: root is nil")
func RollbackPoints(path string) ([]*RollbackPoint, error) {
if len(path) == 0 {
return nil, fmt.Errorf("RollbackPoints: invalid path")
}
rootBoltPath := path + string(os.PathSeparator) + "root.bolt"
rootBoltOpt := &bolt.Options{
ReadOnly: true,
}
rootBolt, err := bolt.Open(rootBoltPath, 0600, rootBoltOpt)
if err != nil || rootBolt == nil {
return nil, err
}
// start a read-only bolt transaction
tx, err := s.rootBolt.Begin(false)
tx, err := rootBolt.Begin(false)
if err != nil {
return nil, fmt.Errorf("RollbackPoints: failed to start" +
" read-only transaction")
@ -49,6 +59,7 @@ func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) {
// read-only bolt transactions to be rolled back
defer func() {
_ = tx.Rollback()
_ = rootBolt.Close()
}()
snapshots := tx.Bucket(boltSnapshotsBucket)
@ -105,69 +116,98 @@ func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) {
return rollbackPoints, nil
}
// Rollback atomically and durably (if unsafeBatch is unset) brings
// the store back to the point in time as represented by the
// RollbackPoint. Rollback() should only be passed a RollbackPoint
// that came from the same store using the RollbackPoints() API.
func (s *Scorch) Rollback(to *RollbackPoint) error {
// Rollback atomically and durably brings the store back to the point
// in time as represented by the RollbackPoint.
// Rollback() should only be passed a RollbackPoint that came from the
// same store using the RollbackPoints() API along with the index path.
func Rollback(path string, to *RollbackPoint) error {
if to == nil {
return fmt.Errorf("Rollback: RollbackPoint is nil")
}
if s.rootBolt == nil {
return fmt.Errorf("Rollback: root is nil")
if len(path) == 0 {
return fmt.Errorf("Rollback: index path is empty")
}
revert := &snapshotReversion{}
rootBoltPath := path + string(os.PathSeparator) + "root.bolt"
rootBoltOpt := &bolt.Options{
ReadOnly: false,
}
rootBolt, err := bolt.Open(rootBoltPath, 0600, rootBoltOpt)
if err != nil || rootBolt == nil {
return err
}
defer func() {
err1 := rootBolt.Close()
if err1 != nil && err == nil {
err = err1
}
}()
s.rootLock.Lock()
err := s.rootBolt.View(func(tx *bolt.Tx) error {
// pick all the younger persisted epochs in bolt store
// including the target one.
var found bool
var eligibleEpochs []uint64
err = rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return fmt.Errorf("Rollback: no snapshots available")
return nil
}
pos := segment.EncodeUvarintAscending(nil, to.epoch)
snapshot := snapshots.Bucket(pos)
if snapshot == nil {
return fmt.Errorf("Rollback: snapshot not found")
sc := snapshots.Cursor()
for sk, _ := sc.Last(); sk != nil && !found; sk, _ = sc.Prev() {
_, snapshotEpoch, err := segment.DecodeUvarintAscending(sk)
if err != nil {
continue
}
if snapshotEpoch == to.epoch {
found = true
}
eligibleEpochs = append(eligibleEpochs, snapshotEpoch)
}
indexSnapshot, err := s.loadSnapshot(snapshot)
if err != nil {
return fmt.Errorf("Rollback: unable to load snapshot: %v", err)
}
// add segments referenced by loaded index snapshot to the
// ineligibleForRemoval map
for _, segSnap := range indexSnapshot.segment {
filename := zapFileName(segSnap.id)
s.ineligibleForRemoval[filename] = true
}
revert.snapshot = indexSnapshot
revert.applied = make(chan error)
revert.persisted = make(chan error)
return nil
})
s.rootLock.Unlock()
if len(eligibleEpochs) == 0 {
return fmt.Errorf("Rollback: no persisted epochs found in bolt")
}
if !found {
return fmt.Errorf("Rollback: target epoch %d not found in bolt", to.epoch)
}
// start a write transaction
tx, err := rootBolt.Begin(true)
if err != nil {
return err
}
// introduce the reversion
s.revertToSnapshots <- revert
defer func() {
if err == nil {
err = tx.Commit()
} else {
_ = tx.Rollback()
}
if err == nil {
err = rootBolt.Sync()
}
}()
// block until this snapshot is applied
err = <-revert.applied
if err != nil {
return fmt.Errorf("Rollback: failed with err: %v", err)
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
for _, epoch := range eligibleEpochs {
k := segment.EncodeUvarintAscending(nil, epoch)
if err != nil {
continue
}
if epoch == to.epoch {
// return here as it already processed until the given epoch
return nil
}
err = snapshots.DeleteBucket(k)
if err == bolt.ErrBucketNotFound {
err = nil
}
}
return <-revert.persisted
return err
}

View file

@ -29,43 +29,6 @@ var TermSeparator byte = 0xff
var TermSeparatorSplitSlice = []byte{TermSeparator}
type SegmentDictionarySnapshot struct {
s *SegmentSnapshot
d segment.TermDictionary
}
func (s *SegmentDictionarySnapshot) PostingsList(term []byte, except *roaring.Bitmap,
prealloc segment.PostingsList) (segment.PostingsList, error) {
// TODO: if except is non-nil, perhaps need to OR it with s.s.deleted?
return s.d.PostingsList(term, s.s.deleted, prealloc)
}
func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator {
return s.d.Iterator()
}
func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator {
return s.d.PrefixIterator(prefix)
}
func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator {
return s.d.RangeIterator(start, end)
}
func (s *SegmentDictionarySnapshot) RegexpIterator(regex string) segment.DictionaryIterator {
return s.d.RegexpIterator(regex)
}
func (s *SegmentDictionarySnapshot) FuzzyIterator(term string,
fuzziness int) segment.DictionaryIterator {
return s.d.FuzzyIterator(term, fuzziness)
}
func (s *SegmentDictionarySnapshot) OnlyIterator(onlyTerms [][]byte,
includeCount bool) segment.DictionaryIterator {
return s.d.OnlyIterator(onlyTerms, includeCount)
}
type SegmentSnapshot struct {
id uint64
segment segment.Segment
@ -115,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 {
return rv
}
func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) {
d, err := s.segment.Dictionary(field)
if err != nil {
return nil, err
}
return &SegmentDictionarySnapshot{
s: s,
d: d,
}, nil
}
func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
rv, err := s.segment.DocNumbers(docIDs)
if err != nil {
@ -137,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
return rv, nil
}
// DocNumbersLive returns bitsit containing doc numbers for all live docs
// DocNumbersLive returns a bitmap containing doc numbers for all live docs
func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap {
rv := roaring.NewBitmap()
rv.AddRange(0, s.segment.Count())
@ -161,14 +113,29 @@ func (s *SegmentSnapshot) Size() (rv int) {
}
type cachedFieldDocs struct {
m sync.Mutex
readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used.
err error // Non-nil if there was an error when preparing this cachedFieldDocs.
docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF.
size uint64
}
func (cfd *cachedFieldDocs) Size() int {
var rv int
cfd.m.Lock()
for _, entry := range cfd.docs {
rv += 8 /* size of uint64 */ + len(entry)
}
cfd.m.Unlock()
return rv
}
func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
defer close(cfd.readyCh)
cfd.m.Lock()
defer func() {
close(cfd.readyCh)
cfd.m.Unlock()
}()
cfd.size += uint64(size.SizeOfUint64) /* size field */
dict, err := ss.segment.Dictionary(field)
@ -216,9 +183,9 @@ func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
}
type cachedDocs struct {
size uint64
m sync.Mutex // As the cache is asynchronously prepared, need a lock
cache map[string]*cachedFieldDocs // Keyed by field
size uint64
}
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {
@ -279,9 +246,7 @@ func (c *cachedDocs) updateSizeLOCKED() {
for k, v := range c.cache { // cachedFieldDocs
sizeInBytes += len(k)
if v != nil {
for _, entry := range v.docs { // docs
sizeInBytes += 8 /* size of uint64 */ + len(entry)
}
sizeInBytes += v.Size()
}
}
atomic.StoreUint64(&c.size, uint64(sizeInBytes))

View file

@ -69,11 +69,15 @@ type Stats struct {
TotPersistLoopEnd uint64
TotPersistedItems uint64
TotItemsToPersist uint64
TotPersistedSegments uint64
TotPersisterSlowMergerPause uint64
TotPersisterSlowMergerResume uint64
TotPersisterNapPauseCompleted uint64
TotPersisterMergerNapBreak uint64
TotFileMergeLoopBeg uint64
TotFileMergeLoopErr uint64
TotFileMergeLoopEnd uint64
@ -91,24 +95,32 @@ type Stats struct {
TotFileMergeSegmentsEmpty uint64
TotFileMergeSegments uint64
TotFileSegmentsAtRoot uint64
TotFileMergeWrittenBytes uint64
TotFileMergeZapBeg uint64
TotFileMergeZapEnd uint64
TotFileMergeZapTime uint64
MaxFileMergeZapTime uint64
TotFileMergeZapBeg uint64
TotFileMergeZapEnd uint64
TotFileMergeZapTime uint64
MaxFileMergeZapTime uint64
TotFileMergeZapIntroductionTime uint64
MaxFileMergeZapIntroductionTime uint64
TotFileMergeIntroductions uint64
TotFileMergeIntroductionsDone uint64
TotFileMergeIntroductions uint64
TotFileMergeIntroductionsDone uint64
TotFileMergeIntroductionsSkipped uint64
TotMemMergeBeg uint64
TotMemMergeErr uint64
TotMemMergeDone uint64
TotMemMergeZapBeg uint64
TotMemMergeZapEnd uint64
TotMemMergeZapTime uint64
MaxMemMergeZapTime uint64
TotMemMergeSegments uint64
CurFilesIneligibleForRemoval uint64
TotSnapshotsRemovedFromMetaStore uint64
TotMemMergeBeg uint64
TotMemMergeErr uint64
TotMemMergeDone uint64
TotMemMergeZapBeg uint64
TotMemMergeZapEnd uint64
TotMemMergeZapTime uint64
MaxMemMergeZapTime uint64
TotMemMergeSegments uint64
TotMemorySegmentsAtRoot uint64
}
// atomically populates the returned map

View file

@ -17,7 +17,7 @@ package boltdb
import (
"bytes"
"github.com/boltdb/bolt"
bolt "go.etcd.io/bbolt"
)
type Iterator struct {

View file

@ -16,7 +16,7 @@ package boltdb
import (
"github.com/blevesearch/bleve/index/store"
"github.com/boltdb/bolt"
bolt "go.etcd.io/bbolt"
)
type Reader struct {

View file

@ -30,7 +30,7 @@ import (
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/registry"
"github.com/boltdb/bolt"
bolt "go.etcd.io/bbolt"
)
const (
@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore,
bo.ReadOnly = ro
}
if initialMmapSize, ok := config["initialMmapSize"].(int); ok {
bo.InitialMmapSize = initialMmapSize
} else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok {
bo.InitialMmapSize = int(initialMmapSize)
}
db, err := bolt.Open(path, 0600, bo)
if err != nil {
return nil, err

View file

@ -584,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error {
func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
tfr.doc = key[3+len(term)+1:]
if len(tfr.doc) <= 0 {
if len(tfr.doc) == 0 {
return fmt.Errorf("invalid term frequency key, empty docid")
}

View file

@ -415,7 +415,6 @@ func (udc *UpsideDownCouch) Close() error {
func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
// do analysis before acquiring write lock
analysisStart := time.Now()
numPlainTextBytes := doc.NumPlainTextBytes()
resultChan := make(chan *index.AnalysisResult)
aw := index.NewAnalysisWork(udc, doc, resultChan)
@ -452,6 +451,11 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
return
}
return udc.UpdateWithAnalysis(doc, result, backIndexRow)
}
func (udc *UpsideDownCouch) UpdateWithAnalysis(doc *document.Document,
result *index.AnalysisResult, backIndexRow *BackIndexRow) (err error) {
// start a writer for this update
indexStart := time.Now()
var kvwriter store.KVWriter
@ -490,7 +494,7 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
atomic.AddUint64(&udc.stats.indexTime, uint64(time.Since(indexStart)))
if err == nil {
atomic.AddUint64(&udc.stats.updates, 1)
atomic.AddUint64(&udc.stats.numPlainTextBytesIndexed, numPlainTextBytes)
atomic.AddUint64(&udc.stats.numPlainTextBytesIndexed, doc.NumPlainTextBytes())
} else {
atomic.AddUint64(&udc.stats.errors, 1)
}
@ -775,7 +779,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
}
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
if len(in) <= 0 {
if len(in) == 0 {
return nil
}
@ -797,6 +801,10 @@ func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []
}
func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
persistedCallback := batch.PersistedCallback()
if persistedCallback != nil {
defer persistedCallback(err)
}
analysisStart := time.Now()
resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps))
@ -810,15 +818,18 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
}
}
go func() {
for _, doc := range batch.IndexOps {
if doc != nil {
aw := index.NewAnalysisWork(udc, doc, resultChan)
// put the work on the queue
udc.analysisQueue.Queue(aw)
if numUpdates > 0 {
go func() {
for k := range batch.IndexOps {
doc := batch.IndexOps[k]
if doc != nil {
aw := index.NewAnalysisWork(udc, doc, resultChan)
// put the work on the queue
udc.analysisQueue.Queue(aw)
}
}
}
}()
}()
}
// retrieve back index rows concurrent with analysis
docBackIndexRowErr := error(nil)
@ -958,6 +969,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
} else {
atomic.AddUint64(&udc.stats.errors, 1)
}
return
}

View file

@ -433,6 +433,9 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest {
Explain: req.Explain,
Sort: req.Sort.Copy(),
IncludeLocations: req.IncludeLocations,
Score: req.Score,
SearchAfter: req.SearchAfter,
SearchBefore: req.SearchBefore,
}
return &rv
}
@ -450,6 +453,14 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se
searchStart := time.Now()
asyncResults := make(chan *asyncSearchResult, len(indexes))
var reverseQueryExecution bool
if req.SearchBefore != nil {
reverseQueryExecution = true
req.Sort.Reverse()
req.SearchAfter = req.SearchBefore
req.SearchBefore = nil
}
// run search on each index in separate go routine
var waitGroup sync.WaitGroup
@ -502,7 +513,7 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se
// sort all hits with the requested order
if len(req.Sort) > 0 {
sorter := newMultiSearchHitSorter(req.Sort, sr.Hits)
sorter := newSearchHitSorter(req.Sort, sr.Hits)
sort.Sort(sorter)
}
@ -523,6 +534,17 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se
sr.Facets.Fixup(name, fr.Size)
}
if reverseQueryExecution {
// reverse the sort back to the original
req.Sort.Reverse()
// resort using the original order
mhs := newSearchHitSorter(req.Sort, sr.Hits)
sort.Sort(mhs)
// reset request
req.SearchBefore = req.SearchAfter
req.SearchAfter = nil
}
// fix up original request
sr.Request = req
searchDuration := time.Since(searchStart)
@ -580,26 +602,3 @@ func (f *indexAliasImplFieldDict) Close() error {
defer f.index.mutex.RUnlock()
return f.fieldDict.Close()
}
type multiSearchHitSorter struct {
hits search.DocumentMatchCollection
sort search.SortOrder
cachedScoring []bool
cachedDesc []bool
}
func newMultiSearchHitSorter(sort search.SortOrder, hits search.DocumentMatchCollection) *multiSearchHitSorter {
return &multiSearchHitSorter{
sort: sort,
hits: hits,
cachedScoring: sort.CacheIsScore(),
cachedDesc: sort.CacheDescending(),
}
}
func (m *multiSearchHitSorter) Len() int { return len(m.hits) }
func (m *multiSearchHitSorter) Swap(i, j int) { m.hits[i], m.hits[j] = m.hits[j], m.hits[i] }
func (m *multiSearchHitSorter) Less(i, j int) bool {
c := m.sort.Compare(m.cachedScoring, m.cachedDesc, m.hits[i], m.hits[j])
return c < 0
}

View file

@ -19,6 +19,7 @@ import (
"encoding/json"
"fmt"
"os"
"sort"
"sync"
"sync/atomic"
"time"
@ -442,7 +443,20 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
return nil, ErrorIndexClosed
}
collector := collector.NewTopNCollector(req.Size, req.From, req.Sort)
var reverseQueryExecution bool
if req.SearchBefore != nil {
reverseQueryExecution = true
req.Sort.Reverse()
req.SearchAfter = req.SearchBefore
req.SearchBefore = nil
}
var coll *collector.TopNCollector
if req.SearchAfter != nil {
coll = collector.NewTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter)
} else {
coll = collector.NewTopNCollector(req.Size, req.From, req.Sort)
}
// open a reader for this search
indexReader, err := i.i.Reader()
@ -458,6 +472,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{
Explain: req.Explain,
IncludeTermVectors: req.IncludeLocations || req.Highlight != nil,
Score: req.Score,
})
if err != nil {
return nil, err
@ -493,10 +508,10 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
facetsBuilder.Add(facetName, facetBuilder)
}
}
collector.SetFacetsBuilder(facetsBuilder)
coll.SetFacetsBuilder(facetsBuilder)
}
memNeeded := memNeededForSearch(req, searcher, collector)
memNeeded := memNeededForSearch(req, searcher, coll)
if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil {
if cbF, ok := cb.(SearchQueryStartCallbackFn); ok {
err = cbF(memNeeded)
@ -514,12 +529,12 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
}
}
err = collector.Collect(ctx, searcher, indexReader)
err = coll.Collect(ctx, searcher, indexReader)
if err != nil {
return nil, err
}
hits := collector.Results()
hits := coll.Results()
var highlighter highlight.Highlighter
@ -541,71 +556,13 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
}
for _, hit := range hits {
if len(req.Fields) > 0 || highlighter != nil {
doc, err := indexReader.Document(hit.ID)
if err == nil && doc != nil {
if len(req.Fields) > 0 {
fieldsToLoad := deDuplicate(req.Fields)
for _, f := range fieldsToLoad {
for _, docF := range doc.Fields {
if f == "*" || docF.Name() == f {
var value interface{}
switch docF := docF.(type) {
case *document.TextField:
value = string(docF.Value())
case *document.NumericField:
num, err := docF.Number()
if err == nil {
value = num
}
case *document.DateTimeField:
datetime, err := docF.DateTime()
if err == nil {
value = datetime.Format(time.RFC3339)
}
case *document.BooleanField:
boolean, err := docF.Boolean()
if err == nil {
value = boolean
}
case *document.GeoPointField:
lon, err := docF.Lon()
if err == nil {
lat, err := docF.Lat()
if err == nil {
value = []float64{lon, lat}
}
}
}
if value != nil {
hit.AddFieldValue(docF.Name(), value)
}
}
}
}
}
if highlighter != nil {
highlightFields := req.Highlight.Fields
if highlightFields == nil {
// add all fields with matches
highlightFields = make([]string, 0, len(hit.Locations))
for k := range hit.Locations {
highlightFields = append(highlightFields, k)
}
}
for _, hf := range highlightFields {
highlighter.BestFragmentsInField(hit, doc, hf, 1)
}
}
} else if doc == nil {
// unexpected case, a doc ID that was found as a search hit
// was unable to be found during document lookup
return nil, ErrorIndexReadInconsistency
}
}
if i.name != "" {
hit.Index = i.name
}
err = LoadAndHighlightFields(hit, req, i.name, indexReader, highlighter)
if err != nil {
return nil, err
}
}
atomic.AddUint64(&i.stats.searches, 1)
@ -617,6 +574,17 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
logger.Printf("slow search took %s - %v", searchDuration, req)
}
if reverseQueryExecution {
// reverse the sort back to the original
req.Sort.Reverse()
// resort using the original order
mhs := newSearchHitSorter(req.Sort, hits)
sort.Sort(mhs)
// reset request
req.SearchBefore = req.SearchAfter
req.SearchAfter = nil
}
return &SearchResult{
Status: &SearchStatus{
Total: 1,
@ -624,13 +592,82 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
},
Request: req,
Hits: hits,
Total: collector.Total(),
MaxScore: collector.MaxScore(),
Total: coll.Total(),
MaxScore: coll.MaxScore(),
Took: searchDuration,
Facets: collector.FacetResults(),
Facets: coll.FacetResults(),
}, nil
}
func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest,
indexName string, r index.IndexReader,
highlighter highlight.Highlighter) error {
if len(req.Fields) > 0 || highlighter != nil {
doc, err := r.Document(hit.ID)
if err == nil && doc != nil {
if len(req.Fields) > 0 {
fieldsToLoad := deDuplicate(req.Fields)
for _, f := range fieldsToLoad {
for _, docF := range doc.Fields {
if f == "*" || docF.Name() == f {
var value interface{}
switch docF := docF.(type) {
case *document.TextField:
value = string(docF.Value())
case *document.NumericField:
num, err := docF.Number()
if err == nil {
value = num
}
case *document.DateTimeField:
datetime, err := docF.DateTime()
if err == nil {
value = datetime.Format(time.RFC3339)
}
case *document.BooleanField:
boolean, err := docF.Boolean()
if err == nil {
value = boolean
}
case *document.GeoPointField:
lon, err := docF.Lon()
if err == nil {
lat, err := docF.Lat()
if err == nil {
value = []float64{lon, lat}
}
}
}
if value != nil {
hit.AddFieldValue(docF.Name(), value)
}
}
}
}
}
if highlighter != nil {
highlightFields := req.Highlight.Fields
if highlightFields == nil {
// add all fields with matches
highlightFields = make([]string, 0, len(hit.Locations))
for k := range hit.Locations {
highlightFields = append(highlightFields, k)
}
}
for _, hf := range highlightFields {
highlighter.BestFragmentsInField(hit, doc, hf, 1)
}
}
} else if doc == nil {
// unexpected case, a doc ID that was found as a search hit
// was unable to be found during document lookup
return ErrorIndexReadInconsistency
}
}
return nil
}
// Fields returns the name of all the fields this
// Index has operated on.
func (i *indexImpl) Fields() (fields []string, err error) {
@ -853,3 +890,26 @@ func deDuplicate(fields []string) []string {
}
return ret
}
type searchHitSorter struct {
hits search.DocumentMatchCollection
sort search.SortOrder
cachedScoring []bool
cachedDesc []bool
}
func newSearchHitSorter(sort search.SortOrder, hits search.DocumentMatchCollection) *searchHitSorter {
return &searchHitSorter{
sort: sort,
hits: hits,
cachedScoring: sort.CacheIsScore(),
cachedDesc: sort.CacheDescending(),
}
}
func (m *searchHitSorter) Len() int { return len(m.hits) }
func (m *searchHitSorter) Swap(i, j int) { m.hits[i], m.hits[j] = m.hits[j], m.hits[i] }
func (m *searchHitSorter) Less(i, j int) bool {
c := m.sort.Compare(m.cachedScoring, m.cachedDesc, m.hits[i], m.hits[j])
return c < 0
}

View file

@ -18,6 +18,7 @@ import (
"encoding/json"
"io/ioutil"
"os"
"path/filepath"
"github.com/blevesearch/bleve/index/upsidedown"
)
@ -92,5 +93,5 @@ func (i *indexMeta) Save(path string) (err error) {
}
func indexMetaPath(path string) string {
return path + string(os.PathSeparator) + metaFilename
return filepath.Join(path, metaFilename)
}

View file

@ -42,7 +42,7 @@ type DocumentMapping struct {
Dynamic bool `json:"dynamic"`
Properties map[string]*DocumentMapping `json:"properties,omitempty"`
Fields []*FieldMapping `json:"fields,omitempty"`
DefaultAnalyzer string `json:"default_analyzer"`
DefaultAnalyzer string `json:"default_analyzer,omitempty"`
// StructTagKey overrides "json" when looking for field names in struct tags
StructTagKey string `json:"struct_tag_key,omitempty"`
@ -324,13 +324,17 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string {
}
func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) {
// allow default "json" tag to be overriden
// allow default "json" tag to be overridden
structTagKey := dm.StructTagKey
if structTagKey == "" {
structTagKey = "json"
}
val := reflect.ValueOf(data)
if !val.IsValid() {
return
}
typ := val.Type()
switch typ.Kind() {
case reflect.Map:
@ -420,7 +424,11 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
if subDocMapping != nil {
// index by explicit mapping
for _, fieldMapping := range subDocMapping.Fields {
fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
if fieldMapping.Type == "geopoint" {
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
} else {
fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
}
}
} else if closestDocMapping.Dynamic {
// automatic indexing behavior
@ -517,19 +525,27 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
if !propertyValue.IsNil() {
switch property := property.(type) {
case encoding.TextMarshaler:
txt, err := property.MarshalText()
if err == nil && subDocMapping != nil {
// index by explicit mapping
// ONLY process TextMarshaler if there is an explicit mapping
// AND all of the fiels are of type text
// OTHERWISE process field without TextMarshaler
if subDocMapping != nil {
allFieldsText := true
for _, fieldMapping := range subDocMapping.Fields {
if fieldMapping.Type == "text" {
fieldMapping.processString(string(txt), pathString, path, indexes, context)
if fieldMapping.Type != "text" {
allFieldsText = false
break
}
}
} else {
dm.walkDocument(property, path, indexes, context)
txt, err := property.MarshalText()
if err == nil && allFieldsText {
txtStr := string(txt)
for _, fieldMapping := range subDocMapping.Fields {
fieldMapping.processString(txtStr, pathString, path, indexes, context)
}
return
}
}
dm.walkDocument(property, path, indexes, context)
default:
dm.walkDocument(property, path, indexes, context)
}

View file

@ -320,8 +320,8 @@ func (im *IndexMappingImpl) determineType(data interface{}) string {
func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error {
docType := im.determineType(data)
docMapping := im.mappingForType(docType)
walkContext := im.newWalkContext(doc, docMapping)
if docMapping.Enabled {
walkContext := im.newWalkContext(doc, docMapping)
docMapping.walkDocument(data, []string{}, []uint64{}, walkContext)
// see if the _all field was disabled

View file

@ -35,6 +35,9 @@ func lookupPropertyPath(data interface{}, path string) interface{} {
func lookupPropertyPathPart(data interface{}, part string) interface{} {
val := reflect.ValueOf(data)
if !val.IsValid() {
return nil
}
typ := val.Type()
switch typ.Kind() {
case reflect.Map:

View file

@ -14,7 +14,7 @@ var interleaveShift = []uint{1, 2, 4, 8, 16}
// Interleave the first 32 bits of each uint64
// apdated from org.apache.lucene.util.BitUtil
// whcih was adapted from:
// which was adapted from:
// http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
func Interleave(v1, v2 uint64) uint64 {
v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4]

View file

@ -23,12 +23,26 @@ const ShiftStartInt64 byte = 0x20
type PrefixCoded []byte
func NewPrefixCodedInt64(in int64, shift uint) (PrefixCoded, error) {
rv, _, err := NewPrefixCodedInt64Prealloc(in, shift, nil)
return rv, err
}
func NewPrefixCodedInt64Prealloc(in int64, shift uint, prealloc []byte) (
rv PrefixCoded, preallocRest []byte, err error) {
if shift > 63 {
return nil, fmt.Errorf("cannot shift %d, must be between 0 and 63", shift)
return nil, prealloc, fmt.Errorf("cannot shift %d, must be between 0 and 63", shift)
}
nChars := ((63 - shift) / 7) + 1
rv := make(PrefixCoded, nChars+1)
size := int(nChars + 1)
if len(prealloc) >= size {
rv = PrefixCoded(prealloc[0:size])
preallocRest = prealloc[size:]
} else {
rv = make(PrefixCoded, size)
}
rv[0] = ShiftStartInt64 + byte(shift)
sortableBits := int64(uint64(in) ^ 0x8000000000000000)
@ -40,7 +54,8 @@ func NewPrefixCodedInt64(in int64, shift uint) (PrefixCoded, error) {
nChars--
sortableBits = int64(uint64(sortableBits) >> 7)
}
return rv, nil
return rv, preallocRest, nil
}
func MustNewPrefixCodedInt64(in int64, shift uint) PrefixCoded {

View file

@ -261,6 +261,9 @@ func (h *HighlightRequest) AddField(field string) {
// Explain triggers inclusion of additional search
// result score explanations.
// Sort describes the desired order for the results to be returned.
// Score controls the kind of scoring performed
// SearchAfter supports deep paging by providing a minimum sort key
// SearchBefore supports deep paging by providing a maximum sort key
//
// A special field named "*" can be used to return all fields.
type SearchRequest struct {
@ -273,6 +276,9 @@ type SearchRequest struct {
Explain bool `json:"explain"`
Sort search.SortOrder `json:"sort"`
IncludeLocations bool `json:"includeLocations"`
Score string `json:"score,omitempty"`
SearchAfter []string `json:"search_after"`
SearchBefore []string `json:"search_before"`
}
func (r *SearchRequest) Validate() error {
@ -283,6 +289,27 @@ func (r *SearchRequest) Validate() error {
}
}
if r.SearchAfter != nil && r.SearchBefore != nil {
return fmt.Errorf("cannot use search after and search before together")
}
if r.SearchAfter != nil {
if r.From != 0 {
return fmt.Errorf("cannot use search after with from !=0")
}
if len(r.SearchAfter) != len(r.Sort) {
return fmt.Errorf("search after must have same size as sort order")
}
}
if r.SearchBefore != nil {
if r.From != 0 {
return fmt.Errorf("cannot use search before with from !=0")
}
if len(r.SearchBefore) != len(r.Sort) {
return fmt.Errorf("search before must have same size as sort order")
}
}
return r.Facets.Validate()
}
@ -309,6 +336,18 @@ func (r *SearchRequest) SortByCustom(order search.SortOrder) {
r.Sort = order
}
// SetSearchAfter sets the request to skip over hits with a sort
// value less than the provided sort after key
func (r *SearchRequest) SetSearchAfter(after []string) {
r.SearchAfter = after
}
// SetSearchBefore sets the request to skip over hits with a sort
// value greater than the provided sort before key
func (r *SearchRequest) SetSearchBefore(before []string) {
r.SearchBefore = before
}
// UnmarshalJSON deserializes a JSON representation of
// a SearchRequest
func (r *SearchRequest) UnmarshalJSON(input []byte) error {
@ -322,6 +361,9 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error {
Explain bool `json:"explain"`
Sort []json.RawMessage `json:"sort"`
IncludeLocations bool `json:"includeLocations"`
Score string `json:"score"`
SearchAfter []string `json:"search_after"`
SearchBefore []string `json:"search_before"`
}
err := json.Unmarshal(input, &temp)
@ -348,6 +390,9 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error {
r.Fields = temp.Fields
r.Facets = temp.Facets
r.IncludeLocations = temp.IncludeLocations
r.Score = temp.Score
r.SearchAfter = temp.SearchAfter
r.SearchBefore = temp.SearchBefore
r.Query, err = query.ParseQuery(temp.Q)
if err != nil {
return err

View file

@ -30,3 +30,23 @@ type Collector interface {
SetFacetsBuilder(facetsBuilder *FacetsBuilder)
FacetResults() FacetResults
}
// DocumentMatchHandler is the type of document match callback
// bleve will invoke during the search.
// Eventually, bleve will indicate the completion of an ongoing search,
// by passing a nil value for the document match callback.
// The application should take a copy of the hit/documentMatch
// if it wish to own it or need prolonged access to it.
type DocumentMatchHandler func(hit *DocumentMatch) error
type MakeDocumentMatchHandlerKeyType string
var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType(
"MakeDocumentMatchHandlerKey")
// MakeDocumentMatchHandler is an optional DocumentMatchHandler
// builder function which the applications can pass to bleve.
// These builder methods gives a DocumentMatchHandler function
// to bleve, which it will invoke on every document matches.
type MakeDocumentMatchHandler func(ctx *SearchContext) (
callback DocumentMatchHandler, loadID bool, err error)

View file

@ -25,9 +25,9 @@ type collectStoreHeap struct {
compare collectorCompare
}
func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap {
func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap {
rv := &collectStoreHeap{
heap: make(search.DocumentMatchCollection, 0, cap),
heap: make(search.DocumentMatchCollection, 0, capacity),
compare: compare,
}
heap.Init(rv)

View file

@ -25,7 +25,7 @@ type collectStoreList struct {
compare collectorCompare
}
func newStoreList(cap int, compare collectorCompare) *collectStoreList {
func newStoreList(capacity int, compare collectorCompare) *collectStoreList {
rv := &collectStoreList{
results: list.New(),
compare: compare,
@ -34,8 +34,7 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList {
return rv
}
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch,
size int) *search.DocumentMatch {
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch {
c.add(doc)
if c.len() > size {
return c.removeLast()

View file

@ -21,9 +21,9 @@ type collectStoreSlice struct {
compare collectorCompare
}
func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice {
func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice {
rv := &collectStoreSlice{
slice: make(search.DocumentMatchCollection, 0, cap),
slice: make(search.DocumentMatchCollection, 0, capacity),
compare: compare,
}
return rv

View file

@ -17,6 +17,7 @@ package collector
import (
"context"
"reflect"
"strconv"
"time"
"github.com/blevesearch/bleve/index"
@ -69,6 +70,7 @@ type TopNCollector struct {
lowestMatchOutsideResults *search.DocumentMatch
updateFieldVisitor index.DocumentFieldTermVisitor
dvReader index.DocValueReader
searchAfter *search.DocumentMatch
}
// CheckDoneEvery controls how frequently we check the context deadline
@ -78,6 +80,33 @@ const CheckDoneEvery = uint64(1024)
// skipping over the first 'skip' hits
// ordering hits by the provided sort order
func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
return newTopNCollector(size, skip, sort)
}
// NewTopNCollector builds a collector to find the top 'size' hits
// skipping over the first 'skip' hits
// ordering hits by the provided sort order
func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector {
rv := newTopNCollector(size, 0, sort)
rv.searchAfter = &search.DocumentMatch{
Sort: after,
}
for pos, ss := range sort {
if ss.RequiresDocID() {
rv.searchAfter.ID = after[pos]
}
if ss.RequiresScoring() {
if score, err := strconv.ParseFloat(after[pos], 64); err == nil {
rv.searchAfter.Score = score
}
}
}
return rv
}
func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
hc := &TopNCollector{size: size, skip: skip, sort: sort}
// pre-allocate space on the store to avoid reslicing
@ -140,6 +169,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
}
searchContext := &search.SearchContext{
DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
Collector: hc,
IndexReader: reader,
}
hc.dvReader, err = reader.DocValueReader(hc.neededFields)
@ -154,6 +185,19 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
hc.sort.UpdateVisitor(field, term)
}
dmHandlerMaker := MakeTopNDocumentMatchHandler
if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil {
dmHandlerMaker = cv.(search.MakeDocumentMatchHandler)
}
// use the application given builder for making the custom document match
// handler and perform callbacks/invocations on the newly made handler.
dmHandler, loadID, err := dmHandlerMaker(searchContext)
if err != nil {
return err
}
hc.needDocIds = hc.needDocIds || loadID
select {
case <-ctx.Done():
return ctx.Err()
@ -169,13 +213,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
}
}
err = hc.collectSingle(searchContext, reader, next)
err = hc.prepareDocumentMatch(searchContext, reader, next)
if err != nil {
break
}
err = dmHandler(next)
if err != nil {
break
}
next, err = searcher.Next(searchContext)
}
// help finalize/flush the results in case
// of custom document match handlers.
err = dmHandler(nil)
if err != nil {
return err
}
// compute search duration
hc.took = time.Since(startTime)
if err != nil {
@ -191,8 +248,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
var sortByScoreOpt = []string{"_score"}
func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error {
var err error
func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
reader index.IndexReader, d *search.DocumentMatch) (err error) {
// visit field terms for features that require it (sort, facets)
if len(hc.neededFields) > 0 {
@ -226,35 +283,64 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
hc.sort.Value(d)
}
// optimization, we track lowest sorting hit already removed from heap
// with this one comparison, we can avoid all heap operations if
// this hit would have been added and then immediately removed
if hc.lowestMatchOutsideResults != nil {
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults)
if cmp >= 0 {
// this hit can't possibly be in the result set, so avoid heap ops
ctx.DocumentMatchPool.Put(d)
return nil
}
}
removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip)
if removed != nil {
if hc.lowestMatchOutsideResults == nil {
hc.lowestMatchOutsideResults = removed
} else {
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults)
if cmp < 0 {
tmp := hc.lowestMatchOutsideResults
hc.lowestMatchOutsideResults = removed
ctx.DocumentMatchPool.Put(tmp)
}
}
}
return nil
}
func MakeTopNDocumentMatchHandler(
ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) {
var hc *TopNCollector
var ok bool
if hc, ok = ctx.Collector.(*TopNCollector); ok {
return func(d *search.DocumentMatch) error {
if d == nil {
return nil
}
// support search after based pagination,
// if this hit is <= the search after sort key
// we should skip it
if hc.searchAfter != nil {
// exact sort order matches use hit number to break tie
// but we want to allow for exact match, so we pretend
hc.searchAfter.HitNumber = d.HitNumber
if hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.searchAfter) <= 0 {
return nil
}
}
// optimization, we track lowest sorting hit already removed from heap
// with this one comparison, we can avoid all heap operations if
// this hit would have been added and then immediately removed
if hc.lowestMatchOutsideResults != nil {
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d,
hc.lowestMatchOutsideResults)
if cmp >= 0 {
// this hit can't possibly be in the result set, so avoid heap ops
ctx.DocumentMatchPool.Put(d)
return nil
}
}
removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip)
if removed != nil {
if hc.lowestMatchOutsideResults == nil {
hc.lowestMatchOutsideResults = removed
} else {
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc,
removed, hc.lowestMatchOutsideResults)
if cmp < 0 {
tmp := hc.lowestMatchOutsideResults
hc.lowestMatchOutsideResults = removed
ctx.DocumentMatchPool.Put(tmp)
}
}
}
return nil
}, false, nil
}
return nil, false, nil
}
// visitFieldTerms is responsible for visiting the field terms of the
// search hit, and passing visited terms to the sort and facet builder
func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch) error {

View file

@ -54,14 +54,14 @@ type FacetBuilder interface {
type FacetsBuilder struct {
indexReader index.IndexReader
facets map[string]FacetBuilder
facetNames []string
facets []FacetBuilder
fields []string
}
func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder {
return &FacetsBuilder{
indexReader: indexReader,
facets: make(map[string]FacetBuilder, 0),
}
}
@ -69,8 +69,7 @@ func (fb *FacetsBuilder) Size() int {
sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr
for k, v := range fb.facets {
sizeInBytes += size.SizeOfString + len(k) +
v.Size()
sizeInBytes += size.SizeOfString + v.Size() + len(fb.facetNames[k])
}
for _, entry := range fb.fields {
@ -81,7 +80,8 @@ func (fb *FacetsBuilder) Size() int {
}
func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) {
fb.facets[name] = facetBuilder
fb.facetNames = append(fb.facetNames, name)
fb.facets = append(fb.facets, facetBuilder)
fb.fields = append(fb.fields, facetBuilder.Field())
}
@ -333,9 +333,9 @@ func (fr FacetResults) Fixup(name string, size int) {
func (fb *FacetsBuilder) Results() FacetResults {
fr := make(FacetResults)
for facetName, facetBuilder := range fb.facets {
for i, facetBuilder := range fb.facets {
facetResult := facetBuilder.Result()
fr[facetName] = facetResult
fr[fb.facetNames[i]] = facetResult
}
return fr
}

View file

@ -58,6 +58,11 @@ OUTER:
// push back towards beginning
// without cross maxbegin
for start > 0 && used < s.fragmentSize {
if start > len(orig) {
// bail if out of bounds, possibly due to token replacement
// e.g with a regexp replacement
continue OUTER
}
r, size := utf8.DecodeLastRune(orig[0:start])
if r == utf8.RuneError {
continue OUTER // bail

View file

@ -70,9 +70,11 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
}
ss = append(ss, sr)
}
if len(ss) < 1 {
return searcher.NewMatchNoneSearcher(i)
}
return searcher.NewConjunctionSearcher(i, ss, options)
}

View file

@ -41,6 +41,14 @@ type BleveQueryTime struct {
time.Time
}
var MinRFC3339CompatibleTime time.Time
var MaxRFC3339CompatibleTime time.Time
func init() {
MinRFC3339CompatibleTime, _ = time.Parse(time.RFC3339, "1677-12-01T00:00:00Z")
MaxRFC3339CompatibleTime, _ = time.Parse(time.RFC3339, "2262-04-11T11:59:59Z")
}
func queryTimeFromString(t string) (time.Time, error) {
dateTimeParser, err := cache.DateTimeParserNamed(QueryDateTimeParser)
if err != nil {
@ -143,10 +151,20 @@ func (q *DateRangeQuery) parseEndpoints() (*float64, *float64, error) {
min := math.Inf(-1)
max := math.Inf(1)
if !q.Start.IsZero() {
min = numeric.Int64ToFloat64(q.Start.UnixNano())
if !isDatetimeCompatible(q.Start) {
// overflow
return nil, nil, fmt.Errorf("invalid/unsupported date range, start: %v", q.Start)
}
startInt64 := q.Start.UnixNano()
min = numeric.Int64ToFloat64(startInt64)
}
if !q.End.IsZero() {
max = numeric.Int64ToFloat64(q.End.UnixNano())
if !isDatetimeCompatible(q.End) {
// overflow
return nil, nil, fmt.Errorf("invalid/unsupported date range, end: %v", q.End)
}
endInt64 := q.End.UnixNano()
max = numeric.Int64ToFloat64(endInt64)
}
return &min, &max, nil
@ -162,3 +180,12 @@ func (q *DateRangeQuery) Validate() error {
}
return nil
}
func isDatetimeCompatible(t BleveQueryTime) bool {
if QueryDateTimeFormat == time.RFC3339 &&
(t.Before(MinRFC3339CompatibleTime) || t.After(MaxRFC3339CompatibleTime)) {
return false
}
return true
}

View file

@ -58,7 +58,8 @@ func (q *DisjunctionQuery) SetMin(m float64) {
q.Min = m
}
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
options search.SearcherOptions) (search.Searcher, error) {
ss := make([]search.Searcher, 0, len(q.Disjuncts))
for _, disjunct := range q.Disjuncts {
sr, err := disjunct.Searcher(i, m, options)
@ -76,9 +77,11 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
}
ss = append(ss, sr)
}
if len(ss) < 1 {
return searcher.NewMatchNoneSearcher(i)
}
return searcher.NewDisjunctionSearcher(i, ss, q.Min, options)
}

View file

@ -0,0 +1,94 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package query
import (
"encoding/json"
"fmt"
"github.com/blevesearch/bleve/geo"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/searcher"
)
type GeoBoundingPolygonQuery struct {
Points []geo.Point `json:"polygon_points"`
FieldVal string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
}
func NewGeoBoundingPolygonQuery(points []geo.Point) *GeoBoundingPolygonQuery {
return &GeoBoundingPolygonQuery{
Points: points}
}
func (q *GeoBoundingPolygonQuery) SetBoost(b float64) {
boost := Boost(b)
q.BoostVal = &boost
}
func (q *GeoBoundingPolygonQuery) Boost() float64 {
return q.BoostVal.Value()
}
func (q *GeoBoundingPolygonQuery) SetField(f string) {
q.FieldVal = f
}
func (q *GeoBoundingPolygonQuery) Field() string {
return q.FieldVal
}
func (q *GeoBoundingPolygonQuery) Searcher(i index.IndexReader,
m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
field := q.FieldVal
if q.FieldVal == "" {
field = m.DefaultSearchField()
}
return searcher.NewGeoBoundedPolygonSearcher(i, q.Points, field, q.BoostVal.Value(), options)
}
func (q *GeoBoundingPolygonQuery) Validate() error {
return nil
}
func (q *GeoBoundingPolygonQuery) UnmarshalJSON(data []byte) error {
tmp := struct {
Points []interface{} `json:"polygon_points"`
FieldVal string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
}{}
err := json.Unmarshal(data, &tmp)
if err != nil {
return err
}
q.Points = make([]geo.Point, 0, len(tmp.Points))
for _, i := range tmp.Points {
// now use our generic point parsing code from the geo package
lon, lat, found := geo.ExtractGeoPoint(i)
if !found {
return fmt.Errorf("geo polygon point: %v is not in a valid format", i)
}
q.Points = append(q.Points, geo.Point{Lon: lon, Lat: lat})
}
q.FieldVal = tmp.FieldVal
q.BoostVal = tmp.BoostVal
return nil
}

View file

@ -273,6 +273,15 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, hasPoints := tmp["polygon_points"]
if hasPoints {
var rv GeoBoundingPolygonQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
return nil, fmt.Errorf("unknown query type")
}
@ -296,32 +305,28 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) {
}
expand = func(query Query) (Query, error) {
switch query.(type) {
switch q := query.(type) {
case *QueryStringQuery:
q := query.(*QueryStringQuery)
parsed, err := parseQuerySyntax(q.Query)
if err != nil {
return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err)
}
return expand(parsed)
case *ConjunctionQuery:
q := *query.(*ConjunctionQuery)
children, err := expandSlice(q.Conjuncts)
if err != nil {
return nil, err
}
q.Conjuncts = children
return &q, nil
return q, nil
case *DisjunctionQuery:
q := *query.(*DisjunctionQuery)
children, err := expandSlice(q.Disjuncts)
if err != nil {
return nil, err
}
q.Disjuncts = children
return &q, nil
return q, nil
case *BooleanQuery:
q := *query.(*BooleanQuery)
var err error
q.Must, err = expand(q.Must)
if err != nil {
@ -335,7 +340,7 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) {
if err != nil {
return nil, err
}
return &q, nil
return q, nil
default:
return query, nil
}

View file

@ -273,6 +273,7 @@ func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
// see where to go
if !l.seenDot && next == '.' {
// stay in this state
l.seenDot = true
l.buf += string(next)
return inNumOrStrState, true
} else if unicode.IsDigit(next) {

View file

@ -15,7 +15,6 @@
package query
import (
"regexp"
"strings"
"github.com/blevesearch/bleve/index"
@ -28,7 +27,6 @@ type RegexpQuery struct {
Regexp string `json:"regexp"`
FieldVal string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
compiled *regexp.Regexp
}
// NewRegexpQuery creates a new Query which finds
@ -64,33 +62,20 @@ func (q *RegexpQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, opti
if q.FieldVal == "" {
field = m.DefaultSearchField()
}
err := q.compile()
if err != nil {
return nil, err
// require that pattern NOT be anchored to start and end of term.
// do not attempt to remove trailing $, its presence is not
// known to interfere with LiteralPrefix() the way ^ does
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
actualRegexp := q.Regexp
if strings.HasPrefix(actualRegexp, "^") {
actualRegexp = actualRegexp[1:] // remove leading ^
}
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options)
return searcher.NewRegexpStringSearcher(i, actualRegexp, field,
q.BoostVal.Value(), options)
}
func (q *RegexpQuery) Validate() error {
return q.compile()
}
func (q *RegexpQuery) compile() error {
if q.compiled == nil {
// require that pattern NOT be anchored to start and end of term
actualRegexp := q.Regexp
if strings.HasPrefix(actualRegexp, "^") {
actualRegexp = actualRegexp[1:] // remove leading ^
}
// do not attempt to remove trailing $, it's presence is not
// known to interfere with LiteralPrefix() the way ^ does
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
var err error
q.compiled, err = regexp.Compile(actualRegexp)
if err != nil {
return err
}
}
return nil
return nil // real validation delayed until searcher constructor
}

View file

@ -15,7 +15,6 @@
package query
import (
"regexp"
"strings"
"github.com/blevesearch/bleve/index"
@ -47,7 +46,6 @@ type WildcardQuery struct {
Wildcard string `json:"wildcard"`
FieldVal string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
compiled *regexp.Regexp
}
// NewWildcardQuery creates a new Query which finds
@ -83,24 +81,13 @@ func (q *WildcardQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, op
if q.FieldVal == "" {
field = m.DefaultSearchField()
}
if q.compiled == nil {
var err error
q.compiled, err = q.convertToRegexp()
if err != nil {
return nil, err
}
}
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options)
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
return searcher.NewRegexpStringSearcher(i, regexpString, field,
q.BoostVal.Value(), options)
}
func (q *WildcardQuery) Validate() error {
var err error
q.compiled, err = q.convertToRegexp()
return err
}
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) {
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
return regexp.Compile(regexpString)
return nil // real validation delayed until searcher constructor
}

View file

@ -40,6 +40,7 @@ type TermQueryScorer struct {
idf float64
options search.SearcherOptions
idfExplanation *search.Explanation
includeScore bool
queryNorm float64
queryWeight float64
queryWeightExplanation *search.Explanation
@ -62,14 +63,15 @@ func (s *TermQueryScorer) Size() int {
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer {
rv := TermQueryScorer{
queryTerm: string(queryTerm),
queryField: queryField,
queryBoost: queryBoost,
docTerm: docTerm,
docTotal: docTotal,
idf: 1.0 + math.Log(float64(docTotal)/float64(docTerm+1.0)),
options: options,
queryWeight: 1.0,
queryTerm: string(queryTerm),
queryField: queryField,
queryBoost: queryBoost,
docTerm: docTerm,
docTotal: docTotal,
idf: 1.0 + math.Log(float64(docTotal)/float64(docTerm+1.0)),
options: options,
queryWeight: 1.0,
includeScore: options.Score != "none",
}
if options.Explain {
@ -113,56 +115,61 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
}
func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.TermFieldDoc) *search.DocumentMatch {
var scoreExplanation *search.Explanation
// need to compute score
var tf float64
if termMatch.Freq < MaxSqrtCache {
tf = SqrtCache[int(termMatch.Freq)]
} else {
tf = math.Sqrt(float64(termMatch.Freq))
}
score := tf * termMatch.Norm * s.idf
if s.options.Explain {
childrenExplanations := make([]*search.Explanation, 3)
childrenExplanations[0] = &search.Explanation{
Value: tf,
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
rv := ctx.DocumentMatchPool.Get()
// perform any score computations only when needed
if s.includeScore || s.options.Explain {
var scoreExplanation *search.Explanation
var tf float64
if termMatch.Freq < MaxSqrtCache {
tf = SqrtCache[int(termMatch.Freq)]
} else {
tf = math.Sqrt(float64(termMatch.Freq))
}
childrenExplanations[1] = &search.Explanation{
Value: termMatch.Norm,
Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID),
}
childrenExplanations[2] = s.idfExplanation
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID),
Children: childrenExplanations,
}
}
score := tf * termMatch.Norm * s.idf
// if the query weight isn't 1, multiply
if s.queryWeight != 1.0 {
score = score * s.queryWeight
if s.options.Explain {
childExplanations := make([]*search.Explanation, 2)
childExplanations[0] = s.queryWeightExplanation
childExplanations[1] = scoreExplanation
childrenExplanations := make([]*search.Explanation, 3)
childrenExplanations[0] = &search.Explanation{
Value: tf,
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
}
childrenExplanations[1] = &search.Explanation{
Value: termMatch.Norm,
Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID),
}
childrenExplanations[2] = s.idfExplanation
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID),
Children: childExplanations,
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID),
Children: childrenExplanations,
}
}
// if the query weight isn't 1, multiply
if s.queryWeight != 1.0 {
score = score * s.queryWeight
if s.options.Explain {
childExplanations := make([]*search.Explanation, 2)
childExplanations[0] = s.queryWeightExplanation
childExplanations[1] = scoreExplanation
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID),
Children: childExplanations,
}
}
}
if s.includeScore {
rv.Score = score
}
if s.options.Explain {
rv.Expl = scoreExplanation
}
}
rv := ctx.DocumentMatchPool.Get()
rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...)
rv.Score = score
if s.options.Explain {
rv.Expl = scoreExplanation
}
if len(termMatch.Vectors) > 0 {
if cap(rv.FieldTermLocations) < len(termMatch.Vectors) {

View file

@ -17,8 +17,8 @@ package search
import (
"fmt"
"reflect"
"sort"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/size"
)
@ -50,6 +50,24 @@ func (ap ArrayPositions) Equals(other ArrayPositions) bool {
return true
}
func (ap ArrayPositions) Compare(other ArrayPositions) int {
for i, p := range ap {
if i >= len(other) {
return 1
}
if p < other[i] {
return -1
}
if p > other[i] {
return 1
}
}
if len(ap) < len(other) {
return -1
}
return 0
}
type Location struct {
// Pos is the position of the term within the field, starting at 1
Pos uint64 `json:"pos"`
@ -69,6 +87,46 @@ func (l *Location) Size() int {
type Locations []*Location
func (p Locations) Len() int { return len(p) }
func (p Locations) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
func (p Locations) Less(i, j int) bool {
c := p[i].ArrayPositions.Compare(p[j].ArrayPositions)
if c < 0 {
return true
}
if c > 0 {
return false
}
return p[i].Pos < p[j].Pos
}
func (p Locations) Dedupe() Locations { // destructive!
if len(p) <= 1 {
return p
}
sort.Sort(p)
slow := 0
for _, pfast := range p {
pslow := p[slow]
if pslow.Pos == pfast.Pos &&
pslow.Start == pfast.Start &&
pslow.End == pfast.End &&
pslow.ArrayPositions.Equals(pfast.ArrayPositions) {
continue // duplicate, so only move fast ahead
}
slow++
p[slow] = pfast
}
return p[:slow+1]
}
type TermLocationMap map[string]Locations
func (t TermLocationMap) AddLocation(term string, location *Location) {
@ -100,9 +158,6 @@ type DocumentMatch struct {
// fields as float64s and date fields as time.RFC3339 formatted strings.
Fields map[string]interface{} `json:"fields,omitempty"`
// if we load the document for this hit, remember it so we dont load again
Document *document.Document `json:"-"`
// used to maintain natural index order
HitNumber uint64 `json:"-"`
@ -195,10 +250,6 @@ func (dm *DocumentMatch) Size() int {
size.SizeOfPtr
}
if dm.Document != nil {
sizeInBytes += dm.Document.Size()
}
return sizeInBytes
}
@ -216,6 +267,7 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
var lastField string
var tlm TermLocationMap
var needsDedupe bool
for i, ftl := range dm.FieldTermLocations {
if lastField != ftl.Field {
@ -239,7 +291,19 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...)
}
tlm[ftl.Term] = append(tlm[ftl.Term], loc)
locs := tlm[ftl.Term]
// if the loc is before or at the last location, then there
// might be duplicates that need to be deduplicated
if !needsDedupe && len(locs) > 0 {
last := locs[len(locs)-1]
cmp := loc.ArrayPositions.Compare(last.ArrayPositions)
if cmp < 0 || (cmp == 0 && loc.Pos <= last.Pos) {
needsDedupe = true
}
}
tlm[ftl.Term] = append(locs, loc)
dm.FieldTermLocations[i] = FieldTermLocation{ // recycle
Location: Location{
@ -247,6 +311,14 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
},
}
}
if needsDedupe {
for _, tlm := range dm.Locations {
for term, locs := range tlm {
tlm[term] = locs.Dedupe()
}
}
}
}
dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle
@ -280,11 +352,14 @@ type Searcher interface {
type SearcherOptions struct {
Explain bool
IncludeTermVectors bool
Score string
}
// SearchContext represents the context around a single search
type SearchContext struct {
DocumentMatchPool *DocumentMatchPool
Collector Collector
IndexReader index.IndexReader
}
func (sc *SearchContext) Size() int {

View file

@ -45,6 +45,7 @@ type BooleanSearcher struct {
scorer *scorer.ConjunctionQueryScorer
matches []*search.DocumentMatch
initialized bool
done bool
}
func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searcher, shouldSearcher search.Searcher, mustNotSearcher search.Searcher, options search.SearcherOptions) (*BooleanSearcher, error) {
@ -207,6 +208,10 @@ func (s *BooleanSearcher) SetQueryNorm(qnorm float64) {
func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
if s.done {
return nil, nil
}
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
@ -319,11 +324,20 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch
return nil, err
}
}
if rv == nil {
s.done = true
}
return rv, nil
}
func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) {
if s.done {
return nil, nil
}
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
@ -331,41 +345,51 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
}
}
var err error
if s.mustSearcher != nil {
if s.currMust != nil {
ctx.DocumentMatchPool.Put(s.currMust)
// Advance the searcher only if the cursor is trailing the lookup ID
if s.currentID == nil || s.currentID.Compare(ID) < 0 {
var err error
if s.mustSearcher != nil {
if s.currMust != nil {
ctx.DocumentMatchPool.Put(s.currMust)
}
s.currMust, err = s.mustSearcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
s.currMust, err = s.mustSearcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
if s.shouldSearcher != nil {
if s.currShould != nil {
ctx.DocumentMatchPool.Put(s.currShould)
}
s.currShould, err = s.shouldSearcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
if s.mustNotSearcher != nil {
if s.currMustNot != nil {
ctx.DocumentMatchPool.Put(s.currMustNot)
}
s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
if s.mustSearcher != nil && s.currMust != nil {
s.currentID = s.currMust.IndexInternalID
} else if s.mustSearcher == nil && s.currShould != nil {
s.currentID = s.currShould.IndexInternalID
} else {
s.currentID = nil
if s.shouldSearcher != nil {
if s.currShould != nil {
ctx.DocumentMatchPool.Put(s.currShould)
}
s.currShould, err = s.shouldSearcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
if s.mustNotSearcher != nil {
// Additional check for mustNotSearcher, whose cursor isn't tracked by
// currentID to prevent it from moving when the searcher's tracked
// position is already ahead of or at the requested ID.
if s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0 {
if s.currMustNot != nil {
ctx.DocumentMatchPool.Put(s.currMustNot)
}
s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID)
if err != nil {
return nil, err
}
}
}
if s.mustSearcher != nil && s.currMust != nil {
s.currentID = s.currMust.IndexInternalID
} else if s.mustSearcher == nil && s.currShould != nil {
s.currentID = s.currShould.IndexInternalID
} else {
s.currentID = nil
}
}
return s.Next(ctx)

View file

@ -43,14 +43,27 @@ type ConjunctionSearcher struct {
options search.SearcherOptions
}
func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) (*ConjunctionSearcher, error) {
// build the downstream searchers
func NewConjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, options search.SearcherOptions) (
search.Searcher, error) {
// build the sorted downstream searchers
searchers := make(OrderedSearcherList, len(qsearchers))
for i, searcher := range qsearchers {
searchers[i] = searcher
}
// sort the searchers
sort.Sort(searchers)
// attempt the "unadorned" conjunction optimization only when we
// do not need extra information like freq-norm's or term vectors
if len(searchers) > 1 &&
options.Score == "none" && !options.IncludeTermVectors {
rv, err := optimizeCompositeSearcher("conjunction:unadorned",
indexReader, searchers, options)
if err != nil || rv != nil {
return rv, err
}
}
// build our searcher
rv := ConjunctionSearcher{
indexReader: indexReader,
@ -63,24 +76,10 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S
// attempt push-down conjunction optimization when there's >1 searchers
if len(searchers) > 1 {
var octx index.OptimizableContext
for _, searcher := range searchers {
o, ok := searcher.(index.Optimizable)
if ok {
var err error
octx, err = o.Optimize("conjunction", octx)
if err != nil {
return nil, err
}
}
}
if octx != nil {
err := octx.Finish()
if err != nil {
return nil, err
}
rv, err := optimizeCompositeSearcher("conjunction",
indexReader, searchers, options)
if err != nil || rv != nil {
return rv, err
}
}
@ -158,7 +157,7 @@ func (s *ConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentM
var rv *search.DocumentMatch
var err error
OUTER:
for s.currs[s.maxIDIdx] != nil {
for s.maxIDIdx < len(s.currs) && s.currs[s.maxIDIdx] != nil {
maxID := s.currs[s.maxIDIdx].IndexInternalID
i := 0

View file

@ -40,6 +40,18 @@ func NewDisjunctionSearcher(indexReader index.IndexReader,
func newDisjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
limit bool) (search.Searcher, error) {
// attempt the "unadorned" disjunction optimization only when we
// do not need extra information like freq-norm's or term vectors
// and the requested min is simple
if len(qsearchers) > 1 && min <= 1 &&
options.Score == "none" && !options.IncludeTermVectors {
rv, err := optimizeCompositeSearcher("disjunction:unadorned",
indexReader, qsearchers, options)
if err != nil || rv != nil {
return rv, err
}
}
if len(qsearchers) > DisjunctionHeapTakeover {
return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options,
limit)
@ -48,6 +60,42 @@ func newDisjunctionSearcher(indexReader index.IndexReader,
limit)
}
func optimizeCompositeSearcher(optimizationKind string,
indexReader index.IndexReader, qsearchers []search.Searcher,
options search.SearcherOptions) (search.Searcher, error) {
var octx index.OptimizableContext
for _, searcher := range qsearchers {
o, ok := searcher.(index.Optimizable)
if !ok {
return nil, nil
}
var err error
octx, err = o.Optimize(optimizationKind, octx)
if err != nil {
return nil, err
}
if octx == nil {
return nil, nil
}
}
optimized, err := octx.Finish()
if err != nil || optimized == nil {
return nil, err
}
tfr, ok := optimized.(index.TermFieldReader)
if !ok {
return nil, nil
}
return newTermSearcherFromReader(indexReader, tfr,
[]byte(optimizationKind), "*", 1.0, options)
}
func tooManyClauses(count int) bool {
if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount {
return true
@ -55,7 +103,7 @@ func tooManyClauses(count int) bool {
return false
}
func tooManyClausesErr() error {
return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]",
DisjunctionMaxClauseCount)
func tooManyClausesErr(count int) error {
return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]",
count, DisjunctionMaxClauseCount)
}

View file

@ -62,7 +62,7 @@ func newDisjunctionHeapSearcher(indexReader index.IndexReader,
limit bool) (
*DisjunctionHeapSearcher, error) {
if limit && tooManyClauses(len(searchers)) {
return nil, tooManyClausesErr()
return nil, tooManyClausesErr(len(searchers))
}
// build our searcher

View file

@ -50,7 +50,7 @@ func newDisjunctionSliceSearcher(indexReader index.IndexReader,
limit bool) (
*DisjunctionSliceSearcher, error) {
if limit && tooManyClauses(len(qsearchers)) {
return nil, tooManyClausesErr()
return nil, tooManyClausesErr(len(qsearchers))
}
// build the downstream searchers
searchers := make(OrderedSearcherList, len(qsearchers))

View file

@ -31,6 +31,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
}
if fuzziness < 0 {
return nil, fmt.Errorf("invalid fuzziness, negative")
}
// Note: we don't byte slice the term for a prefix because of runes.
prefixTerm := ""
for i, r := range term {
@ -53,32 +57,40 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
fuzziness int, field, prefixTerm string) (rv []string, err error) {
rv = make([]string, 0)
// in case of advanced reader implementations directly call
// the levenshtein automaton based iterator to collect the
// candidate terms
if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm)
if err != nil {
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv = append(rv, tfd.Term)
if tooManyClauses(len(rv)) {
return nil, tooManyClausesErr(len(rv))
}
tfd, err = fieldDict.Next()
}
return rv, err
}
var fieldDict index.FieldDict
if len(prefixTerm) > 0 {
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
} else {
// in case of advanced reader implementations directly call
// the levenshtein automaton based iterator to collect the
// candidate terms
if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness)
if err != nil {
return rv, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv = append(rv, tfd.Term)
tfd, err = fieldDict.Next()
}
return rv, err
}
fieldDict, err = indexReader.FieldDict(field)
}
if err != nil {
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
@ -95,7 +107,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
if !exceeded && ld <= fuzziness {
rv = append(rv, tfd.Term)
if tooManyClauses(len(rv)) {
return rv, tooManyClausesErr()
return nil, tooManyClausesErr(len(rv))
}
}
tfd, err = fieldDict.Next()

View file

@ -22,6 +22,11 @@ import (
"github.com/blevesearch/bleve/search"
)
type filterFunc func(key []byte) bool
var GeoBitsShift1 = (geo.GeoBits << 1)
var GeoBitsShift1Minus1 = GeoBitsShift1 - 1
func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat,
maxLon, maxLat float64, field string, boost float64,
options search.SearcherOptions, checkBoundaries bool) (
@ -36,10 +41,18 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat,
}
// do math to produce list of terms needed for this search
onBoundaryTerms, notOnBoundaryTerms := ComputeGeoRange(0, (geo.GeoBits<<1)-1,
minLon, minLat, maxLon, maxLat, checkBoundaries)
onBoundaryTerms, notOnBoundaryTerms, err := ComputeGeoRange(0, GeoBitsShift1Minus1,
minLon, minLat, maxLon, maxLat, checkBoundaries, indexReader, field)
if err != nil {
return nil, err
}
var onBoundarySearcher search.Searcher
dvReader, err := indexReader.DocValueReader([]string{field})
if err != nil {
return nil, err
}
if len(onBoundaryTerms) > 0 {
rawOnBoundarySearcher, err := NewMultiTermSearcherBytes(indexReader,
onBoundaryTerms, field, boost, options, false)
@ -48,7 +61,7 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat,
}
// add filter to check points near the boundary
onBoundarySearcher = NewFilteringSearcher(rawOnBoundarySearcher,
buildRectFilter(indexReader, field, minLon, minLat, maxLon, maxLat))
buildRectFilter(dvReader, field, minLon, minLat, maxLon, maxLat))
openedSearchers = append(openedSearchers, onBoundarySearcher)
}
@ -89,84 +102,152 @@ var geoMaxShift = document.GeoPrecisionStep * 4
var geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2
func ComputeGeoRange(term uint64, shift uint,
sminLon, sminLat, smaxLon, smaxLat float64,
checkBoundaries bool) (
onBoundary [][]byte, notOnBoundary [][]byte) {
split := term | uint64(0x1)<<shift
var upperMax uint64
if shift < 63 {
upperMax = term | ((uint64(1) << (shift + 1)) - 1)
} else {
upperMax = 0xffffffffffffffff
}
lowerMax := split - 1
onBoundary, notOnBoundary = relateAndRecurse(term, lowerMax, shift,
sminLon, sminLat, smaxLon, smaxLat, checkBoundaries)
plusOnBoundary, plusNotOnBoundary := relateAndRecurse(split, upperMax, shift,
sminLon, sminLat, smaxLon, smaxLat, checkBoundaries)
onBoundary = append(onBoundary, plusOnBoundary...)
notOnBoundary = append(notOnBoundary, plusNotOnBoundary...)
return
}
sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool,
indexReader index.IndexReader, field string) (
onBoundary [][]byte, notOnBoundary [][]byte, err error) {
preallocBytesLen := 32
preallocBytes := make([]byte, preallocBytesLen)
func relateAndRecurse(start, end uint64, res uint,
sminLon, sminLat, smaxLon, smaxLat float64,
checkBoundaries bool) (
onBoundary [][]byte, notOnBoundary [][]byte) {
minLon := geo.MortonUnhashLon(start)
minLat := geo.MortonUnhashLat(start)
maxLon := geo.MortonUnhashLon(end)
maxLat := geo.MortonUnhashLat(end)
level := ((geo.GeoBits << 1) - res) >> 1
within := res%document.GeoPrecisionStep == 0 &&
geo.RectWithin(minLon, minLat, maxLon, maxLat,
sminLon, sminLat, smaxLon, smaxLat)
if within || (level == geoDetailLevel &&
geo.RectIntersects(minLon, minLat, maxLon, maxLat,
sminLon, sminLat, smaxLon, smaxLat)) {
if !within && checkBoundaries {
return [][]byte{
numeric.MustNewPrefixCodedInt64(int64(start), res),
}, nil
makePrefixCoded := func(in int64, shift uint) (rv numeric.PrefixCoded) {
if len(preallocBytes) <= 0 {
preallocBytesLen = preallocBytesLen * 2
preallocBytes = make([]byte, preallocBytesLen)
}
return nil,
[][]byte{
numeric.MustNewPrefixCodedInt64(int64(start), res),
}
} else if level < geoDetailLevel &&
geo.RectIntersects(minLon, minLat, maxLon, maxLat,
sminLon, sminLat, smaxLon, smaxLat) {
return ComputeGeoRange(start, res-1, sminLon, sminLat, smaxLon, smaxLat,
checkBoundaries)
rv, preallocBytes, err =
numeric.NewPrefixCodedInt64Prealloc(in, shift, preallocBytes)
return rv
}
return nil, nil
var fieldDict index.FieldDictContains
var isIndexed filterFunc
if irr, ok := indexReader.(index.IndexReaderContains); ok {
fieldDict, err = irr.FieldDictContains(field)
if err != nil {
return nil, nil, err
}
isIndexed = func(term []byte) bool {
found, err := fieldDict.Contains(term)
return err == nil && found
}
}
defer func() {
if fieldDict != nil {
if fd, ok := fieldDict.(index.FieldDict); ok {
cerr := fd.Close()
if cerr != nil {
err = cerr
}
}
}
}()
if isIndexed == nil {
isIndexed = func(term []byte) bool {
if indexReader != nil {
reader, err := indexReader.TermFieldReader(term, field, false, false, false)
if err != nil || reader == nil {
return false
}
if reader.Count() == 0 {
_ = reader.Close()
return false
}
_ = reader.Close()
}
return true
}
}
var computeGeoRange func(term uint64, shift uint) // declare for recursion
relateAndRecurse := func(start, end uint64, res, level uint) {
minLon := geo.MortonUnhashLon(start)
minLat := geo.MortonUnhashLat(start)
maxLon := geo.MortonUnhashLon(end)
maxLat := geo.MortonUnhashLat(end)
within := res%document.GeoPrecisionStep == 0 &&
geo.RectWithin(minLon, minLat, maxLon, maxLat,
sminLon, sminLat, smaxLon, smaxLat)
if within || (level == geoDetailLevel &&
geo.RectIntersects(minLon, minLat, maxLon, maxLat,
sminLon, sminLat, smaxLon, smaxLat)) {
codedTerm := makePrefixCoded(int64(start), res)
if isIndexed(codedTerm) {
if !within && checkBoundaries {
onBoundary = append(onBoundary, codedTerm)
} else {
notOnBoundary = append(notOnBoundary, codedTerm)
}
}
} else if level < geoDetailLevel &&
geo.RectIntersects(minLon, minLat, maxLon, maxLat,
sminLon, sminLat, smaxLon, smaxLat) {
computeGeoRange(start, res-1)
}
}
computeGeoRange = func(term uint64, shift uint) {
if err != nil {
return
}
split := term | uint64(0x1)<<shift
var upperMax uint64
if shift < 63 {
upperMax = term | ((uint64(1) << (shift + 1)) - 1)
} else {
upperMax = 0xffffffffffffffff
}
lowerMax := split - 1
level := (GeoBitsShift1 - shift) >> 1
relateAndRecurse(term, lowerMax, shift, level)
relateAndRecurse(split, upperMax, shift, level)
}
computeGeoRange(term, shift)
if err != nil {
return nil, nil, err
}
return onBoundary, notOnBoundary, err
}
func buildRectFilter(indexReader index.IndexReader, field string,
func buildRectFilter(dvReader index.DocValueReader, field string,
minLon, minLat, maxLon, maxLat float64) FilterFunc {
return func(d *search.DocumentMatch) bool {
var lon, lat float64
// check geo matches against all numeric type terms indexed
var lons, lats []float64
var found bool
err := indexReader.DocumentVisitFieldTerms(d.IndexInternalID,
[]string{field}, func(field string, term []byte) {
// only consider the values which are shifted 0
prefixCoded := numeric.PrefixCoded(term)
shift, err := prefixCoded.Shift()
if err == nil && shift == 0 {
var i64 int64
i64, err = prefixCoded.Int64()
if err == nil {
lon = geo.MortonUnhashLon(uint64(i64))
lat = geo.MortonUnhashLat(uint64(i64))
found = true
}
err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) {
// only consider the values which are shifted 0
prefixCoded := numeric.PrefixCoded(term)
shift, err := prefixCoded.Shift()
if err == nil && shift == 0 {
var i64 int64
i64, err = prefixCoded.Int64()
if err == nil {
lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
found = true
}
})
}
})
if err == nil && found {
return geo.BoundingBoxContains(lon, lat,
minLon, minLat, maxLon, maxLat)
for i := range lons {
if geo.BoundingBoxContains(lons[i], lats[i],
minLon, minLat, maxLon, maxLat) {
return true
}
}
}
return false
}

View file

@ -34,14 +34,19 @@ func NewGeoPointDistanceSearcher(indexReader index.IndexReader, centerLon,
// build a searcher for the box
boxSearcher, err := boxSearcher(indexReader,
topLeftLon, topLeftLat, bottomRightLon, bottomRightLat,
field, boost, options)
field, boost, options, false)
if err != nil {
return nil, err
}
dvReader, err := indexReader.DocValueReader([]string{field})
if err != nil {
return nil, err
}
// wrap it in a filtering searcher which checks the actual distance
return NewFilteringSearcher(boxSearcher,
buildDistFilter(indexReader, field, centerLon, centerLat, dist)), nil
buildDistFilter(dvReader, field, centerLon, centerLat, dist)), nil
}
// boxSearcher builds a searcher for the described bounding box
@ -49,19 +54,20 @@ func NewGeoPointDistanceSearcher(indexReader index.IndexReader, centerLon,
// two boxes joined through a disjunction searcher
func boxSearcher(indexReader index.IndexReader,
topLeftLon, topLeftLat, bottomRightLon, bottomRightLat float64,
field string, boost float64, options search.SearcherOptions) (
field string, boost float64, options search.SearcherOptions, checkBoundaries bool) (
search.Searcher, error) {
if bottomRightLon < topLeftLon {
// cross date line, rewrite as two parts
leftSearcher, err := NewGeoBoundingBoxSearcher(indexReader,
-180, bottomRightLat, bottomRightLon, topLeftLat,
field, boost, options, false)
field, boost, options, checkBoundaries)
if err != nil {
return nil, err
}
rightSearcher, err := NewGeoBoundingBoxSearcher(indexReader,
topLeftLon, bottomRightLat, 180, topLeftLat, field, boost, options, false)
topLeftLon, bottomRightLat, 180, topLeftLat, field, boost, options,
checkBoundaries)
if err != nil {
_ = leftSearcher.Close()
return nil, err
@ -77,39 +83,42 @@ func boxSearcher(indexReader index.IndexReader,
return boxSearcher, nil
}
// build geoboundinggox searcher for that bounding box
// build geoboundingbox searcher for that bounding box
boxSearcher, err := NewGeoBoundingBoxSearcher(indexReader,
topLeftLon, bottomRightLat, bottomRightLon, topLeftLat, field, boost,
options, false)
options, checkBoundaries)
if err != nil {
return nil, err
}
return boxSearcher, nil
}
func buildDistFilter(indexReader index.IndexReader, field string,
func buildDistFilter(dvReader index.DocValueReader, field string,
centerLon, centerLat, maxDist float64) FilterFunc {
return func(d *search.DocumentMatch) bool {
var lon, lat float64
// check geo matches against all numeric type terms indexed
var lons, lats []float64
var found bool
err := indexReader.DocumentVisitFieldTerms(d.IndexInternalID,
[]string{field}, func(field string, term []byte) {
// only consider the values which are shifted 0
prefixCoded := numeric.PrefixCoded(term)
shift, err := prefixCoded.Shift()
if err == nil && shift == 0 {
i64, err := prefixCoded.Int64()
if err == nil {
lon = geo.MortonUnhashLon(uint64(i64))
lat = geo.MortonUnhashLat(uint64(i64))
found = true
}
err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) {
// only consider the values which are shifted 0
prefixCoded := numeric.PrefixCoded(term)
shift, err := prefixCoded.Shift()
if err == nil && shift == 0 {
i64, err := prefixCoded.Int64()
if err == nil {
lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
found = true
}
})
}
})
if err == nil && found {
dist := geo.Haversin(lon, lat, centerLon, centerLat)
if dist <= maxDist/1000 {
return true
for i := range lons {
dist := geo.Haversin(lons[i], lats[i], centerLon, centerLat)
if dist <= maxDist/1000 {
return true
}
}
}
return false

View file

@ -0,0 +1,126 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package searcher
import (
"fmt"
"github.com/blevesearch/bleve/geo"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/numeric"
"github.com/blevesearch/bleve/search"
"math"
)
func NewGeoBoundedPolygonSearcher(indexReader index.IndexReader,
polygon []geo.Point, field string, boost float64,
options search.SearcherOptions) (search.Searcher, error) {
if len(polygon) < 3 {
return nil, fmt.Errorf("Too few points specified for the polygon boundary")
}
// compute the bounding box enclosing the polygon
topLeftLon, topLeftLat, bottomRightLon, bottomRightLat, err :=
geo.BoundingRectangleForPolygon(polygon)
if err != nil {
return nil, err
}
// build a searcher for the bounding box on the polygon
boxSearcher, err := boxSearcher(indexReader,
topLeftLon, topLeftLat, bottomRightLon, bottomRightLat,
field, boost, options, true)
if err != nil {
return nil, err
}
dvReader, err := indexReader.DocValueReader([]string{field})
if err != nil {
return nil, err
}
// wrap it in a filtering searcher that checks for the polygon inclusivity
return NewFilteringSearcher(boxSearcher,
buildPolygonFilter(dvReader, field, polygon)), nil
}
const float64EqualityThreshold = 1e-6
func almostEqual(a, b float64) bool {
return math.Abs(a-b) <= float64EqualityThreshold
}
// buildPolygonFilter returns true if the point lies inside the
// polygon. It is based on the ray-casting technique as referred
// here: https://wrf.ecse.rpi.edu/nikola/pubdetails/pnpoly.html
func buildPolygonFilter(dvReader index.DocValueReader, field string,
polygon []geo.Point) FilterFunc {
return func(d *search.DocumentMatch) bool {
// check geo matches against all numeric type terms indexed
var lons, lats []float64
var found bool
err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) {
// only consider the values which are shifted 0
prefixCoded := numeric.PrefixCoded(term)
shift, err := prefixCoded.Shift()
if err == nil && shift == 0 {
i64, err := prefixCoded.Int64()
if err == nil {
lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
found = true
}
}
})
// Note: this approach works for points which are strictly inside
// the polygon. ie it might fail for certain points on the polygon boundaries.
if err == nil && found {
nVertices := len(polygon)
if len(polygon) < 3 {
return false
}
rayIntersectsSegment := func(point, a, b geo.Point) bool {
return (a.Lat > point.Lat) != (b.Lat > point.Lat) &&
point.Lon < (b.Lon-a.Lon)*(point.Lat-a.Lat)/(b.Lat-a.Lat)+a.Lon
}
for i := range lons {
pt := geo.Point{Lon: lons[i], Lat: lats[i]}
inside := rayIntersectsSegment(pt, polygon[len(polygon)-1], polygon[0])
// check for a direct vertex match
if almostEqual(polygon[0].Lat, lats[i]) &&
almostEqual(polygon[0].Lon, lons[i]) {
return true
}
for j := 1; j < nVertices; j++ {
if almostEqual(polygon[j].Lat, lats[i]) &&
almostEqual(polygon[j].Lon, lons[i]) {
return true
}
if rayIntersectsSegment(pt, polygon[j-1], polygon[j]) {
inside = !inside
}
}
if inside {
return true
}
}
}
return false
}
}

View file

@ -22,6 +22,10 @@ import (
func NewMultiTermSearcher(indexReader index.IndexReader, terms []string,
field string, boost float64, options search.SearcherOptions, limit bool) (
search.Searcher, error) {
if limit && tooManyClauses(len(terms)) {
return nil, tooManyClausesErr(len(terms))
}
qsearchers := make([]search.Searcher, len(terms))
qsearchersClose := func() {
for _, searcher := range qsearchers {
@ -46,6 +50,10 @@ func NewMultiTermSearcher(indexReader index.IndexReader, terms []string,
func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte,
field string, boost float64, options search.SearcherOptions, limit bool) (
search.Searcher, error) {
if limit && tooManyClauses(len(terms)) {
return nil, tooManyClausesErr(len(terms))
}
qsearchers := make([]search.Searcher, len(terms))
qsearchersClose := func() {
for _, searcher := range qsearchers {

View file

@ -53,22 +53,51 @@ func NewNumericRangeSearcher(indexReader index.IndexReader,
if !*inclusiveMax && maxInt64 != math.MinInt64 {
maxInt64--
}
var fieldDict index.FieldDictContains
var isIndexed filterFunc
var err error
if irr, ok := indexReader.(index.IndexReaderContains); ok {
fieldDict, err = irr.FieldDictContains(field)
if err != nil {
return nil, err
}
isIndexed = func(term []byte) bool {
found, err := fieldDict.Contains(term)
return err == nil && found
}
}
// FIXME hard-coded precision, should match field declaration
termRanges := splitInt64Range(minInt64, maxInt64, 4)
terms := termRanges.Enumerate()
terms := termRanges.Enumerate(isIndexed)
if fieldDict != nil {
if fd, ok := fieldDict.(index.FieldDict); ok {
cerr := fd.Close()
if cerr != nil {
err = cerr
}
}
}
if len(terms) < 1 {
// cannot return MatchNoneSearcher because of interaction with
// commit f391b991c20f02681bacd197afc6d8aed444e132
return NewMultiTermSearcherBytes(indexReader, terms, field, boost, options,
true)
}
var err error
terms, err = filterCandidateTerms(indexReader, terms, field)
if err != nil {
return nil, err
// for upside_down
if isIndexed == nil {
terms, err = filterCandidateTerms(indexReader, terms, field)
if err != nil {
return nil, err
}
}
if tooManyClauses(len(terms)) {
return nil, tooManyClausesErr()
return nil, tooManyClausesErr(len(terms))
}
return NewMultiTermSearcherBytes(indexReader, terms, field, boost, options,
@ -125,11 +154,17 @@ type termRange struct {
endTerm []byte
}
func (t *termRange) Enumerate() [][]byte {
func (t *termRange) Enumerate(filter filterFunc) [][]byte {
var rv [][]byte
next := t.startTerm
for bytes.Compare(next, t.endTerm) <= 0 {
rv = append(rv, next)
if filter != nil {
if filter(next) {
rv = append(rv, next)
}
} else {
rv = append(rv, next)
}
next = incrementBytes(next)
}
return rv
@ -150,10 +185,10 @@ func incrementBytes(in []byte) []byte {
type termRanges []*termRange
func (tr termRanges) Enumerate() [][]byte {
func (tr termRanges) Enumerate(filter filterFunc) [][]byte {
var rv [][]byte
for _, tri := range tr {
trie := tri.Enumerate()
trie := tri.Enumerate(filter)
rv = append(rv, trie...)
}
return rv

View file

@ -32,7 +32,7 @@ func init() {
}
type PhraseSearcher struct {
mustSearcher *ConjunctionSearcher
mustSearcher search.Searcher
queryNorm float64
currMust *search.DocumentMatch
terms [][]string
@ -210,7 +210,7 @@ func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch,
return nil, nil
}
// checkCurrMustMatch is soley concerned with determining if the DocumentMatch
// checkCurrMustMatch is solely concerned with determining if the DocumentMatch
// pointed to by s.currMust (which satisifies the pre-condition searcher)
// also satisfies the phase constraints. if so, it returns a DocumentMatch
// for this document, otherwise nil
@ -241,7 +241,7 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D
return nil
}
// checkCurrMustMatchField is soley concerned with determining if one
// checkCurrMustMatchField is solely concerned with determining if one
// particular field within the currMust DocumentMatch Locations
// satisfies the phase constraints (possibly more than once). if so,
// the matching field term locations are appended to the provided

View file

@ -21,48 +21,67 @@ import (
"github.com/blevesearch/bleve/search"
)
// NewRegexpStringSearcher is similar to NewRegexpSearcher, but
// additionally optimizes for index readers that handle regexp's.
func NewRegexpStringSearcher(indexReader index.IndexReader, pattern string,
field string, boost float64, options search.SearcherOptions) (
search.Searcher, error) {
ir, ok := indexReader.(index.IndexReaderRegexp)
if !ok {
r, err := regexp.Compile(pattern)
if err != nil {
return nil, err
}
return NewRegexpSearcher(indexReader, r, field, boost, options)
}
fieldDict, err := ir.FieldDictRegexp(field, pattern)
if err != nil {
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
var candidateTerms []string
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
candidateTerms = append(candidateTerms, tfd.Term)
tfd, err = fieldDict.Next()
}
if err != nil {
return nil, err
}
return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,
options, true)
}
// NewRegexpSearcher creates a searcher which will match documents that
// contain terms which match the pattern regexp. The match must be EXACT
// matching the entire term. The provided regexp SHOULD NOT start with ^
// or end with $ as this can intefere with the implementation. Separately,
// matches will be checked to ensure they match the entire term.
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
func NewRegexpSearcher(indexReader index.IndexReader, pattern index.Regexp,
field string, boost float64, options search.SearcherOptions) (
search.Searcher, error) {
var candidateTerms []string
if ir, ok := indexReader.(index.IndexReaderRegexp); ok {
fieldDict, err := ir.FieldDictRegexp(field, []byte(pattern.String()))
if err != nil {
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
// enumerate the terms and check against regexp
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
candidateTerms = append(candidateTerms, tfd.Term)
tfd, err = fieldDict.Next()
}
prefixTerm, complete := pattern.LiteralPrefix()
if complete {
// there is no pattern
candidateTerms = []string{prefixTerm}
} else {
var err error
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
prefixTerm)
if err != nil {
return nil, err
}
} else {
prefixTerm, complete := pattern.LiteralPrefix()
if complete {
// there is no pattern
candidateTerms = []string{prefixTerm}
} else {
var err error
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
prefixTerm)
if err != nil {
return nil, err
}
}
}
return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,
@ -70,7 +89,7 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
}
func findRegexpCandidateTerms(indexReader index.IndexReader,
pattern *regexp.Regexp, field, prefixTerm string) (rv []string, err error) {
pattern index.Regexp, field, prefixTerm string) (rv []string, err error) {
rv = make([]string, 0)
var fieldDict index.FieldDict
if len(prefixTerm) > 0 {
@ -91,7 +110,7 @@ func findRegexpCandidateTerms(indexReader index.IndexReader,
if matchPos != nil && matchPos[0] == 0 && matchPos[1] == len(tfd.Term) {
rv = append(rv, tfd.Term)
if tooManyClauses(len(rv)) {
return rv, tooManyClausesErr()
return rv, tooManyClausesErr(len(rv))
}
}
tfd, err = fieldDict.Next()

View file

@ -38,28 +38,20 @@ type TermSearcher struct {
}
func NewTermSearcher(indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) {
reader, err := indexReader.TermFieldReader([]byte(term), field, true, true, options.IncludeTermVectors)
if err != nil {
return nil, err
}
count, err := indexReader.DocCount()
if err != nil {
_ = reader.Close()
return nil, err
}
scorer := scorer.NewTermQueryScorer([]byte(term), field, boost, count, reader.Count(), options)
return &TermSearcher{
indexReader: indexReader,
reader: reader,
scorer: scorer,
}, nil
return NewTermSearcherBytes(indexReader, []byte(term), field, boost, options)
}
func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) {
reader, err := indexReader.TermFieldReader(term, field, true, true, options.IncludeTermVectors)
needFreqNorm := options.Score != "none"
reader, err := indexReader.TermFieldReader(term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors)
if err != nil {
return nil, err
}
return newTermSearcherFromReader(indexReader, reader, term, field, boost, options)
}
func newTermSearcherFromReader(indexReader index.IndexReader, reader index.TermFieldReader,
term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) {
count, err := indexReader.DocCount()
if err != nil {
_ = reader.Close()

View file

@ -27,13 +27,24 @@ func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string,
if err != nil {
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
var terms []string
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
terms = append(terms, tfd.Term)
if tooManyClauses(len(terms)) {
return nil, tooManyClausesErr(len(terms))
}
tfd, err = fieldDict.Next()
}
if err != nil {
return nil, err
}
return NewMultiTermSearcher(indexReader, terms, field, boost, options, true)
}

View file

@ -48,6 +48,12 @@ func NewTermRangeSearcher(indexReader index.IndexReader,
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
var terms []string
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {

View file

@ -38,6 +38,8 @@ type SearchSort interface {
RequiresScoring() bool
RequiresFields() []string
Reverse()
Copy() SearchSort
}
@ -293,6 +295,12 @@ func (so SortOrder) CacheDescending() []bool {
return rv
}
func (so SortOrder) Reverse() {
for _, soi := range so {
soi.Reverse()
}
}
// SortFieldType lets you control some internal sort behavior
// normally leaving this to the zero-value of SortFieldAuto is fine
type SortFieldType int
@ -492,6 +500,15 @@ func (s *SortField) Copy() SearchSort {
return &rv
}
func (s *SortField) Reverse() {
s.Desc = !s.Desc
if s.Missing == SortFieldMissingFirst {
s.Missing = SortFieldMissingLast
} else {
s.Missing = SortFieldMissingFirst
}
}
// SortDocID will sort results by the document identifier
type SortDocID struct {
Desc bool
@ -533,6 +550,10 @@ func (s *SortDocID) Copy() SearchSort {
return &rv
}
func (s *SortDocID) Reverse() {
s.Desc = !s.Desc
}
// SortScore will sort results by the document match score
type SortScore struct {
Desc bool
@ -574,6 +595,10 @@ func (s *SortScore) Copy() SearchSort {
return &rv
}
func (s *SortScore) Reverse() {
s.Desc = !s.Desc
}
var maxDistance = string(numeric.MustNewPrefixCodedInt64(math.MaxInt64, 0))
// NewSortGeoDistance creates SearchSort instance for sorting documents by
@ -705,6 +730,10 @@ func (s *SortGeoDistance) Copy() SearchSort {
return &rv
}
func (s *SortGeoDistance) Reverse() {
s.Desc = !s.Desc
}
type BytesSlice [][]byte
func (p BytesSlice) Len() int { return len(p) }

View file

@ -0,0 +1,3 @@
module github.com/blevesearch/go-porterstemmer
go 1.13

10
vendor/github.com/blevesearch/mmap-go/.gitignore generated vendored Normal file
View file

@ -0,0 +1,10 @@
*.out
*.5
*.6
*.8
*.swp
_obj
_test
testdata
/.idea
*.iml

16
vendor/github.com/blevesearch/mmap-go/.travis.yml generated vendored Normal file
View file

@ -0,0 +1,16 @@
language: go
os:
- linux
- osx
- windows
go:
- 1.11.4
env:
global:
- GO111MODULE=on
install:
- go mod download
- go get github.com/mattn/goveralls
script:
- go test -v -covermode=count -coverprofile=coverage.out -bench . -cpu 1,4
- '[ "${TRAVIS_PULL_REQUEST}" = "false" ] && $HOME/gopath/bin/goveralls -coverprofile=coverage.out -service=travis-ci -repotoken $COVERALLS_TOKEN || true'

25
vendor/github.com/blevesearch/mmap-go/LICENSE generated vendored Normal file
View file

@ -0,0 +1,25 @@
Copyright (c) 2011, Evan Shaw <edsrzf@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

12
vendor/github.com/blevesearch/mmap-go/README.md generated vendored Normal file
View file

@ -0,0 +1,12 @@
mmap-go
=======
mmap-go is a portable mmap package for the [Go programming language](http://golang.org).
It has been tested on Linux (386, amd64), OS X, and Windows (386). It should also
work on other Unix-like platforms, but hasn't been tested with them. I'm interested
to hear about the results.
I haven't been able to add more features without adding significant complexity,
so mmap-go doesn't support mprotect, mincore, and maybe a few other things.
If you're running on a Unix-like platform and need some of these features,
I suggest Gustavo Niemeyer's [gommap](http://labix.org/gommap).

3
vendor/github.com/blevesearch/mmap-go/go.mod generated vendored Normal file
View file

@ -0,0 +1,3 @@
module github.com/blevesearch/mmap-go
require golang.org/x/sys v0.0.0-20181221143128-b4a75ba826a6

2
vendor/github.com/blevesearch/mmap-go/go.sum generated vendored Normal file
View file

@ -0,0 +1,2 @@
golang.org/x/sys v0.0.0-20181221143128-b4a75ba826a6 h1:IcgEB62HYgAhX0Nd/QrVgZlxlcyxbGQHElLUhW2X4Fo=
golang.org/x/sys v0.0.0-20181221143128-b4a75ba826a6/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=

117
vendor/github.com/blevesearch/mmap-go/mmap.go generated vendored Normal file
View file

@ -0,0 +1,117 @@
// Copyright 2011 Evan Shaw. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file defines the common package interface and contains a little bit of
// factored out logic.
// Package mmap allows mapping files into memory. It tries to provide a simple, reasonably portable interface,
// but doesn't go out of its way to abstract away every little platform detail.
// This specifically means:
// * forked processes may or may not inherit mappings
// * a file's timestamp may or may not be updated by writes through mappings
// * specifying a size larger than the file's actual size can increase the file's size
// * If the mapped file is being modified by another process while your program's running, don't expect consistent results between platforms
package mmap
import (
"errors"
"os"
"reflect"
"unsafe"
)
const (
// RDONLY maps the memory read-only.
// Attempts to write to the MMap object will result in undefined behavior.
RDONLY = 0
// RDWR maps the memory as read-write. Writes to the MMap object will update the
// underlying file.
RDWR = 1 << iota
// COPY maps the memory as copy-on-write. Writes to the MMap object will affect
// memory, but the underlying file will remain unchanged.
COPY
// If EXEC is set, the mapped memory is marked as executable.
EXEC
)
const (
// If the ANON flag is set, the mapped memory will not be backed by a file.
ANON = 1 << iota
)
// MMap represents a file mapped into memory.
type MMap []byte
// Map maps an entire file into memory.
// If ANON is set in flags, f is ignored.
func Map(f *os.File, prot, flags int) (MMap, error) {
return MapRegion(f, -1, prot, flags, 0)
}
// MapRegion maps part of a file into memory.
// The offset parameter must be a multiple of the system's page size.
// If length < 0, the entire file will be mapped.
// If ANON is set in flags, f is ignored.
func MapRegion(f *os.File, length int, prot, flags int, offset int64) (MMap, error) {
if offset%int64(os.Getpagesize()) != 0 {
return nil, errors.New("offset parameter must be a multiple of the system's page size")
}
var fd uintptr
if flags&ANON == 0 {
fd = uintptr(f.Fd())
if length < 0 {
fi, err := f.Stat()
if err != nil {
return nil, err
}
length = int(fi.Size())
}
} else {
if length <= 0 {
return nil, errors.New("anonymous mapping requires non-zero length")
}
fd = ^uintptr(0)
}
return mmap(length, uintptr(prot), uintptr(flags), fd, offset)
}
func (m *MMap) header() *reflect.SliceHeader {
return (*reflect.SliceHeader)(unsafe.Pointer(m))
}
func (m *MMap) addrLen() (uintptr, uintptr) {
header := m.header()
return header.Data, uintptr(header.Len)
}
// Lock keeps the mapped region in physical memory, ensuring that it will not be
// swapped out.
func (m MMap) Lock() error {
return m.lock()
}
// Unlock reverses the effect of Lock, allowing the mapped region to potentially
// be swapped out.
// If m is already unlocked, aan error will result.
func (m MMap) Unlock() error {
return m.unlock()
}
// Flush synchronizes the mapping's contents to the file's contents on disk.
func (m MMap) Flush() error {
return m.flush()
}
// Unmap deletes the memory mapped region, flushes any remaining changes, and sets
// m to nil.
// Trying to read or write any remaining references to m after Unmap is called will
// result in undefined behavior.
// Unmap should only be called on the slice value that was originally returned from
// a call to Map. Calling Unmap on a derived slice may cause errors.
func (m *MMap) Unmap() error {
err := m.unmap()
*m = nil
return err
}

51
vendor/github.com/blevesearch/mmap-go/mmap_unix.go generated vendored Normal file
View file

@ -0,0 +1,51 @@
// Copyright 2011 Evan Shaw. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build darwin dragonfly freebsd linux openbsd solaris netbsd
package mmap
import (
"golang.org/x/sys/unix"
)
func mmap(len int, inprot, inflags, fd uintptr, off int64) ([]byte, error) {
flags := unix.MAP_SHARED
prot := unix.PROT_READ
switch {
case inprot&COPY != 0:
prot |= unix.PROT_WRITE
flags = unix.MAP_PRIVATE
case inprot&RDWR != 0:
prot |= unix.PROT_WRITE
}
if inprot&EXEC != 0 {
prot |= unix.PROT_EXEC
}
if inflags&ANON != 0 {
flags |= unix.MAP_ANON
}
b, err := unix.Mmap(int(fd), off, len, prot, flags)
if err != nil {
return nil, err
}
return b, nil
}
func (m MMap) flush() error {
return unix.Msync([]byte(m), unix.MS_SYNC)
}
func (m MMap) lock() error {
return unix.Mlock([]byte(m))
}
func (m MMap) unlock() error {
return unix.Munlock([]byte(m))
}
func (m MMap) unmap() error {
return unix.Munmap([]byte(m))
}

153
vendor/github.com/blevesearch/mmap-go/mmap_windows.go generated vendored Normal file
View file

@ -0,0 +1,153 @@
// Copyright 2011 Evan Shaw. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package mmap
import (
"errors"
"os"
"sync"
"golang.org/x/sys/windows"
)
// mmap on Windows is a two-step process.
// First, we call CreateFileMapping to get a handle.
// Then, we call MapviewToFile to get an actual pointer into memory.
// Because we want to emulate a POSIX-style mmap, we don't want to expose
// the handle -- only the pointer. We also want to return only a byte slice,
// not a struct, so it's convenient to manipulate.
// We keep this map so that we can get back the original handle from the memory address.
type addrinfo struct {
file windows.Handle
mapview windows.Handle
writable bool
}
var handleLock sync.Mutex
var handleMap = map[uintptr]*addrinfo{}
func mmap(len int, prot, flags, hfile uintptr, off int64) ([]byte, error) {
flProtect := uint32(windows.PAGE_READONLY)
dwDesiredAccess := uint32(windows.FILE_MAP_READ)
writable := false
switch {
case prot&COPY != 0:
flProtect = windows.PAGE_WRITECOPY
dwDesiredAccess = windows.FILE_MAP_COPY
writable = true
case prot&RDWR != 0:
flProtect = windows.PAGE_READWRITE
dwDesiredAccess = windows.FILE_MAP_WRITE
writable = true
}
if prot&EXEC != 0 {
flProtect <<= 4
dwDesiredAccess |= windows.FILE_MAP_EXECUTE
}
// The maximum size is the area of the file, starting from 0,
// that we wish to allow to be mappable. It is the sum of
// the length the user requested, plus the offset where that length
// is starting from. This does not map the data into memory.
maxSizeHigh := uint32((off + int64(len)) >> 32)
maxSizeLow := uint32((off + int64(len)) & 0xFFFFFFFF)
// TODO: Do we need to set some security attributes? It might help portability.
h, errno := windows.CreateFileMapping(windows.Handle(hfile), nil, flProtect, maxSizeHigh, maxSizeLow, nil)
if h == 0 {
return nil, os.NewSyscallError("CreateFileMapping", errno)
}
// Actually map a view of the data into memory. The view's size
// is the length the user requested.
fileOffsetHigh := uint32(off >> 32)
fileOffsetLow := uint32(off & 0xFFFFFFFF)
addr, errno := windows.MapViewOfFile(h, dwDesiredAccess, fileOffsetHigh, fileOffsetLow, uintptr(len))
if addr == 0 {
return nil, os.NewSyscallError("MapViewOfFile", errno)
}
handleLock.Lock()
handleMap[addr] = &addrinfo{
file: windows.Handle(hfile),
mapview: h,
writable: writable,
}
handleLock.Unlock()
m := MMap{}
dh := m.header()
dh.Data = addr
dh.Len = len
dh.Cap = dh.Len
return m, nil
}
func (m MMap) flush() error {
addr, len := m.addrLen()
errno := windows.FlushViewOfFile(addr, len)
if errno != nil {
return os.NewSyscallError("FlushViewOfFile", errno)
}
handleLock.Lock()
defer handleLock.Unlock()
handle, ok := handleMap[addr]
if !ok {
// should be impossible; we would've errored above
return errors.New("unknown base address")
}
if handle.writable {
if err := windows.FlushFileBuffers(handle.file); err != nil {
return os.NewSyscallError("FlushFileBuffers", err)
}
}
return nil
}
func (m MMap) lock() error {
addr, len := m.addrLen()
errno := windows.VirtualLock(addr, len)
return os.NewSyscallError("VirtualLock", errno)
}
func (m MMap) unlock() error {
addr, len := m.addrLen()
errno := windows.VirtualUnlock(addr, len)
return os.NewSyscallError("VirtualUnlock", errno)
}
func (m MMap) unmap() error {
err := m.flush()
if err != nil {
return err
}
addr := m.header().Data
// Lock the UnmapViewOfFile along with the handleMap deletion.
// As soon as we unmap the view, the OS is free to give the
// same addr to another new map. We don't want another goroutine
// to insert and remove the same addr into handleMap while
// we're trying to remove our old addr/handle pair.
handleLock.Lock()
defer handleLock.Unlock()
err = windows.UnmapViewOfFile(addr)
if err != nil {
return err
}
handle, ok := handleMap[addr]
if !ok {
// should be impossible; we would've errored above
return errors.New("unknown base address")
}
delete(handleMap, addr)
e := windows.CloseHandle(windows.Handle(handle.mapview))
return os.NewSyscallError("CloseHandle", e)
}

3
vendor/github.com/blevesearch/segment/go.mod generated vendored Normal file
View file

@ -0,0 +1,3 @@
module github.com/blevesearch/segment
go 1.13

29
vendor/github.com/blevesearch/snowballstem/COPYING generated vendored Normal file
View file

@ -0,0 +1,29 @@
Copyright (c) 2001, Dr Martin Porter
Copyright (c) 2004,2005, Richard Boulton
Copyright (c) 2013, Yoshiki Shibukawa
Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the Snowball project nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

66
vendor/github.com/blevesearch/snowballstem/README.md generated vendored Normal file
View file

@ -0,0 +1,66 @@
# snowballstem
This repository contains the Go stemmers generated by the [Snowball](https://github.com/snowballstem/snowball) project. They are maintained outside of the core bleve package so that they may be more easily be reused in other contexts.
## Usage
All these stemmers export a single `Stem()` method which operates on a snowball `Env` structure. The `Env` structure maintains all state for the stemmer. A new `Env` is created to point at an initial string. After stemming, the results of the `Stem()` operation can be retrieved using the `Current()` method. The `Env` structure can be reused for subsequent calls by using the `SetCurrent()` method.
## Example
```
package main
import (
"fmt"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/english"
)
func main() {
// words to stem
words := []string{
"running",
"jumping",
}
// build new environment
env := snowballstem.NewEnv("")
for _, word := range words {
// set up environment for word
env.SetCurrent(word)
// invoke stemmer
english.Stem(env)
// print results
fmt.Printf("%s stemmed to %s\n", word, env.Current())
}
}
```
Produces Output:
```
$ ./snowtest
running stemmed to run
jumping stemmed to jump
```
## Testing
The test harness for these stemmers is hosted in the main [Snowball](https://github.com/snowballstem/snowball) repository. There are functional tests built around the separate [snowballstem-data](https://github.com/snowballstem/snowball-data) repository, and there is support for fuzz-testing the stemmers there as well.
## Generating the Stemmers
```
$ export SNOWBALL=/path/to/github.com/snowballstem/snowball/after/snowball/built
$ go generate
```
## Updated the Go Generate Commands
A simple tool is provided to automate these from the snowball algorithms directory:
```
$ go run gengen.go /path/to/github.com/snowballstem/snowball/algorithms
```

16
vendor/github.com/blevesearch/snowballstem/among.go generated vendored Normal file
View file

@ -0,0 +1,16 @@
package snowballstem
import "fmt"
type AmongF func(env *Env, ctx interface{}) bool
type Among struct {
Str string
A int32
B int32
F AmongF
}
func (a *Among) String() string {
return fmt.Sprintf("str: `%s`, a: %d, b: %d, f: %p", a.Str, a.A, a.B, a.F)
}

File diff suppressed because it is too large Load diff

389
vendor/github.com/blevesearch/snowballstem/env.go generated vendored Normal file
View file

@ -0,0 +1,389 @@
package snowballstem
import (
"log"
"strings"
"unicode/utf8"
)
// Env represents the Snowball execution environment
type Env struct {
current string
Cursor int
Limit int
LimitBackward int
Bra int
Ket int
}
// NewEnv creates a new Snowball execution environment on the provided string
func NewEnv(val string) *Env {
return &Env{
current: val,
Cursor: 0,
Limit: len(val),
LimitBackward: 0,
Bra: 0,
Ket: len(val),
}
}
func (env *Env) Current() string {
return env.current
}
func (env *Env) SetCurrent(s string) {
env.current = s
env.Cursor = 0
env.Limit = len(s)
env.LimitBackward = 0
env.Bra = 0
env.Ket = len(s)
}
func (env *Env) ReplaceS(bra, ket int, s string) int32 {
adjustment := int32(len(s)) - (int32(ket) - int32(bra))
result, _ := splitAt(env.current, bra)
rsplit := ket
if ket < bra {
rsplit = bra
}
_, rhs := splitAt(env.current, rsplit)
result += s
result += rhs
newLim := int32(env.Limit) + adjustment
env.Limit = int(newLim)
if env.Cursor >= ket {
newCur := int32(env.Cursor) + adjustment
env.Cursor = int(newCur)
} else if env.Cursor > bra {
env.Cursor = bra
}
env.current = result
return adjustment
}
func (env *Env) EqS(s string) bool {
if env.Cursor >= env.Limit {
return false
}
if strings.HasPrefix(env.current[env.Cursor:], s) {
env.Cursor += len(s)
for !onCharBoundary(env.current, env.Cursor) {
env.Cursor++
}
return true
}
return false
}
func (env *Env) EqSB(s string) bool {
if int32(env.Cursor)-int32(env.LimitBackward) < int32(len(s)) {
return false
} else if !onCharBoundary(env.current, env.Cursor-len(s)) ||
!strings.HasPrefix(env.current[env.Cursor-len(s):], s) {
return false
} else {
env.Cursor -= len(s)
return true
}
}
func (env *Env) SliceFrom(s string) bool {
bra, ket := env.Bra, env.Ket
env.ReplaceS(bra, ket, s)
return true
}
func (env *Env) NextChar() {
env.Cursor++
for !onCharBoundary(env.current, env.Cursor) {
env.Cursor++
}
}
func (env *Env) PrevChar() {
env.Cursor--
for !onCharBoundary(env.current, env.Cursor) {
env.Cursor--
}
}
func (env *Env) ByteIndexForHop(delta int32) int32 {
if delta > 0 {
res := env.Cursor
for delta > 0 {
res++
delta--
for res <= len(env.current) && !onCharBoundary(env.current, res) {
res++
}
}
return int32(res)
} else if delta < 0 {
res := env.Cursor
for delta < 0 {
res--
delta++
for res >= 0 && !onCharBoundary(env.current, res) {
res--
}
}
return int32(res)
} else {
return int32(env.Cursor)
}
}
func (env *Env) InGrouping(chars []byte, min, max int32) bool {
if env.Cursor >= env.Limit {
return false
}
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
if r != utf8.RuneError {
if r > max || r < min {
return false
}
r -= min
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
return false
}
env.NextChar()
return true
}
return false
}
func (env *Env) InGroupingB(chars []byte, min, max int32) bool {
if env.Cursor <= env.LimitBackward {
return false
}
env.PrevChar()
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
if r != utf8.RuneError {
env.NextChar()
if r > max || r < min {
return false
}
r -= min
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
return false
}
env.PrevChar()
return true
}
return false
}
func (env *Env) OutGrouping(chars []byte, min, max int32) bool {
if env.Cursor >= env.Limit {
return false
}
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
if r != utf8.RuneError {
if r > max || r < min {
env.NextChar()
return true
}
r -= min
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
env.NextChar()
return true
}
}
return false
}
func (env *Env) OutGroupingB(chars []byte, min, max int32) bool {
if env.Cursor <= env.LimitBackward {
return false
}
env.PrevChar()
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
if r != utf8.RuneError {
env.NextChar()
if r > max || r < min {
env.PrevChar()
return true
}
r -= min
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
env.PrevChar()
return true
}
}
return false
}
func (env *Env) SliceDel() bool {
return env.SliceFrom("")
}
func (env *Env) Insert(bra, ket int, s string) {
adjustment := env.ReplaceS(bra, ket, s)
if bra <= env.Bra {
env.Bra = int(int32(env.Bra) + adjustment)
}
if bra <= env.Ket {
env.Ket = int(int32(env.Ket) + adjustment)
}
}
func (env *Env) SliceTo() string {
return env.current[env.Bra:env.Ket]
}
func (env *Env) FindAmong(amongs []*Among, ctx interface{}) int32 {
var i int32
j := int32(len(amongs))
c := env.Cursor
l := env.Limit
var commonI, commonJ int
firstKeyInspected := false
for {
k := i + ((j - i) >> 1)
var diff int32
common := min(commonI, commonJ)
w := amongs[k]
for lvar := common; lvar < len(w.Str); lvar++ {
if c+common == l {
diff--
break
}
diff = int32(env.current[c+common]) - int32(w.Str[lvar])
if diff != 0 {
break
}
common++
}
if diff < 0 {
j = k
commonJ = common
} else {
i = k
commonI = common
}
if j-i <= 1 {
if i > 0 {
break
}
if j == i {
break
}
if firstKeyInspected {
break
}
firstKeyInspected = true
}
}
for {
w := amongs[i]
if commonI >= len(w.Str) {
env.Cursor = c + len(w.Str)
if w.F != nil {
res := w.F(env, ctx)
env.Cursor = c + len(w.Str)
if res {
return w.B
}
} else {
return w.B
}
}
i = w.A
if i < 0 {
return 0
}
}
}
func (env *Env) FindAmongB(amongs []*Among, ctx interface{}) int32 {
var i int32
j := int32(len(amongs))
c := env.Cursor
lb := env.LimitBackward
var commonI, commonJ int
firstKeyInspected := false
for {
k := i + ((j - i) >> 1)
diff := int32(0)
common := min(commonI, commonJ)
w := amongs[k]
for lvar := len(w.Str) - int(common) - 1; lvar >= 0; lvar-- {
if c-common == lb {
diff--
break
}
diff = int32(env.current[c-common-1]) - int32(w.Str[lvar])
if diff != 0 {
break
}
// Count up commons. But not one character but the byte width of that char
common++
}
if diff < 0 {
j = k
commonJ = common
} else {
i = k
commonI = common
}
if j-i <= 1 {
if i > 0 {
break
}
if j == i {
break
}
if firstKeyInspected {
break
}
firstKeyInspected = true
}
}
for {
w := amongs[i]
if commonI >= len(w.Str) {
env.Cursor = c - len(w.Str)
if w.F != nil {
res := w.F(env, ctx)
env.Cursor = c - len(w.Str)
if res {
return w.B
}
} else {
return w.B
}
}
i = w.A
if i < 0 {
return 0
}
}
}
func (env *Env) Debug(count, lineNumber int) {
log.Printf("snowball debug, count: %d, line: %d", count, lineNumber)
}
func (env *Env) Clone() *Env {
clone := *env
return &clone
}
func (env *Env) AssignTo() string {
return env.Current()
}

61
vendor/github.com/blevesearch/snowballstem/gen.go generated vendored Normal file
View file

@ -0,0 +1,61 @@
package snowballstem
// to regenerate these commands, run
// go run gengen.go /path/to/snowball/algorithms/directory
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/arabic/stem_Unicode.sbl -go -o arabic/arabic_stemmer -gop arabic -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w arabic/arabic_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/danish/stem_ISO_8859_1.sbl -go -o danish/danish_stemmer -gop danish -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w danish/danish_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/dutch/stem_ISO_8859_1.sbl -go -o dutch/dutch_stemmer -gop dutch -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w dutch/dutch_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/english/stem_ISO_8859_1.sbl -go -o english/english_stemmer -gop english -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w english/english_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/finnish/stem_ISO_8859_1.sbl -go -o finnish/finnish_stemmer -gop finnish -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w finnish/finnish_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/french/stem_ISO_8859_1.sbl -go -o french/french_stemmer -gop french -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w french/french_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/german/stem_ISO_8859_1.sbl -go -o german/german_stemmer -gop german -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w german/german_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/hungarian/stem_Unicode.sbl -go -o hungarian/hungarian_stemmer -gop hungarian -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w hungarian/hungarian_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/irish/stem_ISO_8859_1.sbl -go -o irish/irish_stemmer -gop irish -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w irish/irish_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/italian/stem_ISO_8859_1.sbl -go -o italian/italian_stemmer -gop italian -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w italian/italian_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/norwegian/stem_ISO_8859_1.sbl -go -o norwegian/norwegian_stemmer -gop norwegian -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w norwegian/norwegian_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/porter/stem_ISO_8859_1.sbl -go -o porter/porter_stemmer -gop porter -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w porter/porter_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/portuguese/stem_ISO_8859_1.sbl -go -o portuguese/portuguese_stemmer -gop portuguese -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w portuguese/portuguese_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/romanian/stem_Unicode.sbl -go -o romanian/romanian_stemmer -gop romanian -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w romanian/romanian_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/russian/stem_Unicode.sbl -go -o russian/russian_stemmer -gop russian -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w russian/russian_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/spanish/stem_ISO_8859_1.sbl -go -o spanish/spanish_stemmer -gop spanish -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w spanish/spanish_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/swedish/stem_ISO_8859_1.sbl -go -o swedish/swedish_stemmer -gop swedish -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w swedish/swedish_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/tamil/stem_Unicode.sbl -go -o tamil/tamil_stemmer -gop tamil -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w tamil/tamil_stemmer.go
//go:generate $SNOWBALL/snowball $SNOWBALL/algorithms/turkish/stem_Unicode.sbl -go -o turkish/turkish_stemmer -gop turkish -gor github.com/blevesearch/snowballstem
//go:generate gofmt -s -w turkish/turkish_stemmer.go

3
vendor/github.com/blevesearch/snowballstem/go.mod generated vendored Normal file
View file

@ -0,0 +1,3 @@
module github.com/blevesearch/snowballstem
go 1.13

34
vendor/github.com/blevesearch/snowballstem/util.go generated vendored Normal file
View file

@ -0,0 +1,34 @@
package snowballstem
import (
"math"
"unicode/utf8"
)
const MaxInt = math.MaxInt32
const MinInt = math.MinInt32
func splitAt(str string, mid int) (string, string) {
return str[:mid], str[mid:]
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
func onCharBoundary(s string, pos int) bool {
if pos <= 0 || pos >= len(s) {
return true
}
return utf8.RuneStart(s[pos])
}
// RuneCountInString is a wrapper around utf8.RuneCountInString
// this allows us to not have to conditionally include
// the utf8 package into some stemmers and not others
func RuneCountInString(str string) int {
return utf8.RuneCountInString(str)
}

12
vendor/github.com/blevesearch/zap/v11/.gitignore generated vendored Normal file
View file

@ -0,0 +1,12 @@
#*
*.sublime-*
*~
.#*
.project
.settings
**/.idea/
**/*.iml
.DS_Store
/cmd/zap/zap
*.test
tags

202
vendor/github.com/blevesearch/zap/v11/LICENSE generated vendored Normal file
View file

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -1,5 +1,7 @@
# zap file format
Advanced ZAP File Format Documentation is [here](zap.md).
The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written.
Current usage:
@ -90,16 +92,6 @@ If you know the doc number you're interested in, this format lets you jump to th
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
## bitmaps of hits with location info
- for each posting list
- preparation phase:
- encode roaring bitmap (inidicating which hits have location details indexed) posting list to bytes (so we know the length)
- file writing phase:
- remember the start position for this bitmap
- write length of encoded roaring bitmap
- write the serialized roaring bitmap data
## postings list section
- for each posting list
@ -109,7 +101,6 @@ If you know the doc number you're interested in, this format lets you jump to th
- remember the start position for this posting list
- write freq/norm details offset (remembered from previous, as varint uint64)
- write location details offset (remembered from previous, as varint uint64)
- write location bitmap offset (remembered from pervious, as varint uint64)
- write length of encoded roaring bitmap
- write the serialized roaring bitmap data

View file

@ -18,6 +18,8 @@ import (
"bufio"
"math"
"os"
"github.com/couchbase/vellum"
)
const Version uint32 = 11
@ -26,6 +28,10 @@ const Type string = "zap"
const fieldNotUninverted = math.MaxUint64
func (sb *SegmentBase) Persist(path string) error {
return PersistSegmentBase(sb, path)
}
// PersistSegmentBase persists SegmentBase in the zap file format.
func PersistSegmentBase(sb *SegmentBase, path string) error {
flag := os.O_RDWR | os.O_CREATE
@ -137,6 +143,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
docValueOffset: docValueOffset,
dictLocs: dictLocs,
fieldDvReaders: make(map[uint16]*docValueReader),
fieldFSTs: make(map[uint16]*vellum.FST),
}
sb.updateSize()

Some files were not shown because too many files have changed in this diff Show more