// Copyright (c) 2014 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package searcher import ( "fmt" "math" "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/size" ) var reflectStaticSizePhraseSearcher int func init() { var ps PhraseSearcher reflectStaticSizePhraseSearcher = int(reflect.TypeOf(ps).Size()) } type PhraseSearcher struct { mustSearcher *ConjunctionSearcher queryNorm float64 currMust *search.DocumentMatch terms [][]string path phrasePath paths []phrasePath locations []search.Location initialized bool } func (s *PhraseSearcher) Size() int { sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr if s.mustSearcher != nil { sizeInBytes += s.mustSearcher.Size() } if s.currMust != nil { sizeInBytes += s.currMust.Size() } for _, entry := range s.terms { sizeInBytes += size.SizeOfSlice for _, entry1 := range entry { sizeInBytes += size.SizeOfString + len(entry1) } } return sizeInBytes } func NewPhraseSearcher(indexReader index.IndexReader, terms []string, field string, options search.SearcherOptions) (*PhraseSearcher, error) { // turn flat terms []string into [][]string mterms := make([][]string, len(terms)) for i, term := range terms { mterms[i] = []string{term} } return NewMultiPhraseSearcher(indexReader, mterms, field, options) } func NewMultiPhraseSearcher(indexReader index.IndexReader, terms [][]string, field string, options search.SearcherOptions) (*PhraseSearcher, error) { options.IncludeTermVectors = true var termPositionSearchers []search.Searcher for _, termPos := range terms { if len(termPos) == 1 && termPos[0] != "" { // single term ts, err := NewTermSearcher(indexReader, termPos[0], field, 1.0, options) if err != nil { // close any searchers already opened for _, ts := range termPositionSearchers { _ = ts.Close() } return nil, fmt.Errorf("phrase searcher error building term searcher: %v", err) } termPositionSearchers = append(termPositionSearchers, ts) } else if len(termPos) > 1 { // multiple terms var termSearchers []search.Searcher for _, term := range termPos { if term == "" { continue } ts, err := NewTermSearcher(indexReader, term, field, 1.0, options) if err != nil { // close any searchers already opened for _, ts := range termPositionSearchers { _ = ts.Close() } return nil, fmt.Errorf("phrase searcher error building term searcher: %v", err) } termSearchers = append(termSearchers, ts) } disjunction, err := NewDisjunctionSearcher(indexReader, termSearchers, 1, options) if err != nil { // close any searchers already opened for _, ts := range termPositionSearchers { _ = ts.Close() } return nil, fmt.Errorf("phrase searcher error building term position disjunction searcher: %v", err) } termPositionSearchers = append(termPositionSearchers, disjunction) } } mustSearcher, err := NewConjunctionSearcher(indexReader, termPositionSearchers, options) if err != nil { // close any searchers already opened for _, ts := range termPositionSearchers { _ = ts.Close() } return nil, fmt.Errorf("phrase searcher error building conjunction searcher: %v", err) } // build our searcher rv := PhraseSearcher{ mustSearcher: mustSearcher, terms: terms, } rv.computeQueryNorm() return &rv, nil } func (s *PhraseSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 if s.mustSearcher != nil { sumOfSquaredWeights += s.mustSearcher.Weight() } // now compute query norm from this s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) // finally tell all the downstream searchers the norm if s.mustSearcher != nil { s.mustSearcher.SetQueryNorm(s.queryNorm) } } func (s *PhraseSearcher) initSearchers(ctx *search.SearchContext) error { err := s.advanceNextMust(ctx) if err != nil { return err } s.initialized = true return nil } func (s *PhraseSearcher) advanceNextMust(ctx *search.SearchContext) error { var err error if s.mustSearcher != nil { if s.currMust != nil { ctx.DocumentMatchPool.Put(s.currMust) } s.currMust, err = s.mustSearcher.Next(ctx) if err != nil { return err } } return nil } func (s *PhraseSearcher) Weight() float64 { return s.mustSearcher.Weight() } func (s *PhraseSearcher) SetQueryNorm(qnorm float64) { s.mustSearcher.SetQueryNorm(qnorm) } func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { if !s.initialized { err := s.initSearchers(ctx) if err != nil { return nil, err } } for s.currMust != nil { // check this match against phrase constraints rv := s.checkCurrMustMatch(ctx) // prepare for next iteration (either loop or subsequent call to Next()) err := s.advanceNextMust(ctx) if err != nil { return nil, err } // if match satisfied phrase constraints return it as a hit if rv != nil { return rv, nil } } return nil, nil } // checkCurrMustMatch is soley concerned with determining if the DocumentMatch // pointed to by s.currMust (which satisifies the pre-condition searcher) // also satisfies the phase constraints. if so, it returns a DocumentMatch // for this document, otherwise nil func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.DocumentMatch { s.locations = s.currMust.Complete(s.locations) locations := s.currMust.Locations s.currMust.Locations = nil ftls := s.currMust.FieldTermLocations // typically we would expect there to only actually be results in // one field, but we allow for this to not be the case // but, we note that phrase constraints can only be satisfied within // a single field, so we can check them each independently for field, tlm := range locations { ftls = s.checkCurrMustMatchField(ctx, field, tlm, ftls) } if len(ftls) > 0 { // return match rv := s.currMust s.currMust = nil rv.FieldTermLocations = ftls return rv } return nil } // checkCurrMustMatchField is soley concerned with determining if one // particular field within the currMust DocumentMatch Locations // satisfies the phase constraints (possibly more than once). if so, // the matching field term locations are appended to the provided // slice func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, field string, tlm search.TermLocationMap, ftls []search.FieldTermLocation) []search.FieldTermLocation { if s.path == nil { s.path = make(phrasePath, 0, len(s.terms)) } s.paths = findPhrasePaths(0, nil, s.terms, tlm, s.path[:0], 0, s.paths[:0]) for _, p := range s.paths { for _, pp := range p { ftls = append(ftls, search.FieldTermLocation{ Field: field, Term: pp.term, Location: search.Location{ Pos: pp.loc.Pos, Start: pp.loc.Start, End: pp.loc.End, ArrayPositions: pp.loc.ArrayPositions, }, }) } } return ftls } type phrasePart struct { term string loc *search.Location } func (p *phrasePart) String() string { return fmt.Sprintf("[%s %v]", p.term, p.loc) } type phrasePath []phrasePart func (p phrasePath) MergeInto(in search.TermLocationMap) { for _, pp := range p { in[pp.term] = append(in[pp.term], pp.loc) } } func (p phrasePath) String() string { rv := "[" for i, pp := range p { if i > 0 { rv += ", " } rv += pp.String() } rv += "]" return rv } // findPhrasePaths is a function to identify phase matches from a set // of known term locations. it recursive so care must be taken with // arguments and return values. // // prevPos - the previous location, 0 on first invocation // ap - array positions of the first candidate phrase part to // which further recursive phrase parts must match, // nil on initial invocation or when there are no array positions // phraseTerms - slice containing the phrase terms, // may contain empty string as placeholder (don't care) // tlm - the Term Location Map containing all relevant term locations // p - the current path being explored (appended to in recursive calls) // this is the primary state being built during the traversal // remainingSlop - amount of sloppiness that's allowed, which is the // sum of the editDistances from each matching phrase part, // where 0 means no sloppiness allowed (all editDistances must be 0), // decremented during recursion // rv - the final result being appended to by all the recursive calls // // returns slice of paths, or nil if invocation did not find any successul paths func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath { // no more terms if len(phraseTerms) < 1 { // snapshot or copy the recursively built phrasePath p and // append it to the rv, also optimizing by checking if next // phrasePath item in the rv (which we're about to overwrite) // is available for reuse var pcopy phrasePath if len(rv) < cap(rv) { pcopy = rv[:len(rv)+1][len(rv)][:0] } return append(rv, append(pcopy, p...)) } car := phraseTerms[0] cdr := phraseTerms[1:] // empty term is treated as match (continue) if len(car) == 0 || (len(car) == 1 && car[0] == "") { nextPos := prevPos + 1 if prevPos == 0 { // if prevPos was 0, don't set it to 1 (as thats not a real abs pos) nextPos = 0 // don't advance nextPos if prevPos was 0 } return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop, rv) } // locations for this term for _, carTerm := range car { locations := tlm[carTerm] LOCATIONS_LOOP: for _, loc := range locations { if prevPos != 0 && !loc.ArrayPositions.Equals(ap) { // if the array positions are wrong, can't match, try next location continue } // compute distance from previous phrase term dist := 0 if prevPos != 0 { dist = editDistance(prevPos+1, loc.Pos) } // if enough slop remaining, continue recursively if prevPos == 0 || (remainingSlop-dist) >= 0 { // skip if we've already used this term+loc already for _, ppart := range p { if ppart.term == carTerm && ppart.loc == loc { continue LOCATIONS_LOOP } } // this location works, add it to the path (but not for empty term) px := append(p, phrasePart{term: carTerm, loc: loc}) rv = findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist, rv) } } } return rv } func editDistance(p1, p2 uint64) int { dist := int(p1 - p2) if dist < 0 { return -dist } return dist } func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { if !s.initialized { err := s.initSearchers(ctx) if err != nil { return nil, err } } if s.currMust != nil { if s.currMust.IndexInternalID.Compare(ID) >= 0 { return s.Next(ctx) } ctx.DocumentMatchPool.Put(s.currMust) } if s.currMust == nil { return nil, nil } var err error s.currMust, err = s.mustSearcher.Advance(ctx, ID) if err != nil { return nil, err } return s.Next(ctx) } func (s *PhraseSearcher) Count() uint64 { // for now return a worst case return s.mustSearcher.Count() } func (s *PhraseSearcher) Close() error { if s.mustSearcher != nil { err := s.mustSearcher.Close() if err != nil { return err } } return nil } func (s *PhraseSearcher) Min() int { return 0 } func (s *PhraseSearcher) DocumentMatchPoolSize() int { return s.mustSearcher.DocumentMatchPoolSize() + 1 }