dispatch/vendor/github.com/blevesearch/bleve/analysis/tokenizers/exception/exception.go

//  Copyright (c) 2015 Couchbase, Inc.
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
//  except in compliance with the License. You may obtain a copy of the License at
//    http://www.apache.org/licenses/LICENSE-2.0
//  Unless required by applicable law or agreed to in writing, software distributed under the
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
//  either express or implied. See the License for the specific language governing permissions
//  and limitations under the License.

package exception

import (
	"fmt"
	"regexp"
	"strings"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/registry"
)

const Name = "exception"

type ExceptionsTokenizer struct {
	exception *regexp.Regexp
	remaining analysis.Tokenizer
}

func NewExceptionsTokenizer(exception *regexp.Regexp, remaining analysis.Tokenizer) *ExceptionsTokenizer {
	return &ExceptionsTokenizer{
		exception: exception,
		remaining: remaining,
	}
}

func (t *ExceptionsTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	matches := t.exception.FindAllIndex(input, -1)
	currInput := 0
	lastPos := 0
	for _, match := range matches {
		start := match[0]
		end := match[1]
		if start > currInput {
			// need to defer to remaining for unprocessed section
			intermediate := t.remaining.Tokenize(input[currInput:start])
			// add intermediate tokens to our result stream
			for _, token := range intermediate {
				// adjust token offsets
				token.Position += lastPos
				token.Start += currInput
				token.End += currInput
				rv = append(rv, token)
			}
			lastPos += len(intermediate)
			currInput = start
		}

		// create single token with this regexp match
		token := &analysis.Token{
			Term:     input[start:end],
			Start:    start,
			End:      end,
			Position: lastPos + 1,
		}
		rv = append(rv, token)
		lastPos++
		currInput = end

	}

	if currInput < len(input) {
		// need to defer to remaining for unprocessed section
		intermediate := t.remaining.Tokenize(input[currInput:])
		// add intermediate tokens to our result stream
		for _, token := range intermediate {
			// adjust token offsets
			token.Position += lastPos
			token.Start += currInput
			token.End += currInput
			rv = append(rv, token)
		}
	}

	return rv
}

func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	exceptions := []string{}
	iexceptions, ok := config["exceptions"].([]interface{})
	if ok {
		for _, exception := range iexceptions {
			exception, ok := exception.(string)
			if ok {
				exceptions = append(exceptions, exception)
			}
		}
	}
	aexceptions, ok := config["exceptions"].([]string)
	if ok {
		exceptions = append(exceptions, aexceptions...)
	}
	exceptionPattern := strings.Join(exceptions, "|")
	r, err := regexp.Compile(exceptionPattern)
	if err != nil {
		return nil, fmt.Errorf("unable to build regexp tokenizer: %v", err)
	}

	remainingName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer for remaining input")
	}
	remaining, err := cache.TokenizerNamed(remainingName)
	if err != nil {
		return nil, err
	}
	return NewExceptionsTokenizer(r, remaining), nil
}

func init() {
	registry.RegisterTokenizer(Name, ExceptionsTokenizerConstructor)
}