Switch from Godep to go vendoring

This commit is contained in:
Ken-Håvard Lieng 2016-03-01 01:51:26 +01:00
parent 6b37713bc0
commit cd317761c5
1504 changed files with 263076 additions and 34441 deletions

View file

@ -0,0 +1,63 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/language/in"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
)
const AnalyzerName = "hi"
// AnalyzerConstructor builds the Hindi analyzer: unicode tokenization
// followed by lowercasing, generic indic normalization, hindi-specific
// normalization, hindi stop word removal, and hindi stemming.
// It returns an error if any named component is missing from the cache.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}
	// Filters are applied in exactly this order.
	filterNames := []string{
		lower_case_filter.Name,
		in.NormalizeName,
		NormalizeName,
		StopName,
		StemmerName,
	}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, filterName := range filterNames {
		filter, err := cache.TokenFilterNamed(filterName)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}
	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers the Hindi analyzer constructor under AnalyzerName at
// package load time so it can be resolved via a registry cache.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View file

@ -0,0 +1,61 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// TestHindiAnalyzer runs two spellings of the word "hindi" through the
// registered "hi" analyzer and checks the produced token streams,
// including byte offsets, with reflect.DeepEqual.
func TestHindiAnalyzer(t *testing.T) {
	testCases := []struct {
		input    []byte
		expected analysis.TokenStream
	}{
		// two ways to write 'hindi' itself
		{
			input: []byte("हिन्दी"),
			expected: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("हिंद"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{
			input: []byte("हिंदी"),
			expected: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("हिंद"),
					Position: 1,
					Start:    0,
					End:      15,
				},
			},
		},
	}
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, tc := range testCases {
		got := analyzer.Analyze(tc.input)
		if !reflect.DeepEqual(got, tc.expected) {
			t.Errorf("expected %v, got %v", tc.expected, got)
		}
	}
}

View file

@ -0,0 +1,133 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"bytes"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const NormalizeName = "normalize_hi"
// HindiNormalizeFilter is a stateless token filter that applies Hindi
// orthographic normalization (see normalize) to each token's term.
type HindiNormalizeFilter struct {
}
// NewHindiNormalizeFilter returns a new HindiNormalizeFilter.
func NewHindiNormalizeFilter() *HindiNormalizeFilter {
return &HindiNormalizeFilter{}
}
// Filter normalizes every token's term in place and returns the same
// stream it was given.
func (s *HindiNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		input[i].Term = normalize(input[i].Term)
	}
	return input
}
// normalize folds common Hindi spelling variants into one canonical
// form: dead-n and candrabindu become anusvara (bindu), nukta consonant
// forms fold to their base consonants, ZWJ/ZWNJ and the virama are
// deleted, and chandra/long vowels are replaced by short counterparts.
// NOTE(review): the rule set appears to mirror Lucene's HindiNormalizer
// — verify against upstream if exact parity matters.
func normalize(input []byte) []byte {
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
// dead n -> bindu
case '\u0928':
if i+1 < len(runes) && runes[i+1] == '\u094D' {
runes[i] = '\u0902'
runes = analysis.DeleteRune(runes, i+1)
}
// candrabindu -> bindu
case '\u0901':
runes[i] = '\u0902'
// nukta deletions
case '\u093C':
runes = analysis.DeleteRune(runes, i)
// step back so the rune shifted into slot i is examined next
i--
case '\u0929':
runes[i] = '\u0928'
case '\u0931':
runes[i] = '\u0930'
case '\u0934':
runes[i] = '\u0933'
case '\u0958':
runes[i] = '\u0915'
case '\u0959':
runes[i] = '\u0916'
case '\u095A':
runes[i] = '\u0917'
case '\u095B':
runes[i] = '\u091C'
case '\u095C':
runes[i] = '\u0921'
case '\u095D':
runes[i] = '\u0922'
case '\u095E':
runes[i] = '\u092B'
case '\u095F':
runes[i] = '\u092F'
// zwj/zwnj -> delete
case '\u200D', '\u200C':
runes = analysis.DeleteRune(runes, i)
// step back so the rune shifted into slot i is examined next
i--
// virama -> delete
case '\u094D':
runes = analysis.DeleteRune(runes, i)
// step back so the rune shifted into slot i is examined next
i--
// chandra/short -> replace
case '\u0945', '\u0946':
runes[i] = '\u0947'
case '\u0949', '\u094A':
runes[i] = '\u094B'
case '\u090D', '\u090E':
runes[i] = '\u090F'
case '\u0911', '\u0912':
runes[i] = '\u0913'
case '\u0972':
runes[i] = '\u0905'
// long -> short ind. vowels
case '\u0906':
runes[i] = '\u0905'
case '\u0908':
runes[i] = '\u0907'
case '\u090A':
runes[i] = '\u0909'
case '\u0960':
runes[i] = '\u090B'
case '\u0961':
runes[i] = '\u090C'
case '\u0910':
runes[i] = '\u090F'
case '\u0914':
runes[i] = '\u0913'
// long -> short dep. vowels
case '\u0940':
runes[i] = '\u093F'
case '\u0942':
runes[i] = '\u0941'
case '\u0944':
runes[i] = '\u0943'
case '\u0963':
runes[i] = '\u0962'
case '\u0948':
runes[i] = '\u0947'
case '\u094C':
runes[i] = '\u094B'
}
}
return analysis.BuildTermFromRunes(runes)
}
// NormalizerFilterConstructor builds a HindiNormalizeFilter for the
// registry; config and cache are unused and construction cannot fail.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	filter := NewHindiNormalizeFilter()
	return filter, nil
}
// init registers the Hindi normalization token filter under
// NormalizeName at package load time.
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View file

@ -0,0 +1,246 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
// TestHindiNormalizeFilter runs table-driven cases through a fresh
// HindiNormalizeFilter and compares whole token streams with
// reflect.DeepEqual. The filter mutates tokens in place, so each case's
// input stream is consumed by the run.
func TestHindiNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// basics
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अँगरेज़ी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अँगरेजी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अँग्रेज़ी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अँग्रेजी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेज़ी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंग्रेज़ी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंग्रेजी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
// test decompositions
// removing nukta dot
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("क़िताब"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("फ़र्ज़"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("फरज"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("क़र्ज़"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("करज"),
},
},
},
// some other composed nukta forms
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ऱऴख़ग़ड़ढ़य़"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("रळखगडढय"),
},
},
},
// removal of format (ZWJ/ZWNJ)
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("शार्‍मा"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("शारमा"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("शार्‌मा"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("शारमा"),
},
},
},
// removal of chandra
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ॅॆॉॊऍऎऑऒ\u0972"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ेेोोएएओओअ"),
},
},
},
// vowel shortening
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("आईऊॠॡऐऔीूॄॣैौ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अइउऋऌएओिुृॢेो"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
hindiNormalizeFilter := NewHindiNormalizeFilter()
for _, test := range tests {
actual := hindiNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
// every case has exactly one token, so indexing [0] is safe here
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,144 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"bytes"
"unicode/utf8"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_hi"
// HindiStemmerFilter is a stateless token filter that applies the
// lightweight Hindi suffix-stripping stemmer (see stem) to each
// non-keyword token's term.
type HindiStemmerFilter struct {
}
// NewHindiStemmerFilter returns a new HindiStemmerFilter.
func NewHindiStemmerFilter() *HindiStemmerFilter {
return &HindiStemmerFilter{}
}
// Filter stems each non-keyword token's term in place and returns the
// same stream it was given.
func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, tok := range input {
		if tok.KeyWord {
			// protected keyword: leave the term untouched
			continue
		}
		tok.Term = stem(tok.Term)
	}
	return input
}
// stem strips at most one inflectional suffix from input, trying the
// longest suffixes first. A suffix of N runes is only removed when the
// word has strictly more than N+1 runes, so a short stem always
// survives. NOTE(review): the suffix inventory appears to match
// Lucene's lightweight HindiStemmer — verify against upstream.
func stem(input []byte) []byte {
	// Groups are ordered longest suffix first; every entry in a group
	// has the same rune length as the group's first entry.
	suffixGroups := [][]string{
		{"ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"},
		{"ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा",
			"ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"},
		{"ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना",
			"ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"},
		{"कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं",
			"ती", "ता", "ाँ", "ां", "ों", "ें"},
		{"ो", "े", "ू", "ु", "ी", "ि", "ा"},
	}
	runeCount := utf8.RuneCount(input)
	for _, group := range suffixGroups {
		trim := utf8.RuneCountInString(group[0])
		// Require at least trim+2 runes before removing trim runes.
		if runeCount <= trim+1 {
			continue
		}
		for _, suffix := range group {
			if bytes.HasSuffix(input, []byte(suffix)) {
				return analysis.TruncateRunes(input, trim)
			}
		}
	}
	return input
}
// StemmerFilterConstructor builds a HindiStemmerFilter for the
// registry; config and cache are unused and construction cannot fail.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	filter := NewHindiStemmerFilter()
	return filter, nil
}
// init registers the Hindi stemmer token filter under StemmerName at
// package load time.
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View file

@ -0,0 +1,303 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
// TestHindiStemmerFilter runs table-driven noun and verb inflections
// through a fresh HindiStemmerFilter and compares whole token streams
// with reflect.DeepEqual. The filter mutates tokens in place, so each
// case's input stream is consumed by the run.
func TestHindiStemmerFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// masc noun inflections
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडका"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडके"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडकों"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("गुरु"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("गुर"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("गुरुओं"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("गुर"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("दोस्त"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("दोस्त"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("दोस्तों"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("दोस्त"),
},
},
},
// feminine noun inflections
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडकी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडकियों"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताबें"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताबों"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीका"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीकाएं"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीकाओं"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीक"),
},
},
},
// some verb forms
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("खाना"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("खाता"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("खाती"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
},
// exceptions
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("कठिनाइयां"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("कठिन"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("कठिन"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("कठिन"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
hindiStemmerFilter := NewHindiStemmerFilter()
for _, test := range tests {
actual := hindiStemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
// every case has exactly one token, so indexing [0] is safe here
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds a stop-token filter backed by the
// Hindi stop word token map registered under StopName; config is
// unused. It fails if the token map is not in the cache.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	hindiStopMap, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	return stop_tokens_filter.NewStopTokensFilter(hindiStopMap), nil
}
// init registers the Hindi stop token filter under StopName at package
// load time (the token map of the same name is registered separately).
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View file

@ -0,0 +1,259 @@
package hi
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// StopName is the registry name shared by the Hindi stop word token map
// and the Hindi stop token filter.
const StopName = "stop_hi"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
// NOTE(review): several entries below look truncated or mis-encoded
// (e.g. bare matra lines such as "ि") compared to what the upstream
// Lucene hindi stop word file would contain — verify this data against
// the original resource file before relying on it. The literal must
// not be edited casually: every line is matched verbatim as a token.
var HindiStopWords = []byte(`# Also see http://www.opensource.org/licenses/bsd-license.html
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# This file was created by Jacques Savoy and is distributed under the BSD license.
# Note: by default this file also contains forms normalized by HindiNormalizer
# for spelling variation (see section below), such that it can be used whether or
# not you enable that feature. When adding additional entries to this list,
# please add the normalized form as well.
अंदर
अत
अपन
अपन
अपने
अभ
आदि
आप
इत्यि
इन
इनक
इन्ह
इन्हें
इन्ह
इस
इसक
इसक
इसके
इसमें
इस
इसे
उन
उनक
उनक
उनके
उनक
उन्ह
उन्हें
उन्ह
उस
उसके
उस
उसे
एक
एवं
एस
ऐसे
और
कई
कर
करत
करते
करन
करने
करें
कहते
कह
ि
ितन
िन्हें
िन्ह
ि
ि
ि
ि
िसे
कुछ
कुल
के
नस
गय
घर
जब
जह
ितन
ि
िन्हें
िन्ह
ि
िसे
धर
जैस
जैसे
तक
तब
तरह
ि
िन्हें
िन्ह
ि
िसे
थे
दब
ि
दुसर
दूसरे
द्व
नह
ियत
चे
ने
पर
पर
पहले
पूर
पे
ि
बन
बह
बहुत
िलकुल
तर
मगर
मे
में
यदि
यह
यह
यह
ि
ये
रखें
रह
रहे
ऱ्व
ि
िये
लेकि
वर्ग
वह
वह
वह
वह
ले
वुह
वे
वग़ैरह
संग
सकत
सकते
सबसे
सभ
बुत
से
हुआ
हुई
हुए
है
हैं
ते
ने
# additional normalized forms of the above
अपनि
जेसे
ि
सभि
िंह
इंह
दव
इसि
िंहें
ि
उंह
ओर
िंहें
वहि
अभि
बनि
ि
उंहि
उंहें
हें
वगेरह
एसे
रव
िचे
ि
उसि
पुर
ितर
हे
बहि
वह
यह
िंह
िंहें
िि
कइ
यहि
इंहि
िधर
इंहें
अदि
इतयि
हुइ
नस
इसकि
दुसरे
जह
अप
िंह
उनकि
ि
वरग
हुअ
जेस
नहि
`)
// TokenMapConstructor builds the Hindi stop word token map from
// HindiStopWords; config and cache are unused. On a load error the map
// is still returned alongside the error, matching the original contract.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	return stopWords, stopWords.LoadBytes(HindiStopWords)
}
// init registers the Hindi stop word token map under StopName at
// package load time (the stop token filter of the same name is
// registered separately).
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}