Switch from Godep to go vendoring
This commit is contained in:
parent
6b37713bc0
commit
cd317761c5
1504 changed files with 263076 additions and 34441 deletions
60
vendor/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar.go
generated
vendored
Normal file
60
vendor/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar.go
generated
vendored
Normal file
|
@ -0,0 +1,60 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "ar"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
|
||||
stopArFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeArFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerArFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
normalizeFilter,
|
||||
stopArFilter,
|
||||
normalizeArFilter,
|
||||
stemmerArFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
179
vendor/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar_test.go
generated
vendored
Normal file
179
vendor/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar_test.go
generated
vendored
Normal file
|
@ -0,0 +1,179 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestArabicAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte("كبير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كبير"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
// feminine marker
|
||||
{
|
||||
input: []byte("كبيرة"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كبير"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("مشروب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مشروب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
// plural -at
|
||||
{
|
||||
input: []byte("مشروبات"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مشروب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
// plural -in
|
||||
{
|
||||
input: []byte("أمريكيين"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("امريك"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
// singular with bare alif
|
||||
{
|
||||
input: []byte("امريكي"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("امريك"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("كتاب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتاب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
// definite article
|
||||
{
|
||||
input: []byte("الكتاب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتاب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("ما ملكت أيمانكم"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ملكت"),
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ايمانكم"),
|
||||
Position: 3,
|
||||
Start: 14,
|
||||
End: 28,
|
||||
},
|
||||
},
|
||||
},
|
||||
// stopwords
|
||||
{
|
||||
input: []byte("الذين ملكت أيمانكم"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ملكت"),
|
||||
Position: 2,
|
||||
Start: 11,
|
||||
End: 19,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ايمانكم"),
|
||||
Position: 3,
|
||||
Start: 20,
|
||||
End: 34,
|
||||
},
|
||||
},
|
||||
},
|
||||
// presentation form normalization
|
||||
{
|
||||
input: []byte("ﺍﻟﺴﻼﻢ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 15,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
80
vendor/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize.go
generated
vendored
Normal file
80
vendor/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize.go
generated
vendored
Normal file
|
@ -0,0 +1,80 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_ar"
|
||||
|
||||
const (
|
||||
Alef = '\u0627'
|
||||
AlefMadda = '\u0622'
|
||||
AlefHamzaAbove = '\u0623'
|
||||
AlefHamzaBelow = '\u0625'
|
||||
Yeh = '\u064A'
|
||||
DotlessYeh = '\u0649'
|
||||
TehMarbuta = '\u0629'
|
||||
Heh = '\u0647'
|
||||
Tatweel = '\u0640'
|
||||
Fathatan = '\u064B'
|
||||
Dammatan = '\u064C'
|
||||
Kasratan = '\u064D'
|
||||
Fatha = '\u064E'
|
||||
Damma = '\u064F'
|
||||
Kasra = '\u0650'
|
||||
Shadda = '\u0651'
|
||||
Sukun = '\u0652'
|
||||
)
|
||||
|
||||
type ArabicNormalizeFilter struct {
|
||||
}
|
||||
|
||||
func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
|
||||
return &ArabicNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case AlefMadda, AlefHamzaAbove, AlefHamzaBelow:
|
||||
runes[i] = Alef
|
||||
case DotlessYeh:
|
||||
runes[i] = Yeh
|
||||
case TehMarbuta:
|
||||
runes[i] = Heh
|
||||
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewArabicNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
}
|
229
vendor/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize_test.go
generated
vendored
Normal file
229
vendor/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize_test.go
generated
vendored
Normal file
|
@ -0,0 +1,229 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestArabicNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// AlifMadda
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("آجن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اجن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifHamzaAbove
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("أحمد"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("احمد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifHamzaBelow
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("إعاذ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اعاذ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifMaksura
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بنى"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بني"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// TehMarbuta
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فاطمة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فاطمه"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Tatweel
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرـــــت"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرت"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Fatha
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مَبنا"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مبنا"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Kasra
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("علِي"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("علي"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Damma
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بُوات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بوات"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Fathatan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولداً"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدا"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Kasratan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدٍ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Dammatan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدٌ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Sukun
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("نلْسون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("نلسون"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Shaddah
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هتميّ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هتمي"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
arabicNormalizeFilter := NewArabicNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := arabicNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
113
vendor/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar.go
generated
vendored
Normal file
113
vendor/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar.go
generated
vendored
Normal file
|
@ -0,0 +1,113 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_ar"
|
||||
|
||||
// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
|
||||
var prefixes = [][]rune{
|
||||
[]rune("ال"),
|
||||
[]rune("وال"),
|
||||
[]rune("بال"),
|
||||
[]rune("كال"),
|
||||
[]rune("فال"),
|
||||
[]rune("لل"),
|
||||
[]rune("و"),
|
||||
}
|
||||
var suffixes = [][]rune{
|
||||
[]rune("ها"),
|
||||
[]rune("ان"),
|
||||
[]rune("ات"),
|
||||
[]rune("ون"),
|
||||
[]rune("ين"),
|
||||
[]rune("يه"),
|
||||
[]rune("ية"),
|
||||
[]rune("ه"),
|
||||
[]rune("ة"),
|
||||
[]rune("ي"),
|
||||
}
|
||||
|
||||
type ArabicStemmerFilter struct{}
|
||||
|
||||
func NewArabicStemmerFilter() *ArabicStemmerFilter {
|
||||
return &ArabicStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := stem(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func canStemPrefix(input, prefix []rune) bool {
|
||||
// Wa- prefix requires at least 3 characters.
|
||||
if len(prefix) == 1 && len(input) < 4 {
|
||||
return false
|
||||
}
|
||||
// Other prefixes require only 2.
|
||||
if len(input)-len(prefix) < 2 {
|
||||
return false
|
||||
}
|
||||
for i := range prefix {
|
||||
if prefix[i] != input[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func canStemSuffix(input, suffix []rune) bool {
|
||||
// All suffixes require at least 2 characters after stemming.
|
||||
if len(input)-len(suffix) < 2 {
|
||||
return false
|
||||
}
|
||||
stemEnd := len(input) - len(suffix)
|
||||
for i := range suffix {
|
||||
if suffix[i] != input[stemEnd+i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func stem(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
// Strip a single prefix.
|
||||
for _, p := range prefixes {
|
||||
if canStemPrefix(runes, p) {
|
||||
runes = runes[len(p):]
|
||||
break
|
||||
}
|
||||
}
|
||||
// Strip off multiple suffixes, in their order in the suffixes array.
|
||||
for _, s := range suffixes {
|
||||
if canStemSuffix(runes, s) {
|
||||
runes = runes[:len(runes)-len(s)]
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewArabicStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
}
|
392
vendor/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar_test.go
generated
vendored
Normal file
392
vendor/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar_test.go
generated
vendored
Normal file
|
@ -0,0 +1,392 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestArabicStemmerFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// AlPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("والحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// BalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// KalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// FalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// LlPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("للاخر"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اخر"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WaPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AhSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("زوجها"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("زوج"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدان"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AtSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدين"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YhSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهديه"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YpSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدية"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// HSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهده"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// PSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدي"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// ComboPrefSuf
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وساهدون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// ComboSuf
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدهات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// ShouldntStem
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الو"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الو"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// NonArabic
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("English"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("English"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("السلام"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلامة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("السلامة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الوصل"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وصل"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("والصل"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("صل"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
arabicStemmerFilter := NewArabicStemmerFilter()
|
||||
for _, test := range tests {
|
||||
actual := arabicStemmerFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
28
vendor/github.com/blevesearch/bleve/analysis/language/ar/stop_filter_ar.go
generated
vendored
Normal file
28
vendor/github.com/blevesearch/bleve/analysis/language/ar/stop_filter_ar.go
generated
vendored
Normal file
|
@ -0,0 +1,28 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
149
vendor/github.com/blevesearch/bleve/analysis/language/ar/stop_words_ar.go
generated
vendored
Normal file
149
vendor/github.com/blevesearch/bleve/analysis/language/ar/stop_words_ar.go
generated
vendored
Normal file
|
@ -0,0 +1,149 @@
|
|||
package ar
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ar"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
|
||||
# This means that when modifying this list, you might need to add some
|
||||
# redundant entries, for example containing forms with both أ and ا
|
||||
من
|
||||
ومن
|
||||
منها
|
||||
منه
|
||||
في
|
||||
وفي
|
||||
فيها
|
||||
فيه
|
||||
و
|
||||
ف
|
||||
ثم
|
||||
او
|
||||
أو
|
||||
ب
|
||||
بها
|
||||
به
|
||||
ا
|
||||
أ
|
||||
اى
|
||||
اي
|
||||
أي
|
||||
أى
|
||||
لا
|
||||
ولا
|
||||
الا
|
||||
ألا
|
||||
إلا
|
||||
لكن
|
||||
ما
|
||||
وما
|
||||
كما
|
||||
فما
|
||||
عن
|
||||
مع
|
||||
اذا
|
||||
إذا
|
||||
ان
|
||||
أن
|
||||
إن
|
||||
انها
|
||||
أنها
|
||||
إنها
|
||||
انه
|
||||
أنه
|
||||
إنه
|
||||
بان
|
||||
بأن
|
||||
فان
|
||||
فأن
|
||||
وان
|
||||
وأن
|
||||
وإن
|
||||
التى
|
||||
التي
|
||||
الذى
|
||||
الذي
|
||||
الذين
|
||||
الى
|
||||
الي
|
||||
إلى
|
||||
إلي
|
||||
على
|
||||
عليها
|
||||
عليه
|
||||
اما
|
||||
أما
|
||||
إما
|
||||
ايضا
|
||||
أيضا
|
||||
كل
|
||||
وكل
|
||||
لم
|
||||
ولم
|
||||
لن
|
||||
ولن
|
||||
هى
|
||||
هي
|
||||
هو
|
||||
وهى
|
||||
وهي
|
||||
وهو
|
||||
فهى
|
||||
فهي
|
||||
فهو
|
||||
انت
|
||||
أنت
|
||||
لك
|
||||
لها
|
||||
له
|
||||
هذه
|
||||
هذا
|
||||
تلك
|
||||
ذلك
|
||||
هناك
|
||||
كانت
|
||||
كان
|
||||
يكون
|
||||
تكون
|
||||
وكانت
|
||||
وكان
|
||||
غير
|
||||
بعض
|
||||
قد
|
||||
نحو
|
||||
بين
|
||||
بينما
|
||||
منذ
|
||||
ضمن
|
||||
حيث
|
||||
الان
|
||||
الآن
|
||||
خلال
|
||||
بعد
|
||||
قبل
|
||||
حتى
|
||||
عند
|
||||
عندما
|
||||
لدى
|
||||
جميع
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(ArabicStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue