Add message logging and search server side

This commit is contained in:
khlieng 2015-04-29 23:54:44 +02:00
parent 6378131a9d
commit 3365832ce3
738 changed files with 143131 additions and 112 deletions

4
.gitignore vendored
View File

@ -1,4 +1,2 @@
bin/
client/dist/
client/node_modules/
data.db
client/node_modules/

53
Godeps/Godeps.json generated
View File

@ -1,19 +1,72 @@
{
"ImportPath": "github.com/khlieng/name_pending",
"GoVersion": "go1.4",
"Packages": [
"./..."
],
"Deps": [
{
"ImportPath": "github.com/blevesearch/bleve",
"Rev": "16f538d7b76dd85c935a3104c390307cae5cbf79"
},
{
"ImportPath": "github.com/blevesearch/go-porterstemmer",
"Comment": "v1.0.1-9-g23a2c8e",
"Rev": "23a2c8e5cf1f380f27722c6d2ae8896431dc7d0e"
},
{
"ImportPath": "github.com/blevesearch/segment",
"Rev": "9588637ce3caba8516208ccc17193ddedd741418"
},
{
"ImportPath": "github.com/boltdb/bolt",
"Comment": "v1.0-43-gcf33c9e",
"Rev": "cf33c9e0ca0a23509b8bb8edfc63e4776bb1a330"
},
{
"ImportPath": "github.com/cznic/b",
"Rev": "c4adf3a58579a2d57cd3097f455dcdf75edcdfd8"
},
{
"ImportPath": "github.com/golang/protobuf/proto",
"Rev": "655cdfa588ea190e901bc5590e65d5621688847c"
},
{
"ImportPath": "github.com/julienschmidt/httprouter",
"Rev": "b428fda53bb0a764fea9c76c9413512eda291dec"
},
{
"ImportPath": "github.com/ryszard/goskiplist/skiplist",
"Rev": "2dfbae5fcf46374f166f8969cb07e167f1be6273"
},
{
"ImportPath": "github.com/steveyen/gtreap",
"Rev": "72cd76f34c91f8d64a031af97b499e4a0b1a6e0c"
},
{
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
"Rev": "4875955338b0a434238a31165cb87255ab6e9e4a"
},
{
"ImportPath": "github.com/syndtr/gosnappy/snappy",
"Rev": "156a073208e131d7d2e212cb749feae7c339e846"
},
{
"ImportPath": "github.com/willf/bitset",
"Comment": "v1.0.0-17-g4b22041",
"Rev": "4b220417a489359f934045d0509d941a7a2a1038"
},
{
"ImportPath": "golang.org/x/net/websocket",
"Rev": "3d87fd621ca9a824c5cff17216ce44769456cb3f"
},
{
"ImportPath": "golang.org/x/text/transform",
"Rev": "c92eb3cd6e70951a111680995e651ea4b2c35539"
},
{
"ImportPath": "golang.org/x/text/unicode/norm",
"Rev": "c92eb3cd6e70951a111680995e651ea4b2c35539"
}
]
}

View File

@ -0,0 +1,17 @@
#*
*.sublime-*
*~
.#*
.project
.settings
.DS_Store
/analysis/token_filters/cld2/cld2-read-only
/analysis/token_filters/cld2/libcld2_full.a
/utils/bleve_create/bleve_create
/utils/bleve_dump/bleve_dump
/utils/bleve_index/bleve_index
/utils/bleve_bulkindex/bleve_bulkindex
/utils/bleve_index/index.bleve/
/utils/bleve_query/bleve_query
/utils/bleve_registry/bleve_registry
/y.output

View File

@ -0,0 +1,19 @@
language: go
go:
- 1.4
script:
- go get golang.org/x/tools/cmd/vet
- go get golang.org/x/tools/cmd/cover
- go get github.com/mattn/goveralls
- go get github.com/kisielk/errcheck
- go test -v ./...
- go vet ./...
- errcheck ./...
- docs/project-code-coverage.sh
- docs/build_children.sh
notifications:
email:
- marty.schoch@gmail.com

View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,61 @@
# ![bleve](docs/bleve.png) bleve
[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/blevesearch/bleve/badge.png?branch=master)](https://coveralls.io/r/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) [![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
modern text indexing in go - [blevesearch.com](http://www.blevesearch.com/)
Try out bleve live by [searching our wiki](http://wikisearch.blevesearch.com/search/).
## Features
* Index any go data structure (including JSON)
* Intelligent defaults backed up by powerful configuration
* Supported field types:
* Text, Numeric, Date
* Supported query types:
* Term, Phrase, Match, Match Phrase, Prefix
* Conjunction, Disjunction, Boolean
* Numeric Range, Date Range
* Simple query [syntax](https://github.com/blevesearch/bleve/wiki/Query-String-Query) for human entry
* tf-idf Scoring
* Search result match highlighting
* Supports Aggregating Facets:
* Terms Facet
* Numeric Range Facet
* Date Range Facet
## Discussion
Discuss usage and development of bleve in the [google group](https://groups.google.com/forum/#!forum/bleve).
## Indexing
message := struct{
Id string
From string
Body string
}{
Id: "example",
From: "marty.schoch@gmail.com",
Body: "bleve indexing is easy",
}
mapping := bleve.NewIndexMapping()
index, err := bleve.New("example.bleve", mapping)
if err != nil {
panic(err)
}
index.Index(message.Id, message)
## Querying
index, _ := bleve.Open("example.bleve")
query := bleve.NewQueryStringQuery("bleve")
searchRequest := bleve.NewSearchRequest(query)
searchResult, _ := index.Search(searchRequest)
## License
Apache License Version 2.0

View File

@ -0,0 +1,130 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package standard_analyzer
import (
"fmt"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the identifier this file's init passes to registry.RegisterAnalyzer.
const Name = "custom"

// AnalyzerConstructor assembles an analysis.Analyzer from a configuration map.
//
// Recognised keys:
//   - "char_filters" (optional): a []string or a []interface{} whose elements
//     are strings; each name is resolved through cache.CharFilterNamed.
//   - "tokenizer" (required): a string resolved through cache.TokenizerNamed.
//   - "token_filters" (optional): same shape as "char_filters"; each name is
//     resolved through cache.TokenFilterNamed.
//
// An error is returned when "tokenizer" is missing or not a string, when a
// filter list has an unsupported type or contains a non-string element, or
// when the cache cannot resolve one of the names. Filter lists of an
// unsupported type used to be ignored silently, which produced an analyzer
// that lacked the filters the caller asked for.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	// Char filters are processed first so that their errors take precedence,
	// as they did before.
	var charFilters []analysis.CharFilter
	charFilterNames, present, err := filterNamesFromConfig(config, "char_filters", "char filter")
	if err != nil {
		return nil, err
	}
	if present {
		charFilters, err = getCharFilters(charFilterNames, cache)
		if err != nil {
			return nil, err
		}
	}

	tokenizerName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer")
	}
	tokenizer, err := cache.TokenizerNamed(tokenizerName)
	if err != nil {
		return nil, err
	}

	var tokenFilters []analysis.TokenFilter
	tokenFilterNames, present, err := filterNamesFromConfig(config, "token_filters", "token filter")
	if err != nil {
		return nil, err
	}
	if present {
		tokenFilters, err = getTokenFilters(tokenFilterNames, cache)
		if err != nil {
			return nil, err
		}
	}

	// Assigning a nil slice is the same as leaving the field at its zero
	// value, so no nil checks are needed here.
	return &analysis.Analyzer{
		CharFilters:  charFilters,
		Tokenizer:    tokenizer,
		TokenFilters: tokenFilters,
	}, nil
}

// filterNamesFromConfig reads the list of names stored under key in config.
// The boolean result reports whether a list was supplied at all; a missing
// key or an explicit nil value counts as "not supplied". objType is only
// used in error messages.
func filterNamesFromConfig(config map[string]interface{}, key, objType string) ([]string, bool, error) {
	switch value := config[key].(type) {
	case nil:
		return nil, false, nil
	case []string:
		return value, true, nil
	case []interface{}:
		names, err := convertInterfaceSliceToStringSlice(value, objType)
		if err != nil {
			return nil, false, err
		}
		return names, true, nil
	default:
		return nil, false, fmt.Errorf("%s must be a list of %s names, got %T", key, objType, value)
	}
}
// init runs at package load and hands Name and AnalyzerConstructor to
// registry.RegisterAnalyzer.
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}
// getCharFilters looks up every name in charFilterNames through
// cache.CharFilterNamed and returns the filters in the same order. The first
// lookup error aborts the whole operation and is returned with a nil slice.
// The result is non-nil (possibly empty) on success.
func getCharFilters(charFilterNames []string, cache *registry.Cache) ([]analysis.CharFilter, error) {
	resolved := make([]analysis.CharFilter, 0, len(charFilterNames))
	for _, name := range charFilterNames {
		filter, err := cache.CharFilterNamed(name)
		if err != nil {
			return nil, err
		}
		resolved = append(resolved, filter)
	}
	return resolved, nil
}
// getTokenFilters looks up every name in tokenFilterNames through
// cache.TokenFilterNamed and returns the filters in the same order. The first
// lookup error aborts the whole operation and is returned with a nil slice.
// The result is non-nil (possibly empty) on success.
func getTokenFilters(tokenFilterNames []string, cache *registry.Cache) ([]analysis.TokenFilter, error) {
	count := len(tokenFilterNames)
	resolved := make([]analysis.TokenFilter, count)
	for idx := 0; idx < count; idx++ {
		filter, err := cache.TokenFilterNamed(tokenFilterNames[idx])
		if err != nil {
			return nil, err
		}
		resolved[idx] = filter
	}
	return resolved, nil
}
// convertInterfaceSliceToStringSlice converts a slice of interface values,
// each of which must hold a string, into a []string of the same length and
// order.
//
// objType describes what the names refer to (for example "char filter") and
// is only used to build the error message. If any element is not a string,
// a nil slice and an error are returned.
func convertInterfaceSliceToStringSlice(interfaceSlice []interface{}, objType string) ([]string, error) {
	stringSlice := make([]string, len(interfaceSlice))
	for i, interfaceObj := range interfaceSlice {
		stringObj, ok := interfaceObj.(string)
		if !ok {
			// objType is data, not a format string: pass it as an argument so
			// that a '%' inside it cannot corrupt the message.
			return nil, fmt.Errorf("%s name must be a string", objType)
		}
		stringSlice[i] = stringObj
	}
	return stringSlice, nil
}

View File

@ -0,0 +1,49 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build cld2 full
package detect_lang_analyzer
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/cld2"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the identifier this file's init passes to registry.RegisterAnalyzer.
const Name = "detect_lang"

// AnalyzerConstructor builds an analyzer made of the tokenizer registered
// under single_token.Name followed by the token filters registered under
// lower_case_filter.Name and cld2.Name, in that order. config is not
// consulted. The first lookup that fails aborts construction and its error
// is returned.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(single_token.Name)
	if err != nil {
		return nil, err
	}

	// Resolve the filters in the order in which they must run.
	filterNames := []string{lower_case_filter.Name, cld2.Name}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, filterName := range filterNames {
		filter, err := cache.TokenFilterNamed(filterName)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init runs at package load and hands Name and AnalyzerConstructor to
// registry.RegisterAnalyzer.
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -0,0 +1,33 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package keyword_analyzer
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the identifier this file's init passes to registry.RegisterAnalyzer.
const Name = "keyword"

// AnalyzerConstructor builds an analyzer whose only component is the
// tokenizer registered under single_token.Name. config is not consulted.
// A failed tokenizer lookup is returned as the error.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(single_token.Name)
	if err != nil {
		return nil, err
	}
	analyzer := new(analysis.Analyzer)
	analyzer.Tokenizer = tokenizer
	return analyzer, nil
}
// init runs at package load and hands Name and AnalyzerConstructor to
// registry.RegisterAnalyzer.
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -0,0 +1,41 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package simple_analyzer
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the identifier this file's init passes to registry.RegisterAnalyzer.
const Name = "simple"

// AnalyzerConstructor builds an analyzer made of the tokenizer registered
// under unicode.Name and the single token filter registered under
// lower_case_filter.Name. config is not consulted. The first lookup that
// fails aborts construction and its error is returned.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}
	lowerCase, err := cache.TokenFilterNamed(lower_case_filter.Name)
	if err != nil {
		return nil, err
	}

	analyzer := &analysis.Analyzer{Tokenizer: unicodeTokenizer}
	analyzer.TokenFilters = append(analyzer.TokenFilters, lowerCase)
	return analyzer, nil
}
// init runs at package load and hands Name and AnalyzerConstructor to
// registry.RegisterAnalyzer.
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -0,0 +1,47 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package standard_analyzer
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the identifier this file's init passes to registry.RegisterAnalyzer.
const Name = "standard"

// AnalyzerConstructor builds an analyzer made of the tokenizer registered
// under unicode.Name followed by the token filters registered under
// lower_case_filter.Name and en.StopName, in that order. config is not
// consulted. The first lookup that fails aborts construction and its error
// is returned.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}

	// Resolve the filters in the order in which they must run.
	var filters []analysis.TokenFilter
	for _, filterName := range []string{lower_case_filter.Name, en.StopName} {
		filter, err := cache.TokenFilterNamed(filterName)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init runs at package load and hands Name and AnalyzerConstructor to
// registry.RegisterAnalyzer.
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -0,0 +1,33 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ignore_byte_array_converter
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// IgnoreByteArrayConverter is a stateless converter that discards its input.
type IgnoreByteArrayConverter struct{}

// NewIgnoreByteArrayConverter returns a ready-to-use IgnoreByteArrayConverter.
func NewIgnoreByteArrayConverter() *IgnoreByteArrayConverter {
	return new(IgnoreByteArrayConverter)
}

// Convert ignores in and always returns a nil value together with a nil error.
func (c *IgnoreByteArrayConverter) Convert(in []byte) (interface{}, error) {
	var nothing interface{}
	return nothing, nil
}
// Constructor returns a new IgnoreByteArrayConverter as an
// analysis.ByteArrayConverter. Neither config nor cache is consulted, and the
// returned error is always nil.
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
	converter := NewIgnoreByteArrayConverter()
	return converter, nil
}
// init runs at package load and hands the name "ignore" and Constructor to
// registry.RegisterByteArrayConverter.
func init() {
registry.RegisterByteArrayConverter("ignore", Constructor)
}

View File

@ -0,0 +1,40 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package json_byte_array_converter
import (
"encoding/json"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// JSONByteArrayConverter is a stateless converter that decodes JSON bytes.
type JSONByteArrayConverter struct{}

// NewJSONByteArrayConverter returns a ready-to-use JSONByteArrayConverter.
func NewJSONByteArrayConverter() *JSONByteArrayConverter {
	return new(JSONByteArrayConverter)
}

// Convert decodes in with json.Unmarshal into a map[string]interface{} and
// returns that map. If decoding fails, the returned value is nil and the
// error from json.Unmarshal is passed through unchanged.
func (c *JSONByteArrayConverter) Convert(in []byte) (interface{}, error) {
	decoded := map[string]interface{}(nil)
	if err := json.Unmarshal(in, &decoded); err != nil {
		return nil, err
	}
	return decoded, nil
}
// Constructor returns a new JSONByteArrayConverter as an
// analysis.ByteArrayConverter. Neither config nor cache is consulted, and the
// returned error is always nil.
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
	converter := NewJSONByteArrayConverter()
	return converter, nil
}
// init runs at package load and hands the name "json" and Constructor to
// registry.RegisterByteArrayConverter.
func init() {
registry.RegisterByteArrayConverter("json", Constructor)
}

View File

@ -0,0 +1,33 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package string_byte_array_converter
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StringByteArrayConverter is a stateless converter that turns bytes into a
// Go string.
type StringByteArrayConverter struct{}

// NewStringByteArrayConverter returns a ready-to-use StringByteArrayConverter.
func NewStringByteArrayConverter() *StringByteArrayConverter {
	return new(StringByteArrayConverter)
}

// Convert returns in converted to a string; the error is always nil.
func (c *StringByteArrayConverter) Convert(in []byte) (interface{}, error) {
	text := string(in)
	return text, nil
}
// Constructor returns a new StringByteArrayConverter as an
// analysis.ByteArrayConverter. Neither config nor cache is consulted, and the
// returned error is always nil.
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
	converter := NewStringByteArrayConverter()
	return converter, nil
}
// init runs at package load and hands the name "string" and Constructor to
// registry.RegisterByteArrayConverter.
func init() {
registry.RegisterByteArrayConverter("string", Constructor)
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package html_char_filter
import (
"regexp"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the identifier this file's init passes to registry.RegisterCharFilter.
const Name = "html"
// htmlCharFilterRegexp is compiled once at package load. It matches a
// tag-like span: '<', an optional '/', one or more word characters or '!',
// then either a run of attributes (a word name, optionally followed by '='
// and a double-quoted, single-quoted or unquoted value) or only whitespace,
// then an optional '/' and the closing '>'.
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
// CharFilterConstructor returns a regexp char filter built from
// htmlCharFilterRegexp with a single space as the replacement. Neither
// config nor cache is consulted, and the returned error is always nil.
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
	filter := regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, []byte(" "))
	return filter, nil
}
// init runs at package load and hands Name and CharFilterConstructor to
// registry.RegisterCharFilter.
func init() {
registry.RegisterCharFilter(Name, CharFilterConstructor)
}

View File

@ -0,0 +1,58 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package regexp_char_filter
import (
"bytes"
"fmt"
"regexp"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the identifier this file's init passes to registry.RegisterCharFilter.
const Name = "regexp"

// RegexpCharFilter rewrites every match of a regular expression in its input.
type RegexpCharFilter struct {
	r           *regexp.Regexp // pattern whose matches are rewritten
	replacement []byte         // bytes repeated once per byte of each match
}

// NewRegexpCharFilter returns a RegexpCharFilter that uses r to find matches
// and replacement to build the substituted bytes.
func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter {
	filter := new(RegexpCharFilter)
	filter.r = r
	filter.replacement = replacement
	return filter
}

// Filter returns a copy of input in which every match of the pattern is
// replaced by the replacement repeated len(match) times. With a one-byte
// replacement the output therefore has the same length as the input
// (presumably so that byte offsets stay aligned with the original text).
func (s *RegexpCharFilter) Filter(input []byte) []byte {
	expand := func(match []byte) []byte {
		return bytes.Repeat(s.replacement, len(match))
	}
	return s.r.ReplaceAllFunc(input, expand)
}
// RegexpCharFilterConstructor builds a RegexpCharFilter from config.
//
// config["regexp"] must be a string holding the pattern; it is an error
// if the key is missing, is not a string, or does not compile.
// config["replace"], when present as a string, overrides the default
// single-space replacement. cache is not used.
func RegexpCharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
	pattern, found := config["regexp"].(string)
	if !found {
		return nil, fmt.Errorf("must specify regexp")
	}

	compiled, err := regexp.Compile(pattern)
	if err != nil {
		return nil, fmt.Errorf("unable to build regexp char filter: %v", err)
	}

	replacement := []byte(" ")
	if custom, isString := config["replace"].(string); isString {
		replacement = []byte(custom)
	}

	return NewRegexpCharFilter(compiled, replacement), nil
}
// init registers RegexpCharFilterConstructor with the registry under Name.
func init() {
registry.RegisterCharFilter(Name, RegexpCharFilterConstructor)
}

View File

@ -0,0 +1,82 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package regexp_char_filter
import (
"reflect"
"regexp"
"testing"
)
// TestRegexpCharFilter checks that HTML tags are blanked out while the
// text between them, and every line break, is left in place.
func TestRegexpCharFilter(t *testing.T) {
	htmlTagPattern := `</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`
	htmlRegex := regexp.MustCompile(htmlTagPattern)

	// blank returns one space per byte of tag. Filter emits the
	// replacement once per matched byte, so this is exactly what a
	// matched tag turns into; deriving the padding from the tag itself
	// keeps the expectation independent of hand-counted whitespace.
	blank := func(tag string) string {
		padding := make([]byte, len(tag))
		for i := range padding {
			padding[i] = ' '
		}
		return string(padding)
	}

	tests := []struct {
		input  []byte
		output []byte
	}{
		{
			input: []byte("<!DOCTYPE html>\n" +
				"<html>\n" +
				"<body>\n" +
				"<h1>My First Heading</h1>\n" +
				"<p>My first paragraph.</p>\n" +
				"</body>\n" +
				"</html>"),
			output: []byte(blank("<!DOCTYPE html>") + "\n" +
				blank("<html>") + "\n" +
				blank("<body>") + "\n" +
				blank("<h1>") + "My First Heading" + blank("</h1>") + "\n" +
				blank("<p>") + "My first paragraph." + blank("</p>") + "\n" +
				blank("</body>") + "\n" +
				blank("</html>")),
		},
	}

	for _, test := range tests {
		filter := NewRegexpCharFilter(htmlRegex, []byte{' '})
		output := filter.Filter(test.input)
		if !reflect.DeepEqual(output, test.output) {
			t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
		}
	}
}
// TestZeroWidthNonJoinerCharFilter checks that each zero-width
// non-joiner (U+200C) is overwritten with spaces.
func TestZeroWidthNonJoinerCharFilter(t *testing.T) {
	zeroWidthNonJoinerPattern := `\x{200C}`
	zeroWidthNonJoinerRegex := regexp.MustCompile(zeroWidthNonJoinerPattern)

	// U+200C occupies three bytes in UTF-8 and Filter emits the
	// replacement once per matched byte, so every joiner becomes three
	// ASCII spaces. Escapes are used so the width cannot be lost to
	// whitespace reformatting.
	const pad = "\x20\x20\x20"

	tests := []struct {
		input  []byte
		output []byte
	}{
		{
			input:  []byte("water\u200Cunder\u200Cthe\u200Cbridge"),
			output: []byte("water" + pad + "under" + pad + "the" + pad + "bridge"),
		},
	}

	for _, test := range tests {
		filter := NewRegexpCharFilter(zeroWidthNonJoinerRegex, []byte{' '})
		output := filter.Filter(test.input)
		if !reflect.DeepEqual(output, test.output) {
			t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
		}
	}
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package zero_width_non_joiner
import (
"regexp"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the key under which this char filter is registered (see init).
const Name = "zero_width_spaces"
// zeroWidthNonJoinerRegexp matches the code point U+200C.
var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`)
// CharFilterConstructor returns a regexp char filter that overwrites
// every U+200C in the input with spaces. Neither config nor cache is
// consulted.
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
	return regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, []byte(" ")), nil
}
// init registers CharFilterConstructor with the registry under Name.
func init() {
registry.RegisterCharFilter(Name, CharFilterConstructor)
}

View File

@ -0,0 +1,40 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package html_char_filter
import (
"time"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the key under which this date/time parser is registered (see init).
// NOTE(review): the package clause of this file reads html_char_filter,
// which looks like a copy/paste leftover from the HTML char filter —
// confirm before renaming, since importers refer to the package by name.
const Name = "dateTimeOptional"
// Go reference-time layouts for progressively less specific timestamps:
// date and time without a zone offset ("T" separated, then space
// separated), and a bare date.
const rfc3339NoTimezone = "2006-01-02T15:04:05"
const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05"
const rfc3339NoTime = "2006-01-02"
// layouts is the list handed to the flexible parser, ordered from the
// most specific layout to the least specific one.
var layouts = []string{
time.RFC3339Nano,
time.RFC3339,
rfc3339NoTimezone,
rfc3339NoTimezoneNoT,
rfc3339NoTime,
}
// DateTimeParserConstructor returns a flexible_go parser configured with
// the package-level layouts. Neither config nor cache is consulted.
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
	parser := flexible_go.NewFlexibleGoDateTimeParser(layouts)
	return parser, nil
}
// init registers DateTimeParserConstructor with the registry under Name.
func init() {
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
}

View File

@ -0,0 +1,59 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package flexible_go
import (
"fmt"
"time"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// Name is the key under which the flexible Go date/time parser is
// registered (see init).
const Name = "flexiblego"
// FlexibleGoDateTimeParser parses timestamps by trying a list of Go
// time layouts in order.
type FlexibleGoDateTimeParser struct {
	layouts []string // candidate layouts, tried first to last
}

// NewFlexibleGoDateTimeParser returns a parser that tries layouts in the
// order given. The slice is stored as is, not copied.
func NewFlexibleGoDateTimeParser(layouts []string) *FlexibleGoDateTimeParser {
	parser := new(FlexibleGoDateTimeParser)
	parser.layouts = layouts
	return parser
}

// ParseDateTime returns the result of the first layout that parses
// input successfully. If no layout matches it returns the zero
// time.Time together with analysis.ErrInvalidDateTime.
func (p *FlexibleGoDateTimeParser) ParseDateTime(input string) (time.Time, error) {
	for i := 0; i < len(p.layouts); i++ {
		if parsed, err := time.Parse(p.layouts[i], input); err == nil {
			return parsed, nil
		}
	}
	return time.Time{}, analysis.ErrInvalidDateTime
}
// FlexibleGoDateTimeParserConstructor builds a FlexibleGoDateTimeParser
// from config["layouts"], which must be a []interface{}; otherwise an
// error is returned. Entries that are not strings are skipped without
// an error. cache is not used.
func FlexibleGoDateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
	rawLayouts, found := config["layouts"].([]interface{})
	if !found {
		return nil, fmt.Errorf("must specify layouts")
	}

	accepted := make([]string, 0)
	for i := range rawLayouts {
		if layout, isString := rawLayouts[i].(string); isString {
			accepted = append(accepted, layout)
		}
	}

	return NewFlexibleGoDateTimeParser(accepted), nil
}
// init registers FlexibleGoDateTimeParserConstructor with the registry
// under Name.
func init() {
registry.RegisterDateTimeParser(Name, FlexibleGoDateTimeParserConstructor)
}

View File

@ -0,0 +1,84 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package flexible_go
import (
"reflect"
"testing"
"time"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)
func TestFlexibleDateTimeParser(t *testing.T) {
testLocation := time.FixedZone("", -8*60*60)
tests := []struct {
input string
expectedTime time.Time
expectedError error
}{
{
input: "2014-08-03",
expectedTime: time.Date(2014, 8, 3, 0, 0, 0, 0, time.UTC),
expectedError: nil,
},
{
input: "2014-08-03T15:59:30",
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, time.UTC),
expectedError: nil,
},
{
input: "2014-08-03 15:59:30",
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, time.UTC),
expectedError: nil,
},
{
input: "2014-08-03T15:59:30-08:00",
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, testLocation),
expectedError: nil,
},
{
input: "2014-08-03T15:59:30.999999999-08:00",
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 999999999, testLocation),
expectedError: nil,
},
{
input: "not a date time",
expectedTime: time.Time{},
expectedError: analysis.ErrInvalidDateTime,
},
}
rfc3339NoTimezone := "2006-01-02T15:04:05"
rfc3339NoTimezoneNoT := "2006-01-02 15:04:05"
rfc3339NoTime := "2006-01-02"
dateOptionalTimeParser := NewFlexibleGoDateTimeParser(
[]string{
time.RFC3339Nano,
time.RFC3339,
rfc3339NoTimezone,
rfc3339NoTimezoneNoT,
rfc3339NoTime,
})
for _, test := range tests {
actualTime, actualErr := dateOptionalTimeParser.ParseDateTime(test.input)
if actualErr != test.expectedError {
t.Errorf("expected error %#v, got %#v", test.expectedError, actualErr)
continue
}
if !reflect.DeepEqual(actualTime, test.expectedTime) {
t.Errorf("expected time %#v, got %#v", test.expectedTime, actualTime)
t.Errorf("expected location %#v,\n got %#v", test.expectedTime.Location(), actualTime.Location())
}
}
}

View File

@ -0,0 +1,88 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package analysis
// TokenLocation records one occurrence of a term.
type TokenLocation struct {
	Field    string // set by MergeAll on locations merged in from another field
	Start    int    // copied from the token's Start
	End      int    // copied from the token's End
	Position int    // copied from the token's Position
}

// TokenFreq groups every recorded location of a single term.
type TokenFreq struct {
	Term      []byte
	Locations []*TokenLocation
}

// TokenFrequencies is a list of per-term frequency entries.
type TokenFrequencies []*TokenFreq

// MergeAll folds other into tfs and returns the combined list.
//
// Every location in other is tagged with remoteField (the entries of
// other are modified in place). When a term from other already has an
// entry, its locations are appended to that existing entry, which is
// also modified in place; otherwise the entry from other is adopted.
//
// The result is ordered deterministically: entries of tfs in their
// original order, followed by newly adopted terms in the order they
// first appear in other. If tfs itself lists a term more than once,
// only the last such entry is kept, at the position of the first.
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) TokenFrequencies {
	// term -> position of its entry in rv
	index := make(map[string]int, len(tfs)+len(other))
	rv := make(TokenFrequencies, 0, len(tfs)+len(other))

	for _, tf := range tfs {
		key := string(tf.Term)
		if pos, dup := index[key]; dup {
			rv[pos] = tf // last duplicate wins
			continue
		}
		index[key] = len(rv)
		rv = append(rv, tf)
	}

	for _, tf := range other {
		// set the remoteField value in incoming token freqs
		for _, l := range tf.Locations {
			l.Field = remoteField
		}
		key := string(tf.Term)
		if pos, exists := index[key]; exists {
			rv[pos].Locations = append(rv[pos].Locations, tf.Locations...)
			continue
		}
		index[key] = len(rv)
		rv = append(rv, tf)
	}

	return rv
}
// TokenFrequency groups the tokens of a stream by term and returns one
// entry per distinct term, each holding the Start, End and Position of
// every occurrence in stream order.
//
// Entries are returned in the order in which each term first appears in
// tokens, so the result is deterministic. An empty stream gives an
// empty, non-nil result.
func TokenFrequency(tokens TokenStream) TokenFrequencies {
	index := make(map[string]*TokenFreq)
	rv := make(TokenFrequencies, 0)

	for _, token := range tokens {
		location := &TokenLocation{
			Start:    token.Start,
			End:      token.End,
			Position: token.Position,
		}
		if curr, ok := index[string(token.Term)]; ok {
			curr.Locations = append(curr.Locations, location)
			continue
		}
		entry := &TokenFreq{
			Term:      token.Term,
			Locations: []*TokenLocation{location},
		}
		index[string(token.Term)] = entry
		rv = append(rv, entry)
	}

	return rv
}

View File

@ -0,0 +1,167 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package analysis
import (
"reflect"
"testing"
)
// TestTokenFrequency checks that two occurrences of the same term are
// collapsed into one entry carrying both locations.
func TestTokenFrequency(t *testing.T) {
	stream := TokenStream{
		{Term: []byte("water"), Position: 1, Start: 0, End: 5},
		{Term: []byte("water"), Position: 2, Start: 6, End: 11},
	}

	want := TokenFrequencies{
		{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				{Position: 1, Start: 0, End: 5},
				{Position: 2, Start: 6, End: 11},
			},
		},
	}

	if got := TokenFrequency(stream); !reflect.DeepEqual(got, want) {
		t.Errorf("expected %#v, got %#v", want, got)
	}
}
// TestTokenFrequenciesMergeAll merges two lists that share a term and
// checks that the returned entry holds the receiver's locations followed
// by the merged-in ones, the latter tagged with the remote field name.
func TestTokenFrequenciesMergeAll(t *testing.T) {
	tf1 := TokenFrequencies{
		{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				{Position: 1, Start: 0, End: 5},
				{Position: 2, Start: 6, End: 11},
			},
		},
	}
	tf2 := TokenFrequencies{
		{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				{Position: 1, Start: 0, End: 5},
				{Position: 2, Start: 6, End: 11},
			},
		},
	}
	expectedResult := TokenFrequencies{
		{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				{Position: 1, Start: 0, End: 5},
				{Position: 2, Start: 6, End: 11},
				{Field: "tf2", Position: 1, Start: 0, End: 5},
				{Field: "tf2", Position: 2, Start: 6, End: 11},
			},
		},
	}

	// Assert on the value MergeAll returns rather than on the receiver:
	// the receiver only reflects the merge when the term already existed
	// in it, so checking it would miss terms that are new.
	result := tf1.MergeAll("tf2", tf2)
	if !reflect.DeepEqual(result, expectedResult) {
		t.Errorf("expected %#v, got %#v", expectedResult, result)
	}
}
// TestTokenFrequenciesMergeAllLeftEmpty merges into an empty list and
// checks that the incoming entry is returned with every location tagged
// with the remote field name.
func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) {
	incoming := TokenFrequencies{
		{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				{Position: 1, Start: 0, End: 5},
				{Position: 2, Start: 6, End: 11},
			},
		},
	}
	want := TokenFrequencies{
		{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				{Field: "tf2", Position: 1, Start: 0, End: 5},
				{Field: "tf2", Position: 2, Start: 6, End: 11},
			},
		},
	}

	got := TokenFrequencies{}.MergeAll("tf2", incoming)
	if !reflect.DeepEqual(got, want) {
		t.Errorf("expected %#v, got %#v", want, got)
	}
}

View File

@ -0,0 +1,59 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the key under which the Arabic analyzer is registered
// (see init).
const AnalyzerName = "ar"
// AnalyzerConstructor assembles the Arabic analyzer: the unicode
// tokenizer followed by, in order, lower-casing, NFKC normalization,
// Arabic stop-word removal, Arabic normalization and Arabic stemming.
// All components except the NFKC filter are looked up in cache; the
// first failed lookup aborts construction with that error. config is
// not used.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}

	// Exactly five filters end up in the chain.
	filters := make([]analysis.TokenFilter, 0, 5)

	lowerCase, err := cache.TokenFilterNamed(lower_case_filter.Name)
	if err != nil {
		return nil, err
	}
	filters = append(filters, lowerCase)

	// Built directly instead of via the cache; panics if it cannot be created.
	filters = append(filters, unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC))

	for _, name := range []string{StopName, NormalizeName, StemmerName} {
		filter, err := cache.TokenFilterNamed(name)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor with the registry under AnalyzerName.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,179 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestArabicAnalyzer runs the registered "ar" analyzer end to end and
// compares the produced tokens (term, position and byte offsets) with
// the expected stream for each input.
func TestArabicAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("كبير"),
			output: analysis.TokenStream{
				{Term: []byte("كبير"), Position: 1, Start: 0, End: 8},
			},
		},
		// feminine marker
		{
			input: []byte("كبيرة"),
			output: analysis.TokenStream{
				{Term: []byte("كبير"), Position: 1, Start: 0, End: 10},
			},
		},
		{
			input: []byte("مشروب"),
			output: analysis.TokenStream{
				{Term: []byte("مشروب"), Position: 1, Start: 0, End: 10},
			},
		},
		// plural -at
		{
			input: []byte("مشروبات"),
			output: analysis.TokenStream{
				{Term: []byte("مشروب"), Position: 1, Start: 0, End: 14},
			},
		},
		// plural -in
		{
			input: []byte("أمريكيين"),
			output: analysis.TokenStream{
				{Term: []byte("امريك"), Position: 1, Start: 0, End: 16},
			},
		},
		// singular with bare alif
		{
			input: []byte("امريكي"),
			output: analysis.TokenStream{
				{Term: []byte("امريك"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			input: []byte("كتاب"),
			output: analysis.TokenStream{
				{Term: []byte("كتاب"), Position: 1, Start: 0, End: 8},
			},
		},
		// definite article
		{
			input: []byte("الكتاب"),
			output: analysis.TokenStream{
				{Term: []byte("كتاب"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			input: []byte("ما ملكت أيمانكم"),
			output: analysis.TokenStream{
				{Term: []byte("ملكت"), Position: 2, Start: 5, End: 13},
				{Term: []byte("ايمانكم"), Position: 3, Start: 14, End: 28},
			},
		},
		// stopwords
		{
			input: []byte("الذين ملكت أيمانكم"),
			output: analysis.TokenStream{
				{Term: []byte("ملكت"), Position: 2, Start: 11, End: 19},
				{Term: []byte("ايمانكم"), Position: 3, Start: 20, End: 34},
			},
		},
		// presentation form normalization
		{
			input: []byte("ﺍﻟﺴﻼﻢ"),
			output: analysis.TokenStream{
				{Term: []byte("سلام"), Position: 1, Start: 0, End: 15},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
			// Only dump the first terms when both streams have one;
			// indexing an empty stream would panic and hide the
			// mismatch reported above.
			if len(test.output) > 0 && len(actual) > 0 {
				t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
			}
		}
	}
}

View File

@ -0,0 +1,80 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"bytes"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// NormalizeName is the key under which the Arabic normalization token
// filter is registered (see init).
const NormalizeName = "normalize_ar"
// Code points examined by normalize: the alef and yeh variants and teh
// marbuta are rewritten to a base letter, while tatweel and the
// diacritics from Fathatan through Sukun are deleted.
const (
Alef = '\u0627'
AlefMadda = '\u0622'
AlefHamzaAbove = '\u0623'
AlefHamzaBelow = '\u0625'
Yeh = '\u064A'
DotlessYeh = '\u0649'
TehMarbuta = '\u0629'
Heh = '\u0647'
Tatweel = '\u0640'
Fathatan = '\u064B'
Dammatan = '\u064C'
Kasratan = '\u064D'
Fatha = '\u064E'
Damma = '\u064F'
Kasra = '\u0650'
Shadda = '\u0651'
Sukun = '\u0652'
)
// ArabicNormalizeFilter is a stateless token filter that applies
// normalize to every term.
type ArabicNormalizeFilter struct{}

// NewArabicNormalizeFilter returns a ready-to-use ArabicNormalizeFilter.
func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
	return new(ArabicNormalizeFilter)
}

// Filter replaces the term of every token in input with its normalized
// form. Tokens are modified in place and the same stream is returned.
func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		input[i].Term = normalize(input[i].Term)
	}
	return input
}
// normalize decodes input into runes, maps the alef variants to Alef,
// DotlessYeh to Yeh and TehMarbuta to Heh, drops tatweel and the listed
// diacritics, and re-encodes the remaining runes as a term.
func normalize(input []byte) []byte {
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); {
		switch runes[i] {
		case AlefMadda, AlefHamzaAbove, AlefHamzaBelow:
			runes[i] = Alef
		case DotlessYeh:
			runes[i] = Yeh
		case TehMarbuta:
			runes[i] = Heh
		case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
			runes = analysis.DeleteRune(runes, i)
			// Stay on the same index: the next rune has moved into it.
			continue
		}
		i++
	}
	return analysis.BuildTermFromRunes(runes)
}
// NormalizerFilterConstructor returns a new ArabicNormalizeFilter.
// Neither config nor cache is consulted.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	filter := NewArabicNormalizeFilter()
	return filter, nil
}
// init registers NormalizerFilterConstructor with the registry under
// NormalizeName.
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -0,0 +1,229 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)
func TestArabicNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// AlifMadda
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("آجن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اجن"),
},
},
},
// AlifHamzaAbove
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("أحمد"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("احمد"),
},
},
},
// AlifHamzaBelow
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("إعاذ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اعاذ"),
},
},
},
// AlifMaksura
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بنى"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بني"),
},
},
},
// TehMarbuta
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("فاطمة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("فاطمه"),
},
},
},
// Tatweel
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("روبرـــــت"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("روبرت"),
},
},
},
// Fatha
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("مَبنا"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("مبنا"),
},
},
},
// Kasra
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("علِي"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("علي"),
},
},
},
// Damma
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بُوات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بوات"),
},
},
},
// Fathatan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولداً"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدا"),
},
},
},
// Kasratan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدٍ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولد"),
},
},
},
// Dammatan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدٌ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولد"),
},
},
},
// Sukun
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("نلْسون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("نلسون"),
},
},
},
// Shaddah
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("هتميّ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هتمي"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
arabicNormalizeFilter := NewArabicNormalizeFilter()
for _, test := range tests {
actual := arabicNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,113 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"bytes"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StemmerName is the key under which the Arabic stemmer token filter is
// registered (see init).
const StemmerName = "stemmer_ar"
// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
// prefixes lists the candidate prefixes; stem tries them in this order
// and removes at most one.
var prefixes = [][]rune{
[]rune("ال"),
[]rune("وال"),
[]rune("بال"),
[]rune("كال"),
[]rune("فال"),
[]rune("لل"),
[]rune("و"),
}
// suffixes lists the candidate suffixes; stem walks them in this order
// and removes every one that applies to the word as it stands at that
// point.
var suffixes = [][]rune{
[]rune("ها"),
[]rune("ان"),
[]rune("ات"),
[]rune("ون"),
[]rune("ين"),
[]rune("يه"),
[]rune("ية"),
[]rune("ه"),
[]rune("ة"),
[]rune("ي"),
}
// ArabicStemmerFilter is a stateless token filter that applies stem to
// every term.
type ArabicStemmerFilter struct{}

// NewArabicStemmerFilter returns a ready-to-use ArabicStemmerFilter.
func NewArabicStemmerFilter() *ArabicStemmerFilter {
	return new(ArabicStemmerFilter)
}

// Filter replaces the term of every token in input with its stem.
// Tokens are modified in place and the same stream is returned.
func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		input[i].Term = stem(input[i].Term)
	}
	return input
}
// canStemPrefix reports whether prefix may be stripped from input:
// input must start with prefix and at least two runes must remain
// afterwards — three when the prefix is a single rune.
func canStemPrefix(input, prefix []rune) bool {
	remaining := len(input) - len(prefix)
	switch {
	case len(prefix) == 1 && len(input) < 4:
		// One-rune prefix (wa-): the word must be at least four runes long.
		return false
	case remaining < 2:
		// Every other prefix must leave at least two runes behind.
		return false
	}
	for i, r := range prefix {
		if input[i] != r {
			return false
		}
	}
	return true
}
// canStemSuffix reports whether suffix may be stripped from input:
// input must end with suffix and at least two runes must remain
// afterwards.
func canStemSuffix(input, suffix []rune) bool {
	stemLen := len(input) - len(suffix)
	if stemLen < 2 {
		return false
	}
	tail := input[stemLen:]
	for i, r := range suffix {
		if tail[i] != r {
			return false
		}
	}
	return true
}
// stem decodes input into runes, removes the first entry of prefixes
// that canStemPrefix accepts, then removes each entry of suffixes that
// canStemSuffix accepts (checked in list order against the word as
// already shortened), and re-encodes the result as a term.
func stem(input []byte) []byte {
	word := bytes.Runes(input)

	// At most one prefix is removed.
	for i := range prefixes {
		if !canStemPrefix(word, prefixes[i]) {
			continue
		}
		word = word[len(prefixes[i]):]
		break
	}

	// Several suffixes may be removed in a single pass over the list.
	for i := range suffixes {
		if candidate := suffixes[i]; canStemSuffix(word, candidate) {
			word = word[:len(word)-len(candidate)]
		}
	}

	return analysis.BuildTermFromRunes(word)
}
// StemmerFilterConstructor returns a new ArabicStemmerFilter. Neither
// config nor cache is consulted.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	filter := NewArabicStemmerFilter()
	return filter, nil
}
// init registers StemmerFilterConstructor with the registry under
// StemmerName.
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,392 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)
func TestArabicStemmerFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// AlPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// WalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("والحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// BalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// KalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("كالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// FalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("فالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// LlPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("للاخر"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اخر"),
},
},
},
// WaPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("وحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// AhSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("زوجها"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("زوج"),
},
},
},
// AnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدان"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// AtSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// WnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدين"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YhSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهديه"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YpSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدية"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// HSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهده"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// PSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدي"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// ComboPrefSuf
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("وساهدون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// ComboSuf
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدهات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// ShouldntStem
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الو"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("الو"),
},
},
},
// NonArabic
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("English"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("English"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("السلام"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلامة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("السلامة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الوصل"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("وصل"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("والصل"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("صل"),
},
},
},
// Empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
arabicStemmerFilter := NewArabicStemmerFilter()
for _, test := range tests {
actual := arabicStemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds the Arabic stop token filter from the
// token map that the cache holds under StopName. The config argument is not
// consulted. If the token map lookup fails, that error is returned unchanged
// together with a nil filter.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,149 @@
package ar
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the name under which the Arabic stop word token map is
// registered with the registry.
const StopName = "stop_ar"
// ArabicStopWords is the raw stop word list that TokenMapConstructor loads
// into a token map.
//
// The list was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
// Every backtick (`) was changed to an apostrophe (') so that the list can be
// embedded in a raw string literal.
var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
# This means that when modifying this list, you might need to add some
# redundant entries, for example containing forms with both أ and ا
من
ومن
منها
منه
في
وفي
فيها
فيه
و
ف
ثم
او
أو
ب
بها
به
ا
أ
اى
اي
أي
أى
لا
ولا
الا
ألا
إلا
لكن
ما
وما
كما
فما
عن
مع
اذا
إذا
ان
أن
إن
انها
أنها
إنها
انه
أنه
إنه
بان
بأن
فان
فأن
وان
وأن
وإن
التى
التي
الذى
الذي
الذين
الى
الي
إلى
إلي
على
عليها
عليه
اما
أما
إما
ايضا
أيضا
كل
وكل
لم
ولم
لن
ولن
هى
هي
هو
وهى
وهي
وهو
فهى
فهي
فهو
انت
أنت
لك
لها
له
هذه
هذا
تلك
ذلك
هناك
كانت
كان
يكون
تكون
وكانت
وكان
غير
بعض
قد
نحو
بين
بينما
منذ
ضمن
حيث
الان
الآن
خلال
بعد
قبل
حتى
عند
عندما
لدى
جميع
`)
// TokenMapConstructor creates a fresh token map and loads ArabicStopWords
// into it. Neither config nor cache is consulted. The map is returned even
// when loading fails, alongside the load error.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if err := stopWords.LoadBytes(ArabicStopWords); err != nil {
		return stopWords, err
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry under StopName.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bg
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor looks up the Bulgarian stop word token map in
// the cache (by StopName) and wraps it in a stop tokens filter. config is
// ignored. A lookup error is passed straight back with a nil filter.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,217 @@
package bg
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the name under which the Bulgarian stop word token map is
// registered with the registry.
const StopName = "stop_bg"
// BulgarianStopWords is the raw stop word list that TokenMapConstructor loads
// into a token map.
//
// The list was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// Every backtick (`) was changed to an apostrophe (') so that the list can be
// embedded in a raw string literal.
var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
а
аз
ако
ала
бе
без
беше
би
бил
била
били
било
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главно
го
д
да
дали
до
докато
докога
дори
досега
доста
е
едва
един
ето
за
зад
заедно
заради
засега
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
ли
м
ме
между
мен
ми
мнозина
мога
могат
може
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нея
ни
ние
никой
нито
но
някои
някой
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първо
с
са
само
се
сега
си
скоро
след
сме
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
тн
то
това
тогава
този
той
толкова
точно
трябва
тук
тъй
тя
тях
у
харесва
ч
че
често
чрез
ще
щом
я
`)
// TokenMapConstructor returns a new token map populated from
// BulgarianStopWords. Both arguments are unused. Any error from loading the
// list is returned together with the map.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if err := stopWords.LoadBytes(BulgarianStopWords); err != nil {
		return stopWords, err
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry under StopName.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,30 @@
package ca
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// ArticlesName is the name under which the Catalan articles token map is
// registered with the registry.
const ArticlesName = "articles_ca"
// CatalanArticles is the raw, one-entry-per-line list that
// ArticlesTokenMapConstructor loads into a token map.
//
// The list was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var CatalanArticles = []byte(`
d
l
m
n
s
t
`)
// ArticlesTokenMapConstructor creates a new token map and fills it from
// CatalanArticles. config and cache are not used. The map is returned
// together with whatever error loading produced.
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	articles := analysis.NewTokenMap()
	if err := articles.LoadBytes(CatalanArticles); err != nil {
		return articles, err
	}
	return articles, nil
}
// init registers ArticlesTokenMapConstructor with the registry under
// ArticlesName.
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,32 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"fmt"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// ElisionName is the name under which the Catalan elision token filter is
// registered with the registry.
const ElisionName = "elision_ca"
// ElisionFilterConstructor builds an elision filter from the token map the
// cache holds under ArticlesName. config is not consulted. When the token
// map lookup fails, the error is reported with an "error building elision
// filter" prefix and no filter is returned.
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	articles, err := cache.TokenMapNamed(ArticlesName)
	if err != nil {
		return nil, fmt.Errorf("error building elision filter: %v", err)
	}
	filter := elision_filter.NewElisionFilter(articles)
	return filter, nil
}
// init registers ElisionFilterConstructor with the registry under ElisionName.
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}

View File

@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestFrenchElision runs the token filter registered under ElisionName over
// tokens carrying an elided article ("l'", "d'") and expects the article and
// apostrophe to be stripped. Despite the name, the filter under test is the
// one this package (ca) registers, built from the Catalan articles map.
func TestFrenchElision(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("l'Institut"),
				},
				&analysis.Token{
					Term: []byte("d'Estudis"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Institut"),
				},
				&analysis.Token{
					Term: []byte("Estudis"),
				},
			},
		},
	}

	cache := registry.NewCache()
	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := elisionFilter.Filter(test.input)
		if reflect.DeepEqual(actual, test.output) {
			continue
		}

		// Report every differing token rather than only the first one, and
		// never index into a stream that is shorter than expected.
		reported := false
		if len(actual) == len(test.output) {
			for i := range test.output {
				want, got := test.output[i], actual[i]
				if reflect.DeepEqual(got, want) {
					continue
				}
				reported = true
				if want == nil || got == nil {
					t.Errorf("token %d: expected %v, got %v", i, want, got)
					continue
				}
				t.Errorf("token %d: expected %s, got %s", i, want.Term, got.Term)
			}
		}
		// Length mismatch, or a difference no per-token comparison caught
		// (for example a nil stream versus an empty one).
		if !reported {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor fetches the Catalan stop word token map from
// the cache under StopName and returns a stop tokens filter built on it.
// config is unused. A failed lookup yields a nil filter and the lookup error.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,244 @@
package ca
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the name under which the Catalan stop word token map is
// registered with the registry.
const StopName = "stop_ca"
// CatalanStopWords is the raw stop word list that TokenMapConstructor loads
// into a token map.
//
// The list was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// Every backtick (`) was changed to an apostrophe (') so that the list can be
// embedded in a raw string literal.
var CatalanStopWords = []byte(`# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
a
abans
ací
ah
així
això
al
als
aleshores
algun
alguna
algunes
alguns
alhora
allà
allí
allò
altra
altre
altres
amb
ambdós
ambdues
apa
aquell
aquella
aquelles
aquells
aquest
aquesta
aquestes
aquests
aquí
baix
cada
cadascú
cadascuna
cadascunes
cadascuns
com
contra
d'un
d'una
d'unes
d'uns
dalt
de
del
dels
des
després
dins
dintre
donat
doncs
durant
e
eh
el
els
em
en
encara
ens
entre
érem
eren
éreu
es
és
esta
està
estàvem
estaven
estàveu
esteu
et
etc
ets
fins
fora
gairebé
ha
han
has
havia
he
hem
heu
hi
ho
i
igual
iguals
ja
l'hi
la
les
li
li'n
llavors
m'he
ma
mal
malgrat
mateix
mateixa
mateixes
mateixos
me
mentre
més
meu
meus
meva
meves
molt
molta
moltes
molts
mon
mons
n'he
n'hi
ne
ni
no
nogensmenys
només
nosaltres
nostra
nostre
nostres
o
oh
oi
on
pas
pel
pels
per
però
perquè
poc
poca
pocs
poques
potser
propi
qual
quals
quan
quant
que
què
quelcom
qui
quin
quina
quines
quins
s'ha
s'han
sa
semblant
semblants
ses
seu
seus
seva
seva
seves
si
sobre
sobretot
sóc
solament
sols
son
són
sons
sota
sou
t'ha
t'han
t'he
ta
tal
també
tampoc
tan
tant
tanta
tantes
teu
teus
teva
teves
ton
tons
tot
tota
totes
tots
un
una
unes
uns
us
va
vaig
vam
van
vas
veu
vosaltres
vostra
vostre
vostres
`)
// TokenMapConstructor builds a new token map from CatalanStopWords. It does
// not read config or cache. The map is handed back even if loading reported
// an error, and that error is returned with it.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if err := stopWords.LoadBytes(CatalanStopWords); err != nil {
		return stopWords, err
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry under StopName.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,49 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which the CJK analyzer is registered with
// the registry.
const AnalyzerName = "cjk"
// AnalyzerConstructor assembles the CJK analyzer: the whitespace tokenizer
// followed by, in order, an NFKD unicode normalization filter, the lower
// case filter and the CJK bigram filter. The tokenizer and the last two
// filters are resolved by name through cache; config is not consulted. The
// first failed lookup aborts construction and its error is returned.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
	if err != nil {
		return nil, err
	}

	nfkd := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)

	lowerCase, err := cache.TokenFilterNamed(lower_case_filter.Name)
	if err != nil {
		return nil, err
	}

	bigrams, err := cache.TokenFilterNamed(BigramName)
	if err != nil {
		return nil, err
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: []analysis.TokenFilter{nfkd, lowerCase, bigrams},
	}, nil
}
// init registers AnalyzerConstructor with the registry under AnalyzerName.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,620 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestCJKAnalyzer feeds raw text through the analyzer registered under
// AnalyzerName and compares the complete token stream (term, type, position
// and byte offsets) with the expected one. The cases cover pure CJK runs,
// whitespace-separated runs, mixed CJK/alphanumeric text, non-CJK scripts and
// single-character input.
func TestCJKAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("こんにちは世界"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こん"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("んに"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("にち"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("ちは"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("は世"),
					Type:     analysis.Double,
					Position: 5,
					Start:    12,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Double,
					Position: 6,
					Start:    15,
					End:      21,
				},
			},
		},
		{
			input: []byte("一二三四五六七八九十"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一二"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("二三"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("三四"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("四五"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("五六"),
					Type:     analysis.Double,
					Position: 5,
					Start:    12,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("六七"),
					Type:     analysis.Double,
					Position: 6,
					Start:    15,
					End:      21,
				},
				&analysis.Token{
					Term:     []byte("七八"),
					Type:     analysis.Double,
					Position: 7,
					Start:    18,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("八九"),
					Type:     analysis.Double,
					Position: 8,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("九十"),
					Type:     analysis.Double,
					Position: 9,
					Start:    24,
					End:      30,
				},
			},
		},
		{
			input: []byte("一 二三四 五六七八九 十"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("二三"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
				&analysis.Token{
					Term:     []byte("三四"),
					Type:     analysis.Double,
					Position: 3,
					Start:    7,
					End:      13,
				},
				&analysis.Token{
					Term:     []byte("五六"),
					Type:     analysis.Double,
					Position: 5,
					Start:    14,
					End:      20,
				},
				&analysis.Token{
					Term:     []byte("六七"),
					Type:     analysis.Double,
					Position: 6,
					Start:    17,
					End:      23,
				},
				&analysis.Token{
					Term:     []byte("七八"),
					Type:     analysis.Double,
					Position: 7,
					Start:    20,
					End:      26,
				},
				&analysis.Token{
					Term:     []byte("八九"),
					Type:     analysis.Double,
					Position: 8,
					Start:    23,
					End:      29,
				},
				&analysis.Token{
					Term:     []byte("十"),
					Type:     analysis.Single,
					Position: 10,
					Start:    30,
					End:      33,
				},
			},
		},
		{
			input: []byte("abc defgh ijklmn opqrstu vwxy z"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("abc"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("defgh"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("ijklmn"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    10,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("opqrstu"),
					Type:     analysis.AlphaNumeric,
					Position: 4,
					Start:    17,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("vwxy"),
					Type:     analysis.AlphaNumeric,
					Position: 5,
					Start:    25,
					End:      29,
				},
				&analysis.Token{
					Term:     []byte("z"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    30,
					End:      31,
				},
			},
		},
		{
			input: []byte("あい"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
			},
		},
		{
			input: []byte("あい "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
			},
		},
		{
			input: []byte("test"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
			},
		},
		{
			input: []byte("test "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
			},
		},
		{
			input: []byte("あいtest"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    6,
					End:      10,
				},
			},
		},
		{
			input: []byte("testあい "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
			},
		},
		{
			input: []byte("あいうえおabcかきくけこ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("いう"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("うえ"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("えお"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("abc"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    15,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("かき"),
					Type:     analysis.Double,
					Position: 7,
					Start:    18,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("きく"),
					Type:     analysis.Double,
					Position: 8,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("くけ"),
					Type:     analysis.Double,
					Position: 9,
					Start:    24,
					End:      30,
				},
				&analysis.Token{
					Term:     []byte("けこ"),
					Type:     analysis.Double,
					Position: 10,
					Start:    27,
					End:      33,
				},
			},
		},
		{
			input: []byte("あいうえおabんcかきくけ こ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("いう"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("うえ"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("えお"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("ab"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    15,
					End:      17,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Single,
					Position: 7,
					Start:    17,
					End:      20,
				},
				&analysis.Token{
					Term:     []byte("c"),
					Type:     analysis.AlphaNumeric,
					Position: 8,
					Start:    20,
					End:      21,
				},
				&analysis.Token{
					Term:     []byte("かき"),
					Type:     analysis.Double,
					Position: 9,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("きく"),
					Type:     analysis.Double,
					Position: 10,
					Start:    24,
					End:      30,
				},
				&analysis.Token{
					Term:     []byte("くけ"),
					Type:     analysis.Double,
					Position: 11,
					Start:    27,
					End:      33,
				},
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Single,
					Position: 13,
					Start:    34,
					End:      37,
				},
			},
		},
		{
			input: []byte("一 روبرت موير"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("روبرت"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      14,
				},
				&analysis.Token{
					Term:     []byte("موير"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    15,
					End:      23,
				},
			},
		},
		{
			input: []byte("一 رُوبرت موير"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("رُوبرت"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("موير"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    17,
					End:      25,
				},
			},
		},
		{
			input: []byte("𩬅艱鍟䇹愯瀛"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("𩬅艱"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      7,
				},
				&analysis.Token{
					Term:     []byte("艱鍟"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
				&analysis.Token{
					Term:     []byte("鍟䇹"),
					Type:     analysis.Double,
					Position: 3,
					Start:    7,
					End:      13,
				},
				&analysis.Token{
					Term:     []byte("䇹愯"),
					Type:     analysis.Double,
					Position: 4,
					Start:    10,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("愯瀛"),
					Type:     analysis.Double,
					Position: 5,
					Start:    13,
					End:      19,
				},
			},
		},
		{
			input: []byte("一"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
			},
		},
		{
			input: []byte("一丁丂"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一丁"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("丁丂"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
			},
		},
	}

	// The analyzer lookup does not depend on the test case, so resolve it once
	// and fail immediately if it is unavailable.
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			// Include the input so a failure identifies its test case.
			t.Errorf("input %q: expected %v, got %v", test.input, test.output, actual)
		}
	}
}

View File

@ -0,0 +1,166 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"container/ring"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// BigramName is the name under which the CJK bigram token filter is
// registered with the registry.
const BigramName = "cjk_bigram"
// CJKBigramFilter is a token filter that combines adjacent ideographic
// tokens into two-token bigrams.
type CJKBigramFilter struct {
	// outputUnigram makes Filter emit single-token unigrams in addition to
	// the bigrams it produces.
	outputUnigram bool
}

// NewCJKBigramFilter returns a CJKBigramFilter whose unigram output is
// switched on or off by outputUnigram.
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
	filter := new(CJKBigramFilter)
	filter.outputUnigram = outputUnigram
	return filter
}
// Filter rewrites input so that runs of byte-adjacent ideographic tokens are
// emitted as overlapping bigrams, while every other token passes through
// unchanged. The two most recent ideographic tokens are kept in a ring of
// size two. An ideographic token that stands alone (no adjacent ideographic
// neighbour) is emitted as a unigram. When outputUnigram is set, unigrams are
// emitted alongside the bigrams as well.
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	window := ring.New(2)
	buffered := 0
	out := make(analysis.TokenStream, 0, len(input))

	for _, tok := range input {
		if tok.Type != analysis.Ideographic {
			// A non-ideographic token ends the current run: emit whatever
			// is pending, then pass the token through untouched.
			if pending := s.flush(window, &buffered); pending != nil {
				out = append(out, pending)
			}
			out = append(out, tok)
			continue
		}

		if buffered > 0 {
			// Only tokens whose byte offsets touch may be paired; a gap
			// ends the current run.
			last := window.Value.(*analysis.Token)
			if tok.Start != last.End {
				if pending := s.flush(window, &buffered); pending != nil {
					out = append(out, pending)
				}
			}
		}

		// Buffer this token as the newest entry of the window.
		window = window.Next()
		window.Value = tok
		if buffered < 2 {
			buffered++
		}

		if s.outputUnigram && buffered > 1 {
			if unigram := s.buildUnigram(window, &buffered); unigram != nil {
				out = append(out, unigram)
			}
		}
		if bigram := s.outputBigram(window, &buffered); bigram != nil {
			out = append(out, bigram)
		}
	}

	// Deal with a possible trailing unigram.
	if buffered == 1 || s.outputUnigram {
		if buffered == 2 {
			window = window.Next()
		}
		if unigram := s.buildUnigram(window, &buffered); unigram != nil {
			out = append(out, unigram)
		}
	}
	return out
}
// flush empties the buffer: it clears the value stored at r and resets
// *itemsInRing to zero. If exactly one token was buffered, that token is
// returned as a unigram; otherwise the result is nil.
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int) *analysis.Token {
	var pending *analysis.Token
	if buffered := *itemsInRing; buffered == 1 {
		pending = s.buildUnigram(r, itemsInRing)
	}
	r.Value, *itemsInRing = nil, 0
	return pending
}
// outputBigram returns a Double token made from the two buffered tokens, or
// nil unless exactly two tokens are buffered. The term is the previous
// token's term followed by the term of the token at r. Position and Start
// come from the previous token, End from the token at r.
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
	if *itemsInRing != 2 {
		return nil
	}

	// Step back to the older entry, then forward again to the newer one.
	olderSlot := r.Move(-1)
	older := olderSlot.Value.(*analysis.Token)
	term := append([]byte{}, older.Term...)

	newer := olderSlot.Next().Value.(*analysis.Token)
	term = append(term, newer.Term...)

	return &analysis.Token{
		Type:     analysis.Double,
		Term:     term,
		Position: older.Position,
		Start:    older.Start,
		End:      newer.End,
	}
}
// buildUnigram returns a Single token copying the term, position and offsets
// of one buffered token: the entry before r when two tokens are buffered,
// the entry at r when one is buffered. For any other count it returns nil.
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
	var src *analysis.Token
	switch *itemsInRing {
	case 2:
		src = r.Move(-1).Value.(*analysis.Token)
	case 1:
		src = r.Value.(*analysis.Token)
	default:
		return nil
	}
	return &analysis.Token{
		Type:     analysis.Single,
		Term:     src.Term,
		Position: src.Position,
		Start:    src.Start,
		End:      src.End,
	}
}
// CJKBigramFilterConstructor creates a CJK bigram filter from config. The
// optional "output_unigram" entry enables unigram output when it holds a
// bool; a missing or non-bool entry leaves it disabled. cache is not used
// and the returned error is always nil.
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	// A failed assertion yields the zero value, false.
	outputUnigram, _ := config["output_unigram"].(bool)
	return NewCJKBigramFilter(outputUnigram), nil
}
// init registers CJKBigramFilterConstructor with the registry under
// BigramName.
func init() {
registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
}

View File

@ -0,0 +1,420 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)
// TestCJKBigramFilter runs CJKBigramFilter over fixed token streams, with and
// without unigram output, and compares each result with the expected stream.
func TestCJKBigramFilter(t *testing.T) {
	// One small builder per token type used by the fixtures below.
	ideographic := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{Term: []byte(term), Type: analysis.Ideographic, Position: position, Start: start, End: end}
	}
	alphaNumeric := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{Term: []byte(term), Type: analysis.AlphaNumeric, Position: position, Start: start, End: end}
	}
	single := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{Term: []byte(term), Type: analysis.Single, Position: position, Start: start, End: end}
	}
	double := func(term string, position, start, end int) *analysis.Token {
		return &analysis.Token{Term: []byte(term), Type: analysis.Double, Position: position, Start: start, End: end}
	}

	tests := []struct {
		outputUnigram bool
		input         analysis.TokenStream
		output        analysis.TokenStream
	}{
		{
			// the second token starts at 5 although the first one ends at 3
			outputUnigram: false,
			input: analysis.TokenStream{
				ideographic("こ", 1, 0, 3),
				ideographic("ん", 2, 5, 7),
			},
			output: analysis.TokenStream{
				single("こ", 1, 0, 3),
				single("ん", 2, 5, 7),
			},
		},
		{
			outputUnigram: false,
			input: analysis.TokenStream{
				ideographic("こ", 1, 0, 3),
				ideographic("ん", 2, 3, 6),
				ideographic("に", 3, 6, 9),
				ideographic("ち", 4, 9, 12),
				ideographic("は", 5, 12, 15),
				ideographic("世", 6, 15, 18),
				ideographic("界", 7, 18, 21),
			},
			output: analysis.TokenStream{
				double("こん", 1, 0, 6),
				double("んに", 2, 3, 9),
				double("にち", 3, 6, 12),
				double("ちは", 4, 9, 15),
				double("は世", 5, 12, 18),
				double("世界", 6, 15, 21),
			},
		},
		{
			outputUnigram: true,
			input: analysis.TokenStream{
				ideographic("こ", 1, 0, 3),
				ideographic("ん", 2, 3, 6),
				ideographic("に", 3, 6, 9),
				ideographic("ち", 4, 9, 12),
				ideographic("は", 5, 12, 15),
				ideographic("世", 6, 15, 18),
				ideographic("界", 7, 18, 21),
			},
			output: analysis.TokenStream{
				single("こ", 1, 0, 3),
				double("こん", 1, 0, 6),
				single("ん", 2, 3, 6),
				double("んに", 2, 3, 9),
				single("に", 3, 6, 9),
				double("にち", 3, 6, 12),
				single("ち", 4, 9, 12),
				double("ちは", 4, 9, 15),
				single("は", 5, 12, 15),
				double("は世", 5, 12, 18),
				single("世", 6, 15, 18),
				double("世界", 6, 15, 21),
				single("界", 7, 18, 21),
			},
		},
		{
			// an alphanumeric token in the middle of the ideographic run
			outputUnigram: false,
			input: analysis.TokenStream{
				ideographic("こ", 1, 0, 3),
				ideographic("ん", 2, 3, 6),
				ideographic("に", 3, 6, 9),
				ideographic("ち", 4, 9, 12),
				ideographic("は", 5, 12, 15),
				alphaNumeric("cat", 6, 12, 15),
				ideographic("世", 7, 18, 21),
				ideographic("界", 8, 21, 24),
			},
			output: analysis.TokenStream{
				double("こん", 1, 0, 6),
				double("んに", 2, 3, 9),
				double("にち", 3, 6, 12),
				double("ちは", 4, 9, 15),
				alphaNumeric("cat", 6, 12, 15),
				double("世界", 7, 18, 24),
			},
		},
	}

	for _, test := range tests {
		filter := NewCJKBigramFilter(test.outputUnigram)
		actual := filter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output, actual)
		}
	}
}

View File

@ -0,0 +1,58 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package ckb
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which AnalyzerConstructor is registered.
const AnalyzerName = "ckb"

// AnalyzerConstructor assembles the "ckb" analyzer from components looked up
// in cache: the ICU tokenizer followed by the Sorani normalization, lower-case,
// Sorani stop word and Sorani stemmer token filters. The first lookup that
// fails aborts construction and its error is returned. The config argument is
// not used.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(icu.Name)
	if err != nil {
		return nil, err
	}

	// Filters are looked up, and listed in the analyzer, in this order.
	filterNames := []string{NormalizeName, lower_case_filter.Name, StopName, StemmerName}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, name := range filterNames {
		filter, err := cache.TokenFilterNamed(name)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor with the registry under AnalyzerName.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,74 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package ckb
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
func TestSoraniAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stop word removal
{
input: []byte("ئەم پیاوە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 2,
Start: 7,
End: 17,
},
},
},
{
input: []byte("پیاوە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 10,
},
},
},
{
input: []byte("پیاو"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 8,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View File

@ -0,0 +1,113 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"bytes"
"unicode"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// NormalizeName is the name under which the Sorani normalization token filter
// is registered.
const NormalizeName = "normalize_ckb"
// Code points referenced by normalize, either as characters it looks for or as
// the characters it substitutes for them.
const (
Yeh = '\u064A'
DotlessYeh = '\u0649'
FarsiYeh = '\u06CC'
Kaf = '\u0643'
Keheh = '\u06A9'
Heh = '\u0647'
Ae = '\u06D5'
Zwnj = '\u200C'
HehDoachashmee = '\u06BE'
TehMarbuta = '\u0629'
Reh = '\u0631'
Rreh = '\u0695'
RrehAbove = '\u0692'
Tatweel = '\u0640'
Fathatan = '\u064B'
Dammatan = '\u064C'
Kasratan = '\u064D'
Fatha = '\u064E'
Damma = '\u064F'
Kasra = '\u0650'
Shadda = '\u0651'
Sukun = '\u0652'
)
// SoraniNormalizeFilter is a token filter that normalizes the term of every
// token passed through its Filter method. It holds no state.
type SoraniNormalizeFilter struct{}

// NewSoraniNormalizeFilter returns a new SoraniNormalizeFilter.
func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
	return new(SoraniNormalizeFilter)
}
// Filter replaces the Term of every token in input with its normalized form.
// Tokens are modified in place and the same stream is returned.
func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		input[i].Term = normalize(input[i].Term)
	}
	return input
}
// normalize decodes input into runes, rewrites or removes individual runes and
// returns the re-encoded term:
//
//   - Yeh and DotlessYeh become FarsiYeh, Kaf becomes Keheh
//   - Zwnj is removed; a Heh directly before it becomes Ae
//   - a Heh in last position becomes Ae, TehMarbuta becomes Ae
//   - HehDoachashmee becomes Heh
//   - a Reh in first position becomes Rreh, RrehAbove becomes Rreh
//   - Tatweel and the listed vowel/diacritic marks are removed
//   - any other rune in the Unicode Cf category is removed
func normalize(input []byte) []byte {
	runes := bytes.Runes(input)
	idx := 0
	for idx < len(runes) {
		remove := false
		switch current := runes[idx]; current {
		case Yeh, DotlessYeh:
			runes[idx] = FarsiYeh
		case Kaf:
			runes[idx] = Keheh
		case Zwnj:
			// the preceding rune is rewritten before the joiner is dropped
			if idx > 0 && runes[idx-1] == Heh {
				runes[idx-1] = Ae
			}
			remove = true
		case Heh:
			// "last" is judged against the runes still present at this point
			if idx == len(runes)-1 {
				runes[idx] = Ae
			}
		case TehMarbuta:
			runes[idx] = Ae
		case HehDoachashmee:
			runes[idx] = Heh
		case Reh:
			if idx == 0 {
				runes[idx] = Rreh
			}
		case RrehAbove:
			runes[idx] = Rreh
		case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
			remove = true
		default:
			remove = unicode.In(current, unicode.Cf)
		}

		if remove {
			// stay on the same index: the next rune has moved into it
			runes = analysis.DeleteRune(runes, idx)
			continue
		}
		idx++
	}
	return analysis.BuildTermFromRunes(runes)
}
// NormalizerFilterConstructor returns a new SoraniNormalizeFilter. Neither
// config nor cache is used and the returned error is always nil.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniNormalizeFilter(), nil
}
// init registers NormalizerFilterConstructor with the registry under NormalizeName.
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -0,0 +1,318 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)
func TestSoraniNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// test Y
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064A"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0649"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
// test K
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0643"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
},
// test H
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u200C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u200C\u06A9"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5\u06A9"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06BE"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0629"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5"),
},
},
},
// test final H
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u0647\u0647"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u0647\u06D5"),
},
},
},
// test RR
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0692"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0695"),
},
},
},
// test initial RR
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0631\u0631\u0631"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0695\u0631\u0631"),
},
},
},
// test remove
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0640"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064B"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064D"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064E"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064F"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0650"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0651"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0652"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u200C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
soraniNormalizeFilter := NewSoraniNormalizeFilter()
for _, test := range tests {
actual := soraniNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,143 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"bytes"
"unicode/utf8"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StemmerName is the name under which the Sorani stemmer token filter is
// registered.
const StemmerName = "stemmer_ckb"

// SoraniStemmerFilter is a token filter that stems the term of every token
// passed through its Filter method, except tokens marked as keywords. It
// holds no state.
type SoraniStemmerFilter struct{}

// NewSoraniStemmerFilter returns a new SoraniStemmerFilter.
func NewSoraniStemmerFilter() *SoraniStemmerFilter {
	return new(SoraniStemmerFilter)
}
// Filter replaces the Term of every token in input with its stem, skipping
// tokens whose KeyWord flag is set. Tokens are modified in place and the same
// stream is returned.
func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, tok := range input {
		if tok.KeyWord {
			// protected keyword: leave the term as it is
			continue
		}
		tok.Term = stem(tok.Term)
	}
	return input
}
// stemRule describes one suffix-stripping step: when a term has more than
// longerThan runes and ends in suffix, its last strip runes are removed.
type stemRule struct {
	longerThan int
	strip      int
	suffix     []byte
}

// postpositionRules are tried first; at most one of them is applied.
var postpositionRules = []stemRule{
	{5, 2, []byte("دا")},
	{4, 1, []byte("نا")},
	{6, 3, []byte("ەوە")},
}

// possessiveRules (possessive pronouns) are tried second; at most one of them
// is applied.
var possessiveRules = []stemRule{
	{6, 3, []byte("مان")},
	{6, 3, []byte("یان")},
	{6, 3, []byte("تان")},
}

// nounEndingRules are tried last and in this order; the first rule that
// matches decides the result.
var nounEndingRules = []stemRule{
	// indefinite singular ezafe
	{6, 3, []byte("ێکی")},
	{7, 4, []byte("یەکی")},
	// indefinite singular
	{5, 2, []byte("ێک")},
	{6, 3, []byte("یەک")},
	// definite singular
	{6, 3, []byte("ەکە")},
	{5, 2, []byte("کە")},
	// definite plural
	{7, 4, []byte("ەکان")},
	{6, 3, []byte("کان")},
	// indefinite plural ezafe
	{7, 4, []byte("یانی")},
	{6, 3, []byte("انی")},
	// indefinite plural
	{6, 3, []byte("یان")},
	{5, 2, []byte("ان")},
	// demonstrative plural
	{7, 4, []byte("یانە")},
	{6, 3, []byte("انە")},
	// demonstrative singular (only the last two runes are removed)
	{5, 2, []byte("ایە")},
	{5, 2, []byte("ەیە")},
	{4, 1, []byte("ە")},
	// absolute singular ezafe
	{4, 1, []byte("ی")},
}

// applyFirstStemRule applies the first rule in rules that matches term and
// returns the truncated term. When no rule matches, term is returned as is.
func applyFirstStemRule(term []byte, rules []stemRule) []byte {
	runeCount := utf8.RuneCount(term)
	for _, rule := range rules {
		if runeCount > rule.longerThan && bytes.HasSuffix(term, rule.suffix) {
			return truncateRunes(term, rule.strip)
		}
	}
	return term
}

// stem strips Sorani suffixes from input in three passes: a postposition, then
// a possessive pronoun, then one noun ending. Each pass removes at most one
// suffix, and only from terms longer than the rule's minimum rune count (the
// count is taken again after every removal). When nothing applies, input
// itself is returned.
func stem(input []byte) []byte {
	input = applyFirstStemRule(input, postpositionRules)
	input = applyFirstStemRule(input, possessiveRules)
	return applyFirstStemRule(input, nounEndingRules)
}

// truncateRunes returns a newly encoded copy of input without its last num
// runes. It panics when input holds fewer than num runes.
func truncateRunes(input []byte, num int) []byte {
	runes := bytes.Runes(input)
	return buildTermFromRunes(runes[:len(runes)-num])
}

// buildTermFromRunes returns the UTF-8 encoding of runes. A rune that cannot
// be encoded (a surrogate or out-of-range value) is written as U+FFFD, as
// utf8.EncodeRune does, instead of causing a panic.
func buildTermFromRunes(runes []rune) []byte {
	rv := make([]byte, 0, len(runes)*utf8.UTFMax)
	// one scratch buffer for all runes, rather than one allocation per rune
	var scratch [utf8.UTFMax]byte
	for _, r := range runes {
		n := utf8.EncodeRune(scratch[:], r)
		rv = append(rv, scratch[:n]...)
	}
	return rv
}
// StemmerFilterConstructor returns a new SoraniStemmerFilter. Neither config
// nor cache is used and the returned error is always nil.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniStemmerFilter(), nil
}
// init registers StemmerFilterConstructor with the registry under StemmerName.
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,294 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
)
// TestSoraniStemmerFilter runs single-token inputs through the Sorani
// normalizer followed by the Sorani stemmer and checks the resulting stem.
func TestSoraniStemmerFilter(t *testing.T) {
	// in order to match the lucene tests
	// we will test with an analyzer, not just the stemmer
	analyzer := analysis.Analyzer{
		Tokenizer: single_token.NewSingleTokenTokenizer(),
		TokenFilters: []analysis.TokenFilter{
			NewSoraniNormalizeFilter(),
			NewSoraniStemmerFilter(),
		},
	}

	tests := []struct {
		input []byte
		stem  []byte
	}{
		{ // -ek
			input: []byte("پیاوێک"),
			stem:  []byte("پیاو"),
		},
		{ // -yek
			input: []byte("دەرگایەک"),
			stem:  []byte("دەرگا"),
		},
		{ // -aka
			input: []byte("پیاوەكە"),
			stem:  []byte("پیاو"),
		},
		{ // -ka
			input: []byte("دەرگاكە"),
			stem:  []byte("دەرگا"),
		},
		{ // -a
			input: []byte("کتاویە"),
			stem:  []byte("کتاوی"),
		},
		{ // -ya
			input: []byte("دەرگایە"),
			stem:  []byte("دەرگا"),
		},
		{ // -An
			input: []byte("پیاوان"),
			stem:  []byte("پیاو"),
		},
		{ // -yAn
			input: []byte("دەرگایان"),
			stem:  []byte("دەرگا"),
		},
		{ // -akAn
			input: []byte("پیاوەکان"),
			stem:  []byte("پیاو"),
		},
		{ // -kAn
			input: []byte("دەرگاکان"),
			stem:  []byte("دەرگا"),
		},
		{ // -Ana
			input: []byte("پیاوانە"),
			stem:  []byte("پیاو"),
		},
		{ // -yAna
			input: []byte("دەرگایانە"),
			stem:  []byte("دەرگا"),
		},
		{ // Ezafe singular
			input: []byte("هۆتیلی"),
			stem:  []byte("هۆتیل"),
		},
		{ // Ezafe indefinite
			input: []byte("هۆتیلێکی"),
			stem:  []byte("هۆتیل"),
		},
		{ // Ezafe plural
			input: []byte("هۆتیلانی"),
			stem:  []byte("هۆتیل"),
		},
		{ // -awa
			input: []byte("دوورەوە"),
			stem:  []byte("دوور"),
		},
		{ // -dA
			input: []byte("نیوەشەودا"),
			stem:  []byte("نیوەشەو"),
		},
		{ // -A
			input: []byte("سۆرانا"),
			stem:  []byte("سۆران"),
		},
		{ // -mAn
			input: []byte("پارەمان"),
			stem:  []byte("پارە"),
		},
		{ // -tAn
			input: []byte("پارەتان"),
			stem:  []byte("پارە"),
		},
		{ // -yAn
			input: []byte("پارەیان"),
			stem:  []byte("پارە"),
		},
		{ // empty
			input: []byte(""),
			stem:  []byte(""),
		},
	}

	for _, test := range tests {
		// each input yields one token at position 1 spanning the whole input
		expected := analysis.TokenStream{
			&analysis.Token{
				Term:     test.stem,
				Position: 1,
				Start:    0,
				End:      len(test.input),
			},
		}
		actual := analyzer.Analyze(test.input)
		if reflect.DeepEqual(actual, expected) {
			continue
		}
		t.Errorf("for input %s(% x)", test.input, test.input)
		t.Errorf("\texpected:")
		for _, token := range expected {
			t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
		}
		t.Errorf("\tactual:")
		for _, token := range actual {
			t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
		}
	}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds a stop token filter from the token map
// registered in cache under StopName. If the token map cannot be obtained, its
// error is returned and no filter is built. The config argument is not used.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,160 @@
package ckb
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the name under which the Sorani stop word token map is registered.
const StopName = "stop_ckb"
// The stop word list below was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// Every backtick (`) was changed to an apostrophe (') so that the list can be
// embedded in a Go raw string literal.
var SoraniStopWords = []byte(`# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر
# demonstratives
# that
ئەو
# this
ئەم
# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان
# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
`)
// TokenMapConstructor creates a token map and loads SoraniStopWords into it.
// The map is returned together with whatever error LoadBytes reported, so it
// is returned even when loading failed. Neither config nor cache is used.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if err := stopWords.LoadBytes(SoraniStopWords); err != nil {
		return stopWords, err
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry under StopName.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cs
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds a stop token filter from the token map
// registered in cache under StopName. If the token map cannot be obtained, its
// error is returned and no filter is built. The config argument is not used.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,196 @@
package cs
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the name under which the Czech stop word token map is registered.
const StopName = "stop_cs"
// The stop word list below was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// Every backtick (`) was changed to an apostrophe (') so that the list can be
// embedded in a Go raw string literal.
var CzechStopWords = []byte(`a
s
k
o
i
u
v
z
dnes
cz
tímto
budeš
budem
byli
jseš
můj
svým
ta
tomto
tohle
tuto
tyto
jej
zda
proč
máte
tato
kam
tohoto
kdo
kteří
mi
nám
tom
tomuto
mít
nic
proto
kterou
byla
toho
protože
asi
ho
naši
napište
re
což
tím
takže
svých
její
svými
jste
aj
tu
tedy
teto
bylo
kde
ke
pravé
ji
nad
nejsou
či
pod
téma
mezi
přes
ty
pak
vám
ani
když
však
neg
jsem
tento
článku
články
aby
jsme
před
pta
jejich
byl
ještě
bez
také
pouze
první
vaše
která
nás
nový
tipy
pokud
může
strana
jeho
své
jiné
zprávy
nové
není
vás
jen
podle
zde
být
více
bude
již
než
který
by
které
co
nebo
ten
tak
při
od
po
jsou
jak
další
ale
si
se
ve
to
jako
za
zpět
ze
do
pro
je
na
atd
atp
jakmile
přičemž
on
ona
ono
oni
ony
my
vy
ji
mne
jemu
tomu
těm
těmu
němu
němuž
jehož
jíž
jelikož
jež
jakož
načež
`)
// TokenMapConstructor creates a fresh token map and fills it from
// CzechStopWords. The map is returned together with whatever error the load
// reported, so callers must check the error before using the map. Neither
// config nor cache is consulted.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	loadErr := stopWords.LoadBytes(CzechStopWords)
	return stopWords, loadErr
}
// init registers TokenMapConstructor in the bleve registry under StopName.
func init() {
	registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,54 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package da
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which the Danish analyzer is registered.
const AnalyzerName = "da"
// AnalyzerConstructor assembles the Danish analyzer from components resolved
// through cache: the ICU tokenizer followed by the lower-case, Danish stop
// word and Danish stemmer token filters, in that order. The first component
// that cannot be resolved aborts construction and its error is returned.
// The config argument is not used.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(icu.Name)
	if err != nil {
		return nil, err
	}

	// Filters run in the order their names are listed here.
	filterNames := []string{lower_case_filter.Name, StopName, StemmerName}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, name := range filterNames {
		filter, err := cache.TokenFilterNamed(name)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor in the bleve registry under
// AnalyzerName.
func init() {
	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,69 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package da
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestDanishAnalyzer runs the registered Danish analyzer over a few inputs
// and checks that terms are stemmed and that a stop word yields an empty
// token stream.
func TestDanishAnalyzer(t *testing.T) {
	// single wraps one expected token in a token stream.
	single := func(tok analysis.Token) analysis.TokenStream {
		return analysis.TokenStream{&tok}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input:  []byte("undersøg"),
			output: single(analysis.Token{Term: []byte("undersøg"), Position: 1, Start: 0, End: 9}),
		},
		{
			input:  []byte("undersøgelse"),
			output: single(analysis.Token{Term: []byte("undersøg"), Position: 1, Start: 0, End: 13}),
		},
		// stop word
		{
			input:  []byte("på"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for i := range cases {
		got := analyzer.Analyze(cases[i].input)
		if !reflect.DeepEqual(got, cases[i].output) {
			t.Errorf("expected %v, got %v", cases[i].output, got)
		}
	}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package da
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StemmerName is the name under which the Danish stemmer token filter is
// registered.
const StemmerName = "stemmer_da"

// StemmerFilterConstructor returns whatever stemmer_filter.NewStemmerFilter
// produces for the language code "da", including its error. Neither config
// nor cache is consulted.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return stemmer_filter.NewStemmerFilter("da")
}
// init registers StemmerFilterConstructor in the bleve registry under
// StemmerName.
func init() {
	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package da
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds the Danish stop token filter. It resolves
// the token map registered under StopName through cache and wraps it in a
// stop tokens filter; a failed lookup is returned to the caller unchanged.
// The config argument is not used.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor in the bleve registry under
// StopName.
func init() {
	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,134 @@
package da
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the registry name used for both the Danish stop word token map
// and the stop token filter built from it.
const StopName = "stop_da"

// DanishStopWords holds the Danish stop word list in snowball format: each
// stop word starts a line and a vertical bar introduces a comment. It is the
// data TokenMapConstructor loads into a token map.
//
// The entry for "på" had lost its word (only the comment part remained), so
// the word was never loaded even though the analyzer test expects it to be
// treated as a stop word; it is restored here.
//
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for a literal string.
var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
 | This file is distributed under the BSD License.
 | See http://snowball.tartarus.org/license.php
 | Also see http://www.opensource.org/licenses/bsd-license.html
 | - Encoding was converted to UTF-8.
 | - This notice was added.
 |
 | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
 | A Danish stop word list. Comments begin with vertical bar. Each stop
 | word is at the start of a line.
 | This is a ranked list (commonest to rarest) of stopwords derived from
 | a large text sample.
og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
på | on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/youself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
`)
// TokenMapConstructor creates a fresh token map and fills it from
// DanishStopWords. The map is returned together with whatever error the load
// reported, so callers must check the error before using the map. Neither
// config nor cache is consulted.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	loadErr := stopWords.LoadBytes(DanishStopWords)
	return stopWords, loadErr
}
// init registers TokenMapConstructor in the bleve registry under StopName.
func init() {
	registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,59 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package de
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which the German analyzer is registered.
const AnalyzerName = "de"
// AnalyzerConstructor assembles the German analyzer from components resolved
// through cache: the ICU tokenizer followed by the lower-case, German stop
// word, German normalization and German stemmer token filters, in that order.
// The first component that cannot be resolved aborts construction and its
// error is returned. The config argument is not used.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
	if err != nil {
		return nil, err
	}
	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
	if err != nil {
		return nil, err
	}
	// The stop filter must be looked up under StopName. It was previously
	// fetched under NormalizeName, which ran the normalizer twice and never
	// removed any stop words.
	stopDeFilter, err := cache.TokenFilterNamed(StopName)
	if err != nil {
		return nil, err
	}
	normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
	if err != nil {
		return nil, err
	}
	stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
	if err != nil {
		return nil, err
	}
	rv := analysis.Analyzer{
		Tokenizer: icuTokenizer,
		TokenFilters: []analysis.TokenFilter{
			toLowerFilter,
			stopDeFilter,
			normalizeDeFilter,
			stemmerDeFilter,
		},
	}
	return &rv, nil
}
// init registers AnalyzerConstructor in the bleve registry under
// AnalyzerName.
func init() {
	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,97 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package de
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestGermanAnalyzer runs the registered German analyzer over inflected forms
// of one noun and over umlaut / "ae" spellings of another, checking that each
// group collapses to the same lower-cased stem.
func TestGermanAnalyzer(t *testing.T) {
	// single wraps one expected token in a token stream.
	single := func(tok analysis.Token) analysis.TokenStream {
		return analysis.TokenStream{&tok}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input:  []byte("Tisch"),
			output: single(analysis.Token{Term: []byte("tisch"), Position: 1, Start: 0, End: 5}),
		},
		{
			input:  []byte("Tische"),
			output: single(analysis.Token{Term: []byte("tisch"), Position: 1, Start: 0, End: 6}),
		},
		{
			input:  []byte("Tischen"),
			output: single(analysis.Token{Term: []byte("tisch"), Position: 1, Start: 0, End: 7}),
		},
		// german specials
		{
			input:  []byte("Schaltflächen"),
			output: single(analysis.Token{Term: []byte("schaltflach"), Position: 1, Start: 0, End: 14}),
		},
		{
			input:  []byte("Schaltflaechen"),
			output: single(analysis.Token{Term: []byte("schaltflach"), Position: 1, Start: 0, End: 14}),
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for i := range cases {
		got := analyzer.Analyze(cases[i].input)
		if !reflect.DeepEqual(got, cases[i].output) {
			t.Errorf("expected %v, got %v", cases[i].output, got)
		}
	}
}

View File

@ -0,0 +1,94 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"bytes"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// NormalizeName is the name under which the German normalization token
// filter is registered.
const NormalizeName = "normalize_de"

// States of the scan performed by normalize.
const (
	N = 0 // ordinary state
	V = 1 // stops 'u' from entering umlaut state
	U = 2 // umlaut state, allows e-deletion
)
// GermanNormalizeFilter is a token filter that rewrites each token's term
// with normalize. It carries no state.
type GermanNormalizeFilter struct {
}

// NewGermanNormalizeFilter returns a new GermanNormalizeFilter.
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
	return &GermanNormalizeFilter{}
}
// Filter replaces the term of every token in input with its normalized form.
// Tokens are modified in place and the same stream is handed back.
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		input[i].Term = normalize(input[i].Term)
	}
	return input
}
// normalize folds German spelling variants within a single term and returns
// the rebuilt term: ä, ö and ü become a, o and u; an 'e' seen while in the
// umlaut state is removed (so "ae" ends up the same as "ä"); and 'ß' is
// expanded to "ss".
//
// The scan keeps one of three states: N (ordinary), U (umlaut state, a
// following 'e' is deleted) and V (a following 'u' may not enter the umlaut
// state). Because of V, the "ue" in a word such as "dauer" is left alone.
func normalize(input []byte) []byte {
	state := N
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); i++ {
		switch runes[i] {
		case 'a', 'o':
			state = U
		case 'u':
			// Only a 'u' reached from the ordinary state may open an umlaut pair.
			if state == N {
				state = U
			} else {
				state = V
			}
		case 'e':
			if state == U {
				// Remove the 'e' and step back so the loop's i++ revisits this
				// index, which now holds the rune that followed the 'e'.
				runes = analysis.DeleteRune(runes, i)
				i--
			}
			state = V
		case 'i', 'q', 'y':
			state = V
		case 'ä':
			runes[i] = 'a'
			state = V
		case 'ö':
			runes[i] = 'o'
			state = V
		case 'ü':
			runes[i] = 'u'
			state = V
		case 'ß':
			// Expand to "ss": overwrite the ß with the first 's', then insert
			// the second 's' directly after it. Advancing i first also makes
			// the loop continue after the inserted rune.
			runes[i] = 's'
			i++
			runes = analysis.InsertRune(runes, i, 's')
			state = N
		default:
			state = N
		}
	}
	return analysis.BuildTermFromRunes(runes)
}
// NormalizerFilterConstructor returns a new GermanNormalizeFilter and a nil
// error. Neither config nor cache is consulted.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewGermanNormalizeFilter(), nil
}
// init registers NormalizerFilterConstructor in the bleve registry under
// NormalizeName.
func init() {
	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -0,0 +1,98 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)
// TestGermanNormalizeFilter feeds single-token streams through the German
// normalization filter and compares the resulting terms with the expected
// folded spellings.
func TestGermanNormalizeFilter(t *testing.T) {
	// stream builds a one-token stream whose token carries only the term.
	stream := func(term string) analysis.TokenStream {
		return analysis.TokenStream{
			&analysis.Token{
				Term: []byte(term),
			},
		}
	}

	cases := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// Tests that a/o/u + e is equivalent to the umlaut form
		{input: stream("Schaltflächen"), output: stream("Schaltflachen")},
		{input: stream("Schaltflaechen"), output: stream("Schaltflachen")},
		// Tests the specific heuristic that ue is not folded after a vowel or q.
		{input: stream("dauer"), output: stream("dauer")},
		// Tests german specific folding of sharp-s
		{input: stream("weißbier"), output: stream("weissbier")},
		// empty
		{input: stream(""), output: stream("")},
	}

	filter := NewGermanNormalizeFilter()
	for i := range cases {
		want := cases[i].output
		got := filter.Filter(cases[i].input)
		if reflect.DeepEqual(got, want) {
			continue
		}
		t.Errorf("expected %#v, got %#v", want, got)
		t.Errorf("expected %s(% x), got %s(% x)", want[0].Term, want[0].Term, got[0].Term, got[0].Term)
	}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package de
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StemmerName is the name under which the German stemmer token filter is
// registered.
const StemmerName = "stemmer_de"

// StemmerFilterConstructor returns whatever stemmer_filter.NewStemmerFilter
// produces for the language code "de", including its error. Neither config
// nor cache is consulted.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return stemmer_filter.NewStemmerFilter("de")
}
// init registers StemmerFilterConstructor in the bleve registry under
// StemmerName.
func init() {
	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds the German stop token filter. It resolves
// the token map registered under StopName through cache and wraps it in a
// stop tokens filter; a failed lookup is returned to the caller unchanged.
// The config argument is not used.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor in the bleve registry under
// StopName.
func init() {
	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,318 @@
package de
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the registry name used for both the German stop word token map
// and the stop token filter built from it.
const StopName = "stop_de"

// GermanStopWords holds the German stop word list in snowball format: each
// stop word starts a line and a vertical bar introduces a comment. It is the
// data TokenMapConstructor loads into a token map.
//
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for a literal string.
var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
 | This file is distributed under the BSD License.
 | See http://snowball.tartarus.org/license.php
 | Also see http://www.opensource.org/licenses/bsd-license.html
 | - Encoding was converted to UTF-8.
 | - This notice was added.
 |
 | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
 | A German stop word list. Comments begin with vertical bar. Each stop
 | word is at the start of a line.
 | The number of forms in this list is reduced significantly by passing it
 | through the German stemmer.
aber | but
alle | all
allem
allen
aller
alles
als | than, as
also | so
am | an + dem
an | at
ander | other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch | also
auf | on
aus | out of
bei | by
bin | am
bis | until
bist | art
da | there
damit | with it
dann | then
der | the
den
des
dem
die
das
daß | that
derselbe | the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu | to that
dein | thy
deine
deinem
deinen
deiner
deines
denn | because
derer | of those
dessen | of him
dich | thee
dir | to thee
du | thou
dies | this
diese
diesem
diesen
dieser
dieses
doch | (several meanings)
dort | (over) there
durch | through
ein | a
eine
einem
einen
einer
eines
einig | some
einige
einigem
einigen
einiger
einiges
einmal | once
er | he
ihn | him
ihm | to him
es | it
etwas | something
euer | your
eure
eurem
euren
eurer
eures
für | for
gegen | towards
gewesen | p.p. of sein
hab | have
habe | have
haben | have
hat | has
hatte | had
hatten | had
hier | here
hin | there
hinter | behind
ich | I
mich | me
mir | to me
ihr | you, to her
ihre
ihrem
ihren
ihrer
ihres
euch | to you
im | in + dem
in | in
indem | while
ins | in + das
ist | is
jede | each, every
jedem
jeden
jeder
jedes
jene | that
jenem
jenen
jener
jenes
jetzt | now
kann | can
kein | no
keine
keinem
keinen
keiner
keines
können | can
könnte | could
machen | do
man | one
manche | some, many a
manchem
manchen
mancher
manches
mein | my
meine
meinem
meinen
meiner
meines
mit | with
muss | must
musste | had to
nach | to(wards)
nicht | not
nichts | nothing
noch | still, yet
nun | now
nur | only
ob | whether
oder | or
ohne | without
sehr | very
sein | his
seine
seinem
seinen
seiner
seines
selbst | self
sich | herself
sie | they, she
ihnen | to them
sind | are
so | so
solche | such
solchem
solchen
solcher
solches
soll | shall
sollte | should
sondern | but
sonst | else
über | over
um | about, around
und | and
uns | us
unse
unsem
unsen
unser
unses
unter | under
viel | much
vom | von + dem
von | from
vor | before
während | while
war | was
waren | were
warst | wast
was | what
weg | away, off
weil | because
weiter | further
welche | which
welchem
welchen
welcher
welches
wenn | when
werde | will
werden | will
wie | how
wieder | again
will | want
wir | we
wird | will
wirst | willst
wo | where
wollen | want
wollte | wanted
würde | would
würden | would
zu | to
zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between
`)
// TokenMapConstructor creates a fresh token map and fills it from
// GermanStopWords. The map is returned together with whatever error the load
// reported, so callers must check the error before using the map. Neither
// config nor cache is consulted.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	loadErr := stopWords.LoadBytes(GermanStopWords)
	return stopWords, loadErr
}
// init registers TokenMapConstructor in the bleve registry under StopName.
func init() {
	registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package el
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds the Greek stop token filter. It resolves
// the token map registered under StopName through cache and wraps it in a
// stop tokens filter; a failed lookup is returned to the caller unchanged.
// The config argument is not used.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor in the bleve registry under
// StopName.
func init() {
	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,102 @@
package el
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the registry name used for both the Greek stop word token map
// and the stop token filter built from it.
const StopName = "stop_el"

// GreekStopWords holds the Greek stop words, one word per line, preceded by
// '#' comment lines. It is the data TokenMapConstructor loads into a token
// map. As the embedded note says, the words are written with 'σ' in place of
// a final 'ς'.
//
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for a literal string.
var GreekStopWords = []byte(`# Lucene Greek Stopwords list
# Note: by default this file is used after GreekLowerCaseFilter,
# so when modifying this file use 'σ' instead of 'ς'
ο
η
το
οι
τα
του
τησ
των
τον
την
και
κι
κ
ειμαι
εισαι
ειναι
ειμαστε
ειστε
στο
στον
στη
στην
μα
αλλα
απο
για
προσ
με
σε
ωσ
παρα
αντι
κατα
μετα
θα
να
δε
δεν
μη
μην
επι
ενω
εαν
αν
τοτε
που
πωσ
ποιοσ
ποια
ποιο
ποιοι
ποιεσ
ποιων
ποιουσ
αυτοσ
αυτη
αυτο
αυτοι
αυτων
αυτουσ
αυτεσ
αυτα
εκεινοσ
εκεινη
εκεινο
εκεινοι
εκεινεσ
εκεινα
εκεινων
εκεινουσ
οπωσ
ομωσ
ισωσ
οσο
οτι
`)
// TokenMapConstructor creates a fresh token map and fills it from
// GreekStopWords. The map is returned together with whatever error the load
// reported, so callers must check the error before using the map. Neither
// config nor cache is consulted.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	loadErr := stopWords.LoadBytes(GreekStopWords)
	return stopWords, loadErr
}
// init registers TokenMapConstructor in the bleve registry under StopName.
func init() {
	registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,57 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/porter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which the English analyzer is registered.
const AnalyzerName = "en"
// AnalyzerConstructor assembles the English analyzer from components resolved
// through cache: the unicode tokenizer followed by the possessive, lower-case,
// English stop word and porter stemmer token filters, in that order. The
// first component that cannot be resolved aborts construction and its error
// is returned. The config argument is not used.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}

	// Filters run in the order their names are listed here.
	filterNames := []string{PossessiveName, lower_case_filter.Name, StopName, porter.Name}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, name := range filterNames {
		filter, err := cache.TokenFilterNamed(name)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor in the bleve registry under
// AnalyzerName.
func init() {
	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,100 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestEnglishAnalyzer runs the registered English analyzer over a few inputs
// and checks stemming, stop word removal, and removal of a possessive "s"
// written with each of the three supported apostrophe characters.
func TestEnglishAnalyzer(t *testing.T) {
	// single wraps one expected token in a token stream.
	single := func(tok analysis.Token) analysis.TokenStream {
		return analysis.TokenStream{&tok}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input:  []byte("books"),
			output: single(analysis.Token{Term: []byte("book"), Position: 1, Start: 0, End: 5}),
		},
		{
			input:  []byte("book"),
			output: single(analysis.Token{Term: []byte("book"), Position: 1, Start: 0, End: 4}),
		},
		// stop word removal
		{
			input:  []byte("the"),
			output: analysis.TokenStream{},
		},
		// possessive removal
		{
			input:  []byte("steven's"),
			output: single(analysis.Token{Term: []byte("steven"), Position: 1, Start: 0, End: 8}),
		},
		{
			input:  []byte("steven\u2019s"),
			output: single(analysis.Token{Term: []byte("steven"), Position: 1, Start: 0, End: 10}),
		},
		{
			input:  []byte("steven\uFF07s"),
			output: single(analysis.Token{Term: []byte("steven"), Position: 1, Start: 0, End: 10}),
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for i := range cases {
		got := analyzer.Analyze(cases[i].input)
		if !reflect.DeepEqual(got, cases[i].output) {
			t.Errorf("expected %v, got %v", cases[i].output, got)
		}
	}
}

View File

@ -0,0 +1,57 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"bytes"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// PossessiveName is the name under which the English possessive token filter
// is registered.
const PossessiveName = "possessive_en"

// Apostrophe-like runes that may precede a possessive "s". The two non-ASCII
// runes are spelled as escapes: their literal characters had been lost,
// leaving empty rune literals that do not compile.
const rightSingleQuotationMark = '\u2019' // RIGHT SINGLE QUOTATION MARK
const apostrophe = '\''
const fullWidthApostrophe = '\uFF07' // FULLWIDTH APOSTROPHE

// NOTE(review): this adds the three code points numerically rather than
// collecting the characters, and nothing in the visible code reads it. It is
// kept only so any other use inside the package keeps compiling — confirm and
// remove if unused.
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
// PossessiveFilter is a token filter that strips English possessive suffixes.
// It has no fields and therefore carries no state.
type PossessiveFilter struct{}

// NewPossessiveFilter returns a pointer to a fresh PossessiveFilter.
func NewPossessiveFilter() *PossessiveFilter {
	return new(PossessiveFilter)
}
// Filter removes a trailing English possessive from the tokens in input.
//
// A term is rewritten when it has at least two runes, its final rune is 's'
// or 'S', and the rune before that is one of the recognised apostrophe
// characters. The term is then replaced by the result of
// analysis.TruncateRunes(term, 2). Matching tokens have their Term reassigned
// and the input stream itself is returned.
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, tok := range input {
		termRunes := bytes.Runes(tok.Term)
		n := len(termRunes)
		if n < 2 {
			// Too short to hold an apostrophe followed by 's'.
			continue
		}
		marker, suffix := termRunes[n-2], termRunes[n-1]
		hasApostrophe := marker == rightSingleQuotationMark ||
			marker == apostrophe ||
			marker == fullWidthApostrophe
		if !hasApostrophe {
			continue
		}
		if suffix == 's' || suffix == 'S' {
			tok.Term = analysis.TruncateRunes(tok.Term, 2)
		}
	}
	return input
}
// PossessiveFilterConstructor builds the English possessive filter for the
// registry. Neither config nor cache is consulted, and the returned error is
// always nil.
func PossessiveFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	filter := NewPossessiveFilter()
	return filter, nil
}
// init registers PossessiveFilterConstructor with the registry under
// PossessiveName when the package is initialized.
func init() {
registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor)
}

View File

@ -0,0 +1,86 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestEnglishPossessiveFilter checks that the filter registered as
// PossessiveName strips a trailing possessive suffix written with each of the
// three apostrophe variants (U+0027, U+2019, U+FF07), in lower and upper case,
// and leaves a single-rune term untouched.
func TestEnglishPossessiveFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				// ASCII apostrophe.
				&analysis.Token{
					Term: []byte("marty's"),
				},
				&analysis.Token{
					Term: []byte("MARTY'S"),
				},
				// Right single quotation mark, written as an escape so the
				// character cannot be lost from the source text.
				&analysis.Token{
					Term: []byte("marty\u2019s"),
				},
				&analysis.Token{
					Term: []byte("MARTY\u2019S"),
				},
				// Fullwidth apostrophe, likewise escaped.
				&analysis.Token{
					Term: []byte("marty\uFF07s"),
				},
				&analysis.Token{
					Term: []byte("MARTY\uFF07S"),
				},
				// Shorter than two runes: must pass through unchanged.
				&analysis.Token{
					Term: []byte("m"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("marty"),
				},
				&analysis.Token{
					Term: []byte("MARTY"),
				},
				&analysis.Token{
					Term: []byte("marty"),
				},
				&analysis.Token{
					Term: []byte("MARTY"),
				},
				&analysis.Token{
					Term: []byte("marty"),
				},
				&analysis.Token{
					Term: []byte("MARTY"),
				},
				&analysis.Token{
					Term: []byte("m"),
				},
			},
		},
	}

	cache := registry.NewCache()
	possessiveFilter, err := cache.TokenFilterNamed(PossessiveName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := possessiveFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output, actual)
		}
	}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package en
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StemmerName is the name under which the English stemmer token filter is
// registered (see init in this file).
const StemmerName = "stemmer_en"
// StemmerFilterConstructor builds the English stemmer filter by calling
// stemmer_filter.NewStemmerFilter("en") and passes its results straight
// through. Neither config nor cache is consulted.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stemmer, err := stemmer_filter.NewStemmerFilter("en")
	return stemmer, err
}
// init registers StemmerFilterConstructor with the registry under StemmerName
// when the package is initialized.
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,72 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package en
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
func TestEnglishStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
},
},
}
cache := registry.NewCache()
stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := stemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor looks up the token map named StopName in cache
// and wraps it in a stop tokens filter. A lookup failure is returned as-is
// with a nil filter. config is not consulted.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName
// when the package is initialized.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,343 @@
package en
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the registry name used for the English stop word token map
// (see init in this file).
const StopName = "stop_en"
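// EnglishStopWords holds the English stop word list in Snowball format.
// The content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// Every backtick (`) in the original was changed to an apostrophe (') so the
// text can be embedded in a Go raw string literal.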
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| An English stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| Many of the forms below are quite rare (e.g. "yourselves") but included for
| completeness.
| PRONOUNS FORMS
| 1st person sing
i | subject, always in upper case of course
me | object
my | possessive adjective
| the possessive pronoun 'mine' is best suppressed, because of the
| sense of coal-mine etc.
myself | reflexive
| 1st person plural
we | subject
| us | object
| care is required here because US = United States. It is usually
| safe to remove it if it is in lower case.
our | possessive adjective
ours | possessive pronoun
ourselves | reflexive
| second person (archaic 'thou' forms not included)
you | subject and object
your | possessive adjective
yours | possessive pronoun
yourself | reflexive (singular)
yourselves | reflexive (plural)
| third person singular
he | subject
him | object
his | possessive adjective and pronoun
himself | reflexive
she | subject
her | object and possessive adjective
hers | possessive pronoun
herself | reflexive
it | subject and object
its | possessive adjective
itself | reflexive
| third person plural
they | subject
them | object
their | possessive adjective
theirs | possessive pronoun
themselves | reflexive
| other forms (demonstratives, interrogatives)
what
which
who
whom
this
that
these
those
| VERB FORMS (using F.R. Palmer's nomenclature)
| BE
am | 1st person, present
is | -s form (3rd person, present)
are | present
was | 1st person, past
were | past
be | infinitive
been | past participle
being | -ing form
| HAVE
have | simple
has | -s form
had | past
having | -ing form
| DO
do | simple
does | -s form
did | past
doing | -ing form
| The forms below are, I believe, best omitted, because of the significant
| homonym forms:
| He made a WILL
| old tin CAN
| merry month of MAY
| a smell of MUST
| fight the good fight with all thy MIGHT
| would, could, should, ought might however be included
| | AUXILIARIES
| | WILL
|will
would
| | SHALL
|shall
should
| | CAN
|can
could
| | MAY
|may
|might
| | MUST
|must
| | OUGHT
ought
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
| pronoun + verb
i'm
you're
he's
she's
it's
we're
they're
i've
you've
we've
they've
i'd
you'd
he'd
she'd
we'd
they'd
i'll
you'll
he'll
she'll
we'll
they'll
| verb + negation
isn't
aren't
wasn't
weren't
hasn't
haven't
hadn't
doesn't
don't
didn't
| auxiliary + negation
won't
wouldn't
shan't
shouldn't
can't
cannot
couldn't
mustn't
| miscellaneous forms
let's
that's
who's
what's
here's
there's
when's
where's
why's
how's
| rarer forms
| daren't needn't
| doubtful forms
| oughtn't mightn't
| ARTICLES
a
an
the
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
| high, that classification is pointless.)
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
| Just for the record, the following words are among the commonest in English
| one
| every
| least
| less
| many
| now
| ever
| never
| say
| says
| said
| also
| get
| go
| goes
| just
| made
| make
| put
| see
| seen
| whether
| like
| well
| back
| even
| still
| way
| take
| since
| another
| however
| two
| three
| four
| five
| first
| second
| new
| old
| high
| long
`)
// TokenMapConstructor creates a new token map and loads EnglishStopWords into
// it. The map is returned together with whatever error LoadBytes reports, so
// callers receive the map even when loading failed. config and cache are not
// consulted.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if loadErr := stopWords.LoadBytes(EnglishStopWords); loadErr != nil {
		return stopWords, loadErr
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry under StopName when
// the package is initialized.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,54 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package es
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which the Spanish analyzer is registered
// (see init in this file).
const AnalyzerName = "es"
// AnalyzerConstructor assembles the Spanish analyzer from components held in
// cache: the ICU tokenizer followed by the lower-case, Spanish stop word and
// Spanish stemmer token filters, in that order. The first failed lookup is
// returned as the error with a nil analyzer. config is not consulted.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(icu.Name)
	if err != nil {
		return nil, err
	}

	// Filters are resolved in the same order in which they are applied.
	filterNames := []string{lower_case_filter.Name, StopName, StemmerName}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, name := range filterNames {
		filter, err := cache.TokenFilterNamed(name)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor with the registry under AnalyzerName
// when the package is initialized.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,64 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package es
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
func TestSpanishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("chicana"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chican"),
Position: 1,
Start: 0,
End: 7,
},
},
},
{
input: []byte("chicano"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chican"),
Position: 1,
Start: 0,
End: 7,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package es
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StemmerName is the name under which the Spanish stemmer token filter is
// registered (see init in this file).
const StemmerName = "stemmer_es"
// StemmerFilterConstructor builds the Spanish stemmer filter by calling
// stemmer_filter.NewStemmerFilter("es") and passes its results straight
// through. Neither config nor cache is consulted.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stemmer, err := stemmer_filter.NewStemmerFilter("es")
	return stemmer, err
}
// init registers StemmerFilterConstructor with the registry under StemmerName
// when the package is initialized.
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package es
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor looks up the token map named StopName in cache
// and wraps it in a stop tokens filter. A lookup failure is returned as-is
// with a nil filter. config is not consulted.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName
// when the package is initialized.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,380 @@
package es
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the registry name used for the Spanish stop word token map
// (see init in this file).
const StopName = "stop_es"
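// SpanishStopWords holds the Spanish stop word list in Snowball format.
// The content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// Every backtick (`) in the original was changed to an apostrophe (') so the
// text can be embedded in a Go raw string literal.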
var SpanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Spanish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The following is a ranked list (commonest to rarest) of stopwords
| deriving from a large sample of text.
| Extra words have been added at the end.
de | from, of
la | the, her
que | who, that
el | the
en | in
y | and
a | to
los | the, them
del | de + el
se | himself, from him etc
las | the, them
por | for, by, etc
un | a
para | for
con | with
no | no
una | a
su | his, her
al | a + el
| es from SER
lo | him
como | how
más | more
pero | pero
sus | su plural
le | to him, her
ya | already
o | or
| fue from SER
este | this
| ha from HABER
| himself etc
porque | because
esta | this
| son from SER
entre | between
| está from ESTAR
cuando | when
muy | very
sin | without
sobre | on
| ser from SER
| tiene from TENER
también | also
me | me
hasta | until
hay | there is/are
donde | where
| han from HABER
quien | whom, that
| están from ESTAR
| estado from ESTAR
desde | from
todo | all
nos | us
durante | during
| estados from ESTAR
todos | all
uno | a
les | to them
ni | nor
contra | against
otros | other
| fueron from SER
ese | that
eso | that
| había from HABER
ante | before
ellos | they
e | and (variant of y)
esto | this
| me
antes | before
algunos | some
qué | what?
unos | a
yo | I
otro | other
otras | other
otra | other
él | he
tanto | so much, many
esa | that
estos | these
mucho | much, many
quienes | who
nada | nothing
muchos | many
cual | who
| sea from SER
poco | few
ella | she
estar | to be
| haber from HABER
estas | these
| estaba from ESTAR
| estamos from ESTAR
algunas | some
algo | something
nosotros | we
| other forms
mi | me
mis | mi plural
| thou
te | thee
ti | thee
tu | thy
tus | tu plural
ellas | they
nosotras | we
vosotros | you
vosotras | you
os | you
mío | mine
mía |
míos |
mías |
tuyo | thine
tuya |
tuyos |
tuyas |
suyo | his, hers, theirs
suya |
suyos |
suyas |
nuestro | ours
nuestra |
nuestros |
nuestras |
vuestro | yours
vuestra |
vuestros |
vuestras |
esos | those
esas | those
| forms of estar, to be (not including the infinitive):
estoy
estás
está
estamos
estáis
están
esté
estés
estemos
estéis
estén
estaré
estarás
estará
estaremos
estaréis
estarán
estaría
estarías
estaríamos
estaríais
estarían
estaba
estabas
estábamos
estabais
estaban
estuve
estuviste
estuvo
estuvimos
estuvisteis
estuvieron
estuviera
estuvieras
estuviéramos
estuvierais
estuvieran
estuviese
estuvieses
estuviésemos
estuvieseis
estuviesen
estando
estado
estada
estados
estadas
estad
| forms of haber, to have (not including the infinitive):
he
has
ha
hemos
habéis
han
haya
hayas
hayamos
hayáis
hayan
habré
habrás
habrá
habremos
habréis
habrán
habría
habrías
habríamos
habríais
habrían
había
habías
habíamos
habíais
habían
hube
hubiste
hubo
hubimos
hubisteis
hubieron
hubiera
hubieras
hubiéramos
hubierais
hubieran
hubiese
hubieses
hubiésemos
hubieseis
hubiesen
habiendo
habido
habida
habidos
habidas
| forms of ser, to be (not including the infinitive):
soy
eres
es
somos
sois
son
sea
seas
seamos
seáis
sean
seré
serás
será
seremos
seréis
serán
sería
serías
seríamos
seríais
serían
era
eras
éramos
erais
eran
fui
fuiste
fue
fuimos
fuisteis
fueron
fuera
fueras
fuéramos
fuerais
fueran
fuese
fueses
fuésemos
fueseis
fuesen
siendo
sido
| sed also means 'thirst'
| forms of tener, to have (not including the infinitive):
tengo
tienes
tiene
tenemos
tenéis
tienen
tenga
tengas
tengamos
tengáis
tengan
tendré
tendrás
tendrá
tendremos
tendréis
tendrán
tendría
tendrías
tendríamos
tendríais
tendrían
tenía
tenías
teníamos
teníais
tenían
tuve
tuviste
tuvo
tuvimos
tuvisteis
tuvieron
tuviera
tuvieras
tuviéramos
tuvierais
tuvieran
tuviese
tuvieses
tuviésemos
tuvieseis
tuviesen
teniendo
tenido
tenida
tenidos
tenidas
tened
`)
// TokenMapConstructor creates a new token map and loads SpanishStopWords into
// it. The map is returned together with whatever error LoadBytes reports, so
// callers receive the map even when loading failed. config and cache are not
// consulted.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if loadErr := stopWords.LoadBytes(SpanishStopWords); loadErr != nil {
		return stopWords, loadErr
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry under StopName when
// the package is initialized.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package eu
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor looks up the token map named StopName in cache
// and wraps it in a stop tokens filter. A lookup failure is returned as-is
// with a nil filter. config is not consulted.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName
// when the package is initialized.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,123 @@
package eu
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the registry name used for the Basque stop word token map
// (see init in this file).
const StopName = "stop_eu"
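// BasqueStopWords holds the Basque stop word list, one word per line.
// The content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// Every backtick (`) in the original was changed to an apostrophe (') so the
// text can be embedded in a Go raw string literal.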
var BasqueStopWords = []byte(`# example set of basque stopwords
al
anitz
arabera
asko
baina
bat
batean
batek
bati
batzuei
batzuek
batzuetan
batzuk
bera
beraiek
berau
berauek
bere
berori
beroriek
beste
bezala
da
dago
dira
ditu
du
dute
edo
egin
ere
eta
eurak
ez
gainera
gu
gutxi
guzti
haiei
haiek
haietan
hainbeste
hala
han
handik
hango
hara
hari
hark
hartan
hau
hauei
hauek
hauetan
hemen
hemendik
hemengo
hi
hona
honek
honela
honetan
honi
hor
hori
horiei
horiek
horietan
horko
horra
horrek
horrela
horretan
horri
hortik
hura
izan
ni
noiz
nola
non
nondik
nongo
nor
nora
ze
zein
zen
zenbait
zenbat
zer
zergatik
ziren
zituen
zu
zuek
zuen
zuten
`)
// TokenMapConstructor creates a new token map and loads BasqueStopWords into
// it. The map is returned together with whatever error LoadBytes reports, so
// callers receive the map even when loading failed. config and cache are not
// consulted.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if loadErr := stopWords.LoadBytes(BasqueStopWords); loadErr != nil {
		return stopWords, loadErr
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry under StopName when
// the package is initialized.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,67 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package fa
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which the Persian analyzer is registered
// (see init in this file).
const AnalyzerName = "fa"
// AnalyzerConstructor assembles the Persian analyzer from components held in
// cache: the zero-width-non-joiner character filter, the ICU tokenizer, and
// four token filters applied in the order lower-case, Arabic normalization,
// Persian normalization, Persian stop words. The first failed lookup is
// returned as the error with a nil analyzer. config is not consulted.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	zwnjCharFilter, err := cache.CharFilterNamed(zero_width_non_joiner.Name)
	if err != nil {
		return nil, err
	}
	tokenizer, err := cache.TokenizerNamed(icu.Name)
	if err != nil {
		return nil, err
	}

	// The lookups below run in a different order from the one in which the
	// filters are applied; both orders are kept exactly as they were.
	arabicNormalizer, err := cache.TokenFilterNamed(ar.NormalizeName)
	if err != nil {
		return nil, err
	}
	persianNormalizer, err := cache.TokenFilterNamed(NormalizeName)
	if err != nil {
		return nil, err
	}
	lowerCaser, err := cache.TokenFilterNamed(lower_case_filter.Name)
	if err != nil {
		return nil, err
	}
	stopWords, err := cache.TokenFilterNamed(StopName)
	if err != nil {
		return nil, err
	}

	return &analysis.Analyzer{
		CharFilters: []analysis.CharFilter{
			zwnjCharFilter,
		},
		Tokenizer: tokenizer,
		TokenFilters: []analysis.TokenFilter{
			lowerCaser,
			arabicNormalizer,
			persianNormalizer,
			stopWords,
		},
	}, nil
}
// init registers AnalyzerConstructor with the registry under AnalyzerName
// when the package is initialized.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,681 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package fa
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestPersianAnalyzerVerbs analyzes a table of Persian verb forms with the
// analyzer registered as AnalyzerName. For each case it checks the number of
// tokens produced and then compares only the Term of every token with the
// expected one; positions and offsets are not compared.
func TestPersianAnalyzerVerbs(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// active present indicative
{
input: []byte("می‌خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active preterite indicative
{
input: []byte("خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active imperfective preterite indicative
{
input: []byte("می‌خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active future indicative
{
input: []byte("خواهد خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active present progressive indicative
{
input: []byte("دارد می‌خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active preterite progressive indicative
{
input: []byte("داشت می‌خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active perfect indicative
{
input: []byte("خورده‌است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective perfect indicative
{
input: []byte("می‌خورده‌است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active pluperfect indicative
{
input: []byte("خورده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective pluperfect indicative
{
input: []byte("می‌خورده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active preterite subjunctive
{
input: []byte("خورده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective preterite subjunctive
{
input: []byte("می‌خورده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active pluperfect subjunctive
{
input: []byte("خورده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective pluperfect subjunctive
{
input: []byte("می‌خورده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present indicative
{
input: []byte("خورده می‌شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite indicative
{
input: []byte("خورده شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective preterite indicative
{
input: []byte("خورده می‌شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive perfect indicative
{
input: []byte("خورده شده‌است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective perfect indicative
{
input: []byte("خورده می‌شده‌است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive pluperfect indicative
{
input: []byte("خورده شده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective pluperfect indicative
{
input: []byte("خورده می‌شده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive future indicative
{
input: []byte("خورده خواهد شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present progressive indicative
{
input: []byte("دارد خورده می‌شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite progressive indicative
{
input: []byte("داشت خورده می‌شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present subjunctive
{
input: []byte("خورده شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite subjunctive
{
input: []byte("خورده شده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective preterite subjunctive
{
input: []byte("خورده می‌شده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive pluperfect subjunctive
{
input: []byte("خورده شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective pluperfect subjunctive
{
input: []byte("خورده می‌شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active present subjunctive
{
input: []byte("بخورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بخورد"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
// A length mismatch aborts the whole test via Fatalf, so the remaining
// cases are not run; it also guards the test.output[i] index below.
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
// Only the term bytes are compared; the message prints each term both
// as text and as hex bytes.
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}
// TestPersianAnalyzerVerbsDefective feeds Persian verb phrases to the analyzer
// registered under AnalyzerName and checks that each phrase is reduced to the
// single expected term.
// NOTE(review): the inputs here appear to be written with the Arabic yeh
// (U+064A) and ordinary spaces rather than Farsi yeh / zero-width non-joiner;
// keep the literals byte-for-byte when editing — confirm with a hex dump.
func TestPersianAnalyzerVerbsDefective(t *testing.T) {
// Each case pairs raw input text with the token terms expected from Analyze.
// Only Term is compared below; other token fields are not checked.
tests := []struct {
input []byte
output analysis.TokenStream
}{
// active present indicative
{
input: []byte("مي خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active preterite indicative
{
input: []byte("خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active imperfective preterite indicative
{
input: []byte("مي خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active future indicative
{
input: []byte("خواهد خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active present progressive indicative
{
input: []byte("دارد مي خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active preterite progressive indicative
{
input: []byte("داشت مي خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active perfect indicative
{
input: []byte("خورده است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective perfect indicative
{
input: []byte("مي خورده است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active pluperfect indicative
{
input: []byte("خورده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective pluperfect indicative
{
input: []byte("مي خورده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active preterite subjunctive
{
input: []byte("خورده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective preterite subjunctive
{
input: []byte("مي خورده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active pluperfect subjunctive
{
input: []byte("خورده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective pluperfect subjunctive
{
input: []byte("مي خورده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present indicative
{
input: []byte("خورده مي شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite indicative
{
input: []byte("خورده شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective preterite indicative
{
input: []byte("خورده مي شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive perfect indicative
{
input: []byte("خورده شده است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective perfect indicative
{
input: []byte("خورده مي شده است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive pluperfect indicative
{
input: []byte("خورده شده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective pluperfect indicative
{
input: []byte("خورده مي شده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive future indicative
{
input: []byte("خورده خواهد شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present progressive indicative
{
input: []byte("دارد خورده مي شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite progressive indicative
{
input: []byte("داشت خورده مي شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present subjunctive
{
input: []byte("خورده شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite subjunctive
{
input: []byte("خورده شده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective preterite subjunctive
{
input: []byte("خورده مي شده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive pluperfect subjunctive
{
input: []byte("خورده شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective pluperfect subjunctive
{
input: []byte("خورده مي شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active present subjunctive
{
input: []byte("بخورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بخورد"),
},
},
},
}
// Look the analyzer up by name through a fresh registry cache.
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
// Fatalf stops the whole test here, so cases after a length mismatch are
// not run; it also guarantees test.output[i] below stays in range.
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
// Terms are printed both as text and as hex because look-alike code
// points cannot be told apart in the text form.
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}
// TestPersianAnalyzerOthers runs the analyzer registered under AnalyzerName
// over non-verb inputs (plural nouns, English text, and a few extra phrases)
// and compares the Term of every produced token with the expected one.
func TestPersianAnalyzerOthers(t *testing.T) {
// Each case pairs raw input text with the token terms expected from Analyze.
tests := []struct {
input []byte
output analysis.TokenStream
}{
// nouns
{
input: []byte("برگ ها"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("برگ"),
},
},
},
// NOTE(review): this literal appears to join the two parts with an invisible
// zero-width non-joiner instead of a space — confirm before editing.
{
input: []byte("برگ‌ها"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("برگ"),
},
},
},
// non persian
{
input: []byte("English test."),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("english"),
},
&analysis.Token{
Term: []byte("test"),
},
},
},
// others
{
input: []byte("خورده مي شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
{
input: []byte("برگ‌ها"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("برگ"),
},
},
},
}
// Look the analyzer up by name through a fresh registry cache.
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
// Fatalf stops the whole test here, so cases after a length mismatch are
// not run; it also guarantees test.output[i] below stays in range.
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -0,0 +1,72 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"bytes"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// NormalizeName is the name under which the Persian normalization token
// filter is registered.
const NormalizeName = "normalize_fa"
// Code points that normalize reads or writes.
const (
Yeh = '\u064A' // ARABIC LETTER YEH; replaces FarsiYeh and YehBarree
FarsiYeh = '\u06CC' // ARABIC LETTER FARSI YEH
YehBarree = '\u06D2' // ARABIC LETTER YEH BARREE
Keheh = '\u06A9' // ARABIC LETTER KEHEH
Kaf = '\u0643' // ARABIC LETTER KAF; replaces Keheh
HamzaAbove = '\u0654' // ARABIC HAMZA ABOVE; removed from terms
HehYeh = '\u06C0' // ARABIC LETTER HEH WITH YEH ABOVE
HehGoal = '\u06C1' // ARABIC LETTER HEH GOAL
Heh = '\u0647' // ARABIC LETTER HEH; replaces HehYeh and HehGoal
)
// PersianNormalizeFilter is a token filter that rewrites the term of each
// token it is given; see normalize for the substitutions. It holds no state.
type PersianNormalizeFilter struct{}

// NewPersianNormalizeFilter returns a pointer to a new PersianNormalizeFilter.
func NewPersianNormalizeFilter() *PersianNormalizeFilter {
	return new(PersianNormalizeFilter)
}
// Filter replaces the Term of every token in input with its normalized form
// and returns input itself: tokens are updated in place and no new stream is
// built.
func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		input[i].Term = normalize(input[i].Term)
	}
	return input
}
func normalize(input []byte) []byte {
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case FarsiYeh, YehBarree:
runes[i] = Yeh
case Keheh:
runes[i] = Kaf
case HehYeh, HehGoal:
runes[i] = Heh
case HamzaAbove: // necessary for HEH + HAMZA
runes = analysis.DeleteRune(runes, i)
i--
}
}
return analysis.BuildTermFromRunes(runes)
}
// NormalizerFilterConstructor returns a new PersianNormalizeFilter and a nil
// error. Neither config nor cache is used.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewPersianNormalizeFilter(), nil
}
// init registers NormalizerFilterConstructor with the registry as the token
// filter named NormalizeName.
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -0,0 +1,125 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)
// TestPersianNormalizeFilter passes single-token streams through
// NewPersianNormalizeFilter().Filter, one case per substitution plus an empty
// term, and compares the whole resulting stream with the expected one.
// Input and expected terms differ only by look-alike code points, so keep the
// literals byte-for-byte when editing.
func TestPersianNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// FarsiYeh
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("های"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هاي"),
},
},
},
// YehBarree
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("هاے"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هاي"),
},
},
},
// Keheh
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("کشاندن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كشاندن"),
},
},
},
// HehYeh
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتابۀ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتابه"),
},
},
},
// HehHamzaAbove
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتابهٔ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتابه"),
},
},
},
// HehGoal
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("زادہ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("زاده"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
persianNormalizeFilter := NewPersianNormalizeFilter()
for _, test := range tests {
actual := persianNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
// %#v shows the token pointers of each stream rather than their contents,
// so the second message prints the first term of each side as hex bytes.
// Indexing [0] relies on every case above holding exactly one token.
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds a stop-token filter from the token map
// that the cache returns for StopName. If the cache lookup fails, its error is
// returned unchanged together with a nil filter. The config argument is not
// used.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	return stop_tokens_filter.NewStopTokensFilter(stopWords), nil
}
// init registers StopTokenFilterConstructor with the registry as the token
// filter named StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,337 @@
package fa
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the name under which the Persian stop-word token map is
// registered.
const StopName = "stop_fa"
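// PersianStopWords is the Persian stop-word list, one entry per line.
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// Backticks (`) in the original were changed to single quotes (') so that the
// content can be embedded in a Go raw string literal.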
var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Note: by default this file is used after normalization, so when adding entries
# to this file, use the arabic 'ي' instead of 'ی'
انان
نداشته
سراسر
خياه
ايشان
وي
تاكنون
بيشتري
دوم
پس
ناشي
وگو
يا
داشتند
سپس
هنگام
هرگز
پنج
نشان
امسال
ديگر
گروهي
شدند
چطور
ده
و
دو
نخستين
ولي
چرا
چه
وسط
ه
كدام
قابل
يك
رفت
هفت
همچنين
در
هزار
بله
بلي
شايد
اما
شناسي
گرفته
دهد
داشته
دانست
داشتن
خواهيم
ميليارد
وقتيكه
امد
خواهد
جز
اورده
شده
بلكه
خدمات
شدن
برخي
نبود
بسياري
جلوگيري
حق
كردند
نوعي
بعري
نكرده
نظير
نبايد
بوده
بودن
داد
اورد
هست
جايي
شود
دنبال
داده
بايد
سابق
هيچ
همان
انجا
كمتر
كجاست
گردد
كسي
تر
مردم
تان
دادن
بودند
سري
جدا
ندارند
مگر
يكديگر
دارد
دهند
بنابراين
هنگامي
سمت
جا
انچه
خود
دادند
زياد
دارند
اثر
بدون
بهترين
بيشتر
البته
به
براساس
بيرون
كرد
بعضي
گرفت
توي
اي
ميليون
او
جريان
تول
بر
مانند
برابر
باشيم
مدتي
گويند
اكنون
تا
تنها
جديد
چند
بي
نشده
كردن
كردم
گويد
كرده
كنيم
نمي
نزد
روي
قصد
فقط
بالاي
ديگران
اين
ديروز
توسط
سوم
ايم
دانند
سوي
استفاده
شما
كنار
داريم
ساخته
طور
امده
رفته
نخست
بيست
نزديك
طي
كنيد
از
انها
تمامي
داشت
يكي
طريق
اش
چيست
روب
نمايد
گفت
چندين
چيزي
تواند
ام
ايا
با
ان
ايد
ترين
اينكه
ديگري
راه
هايي
بروز
همچنان
پاعين
كس
حدود
مختلف
مقابل
چيز
گيرد
ندارد
ضد
همچون
سازي
شان
مورد
باره
مرسي
خويش
برخوردار
چون
خارج
شش
هنوز
تحت
ضمن
هستيم
گفته
فكر
بسيار
پيش
براي
روزهاي
انكه
نخواهد
بالا
كل
وقتي
كي
چنين
كه
گيري
نيست
است
كجا
كند
نيز
يابد
بندي
حتي
توانند
عقب
خواست
كنند
بين
تمام
همه
ما
باشند
مثل
شد
اري
باشد
اره
طبق
بعد
اگر
صورت
غير
جاي
بيش
ريزي
اند
زيرا
چگونه
بار
لطفا
مي
درباره
من
ديده
همين
گذاري
برداري
علت
گذاشته
هم
فوق
نه
ها
شوند
اباد
همواره
هر
اول
خواهند
چهار
نام
امروز
مان
هاي
قبل
كنم
سعي
تازه
را
هستند
زير
جلوي
عنوان
بود
`)
// TokenMapConstructor creates a new token map and loads PersianStopWords into
// it. The map is returned together with whatever error LoadBytes reported, so
// on failure the caller receives both the map and a non-nil error. Neither
// config nor cache is used.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if loadErr := stopWords.LoadBytes(PersianStopWords); loadErr != nil {
		return stopWords, loadErr
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry as the token map named
// StopName.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,54 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package fi
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which the Finnish analyzer is registered.
const AnalyzerName = "fi"
// AnalyzerConstructor assembles the Finnish analyzer from components obtained
// by name from cache: the ICU tokenizer followed by the lower-case, stop-word
// and stemmer token filters, in that order. The first failed lookup aborts
// construction and its error is returned unchanged. The config argument is not
// used.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(icu.Name)
	if err != nil {
		return nil, err
	}

	// Filters are resolved, and later applied, in exactly this order.
	filterNames := []string{lower_case_filter.Name, StopName, StemmerName}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, name := range filterNames {
		filter, err := cache.TokenFilterNamed(name)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor with the registry as the analyzer named
// AnalyzerName.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,68 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package fi
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestFinishAnalyzer runs the analyzer registered under AnalyzerName over
// Finnish stemming and stop-word inputs and compares the Term of every
// produced token with the expected one. ("Finish" in the name is a misspelling
// of "Finnish"; the name is kept so the test identifier stays stable.)
//
// Every failure message names the offending input, and a token-count mismatch
// is reported with Errorf so the remaining cases still run.
func TestFinishAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input: []byte("edeltäjiinsä"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("edeltäj"),
				},
			},
		},
		{
			input: []byte("edeltäjistään"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("edeltäj"),
				},
			},
		},
		// stop word
		{
			input:  []byte("olla"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			// Skip the per-token comparison, which assumes equal lengths, but
			// keep going so one bad case does not hide the others.
			t.Errorf("input %q: expected length: %d, got %d", test.input, len(test.output), len(actual))
			continue
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("input %q, token %d: expected term %s (% x) got %s (% x)", test.input, i, test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package fi
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StemmerName is the name under which the Finnish stemmer token filter is
// registered.
const StemmerName = "stemmer_fi"
// StemmerFilterConstructor returns whatever stemmer_filter.NewStemmerFilter
// produces for the language code "fi", including any error from that call.
// Neither config nor cache is used.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("fi")
}
// init registers StemmerFilterConstructor with the registry as the token
// filter named StemmerName.
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,28 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fi
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds a stop-token filter from the token map
// that the cache returns for StopName. If the cache lookup fails, its error is
// returned unchanged together with a nil filter. The config argument is not
// used.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	return stop_tokens_filter.NewStopTokensFilter(stopWords), nil
}
// init registers StopTokenFilterConstructor with the registry as the token
// filter named StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,121 @@
package fi
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// StopName is the name under which the Finnish stop-word token map is
// registered.
const StopName = "stop_fi"
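// FinnishStopWords is the Finnish stop-word list in the upstream Snowball
// layout reproduced below.
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// Backticks (`) in the original were changed to single quotes (') so that the
// content can be embedded in a Go raw string literal.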
var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| forms of BE
olla
olen
olet
on
olemme
olette
ovat
ole | negative form
oli
olisi
olisit
olisin
olisimme
olisitte
olisivat
olit
olin
olimme
olitte
olivat
ollut
olleet
en | negation
et
ei
emme
ette
eivät
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
mitkä | (pl)
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
| conjunctions
että | that
ja | and
jos | if
koska | because
kuin | than
mutta | but
niin | so
sekä | and
sillä | for
tai | or
vaan | but
vai | or
vaikka | although
| prepositions
kanssa | with
mukaan | according to
noin | about
poikki | across
yli | over, across
| other
kun | when
niin | so
nyt | now
itse | self
`)
// TokenMapConstructor creates a new token map and loads FinnishStopWords into
// it. The map is returned together with whatever error LoadBytes reported, so
// on failure the caller receives both the map and a non-nil error. Neither
// config nor cache is used.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if loadErr := stopWords.LoadBytes(FinnishStopWords); loadErr != nil {
		return stopWords, loadErr
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry as the token map named
// StopName.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name under which the French analyzer is registered.
const AnalyzerName = "fr"
// AnalyzerConstructor assembles the French analyzer from components obtained
// by name from cache: the unicode tokenizer followed by the elision,
// lower-case, stop-word and light-stemmer token filters, in that order. The
// first failed lookup aborts construction and its error is returned unchanged.
// The config argument is not used.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}

	// Filters are resolved, and later applied, in exactly this order.
	filterNames := []string{ElisionName, lower_case_filter.Name, StopName, LightStemmerName}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, name := range filterNames {
		filter, err := cache.TokenFilterNamed(name)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor with the registry as the analyzer named
// AnalyzerName.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,196 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestFrenchAnalyzer runs the analyzer registered under AnalyzerName over
// French inputs (plain words, mixed case, punctuation, stop words, nouns,
// adjectives, verbs and a few odd tokens) and compares the Term of every
// produced token with the expected one.
//
// Every failure message names the offending input, and a token-count mismatch
// is reported with Errorf so the remaining cases still run.
func TestFrenchAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input:  []byte(""),
			output: analysis.TokenStream{},
		},
		{
			input: []byte("chien chat cheval"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("chien CHAT CHEVAL"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte(" chien ,? + = - CHAT /: > CHEVAL"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("chien++"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
			},
		},
		{
			input: []byte("mot \"entreguillemet\""),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("mot"),
				},
				&analysis.Token{
					Term: []byte("entreguilemet"),
				},
			},
		},
		{
			input: []byte("Jean-François"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("jean"),
				},
				&analysis.Token{
					Term: []byte("francoi"),
				},
			},
		},
		// stop words
		{
			input: []byte("le la chien les aux chat du des à cheval"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		// nouns and adjectives
		{
			input: []byte("lances chismes habitable chiste éléments captifs"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("lanc"),
				},
				&analysis.Token{
					Term: []byte("chism"),
				},
				&analysis.Token{
					Term: []byte("habitabl"),
				},
				&analysis.Token{
					Term: []byte("chist"),
				},
				&analysis.Token{
					Term: []byte("element"),
				},
				&analysis.Token{
					Term: []byte("captif"),
				},
			},
		},
		// verbs
		{
			input: []byte("finissions souffrirent rugissante"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("finision"),
				},
				&analysis.Token{
					Term: []byte("soufrirent"),
				},
				&analysis.Token{
					Term: []byte("rugisant"),
				},
			},
		},
		{
			input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("c3po"),
				},
				&analysis.Token{
					Term: []byte("aujourd'hui"),
				},
				&analysis.Token{
					Term: []byte("oeuf"),
				},
				&analysis.Token{
					Term: []byte("ïaöuaä"),
				},
				&analysis.Token{
					Term: []byte("anticonstitutionel"),
				},
				&analysis.Token{
					Term: []byte("java"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			// Skip the per-token comparison, which assumes equal lengths, but
			// keep going so one bad case does not hide the others.
			t.Errorf("input %q: expected length: %d, got %d", test.input, len(test.output), len(actual))
			continue
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("input %q, token %d: expected term %s (% x) got %s (% x)", test.input, i, test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}

View File

@ -0,0 +1,37 @@
package fr
import (
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// ArticlesName is the name under which the French articles token map is
// registered.
const ArticlesName = "articles_fr"
// FrenchArticles holds, one per line, the entries that
// ArticlesTokenMapConstructor loads into the articles token map.
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var FrenchArticles = []byte(`
l
m
t
qu
n
s
j
d
c
jusqu
quoiqu
lorsqu
puisqu
`)
// ArticlesTokenMapConstructor creates a new token map and loads FrenchArticles
// into it. The map is returned together with whatever error LoadBytes
// reported, so on failure the caller receives both the map and a non-nil
// error. Neither config nor cache is used.
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	articles := analysis.NewTokenMap()
	if loadErr := articles.LoadBytes(FrenchArticles); loadErr != nil {
		return articles, loadErr
	}
	return articles, nil
}
// init registers ArticlesTokenMapConstructor with the registry as the token
// map named ArticlesName.
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,32 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"fmt"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// ElisionName is the name under which the French elision token filter is
// registered with the registry.
const ElisionName = "elision_fr"
// ElisionFilterConstructor builds an elision filter driven by the token map
// registered as ArticlesName, which it resolves through cache. The config
// argument is not used. If the token map cannot be resolved, a nil filter and
// a wrapped error are returned.
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	frenchArticles, lookupErr := cache.TokenMapNamed(ArticlesName)
	if lookupErr != nil {
		return nil, fmt.Errorf("error building elision filter: %v", lookupErr)
	}
	filter := elision_filter.NewElisionFilter(frenchArticles)
	return filter, nil
}
// init registers the French elision filter constructor under ElisionName.
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}

View File

@ -0,0 +1,50 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// TestFrenchElision checks that the filter registered as ElisionName strips a
// leading elided article from a token ("l'avion" becomes "avion").
func TestFrenchElision(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("l'avion"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("avion"),
				},
			},
		},
	}

	cache := registry.NewCache()
	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := elisionFilter.Filter(test.input)
		// Compare lengths first so that a filter which drops or adds tokens
		// is reported as a failure instead of panicking on an index below.
		if len(actual) != len(test.output) {
			t.Errorf("expected %d tokens, got %d", len(test.output), len(actual))
			continue
		}
		for i := range actual {
			if !reflect.DeepEqual(actual[i], test.output[i]) {
				t.Errorf("expected %s, got %s", test.output[i].Term, actual[i].Term)
			}
		}
	}
}

View File

@ -0,0 +1,308 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"bytes"
"unicode"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// LightStemmerName is the name under which the French light stemmer token
// filter is registered with the registry.
const LightStemmerName = "stemmer_fr_light"
// FrenchLightStemmerFilter is a token filter that replaces each token term
// with its light French stem. It has no fields.
type FrenchLightStemmerFilter struct{}

// NewFrenchLightStemmerFilter returns a new FrenchLightStemmerFilter.
func NewFrenchLightStemmerFilter() *FrenchLightStemmerFilter {
	return new(FrenchLightStemmerFilter)
}
// Filter replaces the term of every token in input with the result of stem
// and returns the same stream. Tokens are updated in place.
func (s *FrenchLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		tok := input[i]
		stemmed := stem(bytes.Runes(tok.Term))
		tok.Term = analysis.BuildTermFromRunes(stemmed)
	}
	return input
}
// stem applies the French light stemming rules to input and returns the
// stemmed runes. The rules are a fixed cascade of suffix checks, each guarded
// by a minimum length; most rules rewrite the suffix and return through norm
// immediately, while a few ("trice", "ète", "ique") rewrite and then fall
// through to the later rules. Rule order therefore matters.
//
// input is modified in place and the result shares its backing array.
func stem(input []rune) []rune {
inputLen := len(input)
// Plural in -x: "aux" becomes "al" unless preceded by 'e' (e.g. "chevaux"
// -> "cheval" but "peaux" -> "peau"); the trailing 'x' is always dropped.
if inputLen > 5 && input[inputLen-1] == 'x' {
if input[inputLen-3] == 'a' && input[inputLen-2] == 'u' && input[inputLen-4] != 'e' {
input[inputLen-2] = 'l'
}
input = input[0 : inputLen-1]
inputLen = len(input)
}
// Drop a remaining trailing 'x' on shorter words.
if inputLen > 3 && input[inputLen-1] == 'x' {
input = input[0 : inputLen-1]
inputLen = len(input)
}
// Drop a trailing plural 's'.
if inputLen > 3 && input[inputLen-1] == 's' {
input = input[0 : inputLen-1]
inputLen = len(input)
}
// "issement" -> "ir"
if inputLen > 9 && analysis.RunesEndsWith(input, "issement") {
input = input[0 : inputLen-6]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
// "issant" -> "ir"
if inputLen > 8 && analysis.RunesEndsWith(input, "issant") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
// "ement" -> "e"; if that leaves "ive", rewrite it to "if".
if inputLen > 6 && analysis.RunesEndsWith(input, "ement") {
input = input[0 : inputLen-4]
inputLen = len(input)
if inputLen > 3 && analysis.RunesEndsWith(input, "ive") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'f'
}
return norm(input)
}
// "ficatrice" -> "fier"
if inputLen > 11 && analysis.RunesEndsWith(input, "ficatrice") {
input = input[0 : inputLen-5]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
// "ficateur" -> "fier"
if inputLen > 10 && analysis.RunesEndsWith(input, "ficateur") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
// "catrice" -> "quer"
if inputLen > 9 && analysis.RunesEndsWith(input, "catrice") {
input = input[0 : inputLen-3]
inputLen = len(input)
input[inputLen-4] = 'q'
input[inputLen-3] = 'u'
input[inputLen-2] = 'e'
// input[inputLen-1] is already 'r', so no assignment is needed.
return norm(input)
}
// "cateur" -> "quer"
if inputLen > 8 && analysis.RunesEndsWith(input, "cateur") {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-4] = 'q'
input[inputLen-3] = 'u'
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
// "atrice" -> "er"
if inputLen > 8 && analysis.RunesEndsWith(input, "atrice") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
// "ateur" -> "er"
if inputLen > 7 && analysis.RunesEndsWith(input, "ateur") {
input = input[0 : inputLen-3]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
// "trice" -> "teur"; no return here, so the "teur" rule below can still
// apply to the rewritten word.
if inputLen > 6 && analysis.RunesEndsWith(input, "trice") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-3] = 'e'
input[inputLen-2] = 'u'
input[inputLen-1] = 'r'
}
// "ième" is removed entirely.
if inputLen > 5 && analysis.RunesEndsWith(input, "ième") {
return norm(input[0 : inputLen-4])
}
// "teuse" -> "ter"
if inputLen > 7 && analysis.RunesEndsWith(input, "teuse") {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
// "teur" -> "ter"
if inputLen > 6 && analysis.RunesEndsWith(input, "teur") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
// "euse" -> "eu"
if inputLen > 5 && analysis.RunesEndsWith(input, "euse") {
return norm(input[0 : inputLen-2])
}
// "ère" -> "er"
if inputLen > 8 && analysis.RunesEndsWith(input, "ère") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-2] = 'e'
return norm(input)
}
// "ive" -> "if"
if inputLen > 7 && analysis.RunesEndsWith(input, "ive") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'f'
return norm(input)
}
// "folle" -> "fou", "molle" -> "mou"
if inputLen > 4 &&
(analysis.RunesEndsWith(input, "folle") ||
analysis.RunesEndsWith(input, "molle")) {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-1] = 'u'
return norm(input)
}
// "nnelle" -> "n"
if inputLen > 9 && analysis.RunesEndsWith(input, "nnelle") {
return norm(input[0 : inputLen-5])
}
// "nnel" -> "n"
if inputLen > 9 && analysis.RunesEndsWith(input, "nnel") {
return norm(input[0 : inputLen-3])
}
// "ète" -> "et"; falls through to the remaining rules.
if inputLen > 4 && analysis.RunesEndsWith(input, "ète") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-2] = 'e'
}
// "ique" is removed; falls through to the remaining rules.
if inputLen > 8 && analysis.RunesEndsWith(input, "ique") {
input = input[0 : inputLen-4]
inputLen = len(input)
}
// "esse" -> "e"
if inputLen > 8 && analysis.RunesEndsWith(input, "esse") {
return norm(input[0 : inputLen-3])
}
// "inage" -> "in"
if inputLen > 7 && analysis.RunesEndsWith(input, "inage") {
return norm(input[0 : inputLen-3])
}
// "isation" is removed; a remaining "ual" becomes "uel"
// (e.g. "ritualisation" -> "rituel").
if inputLen > 9 && analysis.RunesEndsWith(input, "isation") {
input = input[0 : inputLen-7]
inputLen = len(input)
if inputLen > 5 && analysis.RunesEndsWith(input, "ual") {
input[inputLen-2] = 'e'
}
return norm(input)
}
// "isateur" is removed.
if inputLen > 9 && analysis.RunesEndsWith(input, "isateur") {
return norm(input[0 : inputLen-7])
}
// "ation" is removed.
if inputLen > 8 && analysis.RunesEndsWith(input, "ation") {
return norm(input[0 : inputLen-5])
}
// "ition" is removed.
if inputLen > 8 && analysis.RunesEndsWith(input, "ition") {
return norm(input[0 : inputLen-5])
}
// No suffix rule returned early: only normalize.
return norm(input)
}
// norm normalizes a (partially stemmed) French word and returns the result.
// input is modified in place and the result shares its backing array.
//
// For words longer than four runes it first folds the accented vowels and
// 'ç' to their unaccented forms, then collapses every run of identical
// letters to a single letter (digits and other non-letters are left alone,
// so "1234555" is unchanged). The two passes run one after the other: the
// collapse must see fully folded text, otherwise a fold that creates a new
// duplicate can shift a still-accented rune past the folding cursor.
//
// Afterwards, while the word is still longer than four runes, a trailing
// "ie" is removed, then at most one trailing 'r', up to two trailing 'e's,
// and finally one letter of a doubled trailing letter.
func norm(input []rune) []rune {
	if len(input) > 4 {
		// Pass 1: fold accents.
		for i, r := range input {
			switch r {
			case 'à', 'á', 'â':
				input[i] = 'a'
			case 'ô':
				input[i] = 'o'
			case 'è', 'é', 'ê':
				input[i] = 'e'
			case 'ù', 'û':
				input[i] = 'u'
			case 'î':
				input[i] = 'i'
			case 'ç':
				input[i] = 'c'
			}
		}

		// Pass 2: collapse repeated letters in place. kept is the number of
		// runes retained so far; input[kept-1] is the last retained rune.
		kept := 1
		for i := 1; i < len(input); i++ {
			if input[i] == input[kept-1] && unicode.IsLetter(input[i]) {
				continue
			}
			input[kept] = input[i]
			kept++
		}
		input = input[:kept]
	}

	n := len(input)
	if n > 4 && input[n-2] == 'i' && input[n-1] == 'e' {
		n -= 2
	}

	// n > 4 on entry and at most four runes are removed below, so n-1 and
	// n-2 stay valid indexes throughout.
	if n > 4 {
		if input[n-1] == 'r' {
			n--
		}
		if input[n-1] == 'e' {
			n--
		}
		if input[n-1] == 'e' {
			n--
		}
		if input[n-1] == input[n-2] && unicode.IsLetter(input[n-1]) {
			n--
		}
	}
	return input[:n]
}
// FrenchLightStemmerFilterConstructor returns a new FrenchLightStemmerFilter.
// The config and cache arguments are not used and the error is always nil.
func FrenchLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFrenchLightStemmerFilter(), nil
}
// init registers the French light stemmer constructor under LightStemmerName.
func init() {
registry.RegisterTokenFilter(LightStemmerName, FrenchLightStemmerFilterConstructor)
}

View File

@ -0,0 +1,997 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
func TestFrenchLightStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chevaux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("cheval"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("hiboux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("hibou"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("hibou"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("hibou"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chantés"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chanter"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chante"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("baronnes"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("barons"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("peaux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("peau"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("peau"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("peau"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("anneaux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("aneau"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("anneau"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("aneau"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("neveux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("neveu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("neveu"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("neveu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("affreux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("afreu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("affreuse"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("afreu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("investissement"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("investi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("investir"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("investi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("assourdissant"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("asourdi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("assourdir"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("asourdi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("pratiquement"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("pratiqu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("pratique"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("pratiqu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("administrativement"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("administratif"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("administratif"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("administratif"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("justificatrice"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("justifi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("justificateur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("justifi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("justifier"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("justifi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("educatrice"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("eduqu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("eduquer"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("eduqu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("communicateur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("comuniqu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("communiquer"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("comuniqu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("accompagnatrice"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("acompagn"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("accompagnateur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("acompagn"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("administrateur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("administr"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("administrer"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("administr"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("productrice"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("product"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("producteur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("product"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("acheteuse"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("achet"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("acheteur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("achet"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("planteur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("plant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("plante"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("plant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("poreuse"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("poreu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("poreux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("poreu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("plieuse"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("plieu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("bijoutière"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("bijouti"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("bijoutier"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("bijouti"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("caissière"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("caisi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("caissier"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("caisi"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abrasive"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abrasif"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abrasif"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abrasif"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("folle"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("fou"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("fou"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("fou"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("personnelle"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("person"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("personne"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("person"),
},
},
},
// algo bug: too short length
// {
// input: analysis.TokenStream{
// &analysis.Token{
// Term: []byte("personnel"),
// },
// },
// output: analysis.TokenStream{
// &analysis.Token{
// Term: []byte("person"),
// },
// },
// },
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("complète"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("complet"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("complet"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("complet"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("aromatique"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("aromat"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("faiblesse"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("faibl"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("faible"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("faibl"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("patinage"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("patin"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("patin"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("patin"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("sonorisation"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("sono"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ritualisation"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("rituel"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("rituel"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("rituel"),
},
},
},
// algo bug: masked by rules above
// {
// input: analysis.TokenStream{
// &analysis.Token{
// Term: []byte("colonisateur"),
// },
// },
// output: analysis.TokenStream{
// &analysis.Token{
// Term: []byte("colon"),
// },
// },
// },
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("nomination"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("nomin"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("disposition"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("dispos"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("dispose"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("dispos"),
},
},
},
// SOLR-3463 : abusive compression of repeated characters in numbers
// Trailing repeated char elision :
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("1234555"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("1234555"),
},
},
},
// Repeated char within numbers with more than 4 characters :
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("12333345"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("12333345"),
},
},
},
// Short numbers weren't affected already:
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("1234"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("1234"),
},
},
},
// Ensure behaviour is preserved for words!
// Trailing repeated char elision :
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcdeff"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcdef"),
},
},
},
// Repeated char within words with more than 4 characters :
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcccddeef"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcdef"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("créées"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("cre"),
},
},
},
// Combined letter and digit repetition
// 10:00pm
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("22hh00"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("22h00"),
},
},
},
}
cache := registry.NewCache()
filter, err := cache.TokenFilterNamed(LightStemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,81 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"bytes"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)
// MinimalStemmerName is the name under which the French minimal stemmer token
// filter is registered with the registry.
const MinimalStemmerName = "stemmer_fr_min"
// FrenchMinimalStemmerFilter is a token filter that replaces each token term
// with its minimal French stem. It has no fields.
type FrenchMinimalStemmerFilter struct{}

// NewFrenchMinimalStemmerFilter returns a new FrenchMinimalStemmerFilter.
func NewFrenchMinimalStemmerFilter() *FrenchMinimalStemmerFilter {
	return new(FrenchMinimalStemmerFilter)
}
// Filter replaces the term of every token in input with the result of
// minstem and returns the same stream. Tokens are updated in place.
func (s *FrenchMinimalStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		tok := input[i]
		stemmed := minstem(bytes.Runes(tok.Term))
		tok.Term = analysis.BuildTermFromRunes(stemmed)
	}
	return input
}
// minstem applies the minimal French stemming rules to input and returns the
// stemmed runes, which share input's backing array.
//
// Words shorter than six runes are returned untouched. A word ending in 'x'
// loses the 'x', with a preceding "au" rewritten to "al" in place. Otherwise
// a trailing 's', 'r', 'e' and 'é' are each removed at most once, in that
// order, followed by one rune of a doubled final rune.
func minstem(input []rune) []rune {
	n := len(input)
	if n < 6 {
		return input
	}

	if input[n-1] == 'x' {
		if input[n-3] == 'a' && input[n-2] == 'u' {
			input[n-2] = 'l'
		}
		return input[:n-1]
	}

	// n >= 6 here and at most five runes are trimmed below, so n-1 and n-2
	// remain valid indexes.
	for _, ending := range [...]rune{'s', 'r', 'e', 'é'} {
		if input[n-1] == ending {
			n--
		}
	}
	if input[n-1] == input[n-2] {
		n--
	}
	return input[:n]
}
// FrenchMinimalStemmerFilterConstructor returns a new
// FrenchMinimalStemmerFilter. The config and cache arguments are not used and
// the error is always nil.
func FrenchMinimalStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFrenchMinimalStemmerFilter(), nil
}
// init registers the French minimal stemmer constructor under MinimalStemmerName.
func init() {
registry.RegisterTokenFilter(MinimalStemmerName, FrenchMinimalStemmerFilterConstructor)
}

Some files were not shown because too many files have changed in this diff Show More