Add message logging and search server side
This commit is contained in:
parent
6378131a9d
commit
3365832ce3
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,4 +1,2 @@
|
||||
bin/
|
||||
client/dist/
|
||||
client/node_modules/
|
||||
data.db
|
||||
client/node_modules/
|
53
Godeps/Godeps.json
generated
53
Godeps/Godeps.json
generated
@ -1,19 +1,72 @@
|
||||
{
|
||||
"ImportPath": "github.com/khlieng/name_pending",
|
||||
"GoVersion": "go1.4",
|
||||
"Packages": [
|
||||
"./..."
|
||||
],
|
||||
"Deps": [
|
||||
{
|
||||
"ImportPath": "github.com/blevesearch/bleve",
|
||||
"Rev": "16f538d7b76dd85c935a3104c390307cae5cbf79"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/blevesearch/go-porterstemmer",
|
||||
"Comment": "v1.0.1-9-g23a2c8e",
|
||||
"Rev": "23a2c8e5cf1f380f27722c6d2ae8896431dc7d0e"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/blevesearch/segment",
|
||||
"Rev": "9588637ce3caba8516208ccc17193ddedd741418"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/boltdb/bolt",
|
||||
"Comment": "v1.0-43-gcf33c9e",
|
||||
"Rev": "cf33c9e0ca0a23509b8bb8edfc63e4776bb1a330"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/cznic/b",
|
||||
"Rev": "c4adf3a58579a2d57cd3097f455dcdf75edcdfd8"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/golang/protobuf/proto",
|
||||
"Rev": "655cdfa588ea190e901bc5590e65d5621688847c"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/julienschmidt/httprouter",
|
||||
"Rev": "b428fda53bb0a764fea9c76c9413512eda291dec"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/ryszard/goskiplist/skiplist",
|
||||
"Rev": "2dfbae5fcf46374f166f8969cb07e167f1be6273"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/steveyen/gtreap",
|
||||
"Rev": "72cd76f34c91f8d64a031af97b499e4a0b1a6e0c"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
|
||||
"Rev": "4875955338b0a434238a31165cb87255ab6e9e4a"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/syndtr/gosnappy/snappy",
|
||||
"Rev": "156a073208e131d7d2e212cb749feae7c339e846"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/willf/bitset",
|
||||
"Comment": "v1.0.0-17-g4b22041",
|
||||
"Rev": "4b220417a489359f934045d0509d941a7a2a1038"
|
||||
},
|
||||
{
|
||||
"ImportPath": "golang.org/x/net/websocket",
|
||||
"Rev": "3d87fd621ca9a824c5cff17216ce44769456cb3f"
|
||||
},
|
||||
{
|
||||
"ImportPath": "golang.org/x/text/transform",
|
||||
"Rev": "c92eb3cd6e70951a111680995e651ea4b2c35539"
|
||||
},
|
||||
{
|
||||
"ImportPath": "golang.org/x/text/unicode/norm",
|
||||
"Rev": "c92eb3cd6e70951a111680995e651ea4b2c35539"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
17
Godeps/_workspace/src/github.com/blevesearch/bleve/.gitignore
generated
vendored
Normal file
17
Godeps/_workspace/src/github.com/blevesearch/bleve/.gitignore
generated
vendored
Normal file
@ -0,0 +1,17 @@
|
||||
#*
|
||||
*.sublime-*
|
||||
*~
|
||||
.#*
|
||||
.project
|
||||
.settings
|
||||
.DS_Store
|
||||
/analysis/token_filters/cld2/cld2-read-only
|
||||
/analysis/token_filters/cld2/libcld2_full.a
|
||||
/utils/bleve_create/bleve_create
|
||||
/utils/bleve_dump/bleve_dump
|
||||
/utils/bleve_index/bleve_index
|
||||
/utils/bleve_bulkindex/bleve_bulkindex
|
||||
/utils/bleve_index/index.bleve/
|
||||
/utils/bleve_query/bleve_query
|
||||
/utils/bleve_registry/bleve_registry
|
||||
/y.output
|
19
Godeps/_workspace/src/github.com/blevesearch/bleve/.travis.yml
generated
vendored
Normal file
19
Godeps/_workspace/src/github.com/blevesearch/bleve/.travis.yml
generated
vendored
Normal file
@ -0,0 +1,19 @@
|
||||
language: go
|
||||
|
||||
go:
|
||||
- 1.4
|
||||
|
||||
script:
|
||||
- go get golang.org/x/tools/cmd/vet
|
||||
- go get golang.org/x/tools/cmd/cover
|
||||
- go get github.com/mattn/goveralls
|
||||
- go get github.com/kisielk/errcheck
|
||||
- go test -v ./...
|
||||
- go vet ./...
|
||||
- errcheck ./...
|
||||
- docs/project-code-coverage.sh
|
||||
- docs/build_children.sh
|
||||
|
||||
notifications:
|
||||
email:
|
||||
- marty.schoch@gmail.com
|
202
Godeps/_workspace/src/github.com/blevesearch/bleve/LICENSE
generated
vendored
Normal file
202
Godeps/_workspace/src/github.com/blevesearch/bleve/LICENSE
generated
vendored
Normal file
@ -0,0 +1,202 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
61
Godeps/_workspace/src/github.com/blevesearch/bleve/README.md
generated
vendored
Normal file
61
Godeps/_workspace/src/github.com/blevesearch/bleve/README.md
generated
vendored
Normal file
@ -0,0 +1,61 @@
|
||||
# ![bleve](docs/bleve.png) bleve
|
||||
|
||||
[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/blevesearch/bleve/badge.png?branch=master)](https://coveralls.io/r/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) [![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
|
||||
modern text indexing in go - [blevesearch.com](http://www.blevesearch.com/)
|
||||
|
||||
Try out bleve live by [searching our wiki](http://wikisearch.blevesearch.com/search/).
|
||||
|
||||
## Features
|
||||
|
||||
* Index any go data structure (including JSON)
|
||||
* Intelligent defaults backed up by powerful configuration
|
||||
* Supported field types:
|
||||
* Text, Numeric, Date
|
||||
* Supported query types:
|
||||
* Term, Phrase, Match, Match Phrase, Prefix
|
||||
* Conjunction, Disjunction, Boolean
|
||||
* Numeric Range, Date Range
|
||||
* Simple query [syntax](https://github.com/blevesearch/bleve/wiki/Query-String-Query) for human entry
|
||||
* tf-idf Scoring
|
||||
* Search result match highlighting
|
||||
* Supports Aggregating Facets:
|
||||
* Terms Facet
|
||||
* Numeric Range Facet
|
||||
* Date Range Facet
|
||||
|
||||
## Discussion
|
||||
|
||||
Discuss usage and development of bleve in the [google group](https://groups.google.com/forum/#!forum/bleve).
|
||||
|
||||
## Indexing
|
||||
|
||||
message := struct{
|
||||
Id string
|
||||
From string
|
||||
Body string
|
||||
}{
|
||||
Id: "example",
|
||||
From: "marty.schoch@gmail.com",
|
||||
Body: "bleve indexing is easy",
|
||||
}
|
||||
|
||||
mapping := bleve.NewIndexMapping()
|
||||
index, err := bleve.New("example.bleve", mapping)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
index.Index(message.Id, message)
|
||||
|
||||
## Querying
|
||||
|
||||
index, _ := bleve.Open("example.bleve")
|
||||
query := bleve.NewQueryStringQuery("bleve")
|
||||
searchRequest := bleve.NewSearchRequest(query)
|
||||
searchResult, _ := index.Search(searchRequest)
|
||||
|
||||
## License
|
||||
|
||||
Apache License Version 2.0
|
||||
|
||||
|
130
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer/custom_analyzer.go
generated
vendored
Normal file
130
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer/custom_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,130 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package standard_analyzer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "custom"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
|
||||
var err error
|
||||
var charFilters []analysis.CharFilter
|
||||
charFiltersNames, ok := config["char_filters"].([]string)
|
||||
if ok {
|
||||
charFilters, err = getCharFilters(charFiltersNames, cache)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
charFiltersNamesInterfaceSlice, ok := config["char_filters"].([]interface{})
|
||||
if ok {
|
||||
charFiltersNames, err := convertInterfaceSliceToStringSlice(charFiltersNamesInterfaceSlice, "char filter")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
charFilters, err = getCharFilters(charFiltersNames, cache)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tokenizerName, ok := config["tokenizer"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify tokenizer")
|
||||
}
|
||||
|
||||
tokenizer, err := cache.TokenizerNamed(tokenizerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var tokenFilters []analysis.TokenFilter
|
||||
tokenFiltersNames, ok := config["token_filters"].([]string)
|
||||
if ok {
|
||||
tokenFilters, err = getTokenFilters(tokenFiltersNames, cache)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
tokenFiltersNamesInterfaceSlice, ok := config["token_filters"].([]interface{})
|
||||
if ok {
|
||||
tokenFiltersNames, err := convertInterfaceSliceToStringSlice(tokenFiltersNamesInterfaceSlice, "token filter")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tokenFilters, err = getTokenFilters(tokenFiltersNames, cache)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
}
|
||||
if charFilters != nil {
|
||||
rv.CharFilters = charFilters
|
||||
}
|
||||
if tokenFilters != nil {
|
||||
rv.TokenFilters = tokenFilters
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||
}
|
||||
|
||||
func getCharFilters(charFilterNames []string, cache *registry.Cache) ([]analysis.CharFilter, error) {
|
||||
charFilters := make([]analysis.CharFilter, len(charFilterNames))
|
||||
for i, charFilterName := range charFilterNames {
|
||||
charFilter, err := cache.CharFilterNamed(charFilterName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
charFilters[i] = charFilter
|
||||
}
|
||||
|
||||
return charFilters, nil
|
||||
}
|
||||
|
||||
func getTokenFilters(tokenFilterNames []string, cache *registry.Cache) ([]analysis.TokenFilter, error) {
|
||||
tokenFilters := make([]analysis.TokenFilter, len(tokenFilterNames))
|
||||
for i, tokenFilterName := range tokenFilterNames {
|
||||
tokenFilter, err := cache.TokenFilterNamed(tokenFilterName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tokenFilters[i] = tokenFilter
|
||||
}
|
||||
|
||||
return tokenFilters, nil
|
||||
}
|
||||
|
||||
func convertInterfaceSliceToStringSlice(interfaceSlice []interface{}, objType string) ([]string, error) {
|
||||
stringSlice := make([]string, len(interfaceSlice))
|
||||
for i, interfaceObj := range interfaceSlice {
|
||||
stringObj, ok := interfaceObj.(string)
|
||||
if ok {
|
||||
stringSlice[i] = stringObj
|
||||
} else {
|
||||
return nil, fmt.Errorf(objType + " name must be a string")
|
||||
}
|
||||
}
|
||||
|
||||
return stringSlice, nil
|
||||
}
|
49
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go
generated
vendored
Normal file
49
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,49 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build cld2 full
|
||||
|
||||
package detect_lang_analyzer
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/cld2"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "detect_lang"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
detectLangFilter, err := cache.TokenFilterNamed(cld2.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: keywordTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
detectLangFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||
}
|
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer/keyword_analyzer.go
generated
vendored
Normal file
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer/keyword_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package keyword_analyzer
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "keyword"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: keywordTokenizer,
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||
}
|
41
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/simple_analyzer/simple_analyzer.go
generated
vendored
Normal file
41
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/simple_analyzer/simple_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,41 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package simple_analyzer
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "simple"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||
}
|
47
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer/standard_analyzer.go
generated
vendored
Normal file
47
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer/standard_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package standard_analyzer
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "standard"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopEnFilter, err := cache.TokenFilterNamed(en.StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopEnFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||
}
|
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/ignore/ignore_byte_array_converter.go
generated
vendored
Normal file
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/ignore/ignore_byte_array_converter.go
generated
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ignore_byte_array_converter
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
type IgnoreByteArrayConverter struct{}
|
||||
|
||||
func NewIgnoreByteArrayConverter() *IgnoreByteArrayConverter {
|
||||
return &IgnoreByteArrayConverter{}
|
||||
}
|
||||
|
||||
func (c *IgnoreByteArrayConverter) Convert(in []byte) (interface{}, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
|
||||
return NewIgnoreByteArrayConverter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterByteArrayConverter("ignore", Constructor)
|
||||
}
|
40
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/json/json_byte_array_converter.go
generated
vendored
Normal file
40
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/json/json_byte_array_converter.go
generated
vendored
Normal file
@ -0,0 +1,40 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

// Package json_byte_array_converter registers the "json" byte array
// converter, which decodes input bytes as a JSON object.
package json_byte_array_converter

import (
	"encoding/json"

	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)

// JSONByteArrayConverter interprets byte slices as JSON objects.
type JSONByteArrayConverter struct{}

// NewJSONByteArrayConverter returns a JSON-decoding converter.
func NewJSONByteArrayConverter() *JSONByteArrayConverter {
	return &JSONByteArrayConverter{}
}

// Convert unmarshals in as a JSON object. It returns the decoding error
// if the input is not a valid JSON object (note: top-level arrays and
// scalars will also fail, since the target is a map).
func (c *JSONByteArrayConverter) Convert(in []byte) (interface{}, error) {
	var doc map[string]interface{}
	if err := json.Unmarshal(in, &doc); err != nil {
		return nil, err
	}
	return doc, nil
}

// Constructor satisfies the registry signature for byte array converters.
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
	return NewJSONByteArrayConverter(), nil
}

func init() {
	registry.RegisterByteArrayConverter("json", Constructor)
}
|
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/string/string_byte_array_conveter.go
generated
vendored
Normal file
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/string/string_byte_array_conveter.go
generated
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package string_byte_array_converter
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
type StringByteArrayConverter struct{}
|
||||
|
||||
func NewStringByteArrayConverter() *StringByteArrayConverter {
|
||||
return &StringByteArrayConverter{}
|
||||
}
|
||||
|
||||
func (c *StringByteArrayConverter) Convert(in []byte) (interface{}, error) {
|
||||
return string(in), nil
|
||||
}
|
||||
|
||||
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
|
||||
return NewStringByteArrayConverter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterByteArrayConverter("string", Constructor)
|
||||
}
|
31
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/html_char_filter/html_char_filter.go
generated
vendored
Normal file
31
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/html_char_filter/html_char_filter.go
generated
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

// Package html_char_filter registers the "html" char filter, which blanks
// out HTML/XML tags by replacing each tag with spaces of equal length.
package html_char_filter

import (
	"regexp"

	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)

// Name is the identifier this char filter is registered under.
const Name = "html"

// htmlCharFilterRegexp matches opening, closing and self-closing tags,
// including attributes with quoted or bare values.
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)

// CharFilterConstructor builds the filter; each tag match is padded out
// with single-space bytes so byte offsets into the text are preserved.
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
	return regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, []byte(" ")), nil
}

func init() {
	registry.RegisterCharFilter(Name, CharFilterConstructor)
}
|
58
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter/regexp_char_filter.go
generated
vendored
Normal file
58
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter/regexp_char_filter.go
generated
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

// Package regexp_char_filter registers the "regexp" char filter, which
// replaces every regexp match with a configurable replacement, repeated
// to the length of the match so offsets are preserved.
package regexp_char_filter

import (
	"bytes"
	"fmt"
	"regexp"

	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)

// Name is the identifier this char filter is registered under.
const Name = "regexp"

// RegexpCharFilter blanks out regexp matches with a replacement byte
// sequence repeated once per matched byte.
type RegexpCharFilter struct {
	r           *regexp.Regexp
	replacement []byte
}

// NewRegexpCharFilter builds a filter from a compiled pattern and the
// replacement unit to repeat over each match.
func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter {
	return &RegexpCharFilter{
		r:           r,
		replacement: replacement,
	}
}

// Filter substitutes each match with the replacement repeated len(match)
// times; non-matching input passes through untouched.
func (s *RegexpCharFilter) Filter(input []byte) []byte {
	return s.r.ReplaceAllFunc(input, func(in []byte) []byte {
		return bytes.Repeat(s.replacement, len(in))
	})
}

// RegexpCharFilterConstructor builds the filter from config. "regexp"
// (required) is the pattern; "replace" (optional) overrides the default
// single-space replacement.
func RegexpCharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
	pattern, ok := config["regexp"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify regexp")
	}
	r, err := regexp.Compile(pattern)
	if err != nil {
		return nil, fmt.Errorf("unable to build regexp char filter: %v", err)
	}
	replacement := []byte(" ")
	if replaceStr, ok := config["replace"].(string); ok {
		replacement = []byte(replaceStr)
	}
	return NewRegexpCharFilter(r, replacement), nil
}

func init() {
	registry.RegisterCharFilter(Name, RegexpCharFilterConstructor)
}
|
82
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter/regexp_char_filter_test.go
generated
vendored
Normal file
82
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter/regexp_char_filter_test.go
generated
vendored
Normal file
@ -0,0 +1,82 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

package regexp_char_filter

import (
	"reflect"
	"regexp"
	"testing"
)

// TestRegexpCharFilter verifies that HTML tags are blanked out with runs
// of spaces matching each tag's byte length, leaving text content and
// newlines in place.
func TestRegexpCharFilter(t *testing.T) {
	htmlRegex := regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)

	cases := []struct {
		input  []byte
		output []byte
	}{
		{
			input: []byte(`<!DOCTYPE html>
<html>
<body>

<h1>My First Heading</h1>

<p>My first paragraph.</p>

</body>
</html>`),
			output: []byte(`               
      
      

    My First Heading     

   My first paragraph.    

       
       `),
		},
	}

	for _, test := range cases {
		output := NewRegexpCharFilter(htmlRegex, []byte{' '}).Filter(test.input)
		if !reflect.DeepEqual(output, test.output) {
			t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
		}
	}
}

// TestZeroWidthNonJoinerCharFilter verifies that U+200C (a 3-byte UTF-8
// sequence) is replaced — note each occurrence collapses to a single
// space because the replacement is one byte repeated... no: it is
// repeated per matched byte, so behavior depends on the filter; this
// test pins the expected single-space output.
func TestZeroWidthNonJoinerCharFilter(t *testing.T) {
	zeroWidthNonJoinerRegex := regexp.MustCompile(`\x{200C}`)

	cases := []struct {
		input  []byte
		output []byte
	}{
		{
			input:  []byte("water\u200Cunder\u200Cthe\u200Cbridge"),
			output: []byte("water under the bridge"),
		},
	}

	for _, test := range cases {
		output := NewRegexpCharFilter(zeroWidthNonJoinerRegex, []byte{' '}).Filter(test.input)
		if !reflect.DeepEqual(output, test.output) {
			t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
		}
	}
}
|
31
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go
generated
vendored
Normal file
31
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go
generated
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

// Package zero_width_non_joiner registers the "zero_width_spaces" char
// filter, which replaces zero-width non-joiner characters (U+200C) with
// spaces.
package zero_width_non_joiner

import (
	"regexp"

	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)

// Name is the identifier this char filter is registered under.
const Name = "zero_width_spaces"

// zeroWidthNonJoinerRegexp matches the U+200C code point.
var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`)

// CharFilterConstructor builds the filter with a single-space replacement.
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
	return regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, []byte(" ")), nil
}

func init() {
	registry.RegisterCharFilter(Name, CharFilterConstructor)
}
|
40
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/datetime_optional/datetime_optional.go
generated
vendored
Normal file
40
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/datetime_optional/datetime_optional.go
generated
vendored
Normal file
@ -0,0 +1,40 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

// Package datetime_optional registers the "dateTimeOptional" date/time
// parser: RFC 3339 timestamps with progressively optional components
// (sub-second precision, timezone, the 'T' separator, the time of day).
//
// NOTE(review): this file previously declared `package html_char_filter`,
// an apparent copy/paste slip from the HTML char filter file — it lives
// under datetime_parsers/datetime_optional and has nothing to do with
// char filtering. Renamed to match the directory per Go convention; the
// package is consumed via blank imports for registration, so no caller
// references the old identifier.
package datetime_optional

import (
	"time"

	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)

// Name is the identifier this date-time parser is registered under.
const Name = "dateTimeOptional"

// Reference layouts for the progressively relaxed RFC 3339 variants.
const rfc3339NoTimezone = "2006-01-02T15:04:05"
const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05"
const rfc3339NoTime = "2006-01-02"

// layouts is tried in order from most to least specific.
var layouts = []string{
	time.RFC3339Nano,
	time.RFC3339,
	rfc3339NoTimezone,
	rfc3339NoTimezoneNoT,
	rfc3339NoTime,
}

// DateTimeParserConstructor builds the parser over the fixed layout list;
// config is unused.
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
	return flexible_go.NewFlexibleGoDateTimeParser(layouts), nil
}

func init() {
	registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
}
|
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go/flexible_go.go
generated
vendored
Normal file
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go/flexible_go.go
generated
vendored
Normal file
@ -0,0 +1,59 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

// Package flexible_go registers the "flexiblego" date-time parser, which
// tries a configurable list of Go reference layouts in order.
package flexible_go

import (
	"fmt"
	"time"

	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
)

// Name is the identifier this date-time parser is registered under.
const Name = "flexiblego"

// FlexibleGoDateTimeParser parses timestamps by attempting each layout
// in order and returning the first successful parse.
type FlexibleGoDateTimeParser struct {
	layouts []string
}

// NewFlexibleGoDateTimeParser builds a parser over the given layouts.
func NewFlexibleGoDateTimeParser(layouts []string) *FlexibleGoDateTimeParser {
	return &FlexibleGoDateTimeParser{
		layouts: layouts,
	}
}

// ParseDateTime returns the first successful parse of input against the
// configured layouts, or analysis.ErrInvalidDateTime when none match.
func (p *FlexibleGoDateTimeParser) ParseDateTime(input string) (time.Time, error) {
	for _, layout := range p.layouts {
		if rv, err := time.Parse(layout, input); err == nil {
			return rv, nil
		}
	}
	return time.Time{}, analysis.ErrInvalidDateTime
}

// FlexibleGoDateTimeParserConstructor builds the parser from config.
// "layouts" (required) is a list; non-string entries are silently
// skipped, matching the original lenient behavior.
func FlexibleGoDateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
	raw, ok := config["layouts"].([]interface{})
	if !ok {
		return nil, fmt.Errorf("must specify layouts")
	}
	layoutStrs := make([]string, 0)
	for _, entry := range raw {
		if s, ok := entry.(string); ok {
			layoutStrs = append(layoutStrs, s)
		}
	}
	return NewFlexibleGoDateTimeParser(layoutStrs), nil
}

func init() {
	registry.RegisterDateTimeParser(Name, FlexibleGoDateTimeParserConstructor)
}
|
84
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go/flexible_go_test.go
generated
vendored
Normal file
84
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go/flexible_go_test.go
generated
vendored
Normal file
@ -0,0 +1,84 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

package flexible_go

import (
	"reflect"
	"testing"
	"time"

	"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
)

// TestFlexibleDateTimeParser exercises the parser over the same layout
// list the dateTimeOptional parser uses, covering date-only input,
// missing timezone, missing 'T', full RFC 3339 with and without
// nanoseconds, and an unparseable string.
func TestFlexibleDateTimeParser(t *testing.T) {
	testLocation := time.FixedZone("", -8*60*60)

	cases := []struct {
		input         string
		expectedTime  time.Time
		expectedError error
	}{
		{
			input:         "2014-08-03",
			expectedTime:  time.Date(2014, 8, 3, 0, 0, 0, 0, time.UTC),
			expectedError: nil,
		},
		{
			input:         "2014-08-03T15:59:30",
			expectedTime:  time.Date(2014, 8, 3, 15, 59, 30, 0, time.UTC),
			expectedError: nil,
		},
		{
			input:         "2014-08-03 15:59:30",
			expectedTime:  time.Date(2014, 8, 3, 15, 59, 30, 0, time.UTC),
			expectedError: nil,
		},
		{
			input:         "2014-08-03T15:59:30-08:00",
			expectedTime:  time.Date(2014, 8, 3, 15, 59, 30, 0, testLocation),
			expectedError: nil,
		},
		{
			input:         "2014-08-03T15:59:30.999999999-08:00",
			expectedTime:  time.Date(2014, 8, 3, 15, 59, 30, 999999999, testLocation),
			expectedError: nil,
		},
		{
			input:         "not a date time",
			expectedTime:  time.Time{},
			expectedError: analysis.ErrInvalidDateTime,
		},
	}

	parser := NewFlexibleGoDateTimeParser([]string{
		time.RFC3339Nano,
		time.RFC3339,
		"2006-01-02T15:04:05",
		"2006-01-02 15:04:05",
		"2006-01-02",
	})

	for _, test := range cases {
		actualTime, actualErr := parser.ParseDateTime(test.input)
		if actualErr != test.expectedError {
			t.Errorf("expected error %#v, got %#v", test.expectedError, actualErr)
			continue
		}
		if !reflect.DeepEqual(actualTime, test.expectedTime) {
			t.Errorf("expected time %#v, got %#v", test.expectedTime, actualTime)
			t.Errorf("expected location %#v,\n got %#v", test.expectedTime.Location(), actualTime.Location())
		}
	}
}
|
88
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/freq.go
generated
vendored
Normal file
88
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/freq.go
generated
vendored
Normal file
@ -0,0 +1,88 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package analysis
|
||||
|
||||
type TokenLocation struct {
|
||||
Field string
|
||||
Start int
|
||||
End int
|
||||
Position int
|
||||
}
|
||||
|
||||
type TokenFreq struct {
|
||||
Term []byte
|
||||
Locations []*TokenLocation
|
||||
}
|
||||
|
||||
type TokenFrequencies []*TokenFreq
|
||||
|
||||
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) TokenFrequencies {
|
||||
// put existing tokens into a map
|
||||
index := make(map[string]*TokenFreq)
|
||||
for _, tf := range tfs {
|
||||
index[string(tf.Term)] = tf
|
||||
}
|
||||
// walk the new token frequencies
|
||||
for _, tf := range other {
|
||||
// set the remoteField value in incoming token freqs
|
||||
for _, l := range tf.Locations {
|
||||
l.Field = remoteField
|
||||
}
|
||||
existingTf, exists := index[string(tf.Term)]
|
||||
if exists {
|
||||
existingTf.Locations = append(existingTf.Locations, tf.Locations...)
|
||||
} else {
|
||||
index[string(tf.Term)] = tf
|
||||
}
|
||||
}
|
||||
// flatten map back to array
|
||||
rv := make(TokenFrequencies, len(index))
|
||||
i := 0
|
||||
for _, tf := range index {
|
||||
rv[i] = tf
|
||||
i++
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func TokenFrequency(tokens TokenStream) TokenFrequencies {
|
||||
index := make(map[string]*TokenFreq)
|
||||
|
||||
for _, token := range tokens {
|
||||
curr, ok := index[string(token.Term)]
|
||||
if ok {
|
||||
curr.Locations = append(curr.Locations, &TokenLocation{
|
||||
Start: token.Start,
|
||||
End: token.End,
|
||||
Position: token.Position,
|
||||
})
|
||||
} else {
|
||||
index[string(token.Term)] = &TokenFreq{
|
||||
Term: token.Term,
|
||||
Locations: []*TokenLocation{
|
||||
&TokenLocation{
|
||||
Start: token.Start,
|
||||
End: token.End,
|
||||
Position: token.Position,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rv := make(TokenFrequencies, len(index))
|
||||
i := 0
|
||||
for _, tf := range index {
|
||||
rv[i] = tf
|
||||
i++
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
167
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/freq_test.go
generated
vendored
Normal file
167
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/freq_test.go
generated
vendored
Normal file
@ -0,0 +1,167 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

package analysis

import (
	"reflect"
	"testing"
)

// loc builds a *TokenLocation fixture.
func loc(field string, position, start, end int) *TokenLocation {
	return &TokenLocation{Field: field, Position: position, Start: start, End: end}
}

// waterFreqs builds a single-term TokenFrequencies fixture for "water"
// occurring at positions 1 and 2, tagged with the given field.
func waterFreqs(field string) TokenFrequencies {
	return TokenFrequencies{
		&TokenFreq{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				loc(field, 1, 0, 5),
				loc(field, 2, 6, 11),
			},
		},
	}
}

// TestTokenFrequency checks that duplicate tokens collapse into one
// TokenFreq carrying both locations.
func TestTokenFrequency(t *testing.T) {
	tokens := TokenStream{
		&Token{Term: []byte("water"), Position: 1, Start: 0, End: 5},
		&Token{Term: []byte("water"), Position: 2, Start: 6, End: 11},
	}
	expectedResult := waterFreqs("")

	result := TokenFrequency(tokens)
	if !reflect.DeepEqual(result, expectedResult) {
		t.Errorf("expected %#v, got %#v", expectedResult, result)
	}
}

// TestTokenFrequenciesMergeAll checks that merging a remote set appends
// field-tagged locations onto the receiver's existing entries in place.
func TestTokenFrequenciesMergeAll(t *testing.T) {
	tf1 := waterFreqs("")
	tf2 := waterFreqs("")
	expectedResult := TokenFrequencies{
		&TokenFreq{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				loc("", 1, 0, 5),
				loc("", 2, 6, 11),
				loc("tf2", 1, 0, 5),
				loc("tf2", 2, 6, 11),
			},
		},
	}

	tf1.MergeAll("tf2", tf2)
	if !reflect.DeepEqual(tf1, expectedResult) {
		t.Errorf("expected %#v, got %#v", expectedResult, tf1)
	}
}

// TestTokenFrequenciesMergeAllLeftEmpty checks that merging into an
// empty receiver returns the remote entries, field-tagged.
func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) {
	tf1 := TokenFrequencies{}
	tf2 := waterFreqs("")
	expectedResult := waterFreqs("tf2")

	result := tf1.MergeAll("tf2", tf2)
	if !reflect.DeepEqual(result, expectedResult) {
		t.Errorf("expected %#v, got %#v", expectedResult, result)
	}
}
|
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar.go
generated
vendored
Normal file
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar.go
generated
vendored
Normal file
@ -0,0 +1,59 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "ar"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
|
||||
stopArFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeArFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerArFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
normalizeFilter,
|
||||
stopArFilter,
|
||||
normalizeArFilter,
|
||||
stemmerArFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
179
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar_test.go
generated
vendored
Normal file
179
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar_test.go
generated
vendored
Normal file
@ -0,0 +1,179 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestArabicAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte("كبير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كبير"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
// feminine marker
|
||||
{
|
||||
input: []byte("كبيرة"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كبير"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("مشروب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مشروب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
// plural -at
|
||||
{
|
||||
input: []byte("مشروبات"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مشروب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
// plural -in
|
||||
{
|
||||
input: []byte("أمريكيين"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("امريك"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
// singular with bare alif
|
||||
{
|
||||
input: []byte("امريكي"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("امريك"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("كتاب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتاب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
// definite article
|
||||
{
|
||||
input: []byte("الكتاب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتاب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("ما ملكت أيمانكم"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ملكت"),
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ايمانكم"),
|
||||
Position: 3,
|
||||
Start: 14,
|
||||
End: 28,
|
||||
},
|
||||
},
|
||||
},
|
||||
// stopwords
|
||||
{
|
||||
input: []byte("الذين ملكت أيمانكم"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ملكت"),
|
||||
Position: 2,
|
||||
Start: 11,
|
||||
End: 19,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ايمانكم"),
|
||||
Position: 3,
|
||||
Start: 20,
|
||||
End: 34,
|
||||
},
|
||||
},
|
||||
},
|
||||
// presentation form normalization
|
||||
{
|
||||
input: []byte("ﺍﻟﺴﻼﻢ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 15,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
80
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize.go
generated
vendored
Normal file
80
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize.go
generated
vendored
Normal file
@ -0,0 +1,80 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_ar"
|
||||
|
||||
const (
|
||||
Alef = '\u0627'
|
||||
AlefMadda = '\u0622'
|
||||
AlefHamzaAbove = '\u0623'
|
||||
AlefHamzaBelow = '\u0625'
|
||||
Yeh = '\u064A'
|
||||
DotlessYeh = '\u0649'
|
||||
TehMarbuta = '\u0629'
|
||||
Heh = '\u0647'
|
||||
Tatweel = '\u0640'
|
||||
Fathatan = '\u064B'
|
||||
Dammatan = '\u064C'
|
||||
Kasratan = '\u064D'
|
||||
Fatha = '\u064E'
|
||||
Damma = '\u064F'
|
||||
Kasra = '\u0650'
|
||||
Shadda = '\u0651'
|
||||
Sukun = '\u0652'
|
||||
)
|
||||
|
||||
type ArabicNormalizeFilter struct {
|
||||
}
|
||||
|
||||
func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
|
||||
return &ArabicNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case AlefMadda, AlefHamzaAbove, AlefHamzaBelow:
|
||||
runes[i] = Alef
|
||||
case DotlessYeh:
|
||||
runes[i] = Yeh
|
||||
case TehMarbuta:
|
||||
runes[i] = Heh
|
||||
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewArabicNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
}
|
229
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize_test.go
generated
vendored
Normal file
229
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize_test.go
generated
vendored
Normal file
@ -0,0 +1,229 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestArabicNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// AlifMadda
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("آجن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اجن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifHamzaAbove
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("أحمد"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("احمد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifHamzaBelow
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("إعاذ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اعاذ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifMaksura
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بنى"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بني"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// TehMarbuta
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فاطمة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فاطمه"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Tatweel
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرـــــت"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرت"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Fatha
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مَبنا"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مبنا"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Kasra
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("علِي"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("علي"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Damma
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بُوات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بوات"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Fathatan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولداً"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدا"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Kasratan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدٍ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Dammatan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدٌ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Sukun
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("نلْسون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("نلسون"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Shaddah
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هتميّ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هتمي"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
arabicNormalizeFilter := NewArabicNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := arabicNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
113
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar.go
generated
vendored
Normal file
113
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar.go
generated
vendored
Normal file
@ -0,0 +1,113 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_ar"
|
||||
|
||||
// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
|
||||
var prefixes = [][]rune{
|
||||
[]rune("ال"),
|
||||
[]rune("وال"),
|
||||
[]rune("بال"),
|
||||
[]rune("كال"),
|
||||
[]rune("فال"),
|
||||
[]rune("لل"),
|
||||
[]rune("و"),
|
||||
}
|
||||
var suffixes = [][]rune{
|
||||
[]rune("ها"),
|
||||
[]rune("ان"),
|
||||
[]rune("ات"),
|
||||
[]rune("ون"),
|
||||
[]rune("ين"),
|
||||
[]rune("يه"),
|
||||
[]rune("ية"),
|
||||
[]rune("ه"),
|
||||
[]rune("ة"),
|
||||
[]rune("ي"),
|
||||
}
|
||||
|
||||
type ArabicStemmerFilter struct{}
|
||||
|
||||
func NewArabicStemmerFilter() *ArabicStemmerFilter {
|
||||
return &ArabicStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := stem(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func canStemPrefix(input, prefix []rune) bool {
|
||||
// Wa- prefix requires at least 3 characters.
|
||||
if len(prefix) == 1 && len(input) < 4 {
|
||||
return false
|
||||
}
|
||||
// Other prefixes require only 2.
|
||||
if len(input)-len(prefix) < 2 {
|
||||
return false
|
||||
}
|
||||
for i := range prefix {
|
||||
if prefix[i] != input[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func canStemSuffix(input, suffix []rune) bool {
|
||||
// All suffixes require at least 2 characters after stemming.
|
||||
if len(input)-len(suffix) < 2 {
|
||||
return false
|
||||
}
|
||||
stemEnd := len(input) - len(suffix)
|
||||
for i := range suffix {
|
||||
if suffix[i] != input[stemEnd+i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func stem(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
// Strip a single prefix.
|
||||
for _, p := range prefixes {
|
||||
if canStemPrefix(runes, p) {
|
||||
runes = runes[len(p):]
|
||||
break
|
||||
}
|
||||
}
|
||||
// Strip off multiple suffixes, in their order in the suffixes array.
|
||||
for _, s := range suffixes {
|
||||
if canStemSuffix(runes, s) {
|
||||
runes = runes[:len(runes)-len(s)]
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewArabicStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
}
|
392
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar_test.go
generated
vendored
Normal file
392
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar_test.go
generated
vendored
Normal file
@ -0,0 +1,392 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestArabicStemmerFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// AlPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("والحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// BalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// KalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// FalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// LlPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("للاخر"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اخر"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WaPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AhSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("زوجها"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("زوج"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدان"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AtSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدين"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YhSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهديه"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YpSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدية"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// HSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهده"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// PSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدي"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// ComboPrefSuf
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وساهدون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// ComboSuf
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدهات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// ShouldntStem
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الو"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الو"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// NonArabic
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("English"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("English"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("السلام"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلامة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("السلامة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الوصل"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وصل"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("والصل"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("صل"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
arabicStemmerFilter := NewArabicStemmerFilter()
|
||||
for _, test := range tests {
|
||||
actual := arabicStemmerFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stop_filter_ar.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stop_filter_ar.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
149
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stop_words_ar.go
generated
vendored
Normal file
149
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stop_words_ar.go
generated
vendored
Normal file
@ -0,0 +1,149 @@
|
||||
package ar
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ar"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
|
||||
# This means that when modifying this list, you might need to add some
|
||||
# redundant entries, for example containing forms with both أ and ا
|
||||
من
|
||||
ومن
|
||||
منها
|
||||
منه
|
||||
في
|
||||
وفي
|
||||
فيها
|
||||
فيه
|
||||
و
|
||||
ف
|
||||
ثم
|
||||
او
|
||||
أو
|
||||
ب
|
||||
بها
|
||||
به
|
||||
ا
|
||||
أ
|
||||
اى
|
||||
اي
|
||||
أي
|
||||
أى
|
||||
لا
|
||||
ولا
|
||||
الا
|
||||
ألا
|
||||
إلا
|
||||
لكن
|
||||
ما
|
||||
وما
|
||||
كما
|
||||
فما
|
||||
عن
|
||||
مع
|
||||
اذا
|
||||
إذا
|
||||
ان
|
||||
أن
|
||||
إن
|
||||
انها
|
||||
أنها
|
||||
إنها
|
||||
انه
|
||||
أنه
|
||||
إنه
|
||||
بان
|
||||
بأن
|
||||
فان
|
||||
فأن
|
||||
وان
|
||||
وأن
|
||||
وإن
|
||||
التى
|
||||
التي
|
||||
الذى
|
||||
الذي
|
||||
الذين
|
||||
الى
|
||||
الي
|
||||
إلى
|
||||
إلي
|
||||
على
|
||||
عليها
|
||||
عليه
|
||||
اما
|
||||
أما
|
||||
إما
|
||||
ايضا
|
||||
أيضا
|
||||
كل
|
||||
وكل
|
||||
لم
|
||||
ولم
|
||||
لن
|
||||
ولن
|
||||
هى
|
||||
هي
|
||||
هو
|
||||
وهى
|
||||
وهي
|
||||
وهو
|
||||
فهى
|
||||
فهي
|
||||
فهو
|
||||
انت
|
||||
أنت
|
||||
لك
|
||||
لها
|
||||
له
|
||||
هذه
|
||||
هذا
|
||||
تلك
|
||||
ذلك
|
||||
هناك
|
||||
كانت
|
||||
كان
|
||||
يكون
|
||||
تكون
|
||||
وكانت
|
||||
وكان
|
||||
غير
|
||||
بعض
|
||||
قد
|
||||
نحو
|
||||
بين
|
||||
بينما
|
||||
منذ
|
||||
ضمن
|
||||
حيث
|
||||
الان
|
||||
الآن
|
||||
خلال
|
||||
بعد
|
||||
قبل
|
||||
حتى
|
||||
عند
|
||||
عندما
|
||||
لدى
|
||||
جميع
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(ArabicStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/bg/stop_filter_bg.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/bg/stop_filter_bg.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package bg
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
217
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/bg/stop_words_bg.go
generated
vendored
Normal file
217
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/bg/stop_words_bg.go
generated
vendored
Normal file
@ -0,0 +1,217 @@
|
||||
package bg
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_bg"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
а
|
||||
аз
|
||||
ако
|
||||
ала
|
||||
бе
|
||||
без
|
||||
беше
|
||||
би
|
||||
бил
|
||||
била
|
||||
били
|
||||
било
|
||||
близо
|
||||
бъдат
|
||||
бъде
|
||||
бяха
|
||||
в
|
||||
вас
|
||||
ваш
|
||||
ваша
|
||||
вероятно
|
||||
вече
|
||||
взема
|
||||
ви
|
||||
вие
|
||||
винаги
|
||||
все
|
||||
всеки
|
||||
всички
|
||||
всичко
|
||||
всяка
|
||||
във
|
||||
въпреки
|
||||
върху
|
||||
г
|
||||
ги
|
||||
главно
|
||||
го
|
||||
д
|
||||
да
|
||||
дали
|
||||
до
|
||||
докато
|
||||
докога
|
||||
дори
|
||||
досега
|
||||
доста
|
||||
е
|
||||
едва
|
||||
един
|
||||
ето
|
||||
за
|
||||
зад
|
||||
заедно
|
||||
заради
|
||||
засега
|
||||
затова
|
||||
защо
|
||||
защото
|
||||
и
|
||||
из
|
||||
или
|
||||
им
|
||||
има
|
||||
имат
|
||||
иска
|
||||
й
|
||||
каза
|
||||
как
|
||||
каква
|
||||
какво
|
||||
както
|
||||
какъв
|
||||
като
|
||||
кога
|
||||
когато
|
||||
което
|
||||
които
|
||||
кой
|
||||
който
|
||||
колко
|
||||
която
|
||||
къде
|
||||
където
|
||||
към
|
||||
ли
|
||||
м
|
||||
ме
|
||||
между
|
||||
мен
|
||||
ми
|
||||
мнозина
|
||||
мога
|
||||
могат
|
||||
може
|
||||
моля
|
||||
момента
|
||||
му
|
||||
н
|
||||
на
|
||||
над
|
||||
назад
|
||||
най
|
||||
направи
|
||||
напред
|
||||
например
|
||||
нас
|
||||
не
|
||||
него
|
||||
нея
|
||||
ни
|
||||
ние
|
||||
никой
|
||||
нито
|
||||
но
|
||||
някои
|
||||
някой
|
||||
няма
|
||||
обаче
|
||||
около
|
||||
освен
|
||||
особено
|
||||
от
|
||||
отгоре
|
||||
отново
|
||||
още
|
||||
пак
|
||||
по
|
||||
повече
|
||||
повечето
|
||||
под
|
||||
поне
|
||||
поради
|
||||
после
|
||||
почти
|
||||
прави
|
||||
пред
|
||||
преди
|
||||
през
|
||||
при
|
||||
пък
|
||||
първо
|
||||
с
|
||||
са
|
||||
само
|
||||
се
|
||||
сега
|
||||
си
|
||||
скоро
|
||||
след
|
||||
сме
|
||||
според
|
||||
сред
|
||||
срещу
|
||||
сте
|
||||
съм
|
||||
със
|
||||
също
|
||||
т
|
||||
тази
|
||||
така
|
||||
такива
|
||||
такъв
|
||||
там
|
||||
твой
|
||||
те
|
||||
тези
|
||||
ти
|
||||
тн
|
||||
то
|
||||
това
|
||||
тогава
|
||||
този
|
||||
той
|
||||
толкова
|
||||
точно
|
||||
трябва
|
||||
тук
|
||||
тъй
|
||||
тя
|
||||
тях
|
||||
у
|
||||
харесва
|
||||
ч
|
||||
че
|
||||
често
|
||||
чрез
|
||||
ще
|
||||
щом
|
||||
я
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(BulgarianStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
30
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/articles_ca.go
generated
vendored
Normal file
30
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/articles_ca.go
generated
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
package ca
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const ArticlesName = "articles_ca"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||
|
||||
var CatalanArticles = []byte(`
|
||||
d
|
||||
l
|
||||
m
|
||||
n
|
||||
s
|
||||
t
|
||||
`)
|
||||
|
||||
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(CatalanArticles)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
|
||||
}
|
32
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/elision_ca.go
generated
vendored
Normal file
32
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/elision_ca.go
generated
vendored
Normal file
@ -0,0 +1,32 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ca
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const ElisionName = "elision_ca"
|
||||
|
||||
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building elision filter: %v", err)
|
||||
}
|
||||
return elision_filter.NewElisionFilter(articlesTokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
|
||||
}
|
56
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/elision_ca_test.go
generated
vendored
Normal file
56
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/elision_ca_test.go
generated
vendored
Normal file
@ -0,0 +1,56 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ca
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestFrenchElision(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("l'Institut"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("d'Estudis"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Institut"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("Estudis"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := elisionFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/stop_filter_ca.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/stop_filter_ca.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ca
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
244
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/stop_words_ca.go
generated
vendored
Normal file
244
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/stop_words_ca.go
generated
vendored
Normal file
@ -0,0 +1,244 @@
|
||||
package ca
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ca"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var CatalanStopWords = []byte(`# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
|
||||
a
|
||||
abans
|
||||
ací
|
||||
ah
|
||||
així
|
||||
això
|
||||
al
|
||||
als
|
||||
aleshores
|
||||
algun
|
||||
alguna
|
||||
algunes
|
||||
alguns
|
||||
alhora
|
||||
allà
|
||||
allí
|
||||
allò
|
||||
altra
|
||||
altre
|
||||
altres
|
||||
amb
|
||||
ambdós
|
||||
ambdues
|
||||
apa
|
||||
aquell
|
||||
aquella
|
||||
aquelles
|
||||
aquells
|
||||
aquest
|
||||
aquesta
|
||||
aquestes
|
||||
aquests
|
||||
aquí
|
||||
baix
|
||||
cada
|
||||
cadascú
|
||||
cadascuna
|
||||
cadascunes
|
||||
cadascuns
|
||||
com
|
||||
contra
|
||||
d'un
|
||||
d'una
|
||||
d'unes
|
||||
d'uns
|
||||
dalt
|
||||
de
|
||||
del
|
||||
dels
|
||||
des
|
||||
després
|
||||
dins
|
||||
dintre
|
||||
donat
|
||||
doncs
|
||||
durant
|
||||
e
|
||||
eh
|
||||
el
|
||||
els
|
||||
em
|
||||
en
|
||||
encara
|
||||
ens
|
||||
entre
|
||||
érem
|
||||
eren
|
||||
éreu
|
||||
es
|
||||
és
|
||||
esta
|
||||
està
|
||||
estàvem
|
||||
estaven
|
||||
estàveu
|
||||
esteu
|
||||
et
|
||||
etc
|
||||
ets
|
||||
fins
|
||||
fora
|
||||
gairebé
|
||||
ha
|
||||
han
|
||||
has
|
||||
havia
|
||||
he
|
||||
hem
|
||||
heu
|
||||
hi
|
||||
ho
|
||||
i
|
||||
igual
|
||||
iguals
|
||||
ja
|
||||
l'hi
|
||||
la
|
||||
les
|
||||
li
|
||||
li'n
|
||||
llavors
|
||||
m'he
|
||||
ma
|
||||
mal
|
||||
malgrat
|
||||
mateix
|
||||
mateixa
|
||||
mateixes
|
||||
mateixos
|
||||
me
|
||||
mentre
|
||||
més
|
||||
meu
|
||||
meus
|
||||
meva
|
||||
meves
|
||||
molt
|
||||
molta
|
||||
moltes
|
||||
molts
|
||||
mon
|
||||
mons
|
||||
n'he
|
||||
n'hi
|
||||
ne
|
||||
ni
|
||||
no
|
||||
nogensmenys
|
||||
només
|
||||
nosaltres
|
||||
nostra
|
||||
nostre
|
||||
nostres
|
||||
o
|
||||
oh
|
||||
oi
|
||||
on
|
||||
pas
|
||||
pel
|
||||
pels
|
||||
per
|
||||
però
|
||||
perquè
|
||||
poc
|
||||
poca
|
||||
pocs
|
||||
poques
|
||||
potser
|
||||
propi
|
||||
qual
|
||||
quals
|
||||
quan
|
||||
quant
|
||||
que
|
||||
què
|
||||
quelcom
|
||||
qui
|
||||
quin
|
||||
quina
|
||||
quines
|
||||
quins
|
||||
s'ha
|
||||
s'han
|
||||
sa
|
||||
semblant
|
||||
semblants
|
||||
ses
|
||||
seu
|
||||
seus
|
||||
seva
|
||||
seva
|
||||
seves
|
||||
si
|
||||
sobre
|
||||
sobretot
|
||||
sóc
|
||||
solament
|
||||
sols
|
||||
son
|
||||
són
|
||||
sons
|
||||
sota
|
||||
sou
|
||||
t'ha
|
||||
t'han
|
||||
t'he
|
||||
ta
|
||||
tal
|
||||
també
|
||||
tampoc
|
||||
tan
|
||||
tant
|
||||
tanta
|
||||
tantes
|
||||
teu
|
||||
teus
|
||||
teva
|
||||
teves
|
||||
ton
|
||||
tons
|
||||
tot
|
||||
tota
|
||||
totes
|
||||
tots
|
||||
un
|
||||
una
|
||||
unes
|
||||
uns
|
||||
us
|
||||
va
|
||||
vaig
|
||||
vam
|
||||
van
|
||||
vas
|
||||
veu
|
||||
vosaltres
|
||||
vostra
|
||||
vostre
|
||||
vostres
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(CatalanStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
49
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/analyzer_cjk.go
generated
vendored
Normal file
49
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/analyzer_cjk.go
generated
vendored
Normal file
@ -0,0 +1,49 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "cjk"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
whitespaceTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
bigramFilter, err := cache.TokenFilterNamed(BigramName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: whitespaceTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
normalizeFilter,
|
||||
toLowerFilter,
|
||||
bigramFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
620
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/analyzer_cjk_test.go
generated
vendored
Normal file
620
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/analyzer_cjk_test.go
generated
vendored
Normal file
@ -0,0 +1,620 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestCJKAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte("こんにちは世界"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一二三四五六七八九十"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一二"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("二三"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("三四"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("四五"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("五六"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("六七"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("七八"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("八九"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("九十"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 二三四 五六七八九 十"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("二三"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("三四"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 7,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("五六"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 14,
|
||||
End: 20,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("六七"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 17,
|
||||
End: 23,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("七八"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 20,
|
||||
End: 26,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("八九"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 23,
|
||||
End: 29,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("十"),
|
||||
Type: analysis.Single,
|
||||
Position: 10,
|
||||
Start: 30,
|
||||
End: 33,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("abc defgh ijklmn opqrstu vwxy z"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("defgh"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ijklmn"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 10,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("opqrstu"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 4,
|
||||
Start: 17,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vwxy"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 5,
|
||||
Start: 25,
|
||||
End: 29,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("z"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 30,
|
||||
End: 31,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あい"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あい "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("test"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("test "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいtest"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("testあい "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいうえおabcかきくけこ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("いう"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("うえ"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("えお"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("かき"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("きく"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("くけ"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("けこ"),
|
||||
Type: analysis.Double,
|
||||
Position: 10,
|
||||
Start: 27,
|
||||
End: 33,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいうえおabんcかきくけ こ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("いう"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("うえ"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("えお"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ab"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 17,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 7,
|
||||
Start: 17,
|
||||
End: 20,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 8,
|
||||
Start: 20,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("かき"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("きく"),
|
||||
Type: analysis.Double,
|
||||
Position: 10,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("くけ"),
|
||||
Type: analysis.Double,
|
||||
Position: 11,
|
||||
Start: 27,
|
||||
End: 33,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 13,
|
||||
Start: 34,
|
||||
End: 37,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 روبرت موير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرت"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 14,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("موير"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 15,
|
||||
End: 23,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 رُوبرت موير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("رُوبرت"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("موير"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 17,
|
||||
End: 25,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("𩬅艱鍟䇹愯瀛"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("𩬅艱"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("艱鍟"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("鍟䇹"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 7,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("䇹愯"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 10,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("愯瀛"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 13,
|
||||
End: 19,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一丁丂"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一丁"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("丁丂"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
for _, test := range tests {
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
166
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/cjk_bigram.go
generated
vendored
Normal file
166
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/cjk_bigram.go
generated
vendored
Normal file
@ -0,0 +1,166 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"container/ring"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const BigramName = "cjk_bigram"
|
||||
|
||||
type CJKBigramFilter struct {
|
||||
outputUnigram bool
|
||||
}
|
||||
|
||||
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
|
||||
return &CJKBigramFilter{
|
||||
outputUnigram: outputUnigram,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
r := ring.New(2)
|
||||
itemsInRing := 0
|
||||
|
||||
rv := make(analysis.TokenStream, 0, len(input))
|
||||
|
||||
for _, token := range input {
|
||||
if token.Type == analysis.Ideographic {
|
||||
if itemsInRing > 0 {
|
||||
// if items already buffered
|
||||
// check to see if this is aligned
|
||||
curr := r.Value.(*analysis.Token)
|
||||
if token.Start-curr.End != 0 {
|
||||
// not aligned flush
|
||||
flushToken := s.flush(r, &itemsInRing)
|
||||
if flushToken != nil {
|
||||
rv = append(rv, flushToken)
|
||||
}
|
||||
}
|
||||
}
|
||||
// now we can add this token to the buffer
|
||||
r = r.Next()
|
||||
r.Value = token
|
||||
if itemsInRing < 2 {
|
||||
itemsInRing++
|
||||
}
|
||||
if itemsInRing > 1 && s.outputUnigram {
|
||||
unigram := s.buildUnigram(r, &itemsInRing)
|
||||
if unigram != nil {
|
||||
rv = append(rv, unigram)
|
||||
}
|
||||
}
|
||||
bigramToken := s.outputBigram(r, &itemsInRing)
|
||||
if bigramToken != nil {
|
||||
rv = append(rv, bigramToken)
|
||||
}
|
||||
} else {
|
||||
// flush anything already buffered
|
||||
flushToken := s.flush(r, &itemsInRing)
|
||||
if flushToken != nil {
|
||||
rv = append(rv, flushToken)
|
||||
}
|
||||
// output this token as is
|
||||
rv = append(rv, token)
|
||||
}
|
||||
}
|
||||
|
||||
// deal with possible trailing unigram
|
||||
if itemsInRing == 1 || s.outputUnigram {
|
||||
if itemsInRing == 2 {
|
||||
r = r.Next()
|
||||
}
|
||||
unigram := s.buildUnigram(r, &itemsInRing)
|
||||
if unigram != nil {
|
||||
rv = append(rv, unigram)
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||
var rv *analysis.Token
|
||||
if *itemsInRing == 1 {
|
||||
rv = s.buildUnigram(r, itemsInRing)
|
||||
}
|
||||
r.Value = nil
|
||||
*itemsInRing = 0
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||
if *itemsInRing == 2 {
|
||||
thisShingleRing := r.Move(-1)
|
||||
shingledBytes := make([]byte, 0)
|
||||
|
||||
// do first token
|
||||
prev := thisShingleRing.Value.(*analysis.Token)
|
||||
shingledBytes = append(shingledBytes, prev.Term...)
|
||||
|
||||
// do second token
|
||||
thisShingleRing = thisShingleRing.Next()
|
||||
curr := thisShingleRing.Value.(*analysis.Token)
|
||||
shingledBytes = append(shingledBytes, curr.Term...)
|
||||
|
||||
token := analysis.Token{
|
||||
Type: analysis.Double,
|
||||
Term: shingledBytes,
|
||||
Position: prev.Position,
|
||||
Start: prev.Start,
|
||||
End: curr.End,
|
||||
}
|
||||
return &token
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||
if *itemsInRing == 2 {
|
||||
thisShingleRing := r.Move(-1)
|
||||
// do first token
|
||||
prev := thisShingleRing.Value.(*analysis.Token)
|
||||
token := analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: prev.Term,
|
||||
Position: prev.Position,
|
||||
Start: prev.Start,
|
||||
End: prev.End,
|
||||
}
|
||||
return &token
|
||||
} else if *itemsInRing == 1 {
|
||||
// do first token
|
||||
prev := r.Value.(*analysis.Token)
|
||||
token := analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: prev.Term,
|
||||
Position: prev.Position,
|
||||
Start: prev.Start,
|
||||
End: prev.End,
|
||||
}
|
||||
return &token
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
outputUnigram := false
|
||||
outVal, ok := config["output_unigram"].(bool)
|
||||
if ok {
|
||||
outputUnigram = outVal
|
||||
}
|
||||
return NewCJKBigramFilter(outputUnigram), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
|
||||
}
|
420
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/cjk_bigram_test.go
generated
vendored
Normal file
420
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/cjk_bigram_test.go
generated
vendored
Normal file
@ -0,0 +1,420 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestCJKBigramFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
outputUnigram bool
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: true,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Single,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Single,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Single,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Single,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Single,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 24,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
cjkBigramFilter := NewCJKBigramFilter(test.outputUnigram)
|
||||
actual := cjkBigramFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
58
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/analyzer_ckb.go
generated
vendored
Normal file
58
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/analyzer_ckb.go
generated
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build icu full
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "ckb"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopCkbFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: icuTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
normCkbFilter,
|
||||
toLowerFilter,
|
||||
stopCkbFilter,
|
||||
stemmerCkbFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
74
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/analyzer_ckb_test.go
generated
vendored
Normal file
74
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/analyzer_ckb_test.go
generated
vendored
Normal file
@ -0,0 +1,74 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build icu full
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestSoraniAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stop word removal
|
||||
{
|
||||
input: []byte("ئەم پیاوە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 2,
|
||||
Start: 7,
|
||||
End: 17,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("پیاوە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("پیاو"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
113
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_normalize.go
generated
vendored
Normal file
113
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_normalize.go
generated
vendored
Normal file
@ -0,0 +1,113 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_ckb"
|
||||
|
||||
const (
|
||||
Yeh = '\u064A'
|
||||
DotlessYeh = '\u0649'
|
||||
FarsiYeh = '\u06CC'
|
||||
|
||||
Kaf = '\u0643'
|
||||
Keheh = '\u06A9'
|
||||
|
||||
Heh = '\u0647'
|
||||
Ae = '\u06D5'
|
||||
Zwnj = '\u200C'
|
||||
HehDoachashmee = '\u06BE'
|
||||
TehMarbuta = '\u0629'
|
||||
|
||||
Reh = '\u0631'
|
||||
Rreh = '\u0695'
|
||||
RrehAbove = '\u0692'
|
||||
|
||||
Tatweel = '\u0640'
|
||||
Fathatan = '\u064B'
|
||||
Dammatan = '\u064C'
|
||||
Kasratan = '\u064D'
|
||||
Fatha = '\u064E'
|
||||
Damma = '\u064F'
|
||||
Kasra = '\u0650'
|
||||
Shadda = '\u0651'
|
||||
Sukun = '\u0652'
|
||||
)
|
||||
|
||||
type SoraniNormalizeFilter struct {
|
||||
}
|
||||
|
||||
func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
|
||||
return &SoraniNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case Yeh, DotlessYeh:
|
||||
runes[i] = FarsiYeh
|
||||
case Kaf:
|
||||
runes[i] = Keheh
|
||||
case Zwnj:
|
||||
if i > 0 && runes[i-1] == Heh {
|
||||
runes[i-1] = Ae
|
||||
}
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
case Heh:
|
||||
if i == len(runes)-1 {
|
||||
runes[i] = Ae
|
||||
}
|
||||
case TehMarbuta:
|
||||
runes[i] = Ae
|
||||
case HehDoachashmee:
|
||||
runes[i] = Heh
|
||||
case Reh:
|
||||
if i == 0 {
|
||||
runes[i] = Rreh
|
||||
}
|
||||
case RrehAbove:
|
||||
runes[i] = Rreh
|
||||
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
default:
|
||||
if unicode.In(runes[i], unicode.Cf) {
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewSoraniNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
}
|
318
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_normalize_test.go
generated
vendored
Normal file
318
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_normalize_test.go
generated
vendored
Normal file
@ -0,0 +1,318 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestSoraniNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// test Y
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064A"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06CC"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0649"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06CC"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06CC"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06CC"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test K
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0643"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06A9"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06A9"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06A9"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test H
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647\u200C"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06D5"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647\u200C\u06A9"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06D5\u06A9"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06BE"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0629"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06D5"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test final H
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647\u0647\u0647"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647\u0647\u06D5"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test RR
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0692"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0695"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test initial RR
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0631\u0631\u0631"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0695\u0631\u0631"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test remove
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0640"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064B"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064C"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064D"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064E"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064F"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0650"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0651"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0652"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u200C"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
soraniNormalizeFilter := NewSoraniNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := soraniNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
143
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_stemmer_filter.go
generated
vendored
Normal file
143
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_stemmer_filter.go
generated
vendored
Normal file
@ -0,0 +1,143 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_ckb"
|
||||
|
||||
type SoraniStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewSoraniStemmerFilter() *SoraniStemmerFilter {
|
||||
return &SoraniStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
// if not protected keyword, stem it
|
||||
if !token.KeyWord {
|
||||
stemmed := stem(token.Term)
|
||||
token.Term = stemmed
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func stem(input []byte) []byte {
|
||||
inputLen := utf8.RuneCount(input)
|
||||
|
||||
// postposition
|
||||
if inputLen > 5 && bytes.HasSuffix(input, []byte("دا")) {
|
||||
input = truncateRunes(input, 2)
|
||||
inputLen = utf8.RuneCount(input)
|
||||
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("نا")) {
|
||||
input = truncateRunes(input, 1)
|
||||
inputLen = utf8.RuneCount(input)
|
||||
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەوە")) {
|
||||
input = truncateRunes(input, 3)
|
||||
inputLen = utf8.RuneCount(input)
|
||||
}
|
||||
|
||||
// possessive pronoun
|
||||
if inputLen > 6 &&
|
||||
(bytes.HasSuffix(input, []byte("مان")) ||
|
||||
bytes.HasSuffix(input, []byte("یان")) ||
|
||||
bytes.HasSuffix(input, []byte("تان"))) {
|
||||
input = truncateRunes(input, 3)
|
||||
inputLen = utf8.RuneCount(input)
|
||||
}
|
||||
|
||||
// indefinite singular ezafe
|
||||
if inputLen > 6 && bytes.HasSuffix(input, []byte("ێکی")) {
|
||||
return truncateRunes(input, 3)
|
||||
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یەکی")) {
|
||||
return truncateRunes(input, 4)
|
||||
}
|
||||
|
||||
if inputLen > 5 && bytes.HasSuffix(input, []byte("ێک")) {
|
||||
// indefinite singular
|
||||
return truncateRunes(input, 2)
|
||||
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یەک")) {
|
||||
// indefinite singular
|
||||
return truncateRunes(input, 3)
|
||||
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەکە")) {
|
||||
// definite singular
|
||||
return truncateRunes(input, 3)
|
||||
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("کە")) {
|
||||
// definite singular
|
||||
return truncateRunes(input, 2)
|
||||
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("ەکان")) {
|
||||
// definite plural
|
||||
return truncateRunes(input, 4)
|
||||
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("کان")) {
|
||||
// definite plural
|
||||
return truncateRunes(input, 3)
|
||||
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانی")) {
|
||||
// indefinite plural ezafe
|
||||
return truncateRunes(input, 4)
|
||||
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انی")) {
|
||||
// indefinite plural ezafe
|
||||
return truncateRunes(input, 3)
|
||||
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یان")) {
|
||||
// indefinite plural
|
||||
return truncateRunes(input, 3)
|
||||
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("ان")) {
|
||||
// indefinite plural
|
||||
return truncateRunes(input, 2)
|
||||
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانە")) {
|
||||
// demonstrative plural
|
||||
return truncateRunes(input, 4)
|
||||
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انە")) {
|
||||
// demonstrative plural
|
||||
return truncateRunes(input, 3)
|
||||
} else if inputLen > 5 && (bytes.HasSuffix(input, []byte("ایە")) || bytes.HasSuffix(input, []byte("ەیە"))) {
|
||||
// demonstrative singular
|
||||
return truncateRunes(input, 2)
|
||||
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ە")) {
|
||||
// demonstrative singular
|
||||
return truncateRunes(input, 1)
|
||||
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ی")) {
|
||||
// absolute singular ezafe
|
||||
return truncateRunes(input, 1)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func truncateRunes(input []byte, num int) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
runes = runes[:len(runes)-num]
|
||||
out := buildTermFromRunes(runes)
|
||||
return out
|
||||
}
|
||||
|
||||
func buildTermFromRunes(runes []rune) []byte {
|
||||
rv := make([]byte, 0, len(runes)*4)
|
||||
for _, r := range runes {
|
||||
runeBytes := make([]byte, utf8.RuneLen(r))
|
||||
utf8.EncodeRune(runeBytes, r)
|
||||
rv = append(rv, runeBytes...)
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewSoraniStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
}
|
294
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_stemmer_filter_test.go
generated
vendored
Normal file
294
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_stemmer_filter_test.go
generated
vendored
Normal file
@ -0,0 +1,294 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
|
||||
)
|
||||
|
||||
func TestSoraniStemmerFilter(t *testing.T) {
|
||||
|
||||
// in order to match the lucene tests
|
||||
// we will test with an analyzer, not just the stemmer
|
||||
analyzer := analysis.Analyzer{
|
||||
Tokenizer: single_token.NewSingleTokenTokenizer(),
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
NewSoraniNormalizeFilter(),
|
||||
NewSoraniStemmerFilter(),
|
||||
},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{ // -ek
|
||||
input: []byte("پیاوێک"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -yek
|
||||
input: []byte("دەرگایەک"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -aka
|
||||
input: []byte("پیاوەكە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -ka
|
||||
input: []byte("دەرگاكە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -a
|
||||
input: []byte("کتاویە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("کتاوی"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -ya
|
||||
input: []byte("دەرگایە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -An
|
||||
input: []byte("پیاوان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -yAn
|
||||
input: []byte("دەرگایان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -akAn
|
||||
input: []byte("پیاوەکان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -kAn
|
||||
input: []byte("دەرگاکان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -Ana
|
||||
input: []byte("پیاوانە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -yAna
|
||||
input: []byte("دەرگایانە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 18,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // Ezafe singular
|
||||
input: []byte("هۆتیلی"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هۆتیل"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // Ezafe indefinite
|
||||
input: []byte("هۆتیلێکی"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هۆتیل"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // Ezafe plural
|
||||
input: []byte("هۆتیلانی"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هۆتیل"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -awa
|
||||
input: []byte("دوورەوە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دوور"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -dA
|
||||
input: []byte("نیوەشەودا"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("نیوەشەو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 18,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -A
|
||||
input: []byte("سۆرانا"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سۆران"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -mAn
|
||||
input: []byte("پارەمان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پارە"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -tAn
|
||||
input: []byte("پارەتان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پارە"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -yAn
|
||||
input: []byte("پارەیان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پارە"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // empty
|
||||
input: []byte(""),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("for input %s(% x)", test.input, test.input)
|
||||
t.Errorf("\texpected:")
|
||||
for _, token := range test.output {
|
||||
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
|
||||
}
|
||||
t.Errorf("\tactual:")
|
||||
for _, token := range actual {
|
||||
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/stop_filter_ckb.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/stop_filter_ckb.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
160
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/stop_words_ckb.go
generated
vendored
Normal file
160
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/stop_words_ckb.go
generated
vendored
Normal file
@ -0,0 +1,160 @@
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ckb"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var SoraniStopWords = []byte(`# set of kurdish stopwords
|
||||
# note these have been normalized with our scheme (e represented with U+06D5, etc)
|
||||
# constructed from:
|
||||
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
|
||||
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
|
||||
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
|
||||
|
||||
# and
|
||||
و
|
||||
# which
|
||||
کە
|
||||
# of
|
||||
ی
|
||||
# made/did
|
||||
کرد
|
||||
# that/which
|
||||
ئەوەی
|
||||
# on/head
|
||||
سەر
|
||||
# two
|
||||
دوو
|
||||
# also
|
||||
هەروەها
|
||||
# from/that
|
||||
لەو
|
||||
# makes/does
|
||||
دەکات
|
||||
# some
|
||||
چەند
|
||||
# every
|
||||
هەر
|
||||
|
||||
# demonstratives
|
||||
# that
|
||||
ئەو
|
||||
# this
|
||||
ئەم
|
||||
|
||||
# personal pronouns
|
||||
# I
|
||||
من
|
||||
# we
|
||||
ئێمە
|
||||
# you
|
||||
تۆ
|
||||
# you
|
||||
ئێوە
|
||||
# he/she/it
|
||||
ئەو
|
||||
# they
|
||||
ئەوان
|
||||
|
||||
# prepositions
|
||||
# to/with/by
|
||||
بە
|
||||
پێ
|
||||
# without
|
||||
بەبێ
|
||||
# along with/while/during
|
||||
بەدەم
|
||||
# in the opinion of
|
||||
بەلای
|
||||
# according to
|
||||
بەپێی
|
||||
# before
|
||||
بەرلە
|
||||
# in the direction of
|
||||
بەرەوی
|
||||
# in front of/toward
|
||||
بەرەوە
|
||||
# before/in the face of
|
||||
بەردەم
|
||||
# without
|
||||
بێ
|
||||
# except for
|
||||
بێجگە
|
||||
# for
|
||||
بۆ
|
||||
# on/in
|
||||
دە
|
||||
تێ
|
||||
# with
|
||||
دەگەڵ
|
||||
# after
|
||||
دوای
|
||||
# except for/aside from
|
||||
جگە
|
||||
# in/from
|
||||
لە
|
||||
لێ
|
||||
# in front of/before/because of
|
||||
لەبەر
|
||||
# between/among
|
||||
لەبەینی
|
||||
# concerning/about
|
||||
لەبابەت
|
||||
# concerning
|
||||
لەبارەی
|
||||
# instead of
|
||||
لەباتی
|
||||
# beside
|
||||
لەبن
|
||||
# instead of
|
||||
لەبرێتی
|
||||
# behind
|
||||
لەدەم
|
||||
# with/together with
|
||||
لەگەڵ
|
||||
# by
|
||||
لەلایەن
|
||||
# within
|
||||
لەناو
|
||||
# between/among
|
||||
لەنێو
|
||||
# for the sake of
|
||||
لەپێناوی
|
||||
# with respect to
|
||||
لەرەوی
|
||||
# by means of/for
|
||||
لەرێ
|
||||
# for the sake of
|
||||
لەرێگا
|
||||
# on/on top of/according to
|
||||
لەسەر
|
||||
# under
|
||||
لەژێر
|
||||
# between/among
|
||||
ناو
|
||||
# between/among
|
||||
نێوان
|
||||
# after
|
||||
پاش
|
||||
# before
|
||||
پێش
|
||||
# like
|
||||
وەک
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(SoraniStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cs/stop_filter_cs.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cs/stop_filter_cs.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cs
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
196
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cs/stop_words_cs.go
generated
vendored
Normal file
196
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cs/stop_words_cs.go
generated
vendored
Normal file
@ -0,0 +1,196 @@
|
||||
package cs
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_cs"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var CzechStopWords = []byte(`a
|
||||
s
|
||||
k
|
||||
o
|
||||
i
|
||||
u
|
||||
v
|
||||
z
|
||||
dnes
|
||||
cz
|
||||
tímto
|
||||
budeš
|
||||
budem
|
||||
byli
|
||||
jseš
|
||||
můj
|
||||
svým
|
||||
ta
|
||||
tomto
|
||||
tohle
|
||||
tuto
|
||||
tyto
|
||||
jej
|
||||
zda
|
||||
proč
|
||||
máte
|
||||
tato
|
||||
kam
|
||||
tohoto
|
||||
kdo
|
||||
kteří
|
||||
mi
|
||||
nám
|
||||
tom
|
||||
tomuto
|
||||
mít
|
||||
nic
|
||||
proto
|
||||
kterou
|
||||
byla
|
||||
toho
|
||||
protože
|
||||
asi
|
||||
ho
|
||||
naši
|
||||
napište
|
||||
re
|
||||
což
|
||||
tím
|
||||
takže
|
||||
svých
|
||||
její
|
||||
svými
|
||||
jste
|
||||
aj
|
||||
tu
|
||||
tedy
|
||||
teto
|
||||
bylo
|
||||
kde
|
||||
ke
|
||||
pravé
|
||||
ji
|
||||
nad
|
||||
nejsou
|
||||
či
|
||||
pod
|
||||
téma
|
||||
mezi
|
||||
přes
|
||||
ty
|
||||
pak
|
||||
vám
|
||||
ani
|
||||
když
|
||||
však
|
||||
neg
|
||||
jsem
|
||||
tento
|
||||
článku
|
||||
články
|
||||
aby
|
||||
jsme
|
||||
před
|
||||
pta
|
||||
jejich
|
||||
byl
|
||||
ještě
|
||||
až
|
||||
bez
|
||||
také
|
||||
pouze
|
||||
první
|
||||
vaše
|
||||
která
|
||||
nás
|
||||
nový
|
||||
tipy
|
||||
pokud
|
||||
může
|
||||
strana
|
||||
jeho
|
||||
své
|
||||
jiné
|
||||
zprávy
|
||||
nové
|
||||
není
|
||||
vás
|
||||
jen
|
||||
podle
|
||||
zde
|
||||
už
|
||||
být
|
||||
více
|
||||
bude
|
||||
již
|
||||
než
|
||||
který
|
||||
by
|
||||
které
|
||||
co
|
||||
nebo
|
||||
ten
|
||||
tak
|
||||
má
|
||||
při
|
||||
od
|
||||
po
|
||||
jsou
|
||||
jak
|
||||
další
|
||||
ale
|
||||
si
|
||||
se
|
||||
ve
|
||||
to
|
||||
jako
|
||||
za
|
||||
zpět
|
||||
ze
|
||||
do
|
||||
pro
|
||||
je
|
||||
na
|
||||
atd
|
||||
atp
|
||||
jakmile
|
||||
přičemž
|
||||
já
|
||||
on
|
||||
ona
|
||||
ono
|
||||
oni
|
||||
ony
|
||||
my
|
||||
vy
|
||||
jí
|
||||
ji
|
||||
mě
|
||||
mne
|
||||
jemu
|
||||
tomu
|
||||
těm
|
||||
těmu
|
||||
němu
|
||||
němuž
|
||||
jehož
|
||||
jíž
|
||||
jelikož
|
||||
jež
|
||||
jakož
|
||||
načež
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(CzechStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/analyzer_da.go
generated
vendored
Normal file
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/analyzer_da.go
generated
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
// +build icu full
|
||||
|
||||
package da
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "da"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopDaFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerDaFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: icuTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopDaFilter,
|
||||
stemmerDaFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
69
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/analyzer_da_test.go
generated
vendored
Normal file
69
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/analyzer_da_test.go
generated
vendored
Normal file
@ -0,0 +1,69 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
// +build icu full
|
||||
|
||||
package da
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestDanishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("undersøg"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("undersøg"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 9,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("undersøgelse"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("undersøg"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 13,
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("på"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stemmer_da.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stemmer_da.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
|
||||
package da
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_da"
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return stemmer_filter.NewStemmerFilter("da")
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stop_filter_da.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stop_filter_da.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package da
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
134
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stop_words_da.go
generated
vendored
Normal file
134
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stop_words_da.go
generated
vendored
Normal file
@ -0,0 +1,134 @@
|
||||
package da
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_da"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
|
||||
og | and
|
||||
i | in
|
||||
jeg | I
|
||||
det | that (dem. pronoun)/it (pers. pronoun)
|
||||
at | that (in front of a sentence)/to (with infinitive)
|
||||
en | a/an
|
||||
den | it (pers. pronoun)/that (dem. pronoun)
|
||||
til | to/at/for/until/against/by/of/into, more
|
||||
er | present tense of "to be"
|
||||
som | who, as
|
||||
på | on/upon/in/on/at/to/after/of/with/for, on
|
||||
de | they
|
||||
med | with/by/in, along
|
||||
han | he
|
||||
af | of/by/from/off/for/in/with/on, off
|
||||
for | at/for/to/from/by/of/ago, in front/before, because
|
||||
ikke | not
|
||||
der | who/which, there/those
|
||||
var | past tense of "to be"
|
||||
mig | me/myself
|
||||
sig | oneself/himself/herself/itself/themselves
|
||||
men | but
|
||||
et | a/an/one, one (number), someone/somebody/one
|
||||
har | present tense of "to have"
|
||||
om | round/about/for/in/a, about/around/down, if
|
||||
vi | we
|
||||
min | my
|
||||
havde | past tense of "to have"
|
||||
ham | him
|
||||
hun | she
|
||||
nu | now
|
||||
over | over/above/across/by/beyond/past/on/about, over/past
|
||||
da | then, when/as/since
|
||||
fra | from/off/since, off, since
|
||||
du | you
|
||||
ud | out
|
||||
sin | his/her/its/one's
|
||||
dem | them
|
||||
os | us/ourselves
|
||||
op | up
|
||||
man | you/one
|
||||
hans | his
|
||||
hvor | where
|
||||
eller | or
|
||||
hvad | what
|
||||
skal | must/shall etc.
|
||||
selv | myself/youself/herself/ourselves etc., even
|
||||
her | here
|
||||
alle | all/everyone/everybody etc.
|
||||
vil | will (verb)
|
||||
blev | past tense of "to stay/to remain/to get/to become"
|
||||
kunne | could
|
||||
ind | in
|
||||
når | when
|
||||
være | present tense of "to be"
|
||||
dog | however/yet/after all
|
||||
noget | something
|
||||
ville | would
|
||||
jo | you know/you see (adv), yes
|
||||
deres | their/theirs
|
||||
efter | after/behind/according to/for/by/from, later/afterwards
|
||||
ned | down
|
||||
skulle | should
|
||||
denne | this
|
||||
end | than
|
||||
dette | this
|
||||
mit | my/mine
|
||||
også | also
|
||||
under | under/beneath/below/during, below/underneath
|
||||
have | have
|
||||
dig | you
|
||||
anden | other
|
||||
hende | her
|
||||
mine | my
|
||||
alt | everything
|
||||
meget | much/very, plenty of
|
||||
sit | his, her, its, one's
|
||||
sine | his, her, its, one's
|
||||
vor | our
|
||||
mod | against
|
||||
disse | these
|
||||
hvis | if
|
||||
din | your/yours
|
||||
nogle | some
|
||||
hos | by/at
|
||||
blive | be/become
|
||||
mange | many
|
||||
ad | by/through
|
||||
bliver | present tense of "to be/to become"
|
||||
hendes | her/hers
|
||||
været | be
|
||||
thi | for (conj)
|
||||
jer | you
|
||||
sådan | such, like this/like that
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(DanishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/analyzer_de.go
generated
vendored
Normal file
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/analyzer_de.go
generated
vendored
Normal file
@ -0,0 +1,59 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
// +build icu full
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "de"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopDeFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: icuTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopDeFilter,
|
||||
normalizeDeFilter,
|
||||
stemmerDeFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
97
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/analyzer_de_test.go
generated
vendored
Normal file
97
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/analyzer_de_test.go
generated
vendored
Normal file
@ -0,0 +1,97 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
// +build icu full
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestGermanAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte("Tisch"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("tisch"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Tische"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("tisch"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Tischen"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("tisch"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
// german specials
|
||||
{
|
||||
input: []byte("Schaltflächen"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("schaltflach"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Schaltflaechen"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("schaltflach"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
94
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/german_normalize.go
generated
vendored
Normal file
94
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/german_normalize.go
generated
vendored
Normal file
@ -0,0 +1,94 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_de"
|
||||
|
||||
const (
|
||||
N = /* ordinary state */ 0
|
||||
V = 1 /* stops 'u' from entering umlaut state */
|
||||
U = 2 /* umlaut state, allows e-deletion */
|
||||
)
|
||||
|
||||
type GermanNormalizeFilter struct {
|
||||
}
|
||||
|
||||
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
|
||||
return &GermanNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
state := N
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case 'a', 'o':
|
||||
state = U
|
||||
case 'u':
|
||||
if state == N {
|
||||
state = U
|
||||
} else {
|
||||
state = V
|
||||
}
|
||||
case 'e':
|
||||
if state == U {
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
state = V
|
||||
case 'i', 'q', 'y':
|
||||
state = V
|
||||
case 'ä':
|
||||
runes[i] = 'a'
|
||||
state = V
|
||||
case 'ö':
|
||||
runes[i] = 'o'
|
||||
state = V
|
||||
case 'ü':
|
||||
runes[i] = 'u'
|
||||
state = V
|
||||
case 'ß':
|
||||
runes[i] = 's'
|
||||
i++
|
||||
// newrunes := make([]rune, len(runes)+1)
|
||||
// copy(newrunes, runes)
|
||||
// runes = newrunes
|
||||
// runes[i] = 's'
|
||||
runes = analysis.InsertRune(runes, i, 's')
|
||||
state = N
|
||||
default:
|
||||
state = N
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewGermanNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
}
|
98
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/german_normalize_test.go
generated
vendored
Normal file
98
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/german_normalize_test.go
generated
vendored
Normal file
@ -0,0 +1,98 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestGermanNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// Tests that a/o/u + e is equivalent to the umlaut form
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Schaltflächen"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Schaltflachen"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Schaltflaechen"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Schaltflachen"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Tests the specific heuristic that ue is not folded after a vowel or q.
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("dauer"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("dauer"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Tests german specific folding of sharp-s
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("weißbier"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("weissbier"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
germanNormalizeFilter := NewGermanNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := germanNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stemmer_de.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stemmer_de.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_de"
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return stemmer_filter.NewStemmerFilter("de")
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stop_filter_de.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stop_filter_de.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
318
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stop_words_de.go
generated
vendored
Normal file
318
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stop_words_de.go
generated
vendored
Normal file
@ -0,0 +1,318 @@
|
||||
package de
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_de"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| The number of forms in this list is reduced significantly by passing it
|
||||
| through the German stemmer.
|
||||
|
||||
|
||||
aber | but
|
||||
|
||||
alle | all
|
||||
allem
|
||||
allen
|
||||
aller
|
||||
alles
|
||||
|
||||
als | than, as
|
||||
also | so
|
||||
am | an + dem
|
||||
an | at
|
||||
|
||||
ander | other
|
||||
andere
|
||||
anderem
|
||||
anderen
|
||||
anderer
|
||||
anderes
|
||||
anderm
|
||||
andern
|
||||
anderr
|
||||
anders
|
||||
|
||||
auch | also
|
||||
auf | on
|
||||
aus | out of
|
||||
bei | by
|
||||
bin | am
|
||||
bis | until
|
||||
bist | art
|
||||
da | there
|
||||
damit | with it
|
||||
dann | then
|
||||
|
||||
der | the
|
||||
den
|
||||
des
|
||||
dem
|
||||
die
|
||||
das
|
||||
|
||||
daß | that
|
||||
|
||||
derselbe | the same
|
||||
derselben
|
||||
denselben
|
||||
desselben
|
||||
demselben
|
||||
dieselbe
|
||||
dieselben
|
||||
dasselbe
|
||||
|
||||
dazu | to that
|
||||
|
||||
dein | thy
|
||||
deine
|
||||
deinem
|
||||
deinen
|
||||
deiner
|
||||
deines
|
||||
|
||||
denn | because
|
||||
|
||||
derer | of those
|
||||
dessen | of him
|
||||
|
||||
dich | thee
|
||||
dir | to thee
|
||||
du | thou
|
||||
|
||||
dies | this
|
||||
diese
|
||||
diesem
|
||||
diesen
|
||||
dieser
|
||||
dieses
|
||||
|
||||
|
||||
doch | (several meanings)
|
||||
dort | (over) there
|
||||
|
||||
|
||||
durch | through
|
||||
|
||||
ein | a
|
||||
eine
|
||||
einem
|
||||
einen
|
||||
einer
|
||||
eines
|
||||
|
||||
einig | some
|
||||
einige
|
||||
einigem
|
||||
einigen
|
||||
einiger
|
||||
einiges
|
||||
|
||||
einmal | once
|
||||
|
||||
er | he
|
||||
ihn | him
|
||||
ihm | to him
|
||||
|
||||
es | it
|
||||
etwas | something
|
||||
|
||||
euer | your
|
||||
eure
|
||||
eurem
|
||||
euren
|
||||
eurer
|
||||
eures
|
||||
|
||||
für | for
|
||||
gegen | towards
|
||||
gewesen | p.p. of sein
|
||||
hab | have
|
||||
habe | have
|
||||
haben | have
|
||||
hat | has
|
||||
hatte | had
|
||||
hatten | had
|
||||
hier | here
|
||||
hin | there
|
||||
hinter | behind
|
||||
|
||||
ich | I
|
||||
mich | me
|
||||
mir | to me
|
||||
|
||||
|
||||
ihr | you, to her
|
||||
ihre
|
||||
ihrem
|
||||
ihren
|
||||
ihrer
|
||||
ihres
|
||||
euch | to you
|
||||
|
||||
im | in + dem
|
||||
in | in
|
||||
indem | while
|
||||
ins | in + das
|
||||
ist | is
|
||||
|
||||
jede | each, every
|
||||
jedem
|
||||
jeden
|
||||
jeder
|
||||
jedes
|
||||
|
||||
jene | that
|
||||
jenem
|
||||
jenen
|
||||
jener
|
||||
jenes
|
||||
|
||||
jetzt | now
|
||||
kann | can
|
||||
|
||||
kein | no
|
||||
keine
|
||||
keinem
|
||||
keinen
|
||||
keiner
|
||||
keines
|
||||
|
||||
können | can
|
||||
könnte | could
|
||||
machen | do
|
||||
man | one
|
||||
|
||||
manche | some, many a
|
||||
manchem
|
||||
manchen
|
||||
mancher
|
||||
manches
|
||||
|
||||
mein | my
|
||||
meine
|
||||
meinem
|
||||
meinen
|
||||
meiner
|
||||
meines
|
||||
|
||||
mit | with
|
||||
muss | must
|
||||
musste | had to
|
||||
nach | to(wards)
|
||||
nicht | not
|
||||
nichts | nothing
|
||||
noch | still, yet
|
||||
nun | now
|
||||
nur | only
|
||||
ob | whether
|
||||
oder | or
|
||||
ohne | without
|
||||
sehr | very
|
||||
|
||||
sein | his
|
||||
seine
|
||||
seinem
|
||||
seinen
|
||||
seiner
|
||||
seines
|
||||
|
||||
selbst | self
|
||||
sich | herself
|
||||
|
||||
sie | they, she
|
||||
ihnen | to them
|
||||
|
||||
sind | are
|
||||
so | so
|
||||
|
||||
solche | such
|
||||
solchem
|
||||
solchen
|
||||
solcher
|
||||
solches
|
||||
|
||||
soll | shall
|
||||
sollte | should
|
||||
sondern | but
|
||||
sonst | else
|
||||
über | over
|
||||
um | about, around
|
||||
und | and
|
||||
|
||||
uns | us
|
||||
unse
|
||||
unsem
|
||||
unsen
|
||||
unser
|
||||
unses
|
||||
|
||||
unter | under
|
||||
viel | much
|
||||
vom | von + dem
|
||||
von | from
|
||||
vor | before
|
||||
während | while
|
||||
war | was
|
||||
waren | were
|
||||
warst | wast
|
||||
was | what
|
||||
weg | away, off
|
||||
weil | because
|
||||
weiter | further
|
||||
|
||||
welche | which
|
||||
welchem
|
||||
welchen
|
||||
welcher
|
||||
welches
|
||||
|
||||
wenn | when
|
||||
werde | will
|
||||
werden | will
|
||||
wie | how
|
||||
wieder | again
|
||||
will | want
|
||||
wir | we
|
||||
wird | will
|
||||
wirst | willst
|
||||
wo | where
|
||||
wollen | want
|
||||
wollte | wanted
|
||||
würde | would
|
||||
würden | would
|
||||
zu | to
|
||||
zum | zu + dem
|
||||
zur | zu + der
|
||||
zwar | indeed
|
||||
zwischen | between
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(GermanStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/el/stop_filter_el.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/el/stop_filter_el.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package el
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
102
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/el/stop_words_el.go
generated
vendored
Normal file
102
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/el/stop_words_el.go
generated
vendored
Normal file
@ -0,0 +1,102 @@
|
||||
package el
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_el"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var GreekStopWords = []byte(`# Lucene Greek Stopwords list
|
||||
# Note: by default this file is used after GreekLowerCaseFilter,
|
||||
# so when modifying this file use 'σ' instead of 'ς'
|
||||
ο
|
||||
η
|
||||
το
|
||||
οι
|
||||
τα
|
||||
του
|
||||
τησ
|
||||
των
|
||||
τον
|
||||
την
|
||||
και
|
||||
κι
|
||||
κ
|
||||
ειμαι
|
||||
εισαι
|
||||
ειναι
|
||||
ειμαστε
|
||||
ειστε
|
||||
στο
|
||||
στον
|
||||
στη
|
||||
στην
|
||||
μα
|
||||
αλλα
|
||||
απο
|
||||
για
|
||||
προσ
|
||||
με
|
||||
σε
|
||||
ωσ
|
||||
παρα
|
||||
αντι
|
||||
κατα
|
||||
μετα
|
||||
θα
|
||||
να
|
||||
δε
|
||||
δεν
|
||||
μη
|
||||
μην
|
||||
επι
|
||||
ενω
|
||||
εαν
|
||||
αν
|
||||
τοτε
|
||||
που
|
||||
πωσ
|
||||
ποιοσ
|
||||
ποια
|
||||
ποιο
|
||||
ποιοι
|
||||
ποιεσ
|
||||
ποιων
|
||||
ποιουσ
|
||||
αυτοσ
|
||||
αυτη
|
||||
αυτο
|
||||
αυτοι
|
||||
αυτων
|
||||
αυτουσ
|
||||
αυτεσ
|
||||
αυτα
|
||||
εκεινοσ
|
||||
εκεινη
|
||||
εκεινο
|
||||
εκεινοι
|
||||
εκεινεσ
|
||||
εκεινα
|
||||
εκεινων
|
||||
εκεινουσ
|
||||
οπωσ
|
||||
ομωσ
|
||||
ισωσ
|
||||
οσο
|
||||
οτι
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(GreekStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
57
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/analyzer_en.go
generated
vendored
Normal file
57
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/analyzer_en.go
generated
vendored
Normal file
@ -0,0 +1,57 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/porter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "en"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
possEnFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopEnFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerEnFilter, err := cache.TokenFilterNamed(porter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
possEnFilter,
|
||||
toLowerFilter,
|
||||
stopEnFilter,
|
||||
stemmerEnFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
100
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/analyzer_en_test.go
generated
vendored
Normal file
100
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/analyzer_en_test.go
generated
vendored
Normal file
@ -0,0 +1,100 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestEnglishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("books"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("book"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("book"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("book"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word removal
|
||||
{
|
||||
input: []byte("the"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
// possessive removal
|
||||
{
|
||||
input: []byte("steven's"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("steven"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("steven\u2019s"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("steven"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("steven\uFF07s"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("steven"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
57
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/possessive_filter_en.go
generated
vendored
Normal file
57
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/possessive_filter_en.go
generated
vendored
Normal file
@ -0,0 +1,57 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const PossessiveName = "possessive_en"
|
||||
|
||||
const rightSingleQuotationMark = '’'
|
||||
const apostrophe = '\''
|
||||
const fullWidthApostrophe = '''
|
||||
|
||||
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
|
||||
|
||||
type PossessiveFilter struct {
|
||||
}
|
||||
|
||||
func NewPossessiveFilter() *PossessiveFilter {
|
||||
return &PossessiveFilter{}
|
||||
}
|
||||
|
||||
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
runes := bytes.Runes(token.Term)
|
||||
if len(runes) >= 2 {
|
||||
secondToLastRune := runes[len(runes)-2]
|
||||
lastRune := runes[len(runes)-1]
|
||||
if (secondToLastRune == rightSingleQuotationMark ||
|
||||
secondToLastRune == apostrophe ||
|
||||
secondToLastRune == fullWidthApostrophe) &&
|
||||
(lastRune == 's' || lastRune == 'S') {
|
||||
token.Term = analysis.TruncateRunes(token.Term, 2)
|
||||
}
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func PossessiveFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewPossessiveFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor)
|
||||
}
|
86
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/possessive_filter_en_test.go
generated
vendored
Normal file
86
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/possessive_filter_en_test.go
generated
vendored
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestEnglishPossessiveFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("marty's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY'S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty’s"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY’S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY'S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("m"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("m"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := stemmerFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stemmer_en.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stemmer_en.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_en"
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return stemmer_filter.NewStemmerFilter("en")
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
}
|
72
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stemmer_en_test.go
generated
vendored
Normal file
72
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stemmer_en_test.go
generated
vendored
Normal file
@ -0,0 +1,72 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestEnglishStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walking"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talked"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("business"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("busi"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := stemmerFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stop_filter_en.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stop_filter_en.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
343
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stop_words_en.go
generated
vendored
Normal file
343
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stop_words_en.go
generated
vendored
Normal file
@ -0,0 +1,343 @@
|
||||
package en
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_en"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| An English stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| Many of the forms below are quite rare (e.g. "yourselves") but included for
|
||||
| completeness.
|
||||
|
||||
| PRONOUNS FORMS
|
||||
| 1st person sing
|
||||
|
||||
i | subject, always in upper case of course
|
||||
|
||||
me | object
|
||||
my | possessive adjective
|
||||
| the possessive pronoun 'mine' is best suppressed, because of the
|
||||
| sense of coal-mine etc.
|
||||
myself | reflexive
|
||||
| 1st person plural
|
||||
we | subject
|
||||
|
||||
| us | object
|
||||
| care is required here because US = United States. It is usually
|
||||
| safe to remove it if it is in lower case.
|
||||
our | possessive adjective
|
||||
ours | possessive pronoun
|
||||
ourselves | reflexive
|
||||
| second person (archaic 'thou' forms not included)
|
||||
you | subject and object
|
||||
your | possessive adjective
|
||||
yours | possessive pronoun
|
||||
yourself | reflexive (singular)
|
||||
yourselves | reflexive (plural)
|
||||
| third person singular
|
||||
he | subject
|
||||
him | object
|
||||
his | possessive adjective and pronoun
|
||||
himself | reflexive
|
||||
|
||||
she | subject
|
||||
her | object and possessive adjective
|
||||
hers | possessive pronoun
|
||||
herself | reflexive
|
||||
|
||||
it | subject and object
|
||||
its | possessive adjective
|
||||
itself | reflexive
|
||||
| third person plural
|
||||
they | subject
|
||||
them | object
|
||||
their | possessive adjective
|
||||
theirs | possessive pronoun
|
||||
themselves | reflexive
|
||||
| other forms (demonstratives, interrogatives)
|
||||
what
|
||||
which
|
||||
who
|
||||
whom
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
|
||||
| VERB FORMS (using F.R. Palmer's nomenclature)
|
||||
| BE
|
||||
am | 1st person, present
|
||||
is | -s form (3rd person, present)
|
||||
are | present
|
||||
was | 1st person, past
|
||||
were | past
|
||||
be | infinitive
|
||||
been | past participle
|
||||
being | -ing form
|
||||
| HAVE
|
||||
have | simple
|
||||
has | -s form
|
||||
had | past
|
||||
having | -ing form
|
||||
| DO
|
||||
do | simple
|
||||
does | -s form
|
||||
did | past
|
||||
doing | -ing form
|
||||
|
||||
| The forms below are, I believe, best omitted, because of the significant
|
||||
| homonym forms:
|
||||
|
||||
| He made a WILL
|
||||
| old tin CAN
|
||||
| merry month of MAY
|
||||
| a smell of MUST
|
||||
| fight the good fight with all thy MIGHT
|
||||
|
||||
| would, could, should, ought might however be included
|
||||
|
||||
| | AUXILIARIES
|
||||
| | WILL
|
||||
|will
|
||||
|
||||
would
|
||||
|
||||
| | SHALL
|
||||
|shall
|
||||
|
||||
should
|
||||
|
||||
| | CAN
|
||||
|can
|
||||
|
||||
could
|
||||
|
||||
| | MAY
|
||||
|may
|
||||
|might
|
||||
| | MUST
|
||||
|must
|
||||
| | OUGHT
|
||||
|
||||
ought
|
||||
|
||||
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
|
||||
| pronoun + verb
|
||||
|
||||
i'm
|
||||
you're
|
||||
he's
|
||||
she's
|
||||
it's
|
||||
we're
|
||||
they're
|
||||
i've
|
||||
you've
|
||||
we've
|
||||
they've
|
||||
i'd
|
||||
you'd
|
||||
he'd
|
||||
she'd
|
||||
we'd
|
||||
they'd
|
||||
i'll
|
||||
you'll
|
||||
he'll
|
||||
she'll
|
||||
we'll
|
||||
they'll
|
||||
|
||||
| verb + negation
|
||||
|
||||
isn't
|
||||
aren't
|
||||
wasn't
|
||||
weren't
|
||||
hasn't
|
||||
haven't
|
||||
hadn't
|
||||
doesn't
|
||||
don't
|
||||
didn't
|
||||
|
||||
| auxiliary + negation
|
||||
|
||||
won't
|
||||
wouldn't
|
||||
shan't
|
||||
shouldn't
|
||||
can't
|
||||
cannot
|
||||
couldn't
|
||||
mustn't
|
||||
|
||||
| miscellaneous forms
|
||||
|
||||
let's
|
||||
that's
|
||||
who's
|
||||
what's
|
||||
here's
|
||||
there's
|
||||
when's
|
||||
where's
|
||||
why's
|
||||
how's
|
||||
|
||||
| rarer forms
|
||||
|
||||
| daren't needn't
|
||||
|
||||
| doubtful forms
|
||||
|
||||
| oughtn't mightn't
|
||||
|
||||
| ARTICLES
|
||||
a
|
||||
an
|
||||
the
|
||||
|
||||
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
|
||||
| high, that classification is pointless.)
|
||||
and
|
||||
but
|
||||
if
|
||||
or
|
||||
because
|
||||
as
|
||||
until
|
||||
while
|
||||
|
||||
of
|
||||
at
|
||||
by
|
||||
for
|
||||
with
|
||||
about
|
||||
against
|
||||
between
|
||||
into
|
||||
through
|
||||
during
|
||||
before
|
||||
after
|
||||
above
|
||||
below
|
||||
to
|
||||
from
|
||||
up
|
||||
down
|
||||
in
|
||||
out
|
||||
on
|
||||
off
|
||||
over
|
||||
under
|
||||
|
||||
again
|
||||
further
|
||||
then
|
||||
once
|
||||
|
||||
here
|
||||
there
|
||||
when
|
||||
where
|
||||
why
|
||||
how
|
||||
|
||||
all
|
||||
any
|
||||
both
|
||||
each
|
||||
few
|
||||
more
|
||||
most
|
||||
other
|
||||
some
|
||||
such
|
||||
|
||||
no
|
||||
nor
|
||||
not
|
||||
only
|
||||
own
|
||||
same
|
||||
so
|
||||
than
|
||||
too
|
||||
very
|
||||
|
||||
| Just for the record, the following words are among the commonest in English
|
||||
|
||||
| one
|
||||
| every
|
||||
| least
|
||||
| less
|
||||
| many
|
||||
| now
|
||||
| ever
|
||||
| never
|
||||
| say
|
||||
| says
|
||||
| said
|
||||
| also
|
||||
| get
|
||||
| go
|
||||
| goes
|
||||
| just
|
||||
| made
|
||||
| make
|
||||
| put
|
||||
| see
|
||||
| seen
|
||||
| whether
|
||||
| like
|
||||
| well
|
||||
| back
|
||||
| even
|
||||
| still
|
||||
| way
|
||||
| take
|
||||
| since
|
||||
| another
|
||||
| however
|
||||
| two
|
||||
| three
|
||||
| four
|
||||
| five
|
||||
| first
|
||||
| second
|
||||
| new
|
||||
| old
|
||||
| high
|
||||
| long
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(EnglishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/analyzer_es.go
generated
vendored
Normal file
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/analyzer_es.go
generated
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
// +build icu full
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "es"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopEsFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerEsFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: icuTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopEsFilter,
|
||||
stemmerEsFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
64
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/analyzer_es_test.go
generated
vendored
Normal file
64
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/analyzer_es_test.go
generated
vendored
Normal file
@ -0,0 +1,64 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
// +build icu full
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestSpanishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("chicana"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chican"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("chicano"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chican"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stemmer_es.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stemmer_es.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_es"
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return stemmer_filter.NewStemmerFilter("es")
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stop_filter_es.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stop_filter_es.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
380
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stop_words_es.go
generated
vendored
Normal file
380
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stop_words_es.go
generated
vendored
Normal file
@ -0,0 +1,380 @@
|
||||
package es
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_es"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var SpanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords
|
||||
| deriving from a large sample of text.
|
||||
|
||||
| Extra words have been added at the end.
|
||||
|
||||
de | from, of
|
||||
la | the, her
|
||||
que | who, that
|
||||
el | the
|
||||
en | in
|
||||
y | and
|
||||
a | to
|
||||
los | the, them
|
||||
del | de + el
|
||||
se | himself, from him etc
|
||||
las | the, them
|
||||
por | for, by, etc
|
||||
un | a
|
||||
para | for
|
||||
con | with
|
||||
no | no
|
||||
una | a
|
||||
su | his, her
|
||||
al | a + el
|
||||
| es from SER
|
||||
lo | him
|
||||
como | how
|
||||
más | more
|
||||
pero | pero
|
||||
sus | su plural
|
||||
le | to him, her
|
||||
ya | already
|
||||
o | or
|
||||
| fue from SER
|
||||
este | this
|
||||
| ha from HABER
|
||||
sí | himself etc
|
||||
porque | because
|
||||
esta | this
|
||||
| son from SER
|
||||
entre | between
|
||||
| está from ESTAR
|
||||
cuando | when
|
||||
muy | very
|
||||
sin | without
|
||||
sobre | on
|
||||
| ser from SER
|
||||
| tiene from TENER
|
||||
también | also
|
||||
me | me
|
||||
hasta | until
|
||||
hay | there is/are
|
||||
donde | where
|
||||
| han from HABER
|
||||
quien | whom, that
|
||||
| están from ESTAR
|
||||
| estado from ESTAR
|
||||
desde | from
|
||||
todo | all
|
||||
nos | us
|
||||
durante | during
|
||||
| estados from ESTAR
|
||||
todos | all
|
||||
uno | a
|
||||
les | to them
|
||||
ni | nor
|
||||
contra | against
|
||||
otros | other
|
||||
| fueron from SER
|
||||
ese | that
|
||||
eso | that
|
||||
| había from HABER
|
||||
ante | before
|
||||
ellos | they
|
||||
e | and (variant of y)
|
||||
esto | this
|
||||
mí | me
|
||||
antes | before
|
||||
algunos | some
|
||||
qué | what?
|
||||
unos | a
|
||||
yo | I
|
||||
otro | other
|
||||
otras | other
|
||||
otra | other
|
||||
él | he
|
||||
tanto | so much, many
|
||||
esa | that
|
||||
estos | these
|
||||
mucho | much, many
|
||||
quienes | who
|
||||
nada | nothing
|
||||
muchos | many
|
||||
cual | who
|
||||
| sea from SER
|
||||
poco | few
|
||||
ella | she
|
||||
estar | to be
|
||||
| haber from HABER
|
||||
estas | these
|
||||
| estaba from ESTAR
|
||||
| estamos from ESTAR
|
||||
algunas | some
|
||||
algo | something
|
||||
nosotros | we
|
||||
|
||||
| other forms
|
||||
|
||||
mi | me
|
||||
mis | mi plural
|
||||
tú | thou
|
||||
te | thee
|
||||
ti | thee
|
||||
tu | thy
|
||||
tus | tu plural
|
||||
ellas | they
|
||||
nosotras | we
|
||||
vosotros | you
|
||||
vosotras | you
|
||||
os | you
|
||||
mío | mine
|
||||
mía |
|
||||
míos |
|
||||
mías |
|
||||
tuyo | thine
|
||||
tuya |
|
||||
tuyos |
|
||||
tuyas |
|
||||
suyo | his, hers, theirs
|
||||
suya |
|
||||
suyos |
|
||||
suyas |
|
||||
nuestro | ours
|
||||
nuestra |
|
||||
nuestros |
|
||||
nuestras |
|
||||
vuestro | yours
|
||||
vuestra |
|
||||
vuestros |
|
||||
vuestras |
|
||||
esos | those
|
||||
esas | those
|
||||
|
||||
| forms of estar, to be (not including the infinitive):
|
||||
estoy
|
||||
estás
|
||||
está
|
||||
estamos
|
||||
estáis
|
||||
están
|
||||
esté
|
||||
estés
|
||||
estemos
|
||||
estéis
|
||||
estén
|
||||
estaré
|
||||
estarás
|
||||
estará
|
||||
estaremos
|
||||
estaréis
|
||||
estarán
|
||||
estaría
|
||||
estarías
|
||||
estaríamos
|
||||
estaríais
|
||||
estarían
|
||||
estaba
|
||||
estabas
|
||||
estábamos
|
||||
estabais
|
||||
estaban
|
||||
estuve
|
||||
estuviste
|
||||
estuvo
|
||||
estuvimos
|
||||
estuvisteis
|
||||
estuvieron
|
||||
estuviera
|
||||
estuvieras
|
||||
estuviéramos
|
||||
estuvierais
|
||||
estuvieran
|
||||
estuviese
|
||||
estuvieses
|
||||
estuviésemos
|
||||
estuvieseis
|
||||
estuviesen
|
||||
estando
|
||||
estado
|
||||
estada
|
||||
estados
|
||||
estadas
|
||||
estad
|
||||
|
||||
| forms of haber, to have (not including the infinitive):
|
||||
he
|
||||
has
|
||||
ha
|
||||
hemos
|
||||
habéis
|
||||
han
|
||||
haya
|
||||
hayas
|
||||
hayamos
|
||||
hayáis
|
||||
hayan
|
||||
habré
|
||||
habrás
|
||||
habrá
|
||||
habremos
|
||||
habréis
|
||||
habrán
|
||||
habría
|
||||
habrías
|
||||
habríamos
|
||||
habríais
|
||||
habrían
|
||||
había
|
||||
habías
|
||||
habíamos
|
||||
habíais
|
||||
habían
|
||||
hube
|
||||
hubiste
|
||||
hubo
|
||||
hubimos
|
||||
hubisteis
|
||||
hubieron
|
||||
hubiera
|
||||
hubieras
|
||||
hubiéramos
|
||||
hubierais
|
||||
hubieran
|
||||
hubiese
|
||||
hubieses
|
||||
hubiésemos
|
||||
hubieseis
|
||||
hubiesen
|
||||
habiendo
|
||||
habido
|
||||
habida
|
||||
habidos
|
||||
habidas
|
||||
|
||||
| forms of ser, to be (not including the infinitive):
|
||||
soy
|
||||
eres
|
||||
es
|
||||
somos
|
||||
sois
|
||||
son
|
||||
sea
|
||||
seas
|
||||
seamos
|
||||
seáis
|
||||
sean
|
||||
seré
|
||||
serás
|
||||
será
|
||||
seremos
|
||||
seréis
|
||||
serán
|
||||
sería
|
||||
serías
|
||||
seríamos
|
||||
seríais
|
||||
serían
|
||||
era
|
||||
eras
|
||||
éramos
|
||||
erais
|
||||
eran
|
||||
fui
|
||||
fuiste
|
||||
fue
|
||||
fuimos
|
||||
fuisteis
|
||||
fueron
|
||||
fuera
|
||||
fueras
|
||||
fuéramos
|
||||
fuerais
|
||||
fueran
|
||||
fuese
|
||||
fueses
|
||||
fuésemos
|
||||
fueseis
|
||||
fuesen
|
||||
siendo
|
||||
sido
|
||||
| sed also means 'thirst'
|
||||
|
||||
| forms of tener, to have (not including the infinitive):
|
||||
tengo
|
||||
tienes
|
||||
tiene
|
||||
tenemos
|
||||
tenéis
|
||||
tienen
|
||||
tenga
|
||||
tengas
|
||||
tengamos
|
||||
tengáis
|
||||
tengan
|
||||
tendré
|
||||
tendrás
|
||||
tendrá
|
||||
tendremos
|
||||
tendréis
|
||||
tendrán
|
||||
tendría
|
||||
tendrías
|
||||
tendríamos
|
||||
tendríais
|
||||
tendrían
|
||||
tenía
|
||||
tenías
|
||||
teníamos
|
||||
teníais
|
||||
tenían
|
||||
tuve
|
||||
tuviste
|
||||
tuvo
|
||||
tuvimos
|
||||
tuvisteis
|
||||
tuvieron
|
||||
tuviera
|
||||
tuvieras
|
||||
tuviéramos
|
||||
tuvierais
|
||||
tuvieran
|
||||
tuviese
|
||||
tuvieses
|
||||
tuviésemos
|
||||
tuvieseis
|
||||
tuviesen
|
||||
teniendo
|
||||
tenido
|
||||
tenida
|
||||
tenidos
|
||||
tenidas
|
||||
tened
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(SpanishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/eu/stop_filter_eu.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/eu/stop_filter_eu.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package eu
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
123
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/eu/stop_words_eu.go
generated
vendored
Normal file
123
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/eu/stop_words_eu.go
generated
vendored
Normal file
@ -0,0 +1,123 @@
|
||||
package eu
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_eu"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var BasqueStopWords = []byte(`# example set of basque stopwords
|
||||
al
|
||||
anitz
|
||||
arabera
|
||||
asko
|
||||
baina
|
||||
bat
|
||||
batean
|
||||
batek
|
||||
bati
|
||||
batzuei
|
||||
batzuek
|
||||
batzuetan
|
||||
batzuk
|
||||
bera
|
||||
beraiek
|
||||
berau
|
||||
berauek
|
||||
bere
|
||||
berori
|
||||
beroriek
|
||||
beste
|
||||
bezala
|
||||
da
|
||||
dago
|
||||
dira
|
||||
ditu
|
||||
du
|
||||
dute
|
||||
edo
|
||||
egin
|
||||
ere
|
||||
eta
|
||||
eurak
|
||||
ez
|
||||
gainera
|
||||
gu
|
||||
gutxi
|
||||
guzti
|
||||
haiei
|
||||
haiek
|
||||
haietan
|
||||
hainbeste
|
||||
hala
|
||||
han
|
||||
handik
|
||||
hango
|
||||
hara
|
||||
hari
|
||||
hark
|
||||
hartan
|
||||
hau
|
||||
hauei
|
||||
hauek
|
||||
hauetan
|
||||
hemen
|
||||
hemendik
|
||||
hemengo
|
||||
hi
|
||||
hona
|
||||
honek
|
||||
honela
|
||||
honetan
|
||||
honi
|
||||
hor
|
||||
hori
|
||||
horiei
|
||||
horiek
|
||||
horietan
|
||||
horko
|
||||
horra
|
||||
horrek
|
||||
horrela
|
||||
horretan
|
||||
horri
|
||||
hortik
|
||||
hura
|
||||
izan
|
||||
ni
|
||||
noiz
|
||||
nola
|
||||
non
|
||||
nondik
|
||||
nongo
|
||||
nor
|
||||
nora
|
||||
ze
|
||||
zein
|
||||
zen
|
||||
zenbait
|
||||
zenbat
|
||||
zer
|
||||
zergatik
|
||||
ziren
|
||||
zituen
|
||||
zu
|
||||
zuek
|
||||
zuen
|
||||
zuten
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(BasqueStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
67
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/analyzer_fa.go
generated
vendored
Normal file
67
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/analyzer_fa.go
generated
vendored
Normal file
@ -0,0 +1,67 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build icu full
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "fa"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
zFilter, err := cache.CharFilterNamed(zero_width_non_joiner.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normFaFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopFaFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
CharFilters: []analysis.CharFilter{
|
||||
zFilter,
|
||||
},
|
||||
Tokenizer: icuTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
normArFilter,
|
||||
normFaFilter,
|
||||
stopFaFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
681
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/analyzer_fa_test.go
generated
vendored
Normal file
681
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/analyzer_fa_test.go
generated
vendored
Normal file
@ -0,0 +1,681 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build icu full
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestPersianAnalyzerVerbs(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// active present indicative
|
||||
{
|
||||
input: []byte("میخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite indicative
|
||||
{
|
||||
input: []byte("خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective preterite indicative
|
||||
{
|
||||
input: []byte("میخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active future indicative
|
||||
{
|
||||
input: []byte("خواهد خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active present progressive indicative
|
||||
{
|
||||
input: []byte("دارد میخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite progressive indicative
|
||||
{
|
||||
input: []byte("داشت میخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active perfect indicative
|
||||
{
|
||||
input: []byte("خوردهاست"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective perfect indicative
|
||||
{
|
||||
input: []byte("میخوردهاست"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective pluperfect indicative
|
||||
{
|
||||
input: []byte("میخورده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective preterite subjunctive
|
||||
{
|
||||
input: []byte("میخورده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective pluperfect subjunctive
|
||||
{
|
||||
input: []byte("میخورده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present indicative
|
||||
{
|
||||
input: []byte("خورده میشود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite indicative
|
||||
{
|
||||
input: []byte("خورده شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective preterite indicative
|
||||
{
|
||||
input: []byte("خورده میشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive perfect indicative
|
||||
{
|
||||
input: []byte("خورده شدهاست"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective perfect indicative
|
||||
{
|
||||
input: []byte("خورده میشدهاست"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده شده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده میشده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive future indicative
|
||||
{
|
||||
input: []byte("خورده خواهد شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present progressive indicative
|
||||
{
|
||||
input: []byte("دارد خورده میشود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite progressive indicative
|
||||
{
|
||||
input: []byte("داشت خورده میشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present subjunctive
|
||||
{
|
||||
input: []byte("خورده شود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده شده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده میشده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده شده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده میشده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active present subjunctive
|
||||
{
|
||||
input: []byte("بخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بخورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPersianAnalyzerVerbsDefective(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// active present indicative
|
||||
{
|
||||
input: []byte("مي خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite indicative
|
||||
{
|
||||
input: []byte("خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective preterite indicative
|
||||
{
|
||||
input: []byte("مي خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active future indicative
|
||||
{
|
||||
input: []byte("خواهد خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active present progressive indicative
|
||||
{
|
||||
input: []byte("دارد مي خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite progressive indicative
|
||||
{
|
||||
input: []byte("داشت مي خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active perfect indicative
|
||||
{
|
||||
input: []byte("خورده است"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective perfect indicative
|
||||
{
|
||||
input: []byte("مي خورده است"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective pluperfect indicative
|
||||
{
|
||||
input: []byte("مي خورده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective preterite subjunctive
|
||||
{
|
||||
input: []byte("مي خورده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective pluperfect subjunctive
|
||||
{
|
||||
input: []byte("مي خورده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present indicative
|
||||
{
|
||||
input: []byte("خورده مي شود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite indicative
|
||||
{
|
||||
input: []byte("خورده شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective preterite indicative
|
||||
{
|
||||
input: []byte("خورده مي شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive perfect indicative
|
||||
{
|
||||
input: []byte("خورده شده است"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective perfect indicative
|
||||
{
|
||||
input: []byte("خورده مي شده است"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده شده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده مي شده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive future indicative
|
||||
{
|
||||
input: []byte("خورده خواهد شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present progressive indicative
|
||||
{
|
||||
input: []byte("دارد خورده مي شود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite progressive indicative
|
||||
{
|
||||
input: []byte("داشت خورده مي شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present subjunctive
|
||||
{
|
||||
input: []byte("خورده شود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده شده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده مي شده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده شده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده مي شده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active present subjunctive
|
||||
{
|
||||
input: []byte("بخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بخورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPersianAnalyzerOthers(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// nouns
|
||||
{
|
||||
input: []byte("برگ ها"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("برگ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("برگها"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("برگ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// non persian
|
||||
{
|
||||
input: []byte("English test."),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("english"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// others
|
||||
{
|
||||
input: []byte("خورده مي شده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("برگها"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("برگ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
72
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/persian_normalize.go
generated
vendored
Normal file
72
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/persian_normalize.go
generated
vendored
Normal file
@ -0,0 +1,72 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_fa"
|
||||
|
||||
const (
|
||||
Yeh = '\u064A'
|
||||
FarsiYeh = '\u06CC'
|
||||
YehBarree = '\u06D2'
|
||||
Keheh = '\u06A9'
|
||||
Kaf = '\u0643'
|
||||
HamzaAbove = '\u0654'
|
||||
HehYeh = '\u06C0'
|
||||
HehGoal = '\u06C1'
|
||||
Heh = '\u0647'
|
||||
)
|
||||
|
||||
type PersianNormalizeFilter struct {
|
||||
}
|
||||
|
||||
func NewPersianNormalizeFilter() *PersianNormalizeFilter {
|
||||
return &PersianNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case FarsiYeh, YehBarree:
|
||||
runes[i] = Yeh
|
||||
case Keheh:
|
||||
runes[i] = Kaf
|
||||
case HehYeh, HehGoal:
|
||||
runes[i] = Heh
|
||||
case HamzaAbove: // necessary for HEH + HAMZA
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewPersianNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
}
|
125
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/persian_normalize_test.go
generated
vendored
Normal file
125
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/persian_normalize_test.go
generated
vendored
Normal file
@ -0,0 +1,125 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestPersianNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// FarsiYeh
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("های"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هاي"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YehBarree
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هاے"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هاي"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Keheh
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("کشاندن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كشاندن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// HehYeh
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتابۀ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتابه"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// HehHamzaAbove
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتابهٔ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتابه"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// HehGoal
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("زادہ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("زاده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
persianNormalizeFilter := NewPersianNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := persianNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/stop_filter_fa.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/stop_filter_fa.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
337
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/stop_words_fa.go
generated
vendored
Normal file
337
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/stop_words_fa.go
generated
vendored
Normal file
@ -0,0 +1,337 @@
|
||||
package fa
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_fa"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# Note: by default this file is used after normalization, so when adding entries
|
||||
# to this file, use the arabic 'ي' instead of 'ی'
|
||||
انان
|
||||
نداشته
|
||||
سراسر
|
||||
خياه
|
||||
ايشان
|
||||
وي
|
||||
تاكنون
|
||||
بيشتري
|
||||
دوم
|
||||
پس
|
||||
ناشي
|
||||
وگو
|
||||
يا
|
||||
داشتند
|
||||
سپس
|
||||
هنگام
|
||||
هرگز
|
||||
پنج
|
||||
نشان
|
||||
امسال
|
||||
ديگر
|
||||
گروهي
|
||||
شدند
|
||||
چطور
|
||||
ده
|
||||
و
|
||||
دو
|
||||
نخستين
|
||||
ولي
|
||||
چرا
|
||||
چه
|
||||
وسط
|
||||
ه
|
||||
كدام
|
||||
قابل
|
||||
يك
|
||||
رفت
|
||||
هفت
|
||||
همچنين
|
||||
در
|
||||
هزار
|
||||
بله
|
||||
بلي
|
||||
شايد
|
||||
اما
|
||||
شناسي
|
||||
گرفته
|
||||
دهد
|
||||
داشته
|
||||
دانست
|
||||
داشتن
|
||||
خواهيم
|
||||
ميليارد
|
||||
وقتيكه
|
||||
امد
|
||||
خواهد
|
||||
جز
|
||||
اورده
|
||||
شده
|
||||
بلكه
|
||||
خدمات
|
||||
شدن
|
||||
برخي
|
||||
نبود
|
||||
بسياري
|
||||
جلوگيري
|
||||
حق
|
||||
كردند
|
||||
نوعي
|
||||
بعري
|
||||
نكرده
|
||||
نظير
|
||||
نبايد
|
||||
بوده
|
||||
بودن
|
||||
داد
|
||||
اورد
|
||||
هست
|
||||
جايي
|
||||
شود
|
||||
دنبال
|
||||
داده
|
||||
بايد
|
||||
سابق
|
||||
هيچ
|
||||
همان
|
||||
انجا
|
||||
كمتر
|
||||
كجاست
|
||||
گردد
|
||||
كسي
|
||||
تر
|
||||
مردم
|
||||
تان
|
||||
دادن
|
||||
بودند
|
||||
سري
|
||||
جدا
|
||||
ندارند
|
||||
مگر
|
||||
يكديگر
|
||||
دارد
|
||||
دهند
|
||||
بنابراين
|
||||
هنگامي
|
||||
سمت
|
||||
جا
|
||||
انچه
|
||||
خود
|
||||
دادند
|
||||
زياد
|
||||
دارند
|
||||
اثر
|
||||
بدون
|
||||
بهترين
|
||||
بيشتر
|
||||
البته
|
||||
به
|
||||
براساس
|
||||
بيرون
|
||||
كرد
|
||||
بعضي
|
||||
گرفت
|
||||
توي
|
||||
اي
|
||||
ميليون
|
||||
او
|
||||
جريان
|
||||
تول
|
||||
بر
|
||||
مانند
|
||||
برابر
|
||||
باشيم
|
||||
مدتي
|
||||
گويند
|
||||
اكنون
|
||||
تا
|
||||
تنها
|
||||
جديد
|
||||
چند
|
||||
بي
|
||||
نشده
|
||||
كردن
|
||||
كردم
|
||||
گويد
|
||||
كرده
|
||||
كنيم
|
||||
نمي
|
||||
نزد
|
||||
روي
|
||||
قصد
|
||||
فقط
|
||||
بالاي
|
||||
ديگران
|
||||
اين
|
||||
ديروز
|
||||
توسط
|
||||
سوم
|
||||
ايم
|
||||
دانند
|
||||
سوي
|
||||
استفاده
|
||||
شما
|
||||
كنار
|
||||
داريم
|
||||
ساخته
|
||||
طور
|
||||
امده
|
||||
رفته
|
||||
نخست
|
||||
بيست
|
||||
نزديك
|
||||
طي
|
||||
كنيد
|
||||
از
|
||||
انها
|
||||
تمامي
|
||||
داشت
|
||||
يكي
|
||||
طريق
|
||||
اش
|
||||
چيست
|
||||
روب
|
||||
نمايد
|
||||
گفت
|
||||
چندين
|
||||
چيزي
|
||||
تواند
|
||||
ام
|
||||
ايا
|
||||
با
|
||||
ان
|
||||
ايد
|
||||
ترين
|
||||
اينكه
|
||||
ديگري
|
||||
راه
|
||||
هايي
|
||||
بروز
|
||||
همچنان
|
||||
پاعين
|
||||
كس
|
||||
حدود
|
||||
مختلف
|
||||
مقابل
|
||||
چيز
|
||||
گيرد
|
||||
ندارد
|
||||
ضد
|
||||
همچون
|
||||
سازي
|
||||
شان
|
||||
مورد
|
||||
باره
|
||||
مرسي
|
||||
خويش
|
||||
برخوردار
|
||||
چون
|
||||
خارج
|
||||
شش
|
||||
هنوز
|
||||
تحت
|
||||
ضمن
|
||||
هستيم
|
||||
گفته
|
||||
فكر
|
||||
بسيار
|
||||
پيش
|
||||
براي
|
||||
روزهاي
|
||||
انكه
|
||||
نخواهد
|
||||
بالا
|
||||
كل
|
||||
وقتي
|
||||
كي
|
||||
چنين
|
||||
كه
|
||||
گيري
|
||||
نيست
|
||||
است
|
||||
كجا
|
||||
كند
|
||||
نيز
|
||||
يابد
|
||||
بندي
|
||||
حتي
|
||||
توانند
|
||||
عقب
|
||||
خواست
|
||||
كنند
|
||||
بين
|
||||
تمام
|
||||
همه
|
||||
ما
|
||||
باشند
|
||||
مثل
|
||||
شد
|
||||
اري
|
||||
باشد
|
||||
اره
|
||||
طبق
|
||||
بعد
|
||||
اگر
|
||||
صورت
|
||||
غير
|
||||
جاي
|
||||
بيش
|
||||
ريزي
|
||||
اند
|
||||
زيرا
|
||||
چگونه
|
||||
بار
|
||||
لطفا
|
||||
مي
|
||||
درباره
|
||||
من
|
||||
ديده
|
||||
همين
|
||||
گذاري
|
||||
برداري
|
||||
علت
|
||||
گذاشته
|
||||
هم
|
||||
فوق
|
||||
نه
|
||||
ها
|
||||
شوند
|
||||
اباد
|
||||
همواره
|
||||
هر
|
||||
اول
|
||||
خواهند
|
||||
چهار
|
||||
نام
|
||||
امروز
|
||||
مان
|
||||
هاي
|
||||
قبل
|
||||
كنم
|
||||
سعي
|
||||
تازه
|
||||
را
|
||||
هستند
|
||||
زير
|
||||
جلوي
|
||||
عنوان
|
||||
بود
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(PersianStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/analyzer_fi.go
generated
vendored
Normal file
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/analyzer_fi.go
generated
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
// +build icu full
|
||||
|
||||
package fi
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "fi"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopFiFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerFiFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: icuTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopFiFilter,
|
||||
stemmerFiFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
68
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/analyzer_fi_test.go
generated
vendored
Normal file
68
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/analyzer_fi_test.go
generated
vendored
Normal file
@ -0,0 +1,68 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
// +build icu full
|
||||
|
||||
package fi
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestFinishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("edeltäjiinsä"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("edeltäj"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("edeltäjistään"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("edeltäj"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("olla"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stemmer_fi.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stemmer_fi.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build libstemmer full
|
||||
|
||||
package fi
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_fi"
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return stemmer_filter.NewStemmerFilter("fi")
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stop_filter_fi.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stop_filter_fi.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fi
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
121
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stop_words_fi.go
generated
vendored
Normal file
121
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stop_words_fi.go
generated
vendored
Normal file
@ -0,0 +1,121 @@
|
||||
package fi
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_fi"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| forms of BE
|
||||
|
||||
olla
|
||||
olen
|
||||
olet
|
||||
on
|
||||
olemme
|
||||
olette
|
||||
ovat
|
||||
ole | negative form
|
||||
|
||||
oli
|
||||
olisi
|
||||
olisit
|
||||
olisin
|
||||
olisimme
|
||||
olisitte
|
||||
olisivat
|
||||
olit
|
||||
olin
|
||||
olimme
|
||||
olitte
|
||||
olivat
|
||||
ollut
|
||||
olleet
|
||||
|
||||
en | negation
|
||||
et
|
||||
ei
|
||||
emme
|
||||
ette
|
||||
eivät
|
||||
|
||||
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
|
||||
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
|
||||
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
|
||||
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
|
||||
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
|
||||
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
|
||||
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
|
||||
|
||||
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
|
||||
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
|
||||
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
|
||||
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
|
||||
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
|
||||
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
|
||||
|
||||
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
|
||||
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
|
||||
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
|
||||
mitkä | (pl)
|
||||
|
||||
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
|
||||
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
|
||||
|
||||
| conjunctions
|
||||
|
||||
että | that
|
||||
ja | and
|
||||
jos | if
|
||||
koska | because
|
||||
kuin | than
|
||||
mutta | but
|
||||
niin | so
|
||||
sekä | and
|
||||
sillä | for
|
||||
tai | or
|
||||
vaan | but
|
||||
vai | or
|
||||
vaikka | although
|
||||
|
||||
|
||||
| prepositions
|
||||
|
||||
kanssa | with
|
||||
mukaan | according to
|
||||
noin | about
|
||||
poikki | across
|
||||
yli | over, across
|
||||
|
||||
| other
|
||||
|
||||
kun | when
|
||||
niin | so
|
||||
nyt | now
|
||||
itse | self
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(FinnishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
56
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/analyzer_fr.go
generated
vendored
Normal file
56
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/analyzer_fr.go
generated
vendored
Normal file
@ -0,0 +1,56 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "fr"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopFrFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerFrFilter, err := cache.TokenFilterNamed(LightStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
elisionFilter,
|
||||
toLowerFilter,
|
||||
stopFrFilter,
|
||||
stemmerFrFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
196
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/analyzer_fr_test.go
generated
vendored
Normal file
196
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/analyzer_fr_test.go
generated
vendored
Normal file
@ -0,0 +1,196 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestFrenchAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte(""),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
{
|
||||
input: []byte("chien chat cheval"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chien"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("chat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cheval"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("chien CHAT CHEVAL"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chien"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("chat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cheval"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte(" chien ,? + = - CHAT /: > CHEVAL"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chien"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("chat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cheval"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("chien++"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chien"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("mot \"entreguillemet\""),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("mot"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("entreguilemet"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Jean-François"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("jean"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("francoi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop words
|
||||
{
|
||||
input: []byte("le la chien les aux chat du des à cheval"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chien"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("chat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cheval"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// nouns and adjectives
|
||||
{
|
||||
input: []byte("lances chismes habitable chiste éléments captifs"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("lanc"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("chism"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("habitabl"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("chist"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("element"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("captif"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// verbs
|
||||
{
|
||||
input: []byte("finissions souffrirent rugissante"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("finision"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("soufrirent"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("rugisant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("c3po"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("aujourd'hui"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("oeuf"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ïaöuaä"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("anticonstitutionel"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("java"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
37
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/articles_fr.go
generated
vendored
Normal file
37
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/articles_fr.go
generated
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
package fr
|
||||
|
||||
import (
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// ArticlesName is the name under which the French articles token map is
// registered with the bleve registry.
const ArticlesName = "articles_fr"

// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis

// FrenchArticles lists the elidable French articles (l', m', qu', ...),
// one entry per line, in the format consumed by TokenMap.LoadBytes.
var FrenchArticles = []byte(`
l
m
t
qu
n
s
j
d
c
jusqu
quoiqu
lorsqu
puisqu
`)
|
||||
|
||||
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(FrenchArticles)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
// init registers the French articles token map under ArticlesName so it
// can be looked up by name from a registry cache.
func init() {
	registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}
|
32
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/elision_fr.go
generated
vendored
Normal file
32
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/elision_fr.go
generated
vendored
Normal file
@ -0,0 +1,32 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const ElisionName = "elision_fr"
|
||||
|
||||
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building elision filter: %v", err)
|
||||
}
|
||||
return elision_filter.NewElisionFilter(articlesTokenMap), nil
|
||||
}
|
||||
|
||||
// init registers the French elision token filter under ElisionName.
func init() {
	registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}
|
50
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/elision_fr_test.go
generated
vendored
Normal file
50
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/elision_fr_test.go
generated
vendored
Normal file
@ -0,0 +1,50 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestFrenchElision(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("l'avion"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("avion"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := elisionFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
308
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/light_stemmer_fr.go
generated
vendored
Normal file
308
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/light_stemmer_fr.go
generated
vendored
Normal file
@ -0,0 +1,308 @@
|
||||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// LightStemmerName is the name under which the French light stemmer token
// filter is registered with the bleve registry.
const LightStemmerName = "stemmer_fr_light"

// FrenchLightStemmerFilter applies the French "light" stemming algorithm
// to each token in a stream. It holds no state.
type FrenchLightStemmerFilter struct {
}

// NewFrenchLightStemmerFilter returns a new French light stemmer filter.
func NewFrenchLightStemmerFilter() *FrenchLightStemmerFilter {
	return &FrenchLightStemmerFilter{}
}
|
||||
|
||||
func (s *FrenchLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
runes := bytes.Runes(token.Term)
|
||||
runes = stem(runes)
|
||||
token.Term = analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
// stem applies the French light stemming algorithm to input, mutating it
// in place and returning the (possibly shortened) slice. The suffix rules
// below are order-dependent: each rule fires only when the word is long
// enough, most rules return immediately via norm, and a few fall through
// so later rules can apply. Length guards (e.g. inputLen > 9) prevent
// over-stemming of short words.
func stem(input []rune) []rune {

	inputLen := len(input)

	// Plural -aux: rewrite "...aux" to "...al" (e.g. chevaux -> cheval),
	// unless preceded by 'e' (eaux keeps its stem).
	if inputLen > 5 && input[inputLen-1] == 'x' {
		if input[inputLen-3] == 'a' && input[inputLen-2] == 'u' && input[inputLen-4] != 'e' {
			input[inputLen-2] = 'l'
		}
		input = input[0 : inputLen-1]
		inputLen = len(input)
	}

	// Drop a remaining plural 'x', then a plural 's'.
	if inputLen > 3 && input[inputLen-1] == 'x' {
		input = input[0 : inputLen-1]
		inputLen = len(input)
	}

	if inputLen > 3 && input[inputLen-1] == 's' {
		input = input[0 : inputLen-1]
		inputLen = len(input)
	}

	// -issement -> -ir (e.g. investissement -> investir before norm).
	if inputLen > 9 && analysis.RunesEndsWith(input, "issement") {
		input = input[0 : inputLen-6]
		inputLen = len(input)
		input[inputLen-1] = 'r'
		return norm(input)
	}

	// -issant -> -ir.
	if inputLen > 8 && analysis.RunesEndsWith(input, "issant") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
		input[inputLen-1] = 'r'
		return norm(input)
	}

	// Adverbial -ement; -ivement additionally maps to -if.
	if inputLen > 6 && analysis.RunesEndsWith(input, "ement") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
		if inputLen > 3 && analysis.RunesEndsWith(input, "ive") {
			input = input[0 : inputLen-1]
			inputLen = len(input)
			input[inputLen-1] = 'f'
		}
		return norm(input)
	}

	// Agent-noun suffixes: -ficatrice / -ficateur -> -fier.
	if inputLen > 11 && analysis.RunesEndsWith(input, "ficatrice") {
		input = input[0 : inputLen-5]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 10 && analysis.RunesEndsWith(input, "ficateur") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	// -catrice / -cateur -> -quer.
	if inputLen > 9 && analysis.RunesEndsWith(input, "catrice") {
		input = input[0 : inputLen-3]
		inputLen = len(input)
		input[inputLen-4] = 'q'
		input[inputLen-3] = 'u'
		input[inputLen-2] = 'e'
		//s[len-1] = 'r' <-- unnecessary, already 'r'.
		return norm(input)
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "cateur") {
		input = input[0 : inputLen-2]
		inputLen = len(input)
		input[inputLen-4] = 'q'
		input[inputLen-3] = 'u'
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	// -atrice / -ateur -> -er.
	if inputLen > 8 && analysis.RunesEndsWith(input, "atrice") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 7 && analysis.RunesEndsWith(input, "ateur") {
		input = input[0 : inputLen-3]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	// -trice -> -teur; note: no early return, falls through to later rules.
	if inputLen > 6 && analysis.RunesEndsWith(input, "trice") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-3] = 'e'
		input[inputLen-2] = 'u'
		input[inputLen-1] = 'r'
	}

	// Ordinal -ième.
	if inputLen > 5 && analysis.RunesEndsWith(input, "ième") {
		return norm(input[0 : inputLen-4])
	}

	// -teuse / -teur -> -ter.
	if inputLen > 7 && analysis.RunesEndsWith(input, "teuse") {
		input = input[0 : inputLen-2]
		inputLen = len(input)
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 6 && analysis.RunesEndsWith(input, "teur") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-1] = 'r'
		return norm(input)
	}

	// Feminine -euse -> -eu.
	if inputLen > 5 && analysis.RunesEndsWith(input, "euse") {
		return norm(input[0 : inputLen-2])
	}

	// -ère -> -er.
	if inputLen > 8 && analysis.RunesEndsWith(input, "ère") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		return norm(input)
	}

	// Feminine adjective -ive -> -if.
	if inputLen > 7 && analysis.RunesEndsWith(input, "ive") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-1] = 'f'
		return norm(input)
	}

	// Irregular feminines folle/molle -> fou/mou.
	if inputLen > 4 &&
		(analysis.RunesEndsWith(input, "folle") ||
			analysis.RunesEndsWith(input, "molle")) {
		input = input[0 : inputLen-2]
		inputLen = len(input)
		input[inputLen-1] = 'u'
		return norm(input)
	}

	// -nnelle / -nnel (e.g. personnelle -> person after norm).
	if inputLen > 9 && analysis.RunesEndsWith(input, "nnelle") {
		return norm(input[0 : inputLen-5])
	}

	if inputLen > 9 && analysis.RunesEndsWith(input, "nnel") {
		return norm(input[0 : inputLen-3])
	}

	// -ète -> -et; falls through (no return) so -esse etc. may still apply.
	if inputLen > 4 && analysis.RunesEndsWith(input, "ète") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-2] = 'e'
	}

	// Drop -ique; also falls through.
	if inputLen > 8 && analysis.RunesEndsWith(input, "ique") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
	}

	// -esse / -inage.
	if inputLen > 8 && analysis.RunesEndsWith(input, "esse") {
		return norm(input[0 : inputLen-3])
	}

	if inputLen > 7 && analysis.RunesEndsWith(input, "inage") {
		return norm(input[0 : inputLen-3])
	}

	// -isation; -ualisation additionally maps 'a' -> 'e' (e.g. -uel).
	if inputLen > 9 && analysis.RunesEndsWith(input, "isation") {
		input = input[0 : inputLen-7]
		inputLen = len(input)
		if inputLen > 5 && analysis.RunesEndsWith(input, "ual") {
			input[inputLen-2] = 'e'
		}
		return norm(input)
	}

	// -isateur / -ation / -ition.
	if inputLen > 9 && analysis.RunesEndsWith(input, "isateur") {
		return norm(input[0 : inputLen-7])
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "ation") {
		return norm(input[0 : inputLen-5])
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "ition") {
		return norm(input[0 : inputLen-5])
	}

	// No special suffix matched: just normalize.
	return norm(input)

}
|
||||
|
||||
// norm performs the final normalization pass of the French light stemmer:
// for words longer than 4 runes it strips accents (à/á/â->a, è/é/ê->e, ...)
// and collapses runs of repeated letters, then trims a trailing "ie",
// a trailing 'r', up to two trailing 'e's, and a final doubled letter.
// It mutates and returns input.
func norm(input []rune) []rune {

	inputLen := len(input)
	if inputLen > 4 {
		for i := 0; i < inputLen; i++ {
			// Fold common accented vowels (and ç) to their base letter.
			switch input[i] {
			case 'à', 'á', 'â':
				input[i] = 'a'
			case 'ô':
				input[i] = 'o'
			case 'è', 'é', 'ê':
				input[i] = 'e'
			case 'ù', 'û':
				input[i] = 'u'
			case 'î':
				input[i] = 'i'
			case 'ç':
				input[i] = 'c'
			}

			// Collapse adjacent duplicate letters (e.g. "nn" -> "n").
			// NOTE(review): this dedup loop is nested inside the accent
			// loop above and shrinks inputLen mid-iteration; the Lucene
			// original runs it once after the accent pass — confirm this
			// nesting (and the shadowed i) is intentional.
			ch := input[0]
			for i := 1; i < inputLen; i++ {
				if input[i] == ch && unicode.IsLetter(ch) {
					input = analysis.DeleteRune(input, i)
					i -= 1
					inputLen = len(input)
				} else {
					ch = input[i]
				}
			}
		}
	}

	// Strip a trailing "ie".
	if inputLen > 4 && analysis.RunesEndsWith(input, "ie") {
		input = input[0 : inputLen-2]
		inputLen = len(input)
	}

	if inputLen > 4 {
		// Trailing 'r', then up to two trailing 'e's (these checks are
		// sequential, not exclusive), then a final doubled letter.
		if input[inputLen-1] == 'r' {
			input = input[0 : inputLen-1]
			inputLen = len(input)
		}
		if input[inputLen-1] == 'e' {
			input = input[0 : inputLen-1]
			inputLen = len(input)
		}
		if input[inputLen-1] == 'e' {
			input = input[0 : inputLen-1]
			inputLen = len(input)
		}
		if input[inputLen-1] == input[inputLen-2] && unicode.IsLetter(input[inputLen-1]) {
			input = input[0 : inputLen-1]
			inputLen = len(input)
		}
	}

	return input
}
|
||||
|
||||
// FrenchLightStemmerFilterConstructor satisfies the registry token filter
// constructor signature; config and cache are unused.
func FrenchLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewFrenchLightStemmerFilter(), nil
}
|
||||
|
||||
// init registers the French light stemmer filter under LightStemmerName.
func init() {
	registry.RegisterTokenFilter(LightStemmerName, FrenchLightStemmerFilterConstructor)
}
|
997
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/light_stemmer_fr_test.go
generated
vendored
Normal file
997
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/light_stemmer_fr_test.go
generated
vendored
Normal file
@ -0,0 +1,997 @@
|
||||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestFrenchLightStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chevaux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("cheval"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("cheval"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("cheval"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("hiboux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("hibou"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("hibou"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("hibou"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chantés"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chanter"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chante"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chant"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baronnes"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baron"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("barons"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baron"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baron"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baron"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("peaux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("peau"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("peau"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("peau"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("anneaux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("aneau"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("anneau"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("aneau"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("neveux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("neveu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("neveu"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("neveu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("affreux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("afreu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("affreuse"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("afreu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("investissement"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("investi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("investir"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("investi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("assourdissant"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("asourdi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("assourdir"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("asourdi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("pratiquement"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("pratiqu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("pratique"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("pratiqu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("administrativement"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("administratif"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("administratif"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("administratif"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("justificatrice"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("justifi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("justificateur"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("justifi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("justifier"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("justifi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("educatrice"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("eduqu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("eduquer"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("eduqu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("communicateur"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("comuniqu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("communiquer"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("comuniqu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("accompagnatrice"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("acompagn"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("accompagnateur"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("acompagn"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("administrateur"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("administr"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("administrer"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("administr"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("productrice"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("product"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("producteur"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("product"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("acheteuse"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("achet"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("acheteur"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("achet"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("planteur"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("plant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("plante"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("plant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("poreuse"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("poreu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("poreux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("poreu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("plieuse"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("plieu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("bijoutière"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("bijouti"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("bijoutier"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("bijouti"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("caissière"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("caisi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("caissier"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("caisi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abrasive"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abrasif"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abrasif"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abrasif"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("folle"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("fou"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("fou"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("fou"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("personnelle"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("person"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("personne"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("person"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// algo bug: too short length
|
||||
// {
|
||||
// input: analysis.TokenStream{
|
||||
// &analysis.Token{
|
||||
// Term: []byte("personnel"),
|
||||
// },
|
||||
// },
|
||||
// output: analysis.TokenStream{
|
||||
// &analysis.Token{
|
||||
// Term: []byte("person"),
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("complète"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("complet"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("complet"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("complet"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("aromatique"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("aromat"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("faiblesse"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("faibl"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("faible"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("faibl"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("patinage"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("patin"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("patin"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("patin"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("sonorisation"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("sono"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ritualisation"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("rituel"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("rituel"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("rituel"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// algo bug: masked by rules above
|
||||
// {
|
||||
// input: analysis.TokenStream{
|
||||
// &analysis.Token{
|
||||
// Term: []byte("colonisateur"),
|
||||
// },
|
||||
// },
|
||||
// output: analysis.TokenStream{
|
||||
// &analysis.Token{
|
||||
// Term: []byte("colon"),
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("nomination"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("nomin"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("disposition"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("dispos"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("dispose"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("dispos"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// SOLR-3463 : abusive compression of repeated characters in numbers
|
||||
// Trailing repeated char elision :
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1234555"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1234555"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Repeated char within numbers with more than 4 characters :
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("12333345"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("12333345"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Short numbers weren't affected already:
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1234"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1234"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Ensure behaviour is preserved for words!
|
||||
// Trailing repeated char elision :
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcdeff"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcdef"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Repeated char within words with more than 4 characters :
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcccddeef"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcdef"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("créées"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("cre"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Combined letter and digit repetition
|
||||
// 10:00pm
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("22hh00"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("22h00"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
filter, err := cache.TokenFilterNamed(LightStemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
81
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/minimal_stemmer_fr.go
generated
vendored
Normal file
81
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/minimal_stemmer_fr.go
generated
vendored
Normal file
@ -0,0 +1,81 @@
|
||||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// MinimalStemmerName is the registry name under which the minimal French
// stemmer token filter is registered (see init below).
const MinimalStemmerName = "stemmer_fr_min"
|
||||
|
||||
// FrenchMinimalStemmerFilter is a token filter that reduces French terms to
// a light stem (plural/feminine suffix stripping). It holds no state, so the
// zero value is ready to use.
type FrenchMinimalStemmerFilter struct {
}
|
||||
|
||||
func NewFrenchMinimalStemmerFilter() *FrenchMinimalStemmerFilter {
|
||||
return &FrenchMinimalStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *FrenchMinimalStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
runes := bytes.Runes(token.Term)
|
||||
runes = minstem(runes)
|
||||
token.Term = analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
// minstem applies a minimal French stemming algorithm to a single term,
// given as a rune slice. Terms shorter than six runes are returned
// untouched. Otherwise it rewrites a trailing "-aux" to "-al", drops a
// trailing 'x', then strips each of 's', 'r', 'e', 'é' at most once, in
// that order, and finally collapses a doubled final character
// ("personnell" -> "personnel"). The input slice is modified and
// re-sliced in place.
func minstem(input []rune) []rune {

	// Short terms are left alone; this also guarantees the index
	// arithmetic below stays in bounds (at most 4 runes are trimmed
	// before the doubled-character check, leaving length >= 2).
	if len(input) < 6 {
		return input
	}

	// "-x" endings: turn "-aux" into "-al", otherwise just drop the 'x'.
	if last := len(input) - 1; input[last] == 'x' {
		if input[last-2] == 'a' && input[last-1] == 'u' {
			input[last-1] = 'l'
		}
		return input[:last]
	}

	// Strip each suffix rune at most once, in this exact order, each
	// test looking at whatever the current final rune is.
	for _, suffix := range []rune{'s', 'r', 'e', 'é'} {
		if input[len(input)-1] == suffix {
			input = input[:len(input)-1]
		}
	}

	// Collapse a doubled trailing character.
	if n := len(input); input[n-1] == input[n-2] {
		input = input[:n-1]
	}

	return input
}
|
||||
|
||||
// FrenchMinimalStemmerFilterConstructor adapts NewFrenchMinimalStemmerFilter
// to the registry's token filter constructor signature. The config map and
// cache are unused; it never returns an error.
func FrenchMinimalStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewFrenchMinimalStemmerFilter(), nil
}
|
||||
|
||||
// init registers the minimal French stemmer with the global token filter
// registry under MinimalStemmerName so analyzers can reference it by name.
func init() {
	registry.RegisterTokenFilter(MinimalStemmerName, FrenchMinimalStemmerFilterConstructor)
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user