Add message logging and search server side
This commit is contained in:
parent
6378131a9d
commit
3365832ce3
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,2 @@
|
|||||||
bin/
|
|
||||||
client/dist/
|
client/dist/
|
||||||
client/node_modules/
|
client/node_modules/
|
||||||
data.db
|
|
53
Godeps/Godeps.json
generated
53
Godeps/Godeps.json
generated
@ -1,19 +1,72 @@
|
|||||||
{
|
{
|
||||||
"ImportPath": "github.com/khlieng/name_pending",
|
"ImportPath": "github.com/khlieng/name_pending",
|
||||||
"GoVersion": "go1.4",
|
"GoVersion": "go1.4",
|
||||||
|
"Packages": [
|
||||||
|
"./..."
|
||||||
|
],
|
||||||
"Deps": [
|
"Deps": [
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/blevesearch/bleve",
|
||||||
|
"Rev": "16f538d7b76dd85c935a3104c390307cae5cbf79"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/blevesearch/go-porterstemmer",
|
||||||
|
"Comment": "v1.0.1-9-g23a2c8e",
|
||||||
|
"Rev": "23a2c8e5cf1f380f27722c6d2ae8896431dc7d0e"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/blevesearch/segment",
|
||||||
|
"Rev": "9588637ce3caba8516208ccc17193ddedd741418"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"ImportPath": "github.com/boltdb/bolt",
|
"ImportPath": "github.com/boltdb/bolt",
|
||||||
"Comment": "v1.0-43-gcf33c9e",
|
"Comment": "v1.0-43-gcf33c9e",
|
||||||
"Rev": "cf33c9e0ca0a23509b8bb8edfc63e4776bb1a330"
|
"Rev": "cf33c9e0ca0a23509b8bb8edfc63e4776bb1a330"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/cznic/b",
|
||||||
|
"Rev": "c4adf3a58579a2d57cd3097f455dcdf75edcdfd8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/golang/protobuf/proto",
|
||||||
|
"Rev": "655cdfa588ea190e901bc5590e65d5621688847c"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"ImportPath": "github.com/julienschmidt/httprouter",
|
"ImportPath": "github.com/julienschmidt/httprouter",
|
||||||
"Rev": "b428fda53bb0a764fea9c76c9413512eda291dec"
|
"Rev": "b428fda53bb0a764fea9c76c9413512eda291dec"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/ryszard/goskiplist/skiplist",
|
||||||
|
"Rev": "2dfbae5fcf46374f166f8969cb07e167f1be6273"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/steveyen/gtreap",
|
||||||
|
"Rev": "72cd76f34c91f8d64a031af97b499e4a0b1a6e0c"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
|
||||||
|
"Rev": "4875955338b0a434238a31165cb87255ab6e9e4a"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/syndtr/gosnappy/snappy",
|
||||||
|
"Rev": "156a073208e131d7d2e212cb749feae7c339e846"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "github.com/willf/bitset",
|
||||||
|
"Comment": "v1.0.0-17-g4b22041",
|
||||||
|
"Rev": "4b220417a489359f934045d0509d941a7a2a1038"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"ImportPath": "golang.org/x/net/websocket",
|
"ImportPath": "golang.org/x/net/websocket",
|
||||||
"Rev": "3d87fd621ca9a824c5cff17216ce44769456cb3f"
|
"Rev": "3d87fd621ca9a824c5cff17216ce44769456cb3f"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "golang.org/x/text/transform",
|
||||||
|
"Rev": "c92eb3cd6e70951a111680995e651ea4b2c35539"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ImportPath": "golang.org/x/text/unicode/norm",
|
||||||
|
"Rev": "c92eb3cd6e70951a111680995e651ea4b2c35539"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
17
Godeps/_workspace/src/github.com/blevesearch/bleve/.gitignore
generated
vendored
Normal file
17
Godeps/_workspace/src/github.com/blevesearch/bleve/.gitignore
generated
vendored
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#*
|
||||||
|
*.sublime-*
|
||||||
|
*~
|
||||||
|
.#*
|
||||||
|
.project
|
||||||
|
.settings
|
||||||
|
.DS_Store
|
||||||
|
/analysis/token_filters/cld2/cld2-read-only
|
||||||
|
/analysis/token_filters/cld2/libcld2_full.a
|
||||||
|
/utils/bleve_create/bleve_create
|
||||||
|
/utils/bleve_dump/bleve_dump
|
||||||
|
/utils/bleve_index/bleve_index
|
||||||
|
/utils/bleve_bulkindex/bleve_bulkindex
|
||||||
|
/utils/bleve_index/index.bleve/
|
||||||
|
/utils/bleve_query/bleve_query
|
||||||
|
/utils/bleve_registry/bleve_registry
|
||||||
|
/y.output
|
19
Godeps/_workspace/src/github.com/blevesearch/bleve/.travis.yml
generated
vendored
Normal file
19
Godeps/_workspace/src/github.com/blevesearch/bleve/.travis.yml
generated
vendored
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
language: go
|
||||||
|
|
||||||
|
go:
|
||||||
|
- 1.4
|
||||||
|
|
||||||
|
script:
|
||||||
|
- go get golang.org/x/tools/cmd/vet
|
||||||
|
- go get golang.org/x/tools/cmd/cover
|
||||||
|
- go get github.com/mattn/goveralls
|
||||||
|
- go get github.com/kisielk/errcheck
|
||||||
|
- go test -v ./...
|
||||||
|
- go vet ./...
|
||||||
|
- errcheck ./...
|
||||||
|
- docs/project-code-coverage.sh
|
||||||
|
- docs/build_children.sh
|
||||||
|
|
||||||
|
notifications:
|
||||||
|
email:
|
||||||
|
- marty.schoch@gmail.com
|
202
Godeps/_workspace/src/github.com/blevesearch/bleve/LICENSE
generated
vendored
Normal file
202
Godeps/_workspace/src/github.com/blevesearch/bleve/LICENSE
generated
vendored
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
61
Godeps/_workspace/src/github.com/blevesearch/bleve/README.md
generated
vendored
Normal file
61
Godeps/_workspace/src/github.com/blevesearch/bleve/README.md
generated
vendored
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#  bleve
|
||||||
|
|
||||||
|
[](https://travis-ci.org/blevesearch/bleve) [](https://coveralls.io/r/blevesearch/bleve?branch=master) [](https://godoc.org/github.com/blevesearch/bleve) [](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||||
|
|
||||||
|
modern text indexing in go - [blevesearch.com](http://www.blevesearch.com/)
|
||||||
|
|
||||||
|
Try out bleve live by [searching our wiki](http://wikisearch.blevesearch.com/search/).
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
* Index any go data structure (including JSON)
|
||||||
|
* Intelligent defaults backed up by powerful configuration
|
||||||
|
* Supported field types:
|
||||||
|
* Text, Numeric, Date
|
||||||
|
* Supported query types:
|
||||||
|
* Term, Phrase, Match, Match Phrase, Prefix
|
||||||
|
* Conjunction, Disjunction, Boolean
|
||||||
|
* Numeric Range, Date Range
|
||||||
|
* Simple query [syntax](https://github.com/blevesearch/bleve/wiki/Query-String-Query) for human entry
|
||||||
|
* tf-idf Scoring
|
||||||
|
* Search result match highlighting
|
||||||
|
* Supports Aggregating Facets:
|
||||||
|
* Terms Facet
|
||||||
|
* Numeric Range Facet
|
||||||
|
* Date Range Facet
|
||||||
|
|
||||||
|
## Discussion
|
||||||
|
|
||||||
|
Discuss usage and development of bleve in the [google group](https://groups.google.com/forum/#!forum/bleve).
|
||||||
|
|
||||||
|
## Indexing
|
||||||
|
|
||||||
|
message := struct{
|
||||||
|
Id string
|
||||||
|
From string
|
||||||
|
Body string
|
||||||
|
}{
|
||||||
|
Id: "example",
|
||||||
|
From: "marty.schoch@gmail.com",
|
||||||
|
Body: "bleve indexing is easy",
|
||||||
|
}
|
||||||
|
|
||||||
|
mapping := bleve.NewIndexMapping()
|
||||||
|
index, err := bleve.New("example.bleve", mapping)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
index.Index(message.Id, message)
|
||||||
|
|
||||||
|
## Querying
|
||||||
|
|
||||||
|
index, _ := bleve.Open("example.bleve")
|
||||||
|
query := bleve.NewQueryStringQuery("bleve")
|
||||||
|
searchRequest := bleve.NewSearchRequest(query)
|
||||||
|
searchResult, _ := index.Search(searchRequest)
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Apache License Version 2.0
|
||||||
|
|
||||||
|
|
130
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer/custom_analyzer.go
generated
vendored
Normal file
130
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer/custom_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package standard_analyzer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "custom"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
|
||||||
|
var err error
|
||||||
|
var charFilters []analysis.CharFilter
|
||||||
|
charFiltersNames, ok := config["char_filters"].([]string)
|
||||||
|
if ok {
|
||||||
|
charFilters, err = getCharFilters(charFiltersNames, cache)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
charFiltersNamesInterfaceSlice, ok := config["char_filters"].([]interface{})
|
||||||
|
if ok {
|
||||||
|
charFiltersNames, err := convertInterfaceSliceToStringSlice(charFiltersNamesInterfaceSlice, "char filter")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
charFilters, err = getCharFilters(charFiltersNames, cache)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenizerName, ok := config["tokenizer"].(string)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("must specify tokenizer")
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenizer, err := cache.TokenizerNamed(tokenizerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var tokenFilters []analysis.TokenFilter
|
||||||
|
tokenFiltersNames, ok := config["token_filters"].([]string)
|
||||||
|
if ok {
|
||||||
|
tokenFilters, err = getTokenFilters(tokenFiltersNames, cache)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
tokenFiltersNamesInterfaceSlice, ok := config["token_filters"].([]interface{})
|
||||||
|
if ok {
|
||||||
|
tokenFiltersNames, err := convertInterfaceSliceToStringSlice(tokenFiltersNamesInterfaceSlice, "token filter")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tokenFilters, err = getTokenFilters(tokenFiltersNames, cache)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: tokenizer,
|
||||||
|
}
|
||||||
|
if charFilters != nil {
|
||||||
|
rv.CharFilters = charFilters
|
||||||
|
}
|
||||||
|
if tokenFilters != nil {
|
||||||
|
rv.TokenFilters = tokenFilters
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||||
|
}
|
||||||
|
|
||||||
|
func getCharFilters(charFilterNames []string, cache *registry.Cache) ([]analysis.CharFilter, error) {
|
||||||
|
charFilters := make([]analysis.CharFilter, len(charFilterNames))
|
||||||
|
for i, charFilterName := range charFilterNames {
|
||||||
|
charFilter, err := cache.CharFilterNamed(charFilterName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
charFilters[i] = charFilter
|
||||||
|
}
|
||||||
|
|
||||||
|
return charFilters, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getTokenFilters(tokenFilterNames []string, cache *registry.Cache) ([]analysis.TokenFilter, error) {
|
||||||
|
tokenFilters := make([]analysis.TokenFilter, len(tokenFilterNames))
|
||||||
|
for i, tokenFilterName := range tokenFilterNames {
|
||||||
|
tokenFilter, err := cache.TokenFilterNamed(tokenFilterName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tokenFilters[i] = tokenFilter
|
||||||
|
}
|
||||||
|
|
||||||
|
return tokenFilters, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func convertInterfaceSliceToStringSlice(interfaceSlice []interface{}, objType string) ([]string, error) {
|
||||||
|
stringSlice := make([]string, len(interfaceSlice))
|
||||||
|
for i, interfaceObj := range interfaceSlice {
|
||||||
|
stringObj, ok := interfaceObj.(string)
|
||||||
|
if ok {
|
||||||
|
stringSlice[i] = stringObj
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf(objType + " name must be a string")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return stringSlice, nil
|
||||||
|
}
|
49
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go
generated
vendored
Normal file
49
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build cld2 full
|
||||||
|
|
||||||
|
package detect_lang_analyzer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/cld2"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "detect_lang"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
detectLangFilter, err := cache.TokenFilterNamed(cld2.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: keywordTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
detectLangFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||||
|
}
|
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer/keyword_analyzer.go
generated
vendored
Normal file
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer/keyword_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package keyword_analyzer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "keyword"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: keywordTokenizer,
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||||
|
}
|
41
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/simple_analyzer/simple_analyzer.go
generated
vendored
Normal file
41
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/simple_analyzer/simple_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package simple_analyzer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "simple"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: tokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||||
|
}
|
47
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer/standard_analyzer.go
generated
vendored
Normal file
47
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer/standard_analyzer.go
generated
vendored
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package standard_analyzer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "standard"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopEnFilter, err := cache.TokenFilterNamed(en.StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: tokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopEnFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
|
||||||
|
}
|
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/ignore/ignore_byte_array_converter.go
generated
vendored
Normal file
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/ignore/ignore_byte_array_converter.go
generated
vendored
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ignore_byte_array_converter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
type IgnoreByteArrayConverter struct{}
|
||||||
|
|
||||||
|
func NewIgnoreByteArrayConverter() *IgnoreByteArrayConverter {
|
||||||
|
return &IgnoreByteArrayConverter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *IgnoreByteArrayConverter) Convert(in []byte) (interface{}, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
|
||||||
|
return NewIgnoreByteArrayConverter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterByteArrayConverter("ignore", Constructor)
|
||||||
|
}
|
40
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/json/json_byte_array_converter.go
generated
vendored
Normal file
40
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/json/json_byte_array_converter.go
generated
vendored
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package json_byte_array_converter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
type JSONByteArrayConverter struct{}
|
||||||
|
|
||||||
|
func NewJSONByteArrayConverter() *JSONByteArrayConverter {
|
||||||
|
return &JSONByteArrayConverter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *JSONByteArrayConverter) Convert(in []byte) (interface{}, error) {
|
||||||
|
var rv map[string]interface{}
|
||||||
|
err := json.Unmarshal(in, &rv)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
|
||||||
|
return NewJSONByteArrayConverter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterByteArrayConverter("json", Constructor)
|
||||||
|
}
|
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/string/string_byte_array_conveter.go
generated
vendored
Normal file
33
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/byte_array_converters/string/string_byte_array_conveter.go
generated
vendored
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package string_byte_array_converter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
type StringByteArrayConverter struct{}
|
||||||
|
|
||||||
|
func NewStringByteArrayConverter() *StringByteArrayConverter {
|
||||||
|
return &StringByteArrayConverter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *StringByteArrayConverter) Convert(in []byte) (interface{}, error) {
|
||||||
|
return string(in), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
|
||||||
|
return NewStringByteArrayConverter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterByteArrayConverter("string", Constructor)
|
||||||
|
}
|
31
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/html_char_filter/html_char_filter.go
generated
vendored
Normal file
31
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/html_char_filter/html_char_filter.go
generated
vendored
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package html_char_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "html"
|
||||||
|
|
||||||
|
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
|
||||||
|
|
||||||
|
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
|
||||||
|
replaceBytes := []byte(" ")
|
||||||
|
return regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, replaceBytes), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterCharFilter(Name, CharFilterConstructor)
|
||||||
|
}
|
58
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter/regexp_char_filter.go
generated
vendored
Normal file
58
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter/regexp_char_filter.go
generated
vendored
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package regexp_char_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "regexp"
|
||||||
|
|
||||||
|
type RegexpCharFilter struct {
|
||||||
|
r *regexp.Regexp
|
||||||
|
replacement []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter {
|
||||||
|
return &RegexpCharFilter{
|
||||||
|
r: r,
|
||||||
|
replacement: replacement,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *RegexpCharFilter) Filter(input []byte) []byte {
|
||||||
|
return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) })
|
||||||
|
}
|
||||||
|
|
||||||
|
func RegexpCharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
|
||||||
|
regexpStr, ok := config["regexp"].(string)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("must specify regexp")
|
||||||
|
}
|
||||||
|
r, err := regexp.Compile(regexpStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to build regexp char filter: %v", err)
|
||||||
|
}
|
||||||
|
replaceBytes := []byte(" ")
|
||||||
|
replaceStr, ok := config["replace"].(string)
|
||||||
|
if ok {
|
||||||
|
replaceBytes = []byte(replaceStr)
|
||||||
|
}
|
||||||
|
return NewRegexpCharFilter(r, replaceBytes), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterCharFilter(Name, RegexpCharFilterConstructor)
|
||||||
|
}
|
82
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter/regexp_char_filter_test.go
generated
vendored
Normal file
82
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter/regexp_char_filter_test.go
generated
vendored
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package regexp_char_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"regexp"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRegexpCharFilter(t *testing.T) {
|
||||||
|
|
||||||
|
htmlTagPattern := `</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`
|
||||||
|
htmlRegex := regexp.MustCompile(htmlTagPattern)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output []byte
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: []byte(`<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>My First Heading</h1>
|
||||||
|
|
||||||
|
<p>My first paragraph.</p>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>`),
|
||||||
|
output: []byte(`
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
My First Heading
|
||||||
|
|
||||||
|
My first paragraph.
|
||||||
|
|
||||||
|
|
||||||
|
`),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
filter := NewRegexpCharFilter(htmlRegex, []byte{' '})
|
||||||
|
output := filter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(output, test.output) {
|
||||||
|
t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestZeroWidthNonJoinerCharFilter(t *testing.T) {
|
||||||
|
|
||||||
|
zeroWidthNonJoinerPattern := `\x{200C}`
|
||||||
|
zeroWidthNonJoinerRegex := regexp.MustCompile(zeroWidthNonJoinerPattern)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output []byte
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: []byte("water\u200Cunder\u200Cthe\u200Cbridge"),
|
||||||
|
output: []byte("water under the bridge"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
filter := NewRegexpCharFilter(zeroWidthNonJoinerRegex, []byte{' '})
|
||||||
|
output := filter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(output, test.output) {
|
||||||
|
t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
31
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go
generated
vendored
Normal file
31
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go
generated
vendored
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package zero_width_non_joiner
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "zero_width_spaces"
|
||||||
|
|
||||||
|
var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`)
|
||||||
|
|
||||||
|
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
|
||||||
|
replaceBytes := []byte(" ")
|
||||||
|
return regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, replaceBytes), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterCharFilter(Name, CharFilterConstructor)
|
||||||
|
}
|
40
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/datetime_optional/datetime_optional.go
generated
vendored
Normal file
40
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/datetime_optional/datetime_optional.go
generated
vendored
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package html_char_filter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "dateTimeOptional"
|
||||||
|
|
||||||
|
const rfc3339NoTimezone = "2006-01-02T15:04:05"
|
||||||
|
const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05"
|
||||||
|
const rfc3339NoTime = "2006-01-02"
|
||||||
|
|
||||||
|
var layouts = []string{
|
||||||
|
time.RFC3339Nano,
|
||||||
|
time.RFC3339,
|
||||||
|
rfc3339NoTimezone,
|
||||||
|
rfc3339NoTimezoneNoT,
|
||||||
|
rfc3339NoTime,
|
||||||
|
}
|
||||||
|
|
||||||
|
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
|
||||||
|
return flexible_go.NewFlexibleGoDateTimeParser(layouts), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
|
||||||
|
}
|
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go/flexible_go.go
generated
vendored
Normal file
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go/flexible_go.go
generated
vendored
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package flexible_go
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const Name = "flexiblego"
|
||||||
|
|
||||||
|
type FlexibleGoDateTimeParser struct {
|
||||||
|
layouts []string
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewFlexibleGoDateTimeParser(layouts []string) *FlexibleGoDateTimeParser {
|
||||||
|
return &FlexibleGoDateTimeParser{
|
||||||
|
layouts: layouts,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *FlexibleGoDateTimeParser) ParseDateTime(input string) (time.Time, error) {
|
||||||
|
for _, layout := range p.layouts {
|
||||||
|
rv, err := time.Parse(layout, input)
|
||||||
|
if err == nil {
|
||||||
|
return rv, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return time.Time{}, analysis.ErrInvalidDateTime
|
||||||
|
}
|
||||||
|
|
||||||
|
func FlexibleGoDateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
|
||||||
|
layouts, ok := config["layouts"].([]interface{})
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("must specify layouts")
|
||||||
|
}
|
||||||
|
layoutStrs := make([]string, 0)
|
||||||
|
for _, layout := range layouts {
|
||||||
|
layoutStr, ok := layout.(string)
|
||||||
|
if ok {
|
||||||
|
layoutStrs = append(layoutStrs, layoutStr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NewFlexibleGoDateTimeParser(layoutStrs), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterDateTimeParser(Name, FlexibleGoDateTimeParserConstructor)
|
||||||
|
}
|
84
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go/flexible_go_test.go
generated
vendored
Normal file
84
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go/flexible_go_test.go
generated
vendored
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package flexible_go
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFlexibleDateTimeParser(t *testing.T) {
|
||||||
|
testLocation := time.FixedZone("", -8*60*60)
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
input string
|
||||||
|
expectedTime time.Time
|
||||||
|
expectedError error
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: "2014-08-03",
|
||||||
|
expectedTime: time.Date(2014, 8, 3, 0, 0, 0, 0, time.UTC),
|
||||||
|
expectedError: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "2014-08-03T15:59:30",
|
||||||
|
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, time.UTC),
|
||||||
|
expectedError: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "2014-08-03 15:59:30",
|
||||||
|
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, time.UTC),
|
||||||
|
expectedError: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "2014-08-03T15:59:30-08:00",
|
||||||
|
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, testLocation),
|
||||||
|
expectedError: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "2014-08-03T15:59:30.999999999-08:00",
|
||||||
|
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 999999999, testLocation),
|
||||||
|
expectedError: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: "not a date time",
|
||||||
|
expectedTime: time.Time{},
|
||||||
|
expectedError: analysis.ErrInvalidDateTime,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
rfc3339NoTimezone := "2006-01-02T15:04:05"
|
||||||
|
rfc3339NoTimezoneNoT := "2006-01-02 15:04:05"
|
||||||
|
rfc3339NoTime := "2006-01-02"
|
||||||
|
|
||||||
|
dateOptionalTimeParser := NewFlexibleGoDateTimeParser(
|
||||||
|
[]string{
|
||||||
|
time.RFC3339Nano,
|
||||||
|
time.RFC3339,
|
||||||
|
rfc3339NoTimezone,
|
||||||
|
rfc3339NoTimezoneNoT,
|
||||||
|
rfc3339NoTime,
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
actualTime, actualErr := dateOptionalTimeParser.ParseDateTime(test.input)
|
||||||
|
if actualErr != test.expectedError {
|
||||||
|
t.Errorf("expected error %#v, got %#v", test.expectedError, actualErr)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(actualTime, test.expectedTime) {
|
||||||
|
t.Errorf("expected time %#v, got %#v", test.expectedTime, actualTime)
|
||||||
|
t.Errorf("expected location %#v,\n got %#v", test.expectedTime.Location(), actualTime.Location())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
88
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/freq.go
generated
vendored
Normal file
88
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/freq.go
generated
vendored
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package analysis
|
||||||
|
|
||||||
|
type TokenLocation struct {
|
||||||
|
Field string
|
||||||
|
Start int
|
||||||
|
End int
|
||||||
|
Position int
|
||||||
|
}
|
||||||
|
|
||||||
|
type TokenFreq struct {
|
||||||
|
Term []byte
|
||||||
|
Locations []*TokenLocation
|
||||||
|
}
|
||||||
|
|
||||||
|
type TokenFrequencies []*TokenFreq
|
||||||
|
|
||||||
|
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) TokenFrequencies {
|
||||||
|
// put existing tokens into a map
|
||||||
|
index := make(map[string]*TokenFreq)
|
||||||
|
for _, tf := range tfs {
|
||||||
|
index[string(tf.Term)] = tf
|
||||||
|
}
|
||||||
|
// walk the new token frequencies
|
||||||
|
for _, tf := range other {
|
||||||
|
// set the remoteField value in incoming token freqs
|
||||||
|
for _, l := range tf.Locations {
|
||||||
|
l.Field = remoteField
|
||||||
|
}
|
||||||
|
existingTf, exists := index[string(tf.Term)]
|
||||||
|
if exists {
|
||||||
|
existingTf.Locations = append(existingTf.Locations, tf.Locations...)
|
||||||
|
} else {
|
||||||
|
index[string(tf.Term)] = tf
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// flatten map back to array
|
||||||
|
rv := make(TokenFrequencies, len(index))
|
||||||
|
i := 0
|
||||||
|
for _, tf := range index {
|
||||||
|
rv[i] = tf
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func TokenFrequency(tokens TokenStream) TokenFrequencies {
|
||||||
|
index := make(map[string]*TokenFreq)
|
||||||
|
|
||||||
|
for _, token := range tokens {
|
||||||
|
curr, ok := index[string(token.Term)]
|
||||||
|
if ok {
|
||||||
|
curr.Locations = append(curr.Locations, &TokenLocation{
|
||||||
|
Start: token.Start,
|
||||||
|
End: token.End,
|
||||||
|
Position: token.Position,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
index[string(token.Term)] = &TokenFreq{
|
||||||
|
Term: token.Term,
|
||||||
|
Locations: []*TokenLocation{
|
||||||
|
&TokenLocation{
|
||||||
|
Start: token.Start,
|
||||||
|
End: token.End,
|
||||||
|
Position: token.Position,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
rv := make(TokenFrequencies, len(index))
|
||||||
|
i := 0
|
||||||
|
for _, tf := range index {
|
||||||
|
rv[i] = tf
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
167
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/freq_test.go
generated
vendored
Normal file
167
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/freq_test.go
generated
vendored
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package analysis
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestTokenFrequency verifies that two occurrences of the same term are
// collapsed into a single TokenFreq carrying both locations in order.
func TestTokenFrequency(t *testing.T) {
	tokens := TokenStream{
		&Token{Term: []byte("water"), Position: 1, Start: 0, End: 5},
		&Token{Term: []byte("water"), Position: 2, Start: 6, End: 11},
	}
	expectedResult := TokenFrequencies{
		&TokenFreq{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				&TokenLocation{Position: 1, Start: 0, End: 5},
				&TokenLocation{Position: 2, Start: 6, End: 11},
			},
		},
	}
	result := TokenFrequency(tokens)
	if !reflect.DeepEqual(result, expectedResult) {
		t.Errorf("expected %#v, got %#v", expectedResult, result)
	}
}
|
||||||
|
|
||||||
|
// TestTokenFrequenciesMergeAll checks that merging a second frequency set
// appends its locations, tagged with the supplied field name ("tf2"), onto
// the matching term of the receiver, which is modified in place.
func TestTokenFrequenciesMergeAll(t *testing.T) {
	tf1 := TokenFrequencies{
		&TokenFreq{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				&TokenLocation{Position: 1, Start: 0, End: 5},
				&TokenLocation{Position: 2, Start: 6, End: 11},
			},
		},
	}
	tf2 := TokenFrequencies{
		&TokenFreq{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				&TokenLocation{Position: 1, Start: 0, End: 5},
				&TokenLocation{Position: 2, Start: 6, End: 11},
			},
		},
	}
	// Locations coming from tf2 acquire Field: "tf2"; tf1's own locations
	// keep an empty Field.
	expectedResult := TokenFrequencies{
		&TokenFreq{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				&TokenLocation{Position: 1, Start: 0, End: 5},
				&TokenLocation{Position: 2, Start: 6, End: 11},
				&TokenLocation{Field: "tf2", Position: 1, Start: 0, End: 5},
				&TokenLocation{Field: "tf2", Position: 2, Start: 6, End: 11},
			},
		},
	}
	tf1.MergeAll("tf2", tf2)
	if !reflect.DeepEqual(tf1, expectedResult) {
		t.Errorf("expected %#v, got %#v", expectedResult, tf1)
	}
}
|
||||||
|
|
||||||
|
// TestTokenFrequenciesMergeAllLeftEmpty covers merging into an empty
// receiver: the result (taken from MergeAll's return value here) should
// contain tf2's single term with all locations tagged with the field name.
func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) {
	tf1 := TokenFrequencies{}
	tf2 := TokenFrequencies{
		&TokenFreq{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				&TokenLocation{Position: 1, Start: 0, End: 5},
				&TokenLocation{Position: 2, Start: 6, End: 11},
			},
		},
	}
	expectedResult := TokenFrequencies{
		&TokenFreq{
			Term: []byte("water"),
			Locations: []*TokenLocation{
				&TokenLocation{Field: "tf2", Position: 1, Start: 0, End: 5},
				&TokenLocation{Field: "tf2", Position: 2, Start: 6, End: 11},
			},
		},
	}
	result := tf1.MergeAll("tf2", tf2)
	if !reflect.DeepEqual(result, expectedResult) {
		t.Errorf("expected %#v, got %#v", expectedResult, result)
	}
}
|
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar.go
generated
vendored
Normal file
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar.go
generated
vendored
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ar
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AnalyzerName is the registry key under which the Arabic analyzer is registered.
const AnalyzerName = "ar"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
|
||||||
|
stopArFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
normalizeArFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerArFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: tokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
normalizeFilter,
|
||||||
|
stopArFilter,
|
||||||
|
normalizeArFilter,
|
||||||
|
stemmerArFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// init registers the Arabic analyzer constructor under AnalyzerName at
// package load time.
func init() {
	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}
|
179
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar_test.go
generated
vendored
Normal file
179
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/analyzer_ar_test.go
generated
vendored
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ar
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestArabicAnalyzer runs the registered "ar" analyzer over a table of
// Arabic inputs and compares the full token stream (terms, positions and
// byte offsets — note offsets count UTF-8 bytes, not runes) against
// expectations covering stemming, stop words and normalization.
func TestArabicAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("كبير"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("كبير"), Position: 1, Start: 0, End: 8},
			},
		},
		// feminine marker
		{
			input: []byte("كبيرة"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("كبير"), Position: 1, Start: 0, End: 10},
			},
		},
		{
			input: []byte("مشروب"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("مشروب"), Position: 1, Start: 0, End: 10},
			},
		},
		// plural -at
		{
			input: []byte("مشروبات"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("مشروب"), Position: 1, Start: 0, End: 14},
			},
		},
		// plural -in
		{
			input: []byte("أمريكيين"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("امريك"), Position: 1, Start: 0, End: 16},
			},
		},
		// singular with bare alif
		{
			input: []byte("امريكي"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("امريك"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			input: []byte("كتاب"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("كتاب"), Position: 1, Start: 0, End: 8},
			},
		},
		// definite article
		{
			input: []byte("الكتاب"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("كتاب"), Position: 1, Start: 0, End: 12},
			},
		},
		{
			input: []byte("ما ملكت أيمانكم"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("ملكت"), Position: 2, Start: 5, End: 13},
				&analysis.Token{Term: []byte("ايمانكم"), Position: 3, Start: 14, End: 28},
			},
		},
		// stopwords
		{
			input: []byte("الذين ملكت أيمانكم"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("ملكت"), Position: 2, Start: 11, End: 19},
				&analysis.Token{Term: []byte("ايمانكم"), Position: 3, Start: 20, End: 34},
			},
		},
		// presentation form normalization
		{
			input: []byte("ﺍﻟﺴﻼﻢ"),
			output: analysis.TokenStream{
				&analysis.Token{Term: []byte("سلام"), Position: 1, Start: 0, End: 15},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
			// Also dump the first terms as hex bytes to make encoding
			// differences visible.
			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
		}
	}
}
|
80
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize.go
generated
vendored
Normal file
80
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize.go
generated
vendored
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ar
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NormalizeName is the registry key for the Arabic normalization token filter.
const NormalizeName = "normalize_ar"
|
||||||
|
|
||||||
|
// Unicode code points handled by the normalizer. The letter variants
// (alef forms, dotless yeh, teh marbuta) are folded to a canonical letter
// by normalize; tatweel and the diacritic marks (Fathatan through Sukun)
// are removed entirely.
const (
	Alef           = '\u0627'
	AlefMadda      = '\u0622'
	AlefHamzaAbove = '\u0623'
	AlefHamzaBelow = '\u0625'
	Yeh            = '\u064A'
	DotlessYeh     = '\u0649'
	TehMarbuta     = '\u0629'
	Heh            = '\u0647'
	Tatweel        = '\u0640'
	Fathatan       = '\u064B'
	Dammatan       = '\u064C'
	Kasratan       = '\u064D'
	Fatha          = '\u064E'
	Damma          = '\u064F'
	Kasra          = '\u0650'
	Shadda         = '\u0651'
	Sukun          = '\u0652'
)
|
||||||
|
|
||||||
|
// ArabicNormalizeFilter folds Arabic orthographic variants to canonical
// letters and strips diacritics from each token's term. It is stateless,
// so a single instance may be shared.
type ArabicNormalizeFilter struct{}

// NewArabicNormalizeFilter returns a new normalization filter.
func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
	return &ArabicNormalizeFilter{}
}
|
||||||
|
|
||||||
|
func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
term := normalize(token.Term)
|
||||||
|
token.Term = term
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalize(input []byte) []byte {
|
||||||
|
runes := bytes.Runes(input)
|
||||||
|
for i := 0; i < len(runes); i++ {
|
||||||
|
switch runes[i] {
|
||||||
|
case AlefMadda, AlefHamzaAbove, AlefHamzaBelow:
|
||||||
|
runes[i] = Alef
|
||||||
|
case DotlessYeh:
|
||||||
|
runes[i] = Yeh
|
||||||
|
case TehMarbuta:
|
||||||
|
runes[i] = Heh
|
||||||
|
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
|
||||||
|
runes = analysis.DeleteRune(runes, i)
|
||||||
|
i--
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return analysis.BuildTermFromRunes(runes)
|
||||||
|
}
|
||||||
|
|
||||||
|
// NormalizerFilterConstructor builds the filter for the registry. The
// config map and cache are unused since the filter takes no options.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewArabicNormalizeFilter(), nil
}
|
||||||
|
|
||||||
|
// init registers the normalization filter constructor under NormalizeName
// at package load time.
func init() {
	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}
|
229
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize_test.go
generated
vendored
Normal file
229
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/arabic_normalize_test.go
generated
vendored
Normal file
@ -0,0 +1,229 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ar
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestArabicNormalizeFilter exercises each folding/stripping rule of the
// normalizer in isolation via single-token streams, including the empty
// term edge case.
func TestArabicNormalizeFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// AlifMadda
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("آجن")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("اجن")}},
		},
		// AlifHamzaAbove
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("أحمد")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("احمد")}},
		},
		// AlifHamzaBelow
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("إعاذ")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("اعاذ")}},
		},
		// AlifMaksura
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("بنى")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("بني")}},
		},
		// TehMarbuta
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("فاطمة")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("فاطمه")}},
		},
		// Tatweel
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("روبرـــــت")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("روبرت")}},
		},
		// Fatha
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("مَبنا")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("مبنا")}},
		},
		// Kasra
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("علِي")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("علي")}},
		},
		// Damma
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("بُوات")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("بوات")}},
		},
		// Fathatan
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("ولداً")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("ولدا")}},
		},
		// Kasratan
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("ولدٍ")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("ولد")}},
		},
		// Dammatan
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("ولدٌ")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("ولد")}},
		},
		// Sukun
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("نلْسون")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("نلسون")}},
		},
		// Shaddah
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("هتميّ")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("هتمي")}},
		},
		// empty
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("")}},
		},
	}

	arabicNormalizeFilter := NewArabicNormalizeFilter()
	for _, test := range tests {
		actual := arabicNormalizeFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			// Hex dump of the first terms makes byte-level differences visible.
			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
		}
	}
}
|
113
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar.go
generated
vendored
Normal file
113
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar.go
generated
vendored
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ar
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StemmerName is the registry key for the Arabic stemmer token filter.
const StemmerName = "stemmer_ar"

// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer

// prefixes lists the strippable prefixes, tried in this order; stem
// removes at most one of them.
var prefixes = [][]rune{
	[]rune("ال"),
	[]rune("وال"),
	[]rune("بال"),
	[]rune("كال"),
	[]rune("فال"),
	[]rune("لل"),
	[]rune("و"),
}

// suffixes lists the strippable suffixes; stem may remove several, in
// this order.
var suffixes = [][]rune{
	[]rune("ها"),
	[]rune("ان"),
	[]rune("ات"),
	[]rune("ون"),
	[]rune("ين"),
	[]rune("يه"),
	[]rune("ية"),
	[]rune("ه"),
	[]rune("ة"),
	[]rune("ي"),
}
|
||||||
|
|
||||||
|
// ArabicStemmerFilter applies prefix/suffix stripping to each token's
// term. It is stateless, so a single instance may be shared.
type ArabicStemmerFilter struct{}

// NewArabicStemmerFilter returns a new stemmer filter.
func NewArabicStemmerFilter() *ArabicStemmerFilter {
	return &ArabicStemmerFilter{}
}
|
||||||
|
|
||||||
|
func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
term := stem(token.Term)
|
||||||
|
token.Term = term
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
// canStemPrefix reports whether prefix can be stripped from input. The
// single-rune waw prefix is only strippable from words of at least four
// runes; every other prefix must leave at least two runes behind, and the
// prefix runes must match the start of input exactly.
func canStemPrefix(input, prefix []rune) bool {
	switch {
	case len(prefix) == 1 && len(input) < 4:
		// Wa- prefix requires at least 3 characters after stripping.
		return false
	case len(input)-len(prefix) < 2:
		// Other prefixes require only 2.
		return false
	}
	for i, r := range prefix {
		if input[i] != r {
			return false
		}
	}
	return true
}
|
||||||
|
|
||||||
|
// canStemSuffix reports whether suffix can be stripped from input:
// stripping must leave a stem of at least two runes, and the suffix runes
// must match the end of input exactly.
func canStemSuffix(input, suffix []rune) bool {
	stemEnd := len(input) - len(suffix)
	if stemEnd < 2 {
		return false
	}
	for i, r := range suffix {
		if input[stemEnd+i] != r {
			return false
		}
	}
	return true
}
|
||||||
|
|
||||||
|
func stem(input []byte) []byte {
|
||||||
|
runes := bytes.Runes(input)
|
||||||
|
// Strip a single prefix.
|
||||||
|
for _, p := range prefixes {
|
||||||
|
if canStemPrefix(runes, p) {
|
||||||
|
runes = runes[len(p):]
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Strip off multiple suffixes, in their order in the suffixes array.
|
||||||
|
for _, s := range suffixes {
|
||||||
|
if canStemSuffix(runes, s) {
|
||||||
|
runes = runes[:len(runes)-len(s)]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return analysis.BuildTermFromRunes(runes)
|
||||||
|
}
|
||||||
|
|
||||||
|
// StemmerFilterConstructor builds the stemmer filter for the registry.
// The config map and cache are unused since the filter takes no options.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewArabicStemmerFilter(), nil
}
|
||||||
|
|
||||||
|
// init registers the stemmer filter constructor under StemmerName at
// package load time.
func init() {
	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}
|
392
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar_test.go
generated
vendored
Normal file
392
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stemmer_ar_test.go
generated
vendored
Normal file
@ -0,0 +1,392 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ar
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestArabicStemmerFilter(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// AlPrefix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("الحسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("حسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// WalPrefix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("والحسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("حسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// BalPrefix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("بالحسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("حسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// KalPrefix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("كالحسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("حسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// FalPrefix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("فالحسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("حسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// LlPrefix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("للاخر"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("اخر"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// WaPrefix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("وحسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("حسن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// AhSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("زوجها"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("زوج"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// AnSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهدان"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// AtSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهدات"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// WnSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهدون"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// YnSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهدين"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// YhSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهديه"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// YpSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهدية"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// HSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// PSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهدة"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// YSuffix
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهدي"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// ComboPrefSuf
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("وساهدون"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// ComboSuf
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهدهات"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ساهد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// ShouldntStem
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("الو"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("الو"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// NonArabic
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("English"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("English"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("سلام"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("سلام"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("السلام"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("سلام"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("سلامة"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("سلام"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("السلامة"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("سلام"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("الوصل"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("وصل"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("والصل"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("صل"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Empty
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
arabicStemmerFilter := NewArabicStemmerFilter()
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := arabicStemmerFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||||
|
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stop_filter_ar.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stop_filter_ar.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ar
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
149
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stop_words_ar.go
generated
vendored
Normal file
149
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar/stop_words_ar.go
generated
vendored
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
package ar
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_ar"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
|
||||||
|
# This means that when modifying this list, you might need to add some
|
||||||
|
# redundant entries, for example containing forms with both أ and ا
|
||||||
|
من
|
||||||
|
ومن
|
||||||
|
منها
|
||||||
|
منه
|
||||||
|
في
|
||||||
|
وفي
|
||||||
|
فيها
|
||||||
|
فيه
|
||||||
|
و
|
||||||
|
ف
|
||||||
|
ثم
|
||||||
|
او
|
||||||
|
أو
|
||||||
|
ب
|
||||||
|
بها
|
||||||
|
به
|
||||||
|
ا
|
||||||
|
أ
|
||||||
|
اى
|
||||||
|
اي
|
||||||
|
أي
|
||||||
|
أى
|
||||||
|
لا
|
||||||
|
ولا
|
||||||
|
الا
|
||||||
|
ألا
|
||||||
|
إلا
|
||||||
|
لكن
|
||||||
|
ما
|
||||||
|
وما
|
||||||
|
كما
|
||||||
|
فما
|
||||||
|
عن
|
||||||
|
مع
|
||||||
|
اذا
|
||||||
|
إذا
|
||||||
|
ان
|
||||||
|
أن
|
||||||
|
إن
|
||||||
|
انها
|
||||||
|
أنها
|
||||||
|
إنها
|
||||||
|
انه
|
||||||
|
أنه
|
||||||
|
إنه
|
||||||
|
بان
|
||||||
|
بأن
|
||||||
|
فان
|
||||||
|
فأن
|
||||||
|
وان
|
||||||
|
وأن
|
||||||
|
وإن
|
||||||
|
التى
|
||||||
|
التي
|
||||||
|
الذى
|
||||||
|
الذي
|
||||||
|
الذين
|
||||||
|
الى
|
||||||
|
الي
|
||||||
|
إلى
|
||||||
|
إلي
|
||||||
|
على
|
||||||
|
عليها
|
||||||
|
عليه
|
||||||
|
اما
|
||||||
|
أما
|
||||||
|
إما
|
||||||
|
ايضا
|
||||||
|
أيضا
|
||||||
|
كل
|
||||||
|
وكل
|
||||||
|
لم
|
||||||
|
ولم
|
||||||
|
لن
|
||||||
|
ولن
|
||||||
|
هى
|
||||||
|
هي
|
||||||
|
هو
|
||||||
|
وهى
|
||||||
|
وهي
|
||||||
|
وهو
|
||||||
|
فهى
|
||||||
|
فهي
|
||||||
|
فهو
|
||||||
|
انت
|
||||||
|
أنت
|
||||||
|
لك
|
||||||
|
لها
|
||||||
|
له
|
||||||
|
هذه
|
||||||
|
هذا
|
||||||
|
تلك
|
||||||
|
ذلك
|
||||||
|
هناك
|
||||||
|
كانت
|
||||||
|
كان
|
||||||
|
يكون
|
||||||
|
تكون
|
||||||
|
وكانت
|
||||||
|
وكان
|
||||||
|
غير
|
||||||
|
بعض
|
||||||
|
قد
|
||||||
|
نحو
|
||||||
|
بين
|
||||||
|
بينما
|
||||||
|
منذ
|
||||||
|
ضمن
|
||||||
|
حيث
|
||||||
|
الان
|
||||||
|
الآن
|
||||||
|
خلال
|
||||||
|
بعد
|
||||||
|
قبل
|
||||||
|
حتى
|
||||||
|
عند
|
||||||
|
عندما
|
||||||
|
لدى
|
||||||
|
جميع
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(ArabicStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/bg/stop_filter_bg.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/bg/stop_filter_bg.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package bg
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
217
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/bg/stop_words_bg.go
generated
vendored
Normal file
217
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/bg/stop_words_bg.go
generated
vendored
Normal file
@ -0,0 +1,217 @@
|
|||||||
|
package bg
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_bg"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
а
|
||||||
|
аз
|
||||||
|
ако
|
||||||
|
ала
|
||||||
|
бе
|
||||||
|
без
|
||||||
|
беше
|
||||||
|
би
|
||||||
|
бил
|
||||||
|
била
|
||||||
|
били
|
||||||
|
било
|
||||||
|
близо
|
||||||
|
бъдат
|
||||||
|
бъде
|
||||||
|
бяха
|
||||||
|
в
|
||||||
|
вас
|
||||||
|
ваш
|
||||||
|
ваша
|
||||||
|
вероятно
|
||||||
|
вече
|
||||||
|
взема
|
||||||
|
ви
|
||||||
|
вие
|
||||||
|
винаги
|
||||||
|
все
|
||||||
|
всеки
|
||||||
|
всички
|
||||||
|
всичко
|
||||||
|
всяка
|
||||||
|
във
|
||||||
|
въпреки
|
||||||
|
върху
|
||||||
|
г
|
||||||
|
ги
|
||||||
|
главно
|
||||||
|
го
|
||||||
|
д
|
||||||
|
да
|
||||||
|
дали
|
||||||
|
до
|
||||||
|
докато
|
||||||
|
докога
|
||||||
|
дори
|
||||||
|
досега
|
||||||
|
доста
|
||||||
|
е
|
||||||
|
едва
|
||||||
|
един
|
||||||
|
ето
|
||||||
|
за
|
||||||
|
зад
|
||||||
|
заедно
|
||||||
|
заради
|
||||||
|
засега
|
||||||
|
затова
|
||||||
|
защо
|
||||||
|
защото
|
||||||
|
и
|
||||||
|
из
|
||||||
|
или
|
||||||
|
им
|
||||||
|
има
|
||||||
|
имат
|
||||||
|
иска
|
||||||
|
й
|
||||||
|
каза
|
||||||
|
как
|
||||||
|
каква
|
||||||
|
какво
|
||||||
|
както
|
||||||
|
какъв
|
||||||
|
като
|
||||||
|
кога
|
||||||
|
когато
|
||||||
|
което
|
||||||
|
които
|
||||||
|
кой
|
||||||
|
който
|
||||||
|
колко
|
||||||
|
която
|
||||||
|
къде
|
||||||
|
където
|
||||||
|
към
|
||||||
|
ли
|
||||||
|
м
|
||||||
|
ме
|
||||||
|
между
|
||||||
|
мен
|
||||||
|
ми
|
||||||
|
мнозина
|
||||||
|
мога
|
||||||
|
могат
|
||||||
|
може
|
||||||
|
моля
|
||||||
|
момента
|
||||||
|
му
|
||||||
|
н
|
||||||
|
на
|
||||||
|
над
|
||||||
|
назад
|
||||||
|
най
|
||||||
|
направи
|
||||||
|
напред
|
||||||
|
например
|
||||||
|
нас
|
||||||
|
не
|
||||||
|
него
|
||||||
|
нея
|
||||||
|
ни
|
||||||
|
ние
|
||||||
|
никой
|
||||||
|
нито
|
||||||
|
но
|
||||||
|
някои
|
||||||
|
някой
|
||||||
|
няма
|
||||||
|
обаче
|
||||||
|
около
|
||||||
|
освен
|
||||||
|
особено
|
||||||
|
от
|
||||||
|
отгоре
|
||||||
|
отново
|
||||||
|
още
|
||||||
|
пак
|
||||||
|
по
|
||||||
|
повече
|
||||||
|
повечето
|
||||||
|
под
|
||||||
|
поне
|
||||||
|
поради
|
||||||
|
после
|
||||||
|
почти
|
||||||
|
прави
|
||||||
|
пред
|
||||||
|
преди
|
||||||
|
през
|
||||||
|
при
|
||||||
|
пък
|
||||||
|
първо
|
||||||
|
с
|
||||||
|
са
|
||||||
|
само
|
||||||
|
се
|
||||||
|
сега
|
||||||
|
си
|
||||||
|
скоро
|
||||||
|
след
|
||||||
|
сме
|
||||||
|
според
|
||||||
|
сред
|
||||||
|
срещу
|
||||||
|
сте
|
||||||
|
съм
|
||||||
|
със
|
||||||
|
също
|
||||||
|
т
|
||||||
|
тази
|
||||||
|
така
|
||||||
|
такива
|
||||||
|
такъв
|
||||||
|
там
|
||||||
|
твой
|
||||||
|
те
|
||||||
|
тези
|
||||||
|
ти
|
||||||
|
тн
|
||||||
|
то
|
||||||
|
това
|
||||||
|
тогава
|
||||||
|
този
|
||||||
|
той
|
||||||
|
толкова
|
||||||
|
точно
|
||||||
|
трябва
|
||||||
|
тук
|
||||||
|
тъй
|
||||||
|
тя
|
||||||
|
тях
|
||||||
|
у
|
||||||
|
харесва
|
||||||
|
ч
|
||||||
|
че
|
||||||
|
често
|
||||||
|
чрез
|
||||||
|
ще
|
||||||
|
щом
|
||||||
|
я
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(BulgarianStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
30
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/articles_ca.go
generated
vendored
Normal file
30
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/articles_ca.go
generated
vendored
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
package ca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const ArticlesName = "articles_ca"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||||
|
|
||||||
|
var CatalanArticles = []byte(`
|
||||||
|
d
|
||||||
|
l
|
||||||
|
m
|
||||||
|
n
|
||||||
|
s
|
||||||
|
t
|
||||||
|
`)
|
||||||
|
|
||||||
|
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(CatalanArticles)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
|
||||||
|
}
|
32
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/elision_ca.go
generated
vendored
Normal file
32
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/elision_ca.go
generated
vendored
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const ElisionName = "elision_ca"
|
||||||
|
|
||||||
|
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error building elision filter: %v", err)
|
||||||
|
}
|
||||||
|
return elision_filter.NewElisionFilter(articlesTokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
|
||||||
|
}
|
56
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/elision_ca_test.go
generated
vendored
Normal file
56
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/elision_ca_test.go
generated
vendored
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFrenchElision(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("l'Institut"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("d'Estudis"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("Institut"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("Estudis"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := elisionFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/stop_filter_ca.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/stop_filter_ca.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
244
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/stop_words_ca.go
generated
vendored
Normal file
244
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ca/stop_words_ca.go
generated
vendored
Normal file
@ -0,0 +1,244 @@
|
|||||||
|
package ca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_ca"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var CatalanStopWords = []byte(`# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
|
||||||
|
a
|
||||||
|
abans
|
||||||
|
ací
|
||||||
|
ah
|
||||||
|
així
|
||||||
|
això
|
||||||
|
al
|
||||||
|
als
|
||||||
|
aleshores
|
||||||
|
algun
|
||||||
|
alguna
|
||||||
|
algunes
|
||||||
|
alguns
|
||||||
|
alhora
|
||||||
|
allà
|
||||||
|
allí
|
||||||
|
allò
|
||||||
|
altra
|
||||||
|
altre
|
||||||
|
altres
|
||||||
|
amb
|
||||||
|
ambdós
|
||||||
|
ambdues
|
||||||
|
apa
|
||||||
|
aquell
|
||||||
|
aquella
|
||||||
|
aquelles
|
||||||
|
aquells
|
||||||
|
aquest
|
||||||
|
aquesta
|
||||||
|
aquestes
|
||||||
|
aquests
|
||||||
|
aquí
|
||||||
|
baix
|
||||||
|
cada
|
||||||
|
cadascú
|
||||||
|
cadascuna
|
||||||
|
cadascunes
|
||||||
|
cadascuns
|
||||||
|
com
|
||||||
|
contra
|
||||||
|
d'un
|
||||||
|
d'una
|
||||||
|
d'unes
|
||||||
|
d'uns
|
||||||
|
dalt
|
||||||
|
de
|
||||||
|
del
|
||||||
|
dels
|
||||||
|
des
|
||||||
|
després
|
||||||
|
dins
|
||||||
|
dintre
|
||||||
|
donat
|
||||||
|
doncs
|
||||||
|
durant
|
||||||
|
e
|
||||||
|
eh
|
||||||
|
el
|
||||||
|
els
|
||||||
|
em
|
||||||
|
en
|
||||||
|
encara
|
||||||
|
ens
|
||||||
|
entre
|
||||||
|
érem
|
||||||
|
eren
|
||||||
|
éreu
|
||||||
|
es
|
||||||
|
és
|
||||||
|
esta
|
||||||
|
està
|
||||||
|
estàvem
|
||||||
|
estaven
|
||||||
|
estàveu
|
||||||
|
esteu
|
||||||
|
et
|
||||||
|
etc
|
||||||
|
ets
|
||||||
|
fins
|
||||||
|
fora
|
||||||
|
gairebé
|
||||||
|
ha
|
||||||
|
han
|
||||||
|
has
|
||||||
|
havia
|
||||||
|
he
|
||||||
|
hem
|
||||||
|
heu
|
||||||
|
hi
|
||||||
|
ho
|
||||||
|
i
|
||||||
|
igual
|
||||||
|
iguals
|
||||||
|
ja
|
||||||
|
l'hi
|
||||||
|
la
|
||||||
|
les
|
||||||
|
li
|
||||||
|
li'n
|
||||||
|
llavors
|
||||||
|
m'he
|
||||||
|
ma
|
||||||
|
mal
|
||||||
|
malgrat
|
||||||
|
mateix
|
||||||
|
mateixa
|
||||||
|
mateixes
|
||||||
|
mateixos
|
||||||
|
me
|
||||||
|
mentre
|
||||||
|
més
|
||||||
|
meu
|
||||||
|
meus
|
||||||
|
meva
|
||||||
|
meves
|
||||||
|
molt
|
||||||
|
molta
|
||||||
|
moltes
|
||||||
|
molts
|
||||||
|
mon
|
||||||
|
mons
|
||||||
|
n'he
|
||||||
|
n'hi
|
||||||
|
ne
|
||||||
|
ni
|
||||||
|
no
|
||||||
|
nogensmenys
|
||||||
|
només
|
||||||
|
nosaltres
|
||||||
|
nostra
|
||||||
|
nostre
|
||||||
|
nostres
|
||||||
|
o
|
||||||
|
oh
|
||||||
|
oi
|
||||||
|
on
|
||||||
|
pas
|
||||||
|
pel
|
||||||
|
pels
|
||||||
|
per
|
||||||
|
però
|
||||||
|
perquè
|
||||||
|
poc
|
||||||
|
poca
|
||||||
|
pocs
|
||||||
|
poques
|
||||||
|
potser
|
||||||
|
propi
|
||||||
|
qual
|
||||||
|
quals
|
||||||
|
quan
|
||||||
|
quant
|
||||||
|
que
|
||||||
|
què
|
||||||
|
quelcom
|
||||||
|
qui
|
||||||
|
quin
|
||||||
|
quina
|
||||||
|
quines
|
||||||
|
quins
|
||||||
|
s'ha
|
||||||
|
s'han
|
||||||
|
sa
|
||||||
|
semblant
|
||||||
|
semblants
|
||||||
|
ses
|
||||||
|
seu
|
||||||
|
seus
|
||||||
|
seva
|
||||||
|
seva
|
||||||
|
seves
|
||||||
|
si
|
||||||
|
sobre
|
||||||
|
sobretot
|
||||||
|
sóc
|
||||||
|
solament
|
||||||
|
sols
|
||||||
|
son
|
||||||
|
són
|
||||||
|
sons
|
||||||
|
sota
|
||||||
|
sou
|
||||||
|
t'ha
|
||||||
|
t'han
|
||||||
|
t'he
|
||||||
|
ta
|
||||||
|
tal
|
||||||
|
també
|
||||||
|
tampoc
|
||||||
|
tan
|
||||||
|
tant
|
||||||
|
tanta
|
||||||
|
tantes
|
||||||
|
teu
|
||||||
|
teus
|
||||||
|
teva
|
||||||
|
teves
|
||||||
|
ton
|
||||||
|
tons
|
||||||
|
tot
|
||||||
|
tota
|
||||||
|
totes
|
||||||
|
tots
|
||||||
|
un
|
||||||
|
una
|
||||||
|
unes
|
||||||
|
uns
|
||||||
|
us
|
||||||
|
va
|
||||||
|
vaig
|
||||||
|
vam
|
||||||
|
van
|
||||||
|
vas
|
||||||
|
veu
|
||||||
|
vosaltres
|
||||||
|
vostra
|
||||||
|
vostre
|
||||||
|
vostres
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(CatalanStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
49
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/analyzer_cjk.go
generated
vendored
Normal file
49
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/analyzer_cjk.go
generated
vendored
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package cjk
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "cjk"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
whitespaceTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
bigramFilter, err := cache.TokenFilterNamed(BigramName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: whitespaceTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
normalizeFilter,
|
||||||
|
toLowerFilter,
|
||||||
|
bigramFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
620
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/analyzer_cjk_test.go
generated
vendored
Normal file
620
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/analyzer_cjk_test.go
generated
vendored
Normal file
@ -0,0 +1,620 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package cjk
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCJKAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: []byte("こんにちは世界"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こん"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("んに"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("にち"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ちは"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("は世"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 5,
|
||||||
|
Start: 12,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("世界"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("一二三四五六七八九十"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("一二"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("二三"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("三四"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("四五"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("五六"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 5,
|
||||||
|
Start: 12,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("六七"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("七八"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 7,
|
||||||
|
Start: 18,
|
||||||
|
End: 24,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("八九"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 8,
|
||||||
|
Start: 21,
|
||||||
|
End: 27,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("九十"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 9,
|
||||||
|
Start: 24,
|
||||||
|
End: 30,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("一 二三四 五六七八九 十"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("一"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("二三"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 4,
|
||||||
|
End: 10,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("三四"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 7,
|
||||||
|
End: 13,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("五六"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 5,
|
||||||
|
Start: 14,
|
||||||
|
End: 20,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("六七"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 6,
|
||||||
|
Start: 17,
|
||||||
|
End: 23,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("七八"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 7,
|
||||||
|
Start: 20,
|
||||||
|
End: 26,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("八九"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 8,
|
||||||
|
Start: 23,
|
||||||
|
End: 29,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("十"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 10,
|
||||||
|
Start: 30,
|
||||||
|
End: 33,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("abc defgh ijklmn opqrstu vwxy z"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abc"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("defgh"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 2,
|
||||||
|
Start: 4,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ijklmn"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 3,
|
||||||
|
Start: 10,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("opqrstu"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 4,
|
||||||
|
Start: 17,
|
||||||
|
End: 24,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("vwxy"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 5,
|
||||||
|
Start: 25,
|
||||||
|
End: 29,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("z"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 6,
|
||||||
|
Start: 30,
|
||||||
|
End: 31,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("あい"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("あい"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("あい "),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("あい"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("test"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("test"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 4,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("test "),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("test"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 4,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("あいtest"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("あい"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("test"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 10,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("testあい "),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("test"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 4,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("あい"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 4,
|
||||||
|
End: 10,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("あいうえおabcかきくけこ"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("あい"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("いう"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("うえ"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("えお"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abc"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("かき"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 7,
|
||||||
|
Start: 18,
|
||||||
|
End: 24,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("きく"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 8,
|
||||||
|
Start: 21,
|
||||||
|
End: 27,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("くけ"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 9,
|
||||||
|
Start: 24,
|
||||||
|
End: 30,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("けこ"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 10,
|
||||||
|
Start: 27,
|
||||||
|
End: 33,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("あいうえおabんcかきくけ こ"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("あい"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("いう"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("うえ"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("えお"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ab"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 17,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ん"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 7,
|
||||||
|
Start: 17,
|
||||||
|
End: 20,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("c"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 8,
|
||||||
|
Start: 20,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("かき"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 9,
|
||||||
|
Start: 21,
|
||||||
|
End: 27,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("きく"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 10,
|
||||||
|
Start: 24,
|
||||||
|
End: 30,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("くけ"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 11,
|
||||||
|
Start: 27,
|
||||||
|
End: 33,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こ"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 13,
|
||||||
|
Start: 34,
|
||||||
|
End: 37,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("一 روبرت موير"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("一"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("روبرت"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 2,
|
||||||
|
Start: 4,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("موير"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 3,
|
||||||
|
Start: 15,
|
||||||
|
End: 23,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("一 رُوبرت موير"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("一"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("رُوبرت"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 2,
|
||||||
|
Start: 4,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("موير"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 3,
|
||||||
|
Start: 17,
|
||||||
|
End: 25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("𩬅艱鍟䇹愯瀛"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("𩬅艱"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 7,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("艱鍟"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 4,
|
||||||
|
End: 10,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("鍟䇹"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 7,
|
||||||
|
End: 13,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("䇹愯"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 4,
|
||||||
|
Start: 10,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("愯瀛"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 5,
|
||||||
|
Start: 13,
|
||||||
|
End: 19,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("一"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("一"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("一丁丂"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("一丁"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("丁丂"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
for _, test := range tests {
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %v, got %v", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
166
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/cjk_bigram.go
generated
vendored
Normal file
166
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/cjk_bigram.go
generated
vendored
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package cjk
|
||||||
|
|
||||||
|
import (
|
||||||
|
"container/ring"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const BigramName = "cjk_bigram"
|
||||||
|
|
||||||
|
type CJKBigramFilter struct {
|
||||||
|
outputUnigram bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
|
||||||
|
return &CJKBigramFilter{
|
||||||
|
outputUnigram: outputUnigram,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
r := ring.New(2)
|
||||||
|
itemsInRing := 0
|
||||||
|
|
||||||
|
rv := make(analysis.TokenStream, 0, len(input))
|
||||||
|
|
||||||
|
for _, token := range input {
|
||||||
|
if token.Type == analysis.Ideographic {
|
||||||
|
if itemsInRing > 0 {
|
||||||
|
// if items already buffered
|
||||||
|
// check to see if this is aligned
|
||||||
|
curr := r.Value.(*analysis.Token)
|
||||||
|
if token.Start-curr.End != 0 {
|
||||||
|
// not aligned flush
|
||||||
|
flushToken := s.flush(r, &itemsInRing)
|
||||||
|
if flushToken != nil {
|
||||||
|
rv = append(rv, flushToken)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// now we can add this token to the buffer
|
||||||
|
r = r.Next()
|
||||||
|
r.Value = token
|
||||||
|
if itemsInRing < 2 {
|
||||||
|
itemsInRing++
|
||||||
|
}
|
||||||
|
if itemsInRing > 1 && s.outputUnigram {
|
||||||
|
unigram := s.buildUnigram(r, &itemsInRing)
|
||||||
|
if unigram != nil {
|
||||||
|
rv = append(rv, unigram)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bigramToken := s.outputBigram(r, &itemsInRing)
|
||||||
|
if bigramToken != nil {
|
||||||
|
rv = append(rv, bigramToken)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// flush anything already buffered
|
||||||
|
flushToken := s.flush(r, &itemsInRing)
|
||||||
|
if flushToken != nil {
|
||||||
|
rv = append(rv, flushToken)
|
||||||
|
}
|
||||||
|
// output this token as is
|
||||||
|
rv = append(rv, token)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// deal with possible trailing unigram
|
||||||
|
if itemsInRing == 1 || s.outputUnigram {
|
||||||
|
if itemsInRing == 2 {
|
||||||
|
r = r.Next()
|
||||||
|
}
|
||||||
|
unigram := s.buildUnigram(r, &itemsInRing)
|
||||||
|
if unigram != nil {
|
||||||
|
rv = append(rv, unigram)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||||
|
var rv *analysis.Token
|
||||||
|
if *itemsInRing == 1 {
|
||||||
|
rv = s.buildUnigram(r, itemsInRing)
|
||||||
|
}
|
||||||
|
r.Value = nil
|
||||||
|
*itemsInRing = 0
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||||
|
if *itemsInRing == 2 {
|
||||||
|
thisShingleRing := r.Move(-1)
|
||||||
|
shingledBytes := make([]byte, 0)
|
||||||
|
|
||||||
|
// do first token
|
||||||
|
prev := thisShingleRing.Value.(*analysis.Token)
|
||||||
|
shingledBytes = append(shingledBytes, prev.Term...)
|
||||||
|
|
||||||
|
// do second token
|
||||||
|
thisShingleRing = thisShingleRing.Next()
|
||||||
|
curr := thisShingleRing.Value.(*analysis.Token)
|
||||||
|
shingledBytes = append(shingledBytes, curr.Term...)
|
||||||
|
|
||||||
|
token := analysis.Token{
|
||||||
|
Type: analysis.Double,
|
||||||
|
Term: shingledBytes,
|
||||||
|
Position: prev.Position,
|
||||||
|
Start: prev.Start,
|
||||||
|
End: curr.End,
|
||||||
|
}
|
||||||
|
return &token
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||||
|
if *itemsInRing == 2 {
|
||||||
|
thisShingleRing := r.Move(-1)
|
||||||
|
// do first token
|
||||||
|
prev := thisShingleRing.Value.(*analysis.Token)
|
||||||
|
token := analysis.Token{
|
||||||
|
Type: analysis.Single,
|
||||||
|
Term: prev.Term,
|
||||||
|
Position: prev.Position,
|
||||||
|
Start: prev.Start,
|
||||||
|
End: prev.End,
|
||||||
|
}
|
||||||
|
return &token
|
||||||
|
} else if *itemsInRing == 1 {
|
||||||
|
// do first token
|
||||||
|
prev := r.Value.(*analysis.Token)
|
||||||
|
token := analysis.Token{
|
||||||
|
Type: analysis.Single,
|
||||||
|
Term: prev.Term,
|
||||||
|
Position: prev.Position,
|
||||||
|
Start: prev.Start,
|
||||||
|
End: prev.End,
|
||||||
|
}
|
||||||
|
return &token
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
outputUnigram := false
|
||||||
|
outVal, ok := config["output_unigram"].(bool)
|
||||||
|
if ok {
|
||||||
|
outputUnigram = outVal
|
||||||
|
}
|
||||||
|
return NewCJKBigramFilter(outputUnigram), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
|
||||||
|
}
|
420
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/cjk_bigram_test.go
generated
vendored
Normal file
420
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cjk/cjk_bigram_test.go
generated
vendored
Normal file
@ -0,0 +1,420 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package cjk
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCJKBigramFilter(t *testing.T) {
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
outputUnigram bool
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
outputUnigram: false,
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こ"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ん"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 2,
|
||||||
|
Start: 5,
|
||||||
|
End: 7,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こ"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ん"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 2,
|
||||||
|
Start: 5,
|
||||||
|
End: 7,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
outputUnigram: false,
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こ"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ん"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("に"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ち"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("は"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 5,
|
||||||
|
Start: 12,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("世"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("界"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 7,
|
||||||
|
Start: 18,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こん"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("んに"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("にち"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ちは"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("は世"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 5,
|
||||||
|
Start: 12,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("世界"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
outputUnigram: true,
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こ"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ん"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("に"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ち"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("は"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 5,
|
||||||
|
Start: 12,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("世"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("界"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 7,
|
||||||
|
Start: 18,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こ"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こん"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ん"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("んに"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("に"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("にち"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ち"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ちは"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("は"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 5,
|
||||||
|
Start: 12,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("は世"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 5,
|
||||||
|
Start: 12,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("世"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("世界"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 6,
|
||||||
|
Start: 15,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("界"),
|
||||||
|
Type: analysis.Single,
|
||||||
|
Position: 7,
|
||||||
|
Start: 18,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
outputUnigram: false,
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こ"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 3,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ん"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("に"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ち"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("は"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 5,
|
||||||
|
Start: 12,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cat"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 6,
|
||||||
|
Start: 12,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("世"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 7,
|
||||||
|
Start: 18,
|
||||||
|
End: 21,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("界"),
|
||||||
|
Type: analysis.Ideographic,
|
||||||
|
Position: 8,
|
||||||
|
Start: 21,
|
||||||
|
End: 24,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("こん"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("んに"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 2,
|
||||||
|
Start: 3,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("にち"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 3,
|
||||||
|
Start: 6,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ちは"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 4,
|
||||||
|
Start: 9,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cat"),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
|
Position: 6,
|
||||||
|
Start: 12,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("世界"),
|
||||||
|
Type: analysis.Double,
|
||||||
|
Position: 7,
|
||||||
|
Start: 18,
|
||||||
|
End: 24,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
cjkBigramFilter := NewCJKBigramFilter(test.outputUnigram)
|
||||||
|
actual := cjkBigramFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %s, got %s", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
58
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/analyzer_ckb.go
generated
vendored
Normal file
58
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/analyzer_ckb.go
generated
vendored
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package ckb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "ckb"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopCkbFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: icuTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
normCkbFilter,
|
||||||
|
toLowerFilter,
|
||||||
|
stopCkbFilter,
|
||||||
|
stemmerCkbFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
74
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/analyzer_ckb_test.go
generated
vendored
Normal file
74
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/analyzer_ckb_test.go
generated
vendored
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package ckb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestSoraniAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stop word removal
|
||||||
|
{
|
||||||
|
input: []byte("ئەم پیاوە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پیاو"),
|
||||||
|
Position: 2,
|
||||||
|
Start: 7,
|
||||||
|
End: 17,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("پیاوە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پیاو"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 10,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("پیاو"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پیاو"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 8,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %v, got %v", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
113
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_normalize.go
generated
vendored
Normal file
113
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_normalize.go
generated
vendored
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ckb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"unicode"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const NormalizeName = "normalize_ckb"
|
||||||
|
|
||||||
|
const (
|
||||||
|
Yeh = '\u064A'
|
||||||
|
DotlessYeh = '\u0649'
|
||||||
|
FarsiYeh = '\u06CC'
|
||||||
|
|
||||||
|
Kaf = '\u0643'
|
||||||
|
Keheh = '\u06A9'
|
||||||
|
|
||||||
|
Heh = '\u0647'
|
||||||
|
Ae = '\u06D5'
|
||||||
|
Zwnj = '\u200C'
|
||||||
|
HehDoachashmee = '\u06BE'
|
||||||
|
TehMarbuta = '\u0629'
|
||||||
|
|
||||||
|
Reh = '\u0631'
|
||||||
|
Rreh = '\u0695'
|
||||||
|
RrehAbove = '\u0692'
|
||||||
|
|
||||||
|
Tatweel = '\u0640'
|
||||||
|
Fathatan = '\u064B'
|
||||||
|
Dammatan = '\u064C'
|
||||||
|
Kasratan = '\u064D'
|
||||||
|
Fatha = '\u064E'
|
||||||
|
Damma = '\u064F'
|
||||||
|
Kasra = '\u0650'
|
||||||
|
Shadda = '\u0651'
|
||||||
|
Sukun = '\u0652'
|
||||||
|
)
|
||||||
|
|
||||||
|
type SoraniNormalizeFilter struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
|
||||||
|
return &SoraniNormalizeFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
term := normalize(token.Term)
|
||||||
|
token.Term = term
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalize(input []byte) []byte {
|
||||||
|
runes := bytes.Runes(input)
|
||||||
|
for i := 0; i < len(runes); i++ {
|
||||||
|
switch runes[i] {
|
||||||
|
case Yeh, DotlessYeh:
|
||||||
|
runes[i] = FarsiYeh
|
||||||
|
case Kaf:
|
||||||
|
runes[i] = Keheh
|
||||||
|
case Zwnj:
|
||||||
|
if i > 0 && runes[i-1] == Heh {
|
||||||
|
runes[i-1] = Ae
|
||||||
|
}
|
||||||
|
runes = analysis.DeleteRune(runes, i)
|
||||||
|
i--
|
||||||
|
case Heh:
|
||||||
|
if i == len(runes)-1 {
|
||||||
|
runes[i] = Ae
|
||||||
|
}
|
||||||
|
case TehMarbuta:
|
||||||
|
runes[i] = Ae
|
||||||
|
case HehDoachashmee:
|
||||||
|
runes[i] = Heh
|
||||||
|
case Reh:
|
||||||
|
if i == 0 {
|
||||||
|
runes[i] = Rreh
|
||||||
|
}
|
||||||
|
case RrehAbove:
|
||||||
|
runes[i] = Rreh
|
||||||
|
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
|
||||||
|
runes = analysis.DeleteRune(runes, i)
|
||||||
|
i--
|
||||||
|
default:
|
||||||
|
if unicode.In(runes[i], unicode.Cf) {
|
||||||
|
runes = analysis.DeleteRune(runes, i)
|
||||||
|
i--
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return analysis.BuildTermFromRunes(runes)
|
||||||
|
}
|
||||||
|
|
||||||
|
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewSoraniNormalizeFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||||
|
}
|
318
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_normalize_test.go
generated
vendored
Normal file
318
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_normalize_test.go
generated
vendored
Normal file
@ -0,0 +1,318 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ckb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestSoraniNormalizeFilter(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// test Y
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u064A"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06CC"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0649"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06CC"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06CC"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06CC"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// test K
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0643"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06A9"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06A9"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06A9"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// test H
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0647\u200C"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06D5"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0647\u200C\u06A9"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06D5\u06A9"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06BE"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0647"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0629"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u06D5"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// test final H
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0647\u0647\u0647"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0647\u0647\u06D5"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// test RR
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0692"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0695"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// test initial RR
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0631\u0631\u0631"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0695\u0631\u0631"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// test remove
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0640"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u064B"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u064C"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u064D"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u064E"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u064F"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0650"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0651"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u0652"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("\u200C"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// empty
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
soraniNormalizeFilter := NewSoraniNormalizeFilter()
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := soraniNormalizeFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||||
|
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
143
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_stemmer_filter.go
generated
vendored
Normal file
143
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_stemmer_filter.go
generated
vendored
Normal file
@ -0,0 +1,143 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ckb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StemmerName = "stemmer_ckb"
|
||||||
|
|
||||||
|
type SoraniStemmerFilter struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewSoraniStemmerFilter() *SoraniStemmerFilter {
|
||||||
|
return &SoraniStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
// if not protected keyword, stem it
|
||||||
|
if !token.KeyWord {
|
||||||
|
stemmed := stem(token.Term)
|
||||||
|
token.Term = stemmed
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func stem(input []byte) []byte {
|
||||||
|
inputLen := utf8.RuneCount(input)
|
||||||
|
|
||||||
|
// postposition
|
||||||
|
if inputLen > 5 && bytes.HasSuffix(input, []byte("دا")) {
|
||||||
|
input = truncateRunes(input, 2)
|
||||||
|
inputLen = utf8.RuneCount(input)
|
||||||
|
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("نا")) {
|
||||||
|
input = truncateRunes(input, 1)
|
||||||
|
inputLen = utf8.RuneCount(input)
|
||||||
|
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەوە")) {
|
||||||
|
input = truncateRunes(input, 3)
|
||||||
|
inputLen = utf8.RuneCount(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
// possessive pronoun
|
||||||
|
if inputLen > 6 &&
|
||||||
|
(bytes.HasSuffix(input, []byte("مان")) ||
|
||||||
|
bytes.HasSuffix(input, []byte("یان")) ||
|
||||||
|
bytes.HasSuffix(input, []byte("تان"))) {
|
||||||
|
input = truncateRunes(input, 3)
|
||||||
|
inputLen = utf8.RuneCount(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
// indefinite singular ezafe
|
||||||
|
if inputLen > 6 && bytes.HasSuffix(input, []byte("ێکی")) {
|
||||||
|
return truncateRunes(input, 3)
|
||||||
|
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یەکی")) {
|
||||||
|
return truncateRunes(input, 4)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 5 && bytes.HasSuffix(input, []byte("ێک")) {
|
||||||
|
// indefinite singular
|
||||||
|
return truncateRunes(input, 2)
|
||||||
|
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یەک")) {
|
||||||
|
// indefinite singular
|
||||||
|
return truncateRunes(input, 3)
|
||||||
|
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەکە")) {
|
||||||
|
// definite singular
|
||||||
|
return truncateRunes(input, 3)
|
||||||
|
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("کە")) {
|
||||||
|
// definite singular
|
||||||
|
return truncateRunes(input, 2)
|
||||||
|
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("ەکان")) {
|
||||||
|
// definite plural
|
||||||
|
return truncateRunes(input, 4)
|
||||||
|
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("کان")) {
|
||||||
|
// definite plural
|
||||||
|
return truncateRunes(input, 3)
|
||||||
|
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانی")) {
|
||||||
|
// indefinite plural ezafe
|
||||||
|
return truncateRunes(input, 4)
|
||||||
|
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انی")) {
|
||||||
|
// indefinite plural ezafe
|
||||||
|
return truncateRunes(input, 3)
|
||||||
|
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یان")) {
|
||||||
|
// indefinite plural
|
||||||
|
return truncateRunes(input, 3)
|
||||||
|
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("ان")) {
|
||||||
|
// indefinite plural
|
||||||
|
return truncateRunes(input, 2)
|
||||||
|
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانە")) {
|
||||||
|
// demonstrative plural
|
||||||
|
return truncateRunes(input, 4)
|
||||||
|
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انە")) {
|
||||||
|
// demonstrative plural
|
||||||
|
return truncateRunes(input, 3)
|
||||||
|
} else if inputLen > 5 && (bytes.HasSuffix(input, []byte("ایە")) || bytes.HasSuffix(input, []byte("ەیە"))) {
|
||||||
|
// demonstrative singular
|
||||||
|
return truncateRunes(input, 2)
|
||||||
|
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ە")) {
|
||||||
|
// demonstrative singular
|
||||||
|
return truncateRunes(input, 1)
|
||||||
|
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ی")) {
|
||||||
|
// absolute singular ezafe
|
||||||
|
return truncateRunes(input, 1)
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func truncateRunes(input []byte, num int) []byte {
|
||||||
|
runes := bytes.Runes(input)
|
||||||
|
runes = runes[:len(runes)-num]
|
||||||
|
out := buildTermFromRunes(runes)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildTermFromRunes(runes []rune) []byte {
|
||||||
|
rv := make([]byte, 0, len(runes)*4)
|
||||||
|
for _, r := range runes {
|
||||||
|
runeBytes := make([]byte, utf8.RuneLen(r))
|
||||||
|
utf8.EncodeRune(runeBytes, r)
|
||||||
|
rv = append(rv, runeBytes...)
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewSoraniStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||||
|
}
|
294
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_stemmer_filter_test.go
generated
vendored
Normal file
294
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/sorani_stemmer_filter_test.go
generated
vendored
Normal file
@ -0,0 +1,294 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ckb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/single_token"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestSoraniStemmerFilter(t *testing.T) {
|
||||||
|
|
||||||
|
// in order to match the lucene tests
|
||||||
|
// we will test with an analyzer, not just the stemmer
|
||||||
|
analyzer := analysis.Analyzer{
|
||||||
|
Tokenizer: single_token.NewSingleTokenTokenizer(),
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
NewSoraniNormalizeFilter(),
|
||||||
|
NewSoraniStemmerFilter(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{ // -ek
|
||||||
|
input: []byte("پیاوێک"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پیاو"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -yek
|
||||||
|
input: []byte("دەرگایەک"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("دەرگا"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -aka
|
||||||
|
input: []byte("پیاوەكە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پیاو"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -ka
|
||||||
|
input: []byte("دەرگاكە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("دەرگا"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -a
|
||||||
|
input: []byte("کتاویە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("کتاوی"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -ya
|
||||||
|
input: []byte("دەرگایە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("دەرگا"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -An
|
||||||
|
input: []byte("پیاوان"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پیاو"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -yAn
|
||||||
|
input: []byte("دەرگایان"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("دەرگا"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -akAn
|
||||||
|
input: []byte("پیاوەکان"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پیاو"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -kAn
|
||||||
|
input: []byte("دەرگاکان"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("دەرگا"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -Ana
|
||||||
|
input: []byte("پیاوانە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پیاو"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -yAna
|
||||||
|
input: []byte("دەرگایانە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("دەرگا"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // Ezafe singular
|
||||||
|
input: []byte("هۆتیلی"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("هۆتیل"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // Ezafe indefinite
|
||||||
|
input: []byte("هۆتیلێکی"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("هۆتیل"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // Ezafe plural
|
||||||
|
input: []byte("هۆتیلانی"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("هۆتیل"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 16,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -awa
|
||||||
|
input: []byte("دوورەوە"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("دوور"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -dA
|
||||||
|
input: []byte("نیوەشەودا"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("نیوەشەو"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 18,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -A
|
||||||
|
input: []byte("سۆرانا"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("سۆران"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 12,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -mAn
|
||||||
|
input: []byte("پارەمان"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پارە"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -tAn
|
||||||
|
input: []byte("پارەتان"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پارە"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // -yAn
|
||||||
|
input: []byte("پارەیان"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("پارە"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{ // empty
|
||||||
|
input: []byte(""),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("for input %s(% x)", test.input, test.input)
|
||||||
|
t.Errorf("\texpected:")
|
||||||
|
for _, token := range test.output {
|
||||||
|
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
|
||||||
|
}
|
||||||
|
t.Errorf("\tactual:")
|
||||||
|
for _, token := range actual {
|
||||||
|
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/stop_filter_ckb.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/stop_filter_ckb.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package ckb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
160
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/stop_words_ckb.go
generated
vendored
Normal file
160
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ckb/stop_words_ckb.go
generated
vendored
Normal file
@ -0,0 +1,160 @@
|
|||||||
|
package ckb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_ckb"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var SoraniStopWords = []byte(`# set of kurdish stopwords
|
||||||
|
# note these have been normalized with our scheme (e represented with U+06D5, etc)
|
||||||
|
# constructed from:
|
||||||
|
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
|
||||||
|
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
|
||||||
|
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
|
||||||
|
|
||||||
|
# and
|
||||||
|
و
|
||||||
|
# which
|
||||||
|
کە
|
||||||
|
# of
|
||||||
|
ی
|
||||||
|
# made/did
|
||||||
|
کرد
|
||||||
|
# that/which
|
||||||
|
ئەوەی
|
||||||
|
# on/head
|
||||||
|
سەر
|
||||||
|
# two
|
||||||
|
دوو
|
||||||
|
# also
|
||||||
|
هەروەها
|
||||||
|
# from/that
|
||||||
|
لەو
|
||||||
|
# makes/does
|
||||||
|
دەکات
|
||||||
|
# some
|
||||||
|
چەند
|
||||||
|
# every
|
||||||
|
هەر
|
||||||
|
|
||||||
|
# demonstratives
|
||||||
|
# that
|
||||||
|
ئەو
|
||||||
|
# this
|
||||||
|
ئەم
|
||||||
|
|
||||||
|
# personal pronouns
|
||||||
|
# I
|
||||||
|
من
|
||||||
|
# we
|
||||||
|
ئێمە
|
||||||
|
# you
|
||||||
|
تۆ
|
||||||
|
# you
|
||||||
|
ئێوە
|
||||||
|
# he/she/it
|
||||||
|
ئەو
|
||||||
|
# they
|
||||||
|
ئەوان
|
||||||
|
|
||||||
|
# prepositions
|
||||||
|
# to/with/by
|
||||||
|
بە
|
||||||
|
پێ
|
||||||
|
# without
|
||||||
|
بەبێ
|
||||||
|
# along with/while/during
|
||||||
|
بەدەم
|
||||||
|
# in the opinion of
|
||||||
|
بەلای
|
||||||
|
# according to
|
||||||
|
بەپێی
|
||||||
|
# before
|
||||||
|
بەرلە
|
||||||
|
# in the direction of
|
||||||
|
بەرەوی
|
||||||
|
# in front of/toward
|
||||||
|
بەرەوە
|
||||||
|
# before/in the face of
|
||||||
|
بەردەم
|
||||||
|
# without
|
||||||
|
بێ
|
||||||
|
# except for
|
||||||
|
بێجگە
|
||||||
|
# for
|
||||||
|
بۆ
|
||||||
|
# on/in
|
||||||
|
دە
|
||||||
|
تێ
|
||||||
|
# with
|
||||||
|
دەگەڵ
|
||||||
|
# after
|
||||||
|
دوای
|
||||||
|
# except for/aside from
|
||||||
|
جگە
|
||||||
|
# in/from
|
||||||
|
لە
|
||||||
|
لێ
|
||||||
|
# in front of/before/because of
|
||||||
|
لەبەر
|
||||||
|
# between/among
|
||||||
|
لەبەینی
|
||||||
|
# concerning/about
|
||||||
|
لەبابەت
|
||||||
|
# concerning
|
||||||
|
لەبارەی
|
||||||
|
# instead of
|
||||||
|
لەباتی
|
||||||
|
# beside
|
||||||
|
لەبن
|
||||||
|
# instead of
|
||||||
|
لەبرێتی
|
||||||
|
# behind
|
||||||
|
لەدەم
|
||||||
|
# with/together with
|
||||||
|
لەگەڵ
|
||||||
|
# by
|
||||||
|
لەلایەن
|
||||||
|
# within
|
||||||
|
لەناو
|
||||||
|
# between/among
|
||||||
|
لەنێو
|
||||||
|
# for the sake of
|
||||||
|
لەپێناوی
|
||||||
|
# with respect to
|
||||||
|
لەرەوی
|
||||||
|
# by means of/for
|
||||||
|
لەرێ
|
||||||
|
# for the sake of
|
||||||
|
لەرێگا
|
||||||
|
# on/on top of/according to
|
||||||
|
لەسەر
|
||||||
|
# under
|
||||||
|
لەژێر
|
||||||
|
# between/among
|
||||||
|
ناو
|
||||||
|
# between/among
|
||||||
|
نێوان
|
||||||
|
# after
|
||||||
|
پاش
|
||||||
|
# before
|
||||||
|
پێش
|
||||||
|
# like
|
||||||
|
وەک
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(SoraniStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cs/stop_filter_cs.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cs/stop_filter_cs.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package cs
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
196
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cs/stop_words_cs.go
generated
vendored
Normal file
196
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/cs/stop_words_cs.go
generated
vendored
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
package cs
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_cs"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var CzechStopWords = []byte(`a
|
||||||
|
s
|
||||||
|
k
|
||||||
|
o
|
||||||
|
i
|
||||||
|
u
|
||||||
|
v
|
||||||
|
z
|
||||||
|
dnes
|
||||||
|
cz
|
||||||
|
tímto
|
||||||
|
budeš
|
||||||
|
budem
|
||||||
|
byli
|
||||||
|
jseš
|
||||||
|
můj
|
||||||
|
svým
|
||||||
|
ta
|
||||||
|
tomto
|
||||||
|
tohle
|
||||||
|
tuto
|
||||||
|
tyto
|
||||||
|
jej
|
||||||
|
zda
|
||||||
|
proč
|
||||||
|
máte
|
||||||
|
tato
|
||||||
|
kam
|
||||||
|
tohoto
|
||||||
|
kdo
|
||||||
|
kteří
|
||||||
|
mi
|
||||||
|
nám
|
||||||
|
tom
|
||||||
|
tomuto
|
||||||
|
mít
|
||||||
|
nic
|
||||||
|
proto
|
||||||
|
kterou
|
||||||
|
byla
|
||||||
|
toho
|
||||||
|
protože
|
||||||
|
asi
|
||||||
|
ho
|
||||||
|
naši
|
||||||
|
napište
|
||||||
|
re
|
||||||
|
což
|
||||||
|
tím
|
||||||
|
takže
|
||||||
|
svých
|
||||||
|
její
|
||||||
|
svými
|
||||||
|
jste
|
||||||
|
aj
|
||||||
|
tu
|
||||||
|
tedy
|
||||||
|
teto
|
||||||
|
bylo
|
||||||
|
kde
|
||||||
|
ke
|
||||||
|
pravé
|
||||||
|
ji
|
||||||
|
nad
|
||||||
|
nejsou
|
||||||
|
či
|
||||||
|
pod
|
||||||
|
téma
|
||||||
|
mezi
|
||||||
|
přes
|
||||||
|
ty
|
||||||
|
pak
|
||||||
|
vám
|
||||||
|
ani
|
||||||
|
když
|
||||||
|
však
|
||||||
|
neg
|
||||||
|
jsem
|
||||||
|
tento
|
||||||
|
článku
|
||||||
|
články
|
||||||
|
aby
|
||||||
|
jsme
|
||||||
|
před
|
||||||
|
pta
|
||||||
|
jejich
|
||||||
|
byl
|
||||||
|
ještě
|
||||||
|
až
|
||||||
|
bez
|
||||||
|
také
|
||||||
|
pouze
|
||||||
|
první
|
||||||
|
vaše
|
||||||
|
která
|
||||||
|
nás
|
||||||
|
nový
|
||||||
|
tipy
|
||||||
|
pokud
|
||||||
|
může
|
||||||
|
strana
|
||||||
|
jeho
|
||||||
|
své
|
||||||
|
jiné
|
||||||
|
zprávy
|
||||||
|
nové
|
||||||
|
není
|
||||||
|
vás
|
||||||
|
jen
|
||||||
|
podle
|
||||||
|
zde
|
||||||
|
už
|
||||||
|
být
|
||||||
|
více
|
||||||
|
bude
|
||||||
|
již
|
||||||
|
než
|
||||||
|
který
|
||||||
|
by
|
||||||
|
které
|
||||||
|
co
|
||||||
|
nebo
|
||||||
|
ten
|
||||||
|
tak
|
||||||
|
má
|
||||||
|
při
|
||||||
|
od
|
||||||
|
po
|
||||||
|
jsou
|
||||||
|
jak
|
||||||
|
další
|
||||||
|
ale
|
||||||
|
si
|
||||||
|
se
|
||||||
|
ve
|
||||||
|
to
|
||||||
|
jako
|
||||||
|
za
|
||||||
|
zpět
|
||||||
|
ze
|
||||||
|
do
|
||||||
|
pro
|
||||||
|
je
|
||||||
|
na
|
||||||
|
atd
|
||||||
|
atp
|
||||||
|
jakmile
|
||||||
|
přičemž
|
||||||
|
já
|
||||||
|
on
|
||||||
|
ona
|
||||||
|
ono
|
||||||
|
oni
|
||||||
|
ony
|
||||||
|
my
|
||||||
|
vy
|
||||||
|
jí
|
||||||
|
ji
|
||||||
|
mě
|
||||||
|
mne
|
||||||
|
jemu
|
||||||
|
tomu
|
||||||
|
těm
|
||||||
|
těmu
|
||||||
|
němu
|
||||||
|
němuž
|
||||||
|
jehož
|
||||||
|
jíž
|
||||||
|
jelikož
|
||||||
|
jež
|
||||||
|
jakož
|
||||||
|
načež
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(CzechStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/analyzer_da.go
generated
vendored
Normal file
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/analyzer_da.go
generated
vendored
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "da"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopDaFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerDaFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: icuTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopDaFilter,
|
||||||
|
stemmerDaFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
69
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/analyzer_da_test.go
generated
vendored
Normal file
69
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/analyzer_da_test.go
generated
vendored
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestDanishAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("undersøg"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("undersøg"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("undersøgelse"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("undersøg"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 13,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("på"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %v, got %v", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stemmer_da.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stemmer_da.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StemmerName = "stemmer_da"
|
||||||
|
|
||||||
|
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return stemmer_filter.NewStemmerFilter("da")
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stop_filter_da.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stop_filter_da.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
134
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stop_words_da.go
generated
vendored
Normal file
134
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/da/stop_words_da.go
generated
vendored
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_da"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| A Danish stop word list. Comments begin with vertical bar. Each stop
|
||||||
|
| word is at the start of a line.
|
||||||
|
|
||||||
|
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||||
|
| a large text sample.
|
||||||
|
|
||||||
|
|
||||||
|
og | and
|
||||||
|
i | in
|
||||||
|
jeg | I
|
||||||
|
det | that (dem. pronoun)/it (pers. pronoun)
|
||||||
|
at | that (in front of a sentence)/to (with infinitive)
|
||||||
|
en | a/an
|
||||||
|
den | it (pers. pronoun)/that (dem. pronoun)
|
||||||
|
til | to/at/for/until/against/by/of/into, more
|
||||||
|
er | present tense of "to be"
|
||||||
|
som | who, as
|
||||||
|
på | on/upon/in/on/at/to/after/of/with/for, on
|
||||||
|
de | they
|
||||||
|
med | with/by/in, along
|
||||||
|
han | he
|
||||||
|
af | of/by/from/off/for/in/with/on, off
|
||||||
|
for | at/for/to/from/by/of/ago, in front/before, because
|
||||||
|
ikke | not
|
||||||
|
der | who/which, there/those
|
||||||
|
var | past tense of "to be"
|
||||||
|
mig | me/myself
|
||||||
|
sig | oneself/himself/herself/itself/themselves
|
||||||
|
men | but
|
||||||
|
et | a/an/one, one (number), someone/somebody/one
|
||||||
|
har | present tense of "to have"
|
||||||
|
om | round/about/for/in/a, about/around/down, if
|
||||||
|
vi | we
|
||||||
|
min | my
|
||||||
|
havde | past tense of "to have"
|
||||||
|
ham | him
|
||||||
|
hun | she
|
||||||
|
nu | now
|
||||||
|
over | over/above/across/by/beyond/past/on/about, over/past
|
||||||
|
da | then, when/as/since
|
||||||
|
fra | from/off/since, off, since
|
||||||
|
du | you
|
||||||
|
ud | out
|
||||||
|
sin | his/her/its/one's
|
||||||
|
dem | them
|
||||||
|
os | us/ourselves
|
||||||
|
op | up
|
||||||
|
man | you/one
|
||||||
|
hans | his
|
||||||
|
hvor | where
|
||||||
|
eller | or
|
||||||
|
hvad | what
|
||||||
|
skal | must/shall etc.
|
||||||
|
selv | myself/youself/herself/ourselves etc., even
|
||||||
|
her | here
|
||||||
|
alle | all/everyone/everybody etc.
|
||||||
|
vil | will (verb)
|
||||||
|
blev | past tense of "to stay/to remain/to get/to become"
|
||||||
|
kunne | could
|
||||||
|
ind | in
|
||||||
|
når | when
|
||||||
|
være | present tense of "to be"
|
||||||
|
dog | however/yet/after all
|
||||||
|
noget | something
|
||||||
|
ville | would
|
||||||
|
jo | you know/you see (adv), yes
|
||||||
|
deres | their/theirs
|
||||||
|
efter | after/behind/according to/for/by/from, later/afterwards
|
||||||
|
ned | down
|
||||||
|
skulle | should
|
||||||
|
denne | this
|
||||||
|
end | than
|
||||||
|
dette | this
|
||||||
|
mit | my/mine
|
||||||
|
også | also
|
||||||
|
under | under/beneath/below/during, below/underneath
|
||||||
|
have | have
|
||||||
|
dig | you
|
||||||
|
anden | other
|
||||||
|
hende | her
|
||||||
|
mine | my
|
||||||
|
alt | everything
|
||||||
|
meget | much/very, plenty of
|
||||||
|
sit | his, her, its, one's
|
||||||
|
sine | his, her, its, one's
|
||||||
|
vor | our
|
||||||
|
mod | against
|
||||||
|
disse | these
|
||||||
|
hvis | if
|
||||||
|
din | your/yours
|
||||||
|
nogle | some
|
||||||
|
hos | by/at
|
||||||
|
blive | be/become
|
||||||
|
mange | many
|
||||||
|
ad | by/through
|
||||||
|
bliver | present tense of "to be/to become"
|
||||||
|
hendes | her/hers
|
||||||
|
været | be
|
||||||
|
thi | for (conj)
|
||||||
|
jer | you
|
||||||
|
sådan | such, like this/like that
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(DanishStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/analyzer_de.go
generated
vendored
Normal file
59
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/analyzer_de.go
generated
vendored
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package de
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "de"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopDeFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: icuTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopDeFilter,
|
||||||
|
normalizeDeFilter,
|
||||||
|
stemmerDeFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
97
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/analyzer_de_test.go
generated
vendored
Normal file
97
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/analyzer_de_test.go
generated
vendored
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package de
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGermanAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: []byte("Tisch"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("tisch"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 5,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("Tische"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("tisch"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 6,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("Tischen"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("tisch"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 7,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// german specials
|
||||||
|
{
|
||||||
|
input: []byte("Schaltflächen"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("schaltflach"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("Schaltflaechen"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("schaltflach"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 14,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %v, got %v", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
94
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/german_normalize.go
generated
vendored
Normal file
94
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/german_normalize.go
generated
vendored
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package de
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const NormalizeName = "normalize_de"
|
||||||
|
|
||||||
|
const (
|
||||||
|
N = /* ordinary state */ 0
|
||||||
|
V = 1 /* stops 'u' from entering umlaut state */
|
||||||
|
U = 2 /* umlaut state, allows e-deletion */
|
||||||
|
)
|
||||||
|
|
||||||
|
type GermanNormalizeFilter struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
|
||||||
|
return &GermanNormalizeFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
term := normalize(token.Term)
|
||||||
|
token.Term = term
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalize(input []byte) []byte {
|
||||||
|
state := N
|
||||||
|
runes := bytes.Runes(input)
|
||||||
|
for i := 0; i < len(runes); i++ {
|
||||||
|
switch runes[i] {
|
||||||
|
case 'a', 'o':
|
||||||
|
state = U
|
||||||
|
case 'u':
|
||||||
|
if state == N {
|
||||||
|
state = U
|
||||||
|
} else {
|
||||||
|
state = V
|
||||||
|
}
|
||||||
|
case 'e':
|
||||||
|
if state == U {
|
||||||
|
runes = analysis.DeleteRune(runes, i)
|
||||||
|
i--
|
||||||
|
}
|
||||||
|
state = V
|
||||||
|
case 'i', 'q', 'y':
|
||||||
|
state = V
|
||||||
|
case 'ä':
|
||||||
|
runes[i] = 'a'
|
||||||
|
state = V
|
||||||
|
case 'ö':
|
||||||
|
runes[i] = 'o'
|
||||||
|
state = V
|
||||||
|
case 'ü':
|
||||||
|
runes[i] = 'u'
|
||||||
|
state = V
|
||||||
|
case 'ß':
|
||||||
|
runes[i] = 's'
|
||||||
|
i++
|
||||||
|
// newrunes := make([]rune, len(runes)+1)
|
||||||
|
// copy(newrunes, runes)
|
||||||
|
// runes = newrunes
|
||||||
|
// runes[i] = 's'
|
||||||
|
runes = analysis.InsertRune(runes, i, 's')
|
||||||
|
state = N
|
||||||
|
default:
|
||||||
|
state = N
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return analysis.BuildTermFromRunes(runes)
|
||||||
|
}
|
||||||
|
|
||||||
|
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewGermanNormalizeFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||||
|
}
|
98
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/german_normalize_test.go
generated
vendored
Normal file
98
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/german_normalize_test.go
generated
vendored
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package de
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGermanNormalizeFilter(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// Tests that a/o/u + e is equivalent to the umlaut form
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("Schaltflächen"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("Schaltflachen"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("Schaltflaechen"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("Schaltflachen"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Tests the specific heuristic that ue is not folded after a vowel or q.
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("dauer"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("dauer"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Tests german specific folding of sharp-s
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("weißbier"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("weissbier"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// empty
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
germanNormalizeFilter := NewGermanNormalizeFilter()
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := germanNormalizeFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||||
|
t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stemmer_de.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stemmer_de.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
|
||||||
|
package de
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StemmerName = "stemmer_de"
|
||||||
|
|
||||||
|
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return stemmer_filter.NewStemmerFilter("de")
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stop_filter_de.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stop_filter_de.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package de
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
318
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stop_words_de.go
generated
vendored
Normal file
318
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/de/stop_words_de.go
generated
vendored
Normal file
@ -0,0 +1,318 @@
|
|||||||
|
package de
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_de"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| A German stop word list. Comments begin with vertical bar. Each stop
|
||||||
|
| word is at the start of a line.
|
||||||
|
|
||||||
|
| The number of forms in this list is reduced significantly by passing it
|
||||||
|
| through the German stemmer.
|
||||||
|
|
||||||
|
|
||||||
|
aber | but
|
||||||
|
|
||||||
|
alle | all
|
||||||
|
allem
|
||||||
|
allen
|
||||||
|
aller
|
||||||
|
alles
|
||||||
|
|
||||||
|
als | than, as
|
||||||
|
also | so
|
||||||
|
am | an + dem
|
||||||
|
an | at
|
||||||
|
|
||||||
|
ander | other
|
||||||
|
andere
|
||||||
|
anderem
|
||||||
|
anderen
|
||||||
|
anderer
|
||||||
|
anderes
|
||||||
|
anderm
|
||||||
|
andern
|
||||||
|
anderr
|
||||||
|
anders
|
||||||
|
|
||||||
|
auch | also
|
||||||
|
auf | on
|
||||||
|
aus | out of
|
||||||
|
bei | by
|
||||||
|
bin | am
|
||||||
|
bis | until
|
||||||
|
bist | art
|
||||||
|
da | there
|
||||||
|
damit | with it
|
||||||
|
dann | then
|
||||||
|
|
||||||
|
der | the
|
||||||
|
den
|
||||||
|
des
|
||||||
|
dem
|
||||||
|
die
|
||||||
|
das
|
||||||
|
|
||||||
|
daß | that
|
||||||
|
|
||||||
|
derselbe | the same
|
||||||
|
derselben
|
||||||
|
denselben
|
||||||
|
desselben
|
||||||
|
demselben
|
||||||
|
dieselbe
|
||||||
|
dieselben
|
||||||
|
dasselbe
|
||||||
|
|
||||||
|
dazu | to that
|
||||||
|
|
||||||
|
dein | thy
|
||||||
|
deine
|
||||||
|
deinem
|
||||||
|
deinen
|
||||||
|
deiner
|
||||||
|
deines
|
||||||
|
|
||||||
|
denn | because
|
||||||
|
|
||||||
|
derer | of those
|
||||||
|
dessen | of him
|
||||||
|
|
||||||
|
dich | thee
|
||||||
|
dir | to thee
|
||||||
|
du | thou
|
||||||
|
|
||||||
|
dies | this
|
||||||
|
diese
|
||||||
|
diesem
|
||||||
|
diesen
|
||||||
|
dieser
|
||||||
|
dieses
|
||||||
|
|
||||||
|
|
||||||
|
doch | (several meanings)
|
||||||
|
dort | (over) there
|
||||||
|
|
||||||
|
|
||||||
|
durch | through
|
||||||
|
|
||||||
|
ein | a
|
||||||
|
eine
|
||||||
|
einem
|
||||||
|
einen
|
||||||
|
einer
|
||||||
|
eines
|
||||||
|
|
||||||
|
einig | some
|
||||||
|
einige
|
||||||
|
einigem
|
||||||
|
einigen
|
||||||
|
einiger
|
||||||
|
einiges
|
||||||
|
|
||||||
|
einmal | once
|
||||||
|
|
||||||
|
er | he
|
||||||
|
ihn | him
|
||||||
|
ihm | to him
|
||||||
|
|
||||||
|
es | it
|
||||||
|
etwas | something
|
||||||
|
|
||||||
|
euer | your
|
||||||
|
eure
|
||||||
|
eurem
|
||||||
|
euren
|
||||||
|
eurer
|
||||||
|
eures
|
||||||
|
|
||||||
|
für | for
|
||||||
|
gegen | towards
|
||||||
|
gewesen | p.p. of sein
|
||||||
|
hab | have
|
||||||
|
habe | have
|
||||||
|
haben | have
|
||||||
|
hat | has
|
||||||
|
hatte | had
|
||||||
|
hatten | had
|
||||||
|
hier | here
|
||||||
|
hin | there
|
||||||
|
hinter | behind
|
||||||
|
|
||||||
|
ich | I
|
||||||
|
mich | me
|
||||||
|
mir | to me
|
||||||
|
|
||||||
|
|
||||||
|
ihr | you, to her
|
||||||
|
ihre
|
||||||
|
ihrem
|
||||||
|
ihren
|
||||||
|
ihrer
|
||||||
|
ihres
|
||||||
|
euch | to you
|
||||||
|
|
||||||
|
im | in + dem
|
||||||
|
in | in
|
||||||
|
indem | while
|
||||||
|
ins | in + das
|
||||||
|
ist | is
|
||||||
|
|
||||||
|
jede | each, every
|
||||||
|
jedem
|
||||||
|
jeden
|
||||||
|
jeder
|
||||||
|
jedes
|
||||||
|
|
||||||
|
jene | that
|
||||||
|
jenem
|
||||||
|
jenen
|
||||||
|
jener
|
||||||
|
jenes
|
||||||
|
|
||||||
|
jetzt | now
|
||||||
|
kann | can
|
||||||
|
|
||||||
|
kein | no
|
||||||
|
keine
|
||||||
|
keinem
|
||||||
|
keinen
|
||||||
|
keiner
|
||||||
|
keines
|
||||||
|
|
||||||
|
können | can
|
||||||
|
könnte | could
|
||||||
|
machen | do
|
||||||
|
man | one
|
||||||
|
|
||||||
|
manche | some, many a
|
||||||
|
manchem
|
||||||
|
manchen
|
||||||
|
mancher
|
||||||
|
manches
|
||||||
|
|
||||||
|
mein | my
|
||||||
|
meine
|
||||||
|
meinem
|
||||||
|
meinen
|
||||||
|
meiner
|
||||||
|
meines
|
||||||
|
|
||||||
|
mit | with
|
||||||
|
muss | must
|
||||||
|
musste | had to
|
||||||
|
nach | to(wards)
|
||||||
|
nicht | not
|
||||||
|
nichts | nothing
|
||||||
|
noch | still, yet
|
||||||
|
nun | now
|
||||||
|
nur | only
|
||||||
|
ob | whether
|
||||||
|
oder | or
|
||||||
|
ohne | without
|
||||||
|
sehr | very
|
||||||
|
|
||||||
|
sein | his
|
||||||
|
seine
|
||||||
|
seinem
|
||||||
|
seinen
|
||||||
|
seiner
|
||||||
|
seines
|
||||||
|
|
||||||
|
selbst | self
|
||||||
|
sich | herself
|
||||||
|
|
||||||
|
sie | they, she
|
||||||
|
ihnen | to them
|
||||||
|
|
||||||
|
sind | are
|
||||||
|
so | so
|
||||||
|
|
||||||
|
solche | such
|
||||||
|
solchem
|
||||||
|
solchen
|
||||||
|
solcher
|
||||||
|
solches
|
||||||
|
|
||||||
|
soll | shall
|
||||||
|
sollte | should
|
||||||
|
sondern | but
|
||||||
|
sonst | else
|
||||||
|
über | over
|
||||||
|
um | about, around
|
||||||
|
und | and
|
||||||
|
|
||||||
|
uns | us
|
||||||
|
unse
|
||||||
|
unsem
|
||||||
|
unsen
|
||||||
|
unser
|
||||||
|
unses
|
||||||
|
|
||||||
|
unter | under
|
||||||
|
viel | much
|
||||||
|
vom | von + dem
|
||||||
|
von | from
|
||||||
|
vor | before
|
||||||
|
während | while
|
||||||
|
war | was
|
||||||
|
waren | were
|
||||||
|
warst | wast
|
||||||
|
was | what
|
||||||
|
weg | away, off
|
||||||
|
weil | because
|
||||||
|
weiter | further
|
||||||
|
|
||||||
|
welche | which
|
||||||
|
welchem
|
||||||
|
welchen
|
||||||
|
welcher
|
||||||
|
welches
|
||||||
|
|
||||||
|
wenn | when
|
||||||
|
werde | will
|
||||||
|
werden | will
|
||||||
|
wie | how
|
||||||
|
wieder | again
|
||||||
|
will | want
|
||||||
|
wir | we
|
||||||
|
wird | will
|
||||||
|
wirst | willst
|
||||||
|
wo | where
|
||||||
|
wollen | want
|
||||||
|
wollte | wanted
|
||||||
|
würde | would
|
||||||
|
würden | would
|
||||||
|
zu | to
|
||||||
|
zum | zu + dem
|
||||||
|
zur | zu + der
|
||||||
|
zwar | indeed
|
||||||
|
zwischen | between
|
||||||
|
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(GermanStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/el/stop_filter_el.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/el/stop_filter_el.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package el
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
102
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/el/stop_words_el.go
generated
vendored
Normal file
102
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/el/stop_words_el.go
generated
vendored
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
package el
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_el"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var GreekStopWords = []byte(`# Lucene Greek Stopwords list
|
||||||
|
# Note: by default this file is used after GreekLowerCaseFilter,
|
||||||
|
# so when modifying this file use 'σ' instead of 'ς'
|
||||||
|
ο
|
||||||
|
η
|
||||||
|
το
|
||||||
|
οι
|
||||||
|
τα
|
||||||
|
του
|
||||||
|
τησ
|
||||||
|
των
|
||||||
|
τον
|
||||||
|
την
|
||||||
|
και
|
||||||
|
κι
|
||||||
|
κ
|
||||||
|
ειμαι
|
||||||
|
εισαι
|
||||||
|
ειναι
|
||||||
|
ειμαστε
|
||||||
|
ειστε
|
||||||
|
στο
|
||||||
|
στον
|
||||||
|
στη
|
||||||
|
στην
|
||||||
|
μα
|
||||||
|
αλλα
|
||||||
|
απο
|
||||||
|
για
|
||||||
|
προσ
|
||||||
|
με
|
||||||
|
σε
|
||||||
|
ωσ
|
||||||
|
παρα
|
||||||
|
αντι
|
||||||
|
κατα
|
||||||
|
μετα
|
||||||
|
θα
|
||||||
|
να
|
||||||
|
δε
|
||||||
|
δεν
|
||||||
|
μη
|
||||||
|
μην
|
||||||
|
επι
|
||||||
|
ενω
|
||||||
|
εαν
|
||||||
|
αν
|
||||||
|
τοτε
|
||||||
|
που
|
||||||
|
πωσ
|
||||||
|
ποιοσ
|
||||||
|
ποια
|
||||||
|
ποιο
|
||||||
|
ποιοι
|
||||||
|
ποιεσ
|
||||||
|
ποιων
|
||||||
|
ποιουσ
|
||||||
|
αυτοσ
|
||||||
|
αυτη
|
||||||
|
αυτο
|
||||||
|
αυτοι
|
||||||
|
αυτων
|
||||||
|
αυτουσ
|
||||||
|
αυτεσ
|
||||||
|
αυτα
|
||||||
|
εκεινοσ
|
||||||
|
εκεινη
|
||||||
|
εκεινο
|
||||||
|
εκεινοι
|
||||||
|
εκεινεσ
|
||||||
|
εκεινα
|
||||||
|
εκεινων
|
||||||
|
εκεινουσ
|
||||||
|
οπωσ
|
||||||
|
ομωσ
|
||||||
|
ισωσ
|
||||||
|
οσο
|
||||||
|
οτι
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(GreekStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
57
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/analyzer_en.go
generated
vendored
Normal file
57
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/analyzer_en.go
generated
vendored
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package en
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/porter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "en"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
possEnFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopEnFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerEnFilter, err := cache.TokenFilterNamed(porter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: tokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
possEnFilter,
|
||||||
|
toLowerFilter,
|
||||||
|
stopEnFilter,
|
||||||
|
stemmerEnFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
100
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/analyzer_en_test.go
generated
vendored
Normal file
100
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/analyzer_en_test.go
generated
vendored
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package en
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEnglishAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("books"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("book"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 5,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("book"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("book"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 4,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word removal
|
||||||
|
{
|
||||||
|
input: []byte("the"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
// possessive removal
|
||||||
|
{
|
||||||
|
input: []byte("steven's"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("steven"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 8,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("steven\u2019s"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("steven"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 10,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("steven\uFF07s"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("steven"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 10,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %v, got %v", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
57
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/possessive_filter_en.go
generated
vendored
Normal file
57
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/possessive_filter_en.go
generated
vendored
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package en
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const PossessiveName = "possessive_en"
|
||||||
|
|
||||||
|
const rightSingleQuotationMark = '’'
|
||||||
|
const apostrophe = '\''
|
||||||
|
const fullWidthApostrophe = '''
|
||||||
|
|
||||||
|
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
|
||||||
|
|
||||||
|
type PossessiveFilter struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewPossessiveFilter() *PossessiveFilter {
|
||||||
|
return &PossessiveFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
runes := bytes.Runes(token.Term)
|
||||||
|
if len(runes) >= 2 {
|
||||||
|
secondToLastRune := runes[len(runes)-2]
|
||||||
|
lastRune := runes[len(runes)-1]
|
||||||
|
if (secondToLastRune == rightSingleQuotationMark ||
|
||||||
|
secondToLastRune == apostrophe ||
|
||||||
|
secondToLastRune == fullWidthApostrophe) &&
|
||||||
|
(lastRune == 's' || lastRune == 'S') {
|
||||||
|
token.Term = analysis.TruncateRunes(token.Term, 2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func PossessiveFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewPossessiveFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor)
|
||||||
|
}
|
86
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/possessive_filter_en_test.go
generated
vendored
Normal file
86
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/possessive_filter_en_test.go
generated
vendored
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package en
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEnglishPossessiveFilter(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty's"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY'S"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty’s"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY’S"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty's"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY'S"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("m"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("m"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := stemmerFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %s, got %s", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stemmer_en.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stemmer_en.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
|
||||||
|
package en
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StemmerName = "stemmer_en"
|
||||||
|
|
||||||
|
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return stemmer_filter.NewStemmerFilter("en")
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||||
|
}
|
72
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stemmer_en_test.go
generated
vendored
Normal file
72
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stemmer_en_test.go
generated
vendored
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
|
||||||
|
package en
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEnglishStemmer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("walking"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("talked"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("business"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("protected"),
|
||||||
|
KeyWord: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("walk"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("talk"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("busi"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("protected"),
|
||||||
|
KeyWord: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := stemmerFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %s, got %s", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stop_filter_en.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stop_filter_en.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package en
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
343
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stop_words_en.go
generated
vendored
Normal file
343
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/en/stop_words_en.go
generated
vendored
Normal file
@ -0,0 +1,343 @@
|
|||||||
|
package en
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_en"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| An English stop word list. Comments begin with vertical bar. Each stop
|
||||||
|
| word is at the start of a line.
|
||||||
|
|
||||||
|
| Many of the forms below are quite rare (e.g. "yourselves") but included for
|
||||||
|
| completeness.
|
||||||
|
|
||||||
|
| PRONOUNS FORMS
|
||||||
|
| 1st person sing
|
||||||
|
|
||||||
|
i | subject, always in upper case of course
|
||||||
|
|
||||||
|
me | object
|
||||||
|
my | possessive adjective
|
||||||
|
| the possessive pronoun 'mine' is best suppressed, because of the
|
||||||
|
| sense of coal-mine etc.
|
||||||
|
myself | reflexive
|
||||||
|
| 1st person plural
|
||||||
|
we | subject
|
||||||
|
|
||||||
|
| us | object
|
||||||
|
| care is required here because US = United States. It is usually
|
||||||
|
| safe to remove it if it is in lower case.
|
||||||
|
our | possessive adjective
|
||||||
|
ours | possessive pronoun
|
||||||
|
ourselves | reflexive
|
||||||
|
| second person (archaic 'thou' forms not included)
|
||||||
|
you | subject and object
|
||||||
|
your | possessive adjective
|
||||||
|
yours | possessive pronoun
|
||||||
|
yourself | reflexive (singular)
|
||||||
|
yourselves | reflexive (plural)
|
||||||
|
| third person singular
|
||||||
|
he | subject
|
||||||
|
him | object
|
||||||
|
his | possessive adjective and pronoun
|
||||||
|
himself | reflexive
|
||||||
|
|
||||||
|
she | subject
|
||||||
|
her | object and possessive adjective
|
||||||
|
hers | possessive pronoun
|
||||||
|
herself | reflexive
|
||||||
|
|
||||||
|
it | subject and object
|
||||||
|
its | possessive adjective
|
||||||
|
itself | reflexive
|
||||||
|
| third person plural
|
||||||
|
they | subject
|
||||||
|
them | object
|
||||||
|
their | possessive adjective
|
||||||
|
theirs | possessive pronoun
|
||||||
|
themselves | reflexive
|
||||||
|
| other forms (demonstratives, interrogatives)
|
||||||
|
what
|
||||||
|
which
|
||||||
|
who
|
||||||
|
whom
|
||||||
|
this
|
||||||
|
that
|
||||||
|
these
|
||||||
|
those
|
||||||
|
|
||||||
|
| VERB FORMS (using F.R. Palmer's nomenclature)
|
||||||
|
| BE
|
||||||
|
am | 1st person, present
|
||||||
|
is | -s form (3rd person, present)
|
||||||
|
are | present
|
||||||
|
was | 1st person, past
|
||||||
|
were | past
|
||||||
|
be | infinitive
|
||||||
|
been | past participle
|
||||||
|
being | -ing form
|
||||||
|
| HAVE
|
||||||
|
have | simple
|
||||||
|
has | -s form
|
||||||
|
had | past
|
||||||
|
having | -ing form
|
||||||
|
| DO
|
||||||
|
do | simple
|
||||||
|
does | -s form
|
||||||
|
did | past
|
||||||
|
doing | -ing form
|
||||||
|
|
||||||
|
| The forms below are, I believe, best omitted, because of the significant
|
||||||
|
| homonym forms:
|
||||||
|
|
||||||
|
| He made a WILL
|
||||||
|
| old tin CAN
|
||||||
|
| merry month of MAY
|
||||||
|
| a smell of MUST
|
||||||
|
| fight the good fight with all thy MIGHT
|
||||||
|
|
||||||
|
| would, could, should, ought might however be included
|
||||||
|
|
||||||
|
| | AUXILIARIES
|
||||||
|
| | WILL
|
||||||
|
|will
|
||||||
|
|
||||||
|
would
|
||||||
|
|
||||||
|
| | SHALL
|
||||||
|
|shall
|
||||||
|
|
||||||
|
should
|
||||||
|
|
||||||
|
| | CAN
|
||||||
|
|can
|
||||||
|
|
||||||
|
could
|
||||||
|
|
||||||
|
| | MAY
|
||||||
|
|may
|
||||||
|
|might
|
||||||
|
| | MUST
|
||||||
|
|must
|
||||||
|
| | OUGHT
|
||||||
|
|
||||||
|
ought
|
||||||
|
|
||||||
|
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
|
||||||
|
| pronoun + verb
|
||||||
|
|
||||||
|
i'm
|
||||||
|
you're
|
||||||
|
he's
|
||||||
|
she's
|
||||||
|
it's
|
||||||
|
we're
|
||||||
|
they're
|
||||||
|
i've
|
||||||
|
you've
|
||||||
|
we've
|
||||||
|
they've
|
||||||
|
i'd
|
||||||
|
you'd
|
||||||
|
he'd
|
||||||
|
she'd
|
||||||
|
we'd
|
||||||
|
they'd
|
||||||
|
i'll
|
||||||
|
you'll
|
||||||
|
he'll
|
||||||
|
she'll
|
||||||
|
we'll
|
||||||
|
they'll
|
||||||
|
|
||||||
|
| verb + negation
|
||||||
|
|
||||||
|
isn't
|
||||||
|
aren't
|
||||||
|
wasn't
|
||||||
|
weren't
|
||||||
|
hasn't
|
||||||
|
haven't
|
||||||
|
hadn't
|
||||||
|
doesn't
|
||||||
|
don't
|
||||||
|
didn't
|
||||||
|
|
||||||
|
| auxiliary + negation
|
||||||
|
|
||||||
|
won't
|
||||||
|
wouldn't
|
||||||
|
shan't
|
||||||
|
shouldn't
|
||||||
|
can't
|
||||||
|
cannot
|
||||||
|
couldn't
|
||||||
|
mustn't
|
||||||
|
|
||||||
|
| miscellaneous forms
|
||||||
|
|
||||||
|
let's
|
||||||
|
that's
|
||||||
|
who's
|
||||||
|
what's
|
||||||
|
here's
|
||||||
|
there's
|
||||||
|
when's
|
||||||
|
where's
|
||||||
|
why's
|
||||||
|
how's
|
||||||
|
|
||||||
|
| rarer forms
|
||||||
|
|
||||||
|
| daren't needn't
|
||||||
|
|
||||||
|
| doubtful forms
|
||||||
|
|
||||||
|
| oughtn't mightn't
|
||||||
|
|
||||||
|
| ARTICLES
|
||||||
|
a
|
||||||
|
an
|
||||||
|
the
|
||||||
|
|
||||||
|
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
|
||||||
|
| high, that classification is pointless.)
|
||||||
|
and
|
||||||
|
but
|
||||||
|
if
|
||||||
|
or
|
||||||
|
because
|
||||||
|
as
|
||||||
|
until
|
||||||
|
while
|
||||||
|
|
||||||
|
of
|
||||||
|
at
|
||||||
|
by
|
||||||
|
for
|
||||||
|
with
|
||||||
|
about
|
||||||
|
against
|
||||||
|
between
|
||||||
|
into
|
||||||
|
through
|
||||||
|
during
|
||||||
|
before
|
||||||
|
after
|
||||||
|
above
|
||||||
|
below
|
||||||
|
to
|
||||||
|
from
|
||||||
|
up
|
||||||
|
down
|
||||||
|
in
|
||||||
|
out
|
||||||
|
on
|
||||||
|
off
|
||||||
|
over
|
||||||
|
under
|
||||||
|
|
||||||
|
again
|
||||||
|
further
|
||||||
|
then
|
||||||
|
once
|
||||||
|
|
||||||
|
here
|
||||||
|
there
|
||||||
|
when
|
||||||
|
where
|
||||||
|
why
|
||||||
|
how
|
||||||
|
|
||||||
|
all
|
||||||
|
any
|
||||||
|
both
|
||||||
|
each
|
||||||
|
few
|
||||||
|
more
|
||||||
|
most
|
||||||
|
other
|
||||||
|
some
|
||||||
|
such
|
||||||
|
|
||||||
|
no
|
||||||
|
nor
|
||||||
|
not
|
||||||
|
only
|
||||||
|
own
|
||||||
|
same
|
||||||
|
so
|
||||||
|
than
|
||||||
|
too
|
||||||
|
very
|
||||||
|
|
||||||
|
| Just for the record, the following words are among the commonest in English
|
||||||
|
|
||||||
|
| one
|
||||||
|
| every
|
||||||
|
| least
|
||||||
|
| less
|
||||||
|
| many
|
||||||
|
| now
|
||||||
|
| ever
|
||||||
|
| never
|
||||||
|
| say
|
||||||
|
| says
|
||||||
|
| said
|
||||||
|
| also
|
||||||
|
| get
|
||||||
|
| go
|
||||||
|
| goes
|
||||||
|
| just
|
||||||
|
| made
|
||||||
|
| make
|
||||||
|
| put
|
||||||
|
| see
|
||||||
|
| seen
|
||||||
|
| whether
|
||||||
|
| like
|
||||||
|
| well
|
||||||
|
| back
|
||||||
|
| even
|
||||||
|
| still
|
||||||
|
| way
|
||||||
|
| take
|
||||||
|
| since
|
||||||
|
| another
|
||||||
|
| however
|
||||||
|
| two
|
||||||
|
| three
|
||||||
|
| four
|
||||||
|
| five
|
||||||
|
| first
|
||||||
|
| second
|
||||||
|
| new
|
||||||
|
| old
|
||||||
|
| high
|
||||||
|
| long
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(EnglishStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/analyzer_es.go
generated
vendored
Normal file
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/analyzer_es.go
generated
vendored
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package es
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "es"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopEsFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerEsFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: icuTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopEsFilter,
|
||||||
|
stemmerEsFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
64
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/analyzer_es_test.go
generated
vendored
Normal file
64
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/analyzer_es_test.go
generated
vendored
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package es
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestSpanishAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("chicana"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chican"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 7,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("chicano"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chican"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 7,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %v, got %v", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stemmer_es.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stemmer_es.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
|
||||||
|
package es
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StemmerName = "stemmer_es"
|
||||||
|
|
||||||
|
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return stemmer_filter.NewStemmerFilter("es")
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stop_filter_es.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stop_filter_es.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package es
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
380
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stop_words_es.go
generated
vendored
Normal file
380
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/es/stop_words_es.go
generated
vendored
Normal file
@ -0,0 +1,380 @@
|
|||||||
|
package es
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_es"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var SpanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| A Spanish stop word list. Comments begin with vertical bar. Each stop
|
||||||
|
| word is at the start of a line.
|
||||||
|
|
||||||
|
|
||||||
|
| The following is a ranked list (commonest to rarest) of stopwords
|
||||||
|
| deriving from a large sample of text.
|
||||||
|
|
||||||
|
| Extra words have been added at the end.
|
||||||
|
|
||||||
|
de | from, of
|
||||||
|
la | the, her
|
||||||
|
que | who, that
|
||||||
|
el | the
|
||||||
|
en | in
|
||||||
|
y | and
|
||||||
|
a | to
|
||||||
|
los | the, them
|
||||||
|
del | de + el
|
||||||
|
se | himself, from him etc
|
||||||
|
las | the, them
|
||||||
|
por | for, by, etc
|
||||||
|
un | a
|
||||||
|
para | for
|
||||||
|
con | with
|
||||||
|
no | no
|
||||||
|
una | a
|
||||||
|
su | his, her
|
||||||
|
al | a + el
|
||||||
|
| es from SER
|
||||||
|
lo | him
|
||||||
|
como | how
|
||||||
|
más | more
|
||||||
|
pero | pero
|
||||||
|
sus | su plural
|
||||||
|
le | to him, her
|
||||||
|
ya | already
|
||||||
|
o | or
|
||||||
|
| fue from SER
|
||||||
|
este | this
|
||||||
|
| ha from HABER
|
||||||
|
sí | himself etc
|
||||||
|
porque | because
|
||||||
|
esta | this
|
||||||
|
| son from SER
|
||||||
|
entre | between
|
||||||
|
| está from ESTAR
|
||||||
|
cuando | when
|
||||||
|
muy | very
|
||||||
|
sin | without
|
||||||
|
sobre | on
|
||||||
|
| ser from SER
|
||||||
|
| tiene from TENER
|
||||||
|
también | also
|
||||||
|
me | me
|
||||||
|
hasta | until
|
||||||
|
hay | there is/are
|
||||||
|
donde | where
|
||||||
|
| han from HABER
|
||||||
|
quien | whom, that
|
||||||
|
| están from ESTAR
|
||||||
|
| estado from ESTAR
|
||||||
|
desde | from
|
||||||
|
todo | all
|
||||||
|
nos | us
|
||||||
|
durante | during
|
||||||
|
| estados from ESTAR
|
||||||
|
todos | all
|
||||||
|
uno | a
|
||||||
|
les | to them
|
||||||
|
ni | nor
|
||||||
|
contra | against
|
||||||
|
otros | other
|
||||||
|
| fueron from SER
|
||||||
|
ese | that
|
||||||
|
eso | that
|
||||||
|
| había from HABER
|
||||||
|
ante | before
|
||||||
|
ellos | they
|
||||||
|
e | and (variant of y)
|
||||||
|
esto | this
|
||||||
|
mí | me
|
||||||
|
antes | before
|
||||||
|
algunos | some
|
||||||
|
qué | what?
|
||||||
|
unos | a
|
||||||
|
yo | I
|
||||||
|
otro | other
|
||||||
|
otras | other
|
||||||
|
otra | other
|
||||||
|
él | he
|
||||||
|
tanto | so much, many
|
||||||
|
esa | that
|
||||||
|
estos | these
|
||||||
|
mucho | much, many
|
||||||
|
quienes | who
|
||||||
|
nada | nothing
|
||||||
|
muchos | many
|
||||||
|
cual | who
|
||||||
|
| sea from SER
|
||||||
|
poco | few
|
||||||
|
ella | she
|
||||||
|
estar | to be
|
||||||
|
| haber from HABER
|
||||||
|
estas | these
|
||||||
|
| estaba from ESTAR
|
||||||
|
| estamos from ESTAR
|
||||||
|
algunas | some
|
||||||
|
algo | something
|
||||||
|
nosotros | we
|
||||||
|
|
||||||
|
| other forms
|
||||||
|
|
||||||
|
mi | me
|
||||||
|
mis | mi plural
|
||||||
|
tú | thou
|
||||||
|
te | thee
|
||||||
|
ti | thee
|
||||||
|
tu | thy
|
||||||
|
tus | tu plural
|
||||||
|
ellas | they
|
||||||
|
nosotras | we
|
||||||
|
vosotros | you
|
||||||
|
vosotras | you
|
||||||
|
os | you
|
||||||
|
mío | mine
|
||||||
|
mía |
|
||||||
|
míos |
|
||||||
|
mías |
|
||||||
|
tuyo | thine
|
||||||
|
tuya |
|
||||||
|
tuyos |
|
||||||
|
tuyas |
|
||||||
|
suyo | his, hers, theirs
|
||||||
|
suya |
|
||||||
|
suyos |
|
||||||
|
suyas |
|
||||||
|
nuestro | ours
|
||||||
|
nuestra |
|
||||||
|
nuestros |
|
||||||
|
nuestras |
|
||||||
|
vuestro | yours
|
||||||
|
vuestra |
|
||||||
|
vuestros |
|
||||||
|
vuestras |
|
||||||
|
esos | those
|
||||||
|
esas | those
|
||||||
|
|
||||||
|
| forms of estar, to be (not including the infinitive):
|
||||||
|
estoy
|
||||||
|
estás
|
||||||
|
está
|
||||||
|
estamos
|
||||||
|
estáis
|
||||||
|
están
|
||||||
|
esté
|
||||||
|
estés
|
||||||
|
estemos
|
||||||
|
estéis
|
||||||
|
estén
|
||||||
|
estaré
|
||||||
|
estarás
|
||||||
|
estará
|
||||||
|
estaremos
|
||||||
|
estaréis
|
||||||
|
estarán
|
||||||
|
estaría
|
||||||
|
estarías
|
||||||
|
estaríamos
|
||||||
|
estaríais
|
||||||
|
estarían
|
||||||
|
estaba
|
||||||
|
estabas
|
||||||
|
estábamos
|
||||||
|
estabais
|
||||||
|
estaban
|
||||||
|
estuve
|
||||||
|
estuviste
|
||||||
|
estuvo
|
||||||
|
estuvimos
|
||||||
|
estuvisteis
|
||||||
|
estuvieron
|
||||||
|
estuviera
|
||||||
|
estuvieras
|
||||||
|
estuviéramos
|
||||||
|
estuvierais
|
||||||
|
estuvieran
|
||||||
|
estuviese
|
||||||
|
estuvieses
|
||||||
|
estuviésemos
|
||||||
|
estuvieseis
|
||||||
|
estuviesen
|
||||||
|
estando
|
||||||
|
estado
|
||||||
|
estada
|
||||||
|
estados
|
||||||
|
estadas
|
||||||
|
estad
|
||||||
|
|
||||||
|
| forms of haber, to have (not including the infinitive):
|
||||||
|
he
|
||||||
|
has
|
||||||
|
ha
|
||||||
|
hemos
|
||||||
|
habéis
|
||||||
|
han
|
||||||
|
haya
|
||||||
|
hayas
|
||||||
|
hayamos
|
||||||
|
hayáis
|
||||||
|
hayan
|
||||||
|
habré
|
||||||
|
habrás
|
||||||
|
habrá
|
||||||
|
habremos
|
||||||
|
habréis
|
||||||
|
habrán
|
||||||
|
habría
|
||||||
|
habrías
|
||||||
|
habríamos
|
||||||
|
habríais
|
||||||
|
habrían
|
||||||
|
había
|
||||||
|
habías
|
||||||
|
habíamos
|
||||||
|
habíais
|
||||||
|
habían
|
||||||
|
hube
|
||||||
|
hubiste
|
||||||
|
hubo
|
||||||
|
hubimos
|
||||||
|
hubisteis
|
||||||
|
hubieron
|
||||||
|
hubiera
|
||||||
|
hubieras
|
||||||
|
hubiéramos
|
||||||
|
hubierais
|
||||||
|
hubieran
|
||||||
|
hubiese
|
||||||
|
hubieses
|
||||||
|
hubiésemos
|
||||||
|
hubieseis
|
||||||
|
hubiesen
|
||||||
|
habiendo
|
||||||
|
habido
|
||||||
|
habida
|
||||||
|
habidos
|
||||||
|
habidas
|
||||||
|
|
||||||
|
| forms of ser, to be (not including the infinitive):
|
||||||
|
soy
|
||||||
|
eres
|
||||||
|
es
|
||||||
|
somos
|
||||||
|
sois
|
||||||
|
son
|
||||||
|
sea
|
||||||
|
seas
|
||||||
|
seamos
|
||||||
|
seáis
|
||||||
|
sean
|
||||||
|
seré
|
||||||
|
serás
|
||||||
|
será
|
||||||
|
seremos
|
||||||
|
seréis
|
||||||
|
serán
|
||||||
|
sería
|
||||||
|
serías
|
||||||
|
seríamos
|
||||||
|
seríais
|
||||||
|
serían
|
||||||
|
era
|
||||||
|
eras
|
||||||
|
éramos
|
||||||
|
erais
|
||||||
|
eran
|
||||||
|
fui
|
||||||
|
fuiste
|
||||||
|
fue
|
||||||
|
fuimos
|
||||||
|
fuisteis
|
||||||
|
fueron
|
||||||
|
fuera
|
||||||
|
fueras
|
||||||
|
fuéramos
|
||||||
|
fuerais
|
||||||
|
fueran
|
||||||
|
fuese
|
||||||
|
fueses
|
||||||
|
fuésemos
|
||||||
|
fueseis
|
||||||
|
fuesen
|
||||||
|
siendo
|
||||||
|
sido
|
||||||
|
| sed also means 'thirst'
|
||||||
|
|
||||||
|
| forms of tener, to have (not including the infinitive):
|
||||||
|
tengo
|
||||||
|
tienes
|
||||||
|
tiene
|
||||||
|
tenemos
|
||||||
|
tenéis
|
||||||
|
tienen
|
||||||
|
tenga
|
||||||
|
tengas
|
||||||
|
tengamos
|
||||||
|
tengáis
|
||||||
|
tengan
|
||||||
|
tendré
|
||||||
|
tendrás
|
||||||
|
tendrá
|
||||||
|
tendremos
|
||||||
|
tendréis
|
||||||
|
tendrán
|
||||||
|
tendría
|
||||||
|
tendrías
|
||||||
|
tendríamos
|
||||||
|
tendríais
|
||||||
|
tendrían
|
||||||
|
tenía
|
||||||
|
tenías
|
||||||
|
teníamos
|
||||||
|
teníais
|
||||||
|
tenían
|
||||||
|
tuve
|
||||||
|
tuviste
|
||||||
|
tuvo
|
||||||
|
tuvimos
|
||||||
|
tuvisteis
|
||||||
|
tuvieron
|
||||||
|
tuviera
|
||||||
|
tuvieras
|
||||||
|
tuviéramos
|
||||||
|
tuvierais
|
||||||
|
tuvieran
|
||||||
|
tuviese
|
||||||
|
tuvieses
|
||||||
|
tuviésemos
|
||||||
|
tuvieseis
|
||||||
|
tuviesen
|
||||||
|
teniendo
|
||||||
|
tenido
|
||||||
|
tenida
|
||||||
|
tenidos
|
||||||
|
tenidas
|
||||||
|
tened
|
||||||
|
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(SpanishStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/eu/stop_filter_eu.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/eu/stop_filter_eu.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package eu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
123
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/eu/stop_words_eu.go
generated
vendored
Normal file
123
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/eu/stop_words_eu.go
generated
vendored
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
package eu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_eu"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var BasqueStopWords = []byte(`# example set of basque stopwords
|
||||||
|
al
|
||||||
|
anitz
|
||||||
|
arabera
|
||||||
|
asko
|
||||||
|
baina
|
||||||
|
bat
|
||||||
|
batean
|
||||||
|
batek
|
||||||
|
bati
|
||||||
|
batzuei
|
||||||
|
batzuek
|
||||||
|
batzuetan
|
||||||
|
batzuk
|
||||||
|
bera
|
||||||
|
beraiek
|
||||||
|
berau
|
||||||
|
berauek
|
||||||
|
bere
|
||||||
|
berori
|
||||||
|
beroriek
|
||||||
|
beste
|
||||||
|
bezala
|
||||||
|
da
|
||||||
|
dago
|
||||||
|
dira
|
||||||
|
ditu
|
||||||
|
du
|
||||||
|
dute
|
||||||
|
edo
|
||||||
|
egin
|
||||||
|
ere
|
||||||
|
eta
|
||||||
|
eurak
|
||||||
|
ez
|
||||||
|
gainera
|
||||||
|
gu
|
||||||
|
gutxi
|
||||||
|
guzti
|
||||||
|
haiei
|
||||||
|
haiek
|
||||||
|
haietan
|
||||||
|
hainbeste
|
||||||
|
hala
|
||||||
|
han
|
||||||
|
handik
|
||||||
|
hango
|
||||||
|
hara
|
||||||
|
hari
|
||||||
|
hark
|
||||||
|
hartan
|
||||||
|
hau
|
||||||
|
hauei
|
||||||
|
hauek
|
||||||
|
hauetan
|
||||||
|
hemen
|
||||||
|
hemendik
|
||||||
|
hemengo
|
||||||
|
hi
|
||||||
|
hona
|
||||||
|
honek
|
||||||
|
honela
|
||||||
|
honetan
|
||||||
|
honi
|
||||||
|
hor
|
||||||
|
hori
|
||||||
|
horiei
|
||||||
|
horiek
|
||||||
|
horietan
|
||||||
|
horko
|
||||||
|
horra
|
||||||
|
horrek
|
||||||
|
horrela
|
||||||
|
horretan
|
||||||
|
horri
|
||||||
|
hortik
|
||||||
|
hura
|
||||||
|
izan
|
||||||
|
ni
|
||||||
|
noiz
|
||||||
|
nola
|
||||||
|
non
|
||||||
|
nondik
|
||||||
|
nongo
|
||||||
|
nor
|
||||||
|
nora
|
||||||
|
ze
|
||||||
|
zein
|
||||||
|
zen
|
||||||
|
zenbait
|
||||||
|
zenbat
|
||||||
|
zer
|
||||||
|
zergatik
|
||||||
|
ziren
|
||||||
|
zituen
|
||||||
|
zu
|
||||||
|
zuek
|
||||||
|
zuen
|
||||||
|
zuten
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(BasqueStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
67
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/analyzer_fa.go
generated
vendored
Normal file
67
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/analyzer_fa.go
generated
vendored
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package fa
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/ar"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "fa"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
zFilter, err := cache.CharFilterNamed(zero_width_non_joiner.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
normFaFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopFaFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
CharFilters: []analysis.CharFilter{
|
||||||
|
zFilter,
|
||||||
|
},
|
||||||
|
Tokenizer: icuTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
normArFilter,
|
||||||
|
normFaFilter,
|
||||||
|
stopFaFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
681
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/analyzer_fa_test.go
generated
vendored
Normal file
681
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/analyzer_fa_test.go
generated
vendored
Normal file
@ -0,0 +1,681 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package fa
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestPersianAnalyzerVerbs(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// active present indicative
|
||||||
|
{
|
||||||
|
input: []byte("میخورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active preterite indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective preterite indicative
|
||||||
|
{
|
||||||
|
input: []byte("میخورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active future indicative
|
||||||
|
{
|
||||||
|
input: []byte("خواهد خورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active present progressive indicative
|
||||||
|
{
|
||||||
|
input: []byte("دارد میخورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active preterite progressive indicative
|
||||||
|
{
|
||||||
|
input: []byte("داشت میخورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active perfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خوردهاست"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective perfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("میخوردهاست"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active pluperfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده بود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective pluperfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("میخورده بود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active preterite subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective preterite subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("میخورده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active pluperfect subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective pluperfect subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("میخورده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive present indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده میشود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive preterite indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده شد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective preterite indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده میشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive perfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده شدهاست"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective perfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده میشدهاست"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive pluperfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده شده بود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective pluperfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده میشده بود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive future indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده خواهد شد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive present progressive indicative
|
||||||
|
{
|
||||||
|
input: []byte("دارد خورده میشود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive preterite progressive indicative
|
||||||
|
{
|
||||||
|
input: []byte("داشت خورده میشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive present subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده شود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive preterite subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده شده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective preterite subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده میشده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive pluperfect subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده شده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective pluperfect subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده میشده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active present subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("بخورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("بخورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPersianAnalyzerVerbsDefective(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// active present indicative
|
||||||
|
{
|
||||||
|
input: []byte("مي خورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active preterite indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective preterite indicative
|
||||||
|
{
|
||||||
|
input: []byte("مي خورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active future indicative
|
||||||
|
{
|
||||||
|
input: []byte("خواهد خورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active present progressive indicative
|
||||||
|
{
|
||||||
|
input: []byte("دارد مي خورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active preterite progressive indicative
|
||||||
|
{
|
||||||
|
input: []byte("داشت مي خورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active perfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده است"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective perfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("مي خورده است"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active pluperfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده بود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective pluperfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("مي خورده بود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active preterite subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective preterite subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("مي خورده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active pluperfect subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active imperfective pluperfect subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("مي خورده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive present indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده مي شود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive preterite indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده شد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective preterite indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده مي شد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive perfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده شده است"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective perfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده مي شده است"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive pluperfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده شده بود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective pluperfect indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده مي شده بود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive future indicative
|
||||||
|
{
|
||||||
|
input: []byte("خورده خواهد شد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive present progressive indicative
|
||||||
|
{
|
||||||
|
input: []byte("دارد خورده مي شود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive preterite progressive indicative
|
||||||
|
{
|
||||||
|
input: []byte("داشت خورده مي شد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive present subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده شود"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive preterite subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده شده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective preterite subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده مي شده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive pluperfect subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده شده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// passive imperfective pluperfect subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("خورده مي شده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// active present subjunctive
|
||||||
|
{
|
||||||
|
input: []byte("بخورد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("بخورد"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPersianAnalyzerOthers(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// nouns
|
||||||
|
{
|
||||||
|
input: []byte("برگ ها"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("برگ"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("برگها"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("برگ"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// non persian
|
||||||
|
{
|
||||||
|
input: []byte("English test."),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("english"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("test"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// others
|
||||||
|
{
|
||||||
|
input: []byte("خورده مي شده بوده باشد"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("خورده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("برگها"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("برگ"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
72
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/persian_normalize.go
generated
vendored
Normal file
72
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/persian_normalize.go
generated
vendored
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fa
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const NormalizeName = "normalize_fa"
|
||||||
|
|
||||||
|
const (
|
||||||
|
Yeh = '\u064A'
|
||||||
|
FarsiYeh = '\u06CC'
|
||||||
|
YehBarree = '\u06D2'
|
||||||
|
Keheh = '\u06A9'
|
||||||
|
Kaf = '\u0643'
|
||||||
|
HamzaAbove = '\u0654'
|
||||||
|
HehYeh = '\u06C0'
|
||||||
|
HehGoal = '\u06C1'
|
||||||
|
Heh = '\u0647'
|
||||||
|
)
|
||||||
|
|
||||||
|
type PersianNormalizeFilter struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewPersianNormalizeFilter() *PersianNormalizeFilter {
|
||||||
|
return &PersianNormalizeFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
term := normalize(token.Term)
|
||||||
|
token.Term = term
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalize(input []byte) []byte {
|
||||||
|
runes := bytes.Runes(input)
|
||||||
|
for i := 0; i < len(runes); i++ {
|
||||||
|
switch runes[i] {
|
||||||
|
case FarsiYeh, YehBarree:
|
||||||
|
runes[i] = Yeh
|
||||||
|
case Keheh:
|
||||||
|
runes[i] = Kaf
|
||||||
|
case HehYeh, HehGoal:
|
||||||
|
runes[i] = Heh
|
||||||
|
case HamzaAbove: // necessary for HEH + HAMZA
|
||||||
|
runes = analysis.DeleteRune(runes, i)
|
||||||
|
i--
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return analysis.BuildTermFromRunes(runes)
|
||||||
|
}
|
||||||
|
|
||||||
|
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewPersianNormalizeFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||||
|
}
|
125
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/persian_normalize_test.go
generated
vendored
Normal file
125
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/persian_normalize_test.go
generated
vendored
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fa
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestPersianNormalizeFilter(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// FarsiYeh
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("های"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("هاي"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// YehBarree
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("هاے"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("هاي"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Keheh
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("کشاندن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("كشاندن"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// HehYeh
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("كتابۀ"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("كتابه"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// HehHamzaAbove
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("كتابهٔ"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("كتابه"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// HehGoal
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("زادہ"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("زاده"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// empty
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
persianNormalizeFilter := NewPersianNormalizeFilter()
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := persianNormalizeFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||||
|
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/stop_filter_fa.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/stop_filter_fa.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fa
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
337
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/stop_words_fa.go
generated
vendored
Normal file
337
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fa/stop_words_fa.go
generated
vendored
Normal file
@ -0,0 +1,337 @@
|
|||||||
|
package fa
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_fa"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
# Note: by default this file is used after normalization, so when adding entries
|
||||||
|
# to this file, use the arabic 'ي' instead of 'ی'
|
||||||
|
انان
|
||||||
|
نداشته
|
||||||
|
سراسر
|
||||||
|
خياه
|
||||||
|
ايشان
|
||||||
|
وي
|
||||||
|
تاكنون
|
||||||
|
بيشتري
|
||||||
|
دوم
|
||||||
|
پس
|
||||||
|
ناشي
|
||||||
|
وگو
|
||||||
|
يا
|
||||||
|
داشتند
|
||||||
|
سپس
|
||||||
|
هنگام
|
||||||
|
هرگز
|
||||||
|
پنج
|
||||||
|
نشان
|
||||||
|
امسال
|
||||||
|
ديگر
|
||||||
|
گروهي
|
||||||
|
شدند
|
||||||
|
چطور
|
||||||
|
ده
|
||||||
|
و
|
||||||
|
دو
|
||||||
|
نخستين
|
||||||
|
ولي
|
||||||
|
چرا
|
||||||
|
چه
|
||||||
|
وسط
|
||||||
|
ه
|
||||||
|
كدام
|
||||||
|
قابل
|
||||||
|
يك
|
||||||
|
رفت
|
||||||
|
هفت
|
||||||
|
همچنين
|
||||||
|
در
|
||||||
|
هزار
|
||||||
|
بله
|
||||||
|
بلي
|
||||||
|
شايد
|
||||||
|
اما
|
||||||
|
شناسي
|
||||||
|
گرفته
|
||||||
|
دهد
|
||||||
|
داشته
|
||||||
|
دانست
|
||||||
|
داشتن
|
||||||
|
خواهيم
|
||||||
|
ميليارد
|
||||||
|
وقتيكه
|
||||||
|
امد
|
||||||
|
خواهد
|
||||||
|
جز
|
||||||
|
اورده
|
||||||
|
شده
|
||||||
|
بلكه
|
||||||
|
خدمات
|
||||||
|
شدن
|
||||||
|
برخي
|
||||||
|
نبود
|
||||||
|
بسياري
|
||||||
|
جلوگيري
|
||||||
|
حق
|
||||||
|
كردند
|
||||||
|
نوعي
|
||||||
|
بعري
|
||||||
|
نكرده
|
||||||
|
نظير
|
||||||
|
نبايد
|
||||||
|
بوده
|
||||||
|
بودن
|
||||||
|
داد
|
||||||
|
اورد
|
||||||
|
هست
|
||||||
|
جايي
|
||||||
|
شود
|
||||||
|
دنبال
|
||||||
|
داده
|
||||||
|
بايد
|
||||||
|
سابق
|
||||||
|
هيچ
|
||||||
|
همان
|
||||||
|
انجا
|
||||||
|
كمتر
|
||||||
|
كجاست
|
||||||
|
گردد
|
||||||
|
كسي
|
||||||
|
تر
|
||||||
|
مردم
|
||||||
|
تان
|
||||||
|
دادن
|
||||||
|
بودند
|
||||||
|
سري
|
||||||
|
جدا
|
||||||
|
ندارند
|
||||||
|
مگر
|
||||||
|
يكديگر
|
||||||
|
دارد
|
||||||
|
دهند
|
||||||
|
بنابراين
|
||||||
|
هنگامي
|
||||||
|
سمت
|
||||||
|
جا
|
||||||
|
انچه
|
||||||
|
خود
|
||||||
|
دادند
|
||||||
|
زياد
|
||||||
|
دارند
|
||||||
|
اثر
|
||||||
|
بدون
|
||||||
|
بهترين
|
||||||
|
بيشتر
|
||||||
|
البته
|
||||||
|
به
|
||||||
|
براساس
|
||||||
|
بيرون
|
||||||
|
كرد
|
||||||
|
بعضي
|
||||||
|
گرفت
|
||||||
|
توي
|
||||||
|
اي
|
||||||
|
ميليون
|
||||||
|
او
|
||||||
|
جريان
|
||||||
|
تول
|
||||||
|
بر
|
||||||
|
مانند
|
||||||
|
برابر
|
||||||
|
باشيم
|
||||||
|
مدتي
|
||||||
|
گويند
|
||||||
|
اكنون
|
||||||
|
تا
|
||||||
|
تنها
|
||||||
|
جديد
|
||||||
|
چند
|
||||||
|
بي
|
||||||
|
نشده
|
||||||
|
كردن
|
||||||
|
كردم
|
||||||
|
گويد
|
||||||
|
كرده
|
||||||
|
كنيم
|
||||||
|
نمي
|
||||||
|
نزد
|
||||||
|
روي
|
||||||
|
قصد
|
||||||
|
فقط
|
||||||
|
بالاي
|
||||||
|
ديگران
|
||||||
|
اين
|
||||||
|
ديروز
|
||||||
|
توسط
|
||||||
|
سوم
|
||||||
|
ايم
|
||||||
|
دانند
|
||||||
|
سوي
|
||||||
|
استفاده
|
||||||
|
شما
|
||||||
|
كنار
|
||||||
|
داريم
|
||||||
|
ساخته
|
||||||
|
طور
|
||||||
|
امده
|
||||||
|
رفته
|
||||||
|
نخست
|
||||||
|
بيست
|
||||||
|
نزديك
|
||||||
|
طي
|
||||||
|
كنيد
|
||||||
|
از
|
||||||
|
انها
|
||||||
|
تمامي
|
||||||
|
داشت
|
||||||
|
يكي
|
||||||
|
طريق
|
||||||
|
اش
|
||||||
|
چيست
|
||||||
|
روب
|
||||||
|
نمايد
|
||||||
|
گفت
|
||||||
|
چندين
|
||||||
|
چيزي
|
||||||
|
تواند
|
||||||
|
ام
|
||||||
|
ايا
|
||||||
|
با
|
||||||
|
ان
|
||||||
|
ايد
|
||||||
|
ترين
|
||||||
|
اينكه
|
||||||
|
ديگري
|
||||||
|
راه
|
||||||
|
هايي
|
||||||
|
بروز
|
||||||
|
همچنان
|
||||||
|
پاعين
|
||||||
|
كس
|
||||||
|
حدود
|
||||||
|
مختلف
|
||||||
|
مقابل
|
||||||
|
چيز
|
||||||
|
گيرد
|
||||||
|
ندارد
|
||||||
|
ضد
|
||||||
|
همچون
|
||||||
|
سازي
|
||||||
|
شان
|
||||||
|
مورد
|
||||||
|
باره
|
||||||
|
مرسي
|
||||||
|
خويش
|
||||||
|
برخوردار
|
||||||
|
چون
|
||||||
|
خارج
|
||||||
|
شش
|
||||||
|
هنوز
|
||||||
|
تحت
|
||||||
|
ضمن
|
||||||
|
هستيم
|
||||||
|
گفته
|
||||||
|
فكر
|
||||||
|
بسيار
|
||||||
|
پيش
|
||||||
|
براي
|
||||||
|
روزهاي
|
||||||
|
انكه
|
||||||
|
نخواهد
|
||||||
|
بالا
|
||||||
|
كل
|
||||||
|
وقتي
|
||||||
|
كي
|
||||||
|
چنين
|
||||||
|
كه
|
||||||
|
گيري
|
||||||
|
نيست
|
||||||
|
است
|
||||||
|
كجا
|
||||||
|
كند
|
||||||
|
نيز
|
||||||
|
يابد
|
||||||
|
بندي
|
||||||
|
حتي
|
||||||
|
توانند
|
||||||
|
عقب
|
||||||
|
خواست
|
||||||
|
كنند
|
||||||
|
بين
|
||||||
|
تمام
|
||||||
|
همه
|
||||||
|
ما
|
||||||
|
باشند
|
||||||
|
مثل
|
||||||
|
شد
|
||||||
|
اري
|
||||||
|
باشد
|
||||||
|
اره
|
||||||
|
طبق
|
||||||
|
بعد
|
||||||
|
اگر
|
||||||
|
صورت
|
||||||
|
غير
|
||||||
|
جاي
|
||||||
|
بيش
|
||||||
|
ريزي
|
||||||
|
اند
|
||||||
|
زيرا
|
||||||
|
چگونه
|
||||||
|
بار
|
||||||
|
لطفا
|
||||||
|
مي
|
||||||
|
درباره
|
||||||
|
من
|
||||||
|
ديده
|
||||||
|
همين
|
||||||
|
گذاري
|
||||||
|
برداري
|
||||||
|
علت
|
||||||
|
گذاشته
|
||||||
|
هم
|
||||||
|
فوق
|
||||||
|
نه
|
||||||
|
ها
|
||||||
|
شوند
|
||||||
|
اباد
|
||||||
|
همواره
|
||||||
|
هر
|
||||||
|
اول
|
||||||
|
خواهند
|
||||||
|
چهار
|
||||||
|
نام
|
||||||
|
امروز
|
||||||
|
مان
|
||||||
|
هاي
|
||||||
|
قبل
|
||||||
|
كنم
|
||||||
|
سعي
|
||||||
|
تازه
|
||||||
|
را
|
||||||
|
هستند
|
||||||
|
زير
|
||||||
|
جلوي
|
||||||
|
عنوان
|
||||||
|
بود
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(PersianStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/analyzer_fi.go
generated
vendored
Normal file
54
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/analyzer_fi.go
generated
vendored
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/icu"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "fi"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopFiFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerFiFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: icuTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopFiFilter,
|
||||||
|
stemmerFiFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
68
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/analyzer_fi_test.go
generated
vendored
Normal file
68
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/analyzer_fi_test.go
generated
vendored
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
// +build icu full
|
||||||
|
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFinishAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("edeltäjiinsä"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("edeltäj"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("edeltäjistään"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("edeltäj"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("olla"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stemmer_fi.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stemmer_fi.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// +build libstemmer full
|
||||||
|
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StemmerName = "stemmer_fi"
|
||||||
|
|
||||||
|
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return stemmer_filter.NewStemmerFilter("fi")
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||||
|
}
|
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stop_filter_fi.go
generated
vendored
Normal file
28
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stop_filter_fi.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
121
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stop_words_fi.go
generated
vendored
Normal file
121
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fi/stop_words_fi.go
generated
vendored
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_fi"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| forms of BE
|
||||||
|
|
||||||
|
olla
|
||||||
|
olen
|
||||||
|
olet
|
||||||
|
on
|
||||||
|
olemme
|
||||||
|
olette
|
||||||
|
ovat
|
||||||
|
ole | negative form
|
||||||
|
|
||||||
|
oli
|
||||||
|
olisi
|
||||||
|
olisit
|
||||||
|
olisin
|
||||||
|
olisimme
|
||||||
|
olisitte
|
||||||
|
olisivat
|
||||||
|
olit
|
||||||
|
olin
|
||||||
|
olimme
|
||||||
|
olitte
|
||||||
|
olivat
|
||||||
|
ollut
|
||||||
|
olleet
|
||||||
|
|
||||||
|
en | negation
|
||||||
|
et
|
||||||
|
ei
|
||||||
|
emme
|
||||||
|
ette
|
||||||
|
eivät
|
||||||
|
|
||||||
|
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
|
||||||
|
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
|
||||||
|
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
|
||||||
|
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
|
||||||
|
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
|
||||||
|
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
|
||||||
|
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
|
||||||
|
|
||||||
|
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
|
||||||
|
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
|
||||||
|
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
|
||||||
|
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
|
||||||
|
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
|
||||||
|
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
|
||||||
|
|
||||||
|
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
|
||||||
|
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
|
||||||
|
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
|
||||||
|
mitkä | (pl)
|
||||||
|
|
||||||
|
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
|
||||||
|
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
|
||||||
|
|
||||||
|
| conjunctions
|
||||||
|
|
||||||
|
että | that
|
||||||
|
ja | and
|
||||||
|
jos | if
|
||||||
|
koska | because
|
||||||
|
kuin | than
|
||||||
|
mutta | but
|
||||||
|
niin | so
|
||||||
|
sekä | and
|
||||||
|
sillä | for
|
||||||
|
tai | or
|
||||||
|
vaan | but
|
||||||
|
vai | or
|
||||||
|
vaikka | although
|
||||||
|
|
||||||
|
|
||||||
|
| prepositions
|
||||||
|
|
||||||
|
kanssa | with
|
||||||
|
mukaan | according to
|
||||||
|
noin | about
|
||||||
|
poikki | across
|
||||||
|
yli | over, across
|
||||||
|
|
||||||
|
| other
|
||||||
|
|
||||||
|
kun | when
|
||||||
|
niin | so
|
||||||
|
nyt | now
|
||||||
|
itse | self
|
||||||
|
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(FinnishStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
56
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/analyzer_fr.go
generated
vendored
Normal file
56
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/analyzer_fr.go
generated
vendored
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "fr"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopFrFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerFrFilter, err := cache.TokenFilterNamed(LightStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: tokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
elisionFilter,
|
||||||
|
toLowerFilter,
|
||||||
|
stopFrFilter,
|
||||||
|
stemmerFrFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
196
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/analyzer_fr_test.go
generated
vendored
Normal file
196
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/analyzer_fr_test.go
generated
vendored
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFrenchAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: []byte(""),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("chien chat cheval"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chien"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chat"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cheval"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("chien CHAT CHEVAL"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chien"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chat"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cheval"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte(" chien ,? + = - CHAT /: > CHEVAL"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chien"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chat"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cheval"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("chien++"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chien"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("mot \"entreguillemet\""),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("mot"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("entreguilemet"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("Jean-François"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("jean"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("francoi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop words
|
||||||
|
{
|
||||||
|
input: []byte("le la chien les aux chat du des à cheval"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chien"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chat"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cheval"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// nouns and adjectives
|
||||||
|
{
|
||||||
|
input: []byte("lances chismes habitable chiste éléments captifs"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("lanc"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chism"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("habitabl"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chist"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("element"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("captif"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// verbs
|
||||||
|
{
|
||||||
|
input: []byte("finissions souffrirent rugissante"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("finision"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("soufrirent"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("rugisant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("c3po"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("aujourd'hui"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("oeuf"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ïaöuaä"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("anticonstitutionel"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("java"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
37
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/articles_fr.go
generated
vendored
Normal file
37
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/articles_fr.go
generated
vendored
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
package fr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const ArticlesName = "articles_fr"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||||
|
|
||||||
|
var FrenchArticles = []byte(`
|
||||||
|
l
|
||||||
|
m
|
||||||
|
t
|
||||||
|
qu
|
||||||
|
n
|
||||||
|
s
|
||||||
|
j
|
||||||
|
d
|
||||||
|
c
|
||||||
|
jusqu
|
||||||
|
quoiqu
|
||||||
|
lorsqu
|
||||||
|
puisqu
|
||||||
|
`)
|
||||||
|
|
||||||
|
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(FrenchArticles)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
|
||||||
|
}
|
32
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/elision_fr.go
generated
vendored
Normal file
32
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/elision_fr.go
generated
vendored
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const ElisionName = "elision_fr"
|
||||||
|
|
||||||
|
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error building elision filter: %v", err)
|
||||||
|
}
|
||||||
|
return elision_filter.NewElisionFilter(articlesTokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
|
||||||
|
}
|
50
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/elision_fr_test.go
generated
vendored
Normal file
50
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/elision_fr_test.go
generated
vendored
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
// Copyright (c) 2014 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFrenchElision(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("l'avion"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("avion"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := elisionFilter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
308
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/light_stemmer_fr.go
generated
vendored
Normal file
308
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/light_stemmer_fr.go
generated
vendored
Normal file
@ -0,0 +1,308 @@
|
|||||||
|
// Copyright (c) 2015 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"unicode"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const LightStemmerName = "stemmer_fr_light"
|
||||||
|
|
||||||
|
type FrenchLightStemmerFilter struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewFrenchLightStemmerFilter() *FrenchLightStemmerFilter {
|
||||||
|
return &FrenchLightStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *FrenchLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
runes := bytes.Runes(token.Term)
|
||||||
|
runes = stem(runes)
|
||||||
|
token.Term = analysis.BuildTermFromRunes(runes)
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func stem(input []rune) []rune {
|
||||||
|
|
||||||
|
inputLen := len(input)
|
||||||
|
|
||||||
|
if inputLen > 5 && input[inputLen-1] == 'x' {
|
||||||
|
if input[inputLen-3] == 'a' && input[inputLen-2] == 'u' && input[inputLen-4] != 'e' {
|
||||||
|
input[inputLen-2] = 'l'
|
||||||
|
}
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 3 && input[inputLen-1] == 'x' {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 3 && input[inputLen-1] == 's' {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 9 && analysis.RunesEndsWith(input, "issement") {
|
||||||
|
input = input[0 : inputLen-6]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 8 && analysis.RunesEndsWith(input, "issant") {
|
||||||
|
input = input[0 : inputLen-4]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 6 && analysis.RunesEndsWith(input, "ement") {
|
||||||
|
input = input[0 : inputLen-4]
|
||||||
|
inputLen = len(input)
|
||||||
|
if inputLen > 3 && analysis.RunesEndsWith(input, "ive") {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-1] = 'f'
|
||||||
|
}
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 11 && analysis.RunesEndsWith(input, "ficatrice") {
|
||||||
|
input = input[0 : inputLen-5]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 10 && analysis.RunesEndsWith(input, "ficateur") {
|
||||||
|
input = input[0 : inputLen-4]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 9 && analysis.RunesEndsWith(input, "catrice") {
|
||||||
|
input = input[0 : inputLen-3]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-4] = 'q'
|
||||||
|
input[inputLen-3] = 'u'
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
//s[len-1] = 'r' <-- unnecessary, already 'r'.
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 8 && analysis.RunesEndsWith(input, "cateur") {
|
||||||
|
input = input[0 : inputLen-2]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-4] = 'q'
|
||||||
|
input[inputLen-3] = 'u'
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 8 && analysis.RunesEndsWith(input, "atrice") {
|
||||||
|
input = input[0 : inputLen-4]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 7 && analysis.RunesEndsWith(input, "ateur") {
|
||||||
|
input = input[0 : inputLen-3]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 6 && analysis.RunesEndsWith(input, "trice") {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-3] = 'e'
|
||||||
|
input[inputLen-2] = 'u'
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 5 && analysis.RunesEndsWith(input, "ième") {
|
||||||
|
return norm(input[0 : inputLen-4])
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 7 && analysis.RunesEndsWith(input, "teuse") {
|
||||||
|
input = input[0 : inputLen-2]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 6 && analysis.RunesEndsWith(input, "teur") {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-1] = 'r'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 5 && analysis.RunesEndsWith(input, "euse") {
|
||||||
|
return norm(input[0 : inputLen-2])
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 8 && analysis.RunesEndsWith(input, "ère") {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 7 && analysis.RunesEndsWith(input, "ive") {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-1] = 'f'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 4 &&
|
||||||
|
(analysis.RunesEndsWith(input, "folle") ||
|
||||||
|
analysis.RunesEndsWith(input, "molle")) {
|
||||||
|
input = input[0 : inputLen-2]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-1] = 'u'
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 9 && analysis.RunesEndsWith(input, "nnelle") {
|
||||||
|
return norm(input[0 : inputLen-5])
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 9 && analysis.RunesEndsWith(input, "nnel") {
|
||||||
|
return norm(input[0 : inputLen-3])
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 4 && analysis.RunesEndsWith(input, "ète") {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 8 && analysis.RunesEndsWith(input, "ique") {
|
||||||
|
input = input[0 : inputLen-4]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 8 && analysis.RunesEndsWith(input, "esse") {
|
||||||
|
return norm(input[0 : inputLen-3])
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 7 && analysis.RunesEndsWith(input, "inage") {
|
||||||
|
return norm(input[0 : inputLen-3])
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 9 && analysis.RunesEndsWith(input, "isation") {
|
||||||
|
input = input[0 : inputLen-7]
|
||||||
|
inputLen = len(input)
|
||||||
|
if inputLen > 5 && analysis.RunesEndsWith(input, "ual") {
|
||||||
|
input[inputLen-2] = 'e'
|
||||||
|
}
|
||||||
|
return norm(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 9 && analysis.RunesEndsWith(input, "isateur") {
|
||||||
|
return norm(input[0 : inputLen-7])
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 8 && analysis.RunesEndsWith(input, "ation") {
|
||||||
|
return norm(input[0 : inputLen-5])
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 8 && analysis.RunesEndsWith(input, "ition") {
|
||||||
|
return norm(input[0 : inputLen-5])
|
||||||
|
}
|
||||||
|
|
||||||
|
return norm(input)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func norm(input []rune) []rune {
|
||||||
|
|
||||||
|
inputLen := len(input)
|
||||||
|
if inputLen > 4 {
|
||||||
|
for i := 0; i < inputLen; i++ {
|
||||||
|
switch input[i] {
|
||||||
|
case 'à', 'á', 'â':
|
||||||
|
input[i] = 'a'
|
||||||
|
case 'ô':
|
||||||
|
input[i] = 'o'
|
||||||
|
case 'è', 'é', 'ê':
|
||||||
|
input[i] = 'e'
|
||||||
|
case 'ù', 'û':
|
||||||
|
input[i] = 'u'
|
||||||
|
case 'î':
|
||||||
|
input[i] = 'i'
|
||||||
|
case 'ç':
|
||||||
|
input[i] = 'c'
|
||||||
|
}
|
||||||
|
|
||||||
|
ch := input[0]
|
||||||
|
for i := 1; i < inputLen; i++ {
|
||||||
|
if input[i] == ch && unicode.IsLetter(ch) {
|
||||||
|
input = analysis.DeleteRune(input, i)
|
||||||
|
i -= 1
|
||||||
|
inputLen = len(input)
|
||||||
|
} else {
|
||||||
|
ch = input[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 4 && analysis.RunesEndsWith(input, "ie") {
|
||||||
|
input = input[0 : inputLen-2]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
if inputLen > 4 {
|
||||||
|
if input[inputLen-1] == 'r' {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
if input[inputLen-1] == 'e' {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
if input[inputLen-1] == 'e' {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
if input[inputLen-1] == input[inputLen-2] && unicode.IsLetter(input[inputLen-1]) {
|
||||||
|
input = input[0 : inputLen-1]
|
||||||
|
inputLen = len(input)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func FrenchLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewFrenchLightStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(LightStemmerName, FrenchLightStemmerFilterConstructor)
|
||||||
|
}
|
997
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/light_stemmer_fr_test.go
generated
vendored
Normal file
997
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/light_stemmer_fr_test.go
generated
vendored
Normal file
@ -0,0 +1,997 @@
|
|||||||
|
// Copyright (c) 2015 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFrenchLightStemmer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input analysis.TokenStream
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chevaux"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cheval"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cheval"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cheval"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("hiboux"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("hibou"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("hibou"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("hibou"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chantés"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chanter"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chante"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("chant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("baronnes"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("baron"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("barons"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("baron"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("baron"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("baron"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("peaux"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("peau"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("peau"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("peau"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("anneaux"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("aneau"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("anneau"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("aneau"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("neveux"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("neveu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("neveu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("neveu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("affreux"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("afreu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("affreuse"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("afreu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("investissement"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("investi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("investir"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("investi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("assourdissant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("asourdi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("assourdir"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("asourdi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("pratiquement"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("pratiqu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("pratique"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("pratiqu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("administrativement"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("administratif"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("administratif"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("administratif"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("justificatrice"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("justifi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("justificateur"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("justifi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("justifier"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("justifi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("educatrice"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("eduqu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("eduquer"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("eduqu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("communicateur"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("comuniqu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("communiquer"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("comuniqu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("accompagnatrice"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("acompagn"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("accompagnateur"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("acompagn"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("administrateur"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("administr"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("administrer"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("administr"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("productrice"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("product"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("producteur"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("product"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("acheteuse"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("achet"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("acheteur"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("achet"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("planteur"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("plant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("plante"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("plant"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("poreuse"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("poreu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("poreux"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("poreu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("plieuse"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("plieu"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("bijoutière"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("bijouti"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("bijoutier"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("bijouti"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("caissière"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("caisi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("caissier"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("caisi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abrasive"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abrasif"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abrasif"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abrasif"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("folle"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("fou"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("fou"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("fou"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("personnelle"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("person"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("personne"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("person"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// algo bug: too short length
|
||||||
|
// {
|
||||||
|
// input: analysis.TokenStream{
|
||||||
|
// &analysis.Token{
|
||||||
|
// Term: []byte("personnel"),
|
||||||
|
// },
|
||||||
|
// },
|
||||||
|
// output: analysis.TokenStream{
|
||||||
|
// &analysis.Token{
|
||||||
|
// Term: []byte("person"),
|
||||||
|
// },
|
||||||
|
// },
|
||||||
|
// },
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("complète"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("complet"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("complet"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("complet"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("aromatique"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("aromat"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("faiblesse"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("faibl"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("faible"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("faibl"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("patinage"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("patin"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("patin"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("patin"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("sonorisation"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("sono"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ritualisation"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("rituel"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("rituel"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("rituel"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// algo bug: masked by rules above
|
||||||
|
// {
|
||||||
|
// input: analysis.TokenStream{
|
||||||
|
// &analysis.Token{
|
||||||
|
// Term: []byte("colonisateur"),
|
||||||
|
// },
|
||||||
|
// },
|
||||||
|
// output: analysis.TokenStream{
|
||||||
|
// &analysis.Token{
|
||||||
|
// Term: []byte("colon"),
|
||||||
|
// },
|
||||||
|
// },
|
||||||
|
// },
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("nomination"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("nomin"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("disposition"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("dispos"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("dispose"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("dispos"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// SOLR-3463 : abusive compression of repeated characters in numbers
|
||||||
|
// Trailing repeated char elision :
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("1234555"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("1234555"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Repeated char within numbers with more than 4 characters :
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("12333345"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("12333345"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Short numbers weren't affected already:
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("1234"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("1234"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Ensure behaviour is preserved for words!
|
||||||
|
// Trailing repeated char elision :
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abcdeff"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abcdef"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Repeated char within words with more than 4 characters :
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abcccddeef"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("abcdef"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("créées"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("cre"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Combined letter and digit repetition
|
||||||
|
// 10:00pm
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("22hh00"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("22h00"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
filter, err := cache.TokenFilterNamed(LightStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := filter.Filter(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
81
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/minimal_stemmer_fr.go
generated
vendored
Normal file
81
Godeps/_workspace/src/github.com/blevesearch/bleve/analysis/language/fr/minimal_stemmer_fr.go
generated
vendored
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
// Copyright (c) 2015 Couchbase, Inc.
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||||
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
|
// and limitations under the License.
|
||||||
|
|
||||||
|
package fr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/khlieng/name_pending/Godeps/_workspace/src/github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MinimalStemmerName is the name under which the French minimal
// stemmer token filter is registered with the bleve registry.
const MinimalStemmerName = "stemmer_fr_min"
|
||||||
|
|
||||||
|
// FrenchMinimalStemmerFilter is a stateless token filter that applies
// a minimal French stemming algorithm to each token in a stream.
type FrenchMinimalStemmerFilter struct {
}

// NewFrenchMinimalStemmerFilter returns a ready-to-use minimal French
// stemmer filter. The filter carries no state, so a single instance
// may be shared freely.
func NewFrenchMinimalStemmerFilter() *FrenchMinimalStemmerFilter {
	filter := FrenchMinimalStemmerFilter{}
	return &filter
}
|
||||||
|
|
||||||
|
func (s *FrenchMinimalStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
runes := bytes.Runes(token.Term)
|
||||||
|
runes = minstem(runes)
|
||||||
|
token.Term = analysis.BuildTermFromRunes(runes)
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
// minstem performs minimal French stemming on a single term, mirroring
// Lucene's FrenchMinimalStemmer: terms shorter than 6 runes are left
// untouched; a trailing 'x' is removed, with "-aux" rewritten to "-al"
// (e.g. "chevaux" -> "cheval"); otherwise at most one each of the
// suffix runes 's', 'r', 'e', 'é' is stripped, in that order, and a
// doubled final letter is then collapsed (e.g. "personn" -> "person").
// The input slice may be mutated and resliced; callers should use the
// returned slice.
func minstem(input []rune) []rune {
	if len(input) < 6 {
		return input
	}

	// Trailing 'x': handle "-aux" -> "-al", drop the 'x', and stop.
	if last := len(input) - 1; input[last] == 'x' {
		if input[last-2] == 'a' && input[last-1] == 'u' {
			input[last-1] = 'l'
		}
		return input[:last]
	}

	// Strip at most one occurrence of each suffix rune, in this
	// fixed order (s, r, e, é), re-examining the new last rune
	// after every removal.
	for _, suffix := range [...]rune{'s', 'r', 'e', 'é'} {
		if input[len(input)-1] == suffix {
			input = input[:len(input)-1]
		}
	}

	// Collapse a doubled final letter. Safe to index n-2: we began
	// with at least 6 runes and removed at most 4 above.
	if n := len(input); input[n-1] == input[n-2] {
		input = input[:n-1]
	}

	return input
}
|
||||||
|
|
||||||
|
func FrenchMinimalStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewFrenchMinimalStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// init registers the minimal French stemmer with the bleve token
// filter registry under MinimalStemmerName, making it available by
// name to analyzer configurations.
func init() {
	registry.RegisterTokenFilter(MinimalStemmerName, FrenchMinimalStemmerFilterConstructor)
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user