Upgrade server dependencies, manage them with govendor

Ken-Håvard Lieng 2017-04-18 03:02:51 +02:00
parent ebee2746d6
commit 971278e7e5
1748 changed files with 196165 additions and 194500 deletions

View File

@ -24,8 +24,8 @@ func Run(dir, domain, email, port string) (*state, error) {
		return nil, err
	}
-	client, err := acme.NewClient(URL, &user, KeySize)
-	client.ExcludeChallenges([]string{"tls-sni-01"})
+	client, err := acme.NewClient(URL, &user, acme.RSA2048)
+	client.ExcludeChallenges([]acme.Challenge{acme.TLSSNI01})
	client.SetHTTPAddress(port)
	if user.Registration == nil {
@ -123,7 +123,7 @@ func (s *state) setOCSP(ocsp []byte) {
}
func (s *state) obtain() error {
-	cert, errors := s.client.ObtainCertificate([]string{s.domain}, true, nil)
+	cert, errors := s.client.ObtainCertificate([]string{s.domain}, true, nil, false)
	if err := errors[s.domain]; err != nil {
		if _, ok := err.(acme.TOSError); ok {
			err := s.client.AgreeToTOS()
@ -180,7 +180,7 @@ func (s *state) renew() bool {
	meta.PrivateKey = key
Renew:
-	newMeta, err := s.client.RenewCertificate(meta, true)
+	newMeta, err := s.client.RenewCertificate(meta, true, false)
	if err != nil {
		if _, ok := err.(acme.TOSError); ok {
			err := s.client.AgreeToTOS()
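
A minimal sketch (not part of this commit) collecting the upgraded xenolf/lego calls from the hunks above into one place; the function name and parameters are placeholders, and `u` must implement `acme.User` as the package's `User` type does:

```go
package letsencrypt

import "github.com/xenolf/lego/acme"

// newACMEClient shows the post-upgrade lego API in one function.
func newACMEClient(URL, port, domain string, u acme.User) error {
	// The key type constant replaces the old KeySize integer.
	client, err := acme.NewClient(URL, u, acme.RSA2048)
	if err != nil {
		return err
	}
	// Challenges are now typed values instead of strings.
	client.ExcludeChallenges([]acme.Challenge{acme.TLSSNI01})
	client.SetHTTPAddress(port)

	// ObtainCertificate (and RenewCertificate) gained an extra trailing
	// bool in this lego version; the commit passes false.
	_, errs := client.ObtainCertificate([]string{domain}, true, nil, false)
	if err := errs[domain]; err != nil {
		return err
	}
	return nil
}
```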

View File

@ -1,6 +1,7 @@
package letsencrypt
import (
+	"crypto"
	"crypto/rand"
	"crypto/rsa"
	"crypto/x509"
@ -17,7 +18,7 @@ const defaultUser = "default"
type User struct {
	Email string
	Registration *acme.RegistrationResource
-	key *rsa.PrivateKey
+	key crypto.PrivateKey
}
func (u User) GetEmail() string {
@ -28,7 +29,7 @@ func (u User) GetRegistration() *acme.RegistrationResource {
	return u.Registration
}
-func (u User) GetPrivateKey() *rsa.PrivateKey {
+func (u User) GetPrivateKey() crypto.PrivateKey {
	return u.key
}
@ -86,7 +87,7 @@ func saveUser(user User) error {
	return ioutil.WriteFile(directory.UserRegistration(user.Email), jsonBytes, 0600)
}
-func loadRSAPrivateKey(file string) (*rsa.PrivateKey, error) {
+func loadRSAPrivateKey(file string) (crypto.PrivateKey, error) {
	keyBytes, err := ioutil.ReadFile(file)
	if err != nil {
		return nil, err
@ -95,8 +96,10 @@ func loadRSAPrivateKey(file string) (*rsa.PrivateKey, error) {
	return x509.ParsePKCS1PrivateKey(keyBlock.Bytes)
}
-func saveRSAPrivateKey(key *rsa.PrivateKey, file string) error {
-	pemKey := pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(key)}
+func saveRSAPrivateKey(key crypto.PrivateKey, file string) error {
+	pemKey := pem.Block{
+		Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(key.(*rsa.PrivateKey)),
+	}
	keyOut, err := os.Create(file)
	if err != nil {
		return err
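
A short sketch (not part of this commit) of how callers can handle the new `crypto.PrivateKey` interface: assert back to `*rsa.PrivateKey` with the comma-ok form before PEM-encoding, instead of the unchecked assertion used in `saveRSAPrivateKey` above. The function name is hypothetical:

```go
package letsencrypt

import (
	"crypto"
	"crypto/rsa"
	"crypto/x509"
	"encoding/pem"
	"errors"
	"os"
)

// writeKey PEM-encodes an RSA key received through the crypto.PrivateKey interface.
func writeKey(key crypto.PrivateKey, file string) error {
	rsaKey, ok := key.(*rsa.PrivateKey)
	if !ok {
		return errors.New("key is not an RSA private key")
	}
	block := pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(rsaKey)}
	out, err := os.Create(file)
	if err != nil {
		return err
	}
	defer out.Close()
	return pem.Encode(out, &block)
}
```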

View File

@ -5,7 +5,6 @@ import (
	"testing"
	"github.com/stretchr/testify/assert"
-	"github.com/xenolf/lego/acme"
)
func tempdir() string {
@ -14,18 +13,10 @@ func tempdir() string {
}
func testUser(t *testing.T, email string) {
-	reg := &acme.RegistrationResource{
-		URI: "test.com",
-		Body: acme.Registration{
-			Agreement: "agree?",
-		},
-	}
	user, err := newUser(email)
	assert.Nil(t, err)
	key := user.GetPrivateKey()
	assert.NotNil(t, key)
-	user.Registration = reg
	err = saveUser(user)
	assert.Nil(t, err)
@ -34,7 +25,6 @@ func testUser(t *testing.T, email string) {
	assert.Nil(t, err)
	assert.Equal(t, email, user.GetEmail())
	assert.Equal(t, key, user.GetPrivateKey())
-	assert.Equal(t, reg, user.GetRegistration())
}
func TestUser(t *testing.T) {

View File

@ -55,7 +55,8 @@ func handleAuth(w http.ResponseWriter, r *http.Request) *Session {
	token, err := parseToken(cookie.Value)
	if err == nil && token.Valid {
-		userID := uint64(token.Claims["UserID"].(float64))
+		claims := token.Claims.(jwt.MapClaims)
+		userID := uint64(claims["UserID"].(float64))
		log.Println(r.RemoteAddr, "[Auth] GET", r.URL.Path, "| Valid token | User ID:", userID)
@ -91,7 +92,8 @@ func newUser(w http.ResponseWriter, r *http.Request) *Session {
	go session.run()
	token := jwt.New(jwt.SigningMethodHS256)
-	token.Claims["UserID"] = user.ID
+	claims := token.Claims.(jwt.MapClaims)
+	claims["UserID"] = user.ID
	tokenString, err := token.SignedString(hmacKey)
	if err != nil {
		return nil
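
A minimal sketch (not part of this commit) of the upgraded jwt-go claims API shown above: claims now sit behind the token's `Claims` interface, so reads and writes go through a type assertion to `jwt.MapClaims`. The import path and function name are assumptions, not taken from the hunks:

```go
package main

import jwt "github.com/dgrijalva/jwt-go"

// issueToken signs an HS256 token carrying a UserID claim.
func issueToken(userID uint64, hmacKey []byte) (string, error) {
	token := jwt.New(jwt.SigningMethodHS256)
	claims := token.Claims.(jwt.MapClaims) // jwt.New stores MapClaims by default
	claims["UserID"] = userID
	return token.SignedString(hmacKey)
}
```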

View File

@ -118,7 +118,8 @@ func (u *User) SearchMessages(server, channel, q string) ([]Message, error) {
	contentQuery.SetField("content")
	contentQuery.SetFuzziness(2)
-	query := bleve.NewBooleanQuery([]bleve.Query{serverQuery, channelQuery, contentQuery}, nil, nil)
+	query := bleve.NewBooleanQuery()
+	query.AddMust(serverQuery, channelQuery, contentQuery)
	search := bleve.NewSearchRequest(query)
	searchResults, err := u.messageIndex.Search(search)
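
A sketch (not part of this commit) of the upgraded bleve query API from the hunk above: boolean clauses are added after construction rather than passed to `NewBooleanQuery`. The package, function, and field names are placeholders; `index` is any open `bleve.Index`:

```go
package storage

import "github.com/blevesearch/bleve"

// searchMessages builds a must-match boolean query over three fields.
func searchMessages(index bleve.Index, server, channel, q string) (*bleve.SearchResult, error) {
	serverQuery := bleve.NewMatchQuery(server)
	serverQuery.SetField("server")

	channelQuery := bleve.NewMatchQuery(channel)
	channelQuery.SetField("channel")

	contentQuery := bleve.NewMatchQuery(q)
	contentQuery.SetField("content")
	contentQuery.SetFuzziness(2)

	query := bleve.NewBooleanQuery() // no clause slices in the constructor anymore
	query.AddMust(serverQuery, channelQuery, contentQuery)

	return index.Search(bleve.NewSearchRequest(query))
}
```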

View File

@ -1,3 +0,0 @@
Compatible with TOML version
[v0.2.0](https://github.com/mojombo/toml/blob/master/versions/toml-v0.2.0.md)

View File

@ -1,14 +0,0 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

View File

@ -1,19 +0,0 @@
install:
go install ./...
test: install
go test -v
toml-test toml-test-decoder
toml-test -encoder toml-test-encoder
fmt:
gofmt -w *.go */*.go
colcheck *.go */*.go
tags:
find ./ -name '*.go' -print0 | xargs -0 gotags > TAGS
push:
git push origin master
git push github master

View File

@ -1,220 +0,0 @@
## TOML parser and encoder for Go with reflection
TOML stands for Tom's Obvious, Minimal Language. This Go package provides a
reflection interface similar to Go's standard library `json` and `xml`
packages. This package also supports the `encoding.TextUnmarshaler` and
`encoding.TextMarshaler` interfaces so that you can define custom data
representations. (There is an example of this below.)
Spec: https://github.com/mojombo/toml
Compatible with TOML version
[v0.2.0](https://github.com/toml-lang/toml/blob/master/versions/en/toml-v0.2.0.md)
Documentation: http://godoc.org/github.com/BurntSushi/toml
Installation:
```bash
go get github.com/BurntSushi/toml
```
Try the toml validator:
```bash
go get github.com/BurntSushi/toml/cmd/tomlv
tomlv some-toml-file.toml
```
[![Build status](https://api.travis-ci.org/BurntSushi/toml.png)](https://travis-ci.org/BurntSushi/toml)
### Testing
This package passes all tests in
[toml-test](https://github.com/BurntSushi/toml-test) for both the decoder
and the encoder.
### Examples
This package works similarly to how the Go standard library handles `XML`
and `JSON`. Namely, data is loaded into Go values via reflection.
For the simplest example, consider some TOML file as just a list of keys
and values:
```toml
Age = 25
Cats = [ "Cauchy", "Plato" ]
Pi = 3.14
Perfection = [ 6, 28, 496, 8128 ]
DOB = 1987-07-05T05:45:00Z
```
Which could be defined in Go as:
```go
type Config struct {
Age int
Cats []string
Pi float64
Perfection []int
DOB time.Time // requires `import time`
}
```
And then decoded with:
```go
var conf Config
if _, err := toml.Decode(tomlData, &conf); err != nil {
// handle error
}
```
You can also use struct tags if your struct field name doesn't map to a TOML
key value directly:
```toml
some_key_NAME = "wat"
```
```go
type TOML struct {
ObscureKey string `toml:"some_key_NAME"`
}
```
### Using the `encoding.TextUnmarshaler` interface
Here's an example that automatically parses duration strings into
`time.Duration` values:
```toml
[[song]]
name = "Thunder Road"
duration = "4m49s"
[[song]]
name = "Stairway to Heaven"
duration = "8m03s"
```
Which can be decoded with:
```go
type song struct {
Name string
Duration duration
}
type songs struct {
Song []song
}
var favorites songs
if _, err := toml.Decode(blob, &favorites); err != nil {
log.Fatal(err)
}
for _, s := range favorites.Song {
fmt.Printf("%s (%s)\n", s.Name, s.Duration)
}
```
And you'll also need a `duration` type that satisfies the
`encoding.TextUnmarshaler` interface:
```go
type duration struct {
time.Duration
}
func (d *duration) UnmarshalText(text []byte) error {
var err error
d.Duration, err = time.ParseDuration(string(text))
return err
}
```
### More complex usage
Here's an example of how to load the example from the official spec page:
```toml
# This is a TOML document. Boom.
title = "TOML Example"
[owner]
name = "Tom Preston-Werner"
organization = "GitHub"
bio = "GitHub Cofounder & CEO\nLikes tater tots and beer."
dob = 1979-05-27T07:32:00Z # First class dates? Why not?
[database]
server = "192.168.1.1"
ports = [ 8001, 8001, 8002 ]
connection_max = 5000
enabled = true
[servers]
# You can indent as you please. Tabs or spaces. TOML don't care.
[servers.alpha]
ip = "10.0.0.1"
dc = "eqdc10"
[servers.beta]
ip = "10.0.0.2"
dc = "eqdc10"
[clients]
data = [ ["gamma", "delta"], [1, 2] ] # just an update to make sure parsers support it
# Line breaks are OK when inside arrays
hosts = [
"alpha",
"omega"
]
```
And the corresponding Go types are:
```go
type tomlConfig struct {
Title string
Owner ownerInfo
DB database `toml:"database"`
Servers map[string]server
Clients clients
}
type ownerInfo struct {
Name string
Org string `toml:"organization"`
Bio string
DOB time.Time
}
type database struct {
Server string
Ports []int
ConnMax int `toml:"connection_max"`
Enabled bool
}
type server struct {
IP string
DC string
}
type clients struct {
Data [][]interface{}
Hosts []string
}
```
Note that a case insensitive match will be tried if an exact match can't be
found.
A working example of the above can be found in `_examples/example.{go,toml}`.

View File

@ -1,61 +0,0 @@
package main
import (
"fmt"
"time"
"github.com/BurntSushi/toml"
)
type tomlConfig struct {
Title string
Owner ownerInfo
DB database `toml:"database"`
Servers map[string]server
Clients clients
}
type ownerInfo struct {
Name string
Org string `toml:"organization"`
Bio string
DOB time.Time
}
type database struct {
Server string
Ports []int
ConnMax int `toml:"connection_max"`
Enabled bool
}
type server struct {
IP string
DC string
}
type clients struct {
Data [][]interface{}
Hosts []string
}
func main() {
var config tomlConfig
if _, err := toml.DecodeFile("example.toml", &config); err != nil {
fmt.Println(err)
return
}
fmt.Printf("Title: %s\n", config.Title)
fmt.Printf("Owner: %s (%s, %s), Born: %s\n",
config.Owner.Name, config.Owner.Org, config.Owner.Bio,
config.Owner.DOB)
fmt.Printf("Database: %s %v (Max conn. %d), Enabled? %v\n",
config.DB.Server, config.DB.Ports, config.DB.ConnMax,
config.DB.Enabled)
for serverName, server := range config.Servers {
fmt.Printf("Server: %s (%s, %s)\n", serverName, server.IP, server.DC)
}
fmt.Printf("Client data: %v\n", config.Clients.Data)
fmt.Printf("Client hosts: %v\n", config.Clients.Hosts)
}

View File

@ -1,22 +0,0 @@
# Test file for TOML
# Only this one tries to emulate a TOML file written by a user of the kind of parser writers probably hate
# This part you'll really hate
[the]
test_string = "You'll hate me after this - #" # " Annoying, isn't it?
[the.hard]
test_array = [ "] ", " # "] # ] There you go, parse this!
test_array2 = [ "Test #11 ]proved that", "Experiment #9 was a success" ]
# You didn't think it'd as easy as chucking out the last #, did you?
another_test_string = " Same thing, but with a string #"
harder_test_string = " And when \"'s are in the string, along with # \"" # "and comments are there too"
# Things will get harder
[the.hard.bit#]
what? = "You don't think some user won't do that?"
multi_line_array = [
"]",
# ] Oh yes I did
]

View File

@ -1,4 +0,0 @@
# [x] you
# [x.y] don't
# [x.y.z] need these
[x.y.z.w] # for this to work

View File

@ -1,6 +0,0 @@
# DO NOT WANT
[fruit]
type = "apple"
[fruit.type]
apple = "yes"

View File

@ -1,5 +0,0 @@
Age = 25
Cats = [ "Cauchy", "Plato" ]
Pi = 3.14
Perfection = [ 6, 28, 496, 8128 ]
DOB = 1987-07-05T05:45:00Z

View File

@ -1 +0,0 @@
some_key_NAME = "wat"

View File

@ -1,14 +0,0 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

View File

@ -1,14 +0,0 @@
# Implements the TOML test suite interface
This is an implementation of the interface expected by
[toml-test](https://github.com/BurntSushi/toml-test) for my
[toml parser written in Go](https://github.com/BurntSushi/toml).
In particular, it maps TOML data on `stdin` to a JSON format on `stdout`.
Compatible with TOML version
[v0.2.0](https://github.com/mojombo/toml/blob/master/versions/toml-v0.2.0.md)
Compatible with `toml-test` version
[v0.2.0](https://github.com/BurntSushi/toml-test/tree/v0.2.0)

View File

@ -1,90 +0,0 @@
// Command toml-test-decoder satisfies the toml-test interface for testing
// TOML decoders. Namely, it accepts TOML on stdin and outputs JSON on stdout.
package main
import (
"encoding/json"
"flag"
"fmt"
"log"
"os"
"path"
"time"
"github.com/BurntSushi/toml"
)
func init() {
log.SetFlags(0)
flag.Usage = usage
flag.Parse()
}
func usage() {
log.Printf("Usage: %s < toml-file\n", path.Base(os.Args[0]))
flag.PrintDefaults()
os.Exit(1)
}
func main() {
if flag.NArg() != 0 {
flag.Usage()
}
var tmp interface{}
if _, err := toml.DecodeReader(os.Stdin, &tmp); err != nil {
log.Fatalf("Error decoding TOML: %s", err)
}
typedTmp := translate(tmp)
if err := json.NewEncoder(os.Stdout).Encode(typedTmp); err != nil {
log.Fatalf("Error encoding JSON: %s", err)
}
}
func translate(tomlData interface{}) interface{} {
switch orig := tomlData.(type) {
case map[string]interface{}:
typed := make(map[string]interface{}, len(orig))
for k, v := range orig {
typed[k] = translate(v)
}
return typed
case []map[string]interface{}:
typed := make([]map[string]interface{}, len(orig))
for i, v := range orig {
typed[i] = translate(v).(map[string]interface{})
}
return typed
case []interface{}:
typed := make([]interface{}, len(orig))
for i, v := range orig {
typed[i] = translate(v)
}
// We don't really need to tag arrays, but let's be future proof.
// (If TOML ever supports tuples, we'll need this.)
return tag("array", typed)
case time.Time:
return tag("datetime", orig.Format("2006-01-02T15:04:05Z"))
case bool:
return tag("bool", fmt.Sprintf("%v", orig))
case int64:
return tag("integer", fmt.Sprintf("%d", orig))
case float64:
return tag("float", fmt.Sprintf("%v", orig))
case string:
return tag("string", orig)
}
panic(fmt.Sprintf("Unknown type: %T", tomlData))
}
func tag(typeName string, data interface{}) map[string]interface{} {
return map[string]interface{}{
"type": typeName,
"value": data,
}
}

View File

@ -1,14 +0,0 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

View File

@ -1,14 +0,0 @@
# Implements the TOML test suite interface for TOML encoders
This is an implementation of the interface expected by
[toml-test](https://github.com/BurntSushi/toml-test) for the
[TOML encoder](https://github.com/BurntSushi/toml).
In particular, it maps JSON data on `stdin` to a TOML format on `stdout`.
Compatible with TOML version
[v0.2.0](https://github.com/mojombo/toml/blob/master/versions/toml-v0.2.0.md)
Compatible with `toml-test` version
[v0.2.0](https://github.com/BurntSushi/toml-test/tree/v0.2.0)

View File

@ -1,131 +0,0 @@
// Command toml-test-encoder satisfies the toml-test interface for testing
// TOML encoders. Namely, it accepts JSON on stdin and outputs TOML on stdout.
package main
import (
"encoding/json"
"flag"
"log"
"os"
"path"
"strconv"
"time"
"github.com/BurntSushi/toml"
)
func init() {
log.SetFlags(0)
flag.Usage = usage
flag.Parse()
}
func usage() {
log.Printf("Usage: %s < json-file\n", path.Base(os.Args[0]))
flag.PrintDefaults()
os.Exit(1)
}
func main() {
if flag.NArg() != 0 {
flag.Usage()
}
var tmp interface{}
if err := json.NewDecoder(os.Stdin).Decode(&tmp); err != nil {
log.Fatalf("Error decoding JSON: %s", err)
}
tomlData := translate(tmp)
if err := toml.NewEncoder(os.Stdout).Encode(tomlData); err != nil {
log.Fatalf("Error encoding TOML: %s", err)
}
}
func translate(typedJson interface{}) interface{} {
switch v := typedJson.(type) {
case map[string]interface{}:
if len(v) == 2 && in("type", v) && in("value", v) {
return untag(v)
}
m := make(map[string]interface{}, len(v))
for k, v2 := range v {
m[k] = translate(v2)
}
return m
case []interface{}:
tabArray := make([]map[string]interface{}, len(v))
for i := range v {
if m, ok := translate(v[i]).(map[string]interface{}); ok {
tabArray[i] = m
} else {
log.Fatalf("JSON arrays may only contain objects. This " +
"corresponds to only tables being allowed in " +
"TOML table arrays.")
}
}
return tabArray
}
log.Fatalf("Unrecognized JSON format '%T'.", typedJson)
panic("unreachable")
}
func untag(typed map[string]interface{}) interface{} {
t := typed["type"].(string)
v := typed["value"]
switch t {
case "string":
return v.(string)
case "integer":
v := v.(string)
n, err := strconv.Atoi(v)
if err != nil {
log.Fatalf("Could not parse '%s' as integer: %s", v, err)
}
return n
case "float":
v := v.(string)
f, err := strconv.ParseFloat(v, 64)
if err != nil {
log.Fatalf("Could not parse '%s' as float64: %s", v, err)
}
return f
case "datetime":
v := v.(string)
t, err := time.Parse("2006-01-02T15:04:05Z", v)
if err != nil {
log.Fatalf("Could not parse '%s' as a datetime: %s", v, err)
}
return t
case "bool":
v := v.(string)
switch v {
case "true":
return true
case "false":
return false
}
log.Fatalf("Could not parse '%s' as a boolean.", v)
case "array":
v := v.([]interface{})
array := make([]interface{}, len(v))
for i := range v {
if m, ok := v[i].(map[string]interface{}); ok {
array[i] = untag(m)
} else {
log.Fatalf("Arrays may only contain other arrays or "+
"primitive values, but found a '%T'.", m)
}
}
return array
}
log.Fatalf("Unrecognized tag type '%s'.", t)
panic("unreachable")
}
func in(key string, m map[string]interface{}) bool {
_, ok := m[key]
return ok
}

View File

@ -1,14 +0,0 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

View File

@ -1,22 +0,0 @@
# TOML Validator
If Go is installed, it's simple to try it out:
```bash
go get github.com/BurntSushi/toml/cmd/tomlv
tomlv some-toml-file.toml
```
You can see the types of every key in a TOML file with:
```bash
tomlv -types some-toml-file.toml
```
At the moment, only one error message is reported at a time. Error messages
include line numbers. No output means that the files given are valid TOML, or
there is a bug in `tomlv`.
Compatible with TOML version
[v0.1.0](https://github.com/mojombo/toml/blob/master/versions/toml-v0.1.0.md)

View File

@ -1,61 +0,0 @@
// Command tomlv validates TOML documents and prints each key's type.
package main
import (
"flag"
"fmt"
"log"
"os"
"path"
"strings"
"text/tabwriter"
"github.com/BurntSushi/toml"
)
var (
flagTypes = false
)
func init() {
log.SetFlags(0)
flag.BoolVar(&flagTypes, "types", flagTypes,
"When set, the types of every defined key will be shown.")
flag.Usage = usage
flag.Parse()
}
func usage() {
log.Printf("Usage: %s toml-file [ toml-file ... ]\n",
path.Base(os.Args[0]))
flag.PrintDefaults()
os.Exit(1)
}
func main() {
if flag.NArg() < 1 {
flag.Usage()
}
for _, f := range flag.Args() {
var tmp interface{}
md, err := toml.DecodeFile(f, &tmp)
if err != nil {
log.Fatalf("Error in '%s': %s", f, err)
}
if flagTypes {
printTypes(md)
}
}
}
func printTypes(md toml.MetaData) {
tabw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
for _, key := range md.Keys() {
fmt.Fprintf(tabw, "%s%s\t%s\n",
strings.Repeat(" ", len(key)-1), key, md.Type(key...))
}
tabw.Flush()
}

View File

@ -1,493 +0,0 @@
package toml
import (
"fmt"
"io"
"io/ioutil"
"math"
"reflect"
"strings"
"time"
)
var e = fmt.Errorf
// Unmarshaler is the interface implemented by objects that can unmarshal a
// TOML description of themselves.
type Unmarshaler interface {
UnmarshalTOML(interface{}) error
}
// Unmarshal decodes the contents of `p` in TOML format into a pointer `v`.
func Unmarshal(p []byte, v interface{}) error {
_, err := Decode(string(p), v)
return err
}
// Primitive is a TOML value that hasn't been decoded into a Go value.
// When using the various `Decode*` functions, the type `Primitive` may
// be given to any value, and its decoding will be delayed.
//
// A `Primitive` value can be decoded using the `PrimitiveDecode` function.
//
// The underlying representation of a `Primitive` value is subject to change.
// Do not rely on it.
//
// N.B. Primitive values are still parsed, so using them will only avoid
// the overhead of reflection. They can be useful when you don't know the
// exact type of TOML data until run time.
type Primitive struct {
undecoded interface{}
context Key
}
// DEPRECATED!
//
// Use MetaData.PrimitiveDecode instead.
func PrimitiveDecode(primValue Primitive, v interface{}) error {
md := MetaData{decoded: make(map[string]bool)}
return md.unify(primValue.undecoded, rvalue(v))
}
// PrimitiveDecode is just like the other `Decode*` functions, except it
// decodes a TOML value that has already been parsed. Valid primitive values
// can *only* be obtained from values filled by the decoder functions,
// including this method. (i.e., `v` may contain more `Primitive`
// values.)
//
// Meta data for primitive values is included in the meta data returned by
// the `Decode*` functions with one exception: keys returned by the Undecoded
// method will only reflect keys that were decoded. Namely, any keys hidden
// behind a Primitive will be considered undecoded. Executing this method will
// update the undecoded keys in the meta data. (See the example.)
func (md *MetaData) PrimitiveDecode(primValue Primitive, v interface{}) error {
md.context = primValue.context
defer func() { md.context = nil }()
return md.unify(primValue.undecoded, rvalue(v))
}
// Decode will decode the contents of `data` in TOML format into a pointer
// `v`.
//
// TOML hashes correspond to Go structs or maps. (Dealer's choice. They can be
// used interchangeably.)
//
// TOML arrays of tables correspond to either a slice of structs or a slice
// of maps.
//
// TOML datetimes correspond to Go `time.Time` values.
//
// All other TOML types (float, string, int, bool and array) correspond
// to the obvious Go types.
//
// An exception to the above rules is if a type implements the
// encoding.TextUnmarshaler interface. In this case, any primitive TOML value
// (floats, strings, integers, booleans and datetimes) will be converted to
// a byte string and given to the value's UnmarshalText method. See the
// Unmarshaler example for a demonstration with time duration strings.
//
// Key mapping
//
// TOML keys can map to either keys in a Go map or field names in a Go
// struct. The special `toml` struct tag may be used to map TOML keys to
// struct fields that don't match the key name exactly. (See the example.)
// A case insensitive match to struct names will be tried if an exact match
// can't be found.
//
// The mapping between TOML values and Go values is loose. That is, there
// may exist TOML values that cannot be placed into your representation, and
// there may be parts of your representation that do not correspond to
// TOML values. This loose mapping can be made stricter by using the IsDefined
// and/or Undecoded methods on the MetaData returned.
//
// This decoder will not handle cyclic types. If a cyclic type is passed,
// `Decode` will not terminate.
func Decode(data string, v interface{}) (MetaData, error) {
p, err := parse(data)
if err != nil {
return MetaData{}, err
}
md := MetaData{
p.mapping, p.types, p.ordered,
make(map[string]bool, len(p.ordered)), nil,
}
return md, md.unify(p.mapping, rvalue(v))
}
// DecodeFile is just like Decode, except it will automatically read the
// contents of the file at `fpath` and decode it for you.
func DecodeFile(fpath string, v interface{}) (MetaData, error) {
bs, err := ioutil.ReadFile(fpath)
if err != nil {
return MetaData{}, err
}
return Decode(string(bs), v)
}
// DecodeReader is just like Decode, except it will consume all bytes
// from the reader and decode it for you.
func DecodeReader(r io.Reader, v interface{}) (MetaData, error) {
bs, err := ioutil.ReadAll(r)
if err != nil {
return MetaData{}, err
}
return Decode(string(bs), v)
}
// unify performs a sort of type unification based on the structure of `rv`,
// which is the client representation.
//
// Any type mismatch produces an error. Finding a type that we don't know
// how to handle produces an unsupported type error.
func (md *MetaData) unify(data interface{}, rv reflect.Value) error {
// Special case. Look for a `Primitive` value.
if rv.Type() == reflect.TypeOf((*Primitive)(nil)).Elem() {
// Save the undecoded data and the key context into the primitive
// value.
context := make(Key, len(md.context))
copy(context, md.context)
rv.Set(reflect.ValueOf(Primitive{
undecoded: data,
context: context,
}))
return nil
}
// Special case. Unmarshaler Interface support.
if rv.CanAddr() {
if v, ok := rv.Addr().Interface().(Unmarshaler); ok {
return v.UnmarshalTOML(data)
}
}
// Special case. Handle time.Time values specifically.
// TODO: Remove this code when we decide to drop support for Go 1.1.
// This isn't necessary in Go 1.2 because time.Time satisfies the encoding
// interfaces.
if rv.Type().AssignableTo(rvalue(time.Time{}).Type()) {
return md.unifyDatetime(data, rv)
}
// Special case. Look for a value satisfying the TextUnmarshaler interface.
if v, ok := rv.Interface().(TextUnmarshaler); ok {
return md.unifyText(data, v)
}
// BUG(burntsushi)
// The behavior here is incorrect whenever a Go type satisfies the
// encoding.TextUnmarshaler interface but also corresponds to a TOML
// hash or array. In particular, the unmarshaler should only be applied
// to primitive TOML values. But at this point, it will be applied to
// all kinds of values and produce an incorrect error whenever those values
// are hashes or arrays (including arrays of tables).
k := rv.Kind()
// laziness
if k >= reflect.Int && k <= reflect.Uint64 {
return md.unifyInt(data, rv)
}
switch k {
case reflect.Ptr:
elem := reflect.New(rv.Type().Elem())
err := md.unify(data, reflect.Indirect(elem))
if err != nil {
return err
}
rv.Set(elem)
return nil
case reflect.Struct:
return md.unifyStruct(data, rv)
case reflect.Map:
return md.unifyMap(data, rv)
case reflect.Array:
return md.unifyArray(data, rv)
case reflect.Slice:
return md.unifySlice(data, rv)
case reflect.String:
return md.unifyString(data, rv)
case reflect.Bool:
return md.unifyBool(data, rv)
case reflect.Interface:
// we only support empty interfaces.
if rv.NumMethod() > 0 {
return e("Unsupported type '%s'.", rv.Kind())
}
return md.unifyAnything(data, rv)
case reflect.Float32:
fallthrough
case reflect.Float64:
return md.unifyFloat64(data, rv)
}
return e("Unsupported type '%s'.", rv.Kind())
}
func (md *MetaData) unifyStruct(mapping interface{}, rv reflect.Value) error {
tmap, ok := mapping.(map[string]interface{})
if !ok {
return mismatch(rv, "map", mapping)
}
for key, datum := range tmap {
var f *field
fields := cachedTypeFields(rv.Type())
for i := range fields {
ff := &fields[i]
if ff.name == key {
f = ff
break
}
if f == nil && strings.EqualFold(ff.name, key) {
f = ff
}
}
if f != nil {
subv := rv
for _, i := range f.index {
subv = indirect(subv.Field(i))
}
if isUnifiable(subv) {
md.decoded[md.context.add(key).String()] = true
md.context = append(md.context, key)
if err := md.unify(datum, subv); err != nil {
return e("Type mismatch for '%s.%s': %s",
rv.Type().String(), f.name, err)
}
md.context = md.context[0 : len(md.context)-1]
} else if f.name != "" {
// Bad user! No soup for you!
return e("Field '%s.%s' is unexported, and therefore cannot "+
"be loaded with reflection.", rv.Type().String(), f.name)
}
}
}
return nil
}
func (md *MetaData) unifyMap(mapping interface{}, rv reflect.Value) error {
tmap, ok := mapping.(map[string]interface{})
if !ok {
return badtype("map", mapping)
}
if rv.IsNil() {
rv.Set(reflect.MakeMap(rv.Type()))
}
for k, v := range tmap {
md.decoded[md.context.add(k).String()] = true
md.context = append(md.context, k)
rvkey := indirect(reflect.New(rv.Type().Key()))
rvval := reflect.Indirect(reflect.New(rv.Type().Elem()))
if err := md.unify(v, rvval); err != nil {
return err
}
md.context = md.context[0 : len(md.context)-1]
rvkey.SetString(k)
rv.SetMapIndex(rvkey, rvval)
}
return nil
}
func (md *MetaData) unifyArray(data interface{}, rv reflect.Value) error {
datav := reflect.ValueOf(data)
if datav.Kind() != reflect.Slice {
return badtype("slice", data)
}
sliceLen := datav.Len()
if sliceLen != rv.Len() {
return e("expected array length %d; got TOML array of length %d",
rv.Len(), sliceLen)
}
return md.unifySliceArray(datav, rv)
}
func (md *MetaData) unifySlice(data interface{}, rv reflect.Value) error {
datav := reflect.ValueOf(data)
if datav.Kind() != reflect.Slice {
return badtype("slice", data)
}
sliceLen := datav.Len()
if rv.IsNil() || rv.Len() < datav.Len() {
rv.Set(reflect.MakeSlice(rv.Type(), sliceLen, sliceLen))
}
rv.SetLen(datav.Len())
return md.unifySliceArray(datav, rv)
}
func (md *MetaData) unifySliceArray(data, rv reflect.Value) error {
sliceLen := data.Len()
for i := 0; i < sliceLen; i++ {
v := data.Index(i).Interface()
sliceval := indirect(rv.Index(i))
if err := md.unify(v, sliceval); err != nil {
return err
}
}
return nil
}
func (md *MetaData) unifyDatetime(data interface{}, rv reflect.Value) error {
if _, ok := data.(time.Time); ok {
rv.Set(reflect.ValueOf(data))
return nil
}
return badtype("time.Time", data)
}
func (md *MetaData) unifyString(data interface{}, rv reflect.Value) error {
if s, ok := data.(string); ok {
rv.SetString(s)
return nil
}
return badtype("string", data)
}
func (md *MetaData) unifyFloat64(data interface{}, rv reflect.Value) error {
if num, ok := data.(float64); ok {
switch rv.Kind() {
case reflect.Float32:
fallthrough
case reflect.Float64:
rv.SetFloat(num)
default:
panic("bug")
}
return nil
}
return badtype("float", data)
}
func (md *MetaData) unifyInt(data interface{}, rv reflect.Value) error {
if num, ok := data.(int64); ok {
if rv.Kind() >= reflect.Int && rv.Kind() <= reflect.Int64 {
switch rv.Kind() {
case reflect.Int, reflect.Int64:
// No bounds checking necessary.
case reflect.Int8:
if num < math.MinInt8 || num > math.MaxInt8 {
return e("Value '%d' is out of range for int8.", num)
}
case reflect.Int16:
if num < math.MinInt16 || num > math.MaxInt16 {
return e("Value '%d' is out of range for int16.", num)
}
case reflect.Int32:
if num < math.MinInt32 || num > math.MaxInt32 {
return e("Value '%d' is out of range for int32.", num)
}
}
rv.SetInt(num)
} else if rv.Kind() >= reflect.Uint && rv.Kind() <= reflect.Uint64 {
unum := uint64(num)
switch rv.Kind() {
case reflect.Uint, reflect.Uint64:
// No bounds checking necessary.
case reflect.Uint8:
if num < 0 || unum > math.MaxUint8 {
return e("Value '%d' is out of range for uint8.", num)
}
case reflect.Uint16:
if num < 0 || unum > math.MaxUint16 {
return e("Value '%d' is out of range for uint16.", num)
}
case reflect.Uint32:
if num < 0 || unum > math.MaxUint32 {
return e("Value '%d' is out of range for uint32.", num)
}
}
rv.SetUint(unum)
} else {
panic("unreachable")
}
return nil
}
return badtype("integer", data)
}
func (md *MetaData) unifyBool(data interface{}, rv reflect.Value) error {
if b, ok := data.(bool); ok {
rv.SetBool(b)
return nil
}
return badtype("boolean", data)
}
func (md *MetaData) unifyAnything(data interface{}, rv reflect.Value) error {
rv.Set(reflect.ValueOf(data))
return nil
}
func (md *MetaData) unifyText(data interface{}, v TextUnmarshaler) error {
var s string
switch sdata := data.(type) {
case TextMarshaler:
text, err := sdata.MarshalText()
if err != nil {
return err
}
s = string(text)
case fmt.Stringer:
s = sdata.String()
case string:
s = sdata
case bool:
s = fmt.Sprintf("%v", sdata)
case int64:
s = fmt.Sprintf("%d", sdata)
case float64:
s = fmt.Sprintf("%f", sdata)
default:
return badtype("primitive (string-like)", data)
}
if err := v.UnmarshalText([]byte(s)); err != nil {
return err
}
return nil
}
// rvalue returns a reflect.Value of `v`. All pointers are resolved.
func rvalue(v interface{}) reflect.Value {
return indirect(reflect.ValueOf(v))
}
// indirect returns the value pointed to by a pointer.
// Pointers are followed until the value is not a pointer.
// New values are allocated for each nil pointer.
//
// An exception to this rule is if the value satisfies an interface of
// interest to us (like encoding.TextUnmarshaler).
func indirect(v reflect.Value) reflect.Value {
if v.Kind() != reflect.Ptr {
if v.CanAddr() {
pv := v.Addr()
if _, ok := pv.Interface().(TextUnmarshaler); ok {
return pv
}
}
return v
}
if v.IsNil() {
v.Set(reflect.New(v.Type().Elem()))
}
return indirect(reflect.Indirect(v))
}
func isUnifiable(rv reflect.Value) bool {
if rv.CanSet() {
return true
}
if _, ok := rv.Interface().(TextUnmarshaler); ok {
return true
}
return false
}
func badtype(expected string, data interface{}) error {
return e("Expected %s but found '%T'.", expected, data)
}
func mismatch(user reflect.Value, expected string, data interface{}) error {
return e("Type mismatch for %s. Expected %s but found '%T'.",
user.Type().String(), expected, data)
}

View File

@ -1,122 +0,0 @@
package toml
import "strings"
// MetaData allows access to meta information about TOML data that may not
// be inferrable via reflection. In particular, whether a key has been defined
// and the TOML type of a key.
type MetaData struct {
mapping map[string]interface{}
types map[string]tomlType
keys []Key
decoded map[string]bool
context Key // Used only during decoding.
}
// IsDefined returns true if the key given exists in the TOML data. The key
// should be specified hierarchically, e.g.,
//
// // access the TOML key 'a.b.c'
// IsDefined("a", "b", "c")
//
// IsDefined will return false if an empty key is given. Keys are case sensitive.
func (md *MetaData) IsDefined(key ...string) bool {
if len(key) == 0 {
return false
}
var hash map[string]interface{}
var ok bool
var hashOrVal interface{} = md.mapping
for _, k := range key {
if hash, ok = hashOrVal.(map[string]interface{}); !ok {
return false
}
if hashOrVal, ok = hash[k]; !ok {
return false
}
}
return true
}
// Type returns a string representation of the type of the key specified.
//
// Type will return the empty string if given an empty key or a key that
// does not exist. Keys are case sensitive.
func (md *MetaData) Type(key ...string) string {
fullkey := strings.Join(key, ".")
if typ, ok := md.types[fullkey]; ok {
return typ.typeString()
}
return ""
}
// Key is the type of any TOML key, including key groups. Use (MetaData).Keys
// to get values of this type.
type Key []string
func (k Key) String() string {
return strings.Join(k, ".")
}
func (k Key) maybeQuotedAll() string {
var ss []string
for i := range k {
ss = append(ss, k.maybeQuoted(i))
}
return strings.Join(ss, ".")
}
func (k Key) maybeQuoted(i int) string {
quote := false
for _, c := range k[i] {
if !isBareKeyChar(c) {
quote = true
break
}
}
if quote {
return "\"" + strings.Replace(k[i], "\"", "\\\"", -1) + "\""
} else {
return k[i]
}
}
func (k Key) add(piece string) Key {
newKey := make(Key, len(k)+1)
copy(newKey, k)
newKey[len(k)] = piece
return newKey
}
// Keys returns a slice of every key in the TOML data, including key groups.
// Each key is itself a slice, where the first element is the top of the
// hierarchy and the last is the most specific.
//
// The list will have the same order as the keys appeared in the TOML data.
//
// All keys returned are non-empty.
func (md *MetaData) Keys() []Key {
return md.keys
}
// Undecoded returns all keys that have not been decoded in the order in which
// they appear in the original TOML document.
//
// This includes keys that haven't been decoded because of a Primitive value.
// Once the Primitive value is decoded, the keys will be considered decoded.
//
// Also note that decoding into an empty interface will result in no decoding,
// and so no keys will be considered decoded.
//
// In this sense, the Undecoded keys correspond to keys in the TOML document
// that do not have a concrete type in your representation.
func (md *MetaData) Undecoded() []Key {
undecoded := make([]Key, 0, len(md.keys))
for _, key := range md.keys {
if !md.decoded[key.String()] {
undecoded = append(undecoded, key)
}
}
return undecoded
}

File diff suppressed because it is too large

View File

@ -1,27 +0,0 @@
/*
Package toml provides facilities for decoding and encoding TOML configuration
files via reflection. There is also support for delaying decoding with
the Primitive type, and querying the set of keys in a TOML document with the
MetaData type.
The specification implemented: https://github.com/mojombo/toml
The sub-command github.com/BurntSushi/toml/cmd/tomlv can be used to verify
whether a file is a valid TOML document. It can also be used to print the
type of each key in a TOML document.
Testing
There are two important types of tests used for this package. The first is
contained inside '*_test.go' files and uses the standard Go unit testing
framework. These tests are primarily devoted to holistically testing the
decoder and encoder.
The second type of testing is used to verify the implementation's adherence
to the TOML specification. These tests have been factored into their own
project: https://github.com/BurntSushi/toml-test
The reason the tests are in a separate project is so that they can be used by
any implementation of TOML. Namely, it is language agnostic.
*/
package toml

View File

@ -1,562 +0,0 @@
package toml
import (
"bufio"
"errors"
"fmt"
"io"
"reflect"
"sort"
"strconv"
"strings"
"time"
)
type tomlEncodeError struct{ error }
var (
errArrayMixedElementTypes = errors.New(
"can't encode array with mixed element types")
errArrayNilElement = errors.New(
"can't encode array with nil element")
errNonString = errors.New(
"can't encode a map with non-string key type")
errAnonNonStruct = errors.New(
"can't encode an anonymous field that is not a struct")
errArrayNoTable = errors.New(
"TOML array element can't contain a table")
errNoKey = errors.New(
"top-level values must be a Go map or struct")
errAnything = errors.New("") // used in testing
)
var quotedReplacer = strings.NewReplacer(
"\t", "\\t",
"\n", "\\n",
"\r", "\\r",
"\"", "\\\"",
"\\", "\\\\",
)
// Encoder controls the encoding of Go values to a TOML document to some
// io.Writer.
//
// The indentation level can be controlled with the Indent field.
type Encoder struct {
// A single indentation level. By default it is two spaces.
Indent string
// hasWritten is whether we have written any output to w yet.
hasWritten bool
w *bufio.Writer
}
// NewEncoder returns a TOML encoder that encodes Go values to the io.Writer
// given. By default, a single indentation level is 2 spaces.
func NewEncoder(w io.Writer) *Encoder {
return &Encoder{
w: bufio.NewWriter(w),
Indent: " ",
}
}
// Encode writes a TOML representation of the Go value to the underlying
// io.Writer. If the value given cannot be encoded to a valid TOML document,
// then an error is returned.
//
// The mapping between Go values and TOML values should be precisely the same
// as for the Decode* functions. Similarly, the TextMarshaler interface is
// supported by encoding the resulting bytes as strings. (If you want to write
// arbitrary binary data then you will need to use something like base64 since
// TOML does not have any binary types.)
//
// When encoding TOML hashes (i.e., Go maps or structs), keys without any
// sub-hashes are encoded first.
//
// If a Go map is encoded, then its keys are sorted alphabetically for
// deterministic output. More control over this behavior may be provided if
// there is demand for it.
//
// Encoding Go values without a corresponding TOML representation---like map
// types with non-string keys---will cause an error to be returned. Similarly
// for mixed arrays/slices, arrays/slices with nil elements, embedded
// non-struct types and nested slices containing maps or structs.
// (e.g., [][]map[string]string is not allowed but []map[string]string is OK
// and so is []map[string][]string.)
func (enc *Encoder) Encode(v interface{}) error {
rv := eindirect(reflect.ValueOf(v))
if err := enc.safeEncode(Key([]string{}), rv); err != nil {
return err
}
return enc.w.Flush()
}
func (enc *Encoder) safeEncode(key Key, rv reflect.Value) (err error) {
defer func() {
if r := recover(); r != nil {
if terr, ok := r.(tomlEncodeError); ok {
err = terr.error
return
}
panic(r)
}
}()
enc.encode(key, rv)
return nil
}
func (enc *Encoder) encode(key Key, rv reflect.Value) {
// Special case. Time needs to be in ISO8601 format.
// Special case. If we can marshal the type to text, then we use that.
// Basically, this prevents the encoder for handling these types as
// generic structs (or whatever the underlying type of a TextMarshaler is).
switch rv.Interface().(type) {
case time.Time, TextMarshaler:
enc.keyEqElement(key, rv)
return
}
k := rv.Kind()
switch k {
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32,
reflect.Int64,
reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32,
reflect.Uint64,
reflect.Float32, reflect.Float64, reflect.String, reflect.Bool:
enc.keyEqElement(key, rv)
case reflect.Array, reflect.Slice:
if typeEqual(tomlArrayHash, tomlTypeOfGo(rv)) {
enc.eArrayOfTables(key, rv)
} else {
enc.keyEqElement(key, rv)
}
case reflect.Interface:
if rv.IsNil() {
return
}
enc.encode(key, rv.Elem())
case reflect.Map:
if rv.IsNil() {
return
}
enc.eTable(key, rv)
case reflect.Ptr:
if rv.IsNil() {
return
}
enc.encode(key, rv.Elem())
case reflect.Struct:
enc.eTable(key, rv)
default:
panic(e("Unsupported type for key '%s': %s", key, k))
}
}
// eElement encodes any value that can be an array element (primitives and
// arrays).
func (enc *Encoder) eElement(rv reflect.Value) {
switch v := rv.Interface().(type) {
case time.Time:
// Special case time.Time as a primitive. Has to come before
// TextMarshaler below because time.Time implements
// encoding.TextMarshaler, but we need to always use UTC.
enc.wf(v.In(time.FixedZone("UTC", 0)).Format("2006-01-02T15:04:05Z"))
return
case TextMarshaler:
// Special case. Use text marshaler if it's available for this value.
if s, err := v.MarshalText(); err != nil {
encPanic(err)
} else {
enc.writeQuoted(string(s))
}
return
}
switch rv.Kind() {
case reflect.Bool:
enc.wf(strconv.FormatBool(rv.Bool()))
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32,
reflect.Int64:
enc.wf(strconv.FormatInt(rv.Int(), 10))
case reflect.Uint, reflect.Uint8, reflect.Uint16,
reflect.Uint32, reflect.Uint64:
enc.wf(strconv.FormatUint(rv.Uint(), 10))
case reflect.Float32:
enc.wf(floatAddDecimal(strconv.FormatFloat(rv.Float(), 'f', -1, 32)))
case reflect.Float64:
enc.wf(floatAddDecimal(strconv.FormatFloat(rv.Float(), 'f', -1, 64)))
case reflect.Array, reflect.Slice:
enc.eArrayOrSliceElement(rv)
case reflect.Interface:
enc.eElement(rv.Elem())
case reflect.String:
enc.writeQuoted(rv.String())
default:
panic(e("Unexpected primitive type: %s", rv.Kind()))
}
}
// By the TOML spec, all floats must have a decimal with at least one
// number on either side.
func floatAddDecimal(fstr string) string {
if !strings.Contains(fstr, ".") {
return fstr + ".0"
}
return fstr
}
func (enc *Encoder) writeQuoted(s string) {
enc.wf("\"%s\"", quotedReplacer.Replace(s))
}
func (enc *Encoder) eArrayOrSliceElement(rv reflect.Value) {
length := rv.Len()
enc.wf("[")
for i := 0; i < length; i++ {
elem := rv.Index(i)
enc.eElement(elem)
if i != length-1 {
enc.wf(", ")
}
}
enc.wf("]")
}
func (enc *Encoder) eArrayOfTables(key Key, rv reflect.Value) {
if len(key) == 0 {
encPanic(errNoKey)
}
for i := 0; i < rv.Len(); i++ {
trv := rv.Index(i)
if isNil(trv) {
continue
}
panicIfInvalidKey(key)
enc.newline()
enc.wf("%s[[%s]]", enc.indentStr(key), key.maybeQuotedAll())
enc.newline()
enc.eMapOrStruct(key, trv)
}
}
func (enc *Encoder) eTable(key Key, rv reflect.Value) {
panicIfInvalidKey(key)
if len(key) == 1 {
// Output an extra new line between top-level tables.
// (The newline isn't written if nothing else has been written though.)
enc.newline()
}
if len(key) > 0 {
enc.wf("%s[%s]", enc.indentStr(key), key.maybeQuotedAll())
enc.newline()
}
enc.eMapOrStruct(key, rv)
}
func (enc *Encoder) eMapOrStruct(key Key, rv reflect.Value) {
switch rv := eindirect(rv); rv.Kind() {
case reflect.Map:
enc.eMap(key, rv)
case reflect.Struct:
enc.eStruct(key, rv)
default:
panic("eTable: unhandled reflect.Value Kind: " + rv.Kind().String())
}
}
func (enc *Encoder) eMap(key Key, rv reflect.Value) {
rt := rv.Type()
if rt.Key().Kind() != reflect.String {
encPanic(errNonString)
}
// Sort keys so that we have deterministic output. And write keys directly
// underneath this key first, before writing sub-structs or sub-maps.
var mapKeysDirect, mapKeysSub []string
for _, mapKey := range rv.MapKeys() {
k := mapKey.String()
if typeIsHash(tomlTypeOfGo(rv.MapIndex(mapKey))) {
mapKeysSub = append(mapKeysSub, k)
} else {
mapKeysDirect = append(mapKeysDirect, k)
}
}
var writeMapKeys = func(mapKeys []string) {
sort.Strings(mapKeys)
for _, mapKey := range mapKeys {
mrv := rv.MapIndex(reflect.ValueOf(mapKey))
if isNil(mrv) {
// Don't write anything for nil fields.
continue
}
enc.encode(key.add(mapKey), mrv)
}
}
writeMapKeys(mapKeysDirect)
writeMapKeys(mapKeysSub)
}
func (enc *Encoder) eStruct(key Key, rv reflect.Value) {
// Write keys for fields directly under this key first, because if we write
// a field that creates a new table, then all keys under it will be in that
// table (not the one we're writing here).
rt := rv.Type()
var fieldsDirect, fieldsSub [][]int
var addFields func(rt reflect.Type, rv reflect.Value, start []int)
addFields = func(rt reflect.Type, rv reflect.Value, start []int) {
for i := 0; i < rt.NumField(); i++ {
f := rt.Field(i)
// skip unexported fields
if f.PkgPath != "" && !f.Anonymous {
continue
}
frv := rv.Field(i)
if f.Anonymous {
t := f.Type
switch t.Kind() {
case reflect.Struct:
addFields(t, frv, f.Index)
continue
case reflect.Ptr:
if t.Elem().Kind() == reflect.Struct {
if !frv.IsNil() {
addFields(t.Elem(), frv.Elem(), f.Index)
}
continue
}
// Fall through to the normal field encoding logic below
// for non-struct anonymous fields.
}
}
if typeIsHash(tomlTypeOfGo(frv)) {
fieldsSub = append(fieldsSub, append(start, f.Index...))
} else {
fieldsDirect = append(fieldsDirect, append(start, f.Index...))
}
}
}
addFields(rt, rv, nil)
var writeFields = func(fields [][]int) {
for _, fieldIndex := range fields {
sft := rt.FieldByIndex(fieldIndex)
sf := rv.FieldByIndex(fieldIndex)
if isNil(sf) {
// Don't write anything for nil fields.
continue
}
keyName := sft.Tag.Get("toml")
if keyName == "-" {
continue
}
if keyName == "" {
keyName = sft.Name
}
keyName, opts := getOptions(keyName)
if _, ok := opts["omitempty"]; ok && isEmpty(sf) {
continue
} else if _, ok := opts["omitzero"]; ok && isZero(sf) {
continue
}
enc.encode(key.add(keyName), sf)
}
}
writeFields(fieldsDirect)
writeFields(fieldsSub)
}
// tomlTypeName returns the TOML type name of the Go value's type. It is
// used to determine whether the types of array elements are mixed (which is
// forbidden). If the Go value is nil, then it is illegal for it to be an array
// element, and valueIsNil is returned as true.
// Returns the TOML type of a Go value. The type may be `nil`, which means
// no concrete TOML type could be found.
func tomlTypeOfGo(rv reflect.Value) tomlType {
if isNil(rv) || !rv.IsValid() {
return nil
}
switch rv.Kind() {
case reflect.Bool:
return tomlBool
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32,
reflect.Int64,
reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32,
reflect.Uint64:
return tomlInteger
case reflect.Float32, reflect.Float64:
return tomlFloat
case reflect.Array, reflect.Slice:
if typeEqual(tomlHash, tomlArrayType(rv)) {
return tomlArrayHash
} else {
return tomlArray
}
case reflect.Ptr, reflect.Interface:
return tomlTypeOfGo(rv.Elem())
case reflect.String:
return tomlString
case reflect.Map:
return tomlHash
case reflect.Struct:
switch rv.Interface().(type) {
case time.Time:
return tomlDatetime
case TextMarshaler:
return tomlString
default:
return tomlHash
}
default:
panic("unexpected reflect.Kind: " + rv.Kind().String())
}
}
// tomlArrayType returns the element type of a TOML array. The type returned
// may be nil if it cannot be determined (e.g., a nil slice or a zero length
// slice). This function may also panic if it finds a type that cannot be
// expressed in TOML (such as nil elements, heterogeneous arrays or directly
// nested arrays of tables).
func tomlArrayType(rv reflect.Value) tomlType {
if isNil(rv) || !rv.IsValid() || rv.Len() == 0 {
return nil
}
firstType := tomlTypeOfGo(rv.Index(0))
if firstType == nil {
encPanic(errArrayNilElement)
}
rvlen := rv.Len()
for i := 1; i < rvlen; i++ {
elem := rv.Index(i)
switch elemType := tomlTypeOfGo(elem); {
case elemType == nil:
encPanic(errArrayNilElement)
case !typeEqual(firstType, elemType):
encPanic(errArrayMixedElementTypes)
}
}
// If we have a nested array, then we must make sure that the nested
// array contains ONLY primitives.
// This checks arbitrarily nested arrays.
if typeEqual(firstType, tomlArray) || typeEqual(firstType, tomlArrayHash) {
nest := tomlArrayType(eindirect(rv.Index(0)))
if typeEqual(nest, tomlHash) || typeEqual(nest, tomlArrayHash) {
encPanic(errArrayNoTable)
}
}
return firstType
}
func getOptions(keyName string) (string, map[string]struct{}) {
opts := make(map[string]struct{})
ss := strings.Split(keyName, ",")
name := ss[0]
if len(ss) > 1 {
for _, opt := range ss {
opts[opt] = struct{}{}
}
}
return name, opts
}
func isZero(rv reflect.Value) bool {
switch rv.Kind() {
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
if rv.Int() == 0 {
return true
}
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
if rv.Uint() == 0 {
return true
}
case reflect.Float32, reflect.Float64:
if rv.Float() == 0.0 {
return true
}
}
return false
}
func isEmpty(rv reflect.Value) bool {
switch rv.Kind() {
case reflect.String:
if len(strings.TrimSpace(rv.String())) == 0 {
return true
}
case reflect.Array, reflect.Slice, reflect.Map:
if rv.Len() == 0 {
return true
}
}
return false
}
func (enc *Encoder) newline() {
if enc.hasWritten {
enc.wf("\n")
}
}
func (enc *Encoder) keyEqElement(key Key, val reflect.Value) {
if len(key) == 0 {
encPanic(errNoKey)
}
panicIfInvalidKey(key)
enc.wf("%s%s = ", enc.indentStr(key), key.maybeQuoted(len(key)-1))
enc.eElement(val)
enc.newline()
}
func (enc *Encoder) wf(format string, v ...interface{}) {
if _, err := fmt.Fprintf(enc.w, format, v...); err != nil {
encPanic(err)
}
enc.hasWritten = true
}
func (enc *Encoder) indentStr(key Key) string {
return strings.Repeat(enc.Indent, len(key)-1)
}
func encPanic(err error) {
panic(tomlEncodeError{err})
}
func eindirect(v reflect.Value) reflect.Value {
switch v.Kind() {
case reflect.Ptr, reflect.Interface:
return eindirect(v.Elem())
default:
return v
}
}
func isNil(rv reflect.Value) bool {
switch rv.Kind() {
case reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice:
return rv.IsNil()
default:
return false
}
}
func panicIfInvalidKey(key Key) {
for _, k := range key {
if len(k) == 0 {
encPanic(e("Key '%s' is not a valid table name. Key names "+
"cannot be empty.", key.maybeQuotedAll()))
}
}
}
func isValidKeyName(s string) bool {
return len(s) != 0
}

View File

@ -1,566 +0,0 @@
package toml
import (
"bytes"
"fmt"
"log"
"net"
"testing"
"time"
)
func TestEncodeRoundTrip(t *testing.T) {
type Config struct {
Age int
Cats []string
Pi float64
Perfection []int
DOB time.Time
Ipaddress net.IP
}
var inputs = Config{
13,
[]string{"one", "two", "three"},
3.145,
[]int{11, 2, 3, 4},
time.Now(),
net.ParseIP("192.168.59.254"),
}
var firstBuffer bytes.Buffer
e := NewEncoder(&firstBuffer)
err := e.Encode(inputs)
if err != nil {
t.Fatal(err)
}
var outputs Config
if _, err := Decode(firstBuffer.String(), &outputs); err != nil {
log.Printf("Could not decode:\n-----\n%s\n-----\n",
firstBuffer.String())
t.Fatal(err)
}
// could test each value individually, but I'm lazy
var secondBuffer bytes.Buffer
e2 := NewEncoder(&secondBuffer)
err = e2.Encode(outputs)
if err != nil {
t.Fatal(err)
}
if firstBuffer.String() != secondBuffer.String() {
t.Error(
firstBuffer.String(),
"\n\n is not identical to\n\n",
secondBuffer.String())
}
}
// XXX(burntsushi)
// I think these tests probably should be removed. They are good, but they
// ought to be obsolete by toml-test.
func TestEncode(t *testing.T) {
type Embedded struct {
Int int `toml:"_int"`
}
type NonStruct int
date := time.Date(2014, 5, 11, 20, 30, 40, 0, time.FixedZone("IST", 3600))
dateStr := "2014-05-11T19:30:40Z"
tests := map[string]struct {
input interface{}
wantOutput string
wantError error
}{
"bool field": {
input: struct {
BoolTrue bool
BoolFalse bool
}{true, false},
wantOutput: "BoolTrue = true\nBoolFalse = false\n",
},
"int fields": {
input: struct {
Int int
Int8 int8
Int16 int16
Int32 int32
Int64 int64
}{1, 2, 3, 4, 5},
wantOutput: "Int = 1\nInt8 = 2\nInt16 = 3\nInt32 = 4\nInt64 = 5\n",
},
"uint fields": {
input: struct {
Uint uint
Uint8 uint8
Uint16 uint16
Uint32 uint32
Uint64 uint64
}{1, 2, 3, 4, 5},
wantOutput: "Uint = 1\nUint8 = 2\nUint16 = 3\nUint32 = 4" +
"\nUint64 = 5\n",
},
"float fields": {
input: struct {
Float32 float32
Float64 float64
}{1.5, 2.5},
wantOutput: "Float32 = 1.5\nFloat64 = 2.5\n",
},
"string field": {
input: struct{ String string }{"foo"},
wantOutput: "String = \"foo\"\n",
},
"string field and unexported field": {
input: struct {
String string
unexported int
}{"foo", 0},
wantOutput: "String = \"foo\"\n",
},
"datetime field in UTC": {
input: struct{ Date time.Time }{date},
wantOutput: fmt.Sprintf("Date = %s\n", dateStr),
},
"datetime field as primitive": {
// Using a map here to fail if isStructOrMap() returns true for
// time.Time.
input: map[string]interface{}{
"Date": date,
"Int": 1,
},
wantOutput: fmt.Sprintf("Date = %s\nInt = 1\n", dateStr),
},
"array fields": {
input: struct {
IntArray0 [0]int
IntArray3 [3]int
}{[0]int{}, [3]int{1, 2, 3}},
wantOutput: "IntArray0 = []\nIntArray3 = [1, 2, 3]\n",
},
"slice fields": {
input: struct{ IntSliceNil, IntSlice0, IntSlice3 []int }{
nil, []int{}, []int{1, 2, 3},
},
wantOutput: "IntSlice0 = []\nIntSlice3 = [1, 2, 3]\n",
},
"datetime slices": {
input: struct{ DatetimeSlice []time.Time }{
[]time.Time{date, date},
},
wantOutput: fmt.Sprintf("DatetimeSlice = [%s, %s]\n",
dateStr, dateStr),
},
"nested arrays and slices": {
input: struct {
SliceOfArrays [][2]int
ArrayOfSlices [2][]int
SliceOfArraysOfSlices [][2][]int
ArrayOfSlicesOfArrays [2][][2]int
SliceOfMixedArrays [][2]interface{}
ArrayOfMixedSlices [2][]interface{}
}{
[][2]int{{1, 2}, {3, 4}},
[2][]int{{1, 2}, {3, 4}},
[][2][]int{
{
{1, 2}, {3, 4},
},
{
{5, 6}, {7, 8},
},
},
[2][][2]int{
{
{1, 2}, {3, 4},
},
{
{5, 6}, {7, 8},
},
},
[][2]interface{}{
{1, 2}, {"a", "b"},
},
[2][]interface{}{
{1, 2}, {"a", "b"},
},
},
wantOutput: `SliceOfArrays = [[1, 2], [3, 4]]
ArrayOfSlices = [[1, 2], [3, 4]]
SliceOfArraysOfSlices = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
ArrayOfSlicesOfArrays = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
SliceOfMixedArrays = [[1, 2], ["a", "b"]]
ArrayOfMixedSlices = [[1, 2], ["a", "b"]]
`,
},
"empty slice": {
input: struct{ Empty []interface{} }{[]interface{}{}},
wantOutput: "Empty = []\n",
},
"(error) slice with element type mismatch (string and integer)": {
input: struct{ Mixed []interface{} }{[]interface{}{1, "a"}},
wantError: errArrayMixedElementTypes,
},
"(error) slice with element type mismatch (integer and float)": {
input: struct{ Mixed []interface{} }{[]interface{}{1, 2.5}},
wantError: errArrayMixedElementTypes,
},
"slice with elems of differing Go types, same TOML types": {
input: struct {
MixedInts []interface{}
MixedFloats []interface{}
}{
[]interface{}{
int(1), int8(2), int16(3), int32(4), int64(5),
uint(1), uint8(2), uint16(3), uint32(4), uint64(5),
},
[]interface{}{float32(1.5), float64(2.5)},
},
wantOutput: "MixedInts = [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]\n" +
"MixedFloats = [1.5, 2.5]\n",
},
"(error) slice w/ element type mismatch (one is nested array)": {
input: struct{ Mixed []interface{} }{
[]interface{}{1, []interface{}{2}},
},
wantError: errArrayMixedElementTypes,
},
"(error) slice with 1 nil element": {
input: struct{ NilElement1 []interface{} }{[]interface{}{nil}},
wantError: errArrayNilElement,
},
"(error) slice with 1 nil element (and other non-nil elements)": {
input: struct{ NilElement []interface{} }{
[]interface{}{1, nil},
},
wantError: errArrayNilElement,
},
"simple map": {
input: map[string]int{"a": 1, "b": 2},
wantOutput: "a = 1\nb = 2\n",
},
"map with interface{} value type": {
input: map[string]interface{}{"a": 1, "b": "c"},
wantOutput: "a = 1\nb = \"c\"\n",
},
"map with interface{} value type, some of which are structs": {
input: map[string]interface{}{
"a": struct{ Int int }{2},
"b": 1,
},
wantOutput: "b = 1\n\n[a]\n Int = 2\n",
},
"nested map": {
input: map[string]map[string]int{
"a": {"b": 1},
"c": {"d": 2},
},
wantOutput: "[a]\n b = 1\n\n[c]\n d = 2\n",
},
"nested struct": {
input: struct{ Struct struct{ Int int } }{
struct{ Int int }{1},
},
wantOutput: "[Struct]\n Int = 1\n",
},
"nested struct and non-struct field": {
input: struct {
Struct struct{ Int int }
Bool bool
}{struct{ Int int }{1}, true},
wantOutput: "Bool = true\n\n[Struct]\n Int = 1\n",
},
"2 nested structs": {
input: struct{ Struct1, Struct2 struct{ Int int } }{
struct{ Int int }{1}, struct{ Int int }{2},
},
wantOutput: "[Struct1]\n Int = 1\n\n[Struct2]\n Int = 2\n",
},
"deeply nested structs": {
input: struct {
Struct1, Struct2 struct{ Struct3 *struct{ Int int } }
}{
struct{ Struct3 *struct{ Int int } }{&struct{ Int int }{1}},
struct{ Struct3 *struct{ Int int } }{nil},
},
wantOutput: "[Struct1]\n [Struct1.Struct3]\n Int = 1" +
"\n\n[Struct2]\n",
},
"nested struct with nil struct elem": {
input: struct {
Struct struct{ Inner *struct{ Int int } }
}{
struct{ Inner *struct{ Int int } }{nil},
},
wantOutput: "[Struct]\n",
},
"nested struct with no fields": {
input: struct {
Struct struct{ Inner struct{} }
}{
struct{ Inner struct{} }{struct{}{}},
},
wantOutput: "[Struct]\n [Struct.Inner]\n",
},
"struct with tags": {
input: struct {
Struct struct {
Int int `toml:"_int"`
} `toml:"_struct"`
Bool bool `toml:"_bool"`
}{
struct {
Int int `toml:"_int"`
}{1}, true,
},
wantOutput: "_bool = true\n\n[_struct]\n _int = 1\n",
},
"embedded struct": {
input: struct{ Embedded }{Embedded{1}},
wantOutput: "_int = 1\n",
},
"embedded *struct": {
input: struct{ *Embedded }{&Embedded{1}},
wantOutput: "_int = 1\n",
},
"nested embedded struct": {
input: struct {
Struct struct{ Embedded } `toml:"_struct"`
}{struct{ Embedded }{Embedded{1}}},
wantOutput: "[_struct]\n _int = 1\n",
},
"nested embedded *struct": {
input: struct {
Struct struct{ *Embedded } `toml:"_struct"`
}{struct{ *Embedded }{&Embedded{1}}},
wantOutput: "[_struct]\n _int = 1\n",
},
"embedded non-struct": {
input: struct{ NonStruct }{5},
wantOutput: "NonStruct = 5\n",
},
"array of tables": {
input: struct {
Structs []*struct{ Int int } `toml:"struct"`
}{
[]*struct{ Int int }{{1}, {3}},
},
wantOutput: "[[struct]]\n Int = 1\n\n[[struct]]\n Int = 3\n",
},
"array of tables order": {
input: map[string]interface{}{
"map": map[string]interface{}{
"zero": 5,
"arr": []map[string]int{
{
"friend": 5,
},
},
},
},
wantOutput: "[map]\n zero = 5\n\n [[map.arr]]\n friend = 5\n",
},
"(error) top-level slice": {
input: []struct{ Int int }{{1}, {2}, {3}},
wantError: errNoKey,
},
"(error) slice of slice": {
input: struct {
Slices [][]struct{ Int int }
}{
[][]struct{ Int int }{{{1}}, {{2}}, {{3}}},
},
wantError: errArrayNoTable,
},
"(error) map no string key": {
input: map[int]string{1: ""},
wantError: errNonString,
},
"(error) empty key name": {
input: map[string]int{"": 1},
wantError: errAnything,
},
"(error) empty map name": {
input: map[string]interface{}{
"": map[string]int{"v": 1},
},
wantError: errAnything,
},
}
for label, test := range tests {
encodeExpected(t, label, test.input, test.wantOutput, test.wantError)
}
}
func TestEncodeNestedTableArrays(t *testing.T) {
type song struct {
Name string `toml:"name"`
}
type album struct {
Name string `toml:"name"`
Songs []song `toml:"songs"`
}
type springsteen struct {
Albums []album `toml:"albums"`
}
value := springsteen{
[]album{
{"Born to Run",
[]song{{"Jungleland"}, {"Meeting Across the River"}}},
{"Born in the USA",
[]song{{"Glory Days"}, {"Dancing in the Dark"}}},
},
}
expected := `[[albums]]
name = "Born to Run"
[[albums.songs]]
name = "Jungleland"
[[albums.songs]]
name = "Meeting Across the River"
[[albums]]
name = "Born in the USA"
[[albums.songs]]
name = "Glory Days"
[[albums.songs]]
name = "Dancing in the Dark"
`
encodeExpected(t, "nested table arrays", value, expected, nil)
}
func TestEncodeArrayHashWithNormalHashOrder(t *testing.T) {
type Alpha struct {
V int
}
type Beta struct {
V int
}
type Conf struct {
V int
A Alpha
B []Beta
}
val := Conf{
V: 1,
A: Alpha{2},
B: []Beta{{3}},
}
expected := "V = 1\n\n[A]\n V = 2\n\n[[B]]\n V = 3\n"
encodeExpected(t, "array hash with normal hash order", val, expected, nil)
}
func TestEncodeWithOmitEmpty(t *testing.T) {
type simple struct {
User string `toml:"user"`
Pass string `toml:"password,omitempty"`
}
value := simple{"Testing", ""}
expected := fmt.Sprintf("user = %q\n", value.User)
encodeExpected(t, "simple with omitempty, is empty", value, expected, nil)
value.Pass = "some password"
expected = fmt.Sprintf("user = %q\npassword = %q\n", value.User, value.Pass)
encodeExpected(t, "simple with omitempty, not empty", value, expected, nil)
}
func TestEncodeWithOmitZero(t *testing.T) {
type simple struct {
Number int `toml:"number,omitzero"`
Real float64 `toml:"real,omitzero"`
Unsigned uint `toml:"unsigned,omitzero"`
}
value := simple{0, 0.0, uint(0)}
expected := ""
encodeExpected(t, "simple with omitzero, all zero", value, expected, nil)
value.Number = 10
value.Real = 20
value.Unsigned = 5
expected = `number = 10
real = 20.0
unsigned = 5
`
encodeExpected(t, "simple with omitzero, non-zero", value, expected, nil)
}
func TestEncodeAnonymousStructPointerField(t *testing.T) {
type Sub struct{}
type simple struct {
*Sub
}
value := simple{}
expected := ""
encodeExpected(t, "nil anonymous struct pointer field", value, expected, nil)
value = simple{Sub: &Sub{}}
expected = ""
encodeExpected(t, "non-nil anonymous struct pointer field", value, expected, nil)
}
func TestEncodeIgnoredFields(t *testing.T) {
type simple struct {
Number int `toml:"-"`
}
value := simple{}
expected := ""
encodeExpected(t, "ignored field", value, expected, nil)
}
func encodeExpected(
t *testing.T, label string, val interface{}, wantStr string, wantErr error,
) {
var buf bytes.Buffer
enc := NewEncoder(&buf)
err := enc.Encode(val)
if err != wantErr {
if wantErr != nil {
if wantErr == errAnything && err != nil {
return
}
t.Errorf("%s: want Encode error %v, got %v", label, wantErr, err)
} else {
t.Errorf("%s: Encode failed: %s", label, err)
}
}
if err != nil {
return
}
if got := buf.String(); wantStr != got {
t.Errorf("%s: want\n-----\n%q\n-----\nbut got\n-----\n%q\n-----\n",
label, wantStr, got)
}
}
func ExampleEncoder_Encode() {
date, _ := time.Parse(time.RFC822, "14 Mar 10 18:00 UTC")
var config = map[string]interface{}{
"date": date,
"counts": []int{1, 1, 2, 3, 5, 8},
"hash": map[string]string{
"key1": "val1",
"key2": "val2",
},
}
buf := new(bytes.Buffer)
if err := NewEncoder(buf).Encode(config); err != nil {
log.Fatal(err)
}
fmt.Println(buf.String())
// Output:
// counts = [1, 1, 2, 3, 5, 8]
// date = 2010-03-14T18:00:00Z
//
// [hash]
// key1 = "val1"
// key2 = "val2"
}

View File

@@ -1,19 +0,0 @@
// +build go1.2
package toml
// In order to support Go 1.1, we define our own TextMarshaler and
// TextUnmarshaler types. For Go 1.2+, we just alias them with the
// standard library interfaces.
import (
"encoding"
)
// TextMarshaler is a synonym for encoding.TextMarshaler. It is defined here
// so that Go 1.1 can be supported.
type TextMarshaler encoding.TextMarshaler
// TextUnmarshaler is a synonym for encoding.TextUnmarshaler. It is defined
// here so that Go 1.1 can be supported.
type TextUnmarshaler encoding.TextUnmarshaler
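As a rough illustration (not part of the package), any type with a MarshalText method satisfies this alias; something like the hypothetical Point below could then be emitted as plain text by an encoder that honors TextMarshaler:

```go
package main

import "fmt"

// Point satisfies encoding.TextMarshaler, and therefore the TextMarshaler
// alias above, by rendering itself as "x,y".
type Point struct{ X, Y int }

func (p Point) MarshalText() ([]byte, error) {
	return []byte(fmt.Sprintf("%d,%d", p.X, p.Y)), nil
}

func main() {
	b, _ := Point{1, 2}.MarshalText()
	fmt.Println(string(b)) // 1,2
}
```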

View File

@@ -1,18 +0,0 @@
// +build !go1.2
package toml
// These interfaces were introduced in Go 1.2, so we add them manually when
// compiling for Go 1.1.
// TextMarshaler is a synonym for encoding.TextMarshaler. It is defined here
// so that Go 1.1 can be supported.
type TextMarshaler interface {
MarshalText() (text []byte, err error)
}
// TextUnmarshaler is a synonym for encoding.TextUnmarshaler. It is defined
// here so that Go 1.1 can be supported.
type TextUnmarshaler interface {
UnmarshalText(text []byte) error
}

View File

@@ -1,871 +0,0 @@
package toml
import (
"fmt"
"strings"
"unicode/utf8"
)
type itemType int
const (
itemError itemType = iota
itemNIL // used in the parser to indicate no type
itemEOF
itemText
itemString
itemRawString
itemMultilineString
itemRawMultilineString
itemBool
itemInteger
itemFloat
itemDatetime
itemArray // the start of an array
itemArrayEnd
itemTableStart
itemTableEnd
itemArrayTableStart
itemArrayTableEnd
itemKeyStart
itemCommentStart
)
const (
eof = 0
tableStart = '['
tableEnd = ']'
arrayTableStart = '['
arrayTableEnd = ']'
tableSep = '.'
keySep = '='
arrayStart = '['
arrayEnd = ']'
arrayValTerm = ','
commentStart = '#'
stringStart = '"'
stringEnd = '"'
rawStringStart = '\''
rawStringEnd = '\''
)
type stateFn func(lx *lexer) stateFn
type lexer struct {
input string
start int
pos int
width int
line int
state stateFn
items chan item
// A stack of state functions used to maintain context.
// The idea is to reuse parts of the state machine in various places.
// For example, values can appear at the top level or within arbitrarily
// nested arrays. The last state on the stack is used after a value has
// been lexed. Similarly for comments.
stack []stateFn
}
type item struct {
typ itemType
val string
line int
}
func (lx *lexer) nextItem() item {
for {
select {
case item := <-lx.items:
return item
default:
lx.state = lx.state(lx)
}
}
}
func lex(input string) *lexer {
lx := &lexer{
input: input + "\n",
state: lexTop,
line: 1,
items: make(chan item, 10),
stack: make([]stateFn, 0, 10),
}
return lx
}
func (lx *lexer) push(state stateFn) {
lx.stack = append(lx.stack, state)
}
func (lx *lexer) pop() stateFn {
if len(lx.stack) == 0 {
return lx.errorf("BUG in lexer: no states to pop.")
}
last := lx.stack[len(lx.stack)-1]
lx.stack = lx.stack[0 : len(lx.stack)-1]
return last
}
func (lx *lexer) current() string {
return lx.input[lx.start:lx.pos]
}
func (lx *lexer) emit(typ itemType) {
lx.items <- item{typ, lx.current(), lx.line}
lx.start = lx.pos
}
func (lx *lexer) emitTrim(typ itemType) {
lx.items <- item{typ, strings.TrimSpace(lx.current()), lx.line}
lx.start = lx.pos
}
func (lx *lexer) next() (r rune) {
if lx.pos >= len(lx.input) {
lx.width = 0
return eof
}
if lx.input[lx.pos] == '\n' {
lx.line++
}
r, lx.width = utf8.DecodeRuneInString(lx.input[lx.pos:])
lx.pos += lx.width
return r
}
// ignore skips over the pending input before this point.
func (lx *lexer) ignore() {
lx.start = lx.pos
}
// backup steps back one rune. Can be called only once per call of next.
func (lx *lexer) backup() {
lx.pos -= lx.width
if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
lx.line--
}
}
// accept consumes the next rune if it's equal to `valid`.
func (lx *lexer) accept(valid rune) bool {
if lx.next() == valid {
return true
}
lx.backup()
return false
}
// peek returns but does not consume the next rune in the input.
func (lx *lexer) peek() rune {
r := lx.next()
lx.backup()
return r
}
// errorf stops all lexing by emitting an error and returning `nil`.
// Note that any value that is a character is escaped if it's a special
// character (new lines, tabs, etc.).
func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
lx.items <- item{
itemError,
fmt.Sprintf(format, values...),
lx.line,
}
return nil
}
// lexTop consumes elements at the top level of TOML data.
func lexTop(lx *lexer) stateFn {
r := lx.next()
if isWhitespace(r) || isNL(r) {
return lexSkip(lx, lexTop)
}
switch r {
case commentStart:
lx.push(lexTop)
return lexCommentStart
case tableStart:
return lexTableStart
case eof:
if lx.pos > lx.start {
return lx.errorf("Unexpected EOF.")
}
lx.emit(itemEOF)
return nil
}
// At this point, the only valid item can be a key, so we back up
// and let the key lexer do the rest.
lx.backup()
lx.push(lexTopEnd)
return lexKeyStart
}
// lexTopEnd is entered whenever a top-level item has been consumed. (A value
// or a table.) It must see only whitespace, and will turn back to lexTop
// upon a new line. If it sees EOF, it will quit the lexer successfully.
func lexTopEnd(lx *lexer) stateFn {
r := lx.next()
switch {
case r == commentStart:
// a comment will read to a new line for us.
lx.push(lexTop)
return lexCommentStart
case isWhitespace(r):
return lexTopEnd
case isNL(r):
lx.ignore()
return lexTop
case r == eof:
lx.ignore()
return lexTop
}
return lx.errorf("Expected a top-level item to end with a new line, "+
"comment or EOF, but got %q instead.", r)
}
// lexTableStart lexes the beginning of a table. Namely, it makes sure that
// it starts with a character other than '.' and ']'.
// It assumes that '[' has already been consumed.
// It also handles the case that this is an item in an array of tables.
// e.g., '[[name]]'.
func lexTableStart(lx *lexer) stateFn {
if lx.peek() == arrayTableStart {
lx.next()
lx.emit(itemArrayTableStart)
lx.push(lexArrayTableEnd)
} else {
lx.emit(itemTableStart)
lx.push(lexTableEnd)
}
return lexTableNameStart
}
func lexTableEnd(lx *lexer) stateFn {
lx.emit(itemTableEnd)
return lexTopEnd
}
func lexArrayTableEnd(lx *lexer) stateFn {
if r := lx.next(); r != arrayTableEnd {
return lx.errorf("Expected end of table array name delimiter %q, "+
"but got %q instead.", arrayTableEnd, r)
}
lx.emit(itemArrayTableEnd)
return lexTopEnd
}
func lexTableNameStart(lx *lexer) stateFn {
switch r := lx.peek(); {
case r == tableEnd || r == eof:
return lx.errorf("Unexpected end of table name. (Table names cannot " +
"be empty.)")
case r == tableSep:
return lx.errorf("Unexpected table separator. (Table names cannot " +
"be empty.)")
case r == stringStart || r == rawStringStart:
lx.ignore()
lx.push(lexTableNameEnd)
return lexValue // reuse string lexing
default:
return lexBareTableName
}
}
// lexBareTableName lexes the name of a table. It assumes that at least one
// valid character for the table has already been read.
func lexBareTableName(lx *lexer) stateFn {
switch r := lx.next(); {
case isBareKeyChar(r):
return lexBareTableName
case r == tableSep || r == tableEnd:
lx.backup()
lx.emitTrim(itemText)
return lexTableNameEnd
default:
return lx.errorf("Bare keys cannot contain %q.", r)
}
}
// lexTableNameEnd reads the end of a piece of a table name, optionally
// consuming whitespace.
func lexTableNameEnd(lx *lexer) stateFn {
switch r := lx.next(); {
case isWhitespace(r):
return lexTableNameEnd
case r == tableSep:
lx.ignore()
return lexTableNameStart
case r == tableEnd:
return lx.pop()
default:
return lx.errorf("Expected '.' or ']' to end table name, but got %q "+
"instead.", r)
}
}
// lexKeyStart consumes a key name up until the first non-whitespace character.
// lexKeyStart will ignore whitespace.
func lexKeyStart(lx *lexer) stateFn {
r := lx.peek()
switch {
case r == keySep:
return lx.errorf("Unexpected key separator %q.", keySep)
case isWhitespace(r) || isNL(r):
lx.next()
return lexSkip(lx, lexKeyStart)
case r == stringStart || r == rawStringStart:
lx.ignore()
lx.emit(itemKeyStart)
lx.push(lexKeyEnd)
return lexValue // reuse string lexing
default:
lx.ignore()
lx.emit(itemKeyStart)
return lexBareKey
}
}
// lexBareKey consumes the text of a bare key. Assumes that the first character
// (which is not whitespace) has not yet been consumed.
func lexBareKey(lx *lexer) stateFn {
switch r := lx.next(); {
case isBareKeyChar(r):
return lexBareKey
case isWhitespace(r):
lx.emitTrim(itemText)
return lexKeyEnd
case r == keySep:
lx.backup()
lx.emitTrim(itemText)
return lexKeyEnd
default:
return lx.errorf("Bare keys cannot contain %q.", r)
}
}
// lexKeyEnd consumes the end of a key and trims whitespace (up to the key
// separator).
func lexKeyEnd(lx *lexer) stateFn {
switch r := lx.next(); {
case r == keySep:
return lexSkip(lx, lexValue)
case isWhitespace(r):
return lexSkip(lx, lexKeyEnd)
default:
return lx.errorf("Expected key separator %q, but got %q instead.",
keySep, r)
}
}
// lexValue starts the consumption of a value anywhere a value is expected.
// lexValue will ignore whitespace.
// After a value is lexed, the last state on the stack is popped and returned.
func lexValue(lx *lexer) stateFn {
// We allow whitespace to precede a value, but NOT new lines.
// In array syntax, the array states are responsible for ignoring new
// lines.
r := lx.next()
if isWhitespace(r) {
return lexSkip(lx, lexValue)
}
switch {
case r == arrayStart:
lx.ignore()
lx.emit(itemArray)
return lexArrayValue
case r == stringStart:
if lx.accept(stringStart) {
if lx.accept(stringStart) {
lx.ignore() // Ignore """
return lexMultilineString
}
lx.backup()
}
lx.ignore() // ignore the '"'
return lexString
case r == rawStringStart:
if lx.accept(rawStringStart) {
if lx.accept(rawStringStart) {
lx.ignore() // Ignore '''
return lexMultilineRawString
}
lx.backup()
}
lx.ignore() // ignore the "'"
return lexRawString
case r == 't':
return lexTrue
case r == 'f':
return lexFalse
case r == '-':
return lexNumberStart
case isDigit(r):
lx.backup() // avoid an extra state and use the same as above
return lexNumberOrDateStart
case r == '.': // special error case, be kind to users
return lx.errorf("Floats must start with a digit, not '.'.")
}
return lx.errorf("Expected value but found %q instead.", r)
}
// lexArrayValue consumes one value in an array. It assumes that '[' or ','
// have already been consumed. All whitespace and new lines are ignored.
func lexArrayValue(lx *lexer) stateFn {
r := lx.next()
switch {
case isWhitespace(r) || isNL(r):
return lexSkip(lx, lexArrayValue)
case r == commentStart:
lx.push(lexArrayValue)
return lexCommentStart
case r == arrayValTerm:
return lx.errorf("Unexpected array value terminator %q.",
arrayValTerm)
case r == arrayEnd:
return lexArrayEnd
}
lx.backup()
lx.push(lexArrayValueEnd)
return lexValue
}
// lexArrayValueEnd consumes the cruft between values of an array. Namely,
// it ignores whitespace and expects either a ',' or a ']'.
func lexArrayValueEnd(lx *lexer) stateFn {
r := lx.next()
switch {
case isWhitespace(r) || isNL(r):
return lexSkip(lx, lexArrayValueEnd)
case r == commentStart:
lx.push(lexArrayValueEnd)
return lexCommentStart
case r == arrayValTerm:
lx.ignore()
return lexArrayValue // move on to the next value
case r == arrayEnd:
return lexArrayEnd
}
return lx.errorf("Expected an array value terminator %q or an array "+
"terminator %q, but got %q instead.", arrayValTerm, arrayEnd, r)
}
// lexArrayEnd finishes the lexing of an array. It assumes that a ']' has
// just been consumed.
func lexArrayEnd(lx *lexer) stateFn {
lx.ignore()
lx.emit(itemArrayEnd)
return lx.pop()
}
// lexString consumes the inner contents of a string. It assumes that the
// beginning '"' has already been consumed and ignored.
func lexString(lx *lexer) stateFn {
r := lx.next()
switch {
case isNL(r):
return lx.errorf("Strings cannot contain new lines.")
case r == '\\':
lx.push(lexString)
return lexStringEscape
case r == stringEnd:
lx.backup()
lx.emit(itemString)
lx.next()
lx.ignore()
return lx.pop()
}
return lexString
}
// lexMultilineString consumes the inner contents of a string. It assumes that
// the beginning '"""' has already been consumed and ignored.
func lexMultilineString(lx *lexer) stateFn {
r := lx.next()
switch {
case r == '\\':
return lexMultilineStringEscape
case r == stringEnd:
if lx.accept(stringEnd) {
if lx.accept(stringEnd) {
lx.backup()
lx.backup()
lx.backup()
lx.emit(itemMultilineString)
lx.next()
lx.next()
lx.next()
lx.ignore()
return lx.pop()
}
lx.backup()
}
}
return lexMultilineString
}
// lexRawString consumes a raw string. Nothing can be escaped in such a string.
// It assumes that the beginning "'" has already been consumed and ignored.
func lexRawString(lx *lexer) stateFn {
r := lx.next()
switch {
case isNL(r):
return lx.errorf("Strings cannot contain new lines.")
case r == rawStringEnd:
lx.backup()
lx.emit(itemRawString)
lx.next()
lx.ignore()
return lx.pop()
}
return lexRawString
}
// lexMultilineRawString consumes a raw string. Nothing can be escaped in such
// a string. It assumes that the beginning "'''" has already been consumed and
// ignored.
func lexMultilineRawString(lx *lexer) stateFn {
r := lx.next()
switch {
case r == rawStringEnd:
if lx.accept(rawStringEnd) {
if lx.accept(rawStringEnd) {
lx.backup()
lx.backup()
lx.backup()
lx.emit(itemRawMultilineString)
lx.next()
lx.next()
lx.next()
lx.ignore()
return lx.pop()
}
lx.backup()
}
}
return lexMultilineRawString
}
// lexMultilineStringEscape consumes an escaped character. It assumes that the
// preceding '\\' has already been consumed.
func lexMultilineStringEscape(lx *lexer) stateFn {
// Handle the special case first:
if isNL(lx.next()) {
return lexMultilineString
} else {
lx.backup()
lx.push(lexMultilineString)
return lexStringEscape(lx)
}
}
func lexStringEscape(lx *lexer) stateFn {
r := lx.next()
switch r {
case 'b':
fallthrough
case 't':
fallthrough
case 'n':
fallthrough
case 'f':
fallthrough
case 'r':
fallthrough
case '"':
fallthrough
case '\\':
return lx.pop()
case 'u':
return lexShortUnicodeEscape
case 'U':
return lexLongUnicodeEscape
}
return lx.errorf("Invalid escape character %q. Only the following "+
"escape characters are allowed: "+
"\\b, \\t, \\n, \\f, \\r, \\\", \\/, \\\\, "+
"\\uXXXX and \\UXXXXXXXX.", r)
}
func lexShortUnicodeEscape(lx *lexer) stateFn {
var r rune
for i := 0; i < 4; i++ {
r = lx.next()
if !isHexadecimal(r) {
return lx.errorf("Expected four hexadecimal digits after '\\u', "+
"but got '%s' instead.", lx.current())
}
}
return lx.pop()
}
func lexLongUnicodeEscape(lx *lexer) stateFn {
var r rune
for i := 0; i < 8; i++ {
r = lx.next()
if !isHexadecimal(r) {
return lx.errorf("Expected eight hexadecimal digits after '\\U', "+
"but got '%s' instead.", lx.current())
}
}
return lx.pop()
}
// lexNumberOrDateStart consumes either a (positive) integer, float or
// datetime. It assumes that NO negative sign has been consumed.
func lexNumberOrDateStart(lx *lexer) stateFn {
r := lx.next()
if !isDigit(r) {
if r == '.' {
return lx.errorf("Floats must start with a digit, not '.'.")
} else {
return lx.errorf("Expected a digit but got %q.", r)
}
}
return lexNumberOrDate
}
// lexNumberOrDate consumes either a (positive) integer, float or datetime.
func lexNumberOrDate(lx *lexer) stateFn {
r := lx.next()
switch {
case r == '-':
if lx.pos-lx.start != 5 {
return lx.errorf("All ISO8601 dates must be in full Zulu form.")
}
return lexDateAfterYear
case isDigit(r):
return lexNumberOrDate
case r == '.':
return lexFloatStart
}
lx.backup()
lx.emit(itemInteger)
return lx.pop()
}
// lexDateAfterYear consumes a full Zulu Datetime in ISO8601 format.
// It assumes that "YYYY-" has already been consumed.
func lexDateAfterYear(lx *lexer) stateFn {
formats := []rune{
// digits are '0'.
// everything else is direct equality.
'0', '0', '-', '0', '0',
'T',
'0', '0', ':', '0', '0', ':', '0', '0',
'Z',
}
for _, f := range formats {
r := lx.next()
if f == '0' {
if !isDigit(r) {
return lx.errorf("Expected digit in ISO8601 datetime, "+
"but found %q instead.", r)
}
} else if f != r {
return lx.errorf("Expected %q in ISO8601 datetime, "+
"but found %q instead.", f, r)
}
}
lx.emit(itemDatetime)
return lx.pop()
}
// lexNumberStart consumes either an integer or a float. It assumes that
// a negative sign has already been read, but that *no* digits have been
// consumed. lexNumberStart will move to the appropriate integer or float
// states.
func lexNumberStart(lx *lexer) stateFn {
// we MUST see a digit. Even floats have to start with a digit.
r := lx.next()
if !isDigit(r) {
if r == '.' {
return lx.errorf("Floats must start with a digit, not '.'.")
} else {
return lx.errorf("Expected a digit but got %q.", r)
}
}
return lexNumber
}
// lexNumber consumes an integer or a float after seeing the first digit.
func lexNumber(lx *lexer) stateFn {
r := lx.next()
switch {
case isDigit(r):
return lexNumber
case r == '.':
return lexFloatStart
}
lx.backup()
lx.emit(itemInteger)
return lx.pop()
}
// lexFloatStart starts the consumption of digits of a float after a '.'.
// Namely, at least one digit is required.
func lexFloatStart(lx *lexer) stateFn {
r := lx.next()
if !isDigit(r) {
return lx.errorf("Floats must have a digit after the '.', but got "+
"%q instead.", r)
}
return lexFloat
}
// lexFloat consumes the digits of a float after a '.'.
// Assumes that one digit has been consumed after a '.' already.
func lexFloat(lx *lexer) stateFn {
r := lx.next()
if isDigit(r) {
return lexFloat
}
lx.backup()
lx.emit(itemFloat)
return lx.pop()
}
// lexConst consumes the remainder of s (that is, s[1:]). It assumes that s[0] has already been
// consumed.
func lexConst(lx *lexer, s string) stateFn {
for i := range s[1:] {
if r := lx.next(); r != rune(s[i+1]) {
return lx.errorf("Expected %q, but found %q instead.", s[:i+1],
s[:i]+string(r))
}
}
return nil
}
// lexTrue consumes the "rue" in "true". It assumes that 't' has already
// been consumed.
func lexTrue(lx *lexer) stateFn {
if fn := lexConst(lx, "true"); fn != nil {
return fn
}
lx.emit(itemBool)
return lx.pop()
}
// lexFalse consumes the "alse" in "false". It assumes that 'f' has already
// been consumed.
func lexFalse(lx *lexer) stateFn {
if fn := lexConst(lx, "false"); fn != nil {
return fn
}
lx.emit(itemBool)
return lx.pop()
}
// lexCommentStart begins the lexing of a comment. It will emit
// itemCommentStart and consume no characters, passing control to lexComment.
func lexCommentStart(lx *lexer) stateFn {
lx.ignore()
lx.emit(itemCommentStart)
return lexComment
}
// lexComment lexes an entire comment. It assumes that '#' has been consumed.
// It will consume *up to* the first new line character, and pass control
// back to the last state on the stack.
func lexComment(lx *lexer) stateFn {
r := lx.peek()
if isNL(r) || r == eof {
lx.emit(itemText)
return lx.pop()
}
lx.next()
return lexComment
}
// lexSkip ignores all slurped input and moves on to the next state.
func lexSkip(lx *lexer, nextState stateFn) stateFn {
return func(lx *lexer) stateFn {
lx.ignore()
return nextState
}
}
// isWhitespace returns true if `r` is a whitespace character according
// to the spec.
func isWhitespace(r rune) bool {
return r == '\t' || r == ' '
}
func isNL(r rune) bool {
return r == '\n' || r == '\r'
}
func isDigit(r rune) bool {
return r >= '0' && r <= '9'
}
func isHexadecimal(r rune) bool {
return (r >= '0' && r <= '9') ||
(r >= 'a' && r <= 'f') ||
(r >= 'A' && r <= 'F')
}
func isBareKeyChar(r rune) bool {
return (r >= 'A' && r <= 'Z') ||
(r >= 'a' && r <= 'z') ||
(r >= '0' && r <= '9') ||
r == '_' ||
r == '-'
}
func (itype itemType) String() string {
switch itype {
case itemError:
return "Error"
case itemNIL:
return "NIL"
case itemEOF:
return "EOF"
case itemText:
return "Text"
case itemString:
return "String"
case itemRawString:
return "String"
case itemMultilineString:
return "String"
case itemRawMultilineString:
return "String"
case itemBool:
return "Bool"
case itemInteger:
return "Integer"
case itemFloat:
return "Float"
case itemDatetime:
return "DateTime"
case itemTableStart:
return "TableStart"
case itemTableEnd:
return "TableEnd"
case itemKeyStart:
return "KeyStart"
case itemArray:
return "Array"
case itemArrayEnd:
return "ArrayEnd"
case itemCommentStart:
return "CommentStart"
}
panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
}
func (item item) String() string {
return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
}
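A sketch of how this lexer is meant to be driven — essentially the loop the parser in the next file runs. Since lex, item and the item constants are unexported, this only compiles if dropped into package toml; dumpItems is an illustrative name, not part of the library:

```go
package toml

import "fmt"

// dumpItems keeps pulling items until the lexer reports EOF or an error.
func dumpItems(input string) {
	lx := lex(input)
	for {
		it := lx.nextItem()
		fmt.Printf("line %d: %s %q\n", it.line, it.typ, it.val)
		if it.typ == itemEOF || it.typ == itemError {
			return
		}
	}
}
```

Calling dumpItems("answer = 42\n") should yield a KeyStart item, the Text item "answer", the Integer item "42" and finally EOF.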

View File

@@ -1,493 +0,0 @@
package toml
import (
"fmt"
"log"
"strconv"
"strings"
"time"
"unicode"
"unicode/utf8"
)
type parser struct {
mapping map[string]interface{}
types map[string]tomlType
lx *lexer
// A list of keys in the order that they appear in the TOML data.
ordered []Key
// the full key for the current hash in scope
context Key
// the base key name for everything except hashes
currentKey string
// rough approximation of line number
approxLine int
// A map of 'key.group.names' to whether they were created implicitly.
implicits map[string]bool
}
type parseError string
func (pe parseError) Error() string {
return string(pe)
}
func parse(data string) (p *parser, err error) {
defer func() {
if r := recover(); r != nil {
var ok bool
if err, ok = r.(parseError); ok {
return
}
panic(r)
}
}()
p = &parser{
mapping: make(map[string]interface{}),
types: make(map[string]tomlType),
lx: lex(data),
ordered: make([]Key, 0),
implicits: make(map[string]bool),
}
for {
item := p.next()
if item.typ == itemEOF {
break
}
p.topLevel(item)
}
return p, nil
}
func (p *parser) panicf(format string, v ...interface{}) {
msg := fmt.Sprintf("Near line %d (last key parsed '%s'): %s",
p.approxLine, p.current(), fmt.Sprintf(format, v...))
panic(parseError(msg))
}
func (p *parser) next() item {
it := p.lx.nextItem()
if it.typ == itemError {
p.panicf("%s", it.val)
}
return it
}
func (p *parser) bug(format string, v ...interface{}) {
log.Panicf("BUG: %s\n\n", fmt.Sprintf(format, v...))
}
func (p *parser) expect(typ itemType) item {
it := p.next()
p.assertEqual(typ, it.typ)
return it
}
func (p *parser) assertEqual(expected, got itemType) {
if expected != got {
p.bug("Expected '%s' but got '%s'.", expected, got)
}
}
func (p *parser) topLevel(item item) {
switch item.typ {
case itemCommentStart:
p.approxLine = item.line
p.expect(itemText)
case itemTableStart:
kg := p.next()
p.approxLine = kg.line
var key Key
for ; kg.typ != itemTableEnd && kg.typ != itemEOF; kg = p.next() {
key = append(key, p.keyString(kg))
}
p.assertEqual(itemTableEnd, kg.typ)
p.establishContext(key, false)
p.setType("", tomlHash)
p.ordered = append(p.ordered, key)
case itemArrayTableStart:
kg := p.next()
p.approxLine = kg.line
var key Key
for ; kg.typ != itemArrayTableEnd && kg.typ != itemEOF; kg = p.next() {
key = append(key, p.keyString(kg))
}
p.assertEqual(itemArrayTableEnd, kg.typ)
p.establishContext(key, true)
p.setType("", tomlArrayHash)
p.ordered = append(p.ordered, key)
case itemKeyStart:
kname := p.next()
p.approxLine = kname.line
p.currentKey = p.keyString(kname)
val, typ := p.value(p.next())
p.setValue(p.currentKey, val)
p.setType(p.currentKey, typ)
p.ordered = append(p.ordered, p.context.add(p.currentKey))
p.currentKey = ""
default:
p.bug("Unexpected type at top level: %s", item.typ)
}
}
// Gets a string for a key (or part of a key in a table name).
func (p *parser) keyString(it item) string {
switch it.typ {
case itemText:
return it.val
case itemString, itemMultilineString,
itemRawString, itemRawMultilineString:
s, _ := p.value(it)
return s.(string)
default:
p.bug("Unexpected key type: %s", it.typ)
panic("unreachable")
}
}
// value translates an expected value from the lexer into a Go value wrapped
// as an empty interface.
func (p *parser) value(it item) (interface{}, tomlType) {
switch it.typ {
case itemString:
return p.replaceEscapes(it.val), p.typeOfPrimitive(it)
case itemMultilineString:
trimmed := stripFirstNewline(stripEscapedWhitespace(it.val))
return p.replaceEscapes(trimmed), p.typeOfPrimitive(it)
case itemRawString:
return it.val, p.typeOfPrimitive(it)
case itemRawMultilineString:
return stripFirstNewline(it.val), p.typeOfPrimitive(it)
case itemBool:
switch it.val {
case "true":
return true, p.typeOfPrimitive(it)
case "false":
return false, p.typeOfPrimitive(it)
}
p.bug("Expected boolean value, but got '%s'.", it.val)
case itemInteger:
num, err := strconv.ParseInt(it.val, 10, 64)
if err != nil {
// See comment below for floats describing why we make a
// distinction between a bug and a user error.
if e, ok := err.(*strconv.NumError); ok &&
e.Err == strconv.ErrRange {
p.panicf("Integer '%s' is out of the range of 64-bit "+
"signed integers.", it.val)
} else {
p.bug("Expected integer value, but got '%s'.", it.val)
}
}
return num, p.typeOfPrimitive(it)
case itemFloat:
num, err := strconv.ParseFloat(it.val, 64)
if err != nil {
// Distinguish float values. Normally, it'd be a bug if the lexer
// provides an invalid float, but it's possible that the float is
// out of range of valid values (which the lexer cannot determine).
// So mark the former as a bug but the latter as a legitimate user
// error.
//
// This is also true for integers.
if e, ok := err.(*strconv.NumError); ok &&
e.Err == strconv.ErrRange {
p.panicf("Float '%s' is out of the range of 64-bit "+
"IEEE-754 floating-point numbers.", it.val)
} else {
p.bug("Expected float value, but got '%s'.", it.val)
}
}
return num, p.typeOfPrimitive(it)
case itemDatetime:
t, err := time.Parse("2006-01-02T15:04:05Z", it.val)
if err != nil {
p.panicf("Invalid RFC3339 Zulu DateTime: '%s'.", it.val)
}
return t, p.typeOfPrimitive(it)
case itemArray:
array := make([]interface{}, 0)
types := make([]tomlType, 0)
for it = p.next(); it.typ != itemArrayEnd; it = p.next() {
if it.typ == itemCommentStart {
p.expect(itemText)
continue
}
val, typ := p.value(it)
array = append(array, val)
types = append(types, typ)
}
return array, p.typeOfArray(types)
}
p.bug("Unexpected value type: %s", it.typ)
panic("unreachable")
}
// establishContext sets the current context of the parser,
// where the context is either a hash or an array of hashes. Which one is
// set depends on the value of the `array` parameter.
//
// Establishing the context also makes sure that the key isn't a duplicate, and
// will create implicit hashes automatically.
func (p *parser) establishContext(key Key, array bool) {
var ok bool
// Always start at the top level and drill down for our context.
hashContext := p.mapping
keyContext := make(Key, 0)
// We only need implicit hashes for key[0:-1]
for _, k := range key[0 : len(key)-1] {
_, ok = hashContext[k]
keyContext = append(keyContext, k)
// No key? Make an implicit hash and move on.
if !ok {
p.addImplicit(keyContext)
hashContext[k] = make(map[string]interface{})
}
// If the hash context is actually an array of tables, then set
// the hash context to the last element in that array.
//
// Otherwise, it better be a table, since this MUST be a key group (by
// virtue of it not being the last element in a key).
switch t := hashContext[k].(type) {
case []map[string]interface{}:
hashContext = t[len(t)-1]
case map[string]interface{}:
hashContext = t
default:
p.panicf("Key '%s' was already created as a hash.", keyContext)
}
}
p.context = keyContext
if array {
// If this is the first element for this array, then allocate a new
// list of tables for it.
k := key[len(key)-1]
if _, ok := hashContext[k]; !ok {
hashContext[k] = make([]map[string]interface{}, 0, 5)
}
// Add a new table. But make sure the key hasn't already been used
// for something else.
if hash, ok := hashContext[k].([]map[string]interface{}); ok {
hashContext[k] = append(hash, make(map[string]interface{}))
} else {
p.panicf("Key '%s' was already created and cannot be used as "+
"an array.", keyContext)
}
} else {
p.setValue(key[len(key)-1], make(map[string]interface{}))
}
p.context = append(p.context, key[len(key)-1])
}
// setValue sets the given key to the given value in the current context.
// It will make sure that the key hasn't already been defined, and accounts for
// implicit key groups.
func (p *parser) setValue(key string, value interface{}) {
var tmpHash interface{}
var ok bool
hash := p.mapping
keyContext := make(Key, 0)
for _, k := range p.context {
keyContext = append(keyContext, k)
if tmpHash, ok = hash[k]; !ok {
p.bug("Context for key '%s' has not been established.", keyContext)
}
switch t := tmpHash.(type) {
case []map[string]interface{}:
// The context is a table of hashes. Pick the most recent table
// defined as the current hash.
hash = t[len(t)-1]
case map[string]interface{}:
hash = t
default:
p.bug("Expected hash to have type 'map[string]interface{}', but "+
"it has '%T' instead.", tmpHash)
}
}
keyContext = append(keyContext, key)
if _, ok := hash[key]; ok {
// Typically, if the given key has already been set, then we have
// to raise an error since duplicate keys are disallowed. However,
// it's possible that a key was previously defined implicitly. In this
// case, it is allowed to be redefined concretely. (See the
// `tests/valid/implicit-and-explicit-after.toml` test in `toml-test`.)
//
// But we have to make sure to stop marking it as an implicit. (So that
// another redefinition provokes an error.)
//
// Note that since it has already been defined (as a hash), we don't
// want to overwrite it. So our business is done.
if p.isImplicit(keyContext) {
p.removeImplicit(keyContext)
return
}
// Otherwise, we have a concrete key trying to override a previous
// key, which is *always* wrong.
p.panicf("Key '%s' has already been defined.", keyContext)
}
hash[key] = value
}
// setType sets the type of a particular value at a given key.
// It should be called immediately AFTER setValue.
//
// Note that if `key` is empty, then the type given will be applied to the
// current context (which is either a table or an array of tables).
func (p *parser) setType(key string, typ tomlType) {
keyContext := make(Key, 0, len(p.context)+1)
for _, k := range p.context {
keyContext = append(keyContext, k)
}
if len(key) > 0 { // allow type setting for hashes
keyContext = append(keyContext, key)
}
p.types[keyContext.String()] = typ
}
// addImplicit sets the given Key as having been created implicitly.
func (p *parser) addImplicit(key Key) {
p.implicits[key.String()] = true
}
// removeImplicit stops tagging the given key as having been implicitly
// created.
func (p *parser) removeImplicit(key Key) {
p.implicits[key.String()] = false
}
// isImplicit returns true if the key group pointed to by the key was created
// implicitly.
func (p *parser) isImplicit(key Key) bool {
return p.implicits[key.String()]
}
// current returns the full key name of the current context.
func (p *parser) current() string {
if len(p.currentKey) == 0 {
return p.context.String()
}
if len(p.context) == 0 {
return p.currentKey
}
return fmt.Sprintf("%s.%s", p.context, p.currentKey)
}
func stripFirstNewline(s string) string {
if len(s) == 0 || s[0] != '\n' {
return s
}
return s[1:]
}
func stripEscapedWhitespace(s string) string {
esc := strings.Split(s, "\\\n")
if len(esc) > 1 {
for i := 1; i < len(esc); i++ {
esc[i] = strings.TrimLeftFunc(esc[i], unicode.IsSpace)
}
}
return strings.Join(esc, "")
}
func (p *parser) replaceEscapes(str string) string {
var replaced []rune
s := []byte(str)
r := 0
for r < len(s) {
if s[r] != '\\' {
c, size := utf8.DecodeRune(s[r:])
r += size
replaced = append(replaced, c)
continue
}
r += 1
if r >= len(s) {
p.bug("Escape sequence at end of string.")
return ""
}
switch s[r] {
default:
p.bug("Expected valid escape code after \\, but got %q.", s[r])
return ""
case 'b':
replaced = append(replaced, rune(0x0008))
r += 1
case 't':
replaced = append(replaced, rune(0x0009))
r += 1
case 'n':
replaced = append(replaced, rune(0x000A))
r += 1
case 'f':
replaced = append(replaced, rune(0x000C))
r += 1
case 'r':
replaced = append(replaced, rune(0x000D))
r += 1
case '"':
replaced = append(replaced, rune(0x0022))
r += 1
case '\\':
replaced = append(replaced, rune(0x005C))
r += 1
case 'u':
// At this point, we know we have a Unicode escape of the form
// `uXXXX` at [r, r+5). (Because the lexer guarantees this
// for us.)
escaped := p.asciiEscapeToUnicode(s[r+1 : r+5])
replaced = append(replaced, escaped)
r += 5
case 'U':
// At this point, we know we have a Unicode escape of the form
// `UXXXXXXXX` at [r, r+9). (Because the lexer guarantees this
// for us.)
escaped := p.asciiEscapeToUnicode(s[r+1 : r+9])
replaced = append(replaced, escaped)
r += 9
}
}
return string(replaced)
}
func (p *parser) asciiEscapeToUnicode(bs []byte) rune {
s := string(bs)
hex, err := strconv.ParseUint(strings.ToLower(s), 16, 32)
if err != nil {
p.bug("Could not parse '%s' as a hexadecimal number, but the "+
"lexer claims it's OK: %s", s, err)
}
if !utf8.ValidRune(rune(hex)) {
p.panicf("Escaped character '\\u%s' is not valid UTF-8.", s)
}
return rune(hex)
}
func isStringType(ty itemType) bool {
return ty == itemString || ty == itemMultilineString ||
ty == itemRawString || ty == itemRawMultilineString
}
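The escape handling above ultimately reduces to hex-decoding the digits and validating the resulting rune. A standalone sketch of that conversion (hexEscapeToRune is an illustrative name, not the package's API):

```go
package main

import (
	"fmt"
	"strconv"
	"unicode/utf8"
)

// hexEscapeToRune converts the digits of a \uXXXX or \UXXXXXXXX escape into a
// rune, the same way asciiEscapeToUnicode does above.
func hexEscapeToRune(hexDigits string) (rune, error) {
	n, err := strconv.ParseUint(hexDigits, 16, 32)
	if err != nil {
		return 0, err
	}
	if !utf8.ValidRune(rune(n)) {
		return 0, fmt.Errorf("escaped character '\\u%s' is not valid UTF-8", hexDigits)
	}
	return rune(n), nil
}

func main() {
	r, _ := hexEscapeToRune("263A")
	fmt.Printf("%c\n", r) // ☺
}
```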

View File

@@ -1 +0,0 @@
au BufWritePost *.go silent!make tags > /dev/null 2>&1

View File

@@ -1,91 +0,0 @@
package toml
// tomlType represents any Go type that corresponds to a TOML type.
// While the first draft of the TOML spec has a simplistic type system that
// probably doesn't need this level of sophistication, we seem to be militating
// toward adding real composite types.
type tomlType interface {
typeString() string
}
// typeEqual accepts any two types and returns true if they are equal.
func typeEqual(t1, t2 tomlType) bool {
if t1 == nil || t2 == nil {
return false
}
return t1.typeString() == t2.typeString()
}
func typeIsHash(t tomlType) bool {
return typeEqual(t, tomlHash) || typeEqual(t, tomlArrayHash)
}
type tomlBaseType string
func (btype tomlBaseType) typeString() string {
return string(btype)
}
func (btype tomlBaseType) String() string {
return btype.typeString()
}
var (
tomlInteger tomlBaseType = "Integer"
tomlFloat tomlBaseType = "Float"
tomlDatetime tomlBaseType = "Datetime"
tomlString tomlBaseType = "String"
tomlBool tomlBaseType = "Bool"
tomlArray tomlBaseType = "Array"
tomlHash tomlBaseType = "Hash"
tomlArrayHash tomlBaseType = "ArrayHash"
)
// typeOfPrimitive returns a tomlType of any primitive value in TOML.
// Primitive values are: Integer, Float, Datetime, String and Bool.
//
// Passing a lexer item other than the following will cause a BUG message
// to occur: itemString, itemBool, itemInteger, itemFloat, itemDatetime.
func (p *parser) typeOfPrimitive(lexItem item) tomlType {
switch lexItem.typ {
case itemInteger:
return tomlInteger
case itemFloat:
return tomlFloat
case itemDatetime:
return tomlDatetime
case itemString:
return tomlString
case itemMultilineString:
return tomlString
case itemRawString:
return tomlString
case itemRawMultilineString:
return tomlString
case itemBool:
return tomlBool
}
p.bug("Cannot infer primitive type of lex item '%s'.", lexItem)
panic("unreachable")
}
// typeOfArray returns a tomlType for an array given a list of types of its
// values.
//
// In the current spec, if an array is homogeneous, then its type is always
// "Array". If the array is not homogeneous, an error is generated.
func (p *parser) typeOfArray(types []tomlType) tomlType {
// Empty arrays are cool.
if len(types) == 0 {
return tomlArray
}
theType := types[0]
for _, t := range types[1:] {
if !typeEqual(theType, t) {
p.panicf("Array contains values of type '%s' and '%s', but "+
"arrays must be homogeneous.", theType, t)
}
}
return tomlArray
}
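In practice the homogeneity rule in typeOfArray surfaces as a decode error on mixed arrays. A small sketch using the package's public Decode (exercised by the tests earlier in this diff); the exact error text is the parser's and is not asserted here:

```go
package main

import (
	"fmt"

	"github.com/BurntSushi/toml"
)

func main() {
	v := map[string]interface{}{}

	// Homogeneous array: decodes fine.
	if _, err := toml.Decode(`nums = [1, 2, 3]`, &v); err != nil {
		fmt.Println("unexpected error:", err)
	}

	// Mixed element types: typeOfArray rejects it, so Decode returns an error.
	if _, err := toml.Decode(`mixed = [1, "a"]`, &v); err != nil {
		fmt.Println("got expected error:", err)
	}
}
```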

View File

@@ -1,241 +0,0 @@
package toml
// Struct field handling is adapted from code in encoding/json:
//
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the Go distribution.
import (
"reflect"
"sort"
"sync"
)
// A field represents a single field found in a struct.
type field struct {
name string // the name of the field (`toml` tag included)
tag bool // whether field has a `toml` tag
index []int // represents the depth of an anonymous field
typ reflect.Type // the type of the field
}
// byName sorts field by name, breaking ties with depth,
// then breaking ties with "name came from toml tag", then
// breaking ties with index sequence.
type byName []field
func (x byName) Len() int { return len(x) }
func (x byName) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
func (x byName) Less(i, j int) bool {
if x[i].name != x[j].name {
return x[i].name < x[j].name
}
if len(x[i].index) != len(x[j].index) {
return len(x[i].index) < len(x[j].index)
}
if x[i].tag != x[j].tag {
return x[i].tag
}
return byIndex(x).Less(i, j)
}
// byIndex sorts field by index sequence.
type byIndex []field
func (x byIndex) Len() int { return len(x) }
func (x byIndex) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
func (x byIndex) Less(i, j int) bool {
for k, xik := range x[i].index {
if k >= len(x[j].index) {
return false
}
if xik != x[j].index[k] {
return xik < x[j].index[k]
}
}
return len(x[i].index) < len(x[j].index)
}
// typeFields returns a list of fields that TOML should recognize for the given
// type. The algorithm is breadth-first search over the set of structs to
// include - the top struct and then any reachable anonymous structs.
func typeFields(t reflect.Type) []field {
// Anonymous fields to explore at the current level and the next.
current := []field{}
next := []field{{typ: t}}
// Count of queued names for current level and the next.
count := map[reflect.Type]int{}
nextCount := map[reflect.Type]int{}
// Types already visited at an earlier level.
visited := map[reflect.Type]bool{}
// Fields found.
var fields []field
for len(next) > 0 {
current, next = next, current[:0]
count, nextCount = nextCount, map[reflect.Type]int{}
for _, f := range current {
if visited[f.typ] {
continue
}
visited[f.typ] = true
// Scan f.typ for fields to include.
for i := 0; i < f.typ.NumField(); i++ {
sf := f.typ.Field(i)
if sf.PkgPath != "" && !sf.Anonymous { // unexported
continue
}
name, _ := getOptions(sf.Tag.Get("toml"))
if name == "-" {
continue
}
index := make([]int, len(f.index)+1)
copy(index, f.index)
index[len(f.index)] = i
ft := sf.Type
if ft.Name() == "" && ft.Kind() == reflect.Ptr {
// Follow pointer.
ft = ft.Elem()
}
// Record found field and index sequence.
if name != "" || !sf.Anonymous || ft.Kind() != reflect.Struct {
tagged := name != ""
if name == "" {
name = sf.Name
}
fields = append(fields, field{name, tagged, index, ft})
if count[f.typ] > 1 {
// If there were multiple instances, add a second,
// so that the annihilation code will see a duplicate.
// It only cares about the distinction between 1 or 2,
// so don't bother generating any more copies.
fields = append(fields, fields[len(fields)-1])
}
continue
}
// Record new anonymous struct to explore in next round.
nextCount[ft]++
if nextCount[ft] == 1 {
f := field{name: ft.Name(), index: index, typ: ft}
next = append(next, f)
}
}
}
}
sort.Sort(byName(fields))
// Delete all fields that are hidden by the Go rules for embedded fields,
// except that fields with TOML tags are promoted.
// The fields are sorted in primary order of name, secondary order
// of field index length. Loop over names; for each name, delete
// hidden fields by choosing the one dominant field that survives.
out := fields[:0]
for advance, i := 0, 0; i < len(fields); i += advance {
// One iteration per name.
// Find the sequence of fields with the name of this first field.
fi := fields[i]
name := fi.name
for advance = 1; i+advance < len(fields); advance++ {
fj := fields[i+advance]
if fj.name != name {
break
}
}
if advance == 1 { // Only one field with this name
out = append(out, fi)
continue
}
dominant, ok := dominantField(fields[i : i+advance])
if ok {
out = append(out, dominant)
}
}
fields = out
sort.Sort(byIndex(fields))
return fields
}
// dominantField looks through the fields, all of which are known to
// have the same name, to find the single field that dominates the
// others using Go's embedding rules, modified by the presence of
// TOML tags. If there are multiple top-level fields, the boolean
// will be false: This condition is an error in Go and we skip all
// the fields.
func dominantField(fields []field) (field, bool) {
// The fields are sorted in increasing index-length order. The winner
// must therefore be one with the shortest index length. Drop all
// longer entries, which is easy: just truncate the slice.
length := len(fields[0].index)
tagged := -1 // Index of first tagged field.
for i, f := range fields {
if len(f.index) > length {
fields = fields[:i]
break
}
if f.tag {
if tagged >= 0 {
// Multiple tagged fields at the same level: conflict.
// Return no field.
return field{}, false
}
tagged = i
}
}
if tagged >= 0 {
return fields[tagged], true
}
// All remaining fields have the same length. If there's more than one,
// we have a conflict (two fields named "X" at the same level) and we
// return no field.
if len(fields) > 1 {
return field{}, false
}
return fields[0], true
}
var fieldCache struct {
sync.RWMutex
m map[reflect.Type][]field
}
// cachedTypeFields is like typeFields but uses a cache to avoid repeated work.
func cachedTypeFields(t reflect.Type) []field {
fieldCache.RLock()
f := fieldCache.m[t]
fieldCache.RUnlock()
if f != nil {
return f
}
// Compute fields without lock.
// Might duplicate effort but won't hold other computations back.
f = typeFields(t)
if f == nil {
f = []field{}
}
fieldCache.Lock()
if fieldCache.m == nil {
fieldCache.m = map[reflect.Type][]field{}
}
fieldCache.m[t] = f
fieldCache.Unlock()
return f
}
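A short sketch of what these dominance rules mean for embedded fields: the shallower field wins, and a `toml` tag breaks ties at the same depth. The expected output below follows from the index-length rule and is noted as a comment rather than asserted:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/BurntSushi/toml"
)

type Inner struct {
	Name string // depth 2: hidden by Outer.Name
}

type Outer struct {
	Inner
	Name string // depth 1: dominant per the index-length rule
}

func main() {
	var buf bytes.Buffer
	if err := toml.NewEncoder(&buf).Encode(Outer{Inner: Inner{Name: "inner"}, Name: "outer"}); err != nil {
		panic(err)
	}
	fmt.Print(buf.String()) // expected: Name = "outer"
}
```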

vendor/github.com/blevesearch/bleve/CONTRIBUTING.md generated vendored Normal file
View File

@@ -0,0 +1,16 @@
# Contributing to Bleve
We look forward to your contributions, but ask that you first review these guidelines.
### Sign the CLA
As Bleve is a Couchbase project, we require that contributors accept the [Couchbase Contributor License Agreement](http://review.couchbase.org/static/individual_agreement.html). To sign this agreement, log in to the Couchbase [code review tool](http://review.couchbase.org/). The Bleve project does not use this code review tool, but it is still used to track acceptance of the contributor license agreements.
### Submitting a Pull Request
All types of contributions are welcome, but please keep the following in mind:
- If you're planning a large change, you should really discuss it in a github issue or on the google group first. This helps avoid duplicate effort and spending time on something that may not be merged.
- Existing tests should continue to pass, new tests for the contribution are nice to have.
- All code should have gone through `go fmt`
- All code should pass `go vet`

View File

@@ -1,10 +1,14 @@
 # ![bleve](docs/bleve.png) bleve
-[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/blevesearch/bleve/badge.png?branch=master)](https://coveralls.io/r/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) [![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/blevesearch/bleve/badge.png?branch=master)](https://coveralls.io/r/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve)
+[![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![codebeat](https://codebeat.co/badges/38a7cbc9-9cf5-41c0-a315-0746178230f4)](https://codebeat.co/projects/github-com-blevesearch-bleve)
+[![Go Report Card](https://goreportcard.com/badge/blevesearch/bleve)](https://goreportcard.com/report/blevesearch/bleve)
+[![Sourcegraph](https://sourcegraph.com/github.com/blevesearch/bleve/-/badge.svg)](https://sourcegraph.com/github.com/blevesearch/bleve?badge) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 modern text indexing in go - [blevesearch.com](http://www.blevesearch.com/)
-Try out bleve live by [searching our wiki](http://wikisearch.blevesearch.com/search/).
+Try out bleve live by [searching the bleve website](http://www.blevesearch.com/search/?q=bleve).
 ## Features
@@ -16,7 +20,7 @@ Try out bleve live by [searching our wiki](http://wikisearch.blevesearch.com/sea
 * Term, Phrase, Match, Match Phrase, Prefix
 * Conjunction, Disjunction, Boolean
 * Numeric Range, Date Range
-* Simple query [syntax](https://github.com/blevesearch/bleve/wiki/Query-String-Query) for human entry
+* Simple query [syntax](http://www.blevesearch.com/docs/Query-String-Query/) for human entry
 * tf-idf Scoring
 * Search result match highlighting
 * Supports Aggregating Facets:
@@ -30,32 +34,34 @@ Discuss usage and development of bleve in the [google group](https://groups.goog
 ## Indexing
+```go
 message := struct{
 Id string
 From string
 Body string
 }{
 Id: "example",
 From: "marty.schoch@gmail.com",
 Body: "bleve indexing is easy",
 }
 mapping := bleve.NewIndexMapping()
 index, err := bleve.New("example.bleve", mapping)
 if err != nil {
 panic(err)
 }
 index.Index(message.Id, message)
+```
 ## Querying
+```go
 index, _ := bleve.Open("example.bleve")
 query := bleve.NewQueryStringQuery("bleve")
 searchRequest := bleve.NewSearchRequest(query)
 searchResult, _ := index.Search(searchRequest)
+```
 ## License
 Apache License Version 2.0

View File

@@ -0,0 +1,52 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package standard
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/lang/en"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/registry"
)
const Name = "standard"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(en.StopName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEnFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}
View File
@ -1,130 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package standard_analyzer
import (
"fmt"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const Name = "custom"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
var err error
var charFilters []analysis.CharFilter
charFiltersNames, ok := config["char_filters"].([]string)
if ok {
charFilters, err = getCharFilters(charFiltersNames, cache)
if err != nil {
return nil, err
}
} else {
charFiltersNamesInterfaceSlice, ok := config["char_filters"].([]interface{})
if ok {
charFiltersNames, err := convertInterfaceSliceToStringSlice(charFiltersNamesInterfaceSlice, "char filter")
if err != nil {
return nil, err
}
charFilters, err = getCharFilters(charFiltersNames, cache)
if err != nil {
return nil, err
}
}
}
tokenizerName, ok := config["tokenizer"].(string)
if !ok {
return nil, fmt.Errorf("must specify tokenizer")
}
tokenizer, err := cache.TokenizerNamed(tokenizerName)
if err != nil {
return nil, err
}
var tokenFilters []analysis.TokenFilter
tokenFiltersNames, ok := config["token_filters"].([]string)
if ok {
tokenFilters, err = getTokenFilters(tokenFiltersNames, cache)
if err != nil {
return nil, err
}
} else {
tokenFiltersNamesInterfaceSlice, ok := config["token_filters"].([]interface{})
if ok {
tokenFiltersNames, err := convertInterfaceSliceToStringSlice(tokenFiltersNamesInterfaceSlice, "token filter")
if err != nil {
return nil, err
}
tokenFilters, err = getTokenFilters(tokenFiltersNames, cache)
if err != nil {
return nil, err
}
}
}
rv := analysis.Analyzer{
Tokenizer: tokenizer,
}
if charFilters != nil {
rv.CharFilters = charFilters
}
if tokenFilters != nil {
rv.TokenFilters = tokenFilters
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}
func getCharFilters(charFilterNames []string, cache *registry.Cache) ([]analysis.CharFilter, error) {
charFilters := make([]analysis.CharFilter, len(charFilterNames))
for i, charFilterName := range charFilterNames {
charFilter, err := cache.CharFilterNamed(charFilterName)
if err != nil {
return nil, err
}
charFilters[i] = charFilter
}
return charFilters, nil
}
func getTokenFilters(tokenFilterNames []string, cache *registry.Cache) ([]analysis.TokenFilter, error) {
tokenFilters := make([]analysis.TokenFilter, len(tokenFilterNames))
for i, tokenFilterName := range tokenFilterNames {
tokenFilter, err := cache.TokenFilterNamed(tokenFilterName)
if err != nil {
return nil, err
}
tokenFilters[i] = tokenFilter
}
return tokenFilters, nil
}
func convertInterfaceSliceToStringSlice(interfaceSlice []interface{}, objType string) ([]string, error) {
stringSlice := make([]string, len(interfaceSlice))
for i, interfaceObj := range interfaceSlice {
stringObj, ok := interfaceObj.(string)
if ok {
stringSlice[i] = stringObj
} else {
return nil, fmt.Errorf(objType + " name must be a string")
}
}
return stringSlice, nil
}
View File
@ -1,49 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build cld2 full
package detect_lang_analyzer
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/cld2"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/single_token"
"github.com/blevesearch/bleve/registry"
)
const Name = "detect_lang"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
detectLangFilter, err := cache.TokenFilterNamed(cld2.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
detectLangFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}
View File
@ -1,33 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package keyword_analyzer
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/tokenizers/single_token"
"github.com/blevesearch/bleve/registry"
)
const Name = "keyword"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}
View File
@ -1,41 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package simple_analyzer
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/blevesearch/bleve/registry"
)
const Name = "simple"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package standard_analyzer
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/language/en"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/blevesearch/bleve/registry"
)
const Name = "standard"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(en.StopName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEnFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ignore_byte_array_converter
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
type IgnoreByteArrayConverter struct{}
func NewIgnoreByteArrayConverter() *IgnoreByteArrayConverter {
return &IgnoreByteArrayConverter{}
}
func (c *IgnoreByteArrayConverter) Convert(in []byte) (interface{}, error) {
return nil, nil
}
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
return NewIgnoreByteArrayConverter(), nil
}
func init() {
registry.RegisterByteArrayConverter("ignore", Constructor)
}
View File
@ -1,40 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package json_byte_array_converter
import (
"encoding/json"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
type JSONByteArrayConverter struct{}
func NewJSONByteArrayConverter() *JSONByteArrayConverter {
return &JSONByteArrayConverter{}
}
func (c *JSONByteArrayConverter) Convert(in []byte) (interface{}, error) {
var rv map[string]interface{}
err := json.Unmarshal(in, &rv)
if err != nil {
return nil, err
}
return rv, nil
}
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
return NewJSONByteArrayConverter(), nil
}
func init() {
registry.RegisterByteArrayConverter("json", Constructor)
}
View File
@ -1,33 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package string_byte_array_converter
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
type StringByteArrayConverter struct{}
func NewStringByteArrayConverter() *StringByteArrayConverter {
return &StringByteArrayConverter{}
}
func (c *StringByteArrayConverter) Convert(in []byte) (interface{}, error) {
return string(in), nil
}
func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
return NewStringByteArrayConverter(), nil
}
func init() {
registry.RegisterByteArrayConverter("string", Constructor)
}
View File
@ -1,31 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package html_char_filter
import (
"regexp"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
"github.com/blevesearch/bleve/registry"
)
const Name = "html"
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
replaceBytes := []byte(" ")
return regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, CharFilterConstructor)
}
View File
@ -1,58 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package regexp_char_filter
import (
"bytes"
"fmt"
"regexp"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const Name = "regexp"
type RegexpCharFilter struct {
r *regexp.Regexp
replacement []byte
}
func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter {
return &RegexpCharFilter{
r: r,
replacement: replacement,
}
}
func (s *RegexpCharFilter) Filter(input []byte) []byte {
return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) })
}
func RegexpCharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
regexpStr, ok := config["regexp"].(string)
if !ok {
return nil, fmt.Errorf("must specify regexp")
}
r, err := regexp.Compile(regexpStr)
if err != nil {
return nil, fmt.Errorf("unable to build regexp char filter: %v", err)
}
replaceBytes := []byte(" ")
replaceStr, ok := config["replace"].(string)
if ok {
replaceBytes = []byte(replaceStr)
}
return NewRegexpCharFilter(r, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, RegexpCharFilterConstructor)
}
View File
@ -1,82 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package regexp_char_filter
import (
"reflect"
"regexp"
"testing"
)
func TestRegexpCharFilter(t *testing.T) {
htmlTagPattern := `</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`
htmlRegex := regexp.MustCompile(htmlTagPattern)
tests := []struct {
input []byte
output []byte
}{
{
input: []byte(`<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
</html>`),
output: []byte(`
My First Heading
My first paragraph.
`),
},
}
for _, test := range tests {
filter := NewRegexpCharFilter(htmlRegex, []byte{' '})
output := filter.Filter(test.input)
if !reflect.DeepEqual(output, test.output) {
t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
}
}
}
func TestZeroWidthNonJoinerCharFilter(t *testing.T) {
zeroWidthNonJoinerPattern := `\x{200C}`
zeroWidthNonJoinerRegex := regexp.MustCompile(zeroWidthNonJoinerPattern)
tests := []struct {
input []byte
output []byte
}{
{
input: []byte("water\u200Cunder\u200Cthe\u200Cbridge"),
output: []byte("water under the bridge"),
},
}
for _, test := range tests {
filter := NewRegexpCharFilter(zeroWidthNonJoinerRegex, []byte{' '})
output := filter.Filter(test.input)
if !reflect.DeepEqual(output, test.output) {
t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
}
}
}
View File
@ -1,31 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package zero_width_non_joiner
import (
"regexp"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
"github.com/blevesearch/bleve/registry"
)
const Name = "zero_width_spaces"
var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`)
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
replaceBytes := []byte(" ")
return regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, CharFilterConstructor)
}
View File
@ -0,0 +1,64 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package flexible
import (
"fmt"
"time"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const Name = "flexiblego"
type DateTimeParser struct {
layouts []string
}
func New(layouts []string) *DateTimeParser {
return &DateTimeParser{
layouts: layouts,
}
}
func (p *DateTimeParser) ParseDateTime(input string) (time.Time, error) {
for _, layout := range p.layouts {
rv, err := time.Parse(layout, input)
if err == nil {
return rv, nil
}
}
return time.Time{}, analysis.ErrInvalidDateTime
}
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
layouts, ok := config["layouts"].([]interface{})
if !ok {
return nil, fmt.Errorf("must specify layouts")
}
var layoutStrs []string
for _, layout := range layouts {
layoutStr, ok := layout.(string)
if ok {
layoutStrs = append(layoutStrs, layoutStr)
}
}
return New(layoutStrs), nil
}
func init() {
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
}
View File
@ -0,0 +1,45 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package optional
import (
"time"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/datetime/flexible"
"github.com/blevesearch/bleve/registry"
)
const Name = "dateTimeOptional"
const rfc3339NoTimezone = "2006-01-02T15:04:05"
const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05"
const rfc3339NoTime = "2006-01-02"
var layouts = []string{
time.RFC3339Nano,
time.RFC3339,
rfc3339NoTimezone,
rfc3339NoTimezoneNoT,
rfc3339NoTime,
}
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
return flexible.New(layouts), nil
}
func init() {
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
}
View File
@ -1,40 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package html_char_filter
import (
"time"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go"
"github.com/blevesearch/bleve/registry"
)
const Name = "dateTimeOptional"
const rfc3339NoTimezone = "2006-01-02T15:04:05"
const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05"
const rfc3339NoTime = "2006-01-02"
var layouts = []string{
time.RFC3339Nano,
time.RFC3339,
rfc3339NoTimezone,
rfc3339NoTimezoneNoT,
rfc3339NoTime,
}
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
return flexible_go.NewFlexibleGoDateTimeParser(layouts), nil
}
func init() {
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
}
View File
@ -1,59 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package flexible_go
import (
"fmt"
"time"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const Name = "flexiblego"
type FlexibleGoDateTimeParser struct {
layouts []string
}
func NewFlexibleGoDateTimeParser(layouts []string) *FlexibleGoDateTimeParser {
return &FlexibleGoDateTimeParser{
layouts: layouts,
}
}
func (p *FlexibleGoDateTimeParser) ParseDateTime(input string) (time.Time, error) {
for _, layout := range p.layouts {
rv, err := time.Parse(layout, input)
if err == nil {
return rv, nil
}
}
return time.Time{}, analysis.ErrInvalidDateTime
}
func FlexibleGoDateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
layouts, ok := config["layouts"].([]interface{})
if !ok {
return nil, fmt.Errorf("must specify layouts")
}
layoutStrs := make([]string, 0)
for _, layout := range layouts {
layoutStr, ok := layout.(string)
if ok {
layoutStrs = append(layoutStrs, layoutStr)
}
}
return NewFlexibleGoDateTimeParser(layoutStrs), nil
}
func init() {
registry.RegisterDateTimeParser(Name, FlexibleGoDateTimeParserConstructor)
}
View File
@ -1,84 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package flexible_go
import (
"reflect"
"testing"
"time"
"github.com/blevesearch/bleve/analysis"
)
func TestFlexibleDateTimeParser(t *testing.T) {
testLocation := time.FixedZone("", -8*60*60)
tests := []struct {
input string
expectedTime time.Time
expectedError error
}{
{
input: "2014-08-03",
expectedTime: time.Date(2014, 8, 3, 0, 0, 0, 0, time.UTC),
expectedError: nil,
},
{
input: "2014-08-03T15:59:30",
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, time.UTC),
expectedError: nil,
},
{
input: "2014-08-03 15:59:30",
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, time.UTC),
expectedError: nil,
},
{
input: "2014-08-03T15:59:30-08:00",
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 0, testLocation),
expectedError: nil,
},
{
input: "2014-08-03T15:59:30.999999999-08:00",
expectedTime: time.Date(2014, 8, 3, 15, 59, 30, 999999999, testLocation),
expectedError: nil,
},
{
input: "not a date time",
expectedTime: time.Time{},
expectedError: analysis.ErrInvalidDateTime,
},
}
rfc3339NoTimezone := "2006-01-02T15:04:05"
rfc3339NoTimezoneNoT := "2006-01-02 15:04:05"
rfc3339NoTime := "2006-01-02"
dateOptionalTimeParser := NewFlexibleGoDateTimeParser(
[]string{
time.RFC3339Nano,
time.RFC3339,
rfc3339NoTimezone,
rfc3339NoTimezoneNoT,
rfc3339NoTime,
})
for _, test := range tests {
actualTime, actualErr := dateOptionalTimeParser.ParseDateTime(test.input)
if actualErr != test.expectedError {
t.Errorf("expected error %#v, got %#v", test.expectedError, actualErr)
continue
}
if !reflect.DeepEqual(actualTime, test.expectedTime) {
t.Errorf("expected time %#v, got %#v", test.expectedTime, actualTime)
t.Errorf("expected location %#v,\n got %#v", test.expectedTime.Location(), actualTime.Location())
}
}
}
View File
@ -1,88 +1,111 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package analysis

// TokenLocation represents one occurrence of a term at a particular location in
// a field. Start, End and Position have the same meaning as in analysis.Token.
// Field and ArrayPositions identify the field value in the source document.
// See document.Field for details.
type TokenLocation struct {
	Field          string
	ArrayPositions []uint64
	Start          int
	End            int
	Position       int
}

// TokenFreq represents all the occurrences of a term in all fields of a
// document.
type TokenFreq struct {
	Term      []byte
	Locations []*TokenLocation
	frequency int
}

func (tf *TokenFreq) Frequency() int {
	return tf.frequency
}

// TokenFrequencies maps document terms to their combined frequencies from all
// fields.
type TokenFrequencies map[string]*TokenFreq

func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
	// walk the new token frequencies
	for tfk, tf := range other {
		// set the remoteField value in incoming token freqs
		for _, l := range tf.Locations {
			l.Field = remoteField
		}
		existingTf, exists := tfs[tfk]
		if exists {
			existingTf.Locations = append(existingTf.Locations, tf.Locations...)
			existingTf.frequency = existingTf.frequency + tf.frequency
		} else {
			tfs[tfk] = &TokenFreq{
				Term:      tf.Term,
				frequency: tf.frequency,
				Locations: make([]*TokenLocation, len(tf.Locations)),
			}
			copy(tfs[tfk].Locations, tf.Locations)
		}
	}
}

func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
	rv := make(map[string]*TokenFreq, len(tokens))

	if includeTermVectors {
		tls := make([]TokenLocation, len(tokens))
		tlNext := 0

		for _, token := range tokens {
			tls[tlNext] = TokenLocation{
				ArrayPositions: arrayPositions,
				Start:          token.Start,
				End:            token.End,
				Position:       token.Position,
			}

			curr, ok := rv[string(token.Term)]
			if ok {
				curr.Locations = append(curr.Locations, &tls[tlNext])
				curr.frequency++
			} else {
				rv[string(token.Term)] = &TokenFreq{
					Term:      token.Term,
					Locations: []*TokenLocation{&tls[tlNext]},
					frequency: 1,
				}
			}

			tlNext++
		}
	} else {
		for _, token := range tokens {
			curr, exists := rv[string(token.Term)]
			if exists {
				curr.frequency++
			} else {
				rv[string(token.Term)] = &TokenFreq{
					Term:      token.Term,
					frequency: 1,
				}
			}
		}
	}

	return rv
}
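The reworked `TokenFrequency` above now takes the field's array positions and an `includeTermVectors` flag, and `TokenFrequencies` has become a map keyed by term. A minimal, hypothetical usage sketch (assuming only the `github.com/blevesearch/bleve/analysis` import path used elsewhere in this commit):

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis"
)

func main() {
	tokens := analysis.TokenStream{
		&analysis.Token{Term: []byte("water"), Position: 1, Start: 0, End: 5},
		&analysis.Token{Term: []byte("water"), Position: 2, Start: 6, End: 11},
	}
	// Build frequencies (with term vectors) for two hypothetical fields,
	// then fold the second field's terms into the first.
	body := analysis.TokenFrequency(tokens, nil, true)
	title := analysis.TokenFrequency(tokens, nil, true)
	body.MergeAll("title", title)          // merged locations are tagged with Field "title"
	fmt.Println(body["water"].Frequency()) // 4: two occurrences contributed by each field
}
```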
View File
@ -1,167 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package analysis
import (
"reflect"
"testing"
)
func TestTokenFrequency(t *testing.T) {
tokens := TokenStream{
&Token{
Term: []byte("water"),
Position: 1,
Start: 0,
End: 5,
},
&Token{
Term: []byte("water"),
Position: 2,
Start: 6,
End: 11,
},
}
expectedResult := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
},
},
}
result := TokenFrequency(tokens)
if !reflect.DeepEqual(result, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, result)
}
}
func TestTokenFrequenciesMergeAll(t *testing.T) {
tf1 := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
},
},
}
tf2 := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
},
},
}
expectedResult := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
&TokenLocation{
Field: "tf2",
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Field: "tf2",
Position: 2,
Start: 6,
End: 11,
},
},
},
}
tf1.MergeAll("tf2", tf2)
if !reflect.DeepEqual(tf1, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, tf1)
}
}
func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) {
tf1 := TokenFrequencies{}
tf2 := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
},
},
}
expectedResult := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Field: "tf2",
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Field: "tf2",
Position: 2,
Start: 6,
End: 11,
},
},
},
}
result := tf1.MergeAll("tf2", tf2)
if !reflect.DeepEqual(result, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, result)
}
}
View File
@ -0,0 +1,70 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package en implements an analyzer with reasonable defaults for processing
// English text.
//
// It strips possessive suffixes ('s), transforms tokens to lower case,
// removes stopwords from a built-in list, and applies porter stemming.
//
// The built-in stopwords list is defined in EnglishStopWords.
package en
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/token/porter"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
)
const AnalyzerName = "en"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
possEnFilter, err := cache.TokenFilterNamed(PossessiveName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerEnFilter, err := cache.TokenFilterNamed(porter.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
possEnFilter,
toLowerFilter,
stopEnFilter,
stemmerEnFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}
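The `en` analyzer assembled above chains possessive stripping, lowercasing, English stopword removal, and porter stemming behind the unicode tokenizer. A hypothetical sketch of running it via the registry cache (the sample text and printed terms are illustrative assumptions, not part of this commit):

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis/lang/en" // registers the "en" analyzer and its filters
	"github.com/blevesearch/bleve/registry"
)

func main() {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(en.AnalyzerName)
	if err != nil {
		panic(err)
	}
	// Stopwords such as "the" are dropped, tokens are lowercased,
	// possessive suffixes stripped, and the remainder porter-stemmed.
	for _, token := range analyzer.Analyze([]byte("Walking the dog's leash")) {
		fmt.Println(string(token.Term))
	}
}
```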
View File
@ -0,0 +1,67 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"unicode/utf8"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// PossessiveName is the name PossessiveFilter is registered as
// in the bleve registry.
const PossessiveName = "possessive_en"
const rightSingleQuotationMark = '’'
const apostrophe = '\''
const fullWidthApostrophe = '＇'
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
// PossessiveFilter implements a TokenFilter which
// strips the English possessive suffix ('s) from tokens.
// It handles a variety of apostrophe types, is case-insensitive
// and doesn't distinguish between possessive and contraction.
// (ie "She's So Rad" becomes "She So Rad")
type PossessiveFilter struct {
}
func NewPossessiveFilter() *PossessiveFilter {
return &PossessiveFilter{}
}
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
lastRune, lastRuneSize := utf8.DecodeLastRune(token.Term)
if lastRune == 's' || lastRune == 'S' {
nextLastRune, nextLastRuneSize := utf8.DecodeLastRune(token.Term[:len(token.Term)-lastRuneSize])
if nextLastRune == rightSingleQuotationMark ||
nextLastRune == apostrophe ||
nextLastRune == fullWidthApostrophe {
token.Term = token.Term[:len(token.Term)-lastRuneSize-nextLastRuneSize]
}
}
}
return input
}
func PossessiveFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewPossessiveFilter(), nil
}
func init() {
registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor)
}
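The possessive filter can also be exercised on its own; a small illustrative sketch (the input token is an assumption):

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/analysis/lang/en"
)

func main() {
	filter := en.NewPossessiveFilter()
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("she's")}}
	for _, token := range filter.Filter(stream) {
		fmt.Println(string(token.Term)) // prints "she"
	}
}
```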
View File
@ -0,0 +1,33 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token/stop"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}
View File
@ -7,10 +7,11 @@ import (
const StopName = "stop_en"

// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter.
//
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string

var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
 | This file is distributed under the BSD License.
 | See http://snowball.tartarus.org/license.php
View File
@ -1,60 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
)
const AnalyzerName = "ar"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
stopArFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
normalizeArFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stemmerArFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normalizeFilter,
stopArFilter,
normalizeArFilter,
stemmerArFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}
View File
@ -1,179 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestArabicAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte("كبير"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كبير"),
Position: 1,
Start: 0,
End: 8,
},
},
},
// feminine marker
{
input: []byte("كبيرة"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كبير"),
Position: 1,
Start: 0,
End: 10,
},
},
},
{
input: []byte("مشروب"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("مشروب"),
Position: 1,
Start: 0,
End: 10,
},
},
},
// plural -at
{
input: []byte("مشروبات"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("مشروب"),
Position: 1,
Start: 0,
End: 14,
},
},
},
// plural -in
{
input: []byte("أمريكيين"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("امريك"),
Position: 1,
Start: 0,
End: 16,
},
},
},
// singular with bare alif
{
input: []byte("امريكي"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("امريك"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{
input: []byte("كتاب"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتاب"),
Position: 1,
Start: 0,
End: 8,
},
},
},
// definite article
{
input: []byte("الكتاب"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتاب"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{
input: []byte("ما ملكت أيمانكم"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ملكت"),
Position: 2,
Start: 5,
End: 13,
},
&analysis.Token{
Term: []byte("ايمانكم"),
Position: 3,
Start: 14,
End: 28,
},
},
},
// stopwords
{
input: []byte("الذين ملكت أيمانكم"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ملكت"),
Position: 2,
Start: 11,
End: 19,
},
&analysis.Token{
Term: []byte("ايمانكم"),
Position: 3,
Start: 20,
End: 34,
},
},
},
// presentation form normalization
{
input: []byte("ﺍﻟﺴﻼﻢ"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
Position: 1,
Start: 0,
End: 15,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}
View File
@ -1,80 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"bytes"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const NormalizeName = "normalize_ar"
const (
Alef = '\u0627'
AlefMadda = '\u0622'
AlefHamzaAbove = '\u0623'
AlefHamzaBelow = '\u0625'
Yeh = '\u064A'
DotlessYeh = '\u0649'
TehMarbuta = '\u0629'
Heh = '\u0647'
Tatweel = '\u0640'
Fathatan = '\u064B'
Dammatan = '\u064C'
Kasratan = '\u064D'
Fatha = '\u064E'
Damma = '\u064F'
Kasra = '\u0650'
Shadda = '\u0651'
Sukun = '\u0652'
)
type ArabicNormalizeFilter struct {
}
func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
return &ArabicNormalizeFilter{}
}
func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
func normalize(input []byte) []byte {
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case AlefMadda, AlefHamzaAbove, AlefHamzaBelow:
runes[i] = Alef
case DotlessYeh:
runes[i] = Yeh
case TehMarbuta:
runes[i] = Heh
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
runes = analysis.DeleteRune(runes, i)
i--
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewArabicNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}
View File
@ -1,229 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestArabicNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// AlifMadda
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("آجن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اجن"),
},
},
},
// AlifHamzaAbove
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("أحمد"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("احمد"),
},
},
},
// AlifHamzaBelow
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("إعاذ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اعاذ"),
},
},
},
// AlifMaksura
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بنى"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بني"),
},
},
},
// TehMarbuta
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("فاطمة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("فاطمه"),
},
},
},
// Tatweel
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("روبرـــــت"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("روبرت"),
},
},
},
// Fatha
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("مَبنا"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("مبنا"),
},
},
},
// Kasra
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("علِي"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("علي"),
},
},
},
// Damma
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بُوات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بوات"),
},
},
},
// Fathatan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولداً"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدا"),
},
},
},
// Kasratan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدٍ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولد"),
},
},
},
// Dammatan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدٌ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولد"),
},
},
},
// Sukun
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("نلْسون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("نلسون"),
},
},
},
// Shaddah
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("هتميّ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هتمي"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
arabicNormalizeFilter := NewArabicNormalizeFilter()
for _, test := range tests {
actual := arabicNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}
View File
@ -1,113 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"bytes"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_ar"
// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
var prefixes = [][]rune{
[]rune("ال"),
[]rune("وال"),
[]rune("بال"),
[]rune("كال"),
[]rune("فال"),
[]rune("لل"),
[]rune("و"),
}
var suffixes = [][]rune{
[]rune("ها"),
[]rune("ان"),
[]rune("ات"),
[]rune("ون"),
[]rune("ين"),
[]rune("يه"),
[]rune("ية"),
[]rune("ه"),
[]rune("ة"),
[]rune("ي"),
}
type ArabicStemmerFilter struct{}
func NewArabicStemmerFilter() *ArabicStemmerFilter {
return &ArabicStemmerFilter{}
}
func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := stem(token.Term)
token.Term = term
}
return input
}
func canStemPrefix(input, prefix []rune) bool {
// Wa- prefix requires at least 3 characters.
if len(prefix) == 1 && len(input) < 4 {
return false
}
// Other prefixes require only 2.
if len(input)-len(prefix) < 2 {
return false
}
for i := range prefix {
if prefix[i] != input[i] {
return false
}
}
return true
}
func canStemSuffix(input, suffix []rune) bool {
// All suffixes require at least 2 characters after stemming.
if len(input)-len(suffix) < 2 {
return false
}
stemEnd := len(input) - len(suffix)
for i := range suffix {
if suffix[i] != input[stemEnd+i] {
return false
}
}
return true
}
func stem(input []byte) []byte {
runes := bytes.Runes(input)
// Strip a single prefix.
for _, p := range prefixes {
if canStemPrefix(runes, p) {
runes = runes[len(p):]
break
}
}
// Strip off multiple suffixes, in their order in the suffixes array.
for _, s := range suffixes {
if canStemSuffix(runes, s) {
runes = runes[:len(runes)-len(s)]
}
}
return analysis.BuildTermFromRunes(runes)
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewArabicStemmerFilter(), nil
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}
View File
@ -1,392 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestArabicStemmerFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// AlPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// WalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("والحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// BalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// KalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("كالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// FalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("فالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// LlPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("للاخر"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اخر"),
},
},
},
// WaPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("وحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// AhSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("زوجها"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("زوج"),
},
},
},
// AnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدان"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// AtSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// WnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدين"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YhSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهديه"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YpSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدية"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// HSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهده"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// PSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدي"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// ComboPrefSuf
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("وساهدون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// ComboSuf
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدهات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// ShouldntStem
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الو"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("الو"),
},
},
},
// NonArabic
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("English"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("English"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("السلام"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلامة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("السلامة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الوصل"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("وصل"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("والصل"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("صل"),
},
},
},
// Empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
arabicStemmerFilter := NewArabicStemmerFilter()
for _, test := range tests {
actual := arabicStemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ar
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,149 +0,0 @@
package ar
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_ar"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
// ` was changed to ' so the content can be embedded in a Go raw string literal
var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
# This means that when modifying this list, you might need to add some
# redundant entries, for example containing forms with both أ and ا
من
ومن
منها
منه
في
وفي
فيها
فيه
و
ف
ثم
او
أو
ب
بها
به
ا
أ
اى
اي
أي
أى
لا
ولا
الا
ألا
إلا
لكن
ما
وما
كما
فما
عن
مع
اذا
إذا
ان
أن
إن
انها
أنها
إنها
انه
أنه
إنه
بان
بأن
فان
فأن
وان
وأن
وإن
التى
التي
الذى
الذي
الذين
الى
الي
إلى
إلي
على
عليها
عليه
اما
أما
إما
ايضا
أيضا
كل
وكل
لم
ولم
لن
ولن
هى
هي
هو
وهى
وهي
وهو
فهى
فهي
فهو
انت
أنت
لك
لها
له
هذه
هذا
تلك
ذلك
هناك
كانت
كان
يكون
تكون
وكانت
وكان
غير
بعض
قد
نحو
بين
بينما
منذ
ضمن
حيث
الان
الآن
خلال
بعد
قبل
حتى
عند
عندما
لدى
جميع
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ArabicStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}
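A short sketch of how the stop filter and the stop word map registered above compose at runtime. It is assumed to live in this same ar package, so both init functions have already run and StopName is in scope.
func exampleStopFilter() (analysis.TokenStream, error) {
    cache := registry.NewCache()
    // Resolves StopTokenFilterConstructor, which in turn looks up the
    // Arabic stop word token map registered under the same StopName.
    stopFilter, err := cache.TokenFilterNamed(StopName)
    if err != nil {
        return nil, err
    }
    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("من")},  // listed above, expected to be dropped
        &analysis.Token{Term: []byte("حسن")}, // not a stop word, expected to survive
    }
    return stopFilter.Filter(stream), nil
}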

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bg
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,217 +0,0 @@
package bg
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_bg"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' so the content can be embedded in a Go raw string literal
var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
а
аз
ако
ала
бе
без
беше
би
бил
била
били
било
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главно
го
д
да
дали
до
докато
докога
дори
досега
доста
е
едва
един
ето
за
зад
заедно
заради
засега
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
ли
м
ме
между
мен
ми
мнозина
мога
могат
може
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нея
ни
ние
никой
нито
но
някои
някой
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първо
с
са
само
се
сега
си
скоро
след
сме
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
тн
то
това
тогава
този
той
толкова
точно
трябва
тук
тъй
тя
тях
у
харесва
ч
че
често
чрез
ще
щом
я
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(BulgarianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,30 +0,0 @@
package ca
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const ArticlesName = "articles_ca"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var CatalanArticles = []byte(`
d
l
m
n
s
t
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CatalanArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -1,32 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"fmt"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
"github.com/blevesearch/bleve/registry"
)
const ElisionName = "elision_ca"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}

View File

@ -1,56 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestCatalanElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("l'Institut"),
},
&analysis.Token{
Term: []byte("d'Estudis"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Institut"),
},
&analysis.Token{
Term: []byte("Estudis"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,244 +0,0 @@
package ca
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_ca"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' so the content can be embedded in a Go raw string literal
var CatalanStopWords = []byte(`# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
a
abans
ací
ah
així
això
al
als
aleshores
algun
alguna
algunes
alguns
alhora
allà
allí
allò
altra
altre
altres
amb
ambdós
ambdues
apa
aquell
aquella
aquelles
aquells
aquest
aquesta
aquestes
aquests
aquí
baix
cada
cadascú
cadascuna
cadascunes
cadascuns
com
contra
d'un
d'una
d'unes
d'uns
dalt
de
del
dels
des
després
dins
dintre
donat
doncs
durant
e
eh
el
els
em
en
encara
ens
entre
érem
eren
éreu
es
és
esta
està
estàvem
estaven
estàveu
esteu
et
etc
ets
fins
fora
gairebé
ha
han
has
havia
he
hem
heu
hi
ho
i
igual
iguals
ja
l'hi
la
les
li
li'n
llavors
m'he
ma
mal
malgrat
mateix
mateixa
mateixes
mateixos
me
mentre
més
meu
meus
meva
meves
molt
molta
moltes
molts
mon
mons
n'he
n'hi
ne
ni
no
nogensmenys
només
nosaltres
nostra
nostre
nostres
o
oh
oi
on
pas
pel
pels
per
però
perquè
poc
poca
pocs
poques
potser
propi
qual
quals
quan
quant
que
què
quelcom
qui
quin
quina
quines
quins
s'ha
s'han
sa
semblant
semblants
ses
seu
seus
seva
seva
seves
si
sobre
sobretot
sóc
solament
sols
son
són
sons
sota
sou
t'ha
t'han
t'he
ta
tal
també
tampoc
tan
tant
tanta
tantes
teu
teus
teva
teves
ton
tons
tot
tota
totes
tots
un
una
unes
uns
us
va
vaig
vam
van
vas
veu
vosaltres
vostra
vostre
vostres
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CatalanStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,50 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
"github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
)
const AnalyzerName = "cjk"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
whitespaceTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
if err != nil {
return nil, err
}
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
bigramFilter, err := cache.TokenFilterNamed(BigramName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: whitespaceTokenizer,
TokenFilters: []analysis.TokenFilter{
normalizeFilter,
toLowerFilter,
bigramFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}
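A sketch of building the analyzer directly through its constructor rather than by registry name, assumed to run inside this cjk package; a fresh cache resolves the tokenizer and filters registered by the imported packages.
func exampleCJKAnalyzer() (analysis.TokenStream, error) {
    analyzer, err := AnalyzerConstructor(nil, registry.NewCache())
    if err != nil {
        return nil, err
    }
    // Pipeline order matters: whitespace tokenize, NFKD normalize,
    // lowercase, then CJK bigram shingling.
    return analyzer.Analyze([]byte("こんにちは世界")), nil
}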

View File

@ -1,620 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestCJKAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte("こんにちは世界"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("は世"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 6,
Start: 15,
End: 21,
},
},
},
{
input: []byte("一二三四五六七八九十"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一二"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("二三"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("三四"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("四五"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("五六"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
&analysis.Token{
Term: []byte("六七"),
Type: analysis.Double,
Position: 6,
Start: 15,
End: 21,
},
&analysis.Token{
Term: []byte("七八"),
Type: analysis.Double,
Position: 7,
Start: 18,
End: 24,
},
&analysis.Token{
Term: []byte("八九"),
Type: analysis.Double,
Position: 8,
Start: 21,
End: 27,
},
&analysis.Token{
Term: []byte("九十"),
Type: analysis.Double,
Position: 9,
Start: 24,
End: 30,
},
},
},
{
input: []byte("一 二三四 五六七八九 十"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("二三"),
Type: analysis.Double,
Position: 2,
Start: 4,
End: 10,
},
&analysis.Token{
Term: []byte("三四"),
Type: analysis.Double,
Position: 3,
Start: 7,
End: 13,
},
&analysis.Token{
Term: []byte("五六"),
Type: analysis.Double,
Position: 5,
Start: 14,
End: 20,
},
&analysis.Token{
Term: []byte("六七"),
Type: analysis.Double,
Position: 6,
Start: 17,
End: 23,
},
&analysis.Token{
Term: []byte("七八"),
Type: analysis.Double,
Position: 7,
Start: 20,
End: 26,
},
&analysis.Token{
Term: []byte("八九"),
Type: analysis.Double,
Position: 8,
Start: 23,
End: 29,
},
&analysis.Token{
Term: []byte("十"),
Type: analysis.Single,
Position: 10,
Start: 30,
End: 33,
},
},
},
{
input: []byte("abc defgh ijklmn opqrstu vwxy z"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abc"),
Type: analysis.AlphaNumeric,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("defgh"),
Type: analysis.AlphaNumeric,
Position: 2,
Start: 4,
End: 9,
},
&analysis.Token{
Term: []byte("ijklmn"),
Type: analysis.AlphaNumeric,
Position: 3,
Start: 10,
End: 16,
},
&analysis.Token{
Term: []byte("opqrstu"),
Type: analysis.AlphaNumeric,
Position: 4,
Start: 17,
End: 24,
},
&analysis.Token{
Term: []byte("vwxy"),
Type: analysis.AlphaNumeric,
Position: 5,
Start: 25,
End: 29,
},
&analysis.Token{
Term: []byte("z"),
Type: analysis.AlphaNumeric,
Position: 6,
Start: 30,
End: 31,
},
},
},
{
input: []byte("あい"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("あい "),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("test"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Type: analysis.AlphaNumeric,
Position: 1,
Start: 0,
End: 4,
},
},
},
{
input: []byte("test "),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Type: analysis.AlphaNumeric,
Position: 1,
Start: 0,
End: 4,
},
},
},
{
input: []byte("あいtest"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("test"),
Type: analysis.AlphaNumeric,
Position: 3,
Start: 6,
End: 10,
},
},
},
{
input: []byte("testあい "),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Type: analysis.AlphaNumeric,
Position: 1,
Start: 0,
End: 4,
},
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 2,
Start: 4,
End: 10,
},
},
},
{
input: []byte("あいうえおabcかきくけこ"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("いう"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("うえ"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("えお"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("abc"),
Type: analysis.AlphaNumeric,
Position: 6,
Start: 15,
End: 18,
},
&analysis.Token{
Term: []byte("かき"),
Type: analysis.Double,
Position: 7,
Start: 18,
End: 24,
},
&analysis.Token{
Term: []byte("きく"),
Type: analysis.Double,
Position: 8,
Start: 21,
End: 27,
},
&analysis.Token{
Term: []byte("くけ"),
Type: analysis.Double,
Position: 9,
Start: 24,
End: 30,
},
&analysis.Token{
Term: []byte("けこ"),
Type: analysis.Double,
Position: 10,
Start: 27,
End: 33,
},
},
},
{
input: []byte("あいうえおabんcかきくけ こ"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("いう"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("うえ"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("えお"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("ab"),
Type: analysis.AlphaNumeric,
Position: 6,
Start: 15,
End: 17,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Single,
Position: 7,
Start: 17,
End: 20,
},
&analysis.Token{
Term: []byte("c"),
Type: analysis.AlphaNumeric,
Position: 8,
Start: 20,
End: 21,
},
&analysis.Token{
Term: []byte("かき"),
Type: analysis.Double,
Position: 9,
Start: 21,
End: 27,
},
&analysis.Token{
Term: []byte("きく"),
Type: analysis.Double,
Position: 10,
Start: 24,
End: 30,
},
&analysis.Token{
Term: []byte("くけ"),
Type: analysis.Double,
Position: 11,
Start: 27,
End: 33,
},
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Single,
Position: 13,
Start: 34,
End: 37,
},
},
},
{
input: []byte("一 روبرت موير"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("روبرت"),
Type: analysis.AlphaNumeric,
Position: 2,
Start: 4,
End: 14,
},
&analysis.Token{
Term: []byte("موير"),
Type: analysis.AlphaNumeric,
Position: 3,
Start: 15,
End: 23,
},
},
},
{
input: []byte("一 رُوبرت موير"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("رُوبرت"),
Type: analysis.AlphaNumeric,
Position: 2,
Start: 4,
End: 16,
},
&analysis.Token{
Term: []byte("موير"),
Type: analysis.AlphaNumeric,
Position: 3,
Start: 17,
End: 25,
},
},
},
{
input: []byte("𩬅艱鍟䇹愯瀛"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("𩬅艱"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 7,
},
&analysis.Token{
Term: []byte("艱鍟"),
Type: analysis.Double,
Position: 2,
Start: 4,
End: 10,
},
&analysis.Token{
Term: []byte("鍟䇹"),
Type: analysis.Double,
Position: 3,
Start: 7,
End: 13,
},
&analysis.Token{
Term: []byte("䇹愯"),
Type: analysis.Double,
Position: 4,
Start: 10,
End: 16,
},
&analysis.Token{
Term: []byte("愯瀛"),
Type: analysis.Double,
Position: 5,
Start: 13,
End: 19,
},
},
},
{
input: []byte("一"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
},
},
{
input: []byte("一丁丂"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一丁"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("丁丂"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
},
},
}
cache := registry.NewCache()
for _, test := range tests {
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View File

@ -1,166 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"container/ring"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const BigramName = "cjk_bigram"
type CJKBigramFilter struct {
outputUnigram bool
}
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
return &CJKBigramFilter{
outputUnigram: outputUnigram,
}
}
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
r := ring.New(2)
itemsInRing := 0
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
if token.Type == analysis.Ideographic {
if itemsInRing > 0 {
// if items already buffered
// check to see if this is aligned
curr := r.Value.(*analysis.Token)
if token.Start-curr.End != 0 {
// not aligned flush
flushToken := s.flush(r, &itemsInRing)
if flushToken != nil {
rv = append(rv, flushToken)
}
}
}
// now we can add this token to the buffer
r = r.Next()
r.Value = token
if itemsInRing < 2 {
itemsInRing++
}
if itemsInRing > 1 && s.outputUnigram {
unigram := s.buildUnigram(r, &itemsInRing)
if unigram != nil {
rv = append(rv, unigram)
}
}
bigramToken := s.outputBigram(r, &itemsInRing)
if bigramToken != nil {
rv = append(rv, bigramToken)
}
} else {
// flush anything already buffered
flushToken := s.flush(r, &itemsInRing)
if flushToken != nil {
rv = append(rv, flushToken)
}
// output this token as is
rv = append(rv, token)
}
}
// deal with possible trailing unigram
if itemsInRing == 1 || s.outputUnigram {
if itemsInRing == 2 {
r = r.Next()
}
unigram := s.buildUnigram(r, &itemsInRing)
if unigram != nil {
rv = append(rv, unigram)
}
}
return rv
}
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int) *analysis.Token {
var rv *analysis.Token
if *itemsInRing == 1 {
rv = s.buildUnigram(r, itemsInRing)
}
r.Value = nil
*itemsInRing = 0
return rv
}
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
if *itemsInRing == 2 {
thisShingleRing := r.Move(-1)
shingledBytes := make([]byte, 0)
// do first token
prev := thisShingleRing.Value.(*analysis.Token)
shingledBytes = append(shingledBytes, prev.Term...)
// do second token
thisShingleRing = thisShingleRing.Next()
curr := thisShingleRing.Value.(*analysis.Token)
shingledBytes = append(shingledBytes, curr.Term...)
token := analysis.Token{
Type: analysis.Double,
Term: shingledBytes,
Position: prev.Position,
Start: prev.Start,
End: curr.End,
}
return &token
}
return nil
}
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
if *itemsInRing == 2 {
thisShingleRing := r.Move(-1)
// do first token
prev := thisShingleRing.Value.(*analysis.Token)
token := analysis.Token{
Type: analysis.Single,
Term: prev.Term,
Position: prev.Position,
Start: prev.Start,
End: prev.End,
}
return &token
} else if *itemsInRing == 1 {
// do first token
prev := r.Value.(*analysis.Token)
token := analysis.Token{
Type: analysis.Single,
Term: prev.Term,
Position: prev.Position,
Start: prev.Start,
End: prev.End,
}
return &token
}
return nil
}
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
outputUnigram := false
outVal, ok := config["output_unigram"].(bool)
if ok {
outputUnigram = outVal
}
return NewCJKBigramFilter(outputUnigram), nil
}
func init() {
registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
}
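The constructor above reads an optional output_unigram flag from its config map. A minimal sketch of constructing the filter that way, assumed to run inside this cjk package; the constructor never touches its cache argument, so nil is passed.
func exampleBigramFromConfig() (analysis.TokenFilter, error) {
    // Emit unigrams alongside bigrams, as exercised by the
    // outputUnigram test cases below.
    cfg := map[string]interface{}{"output_unigram": true}
    return CJKBigramFilterConstructor(cfg, nil)
}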

View File

@ -1,420 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestCJKBigramFilter(t *testing.T) {
tests := []struct {
outputUnigram bool
input analysis.TokenStream
output analysis.TokenStream
}{
{
outputUnigram: false,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 5,
End: 7,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Single,
Position: 2,
Start: 5,
End: 7,
},
},
},
{
outputUnigram: false,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Ideographic,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Ideographic,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Ideographic,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Ideographic,
Position: 6,
Start: 15,
End: 18,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Ideographic,
Position: 7,
Start: 18,
End: 21,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("は世"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 6,
Start: 15,
End: 21,
},
},
},
{
outputUnigram: true,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Ideographic,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Ideographic,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Ideographic,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Ideographic,
Position: 6,
Start: 15,
End: 18,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Ideographic,
Position: 7,
Start: 18,
End: 21,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Single,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Single,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Single,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Single,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("は世"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Single,
Position: 6,
Start: 15,
End: 18,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 6,
Start: 15,
End: 21,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Single,
Position: 7,
Start: 18,
End: 21,
},
},
},
{
outputUnigram: false,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Ideographic,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Ideographic,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Ideographic,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("cat"),
Type: analysis.AlphaNumeric,
Position: 6,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Ideographic,
Position: 7,
Start: 18,
End: 21,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Ideographic,
Position: 8,
Start: 21,
End: 24,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("cat"),
Type: analysis.AlphaNumeric,
Position: 6,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 7,
Start: 18,
End: 24,
},
},
},
}
for _, test := range tests {
cjkBigramFilter := NewCJKBigramFilter(test.outputUnigram)
actual := cjkBigramFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}

View File

@ -1,58 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package ckb
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/blevesearch/bleve/registry"
)
const AnalyzerName = "ckb"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopCkbFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
normCkbFilter,
toLowerFilter,
stopCkbFilter,
stemmerCkbFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,74 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package ckb
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestSoraniAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stop word removal
{
input: []byte("ئەم پیاوە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 2,
Start: 7,
End: 17,
},
},
},
{
input: []byte("پیاوە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 10,
},
},
},
{
input: []byte("پیاو"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 8,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View File

@ -1,113 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"bytes"
"unicode"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const NormalizeName = "normalize_ckb"
const (
Yeh = '\u064A'
DotlessYeh = '\u0649'
FarsiYeh = '\u06CC'
Kaf = '\u0643'
Keheh = '\u06A9'
Heh = '\u0647'
Ae = '\u06D5'
Zwnj = '\u200C'
HehDoachashmee = '\u06BE'
TehMarbuta = '\u0629'
Reh = '\u0631'
Rreh = '\u0695'
RrehAbove = '\u0692'
Tatweel = '\u0640'
Fathatan = '\u064B'
Dammatan = '\u064C'
Kasratan = '\u064D'
Fatha = '\u064E'
Damma = '\u064F'
Kasra = '\u0650'
Shadda = '\u0651'
Sukun = '\u0652'
)
type SoraniNormalizeFilter struct {
}
func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
return &SoraniNormalizeFilter{}
}
func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
func normalize(input []byte) []byte {
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case Yeh, DotlessYeh:
runes[i] = FarsiYeh
case Kaf:
runes[i] = Keheh
case Zwnj:
if i > 0 && runes[i-1] == Heh {
runes[i-1] = Ae
}
runes = analysis.DeleteRune(runes, i)
i--
case Heh:
if i == len(runes)-1 {
runes[i] = Ae
}
case TehMarbuta:
runes[i] = Ae
case HehDoachashmee:
runes[i] = Heh
case Reh:
if i == 0 {
runes[i] = Rreh
}
case RrehAbove:
runes[i] = Rreh
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
runes = analysis.DeleteRune(runes, i)
i--
default:
if unicode.In(runes[i], unicode.Cf) {
runes = analysis.DeleteRune(runes, i)
i--
}
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -1,318 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestSoraniNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// test Y
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064A"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0649"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
// test K
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0643"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
},
// test H
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u200C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u200C\u06A9"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5\u06A9"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06BE"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0629"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5"),
},
},
},
// test final H
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u0647\u0647"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u0647\u06D5"),
},
},
},
// test RR
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0692"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0695"),
},
},
},
// test initial RR
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0631\u0631\u0631"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0695\u0631\u0631"),
},
},
},
// test remove
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0640"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064B"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064D"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064E"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064F"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0650"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0651"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0652"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u200C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
soraniNormalizeFilter := NewSoraniNormalizeFilter()
for _, test := range tests {
actual := soraniNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -1,143 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"bytes"
"unicode/utf8"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_ckb"
type SoraniStemmerFilter struct {
}
func NewSoraniStemmerFilter() *SoraniStemmerFilter {
return &SoraniStemmerFilter{}
}
func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
// if not protected keyword, stem it
if !token.KeyWord {
stemmed := stem(token.Term)
token.Term = stemmed
}
}
return input
}
func stem(input []byte) []byte {
inputLen := utf8.RuneCount(input)
// postposition
if inputLen > 5 && bytes.HasSuffix(input, []byte("دا")) {
input = truncateRunes(input, 2)
inputLen = utf8.RuneCount(input)
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("نا")) {
input = truncateRunes(input, 1)
inputLen = utf8.RuneCount(input)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەوە")) {
input = truncateRunes(input, 3)
inputLen = utf8.RuneCount(input)
}
// possessive pronoun
if inputLen > 6 &&
(bytes.HasSuffix(input, []byte("مان")) ||
bytes.HasSuffix(input, []byte("یان")) ||
bytes.HasSuffix(input, []byte("تان"))) {
input = truncateRunes(input, 3)
inputLen = utf8.RuneCount(input)
}
// indefinite singular ezafe
if inputLen > 6 && bytes.HasSuffix(input, []byte("ێکی")) {
return truncateRunes(input, 3)
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یەکی")) {
return truncateRunes(input, 4)
}
if inputLen > 5 && bytes.HasSuffix(input, []byte("ێک")) {
// indefinite singular
return truncateRunes(input, 2)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یەک")) {
// indefinite singular
return truncateRunes(input, 3)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەکە")) {
// definite singular
return truncateRunes(input, 3)
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("کە")) {
// definite singular
return truncateRunes(input, 2)
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("ەکان")) {
// definite plural
return truncateRunes(input, 4)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("کان")) {
// definite plural
return truncateRunes(input, 3)
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانی")) {
// indefinite plural ezafe
return truncateRunes(input, 4)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انی")) {
// indefinite plural ezafe
return truncateRunes(input, 3)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یان")) {
// indefinite plural
return truncateRunes(input, 3)
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("ان")) {
// indefinite plural
return truncateRunes(input, 2)
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانە")) {
// demonstrative plural
return truncateRunes(input, 4)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انە")) {
// demonstrative plural
return truncateRunes(input, 3)
} else if inputLen > 5 && (bytes.HasSuffix(input, []byte("ایە")) || bytes.HasSuffix(input, []byte("ەیە"))) {
// demonstrative singular
return truncateRunes(input, 2)
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ە")) {
// demonstrative singular
return truncateRunes(input, 1)
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ی")) {
// absolute singular ezafe
return truncateRunes(input, 1)
}
return input
}
func truncateRunes(input []byte, num int) []byte {
runes := bytes.Runes(input)
runes = runes[:len(runes)-num]
out := buildTermFromRunes(runes)
return out
}
func buildTermFromRunes(runes []rune) []byte {
rv := make([]byte, 0, len(runes)*4)
for _, r := range runes {
runeBytes := make([]byte, utf8.RuneLen(r))
utf8.EncodeRune(runeBytes, r)
rv = append(rv, runeBytes...)
}
return rv
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniStemmerFilter(), nil
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,294 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/tokenizers/single_token"
)
func TestSoraniStemmerFilter(t *testing.T) {
// In order to match the Lucene tests, we test with an analyzer,
// not just the stemmer.
analyzer := analysis.Analyzer{
Tokenizer: single_token.NewSingleTokenTokenizer(),
TokenFilters: []analysis.TokenFilter{
NewSoraniNormalizeFilter(),
NewSoraniStemmerFilter(),
},
}
tests := []struct {
input []byte
output analysis.TokenStream
}{
{ // -ek
input: []byte("پیاوێک"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // -yek
input: []byte("دەرگایەک"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -aka
input: []byte("پیاوەكە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -ka
input: []byte("دەرگاكە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -a
input: []byte("کتاویە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("کتاوی"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // -ya
input: []byte("دەرگایە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -An
input: []byte("پیاوان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // -yAn
input: []byte("دەرگایان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -akAn
input: []byte("پیاوەکان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -kAn
input: []byte("دەرگاکان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -Ana
input: []byte("پیاوانە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -yAna
input: []byte("دەرگایانە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 18,
},
},
},
{ // Ezafe singular
input: []byte("هۆتیلی"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هۆتیل"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // Ezafe indefinite
input: []byte("هۆتیلێکی"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هۆتیل"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // Ezafe plural
input: []byte("هۆتیلانی"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هۆتیل"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -awa
input: []byte("دوورەوە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دوور"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -dA
input: []byte("نیوەشەودا"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("نیوەشەو"),
Position: 1,
Start: 0,
End: 18,
},
},
},
{ // -A
input: []byte("سۆرانا"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سۆران"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // -mAn
input: []byte("پارەمان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پارە"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -tAn
input: []byte("پارەتان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پارە"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -yAn
input: []byte("پارەیان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پارە"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // empty
input: []byte(""),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
Position: 1,
Start: 0,
End: 0,
},
},
},
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("for input %s(% x)", test.input, test.input)
t.Errorf("\texpected:")
for _, token := range test.output {
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
}
t.Errorf("\tactual:")
for _, token := range actual {
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,160 +0,0 @@
package ckb
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_ckb"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var SoraniStopWords = []byte(`# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر
# demonstratives
# that
ئەو
# this
ئەم
# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان
# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SoraniStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}
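The stop word list above is pure data; TokenMapConstructor loads it into a token map, and the stop filter constructor in the previous file wraps that map in a filter. A minimal sketch wiring the two together by hand, without going through the registry; the three-word list and the token offsets are illustrative only:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
)

func main() {
	// Load a tiny stop word list into a token map, as TokenMapConstructor
	// does with SoraniStopWords, then build the stop filter around it.
	tokenMap := analysis.NewTokenMap()
	if err := tokenMap.LoadBytes([]byte("و\nکە\nی\n")); err != nil {
		panic(err)
	}
	stopFilter := stop_tokens_filter.NewStopTokensFilter(tokenMap)

	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("و"), Position: 1, Start: 0, End: 2},
		&analysis.Token{Term: []byte("پیاو"), Position: 2, Start: 3, End: 11},
	}
	for _, token := range stopFilter.Filter(stream) {
		fmt.Printf("%s\n", token.Term) // only "پیاو" survives; the stop word is dropped
	}
}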

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cs
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,196 +0,0 @@
package cs
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_cs"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var CzechStopWords = []byte(`a
s
k
o
i
u
v
z
dnes
cz
tímto
budeš
budem
byli
jseš
můj
svým
ta
tomto
tohle
tuto
tyto
jej
zda
proč
máte
tato
kam
tohoto
kdo
kteří
mi
nám
tom
tomuto
mít
nic
proto
kterou
byla
toho
protože
asi
ho
naši
napište
re
což
tím
takže
svých
její
svými
jste
aj
tu
tedy
teto
bylo
kde
ke
pravé
ji
nad
nejsou
či
pod
téma
mezi
přes
ty
pak
vám
ani
když
však
neg
jsem
tento
článku
články
aby
jsme
před
pta
jejich
byl
ještě
bez
také
pouze
první
vaše
která
nás
nový
tipy
pokud
může
strana
jeho
své
jiné
zprávy
nové
není
vás
jen
podle
zde
být
více
bude
již
než
který
by
které
co
nebo
ten
tak
při
od
po
jsou
jak
další
ale
si
se
ve
to
jako
za
zpět
ze
do
pro
je
na
atd
atp
jakmile
přičemž
on
ona
ono
oni
ony
my
vy
ji
mne
jemu
tomu
těm
těmu
němu
němuž
jehož
jíž
jelikož
jež
jakož
načež
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CzechStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}
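Each of these files follows the same registration pattern: init() registers a constructor under a well-known name, and consumers resolve it later through a registry cache. A small sketch of that lookup path; the blank import path for the cs package is an assumption, since this diff does not show where the package lives:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/registry"

	// Imported for its init() side effect, which registers "stop_cs".
	// The exact import path is assumed, not shown in this diff.
	_ "github.com/blevesearch/bleve/analysis/language/cs"
)

func main() {
	cache := registry.NewCache()

	// The token map and the stop filter built on top of it are both
	// registered under the StopName constant ("stop_cs").
	tokenMap, err := cache.TokenMapNamed("stop_cs")
	if err != nil {
		panic(err)
	}
	fmt.Printf("token map: %T\n", tokenMap)

	stopFilter, err := cache.TokenFilterNamed("stop_cs")
	if err != nil {
		panic(err)
	}
	fmt.Printf("stop filter: %T\n", stopFilter)
}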

View File

@ -1,54 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full

package da
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/blevesearch/bleve/registry"
)
const AnalyzerName = "da"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopDaFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerDaFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDaFilter,
stemmerDaFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}
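The two +build lines at the top of this file stack: separate constraint lines are ANDed and space-separated terms within a line are ORed, so the Danish analyzer is only compiled when both the libstemmer and icu features are enabled, or when the single full tag covers both. A minimal illustration of the constraint and the matching build invocations; everything other than the tag names above is illustrative:

// +build libstemmer full
// +build icu full

// Compiled only when (libstemmer OR full) AND (icu OR full) holds, e.g.:
//
//	go build -tags "libstemmer icu" ./...
//	go build -tags full ./...
//
// The blank line between the constraints and the package clause is required;
// without it the go tool reads them as package documentation, not constraints.
package da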

View File

@ -1,69 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full

package da
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestDanishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("undersøg"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("undersøg"),
Position: 1,
Start: 0,
End: 9,
},
},
},
{
input: []byte("undersøgelse"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("undersøg"),
Position: 1,
Start: 0,
End: 13,
},
},
},
// stop word
{
input: []byte("på"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}
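The stop word case expects analysis.TokenStream{} rather than nil, and reflect.DeepEqual treats those differently, so the fixture also pins down that Analyze returns an empty, non-nil stream for fully filtered input. A two-line demonstration of that distinction, using plain int slices as a stand-in:

package main

import (
	"fmt"
	"reflect"
)

func main() {
	var nilSlice []int
	emptySlice := []int{}
	fmt.Println(reflect.DeepEqual(nilSlice, emptySlice)) // false: nil vs empty
	fmt.Println(reflect.DeepEqual(emptySlice, []int{}))  // true
}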

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full

package da
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_da"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("da")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}
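StemmerName wraps the Snowball stemmer from the libstemmer C library, which is why this file carries the libstemmer build tag. A minimal sketch running the Danish stemmer directly over a one-token stream; it assumes a build with the libstemmer tag and the snowball bindings available:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
)

func main() {
	stemmer, err := stemmer_filter.NewStemmerFilter("da")
	if err != nil {
		panic(err)
	}

	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("undersøgelse"), Position: 1, Start: 0, End: 13},
	}
	for _, token := range stemmer.Filter(stream) {
		fmt.Printf("%s\n", token.Term) // expected: undersøg, matching the Danish test above
	}
}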

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package da
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,134 +0,0 @@
package da
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_da"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
| on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/youself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(DanishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,59 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full

package de
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/blevesearch/bleve/registry"
)
const AnalyzerName = "de"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopDeFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDeFilter,
normalizeDeFilter,
stemmerDeFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,97 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full

package de
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestGermanAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte("Tisch"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 5,
},
},
},
{
input: []byte("Tische"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("Tischen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 7,
},
},
},
// german specials
{
input: []byte("Schaltflächen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("schaltflach"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{
input: []byte("Schaltflaechen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("schaltflach"),
Position: 1,
Start: 0,
End: 14,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View File

@ -1,94 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"bytes"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const NormalizeName = "normalize_de"
const (
N = 0 /* ordinary state */
V = 1 /* stops 'u' from entering umlaut state */
U = 2 /* umlaut state, allows e-deletion */
)
type GermanNormalizeFilter struct {
}
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
return &GermanNormalizeFilter{}
}
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
func normalize(input []byte) []byte {
state := N
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case 'a', 'o':
state = U
case 'u':
if state == N {
state = U
} else {
state = V
}
case 'e':
if state == U {
runes = analysis.DeleteRune(runes, i)
i--
}
state = V
case 'i', 'q', 'y':
state = V
case 'ä':
runes[i] = 'a'
state = V
case 'ö':
runes[i] = 'o'
state = V
case 'ü':
runes[i] = 'u'
state = V
case 'ß':
runes[i] = 's'
i++
// newrunes := make([]rune, len(runes)+1)
// copy(newrunes, runes)
// runes = newrunes
// runes[i] = 's'
runes = analysis.InsertRune(runes, i, 's')
state = N
default:
state = N
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewGermanNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}
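The Filter method applies the state machine above token by token, and two already-lowercased spellings of the same word converge on a single normalized form, which is what lets the German analyzer test above accept both Schaltflächen and Schaltflaechen. A sketch of an example test that would live in a _test.go file alongside this package; the token offsets are illustrative:

package de

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis"
)

// Both spellings normalize to "schaltflachen": 'ä' is folded to 'a', and the
// 'e' of "ae" is deleted because it follows a vowel in the umlaut state.
func ExampleGermanNormalizeFilter() {
	filter := NewGermanNormalizeFilter()
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("schaltflächen"), Position: 1, Start: 0, End: 14},
		&analysis.Token{Term: []byte("schaltflaechen"), Position: 2, Start: 15, End: 29},
	}
	for _, token := range filter.Filter(stream) {
		fmt.Printf("%s\n", token.Term)
	}
	// Output:
	// schaltflachen
	// schaltflachen
}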

Some files were not shown because too many files have changed in this diff.