Switch from Godep to go vendoring
This commit is contained in:
parent
6b37713bc0
commit
cd317761c5
1504 changed files with 263076 additions and 34441 deletions
121
vendor/github.com/blevesearch/bleve/analysis/tokenizers/exception/exception.go
generated
vendored
Normal file
121
vendor/github.com/blevesearch/bleve/analysis/tokenizers/exception/exception.go
generated
vendored
Normal file
|
@ -0,0 +1,121 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package exception
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "exception"
|
||||
|
||||
// ExceptionsTokenizer preserves any span of input matching the
// exception regexp as a single verbatim token, and delegates
// tokenization of the text between matches to the remaining tokenizer.
type ExceptionsTokenizer struct {
	exception *regexp.Regexp     // pattern whose matches become single tokens
	remaining analysis.Tokenizer // tokenizer applied to the non-matching spans
}
|
||||
|
||||
func NewExceptionsTokenizer(exception *regexp.Regexp, remaining analysis.Tokenizer) *ExceptionsTokenizer {
|
||||
return &ExceptionsTokenizer{
|
||||
exception: exception,
|
||||
remaining: remaining,
|
||||
}
|
||||
}
|
||||
|
||||
// Tokenize splits input into tokens. Spans matching the exception
// pattern are emitted verbatim as single tokens; the text between
// matches is tokenized by the remaining tokenizer, with each delegated
// token's position and byte offsets shifted so they are relative to the
// full input rather than to the sub-slice it was tokenized from.
func (t *ExceptionsTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	matches := t.exception.FindAllIndex(input, -1)
	currInput := 0 // byte offset of the first input byte not yet consumed
	lastPos := 0   // number of tokens emitted so far (positions are 1-based)
	for _, match := range matches {
		start := match[0]
		end := match[1]
		if start > currInput {
			// need to defer to remaining for unprocessed section
			intermediate := t.remaining.Tokenize(input[currInput:start])
			// add intermediate tokens to our result stream
			for _, token := range intermediate {
				// adjust token offsets: positions restart at 1 inside the
				// sub-slice and byte offsets are sub-slice relative
				token.Position += lastPos
				token.Start += currInput
				token.End += currInput
				rv = append(rv, token)
			}
			lastPos += len(intermediate)
			currInput = start
		}

		// create single token with this regexp match
		// NOTE: Term aliases input; callers must not mutate the input slice
		token := &analysis.Token{
			Term:     input[start:end],
			Start:    start,
			End:      end,
			Position: lastPos + 1,
		}
		rv = append(rv, token)
		lastPos++
		currInput = end

	}

	if currInput < len(input) {
		// need to defer to remaining for unprocessed trailing section
		intermediate := t.remaining.Tokenize(input[currInput:])
		// add intermediate tokens to our result stream
		for _, token := range intermediate {
			// adjust token offsets
			token.Position += lastPos
			token.Start += currInput
			token.End += currInput
			rv = append(rv, token)
		}
	}

	return rv
}
|
||||
|
||||
func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
exceptions := []string{}
|
||||
iexceptions, ok := config["exceptions"].([]interface{})
|
||||
if ok {
|
||||
for _, exception := range iexceptions {
|
||||
exception, ok := exception.(string)
|
||||
if ok {
|
||||
exceptions = append(exceptions, exception)
|
||||
}
|
||||
}
|
||||
}
|
||||
aexceptions, ok := config["exceptions"].([]string)
|
||||
if ok {
|
||||
exceptions = append(exceptions, aexceptions...)
|
||||
}
|
||||
exceptionPattern := strings.Join(exceptions, "|")
|
||||
r, err := regexp.Compile(exceptionPattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to build regexp tokenizer: %v", err)
|
||||
}
|
||||
|
||||
remainingName, ok := config["tokenizer"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify tokenizer for remaining input")
|
||||
}
|
||||
remaining, err := cache.TokenizerNamed(remainingName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return NewExceptionsTokenizer(r, remaining), nil
|
||||
}
|
||||
|
||||
// init registers this tokenizer with the bleve registry under Name.
func init() {
	registry.RegisterTokenizer(Name, ExceptionsTokenizerConstructor)
}
|
157
vendor/github.com/blevesearch/bleve/analysis/tokenizers/exception/exception_test.go
generated
vendored
Normal file
157
vendor/github.com/blevesearch/bleve/analysis/tokenizers/exception/exception_test.go
generated
vendored
Normal file
|
@ -0,0 +1,157 @@
|
|||
package exception
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// TestExceptionsTokenizer defines a "custom" tokenizer through the
// registry from a config map and verifies that the exception patterns
// (URLs, e-mail addresses) survive as single tokens while the rest of
// the text is tokenized by the wrapped "unicode" tokenizer, with byte
// offsets and positions relative to the whole input.
func TestExceptionsTokenizer(t *testing.T) {
	tests := []struct {
		config   map[string]interface{}
		input    []byte
		patterns []string // unused; retained from an earlier direct-construction version of this test
		result   analysis.TokenStream
	}{
		{
			input: []byte("test http://blevesearch.com/ words"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("http://blevesearch.com/"),
					Position: 2,
					Start:    5,
					End:      28,
				},
				&analysis.Token{
					Term:     []byte("words"),
					Position: 3,
					Start:    29,
					End:      34,
				},
			},
		},
		{
			input: []byte("what ftp://blevesearch.com/ songs"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("what"),
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("ftp://blevesearch.com/"),
					Position: 2,
					Start:    5,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("songs"),
					Position: 3,
					Start:    28,
					End:      33,
				},
			},
		},
		{
			input: []byte("please email marty@couchbase.com the URL https://blevesearch.com/"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
					`\S+@\S+`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("please"),
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("email"),
					Position: 2,
					Start:    7,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("marty@couchbase.com"),
					Position: 3,
					Start:    13,
					End:      32,
				},
				&analysis.Token{
					Term:     []byte("the"),
					Position: 4,
					Start:    33,
					End:      36,
				},
				&analysis.Token{
					Term:     []byte("URL"),
					Position: 5,
					Start:    37,
					End:      40,
				},
				&analysis.Token{
					Term:     []byte("https://blevesearch.com/"),
					Position: 6,
					Start:    41,
					End:      65,
				},
			},
		},
	}

	// remaining := unicode.NewUnicodeTokenizer()
	for _, test := range tests {

		// build the requested exception tokenizer
		cache := registry.NewCache()
		tokenizer, err := cache.DefineTokenizer("custom", test.config)
		if err != nil {
			t.Fatal(err)
		}

		// pattern := strings.Join(test.patterns, "|")
		// r, err := regexp.Compile(pattern)
		// if err != nil {
		// 	t.Fatal(err)
		// }
		// tokenizer := NewExceptionsTokenizer(r, remaining)
		actual := tokenizer.Tokenize(test.input)
		if !reflect.DeepEqual(actual, test.result) {
			t.Errorf("expected %v, got %v", test.result, actual)
		}
	}
}
|
138
vendor/github.com/blevesearch/bleve/analysis/tokenizers/icu/boundary.go
generated
vendored
Normal file
138
vendor/github.com/blevesearch/bleve/analysis/tokenizers/icu/boundary.go
generated
vendored
Normal file
|
@ -0,0 +1,138 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build icu full
|
||||
|
||||
package icu
|
||||
|
||||
// #cgo LDFLAGS: -licuuc -licudata
|
||||
// #include <stdio.h>
|
||||
// #include <stdlib.h>
|
||||
// #include "unicode/utypes.h"
|
||||
// #include "unicode/uchar.h"
|
||||
// #include "unicode/ubrk.h"
|
||||
// #include "unicode/ustring.h"
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"unsafe"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "icu"
|
||||
|
||||
// UnicodeWordBoundaryTokenizer tokenizes input on Unicode word
// boundaries using the ICU ubrk break-iterator API via cgo.
type UnicodeWordBoundaryTokenizer struct {
	locale *C.char // ICU locale name; nil selects ICU's default locale
}

// NewUnicodeWordBoundaryTokenizer returns a tokenizer that uses the
// default ICU locale.
func NewUnicodeWordBoundaryTokenizer() *UnicodeWordBoundaryTokenizer {
	return &UnicodeWordBoundaryTokenizer{}
}

// NewUnicodeWordBoundaryCustomLocaleTokenizer returns a tokenizer that
// uses the named ICU locale.
// NOTE(review): the C string allocated here is never freed; this
// assumes tokenizers live for the life of the process — confirm.
func NewUnicodeWordBoundaryCustomLocaleTokenizer(locale string) *UnicodeWordBoundaryTokenizer {
	return &UnicodeWordBoundaryTokenizer{
		locale: C.CString(locale),
	}
}
|
||||
|
||||
// Tokenize splits input into word tokens using an ICU break iterator.
//
// The UTF-8 input is copied into a UTF-16 buffer (ICU's native string
// encoding) and iterated with ubrk. Each UTF-16 boundary is converted
// back to a UTF-8 byte offset via u_strToUTF8 in preflight mode
// (nil destination, zero capacity), which only reports the converted
// length; the expected U_BUFFER_OVERFLOW_ERROR is cleared. On any other
// ICU error the tokens collected so far are returned.
func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)

	if len(input) < 1 {
		return rv
	}

	// works
	var myUnsafePointer = unsafe.Pointer(&(input[0]))
	var myCCharPointer *C.char = (*C.char)(myUnsafePointer)

	var inlen C.int32_t = C.int32_t(len(input))
	var buflen C.int32_t = C.int32_t(2*len(input) + 1) // worse case each byte becomes 2
	var stringToExamine []C.UChar = make([]C.UChar, buflen)
	var myUnsafePointerToExamine = unsafe.Pointer(&(stringToExamine[0]))
	var myUCharPointer *C.UChar = (*C.UChar)(myUnsafePointerToExamine)
	C.u_uastrncpy(myUCharPointer, myCCharPointer, inlen)

	var err C.UErrorCode = C.U_ZERO_ERROR
	bi := C.ubrk_open(C.UBRK_WORD, t.locale, myUCharPointer, -1, &err)

	if err > C.U_ZERO_ERROR {
		return rv
	}

	defer C.ubrk_close(bi)

	position := 0
	var prev C.int32_t
	p := C.ubrk_first(bi)
	for p != C.UBRK_DONE {

		// rule status classifies the span that ends at this boundary
		q := C.ubrk_getRuleStatus(bi)

		// convert boundaries back to utf8 positions
		var nilCString *C.char
		var indexA C.int32_t

		C.u_strToUTF8(nilCString, 0, &indexA, myUCharPointer, prev, &err)
		if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR {
			return rv
		} else {
			err = C.U_ZERO_ERROR
		}

		var indexB C.int32_t
		C.u_strToUTF8(nilCString, 0, &indexB, myUCharPointer, p, &err)
		if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR {
			return rv
		} else {
			err = C.U_ZERO_ERROR
		}

		// q == 0 is UBRK_WORD_NONE (whitespace/punctuation): no token
		if q != 0 {
			position += 1
			token := analysis.Token{
				Start:    int(indexA),
				End:      int(indexB),
				Term:     input[indexA:indexB],
				Position: position,
				Type:     analysis.AlphaNumeric,
			}
			// per ICU, rule status 100 is UBRK_WORD_NUMBER and 400 is
			// UBRK_WORD_IDEO
			if q == 100 {
				token.Type = analysis.Numeric
			}
			if q == 400 {
				token.Type = analysis.Ideographic
			}
			rv = append(rv, &token)
		}
		prev = p
		p = C.ubrk_next(bi)
	}

	return rv
}
|
||||
|
||||
func UnicodeWordBoundaryTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
locale := ""
|
||||
localeVal, ok := config["locale"].(string)
|
||||
if ok {
|
||||
locale = localeVal
|
||||
}
|
||||
if locale == "" {
|
||||
return NewUnicodeWordBoundaryTokenizer(), nil
|
||||
} else {
|
||||
return NewUnicodeWordBoundaryCustomLocaleTokenizer(locale), nil
|
||||
}
|
||||
}
|
||||
|
||||
// init registers this tokenizer with the bleve registry under Name.
func init() {
	registry.RegisterTokenizer(Name, UnicodeWordBoundaryTokenizerConstructor)
}
|
191
vendor/github.com/blevesearch/bleve/analysis/tokenizers/icu/boundary_test.go
generated
vendored
Normal file
191
vendor/github.com/blevesearch/bleve/analysis/tokenizers/icu/boundary_test.go
generated
vendored
Normal file
|
@ -0,0 +1,191 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// +build icu full
|
||||
|
||||
package icu
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
// TestBoundary verifies ICU word-boundary tokenization across scripts:
// plain English, an apostrophe word, Japanese (ideographic runs), Thai
// (locale-sensitive segmentation), and digit classification. Offsets
// are UTF-8 byte offsets, so multi-byte scripts advance by 3 bytes per
// code point.
func TestBoundary(t *testing.T) {

	tests := []struct {
		input  []byte
		locale string
		output analysis.TokenStream
	}{
		{
			[]byte("Hello World"),
			"en_US",
			analysis.TokenStream{
				{
					Start:    0,
					End:      5,
					Term:     []byte("Hello"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    6,
					End:      11,
					Term:     []byte("World"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("steven's"),
			"en_US",
			analysis.TokenStream{
				{
					Start:    0,
					End:      8,
					Term:     []byte("steven's"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("こんにちは世界"),
			"en_US",
			analysis.TokenStream{
				{
					Start:    0,
					End:      15,
					Term:     []byte("こんにちは"),
					Position: 1,
					Type:     analysis.Ideographic,
				},
				{
					Start:    15,
					End:      21,
					Term:     []byte("世界"),
					Position: 2,
					Type:     analysis.Ideographic,
				},
			},
		},
		{
			[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
			"th_TH",
			analysis.TokenStream{
				{
					Start:    0,
					End:      9,
					Term:     []byte("แยก"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    9,
					End:      15,
					Term:     []byte("คำ"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    15,
					End:      27,
					Term:     []byte("ภาษา"),
					Position: 3,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    27,
					End:      36,
					Term:     []byte("ไทย"),
					Position: 4,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    36,
					End:      42,
					Term:     []byte("ก็"),
					Position: 5,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    42,
					End:      57,
					Term:     []byte("ทำได้"),
					Position: 6,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    57,
					End:      63,
					Term:     []byte("นะ"),
					Position: 7,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    63,
					End:      72,
					Term:     []byte("จ้ะ"),
					Position: 8,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("age 25"),
			"en_US",
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("age"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    4,
					End:      6,
					Term:     []byte("25"),
					Position: 2,
					Type:     analysis.Numeric,
				},
			},
		},
	}

	for _, test := range tests {
		tokenizer := NewUnicodeWordBoundaryCustomLocaleTokenizer(test.locale)
		actual := tokenizer.Tokenize(test.input)

		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
|
||||
|
||||
var sampleLargeInput = []byte(`There are three characteristics of liquids which are relevant to the discussion of a BLEVE:
|
||||
If a liquid in a sealed container is boiled, the pressure inside the container increases. As the liquid changes to a gas it expands - this expansion in a vented container would cause the gas and liquid to take up more space. In a sealed container the gas and liquid are not able to take up more space and so the pressure rises. Pressurized vessels containing liquids can reach an equilibrium where the liquid stops boiling and the pressure stops rising. This occurs when no more heat is being added to the system (either because it has reached ambient temperature or has had a heat source removed).
|
||||
The boiling temperature of a liquid is dependent on pressure - high pressures will yield high boiling temperatures, and low pressures will yield low boiling temperatures. A common simple experiment is to place a cup of water in a vacuum chamber, and then reduce the pressure in the chamber until the water boils. By reducing the pressure the water will boil even at room temperature. This works both ways - if the pressure is increased beyond normal atmospheric pressures, the boiling of hot water could be suppressed far beyond normal temperatures. The cooling system of a modern internal combustion engine is a real-world example.
|
||||
When a liquid boils it turns into a gas. The resulting gas takes up far more space than the liquid did.
|
||||
Typically, a BLEVE starts with a container of liquid which is held above its normal, atmospheric-pressure boiling temperature. Many substances normally stored as liquids, such as CO2, oxygen, and other similar industrial gases have boiling temperatures, at atmospheric pressure, far below room temperature. In the case of water, a BLEVE could occur if a pressurized chamber of water is heated far beyond the standard 100 °C (212 °F). That container, because the boiling water pressurizes it, is capable of holding liquid water at very high temperatures.
|
||||
If the pressurized vessel, containing liquid at high temperature (which may be room temperature, depending on the substance) ruptures, the pressure which prevents the liquid from boiling is lost. If the rupture is catastrophic, where the vessel is immediately incapable of holding any pressure at all, then there suddenly exists a large mass of liquid which is at very high temperature and very low pressure. This causes the entire volume of liquid to instantaneously boil, which in turn causes an extremely rapid expansion. Depending on temperatures, pressures and the substance involved, that expansion may be so rapid that it can be classified as an explosion, fully capable of inflicting severe damage on its surroundings.`)
|
||||
|
||||
// BenchmarkTokenizeEnglishText measures tokenization throughput over a
// multi-paragraph English sample with the en_US locale; tokenizer
// construction is excluded from the timed region via ResetTimer.
func BenchmarkTokenizeEnglishText(b *testing.B) {

	tokenizer := NewUnicodeWordBoundaryCustomLocaleTokenizer("en_US")
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		tokenizer.Tokenize(sampleLargeInput)
	}

}
|
77
vendor/github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go
generated
vendored
Normal file
77
vendor/github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go
generated
vendored
Normal file
|
@ -0,0 +1,77 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package regexp_tokenizer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "regexp"
|
||||
|
||||
var IdeographRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
|
||||
|
||||
// RegexpTokenizer splits input into tokens using a single regexp: each
// match of r becomes one token.
type RegexpTokenizer struct {
	r *regexp.Regexp // pattern defining what counts as a token
}

// NewRegexpTokenizer returns a RegexpTokenizer driven by r.
func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer {
	return &RegexpTokenizer{
		r: r,
	}
}
|
||||
|
||||
func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
matches := rt.r.FindAllIndex(input, -1)
|
||||
rv := make(analysis.TokenStream, len(matches))
|
||||
for i, match := range matches {
|
||||
matchBytes := input[match[0]:match[1]]
|
||||
token := analysis.Token{
|
||||
Term: matchBytes,
|
||||
Start: match[0],
|
||||
End: match[1],
|
||||
Position: i + 1,
|
||||
Type: detectTokenType(matchBytes),
|
||||
}
|
||||
rv[i] = &token
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// RegexpTokenizerConstructor builds a RegexpTokenizer from config.
// The required "regexp" entry is the pattern string; an error is
// returned when it is missing or does not compile.
func RegexpTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	rval, ok := config["regexp"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify regexp")
	}
	r, err := regexp.Compile(rval)
	if err != nil {
		return nil, fmt.Errorf("unable to build regexp tokenizer: %v", err)
	}
	return NewRegexpTokenizer(r), nil
}
|
||||
|
||||
// init registers this tokenizer with the bleve registry under Name.
func init() {
	registry.RegisterTokenizer(Name, RegexpTokenizerConstructor)
}
|
||||
|
||||
func detectTokenType(termBytes []byte) analysis.TokenType {
|
||||
if IdeographRegexp.Match(termBytes) {
|
||||
return analysis.Ideographic
|
||||
}
|
||||
_, err := strconv.ParseFloat(string(termBytes), 64)
|
||||
if err == nil {
|
||||
return analysis.Numeric
|
||||
}
|
||||
return analysis.AlphaNumeric
|
||||
}
|
115
vendor/github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go
generated
vendored
Normal file
115
vendor/github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go
generated
vendored
Normal file
|
@ -0,0 +1,115 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package regexp_tokenizer
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
// TestBoundary verifies RegexpTokenizer with a word regexp that emits
// \w+ runs and individual CJK code points, including the empty-input
// case (which must yield an empty, non-nil stream). CJK offsets advance
// 3 bytes per code point (UTF-8).
func TestBoundary(t *testing.T) {

	wordRegex := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			[]byte("Hello World."),
			analysis.TokenStream{
				{
					Start:    0,
					End:      5,
					Term:     []byte("Hello"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    6,
					End:      11,
					Term:     []byte("World"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("こんにちは世界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("こ"),
					Position: 1,
					Type:     analysis.Ideographic,
				},
				{
					Start:    3,
					End:      6,
					Term:     []byte("ん"),
					Position: 2,
					Type:     analysis.Ideographic,
				},
				{
					Start:    6,
					End:      9,
					Term:     []byte("に"),
					Position: 3,
					Type:     analysis.Ideographic,
				},
				{
					Start:    9,
					End:      12,
					Term:     []byte("ち"),
					Position: 4,
					Type:     analysis.Ideographic,
				},
				{
					Start:    12,
					End:      15,
					Term:     []byte("は"),
					Position: 5,
					Type:     analysis.Ideographic,
				},
				{
					Start:    15,
					End:      18,
					Term:     []byte("世"),
					Position: 6,
					Type:     analysis.Ideographic,
				},
				{
					Start:    18,
					End:      21,
					Term:     []byte("界"),
					Position: 7,
					Type:     analysis.Ideographic,
				},
			},
		},
		{
			[]byte(""),
			analysis.TokenStream{},
		},
	}

	for _, test := range tests {
		tokenizer := NewRegexpTokenizer(wordRegex)
		actual := tokenizer.Tokenize(test.input)

		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
|
44
vendor/github.com/blevesearch/bleve/analysis/tokenizers/single_token/single_token.go
generated
vendored
Normal file
44
vendor/github.com/blevesearch/bleve/analysis/tokenizers/single_token/single_token.go
generated
vendored
Normal file
|
@ -0,0 +1,44 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package single_token
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "single"
|
||||
|
||||
// SingleTokenTokenizer emits the entire input, unmodified, as one
// token (useful for keyword-style fields).
type SingleTokenTokenizer struct {
}

// NewSingleTokenTokenizer returns a SingleTokenTokenizer.
func NewSingleTokenTokenizer() *SingleTokenTokenizer {
	return &SingleTokenTokenizer{}
}
|
||||
|
||||
func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
return analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: input,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: len(input),
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// SingleTokenTokenizerConstructor builds a SingleTokenTokenizer;
// the config map is ignored.
func SingleTokenTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	return NewSingleTokenTokenizer(), nil
}
|
||||
|
||||
// init registers this tokenizer with the bleve registry under Name.
func init() {
	registry.RegisterTokenizer(Name, SingleTokenTokenizerConstructor)
}
|
71
vendor/github.com/blevesearch/bleve/analysis/tokenizers/single_token/single_token_test.go
generated
vendored
Normal file
71
vendor/github.com/blevesearch/bleve/analysis/tokenizers/single_token/single_token_test.go
generated
vendored
Normal file
|
@ -0,0 +1,71 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package single_token
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
// TestSingleTokenTokenizer verifies that the whole input — ASCII or
// multi-byte UTF-8 — comes back as exactly one AlphaNumeric token whose
// End is the input's byte length (not its rune count).
func TestSingleTokenTokenizer(t *testing.T) {

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			[]byte("Hello World"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      11,
					Term:     []byte("Hello World"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("こんにちは世界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      21,
					Term:     []byte("こんにちは世界"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		{
			[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      72,
					Term:     []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
	}

	for _, test := range tests {
		tokenizer := NewSingleTokenTokenizer()
		actual := tokenizer.Tokenize(test.input)

		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
|
73
vendor/github.com/blevesearch/bleve/analysis/tokenizers/unicode/unicode.go
generated
vendored
Normal file
73
vendor/github.com/blevesearch/bleve/analysis/tokenizers/unicode/unicode.go
generated
vendored
Normal file
|
@ -0,0 +1,73 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package unicode
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/segment"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "unicode"
|
||||
|
||||
// UnicodeTokenizer tokenizes input on Unicode word boundaries using the
// pure-Go blevesearch/segment word segmenter (no cgo/ICU dependency).
type UnicodeTokenizer struct {
}

// NewUnicodeTokenizer returns a UnicodeTokenizer.
func NewUnicodeTokenizer() *UnicodeTokenizer {
	return &UnicodeTokenizer{}
}
|
||||
|
||||
func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
|
||||
segmenter := segment.NewWordSegmenterDirect(input)
|
||||
start := 0
|
||||
pos := 1
|
||||
for segmenter.Segment() {
|
||||
segmentBytes := segmenter.Bytes()
|
||||
end := start + len(segmentBytes)
|
||||
if segmenter.Type() != segment.None {
|
||||
token := analysis.Token{
|
||||
Term: segmentBytes,
|
||||
Start: start,
|
||||
End: end,
|
||||
Position: pos,
|
||||
Type: convertType(segmenter.Type()),
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
pos++
|
||||
}
|
||||
start = end
|
||||
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func UnicodeTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
return NewUnicodeTokenizer(), nil
|
||||
}
|
||||
|
||||
// init registers the tokenizer under its canonical name so analyzers
// can resolve it through the bleve registry.
func init() {
	registry.RegisterTokenizer(Name, UnicodeTokenizerConstructor)
}
|
||||
|
||||
func convertType(segmentWordType int) analysis.TokenType {
|
||||
switch segmentWordType {
|
||||
case segment.Ideo:
|
||||
return analysis.Ideographic
|
||||
case segment.Kana:
|
||||
return analysis.Ideographic
|
||||
case segment.Number:
|
||||
return analysis.Numeric
|
||||
}
|
||||
return analysis.AlphaNumeric
|
||||
}
|
197
vendor/github.com/blevesearch/bleve/analysis/tokenizers/unicode/unicode_test.go
generated
vendored
Normal file
197
vendor/github.com/blevesearch/bleve/analysis/tokenizers/unicode/unicode_test.go
generated
vendored
Normal file
|
@ -0,0 +1,197 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package unicode
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/segment"
|
||||
)
|
||||
|
||||
// TestUnicode exercises the tokenizer over a table of inputs: plain
// English, a word with an embedded apostrophe, Japanese text (one
// Ideographic token per rune, offsets counted in bytes), mixed
// letters/digits, and a single Katakana rune. Each case pins exact
// byte offsets, positions, and token types.
func TestUnicode(t *testing.T) {

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			[]byte("Hello World"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      5,
					Term:     []byte("Hello"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    6,
					End:      11,
					Term:     []byte("World"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		// Unicode word segmentation keeps the apostrophe inside the word.
		{
			[]byte("steven's"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      8,
					Term:     []byte("steven's"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		// Japanese: each rune is 3 bytes in UTF-8, hence offsets 0,3,6,...
		{
			[]byte("こんにちは世界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("こ"),
					Position: 1,
					Type:     analysis.Ideographic,
				},
				{
					Start:    3,
					End:      6,
					Term:     []byte("ん"),
					Position: 2,
					Type:     analysis.Ideographic,
				},
				{
					Start:    6,
					End:      9,
					Term:     []byte("に"),
					Position: 3,
					Type:     analysis.Ideographic,
				},
				{
					Start:    9,
					End:      12,
					Term:     []byte("ち"),
					Position: 4,
					Type:     analysis.Ideographic,
				},
				{
					Start:    12,
					End:      15,
					Term:     []byte("は"),
					Position: 5,
					Type:     analysis.Ideographic,
				},
				{
					Start:    15,
					End:      18,
					Term:     []byte("世"),
					Position: 6,
					Type:     analysis.Ideographic,
				},
				{
					Start:    18,
					End:      21,
					Term:     []byte("界"),
					Position: 7,
					Type:     analysis.Ideographic,
				},
			},
		},
		// A digit run becomes a Numeric token.
		{
			[]byte("age 25"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("age"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    4,
					End:      6,
					Term:     []byte("25"),
					Position: 2,
					Type:     analysis.Numeric,
				},
			},
		},
		// A single Katakana rune is still an Ideographic token.
		{
			[]byte("カ"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("カ"),
					Position: 1,
					Type:     analysis.Ideographic,
				},
			},
		},
	}

	for _, test := range tests {
		tokenizer := NewUnicodeTokenizer()
		actual := tokenizer.Tokenize(test.input)

		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
|
||||
|
||||
// sampleLargeInput is a multi-paragraph English corpus used by the
// benchmark below to exercise the tokenizer on realistic prose.
var sampleLargeInput = []byte(`There are three characteristics of liquids which are relevant to the discussion of a BLEVE:
If a liquid in a sealed container is boiled, the pressure inside the container increases. As the liquid changes to a gas it expands - this expansion in a vented container would cause the gas and liquid to take up more space. In a sealed container the gas and liquid are not able to take up more space and so the pressure rises. Pressurized vessels containing liquids can reach an equilibrium where the liquid stops boiling and the pressure stops rising. This occurs when no more heat is being added to the system (either because it has reached ambient temperature or has had a heat source removed).
The boiling temperature of a liquid is dependent on pressure - high pressures will yield high boiling temperatures, and low pressures will yield low boiling temperatures. A common simple experiment is to place a cup of water in a vacuum chamber, and then reduce the pressure in the chamber until the water boils. By reducing the pressure the water will boil even at room temperature. This works both ways - if the pressure is increased beyond normal atmospheric pressures, the boiling of hot water could be suppressed far beyond normal temperatures. The cooling system of a modern internal combustion engine is a real-world example.
When a liquid boils it turns into a gas. The resulting gas takes up far more space than the liquid did.
Typically, a BLEVE starts with a container of liquid which is held above its normal, atmospheric-pressure boiling temperature. Many substances normally stored as liquids, such as CO2, oxygen, and other similar industrial gases have boiling temperatures, at atmospheric pressure, far below room temperature. In the case of water, a BLEVE could occur if a pressurized chamber of water is heated far beyond the standard 100 °C (212 °F). That container, because the boiling water pressurizes it, is capable of holding liquid water at very high temperatures.
If the pressurized vessel, containing liquid at high temperature (which may be room temperature, depending on the substance) ruptures, the pressure which prevents the liquid from boiling is lost. If the rupture is catastrophic, where the vessel is immediately incapable of holding any pressure at all, then there suddenly exists a large mass of liquid which is at very high temperature and very low pressure. This causes the entire volume of liquid to instantaneously boil, which in turn causes an extremely rapid expansion. Depending on temperatures, pressures and the substance involved, that expansion may be so rapid that it can be classified as an explosion, fully capable of inflicting severe damage on its surroundings.`)
|
||||
|
||||
func BenchmarkTokenizeEnglishText(b *testing.B) {
|
||||
|
||||
tokenizer := NewUnicodeTokenizer()
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
tokenizer.Tokenize(sampleLargeInput)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestConvertType(t *testing.T) {
|
||||
tests := []struct {
|
||||
in int
|
||||
out analysis.TokenType
|
||||
}{
|
||||
{
|
||||
segment.Ideo, analysis.Ideographic,
|
||||
},
|
||||
{
|
||||
segment.Kana, analysis.Ideographic,
|
||||
},
|
||||
{
|
||||
segment.Number, analysis.Numeric,
|
||||
},
|
||||
{
|
||||
segment.Letter, analysis.AlphaNumeric,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
actual := convertType(test.in)
|
||||
if actual != test.out {
|
||||
t.Errorf("expected %d, got %d for %d", test.out, actual, test.in)
|
||||
}
|
||||
}
|
||||
}
|
30
vendor/github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go
generated
vendored
Normal file
30
vendor/github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go
generated
vendored
Normal file
|
@ -0,0 +1,30 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package whitespace_tokenizer
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// Name is the identifier under which this tokenizer is registered.
const Name = "whitespace"

// whitespaceTokenizerRegexp matches either a single Han / Hangul /
// Hiragana / Katakana character, or a maximal run of characters that
// are not separators (\p{Z}), punctuation (\p{P}), control characters
// (\p{C}), or any of those CJK scripts — so CJK text yields one token
// per character while other text yields whole "words".
var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|[^\p{Z}\p{P}\p{C}\p{Han}\p{Hangul}\p{Hiragana}\p{Katakana}]+`)
|
||||
|
||||
func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil
|
||||
}
|
||||
|
||||
// init registers the tokenizer under its canonical name so analyzers
// can resolve it through the bleve registry.
func init() {
	registry.RegisterTokenizer(Name, TokenizerConstructor)
}
|
150
vendor/github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer_test.go
generated
vendored
Normal file
150
vendor/github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer_test.go
generated
vendored
Normal file
|
@ -0,0 +1,150 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package whitespace_tokenizer
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
|
||||
)
|
||||
|
||||
// TestBoundary exercises the whitespace tokenizer regexp over a table
// of inputs: English with trailing punctuation (the period is
// dropped), Japanese text (one Ideographic token per rune, offsets
// counted in bytes), empty input (expects an empty but non-nil
// stream), and a mixed Latin/CJK word split at the script boundary.
func TestBoundary(t *testing.T) {

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			[]byte("Hello World."),
			analysis.TokenStream{
				{
					Start:    0,
					End:      5,
					Term:     []byte("Hello"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    6,
					End:      11,
					Term:     []byte("World"),
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
		// Japanese: each rune is 3 bytes in UTF-8, hence offsets 0,3,6,...
		{
			[]byte("こんにちは世界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("こ"),
					Position: 1,
					Type:     analysis.Ideographic,
				},
				{
					Start:    3,
					End:      6,
					Term:     []byte("ん"),
					Position: 2,
					Type:     analysis.Ideographic,
				},
				{
					Start:    6,
					End:      9,
					Term:     []byte("に"),
					Position: 3,
					Type:     analysis.Ideographic,
				},
				{
					Start:    9,
					End:      12,
					Term:     []byte("ち"),
					Position: 4,
					Type:     analysis.Ideographic,
				},
				{
					Start:    12,
					End:      15,
					Term:     []byte("は"),
					Position: 5,
					Type:     analysis.Ideographic,
				},
				{
					Start:    15,
					End:      18,
					Term:     []byte("世"),
					Position: 6,
					Type:     analysis.Ideographic,
				},
				{
					Start:    18,
					End:      21,
					Term:     []byte("界"),
					Position: 7,
					Type:     analysis.Ideographic,
				},
			},
		},
		// Empty input: an empty, non-nil token stream.
		{
			[]byte(""),
			analysis.TokenStream{},
		},
		// Latin run and CJK character split into separate tokens.
		{
			[]byte("abc界"),
			analysis.TokenStream{
				{
					Start:    0,
					End:      3,
					Term:     []byte("abc"),
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Start:    3,
					End:      6,
					Term:     []byte("界"),
					Position: 2,
					Type:     analysis.Ideographic,
				},
			},
		},
	}

	for _, test := range tests {
		tokenizer := regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp)
		actual := tokenizer.Tokenize(test.input)

		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
		}
	}
}
|
||||
|
||||
// sampleLargeInput is a multi-paragraph English corpus used by the
// benchmark below to exercise the tokenizer on realistic prose.
var sampleLargeInput = []byte(`There are three characteristics of liquids which are relevant to the discussion of a BLEVE:
If a liquid in a sealed container is boiled, the pressure inside the container increases. As the liquid changes to a gas it expands - this expansion in a vented container would cause the gas and liquid to take up more space. In a sealed container the gas and liquid are not able to take up more space and so the pressure rises. Pressurized vessels containing liquids can reach an equilibrium where the liquid stops boiling and the pressure stops rising. This occurs when no more heat is being added to the system (either because it has reached ambient temperature or has had a heat source removed).
The boiling temperature of a liquid is dependent on pressure - high pressures will yield high boiling temperatures, and low pressures will yield low boiling temperatures. A common simple experiment is to place a cup of water in a vacuum chamber, and then reduce the pressure in the chamber until the water boils. By reducing the pressure the water will boil even at room temperature. This works both ways - if the pressure is increased beyond normal atmospheric pressures, the boiling of hot water could be suppressed far beyond normal temperatures. The cooling system of a modern internal combustion engine is a real-world example.
When a liquid boils it turns into a gas. The resulting gas takes up far more space than the liquid did.
Typically, a BLEVE starts with a container of liquid which is held above its normal, atmospheric-pressure boiling temperature. Many substances normally stored as liquids, such as CO2, oxygen, and other similar industrial gases have boiling temperatures, at atmospheric pressure, far below room temperature. In the case of water, a BLEVE could occur if a pressurized chamber of water is heated far beyond the standard 100 °C (212 °F). That container, because the boiling water pressurizes it, is capable of holding liquid water at very high temperatures.
If the pressurized vessel, containing liquid at high temperature (which may be room temperature, depending on the substance) ruptures, the pressure which prevents the liquid from boiling is lost. If the rupture is catastrophic, where the vessel is immediately incapable of holding any pressure at all, then there suddenly exists a large mass of liquid which is at very high temperature and very low pressure. This causes the entire volume of liquid to instantaneously boil, which in turn causes an extremely rapid expansion. Depending on temperatures, pressures and the substance involved, that expansion may be so rapid that it can be classified as an explosion, fully capable of inflicting severe damage on its surroundings.`)
|
||||
|
||||
func BenchmarkTokenizeEnglishText(b *testing.B) {
|
||||
|
||||
tokenizer := regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp)
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
tokenizer.Tokenize(sampleLargeInput)
|
||||
}
|
||||
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue