292 lines
10 KiB
Go
292 lines
10 KiB
Go
// Copyright (c) 2014 Couchbase, Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
|
// except in compliance with the License. You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|
// either express or implied. See the License for the specific language governing permissions
|
|
// and limitations under the License.
|
|
|
|
package in
|
|
|
|
import (
|
|
"unicode"
|
|
|
|
"github.com/blevesearch/bleve/analysis"
|
|
"github.com/willf/bitset"
|
|
)
|
|
|
|
type ScriptData struct {
|
|
flag rune
|
|
base rune
|
|
decompMask *bitset.BitSet
|
|
}
|
|
|
|
var scripts = map[*unicode.RangeTable]*ScriptData{
|
|
unicode.Devanagari: &ScriptData{
|
|
flag: 1,
|
|
base: 0x0900,
|
|
},
|
|
unicode.Bengali: &ScriptData{
|
|
flag: 2,
|
|
base: 0x0980,
|
|
},
|
|
unicode.Gurmukhi: &ScriptData{
|
|
flag: 4,
|
|
base: 0x0A00,
|
|
},
|
|
unicode.Gujarati: &ScriptData{
|
|
flag: 8,
|
|
base: 0x0A80,
|
|
},
|
|
unicode.Oriya: &ScriptData{
|
|
flag: 16,
|
|
base: 0x0B00,
|
|
},
|
|
unicode.Tamil: &ScriptData{
|
|
flag: 32,
|
|
base: 0x0B80,
|
|
},
|
|
unicode.Telugu: &ScriptData{
|
|
flag: 64,
|
|
base: 0x0C00,
|
|
},
|
|
unicode.Kannada: &ScriptData{
|
|
flag: 128,
|
|
base: 0x0C80,
|
|
},
|
|
unicode.Malayalam: &ScriptData{
|
|
flag: 256,
|
|
base: 0x0D00,
|
|
},
|
|
}
|
|
|
|
func flag(ub *unicode.RangeTable) rune {
|
|
return scripts[ub].flag
|
|
}
|
|
|
|
var decompositions = [][]rune{
|
|
/* devanagari, gujarati vowel candra O */
|
|
{0x05, 0x3E, 0x45, 0x11, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari short O */
|
|
{0x05, 0x3E, 0x46, 0x12, flag(unicode.Devanagari)},
|
|
/* devanagari, gujarati letter O */
|
|
{0x05, 0x3E, 0x47, 0x13, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari letter AI, gujarati letter AU */
|
|
{0x05, 0x3E, 0x48, 0x14, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari, bengali, gurmukhi, gujarati, oriya AA */
|
|
{0x05, 0x3E, -1, 0x06, flag(unicode.Devanagari) | flag(unicode.Bengali) | flag(unicode.Gurmukhi) | flag(unicode.Gujarati) | flag(unicode.Oriya)},
|
|
/* devanagari letter candra A */
|
|
{0x05, 0x45, -1, 0x72, flag(unicode.Devanagari)},
|
|
/* gujarati vowel candra E */
|
|
{0x05, 0x45, -1, 0x0D, flag(unicode.Gujarati)},
|
|
/* devanagari letter short A */
|
|
{0x05, 0x46, -1, 0x04, flag(unicode.Devanagari)},
|
|
/* gujarati letter E */
|
|
{0x05, 0x47, -1, 0x0F, flag(unicode.Gujarati)},
|
|
/* gurmukhi, gujarati letter AI */
|
|
{0x05, 0x48, -1, 0x10, flag(unicode.Gurmukhi) | flag(unicode.Gujarati)},
|
|
/* devanagari, gujarati vowel candra O */
|
|
{0x05, 0x49, -1, 0x11, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari short O */
|
|
{0x05, 0x4A, -1, 0x12, flag(unicode.Devanagari)},
|
|
/* devanagari, gujarati letter O */
|
|
{0x05, 0x4B, -1, 0x13, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari letter AI, gurmukhi letter AU, gujarati letter AU */
|
|
{0x05, 0x4C, -1, 0x14, flag(unicode.Devanagari) | flag(unicode.Gurmukhi) | flag(unicode.Gujarati)},
|
|
/* devanagari, gujarati vowel candra O */
|
|
{0x06, 0x45, -1, 0x11, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari short O */
|
|
{0x06, 0x46, -1, 0x12, flag(unicode.Devanagari)},
|
|
/* devanagari, gujarati letter O */
|
|
{0x06, 0x47, -1, 0x13, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari letter AI, gujarati letter AU */
|
|
{0x06, 0x48, -1, 0x14, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* malayalam letter II */
|
|
{0x07, 0x57, -1, 0x08, flag(unicode.Malayalam)},
|
|
/* devanagari letter UU */
|
|
{0x09, 0x41, -1, 0x0A, flag(unicode.Devanagari)},
|
|
/* tamil, malayalam letter UU (some styles) */
|
|
{0x09, 0x57, -1, 0x0A, flag(unicode.Tamil) | flag(unicode.Malayalam)},
|
|
/* malayalam letter AI */
|
|
{0x0E, 0x46, -1, 0x10, flag(unicode.Malayalam)},
|
|
/* devanagari candra E */
|
|
{0x0F, 0x45, -1, 0x0D, flag(unicode.Devanagari)},
|
|
/* devanagari short E */
|
|
{0x0F, 0x46, -1, 0x0E, flag(unicode.Devanagari)},
|
|
/* devanagari AI */
|
|
{0x0F, 0x47, -1, 0x10, flag(unicode.Devanagari)},
|
|
/* oriya AI */
|
|
{0x0F, 0x57, -1, 0x10, flag(unicode.Oriya)},
|
|
/* malayalam letter OO */
|
|
{0x12, 0x3E, -1, 0x13, flag(unicode.Malayalam)},
|
|
/* telugu, kannada letter AU */
|
|
{0x12, 0x4C, -1, 0x14, flag(unicode.Telugu) | flag(unicode.Kannada)},
|
|
/* telugu letter OO */
|
|
{0x12, 0x55, -1, 0x13, flag(unicode.Telugu)},
|
|
/* tamil, malayalam letter AU */
|
|
{0x12, 0x57, -1, 0x14, flag(unicode.Tamil) | flag(unicode.Malayalam)},
|
|
/* oriya letter AU */
|
|
{0x13, 0x57, -1, 0x14, flag(unicode.Oriya)},
|
|
/* devanagari qa */
|
|
{0x15, 0x3C, -1, 0x58, flag(unicode.Devanagari)},
|
|
/* devanagari, gurmukhi khha */
|
|
{0x16, 0x3C, -1, 0x59, flag(unicode.Devanagari) | flag(unicode.Gurmukhi)},
|
|
/* devanagari, gurmukhi ghha */
|
|
{0x17, 0x3C, -1, 0x5A, flag(unicode.Devanagari) | flag(unicode.Gurmukhi)},
|
|
/* devanagari, gurmukhi za */
|
|
{0x1C, 0x3C, -1, 0x5B, flag(unicode.Devanagari) | flag(unicode.Gurmukhi)},
|
|
/* devanagari dddha, bengali, oriya rra */
|
|
{0x21, 0x3C, -1, 0x5C, flag(unicode.Devanagari) | flag(unicode.Bengali) | flag(unicode.Oriya)},
|
|
/* devanagari, bengali, oriya rha */
|
|
{0x22, 0x3C, -1, 0x5D, flag(unicode.Devanagari) | flag(unicode.Bengali) | flag(unicode.Oriya)},
|
|
/* malayalam chillu nn */
|
|
{0x23, 0x4D, 0xFF, 0x7A, flag(unicode.Malayalam)},
|
|
/* bengali khanda ta */
|
|
{0x24, 0x4D, 0xFF, 0x4E, flag(unicode.Bengali)},
|
|
/* devanagari nnna */
|
|
{0x28, 0x3C, -1, 0x29, flag(unicode.Devanagari)},
|
|
/* malayalam chillu n */
|
|
{0x28, 0x4D, 0xFF, 0x7B, flag(unicode.Malayalam)},
|
|
/* devanagari, gurmukhi fa */
|
|
{0x2B, 0x3C, -1, 0x5E, flag(unicode.Devanagari) | flag(unicode.Gurmukhi)},
|
|
/* devanagari, bengali yya */
|
|
{0x2F, 0x3C, -1, 0x5F, flag(unicode.Devanagari) | flag(unicode.Bengali)},
|
|
/* telugu letter vocalic R */
|
|
{0x2C, 0x41, 0x41, 0x0B, flag(unicode.Telugu)},
|
|
/* devanagari rra */
|
|
{0x30, 0x3C, -1, 0x31, flag(unicode.Devanagari)},
|
|
/* malayalam chillu rr */
|
|
{0x30, 0x4D, 0xFF, 0x7C, flag(unicode.Malayalam)},
|
|
/* malayalam chillu l */
|
|
{0x32, 0x4D, 0xFF, 0x7D, flag(unicode.Malayalam)},
|
|
/* devanagari llla */
|
|
{0x33, 0x3C, -1, 0x34, flag(unicode.Devanagari)},
|
|
/* malayalam chillu ll */
|
|
{0x33, 0x4D, 0xFF, 0x7E, flag(unicode.Malayalam)},
|
|
/* telugu letter MA */
|
|
{0x35, 0x41, -1, 0x2E, flag(unicode.Telugu)},
|
|
/* devanagari, gujarati vowel sign candra O */
|
|
{0x3E, 0x45, -1, 0x49, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari vowel sign short O */
|
|
{0x3E, 0x46, -1, 0x4A, flag(unicode.Devanagari)},
|
|
/* devanagari, gujarati vowel sign O */
|
|
{0x3E, 0x47, -1, 0x4B, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* devanagari, gujarati vowel sign AU */
|
|
{0x3E, 0x48, -1, 0x4C, flag(unicode.Devanagari) | flag(unicode.Gujarati)},
|
|
/* kannada vowel sign II */
|
|
{0x3F, 0x55, -1, 0x40, flag(unicode.Kannada)},
|
|
/* gurmukhi vowel sign UU (when stacking) */
|
|
{0x41, 0x41, -1, 0x42, flag(unicode.Gurmukhi)},
|
|
/* tamil, malayalam vowel sign O */
|
|
{0x46, 0x3E, -1, 0x4A, flag(unicode.Tamil) | flag(unicode.Malayalam)},
|
|
/* kannada vowel sign OO */
|
|
{0x46, 0x42, 0x55, 0x4B, flag(unicode.Kannada)},
|
|
/* kannada vowel sign O */
|
|
{0x46, 0x42, -1, 0x4A, flag(unicode.Kannada)},
|
|
/* malayalam vowel sign AI (if reordered twice) */
|
|
{0x46, 0x46, -1, 0x48, flag(unicode.Malayalam)},
|
|
/* telugu, kannada vowel sign EE */
|
|
{0x46, 0x55, -1, 0x47, flag(unicode.Telugu) | flag(unicode.Kannada)},
|
|
/* telugu, kannada vowel sign AI */
|
|
{0x46, 0x56, -1, 0x48, flag(unicode.Telugu) | flag(unicode.Kannada)},
|
|
/* tamil, malayalam vowel sign AU */
|
|
{0x46, 0x57, -1, 0x4C, flag(unicode.Tamil) | flag(unicode.Malayalam)},
|
|
/* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
|
|
{0x47, 0x3E, -1, 0x4B, flag(unicode.Bengali) | flag(unicode.Oriya) | flag(unicode.Tamil) | flag(unicode.Malayalam)},
|
|
/* bengali, oriya vowel sign AU */
|
|
{0x47, 0x57, -1, 0x4C, flag(unicode.Bengali) | flag(unicode.Oriya)},
|
|
/* kannada vowel sign OO */
|
|
{0x4A, 0x55, -1, 0x4B, flag(unicode.Kannada)},
|
|
/* gurmukhi letter I */
|
|
{0x72, 0x3F, -1, 0x07, flag(unicode.Gurmukhi)},
|
|
/* gurmukhi letter II */
|
|
{0x72, 0x40, -1, 0x08, flag(unicode.Gurmukhi)},
|
|
/* gurmukhi letter EE */
|
|
{0x72, 0x47, -1, 0x0F, flag(unicode.Gurmukhi)},
|
|
/* gurmukhi letter U */
|
|
{0x73, 0x41, -1, 0x09, flag(unicode.Gurmukhi)},
|
|
/* gurmukhi letter UU */
|
|
{0x73, 0x42, -1, 0x0A, flag(unicode.Gurmukhi)},
|
|
/* gurmukhi letter OO */
|
|
{0x73, 0x4B, -1, 0x13, flag(unicode.Gurmukhi)},
|
|
}
|
|
|
|
func init() {
|
|
for _, scriptData := range scripts {
|
|
scriptData.decompMask = bitset.New(0x7d)
|
|
for _, decomposition := range decompositions {
|
|
ch := decomposition[0]
|
|
flags := decomposition[4]
|
|
if (flags & scriptData.flag) != 0 {
|
|
scriptData.decompMask.Set(uint(ch))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func lookupScript(r rune) *unicode.RangeTable {
|
|
for script, _ := range scripts {
|
|
if unicode.Is(script, r) {
|
|
return script
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func normalize(input []rune) []rune {
|
|
inputLen := len(input)
|
|
for i := 0; i < inputLen; i++ {
|
|
r := input[i]
|
|
script := lookupScript(r)
|
|
if script != nil {
|
|
scriptData := scripts[script]
|
|
ch := r - scriptData.base
|
|
if scriptData.decompMask.Test(uint(ch)) {
|
|
input = compose(ch, script, scriptData, input, i, inputLen)
|
|
inputLen = len(input)
|
|
}
|
|
}
|
|
}
|
|
return input[0:inputLen]
|
|
}
|
|
|
|
func compose(ch0 rune, script0 *unicode.RangeTable, scriptData *ScriptData, input []rune, pos int, inputLen int) []rune {
|
|
if pos+1 >= inputLen {
|
|
return input // need at least 2 characters
|
|
}
|
|
|
|
ch1 := input[pos+1] - scriptData.base
|
|
script1 := lookupScript(input[pos+1])
|
|
if script0 != script1 {
|
|
return input // need to be same script
|
|
}
|
|
|
|
ch2 := rune(-1)
|
|
if pos+2 < inputLen {
|
|
ch2 = input[pos+2] - scriptData.base
|
|
script2 := lookupScript(input[pos+2])
|
|
if input[pos+2] == '\u200D' {
|
|
ch2 = 0xff // zero width joiner
|
|
} else if script2 != script1 {
|
|
ch2 = -1 // still allow 2 character match
|
|
}
|
|
}
|
|
|
|
for _, decomposition := range decompositions {
|
|
if decomposition[0] == ch0 &&
|
|
(decomposition[4]&scriptData.flag) != 0 {
|
|
if decomposition[1] == ch1 &&
|
|
(decomposition[2] < 0 || decomposition[2] == ch2) {
|
|
input[pos] = scriptData.base + decomposition[3]
|
|
input = analysis.DeleteRune(input, pos+1)
|
|
if decomposition[2] >= 0 {
|
|
input = analysis.DeleteRune(input, pos+1)
|
|
}
|
|
return input
|
|
}
|
|
}
|
|
}
|
|
return input
|
|
}
|