Ethan Koenig
6 years ago
committed by
Lauris BH
14 changed files with 704 additions and 97 deletions
@ -0,0 +1,53 @@ |
|||
// Copyright (c) 2018 Couchbase, Inc.
|
|||
//
|
|||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|||
// you may not use this file except in compliance with the License.
|
|||
// You may obtain a copy of the License at
|
|||
//
|
|||
// http://www.apache.org/licenses/LICENSE-2.0
|
|||
//
|
|||
// Unless required by applicable law or agreed to in writing, software
|
|||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
// See the License for the specific language governing permissions and
|
|||
// limitations under the License.
|
|||
|
|||
package unique |
|||
|
|||
import ( |
|||
"github.com/blevesearch/bleve/analysis" |
|||
"github.com/blevesearch/bleve/registry" |
|||
) |
|||
|
|||
// Name is the identifier under which the unique-term token filter is
// registered with the bleve registry.
const Name = "unique"
|||
|
|||
// UniqueTermFilter is a token filter that keeps only the first token seen for
// each distinct term; any later token carrying an already-seen term is
// discarded. The filter itself holds no state.
type UniqueTermFilter struct{}
|||
|
|||
func NewUniqueTermFilter() *UniqueTermFilter { |
|||
return &UniqueTermFilter{} |
|||
} |
|||
|
|||
func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream { |
|||
encounteredTerms := make(map[string]struct{}, len(input)/4) |
|||
j := 0 |
|||
for _, token := range input { |
|||
term := string(token.Term) |
|||
if _, ok := encounteredTerms[term]; ok { |
|||
continue |
|||
} |
|||
encounteredTerms[term] = struct{}{} |
|||
input[j] = token |
|||
j++ |
|||
} |
|||
return input[:j] |
|||
} |
|||
|
|||
func UniqueTermFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { |
|||
return NewUniqueTermFilter(), nil |
|||
} |
|||
|
|||
func init() { |
|||
registry.RegisterTokenFilter(Name, UniqueTermFilterConstructor) |
|||
} |
@ -0,0 +1,173 @@ |
|||
# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. |
|||
|
|||
|
|||
[[projects]] |
|||
name = "github.com/RoaringBitmap/roaring" |
|||
packages = ["."] |
|||
revision = "84551f0e309d6f9bafa428ef39b31ab7f16ff7b8" |
|||
version = "v0.4.1" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/Smerity/govarint" |
|||
packages = ["."] |
|||
revision = "7265e41f48f15fd61751e16da866af3c704bb3ab" |
|||
|
|||
[[projects]] |
|||
name = "github.com/blevesearch/bleve" |
|||
packages = [ |
|||
".", |
|||
"analysis", |
|||
"analysis/analyzer/standard", |
|||
"analysis/datetime/flexible", |
|||
"analysis/datetime/optional", |
|||
"analysis/lang/en", |
|||
"analysis/token/lowercase", |
|||
"analysis/token/porter", |
|||
"analysis/token/stop", |
|||
"analysis/tokenizer/unicode", |
|||
"document", |
|||
"geo", |
|||
"index", |
|||
"index/scorch", |
|||
"index/scorch/mergeplan", |
|||
"index/scorch/segment", |
|||
"index/scorch/segment/mem", |
|||
"index/scorch/segment/zap", |
|||
"index/store", |
|||
"index/store/boltdb", |
|||
"index/store/gtreap", |
|||
"index/upsidedown", |
|||
"mapping", |
|||
"numeric", |
|||
"registry", |
|||
"search", |
|||
"search/collector", |
|||
"search/facet", |
|||
"search/highlight", |
|||
"search/highlight/format/html", |
|||
"search/highlight/fragmenter/simple", |
|||
"search/highlight/highlighter/html", |
|||
"search/highlight/highlighter/simple", |
|||
"search/query", |
|||
"search/scorer", |
|||
"search/searcher" |
|||
] |
|||
revision = "a3b125508b4443344b596888ca58467b6c9310b9" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/blevesearch/go-porterstemmer" |
|||
packages = ["."] |
|||
revision = "23a2c8e5cf1f380f27722c6d2ae8896431dc7d0e" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/blevesearch/segment" |
|||
packages = ["."] |
|||
revision = "762005e7a34fd909a84586299f1dd457371d36ee" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/boltdb/bolt" |
|||
packages = ["."] |
|||
revision = "9da31745363232bc1e27dbab3569e77383a51585" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/couchbase/vellum" |
|||
packages = [ |
|||
".", |
|||
"regexp", |
|||
"utf8" |
|||
] |
|||
revision = "ed84a675e24ed0a0bf6859b1ddec7e7c858354bd" |
|||
|
|||
[[projects]] |
|||
name = "github.com/davecgh/go-spew" |
|||
packages = ["spew"] |
|||
revision = "346938d642f2ec3594ed81d874461961cd0faa76" |
|||
version = "v1.1.0" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/edsrzf/mmap-go" |
|||
packages = ["."] |
|||
revision = "0bce6a6887123b67a60366d2c9fe2dfb74289d2e" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/glycerine/go-unsnap-stream" |
|||
packages = ["."] |
|||
revision = "62a9a9eb44fd8932157b1a8ace2149eff5971af6" |
|||
|
|||
[[projects]] |
|||
name = "github.com/golang/protobuf" |
|||
packages = ["proto"] |
|||
revision = "925541529c1fa6821df4e44ce2723319eb2be768" |
|||
version = "v1.0.0" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/golang/snappy" |
|||
packages = ["."] |
|||
revision = "553a641470496b2327abcac10b36396bd98e45c9" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/mschoch/smat" |
|||
packages = ["."] |
|||
revision = "90eadee771aeab36e8bf796039b8c261bebebe4f" |
|||
|
|||
[[projects]] |
|||
name = "github.com/philhofer/fwd" |
|||
packages = ["."] |
|||
revision = "bb6d471dc95d4fe11e432687f8b70ff496cf3136" |
|||
version = "v1.0.0" |
|||
|
|||
[[projects]] |
|||
name = "github.com/pmezard/go-difflib" |
|||
packages = ["difflib"] |
|||
revision = "792786c7400a136282c1664665ae0a8db921c6c2" |
|||
version = "v1.0.0" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/steveyen/gtreap" |
|||
packages = ["."] |
|||
revision = "0abe01ef9be25c4aedc174758ec2d917314d6d70" |
|||
|
|||
[[projects]] |
|||
name = "github.com/stretchr/testify" |
|||
packages = ["assert"] |
|||
revision = "12b6f73e6084dad08a7c6e575284b177ecafbc71" |
|||
version = "v1.2.1" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/tinylib/msgp" |
|||
packages = ["msgp"] |
|||
revision = "03a79185462ad029a6e7e05b2f3f3e0498d0a6c0" |
|||
|
|||
[[projects]] |
|||
branch = "master" |
|||
name = "github.com/willf/bitset" |
|||
packages = ["."] |
|||
revision = "1a37ad96e8c1a11b20900a232874843b5174221f" |
|||
|
|||
[[projects]] |
|||
name = "golang.org/x/net" |
|||
packages = ["context"] |
|||
revision = "309822c5b9b9f80db67f016069a12628d94fad34" |
|||
|
|||
[[projects]] |
|||
name = "golang.org/x/sys" |
|||
packages = ["unix"] |
|||
revision = "3dbebcf8efb6a5011a60c2b4591c1022a759af8a" |
|||
|
|||
[solve-meta] |
|||
analyzer-name = "dep" |
|||
analyzer-version = 1 |
|||
inputs-digest = "61c759f0c1136cadf86ae8a30bb78edf33fc844cdcb2316469b4ae14a8d051b0" |
|||
solver-name = "gps-cdcl" |
|||
solver-version = 1 |
@ -0,0 +1,34 @@ |
|||
# Gopkg.toml example |
|||
# |
|||
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md |
|||
# for detailed Gopkg.toml documentation. |
|||
# |
|||
# required = ["github.com/user/thing/cmd/thing"] |
|||
# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] |
|||
# |
|||
# [[constraint]] |
|||
# name = "github.com/user/project" |
|||
# version = "1.0.0" |
|||
# |
|||
# [[constraint]] |
|||
# name = "github.com/user/project2" |
|||
# branch = "dev" |
|||
# source = "github.com/myfork/project2" |
|||
# |
|||
# [[override]] |
|||
# name = "github.com/x/y" |
|||
# version = "2.4.0" |
|||
# |
|||
# [prune] |
|||
# non-go = false |
|||
# go-tests = true |
|||
# unused-packages = true |
|||
|
|||
|
|||
[[constraint]] |
|||
name = "github.com/stretchr/testify" |
|||
version = "1.2.1" |
|||
|
|||
[prune] |
|||
go-tests = true |
|||
unused-packages = true |
@ -0,0 +1,21 @@ |
|||
MIT License |
|||
|
|||
Copyright (c) 2018 Ethan Koenig |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a copy |
|||
of this software and associated documentation files (the "Software"), to deal |
|||
in the Software without restriction, including without limitation the rights |
|||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
|||
copies of the Software, and to permit persons to whom the Software is |
|||
furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in all |
|||
copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
|||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|||
SOFTWARE. |
@ -0,0 +1,13 @@ |
|||
# rupture |
|||
|
|||
[![Build Status](https://travis-ci.org/ethantkoenig/rupture.svg?branch=master)](https://travis-ci.org/ethantkoenig/rupture) [![GoDoc](https://godoc.org/github.com/ethantkoenig/rupture?status.svg)](https://godoc.org/github.com/ethantkoenig/rupture) [![Go Report Card](https://goreportcard.com/badge/github.com/ethantkoenig/rupture)](https://goreportcard.com/report/github.com/ethantkoenig/rupture) |
|||
|
|||
An explosive companion to the [bleve indexing library](https://www.github.com/blevesearch/bleve) |
|||
|
|||
## Features |
|||
|
|||
`rupture` includes the following additions to `bleve`: |
|||
|
|||
- __Flushing batches__: Batches of operations that automatically flush to the underlying bleve index. |
|||
- __Sharded indices__: An index-like abstraction built on top of several underlying indices. Sharded indices provide lower write latencies for indices with large amounts of data. |
|||
- __Index metadata__: Track index version for easily managing migrations and schema changes. |
@ -0,0 +1,67 @@ |
|||
package rupture |
|||
|
|||
import ( |
|||
"github.com/blevesearch/bleve" |
|||
) |
|||
|
|||
// FlushingBatch accumulates index operations and automatically flushes them
// to the underlying index once the batch reaches a certain size.
type FlushingBatch interface {
	// Index adds the specified index operation to the batch, possibly
	// triggering a flush.
	Index(id string, data interface{}) error

	// Delete adds the specified delete operation to the batch, possibly
	// triggering a flush.
	Delete(id string) error

	// Flush flushes the batch's contents.
	Flush() error
}
|||
|
|||
type singleIndexFlushingBatch struct { |
|||
maxBatchSize int |
|||
batch *bleve.Batch |
|||
index bleve.Index |
|||
} |
|||
|
|||
func newFlushingBatch(index bleve.Index, maxBatchSize int) *singleIndexFlushingBatch { |
|||
return &singleIndexFlushingBatch{ |
|||
maxBatchSize: maxBatchSize, |
|||
batch: index.NewBatch(), |
|||
index: index, |
|||
} |
|||
} |
|||
|
|||
// NewFlushingBatch creates a new flushing batch for the specified index. Once
|
|||
// the number of operations in the batch reaches the specified limit, the batch
|
|||
// automatically flushes its operations to the index.
|
|||
func NewFlushingBatch(index bleve.Index, maxBatchSize int) FlushingBatch { |
|||
return newFlushingBatch(index, maxBatchSize) |
|||
} |
|||
|
|||
func (b *singleIndexFlushingBatch) Index(id string, data interface{}) error { |
|||
if err := b.batch.Index(id, data); err != nil { |
|||
return err |
|||
} |
|||
return b.flushIfFull() |
|||
} |
|||
|
|||
func (b *singleIndexFlushingBatch) Delete(id string) error { |
|||
b.batch.Delete(id) |
|||
return b.flushIfFull() |
|||
} |
|||
|
|||
func (b *singleIndexFlushingBatch) flushIfFull() error { |
|||
if b.batch.Size() < b.maxBatchSize { |
|||
return nil |
|||
} |
|||
return b.Flush() |
|||
} |
|||
|
|||
func (b *singleIndexFlushingBatch) Flush() error { |
|||
err := b.index.Batch(b.batch) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
b.batch.Reset() |
|||
return nil |
|||
} |
@ -0,0 +1,68 @@ |
|||
package rupture |
|||
|
|||
import ( |
|||
"encoding/json" |
|||
"io/ioutil" |
|||
"os" |
|||
"path/filepath" |
|||
) |
|||
|
|||
// metaFilename is the name of the file, inside an index directory, that holds
// the user-visible index metadata.
const metaFilename = "rupture_meta.json"

// indexMetadataPath returns the location of the metadata file for the index
// stored in dir.
func indexMetadataPath(dir string) string {
	metaPath := filepath.Join(dir, metaFilename)
	return metaPath
}
|||
|
|||
// IndexMetadata is the user-visible metadata stored alongside a bleve index.
type IndexMetadata struct {
	// Version is the version of the data held in the index. It is serialized
	// under the JSON key "version" and can be used to track schema changes or
	// data migrations.
	Version int `json:"version"`
}
|||
|
|||
// Besides the user-exposed metadata, sharded indices keep a second,
// internal-only metadata file; shardedMetadataFilename is its name.
const shardedMetadataFilename = "rupture_sharded_meta.json"

// shardedIndexMetadataPath returns the location of the internal metadata file
// for the sharded index rooted at dir.
func shardedIndexMetadataPath(dir string) string {
	metaPath := filepath.Join(dir, shardedMetadataFilename)
	return metaPath
}
|||
|
|||
// shardedIndexMetadata is the internal metadata persisted for a sharded
// index.
type shardedIndexMetadata struct {
	// NumShards is the number of underlying shard indices, serialized under
	// the JSON key "num_shards".
	NumShards int `json:"num_shards"`
}
|||
|
|||
// readJSON loads the file at path and decodes its JSON content into meta,
// which must be a pointer accepted by json.Unmarshal. The read error or the
// decode error is returned unchanged.
func readJSON(path string, meta interface{}) error {
	raw, readErr := ioutil.ReadFile(path)
	if readErr == nil {
		return json.Unmarshal(raw, meta)
	}
	return readErr
}
|||
|
|||
// writeJSON encodes meta as JSON and writes it to the file at path, replacing
// any existing file. The parent directory of path is created first if it does
// not exist yet, so metadata can be written for an index whose directory has
// not been created.
//
// The value is marshalled before anything touches the filesystem, so an
// encoding error leaves no directory or file behind.
func writeJSON(path string, meta interface{}) error {
	metaBytes, err := json.Marshal(meta)
	if err != nil {
		return err
	}
	// 0777 mirrors the permissive 0666 used for the file itself; the
	// process umask narrows both.
	if err := os.MkdirAll(filepath.Dir(path), 0777); err != nil {
		return err
	}
	return ioutil.WriteFile(path, metaBytes, 0666)
}
|||
|
|||
// ReadIndexMetadata returns the metadata for the index at the specified path.
|
|||
// If no such index metadata exists, an empty metadata and a nil error are
|
|||
// returned.
|
|||
func ReadIndexMetadata(path string) (*IndexMetadata, error) { |
|||
meta := &IndexMetadata{} |
|||
metaPath := indexMetadataPath(path) |
|||
if _, err := os.Stat(metaPath); os.IsNotExist(err) { |
|||
return meta, nil |
|||
} else if err != nil { |
|||
return nil, err |
|||
} |
|||
return meta, readJSON(metaPath, meta) |
|||
} |
|||
|
|||
// WriteIndexMetadata writes metadata for the index at the specified path.
|
|||
func WriteIndexMetadata(path string, meta *IndexMetadata) error { |
|||
return writeJSON(indexMetadataPath(path), meta) |
|||
} |
@ -0,0 +1,146 @@ |
|||
package rupture |
|||
|
|||
import ( |
|||
"fmt" |
|||
"hash/fnv" |
|||
"path/filepath" |
|||
"strconv" |
|||
|
|||
"github.com/blevesearch/bleve" |
|||
"github.com/blevesearch/bleve/document" |
|||
"github.com/blevesearch/bleve/mapping" |
|||
) |
|||
|
|||
// ShardedIndex an index that is built onto of multiple underlying bleve
|
|||
// indices (i.e. shards). Similar to bleve's index aliases, some methods may
|
|||
// not be supported.
|
|||
type ShardedIndex interface { |
|||
bleve.Index |
|||
shards() []bleve.Index |
|||
} |
|||
|
|||
// a type alias for bleve.Index, so that the anonymous field of
|
|||
// shardedIndex does not conflict with the Index(..) method.
|
|||
type bleveIndex bleve.Index |
|||
|
|||
type shardedIndex struct { |
|||
bleveIndex |
|||
indices []bleve.Index |
|||
} |
|||
|
|||
// hash maps id onto a bucket in [0, n) by reducing its 64-bit FNV-1 hash
// modulo n. n must be positive; n == 0 panics with a division by zero.
func hash(id string, n int) uint64 {
	hasher := fnv.New64()
	hasher.Write([]byte(id))
	sum := hasher.Sum64()
	buckets := uint64(n)
	return sum % buckets
}
|||
|
|||
// childIndexerPath returns the directory of shard number i of the sharded
// index rooted at rootPath: the decimal shard number joined onto rootPath.
func childIndexerPath(rootPath string, i int) string {
	shardDir := strconv.Itoa(i)
	return filepath.Join(rootPath, shardDir)
}
|||
|
|||
// NewShardedIndex creates a sharded index at the specified path, with the
|
|||
// specified mapping and number of shards.
|
|||
func NewShardedIndex(path string, mapping mapping.IndexMapping, numShards int) (ShardedIndex, error) { |
|||
if numShards <= 0 { |
|||
return nil, fmt.Errorf("Invalid number of shards: %d", numShards) |
|||
} |
|||
err := writeJSON(shardedIndexMetadataPath(path), &shardedIndexMetadata{NumShards: numShards}) |
|||
if err != nil { |
|||
return nil, err |
|||
} |
|||
|
|||
s := &shardedIndex{ |
|||
indices: make([]bleve.Index, numShards), |
|||
} |
|||
for i := 0; i < numShards; i++ { |
|||
s.indices[i], err = bleve.New(childIndexerPath(path, i), mapping) |
|||
if err != nil { |
|||
return nil, err |
|||
} |
|||
} |
|||
s.bleveIndex = bleve.NewIndexAlias(s.indices...) |
|||
return s, nil |
|||
} |
|||
|
|||
// OpenShardedIndex opens a sharded index at the specified path.
|
|||
func OpenShardedIndex(path string) (ShardedIndex, error) { |
|||
var meta shardedIndexMetadata |
|||
var err error |
|||
if err = readJSON(shardedIndexMetadataPath(path), &meta); err != nil { |
|||
return nil, err |
|||
} |
|||
|
|||
s := &shardedIndex{ |
|||
indices: make([]bleve.Index, meta.NumShards), |
|||
} |
|||
for i := 0; i < meta.NumShards; i++ { |
|||
s.indices[i], err = bleve.Open(childIndexerPath(path, i)) |
|||
if err != nil { |
|||
return nil, err |
|||
} |
|||
} |
|||
s.bleveIndex = bleve.NewIndexAlias(s.indices...) |
|||
return s, nil |
|||
} |
|||
|
|||
func (s *shardedIndex) Index(id string, data interface{}) error { |
|||
return s.indices[hash(id, len(s.indices))].Index(id, data) |
|||
} |
|||
|
|||
func (s *shardedIndex) Delete(id string) error { |
|||
return s.indices[hash(id, len(s.indices))].Delete(id) |
|||
} |
|||
|
|||
func (s *shardedIndex) Document(id string) (*document.Document, error) { |
|||
return s.indices[hash(id, len(s.indices))].Document(id) |
|||
} |
|||
|
|||
func (s *shardedIndex) Close() error { |
|||
if err := s.bleveIndex.Close(); err != nil { |
|||
return err |
|||
} |
|||
for _, index := range s.indices { |
|||
if err := index.Close(); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
func (s *shardedIndex) shards() []bleve.Index { |
|||
return s.indices |
|||
} |
|||
|
|||
type shardedIndexFlushingBatch struct { |
|||
batches []*singleIndexFlushingBatch |
|||
} |
|||
|
|||
// NewShardedFlushingBatch creates a flushing batch with the specified batch
|
|||
// size for the specified sharded index.
|
|||
func NewShardedFlushingBatch(index ShardedIndex, maxBatchSize int) FlushingBatch { |
|||
indices := index.shards() |
|||
b := &shardedIndexFlushingBatch{ |
|||
batches: make([]*singleIndexFlushingBatch, len(indices)), |
|||
} |
|||
for i, index := range indices { |
|||
b.batches[i] = newFlushingBatch(index, maxBatchSize) |
|||
} |
|||
return b |
|||
} |
|||
|
|||
func (b *shardedIndexFlushingBatch) Index(id string, data interface{}) error { |
|||
return b.batches[hash(id, len(b.batches))].Index(id, data) |
|||
} |
|||
|
|||
func (b *shardedIndexFlushingBatch) Delete(id string) error { |
|||
return b.batches[hash(id, len(b.batches))].Delete(id) |
|||
} |
|||
|
|||
func (b *shardedIndexFlushingBatch) Flush() error { |
|||
for _, batch := range b.batches { |
|||
if err := batch.Flush(); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
return nil |
|||
} |
Loading…
Reference in new issue