package roaring import ( "bytes" "encoding/binary" "fmt" "io" "io/ioutil" snappy "github.com/glycerine/go-unsnap-stream" "github.com/tinylib/msgp/msgp" ) //go:generate msgp -unexported type container interface { addOffset(uint16) []container clone() container and(container) container andCardinality(container) int iand(container) container // i stands for inplace andNot(container) container iandNot(container) container // i stands for inplace getCardinality() int // rank returns the number of integers that are // smaller or equal to x. rank(infinity) would be getCardinality(). rank(uint16) int iadd(x uint16) bool // inplace, returns true if x was new. iaddReturnMinimized(uint16) container // may change return type to minimize storage. //addRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) iaddRange(start, endx int) container // i stands for inplace, range is [firstOfRange,endx) iremove(x uint16) bool // inplace, returns true if x was present. iremoveReturnMinimized(uint16) container // may change return type to minimize storage. not(start, final int) container // range is [firstOfRange,lastOfRange) inot(firstOfRange, endx int) container // i stands for inplace, range is [firstOfRange,endx) xor(r container) container getShortIterator() shortIterable getReverseIterator() shortIterable getManyIterator() manyIterable contains(i uint16) bool maximum() uint16 minimum() uint16 // equals is now logical equals; it does not require the // same underlying container types, but compares across // any of the implementations. equals(r container) bool fillLeastSignificant16bits(array []uint32, i int, mask uint32) or(r container) container orCardinality(r container) int isFull() bool ior(r container) container // i stands for inplace intersects(r container) bool // whether the two containers intersect lazyOR(r container) container lazyIOR(r container) container getSizeInBytes() int //removeRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) iremoveRange(start, final int) container // i stands for inplace, range is [firstOfRange,lastOfRange) selectInt(x uint16) int // selectInt returns the xth integer in the container serializedSizeInBytes() int readFrom(io.Reader) (int, error) writeTo(io.Writer) (int, error) numberOfRuns() int toEfficientContainer() container String() string containerType() contype } type contype uint8 const ( bitmapContype contype = iota arrayContype run16Contype run32Contype ) // careful: range is [firstOfRange,lastOfRange] func rangeOfOnes(start, last int) container { if start > MaxUint16 { panic("rangeOfOnes called with start > MaxUint16") } if last > MaxUint16 { panic("rangeOfOnes called with last > MaxUint16") } if start < 0 { panic("rangeOfOnes called with start < 0") } if last < 0 { panic("rangeOfOnes called with last < 0") } return newRunContainer16Range(uint16(start), uint16(last)) } type roaringArray struct { keys []uint16 containers []container `msg:"-"` // don't try to serialize directly. needCopyOnWrite []bool copyOnWrite bool // conserz is used at serialization time // to serialize containers. Otherwise empty. conserz []containerSerz } // containerSerz facilitates serializing container (tricky to // serialize because it is an interface) by providing a // light wrapper with a type identifier. type containerSerz struct { t contype `msg:"t"` // type r msgp.Raw `msg:"r"` // Raw msgpack of the actual container type } func newRoaringArray() *roaringArray { return &roaringArray{} } // runOptimize compresses the element containers to minimize space consumed. // Q: how does this interact with copyOnWrite and needCopyOnWrite? // A: since we aren't changing the logical content, just the representation, // we don't bother to check the needCopyOnWrite bits. We replace // (possibly all) elements of ra.containers in-place with space // optimized versions. func (ra *roaringArray) runOptimize() { for i := range ra.containers { ra.containers[i] = ra.containers[i].toEfficientContainer() } } func (ra *roaringArray) appendContainer(key uint16, value container, mustCopyOnWrite bool) { ra.keys = append(ra.keys, key) ra.containers = append(ra.containers, value) ra.needCopyOnWrite = append(ra.needCopyOnWrite, mustCopyOnWrite) } func (ra *roaringArray) appendWithoutCopy(sa roaringArray, startingindex int) { mustCopyOnWrite := sa.needCopyOnWrite[startingindex] ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex], mustCopyOnWrite) } func (ra *roaringArray) appendCopy(sa roaringArray, startingindex int) { // cow only if the two request it, or if we already have a lightweight copy copyonwrite := (ra.copyOnWrite && sa.copyOnWrite) || sa.needsCopyOnWrite(startingindex) if !copyonwrite { // since there is no copy-on-write, we need to clone the container (this is important) ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex].clone(), copyonwrite) } else { ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex], copyonwrite) if !sa.needsCopyOnWrite(startingindex) { sa.setNeedsCopyOnWrite(startingindex) } } } func (ra *roaringArray) appendWithoutCopyMany(sa roaringArray, startingindex, end int) { for i := startingindex; i < end; i++ { ra.appendWithoutCopy(sa, i) } } func (ra *roaringArray) appendCopyMany(sa roaringArray, startingindex, end int) { for i := startingindex; i < end; i++ { ra.appendCopy(sa, i) } } func (ra *roaringArray) appendCopiesUntil(sa roaringArray, stoppingKey uint16) { // cow only if the two request it, or if we already have a lightweight copy copyonwrite := ra.copyOnWrite && sa.copyOnWrite for i := 0; i < sa.size(); i++ { if sa.keys[i] >= stoppingKey { break } thiscopyonewrite := copyonwrite || sa.needsCopyOnWrite(i) if thiscopyonewrite { ra.appendContainer(sa.keys[i], sa.containers[i], thiscopyonewrite) if !sa.needsCopyOnWrite(i) { sa.setNeedsCopyOnWrite(i) } } else { // since there is no copy-on-write, we need to clone the container (this is important) ra.appendContainer(sa.keys[i], sa.containers[i].clone(), thiscopyonewrite) } } } func (ra *roaringArray) appendCopiesAfter(sa roaringArray, beforeStart uint16) { // cow only if the two request it, or if we already have a lightweight copy copyonwrite := ra.copyOnWrite && sa.copyOnWrite startLocation := sa.getIndex(beforeStart) if startLocation >= 0 { startLocation++ } else { startLocation = -startLocation - 1 } for i := startLocation; i < sa.size(); i++ { thiscopyonewrite := copyonwrite || sa.needsCopyOnWrite(i) if thiscopyonewrite { ra.appendContainer(sa.keys[i], sa.containers[i], thiscopyonewrite) if !sa.needsCopyOnWrite(i) { sa.setNeedsCopyOnWrite(i) } } else { // since there is no copy-on-write, we need to clone the container (this is important) ra.appendContainer(sa.keys[i], sa.containers[i].clone(), thiscopyonewrite) } } } func (ra *roaringArray) removeIndexRange(begin, end int) { if end <= begin { return } r := end - begin copy(ra.keys[begin:], ra.keys[end:]) copy(ra.containers[begin:], ra.containers[end:]) copy(ra.needCopyOnWrite[begin:], ra.needCopyOnWrite[end:]) ra.resize(len(ra.keys) - r) } func (ra *roaringArray) resize(newsize int) { for k := newsize; k < len(ra.containers); k++ { ra.containers[k] = nil } ra.keys = ra.keys[:newsize] ra.containers = ra.containers[:newsize] ra.needCopyOnWrite = ra.needCopyOnWrite[:newsize] } func (ra *roaringArray) clear() { ra.resize(0) ra.copyOnWrite = false ra.conserz = nil } func (ra *roaringArray) clone() *roaringArray { sa := roaringArray{} sa.copyOnWrite = ra.copyOnWrite // this is where copyOnWrite is used. if ra.copyOnWrite { sa.keys = make([]uint16, len(ra.keys)) copy(sa.keys, ra.keys) sa.containers = make([]container, len(ra.containers)) copy(sa.containers, ra.containers) sa.needCopyOnWrite = make([]bool, len(ra.needCopyOnWrite)) ra.markAllAsNeedingCopyOnWrite() sa.markAllAsNeedingCopyOnWrite() // sa.needCopyOnWrite is shared } else { // make a full copy sa.keys = make([]uint16, len(ra.keys)) copy(sa.keys, ra.keys) sa.containers = make([]container, len(ra.containers)) for i := range sa.containers { sa.containers[i] = ra.containers[i].clone() } sa.needCopyOnWrite = make([]bool, len(ra.needCopyOnWrite)) } return &sa } // unused function: //func (ra *roaringArray) containsKey(x uint16) bool { // return (ra.binarySearch(0, int64(len(ra.keys)), x) >= 0) //} func (ra *roaringArray) getContainer(x uint16) container { i := ra.binarySearch(0, int64(len(ra.keys)), x) if i < 0 { return nil } return ra.containers[i] } func (ra *roaringArray) getContainerAtIndex(i int) container { return ra.containers[i] } func (ra *roaringArray) getFastContainerAtIndex(i int, needsWriteable bool) container { c := ra.getContainerAtIndex(i) switch t := c.(type) { case *arrayContainer: c = t.toBitmapContainer() case *runContainer16: if !t.isFull() { c = t.toBitmapContainer() } case *bitmapContainer: if needsWriteable && ra.needCopyOnWrite[i] { c = ra.containers[i].clone() } } return c } func (ra *roaringArray) getWritableContainerAtIndex(i int) container { if ra.needCopyOnWrite[i] { ra.containers[i] = ra.containers[i].clone() ra.needCopyOnWrite[i] = false } return ra.containers[i] } func (ra *roaringArray) getIndex(x uint16) int { // before the binary search, we optimize for frequent cases size := len(ra.keys) if (size == 0) || (ra.keys[size-1] == x) { return size - 1 } return ra.binarySearch(0, int64(size), x) } func (ra *roaringArray) getKeyAtIndex(i int) uint16 { return ra.keys[i] } func (ra *roaringArray) insertNewKeyValueAt(i int, key uint16, value container) { ra.keys = append(ra.keys, 0) ra.containers = append(ra.containers, nil) copy(ra.keys[i+1:], ra.keys[i:]) copy(ra.containers[i+1:], ra.containers[i:]) ra.keys[i] = key ra.containers[i] = value ra.needCopyOnWrite = append(ra.needCopyOnWrite, false) copy(ra.needCopyOnWrite[i+1:], ra.needCopyOnWrite[i:]) ra.needCopyOnWrite[i] = false } func (ra *roaringArray) remove(key uint16) bool { i := ra.binarySearch(0, int64(len(ra.keys)), key) if i >= 0 { // if a new key ra.removeAtIndex(i) return true } return false } func (ra *roaringArray) removeAtIndex(i int) { copy(ra.keys[i:], ra.keys[i+1:]) copy(ra.containers[i:], ra.containers[i+1:]) copy(ra.needCopyOnWrite[i:], ra.needCopyOnWrite[i+1:]) ra.resize(len(ra.keys) - 1) } func (ra *roaringArray) setContainerAtIndex(i int, c container) { ra.containers[i] = c } func (ra *roaringArray) replaceKeyAndContainerAtIndex(i int, key uint16, c container, mustCopyOnWrite bool) { ra.keys[i] = key ra.containers[i] = c ra.needCopyOnWrite[i] = mustCopyOnWrite } func (ra *roaringArray) size() int { return len(ra.keys) } func (ra *roaringArray) binarySearch(begin, end int64, ikey uint16) int { low := begin high := end - 1 for low+16 <= high { middleIndex := low + (high-low)/2 // avoid overflow middleValue := ra.keys[middleIndex] if middleValue < ikey { low = middleIndex + 1 } else if middleValue > ikey { high = middleIndex - 1 } else { return int(middleIndex) } } for ; low <= high; low++ { val := ra.keys[low] if val >= ikey { if val == ikey { return int(low) } break } } return -int(low + 1) } func (ra *roaringArray) equals(o interface{}) bool { srb, ok := o.(roaringArray) if ok { if srb.size() != ra.size() { return false } for i, k := range ra.keys { if k != srb.keys[i] { return false } } for i, c := range ra.containers { if !c.equals(srb.containers[i]) { return false } } return true } return false } func (ra *roaringArray) headerSize() uint64 { size := uint64(len(ra.keys)) if ra.hasRunCompression() { if size < noOffsetThreshold { // for small bitmaps, we omit the offsets return 4 + (size+7)/8 + 4*size } return 4 + (size+7)/8 + 8*size // - 4 because we pack the size with the cookie } return 4 + 4 + 8*size } // should be dirt cheap func (ra *roaringArray) serializedSizeInBytes() uint64 { answer := ra.headerSize() for _, c := range ra.containers { answer += uint64(c.serializedSizeInBytes()) } return answer } // // spec: https://github.com/RoaringBitmap/RoaringFormatSpec // func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) { hasRun := ra.hasRunCompression() isRunSizeInBytes := 0 cookieSize := 8 if hasRun { cookieSize = 4 isRunSizeInBytes = (len(ra.keys) + 7) / 8 } descriptiveHeaderSize := 4 * len(ra.keys) preambleSize := cookieSize + isRunSizeInBytes + descriptiveHeaderSize buf := make([]byte, preambleSize+4*len(ra.keys)) nw := 0 if hasRun { binary.LittleEndian.PutUint16(buf[0:], uint16(serialCookie)) nw += 2 binary.LittleEndian.PutUint16(buf[2:], uint16(len(ra.keys)-1)) nw += 2 // compute isRun bitmap var ir []byte isRun := newBitmapContainer() for i, c := range ra.containers { switch c.(type) { case *runContainer16: isRun.iadd(uint16(i)) } } // convert to little endian ir = isRun.asLittleEndianByteSlice()[:isRunSizeInBytes] nw += copy(buf[nw:], ir) } else { binary.LittleEndian.PutUint32(buf[0:], uint32(serialCookieNoRunContainer)) nw += 4 binary.LittleEndian.PutUint32(buf[4:], uint32(len(ra.keys))) nw += 4 } // descriptive header for i, key := range ra.keys { binary.LittleEndian.PutUint16(buf[nw:], key) nw += 2 c := ra.containers[i] binary.LittleEndian.PutUint16(buf[nw:], uint16(c.getCardinality()-1)) nw += 2 } startOffset := int64(preambleSize + 4*len(ra.keys)) if !hasRun || (len(ra.keys) >= noOffsetThreshold) { // offset header for _, c := range ra.containers { binary.LittleEndian.PutUint32(buf[nw:], uint32(startOffset)) nw += 4 switch rc := c.(type) { case *runContainer16: startOffset += 2 + int64(len(rc.iv))*4 default: startOffset += int64(getSizeInBytesFromCardinality(c.getCardinality())) } } } written, err := w.Write(buf[:nw]) if err != nil { return n, err } n += int64(written) for _, c := range ra.containers { written, err := c.writeTo(w) if err != nil { return n, err } n += int64(written) } return n, nil } // // spec: https://github.com/RoaringBitmap/RoaringFormatSpec // func (ra *roaringArray) toBytes() ([]byte, error) { var buf bytes.Buffer _, err := ra.writeTo(&buf) return buf.Bytes(), err } func (ra *roaringArray) fromBuffer(buf []byte) (int64, error) { pos := 0 if len(buf) < 8 { return 0, fmt.Errorf("buffer too small, expecting at least 8 bytes, was %d", len(buf)) } cookie := binary.LittleEndian.Uint32(buf) pos += 4 var size uint32 // number of containers haveRunContainers := false var isRunBitmap []byte // cookie header if cookie&0x0000FFFF == serialCookie { haveRunContainers = true size = uint32(uint16(cookie>>16) + 1) // number of containers // create is-run-container bitmap isRunBitmapSize := (int(size) + 7) / 8 if pos+isRunBitmapSize > len(buf) { return 0, fmt.Errorf("malformed bitmap, is-run bitmap overruns buffer at %d", pos+isRunBitmapSize) } isRunBitmap = buf[pos : pos+isRunBitmapSize] pos += isRunBitmapSize } else if cookie == serialCookieNoRunContainer { size = binary.LittleEndian.Uint32(buf[pos:]) pos += 4 } else { return 0, fmt.Errorf("error in roaringArray.readFrom: did not find expected serialCookie in header") } if size > (1 << 16) { return 0, fmt.Errorf("It is logically impossible to have more than (1<<16) containers.") } // descriptive header // keycard - is {key, cardinality} tuple slice if pos+2*2*int(size) > len(buf) { return 0, fmt.Errorf("malfomred bitmap, key-cardinality slice overruns buffer at %d", pos+2*2*int(size)) } keycard := byteSliceAsUint16Slice(buf[pos : pos+2*2*int(size)]) pos += 2 * 2 * int(size) if !haveRunContainers || size >= noOffsetThreshold { pos += 4 * int(size) } // Allocate slices upfront as number of containers is known if cap(ra.containers) >= int(size) { ra.containers = ra.containers[:size] } else { ra.containers = make([]container, size) } if cap(ra.keys) >= int(size) { ra.keys = ra.keys[:size] } else { ra.keys = make([]uint16, size) } if cap(ra.needCopyOnWrite) >= int(size) { ra.needCopyOnWrite = ra.needCopyOnWrite[:size] } else { ra.needCopyOnWrite = make([]bool, size) } for i := uint32(0); i < size; i++ { key := uint16(keycard[2*i]) card := int(keycard[2*i+1]) + 1 ra.keys[i] = key ra.needCopyOnWrite[i] = true if haveRunContainers && isRunBitmap[i/8]&(1<<(i%8)) != 0 { // run container nr := binary.LittleEndian.Uint16(buf[pos:]) pos += 2 if pos+int(nr)*4 > len(buf) { return 0, fmt.Errorf("malformed bitmap, a run container overruns buffer at %d:%d", pos, pos+int(nr)*4) } nb := runContainer16{ iv: byteSliceAsInterval16Slice(buf[pos : pos+int(nr)*4]), card: int64(card), } pos += int(nr) * 4 ra.containers[i] = &nb } else if card > arrayDefaultMaxSize { // bitmap container nb := bitmapContainer{ cardinality: card, bitmap: byteSliceAsUint64Slice(buf[pos : pos+arrayDefaultMaxSize*2]), } pos += arrayDefaultMaxSize * 2 ra.containers[i] = &nb } else { // array container nb := arrayContainer{ byteSliceAsUint16Slice(buf[pos : pos+card*2]), } pos += card * 2 ra.containers[i] = &nb } } return int64(pos), nil } func (ra *roaringArray) readFrom(stream io.Reader) (int64, error) { pos := 0 var cookie uint32 err := binary.Read(stream, binary.LittleEndian, &cookie) if err != nil { return 0, fmt.Errorf("error in roaringArray.readFrom: could not read initial cookie: %s", err) } pos += 4 var size uint32 haveRunContainers := false var isRun *bitmapContainer if cookie&0x0000FFFF == serialCookie { haveRunContainers = true size = uint32(uint16(cookie>>16) + 1) bytesToRead := (int(size) + 7) / 8 numwords := (bytesToRead + 7) / 8 by := make([]byte, bytesToRead, numwords*8) nr, err := io.ReadFull(stream, by) if err != nil { return 8 + int64(nr), fmt.Errorf("error in readFrom: could not read the "+ "runContainer bit flags of length %v bytes: %v", bytesToRead, err) } pos += bytesToRead by = by[:cap(by)] isRun = newBitmapContainer() for i := 0; i < numwords; i++ { isRun.bitmap[i] = binary.LittleEndian.Uint64(by) by = by[8:] } } else if cookie == serialCookieNoRunContainer { err = binary.Read(stream, binary.LittleEndian, &size) if err != nil { return 0, fmt.Errorf("error in roaringArray.readFrom: when reading size, got: %s", err) } pos += 4 } else { return 0, fmt.Errorf("error in roaringArray.readFrom: did not find expected serialCookie in header") } if size > (1 << 16) { return 0, fmt.Errorf("It is logically impossible to have more than (1<<16) containers.") } // descriptive header keycard := make([]uint16, 2*size, 2*size) err = binary.Read(stream, binary.LittleEndian, keycard) if err != nil { return 0, err } pos += 2 * 2 * int(size) // offset header if !haveRunContainers || size >= noOffsetThreshold { io.CopyN(ioutil.Discard, stream, 4*int64(size)) // we never skip ahead so this data can be ignored pos += 4 * int(size) } for i := uint32(0); i < size; i++ { key := int(keycard[2*i]) card := int(keycard[2*i+1]) + 1 if haveRunContainers && isRun.contains(uint16(i)) { nb := newRunContainer16() nr, err := nb.readFrom(stream) if err != nil { return 0, err } pos += nr ra.appendContainer(uint16(key), nb, false) } else if card > arrayDefaultMaxSize { nb := newBitmapContainer() nr, err := nb.readFrom(stream) if err != nil { return 0, err } nb.cardinality = card pos += nr ra.appendContainer(keycard[2*i], nb, false) } else { nb := newArrayContainerSize(card) nr, err := nb.readFrom(stream) if err != nil { return 0, err } pos += nr ra.appendContainer(keycard[2*i], nb, false) } } return int64(pos), nil } func (ra *roaringArray) hasRunCompression() bool { for _, c := range ra.containers { switch c.(type) { case *runContainer16: return true } } return false } func (ra *roaringArray) writeToMsgpack(stream io.Writer) error { ra.conserz = make([]containerSerz, len(ra.containers)) for i, v := range ra.containers { switch cn := v.(type) { case *bitmapContainer: bts, err := cn.MarshalMsg(nil) if err != nil { return err } ra.conserz[i].t = bitmapContype ra.conserz[i].r = bts case *arrayContainer: bts, err := cn.MarshalMsg(nil) if err != nil { return err } ra.conserz[i].t = arrayContype ra.conserz[i].r = bts case *runContainer16: bts, err := cn.MarshalMsg(nil) if err != nil { return err } ra.conserz[i].t = run16Contype ra.conserz[i].r = bts default: panic(fmt.Errorf("Unrecognized container implementation: %T", cn)) } } w := snappy.NewWriter(stream) err := msgp.Encode(w, ra) ra.conserz = nil return err } func (ra *roaringArray) readFromMsgpack(stream io.Reader) error { r := snappy.NewReader(stream) err := msgp.Decode(r, ra) if err != nil { return err } if len(ra.containers) != len(ra.keys) { ra.containers = make([]container, len(ra.keys)) } for i, v := range ra.conserz { switch v.t { case bitmapContype: c := &bitmapContainer{} _, err = c.UnmarshalMsg(v.r) if err != nil { return err } ra.containers[i] = c case arrayContype: c := &arrayContainer{} _, err = c.UnmarshalMsg(v.r) if err != nil { return err } ra.containers[i] = c case run16Contype: c := &runContainer16{} _, err = c.UnmarshalMsg(v.r) if err != nil { return err } ra.containers[i] = c default: return fmt.Errorf("unrecognized contype serialization code: '%v'", v.t) } } ra.conserz = nil return nil } func (ra *roaringArray) advanceUntil(min uint16, pos int) int { lower := pos + 1 if lower >= len(ra.keys) || ra.keys[lower] >= min { return lower } spansize := 1 for lower+spansize < len(ra.keys) && ra.keys[lower+spansize] < min { spansize *= 2 } var upper int if lower+spansize < len(ra.keys) { upper = lower + spansize } else { upper = len(ra.keys) - 1 } if ra.keys[upper] == min { return upper } if ra.keys[upper] < min { // means // array // has no // item // >= min // pos = array.length; return len(ra.keys) } // we know that the next-smallest span was too small lower += (spansize >> 1) mid := 0 for lower+1 != upper { mid = (lower + upper) >> 1 if ra.keys[mid] == min { return mid } else if ra.keys[mid] < min { lower = mid } else { upper = mid } } return upper } func (ra *roaringArray) markAllAsNeedingCopyOnWrite() { for i := range ra.needCopyOnWrite { ra.needCopyOnWrite[i] = true } } func (ra *roaringArray) needsCopyOnWrite(i int) bool { return ra.needCopyOnWrite[i] } func (ra *roaringArray) setNeedsCopyOnWrite(i int) { ra.needCopyOnWrite[i] = true }