posting.go

//  Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ice

import (
	"encoding/binary"
	"fmt"
	"math"

	"github.com/RoaringBitmap/roaring"
	segment "github.com/blugelabs/bluge_segment_api"
)

// FST or vellum value (uint64) encoding is determined by the top two
// highest-order or most significant bits...
//
//  encoding  : MSB
//  name      : 63  62  61...to...bit #0 (LSB)
//  ----------+---+---+---------------------------------------------------
//   general  : 0 | 0 | 62-bits of postingsOffset.
//   ~        : 0 | 1 | reserved for future.
//   1-hit    : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum.
//   ~        : 1 | 1 | reserved for future.
//
// Encoding "general" is able to handle all cases, where the
// postingsOffset points to more information about the postings for
// the term.
//
// Encoding "1-hit" is used to optimize a commonly seen case when a
// term has only a single hit.  For example, a term in the _id field
// will have only 1 hit.  The "1-hit" encoding is used for a term
// in a field when...
//
// - term vector info is disabled for that field;
// - and, the term appears in only a single doc for that field;
// - and, the term's freq is exactly 1 in that single doc for that field;
// - and, the docNum must fit into 31-bits;
//
// Otherwise, the "general" encoding is used instead.
//
// In the "1-hit" encoding, the field in that single doc may have
// other terms, which is supported in the "1-hit" encoding by the
// positive float31 norm.
const (
	// fSTValEncodingMaskRaw is the untyped form of the mask that selects
	// the two most significant bits (the encoding selector) of an FST value.
	fSTValEncodingMaskRaw = 0xc000000000000000
	// fSTValEncodingMask is fSTValEncodingMaskRaw typed as uint64.
	fSTValEncodingMask = uint64(fSTValEncodingMaskRaw)

	// fSTValEncoding1HitRaw is the untyped selector value (top bits "1 0")
	// that marks an FST value as using the "1-hit" encoding.
	fSTValEncoding1HitRaw = 0x8000000000000000
	// fSTValEncoding1Hit is fSTValEncoding1HitRaw typed as uint64.
	fSTValEncoding1Hit = uint64(fSTValEncoding1HitRaw)
)

func fSTValEncode1Hit(docNum, normBits uint64) uint64 {
	return fSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum)
}

// fSTValDecode1Hit splits a "1-hit" encoded FST value into its parts: the
// docNum from the low 31 bits and the norm bits from the next 31 bits.
// The encoding selector bits are ignored.
func fSTValDecode1Hit(v uint64) (docNum, normBits uint64) {
	docNum = v & mask31Bits
	normBits = (v >> 31) & mask31Bits
	return docNum, normBits
}

const (
	// mask31BitsRaw is the untyped mask covering the low 31 bits.
	mask31BitsRaw = 0x000000007fffffff
	// mask31Bits is mask31BitsRaw typed as uint64.
	mask31Bits = uint64(mask31BitsRaw)
)

// under32Bits reports whether x fits entirely within the low 31 bits,
// i.e. x <= mask31Bits. Despite the name, a value that needs bit 31
// (the 32nd bit) is rejected.
func under32Bits(x uint64) bool {
	// No bit outside the 31-bit mask may be set.
	return x&^mask31Bits == 0
}

// docNum1HitFinished is the sentinel stored in an iterator's docNum1Hit
// once the single "1-hit" docNum has been consumed or excluded.
const docNum1HitFinished = math.MaxUint64

// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
	// sb is the segment whose data is read when iterating this list.
	sb             *Segment
	// postingsOffset is the raw FST value handed to read(); it is either an
	// offset into the segment data or a "1-hit" encoded value.
	postingsOffset uint64
	// freqOffset and locOffset are handed to the chunked decoders that
	// read freq/norm and location details ("general" encoding only).
	freqOffset     uint64
	locOffset      uint64
	// postings is the bitmap of docNums loaded by read(); nil when the
	// list is empty or uses the "1-hit" encoding.
	postings       *roaring.Bitmap
	// except, when non-nil, holds docNums that are removed from the
	// postings when iterating and counting.
	except         *roaring.Bitmap

	// when normBits1Hit != 0, then this postings list came from a
	// 1-hit encoding, and only the docNum1Hit & normBits1Hit apply
	docNum1Hit   uint64
	normBits1Hit uint64

	// chunkSize is the value computed by getChunkSize in read(); iterators
	// divide docNums by it to find the chunk holding a posting's details.
	chunkSize uint64
}

// emptyPostingsList is a shared zero-value postings list representing an
// empty list; it is intended to be treated as immutable.
var emptyPostingsList = new(PostingsList)

// Size returns an estimate, in bytes, of the memory used by this postings
// list: the static struct size plus one pointer, plus the size reported
// by the except bitmap when one is set.
func (p *PostingsList) Size() int {
	total := reflectStaticSizePostingsList + sizeOfPtr
	if except := p.except; except != nil {
		total += int(except.GetSizeInBytes())
	}
	return total
}

// OrInto unions the docNums of this postings list into receiver. For a
// "1-hit" list the single docNum is added; otherwise the postings bitmap
// (if any) is OR-ed in. The except bitmap is not consulted here.
func (p *PostingsList) OrInto(receiver *roaring.Bitmap) {
	switch {
	case p.normBits1Hit != 0:
		receiver.Add(uint32(p.docNum1Hit))
	case p.postings != nil:
		receiver.Or(p.postings)
	}
}

// Iterator returns an iterator for this postings list.
//
// includeFreq, includeNorm and includeLocs select which per-posting details
// the iterator decodes. prealloc, when it holds a *PostingsIterator other
// than the shared empty iterator, is reset and reused instead of allocating
// a new iterator.
//
// An empty list (neither "1-hit" nor backed by a bitmap) yields the shared
// emptyPostingsIterator. On error the returned iterator is a true nil
// interface value.
func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool,
	prealloc segment.PostingsIterator) (segment.PostingsIterator, error) {
	if p.normBits1Hit == 0 && p.postings == nil {
		return emptyPostingsIterator, nil
	}

	// A failed assertion (or a nil prealloc) leaves preallocPI nil.
	preallocPI, _ := prealloc.(*PostingsIterator)
	if preallocPI == emptyPostingsIterator {
		// never mutate the shared empty iterator
		preallocPI = nil
	}

	rv, err := p.iterator(includeFreq, includeNorm, includeLocs, preallocPI)
	if err != nil {
		// Return an untyped nil so callers comparing the interface with
		// nil do not see a non-nil interface wrapping a nil pointer.
		return nil, err
	}
	return rv, nil
}

// iterator builds (or recycles) a PostingsIterator positioned at the start
// of this postings list.
//
// When rv is non-nil it is cleared and reused, keeping its chunk decoders
// (after reset), its location slices (truncated to length 0) and its buf.
// For "1-hit" lists only the 1-hit fields are populated; for "general"
// lists the chunk decoders and bitmap iterators are set up. A decoder
// construction error is returned together with a nil iterator.
func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool,
	rv *PostingsIterator) (*PostingsIterator, error) {
	if rv == nil {
		rv = &PostingsIterator{}
	} else {
		// carry the reusable parts across the reset of the struct
		keptFreqNorm, keptLoc := rv.freqNormReader, rv.locReader
		if keptFreqNorm != nil {
			keptFreqNorm.reset()
		}
		if keptLoc != nil {
			keptLoc.reset()
		}

		// the right-hand side is fully evaluated before *rv is overwritten
		*rv = PostingsIterator{
			freqNormReader:  keptFreqNorm,
			locReader:       keptLoc,
			nextLocs:        rv.nextLocs[:0],
			nextSegmentLocs: rv.nextSegmentLocs[:0],
			buf:             rv.buf,
		}
	}

	rv.postings = p
	rv.includeLocs = includeLocs
	// freq and norm are stored together, and locations need them too
	rv.includeFreqNorm = includeFreq || includeNorm || includeLocs

	// "1-hit" encoding
	if p.normBits1Hit != 0 {
		rv.normBits1Hit = p.normBits1Hit
		rv.docNum1Hit = p.docNum1Hit
		if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) {
			// the only hit is excluded, so start out finished
			rv.docNum1Hit = docNum1HitFinished
		}
		return rv, nil
	}

	// "general" encoding with nothing to iterate
	if p.postings == nil {
		return rv, nil
	}

	var err error

	// freq/norm chunk reader
	if rv.includeFreqNorm {
		rv.freqNormReader, err = newChunkedIntDecoder(p.sb.data, p.freqOffset, rv.freqNormReader)
		if err != nil {
			return nil, err
		}
	}

	// location chunk reader
	if rv.includeLocs {
		rv.locReader, err = newChunkedIntDecoder(p.sb.data, p.locOffset, rv.locReader)
		if err != nil {
			return nil, err
		}
	}

	rv.all = p.postings.Iterator()
	if p.except == nil {
		// nothing excluded: share one iterator for all & Actual
		rv.ActualBM = p.postings
		rv.Actual = rv.all
		return rv, nil
	}

	rv.ActualBM = roaring.AndNot(p.postings, p.except)
	rv.Actual = rv.ActualBM.Iterator()
	return rv, nil
}

// Count returns the number of items on this postings list, not counting
// docNums that also appear in the except bitmap.
func (p *PostingsList) Count() uint64 {
	if p.normBits1Hit != 0 {
		// "1-hit": a single posting unless it is excluded
		if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) {
			return 0
		}
		return 1
	}

	if p.postings == nil {
		return 0
	}

	total := p.postings.GetCardinality()
	if p.except == nil {
		return total
	}
	return total - p.postings.AndCardinality(p.except)
}

// read populates p from the FST value postingsOffset of dictionary d.
//
// If the value carries the "1-hit" encoding it is decoded in place.
// Otherwise postingsOffset is an offset into the segment data where three
// uvarints are stored (freq offset, loc offset, postings length) followed
// by the serialized roaring bitmap. After the bitmap is loaded the chunk
// size is derived from the segment footer.
//
// An error is returned when the segment data cannot be read, when a uvarint
// is malformed or truncated, when the bitmap cannot be loaded, or when the
// chunk size cannot be computed.
func (p *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
	p.postingsOffset = postingsOffset

	// handle "1-hit" encoding special case
	if p.postingsOffset&fSTValEncodingMask == fSTValEncoding1Hit {
		return p.init1Hit(postingsOffset)
	}

	// n is the number of header bytes consumed so far
	var n uint64

	// readUvarint decodes the next uvarint of the header and advances n.
	// binary.Uvarint reports 0 (buffer too small) or a negative count
	// (overflow) on failure; either would corrupt n if added blindly.
	readUvarint := func(what string) (uint64, error) {
		start := postingsOffset + n
		data, err := d.sb.data.Read(int(start), int(start+binary.MaxVarintLen64))
		if err != nil {
			return 0, err
		}
		v, read := binary.Uvarint(data)
		if read <= 0 {
			return 0, fmt.Errorf("error reading postings %s: invalid uvarint", what)
		}
		n += uint64(read)
		return v, nil
	}

	// read the location of the freq/norm details
	freqOffset, err := readUvarint("freq offset")
	if err != nil {
		return err
	}
	p.freqOffset = freqOffset

	locOffset, err := readUvarint("loc offset")
	if err != nil {
		return err
	}
	p.locOffset = locOffset
	if p.locOffset > 0 && p.freqOffset > 0 {
		// when both are non-zero the stored loc offset is relative to
		// the freq offset
		p.locOffset += p.freqOffset
	}

	postingsLen, err := readUvarint("length")
	if err != nil {
		return err
	}

	roaringBytes, err := d.sb.data.Read(int(postingsOffset+n), int(postingsOffset+n+postingsLen))
	if err != nil {
		return err
	}

	if p.postings == nil {
		p.postings = roaring.NewBitmap()
	}
	_, err = p.postings.FromBuffer(roaringBytes)
	if err != nil {
		return fmt.Errorf("error loading roaring bitmap: %v", err)
	}

	p.chunkSize, err = getChunkSize(d.sb.footer.chunkMode,
		p.postings.GetCardinality(), d.sb.footer.numDocs)
	if err != nil {
		return err
	}

	return nil
}

// init1Hit initializes p from a "1-hit" encoded FST value by storing the
// decoded docNum and norm bits. It always returns nil.
func (p *PostingsList) init1Hit(fstVal uint64) error {
	p.docNum1Hit, p.normBits1Hit = fSTValDecode1Hit(fstVal)
	return nil
}

// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
	// postings is the list being iterated; nil for the shared empty iterator.
	postings *PostingsList
	// all iterates every docNum in the postings bitmap, including excluded
	// ones, so the freq/norm/loc decoders can be kept in step.
	all      roaring.IntPeekable
	// Actual iterates the docNums that are actually returned (ActualBM).
	Actual   roaring.IntPeekable
	// ActualBM is the bitmap behind Actual; it is the postings bitmap
	// itself when nothing is excluded.
	ActualBM *roaring.Bitmap

	// currChunk is the chunk most recently loaded by loadChunk.
	currChunk      uint32
	// freqNormReader and locReader decode the chunked freq/norm and
	// location streams; they are kept across iterator reuse.
	freqNormReader *chunkedIntDecoder
	locReader      *chunkedIntDecoder

	next            Posting            // reused across Next() calls
	nextLocs        []Location         // reused across Next() calls
	nextSegmentLocs []segment.Location // reused across Next() calls

	// docNum1Hit and normBits1Hit mirror the "1-hit" fields of the list;
	// docNum1Hit becomes docNum1HitFinished once the hit is consumed.
	docNum1Hit   uint64
	normBits1Hit uint64

	// buf is a scratch buffer preserved across iterator reuse.
	buf []byte

	// includeFreqNorm is set when freq, norm or locations were requested.
	includeFreqNorm bool
	// includeLocs is set when locations were requested.
	includeLocs     bool
}

// emptyPostingsIterator is the shared zero-value iterator handed out for
// empty postings lists; its postings field is nil.
var emptyPostingsIterator = new(PostingsIterator)

// Size returns an estimate, in bytes, of the memory used by the iterator:
// the static struct size plus one pointer, the size of the reused posting,
// and the size of every entry in the reused locations slice. The chunk
// decoders are not included.
func (i *PostingsIterator) Size() int {
	total := reflectStaticSizePostingsIterator + sizeOfPtr + i.next.Size()

	// account for freqNormReader, locReader if we start using this.
	for idx := range i.nextLocs {
		total += i.nextLocs[idx].Size()
	}

	return total
}
// Empty always returns false for this iterator type.
// NOTE(review): this is also false for emptyPostingsIterator — confirm
// that callers do not rely on Empty() to detect the empty iterator.
func (i *PostingsIterator) Empty() bool {
	return false
}

// loadChunk positions the enabled decoders (freq/norm and, if requested,
// locations) on the given chunk and records it as the current chunk.
// currChunk is left unchanged if either decoder fails to load.
func (i *PostingsIterator) loadChunk(chunk int) error {
	if i.includeFreqNorm {
		if err := i.freqNormReader.loadChunk(chunk); err != nil {
			return err
		}
	}

	if i.includeLocs {
		if err := i.locReader.loadChunk(chunk); err != nil {
			return err
		}
	}

	i.currChunk = uint32(chunk)
	return nil
}

// readFreqNormHasLocs reads the freq, the norm bits and the has-locations
// flag for the current posting from the freq/norm decoder. For a "1-hit"
// iterator nothing is read: freq is 1, norm is normBits1Hit and hasLocs is
// false. On a read error all results are zero values.
func (i *PostingsIterator) readFreqNormHasLocs() (freq int, norm uint64, hasLocs bool, err error) {
	if i.normBits1Hit != 0 {
		return 1, i.normBits1Hit, false, nil
	}

	// freq and the hasLocs flag are packed into a single uvarint
	packed, err := i.freqNormReader.readUvarint()
	if err != nil {
		return 0, 0, false, fmt.Errorf("error reading frequency: %v", err)
	}

	normBits, err := i.freqNormReader.readUvarint()
	if err != nil {
		return 0, 0, false, fmt.Errorf("error reading norm: %v", err)
	}

	freq, hasLocs = decodeFreqHasLocs(packed)
	return freq, normBits, hasLocs, nil
}

// skipFreqNormReadHasLocs steps the freq/norm decoder past one posting and
// reports only whether that posting has locations. For a "1-hit" iterator
// it reads nothing and reports false.
func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) {
	if i.normBits1Hit != 0 {
		return false, nil
	}

	packed, err := i.freqNormReader.readUvarint()
	if err != nil {
		return false, fmt.Errorf("error reading freqHasLocs: %v", err)
	}

	// the norm bits follow; they are not needed here
	i.freqNormReader.SkipUvarint()

	// only the flag half of the packed value is of interest
	_, hasLocs := decodeFreqHasLocs(packed)
	return hasLocs, nil
}

// encodeFreqHasLocs packs freq and the hasLocs flag into one value: freq
// occupies the bits above bit 0, and bit 0 is set when there are
// locations. The top bit of freq is lost by the shift.
func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 {
	var locBit uint64
	if hasLocs {
		locBit = 0x01 // 0'th LSB encodes whether there are locations
	}
	return freq<<1 | locBit
}

// decodeFreqHasLocs is the inverse of encodeFreqHasLocs: it returns the
// freq stored above bit 0 and whether bit 0 (the has-locations flag) is set.
func decodeFreqHasLocs(freqHasLocs uint64) (int, bool) {
	return int(freqHasLocs >> 1), freqHasLocs&0x01 == 0x01
}

// readLocation consumes the four integers that make up one location from
// the location stream (field ID, position, start, end, in that order) and
// stores them in l. The field ID is translated to a field name through the
// segment's fieldsInv table. l is only modified once all four reads succeed.
func (i *PostingsIterator) readLocation(l *Location) error {
	reader := i.locReader

	fieldIdx, err := reader.readUvarint()
	if err != nil {
		return fmt.Errorf("error reading location field: %v", err)
	}

	position, err := reader.readUvarint()
	if err != nil {
		return fmt.Errorf("error reading location pos: %v", err)
	}

	startOff, err := reader.readUvarint()
	if err != nil {
		return fmt.Errorf("error reading location start: %v", err)
	}

	endOff, err := reader.readUvarint()
	if err != nil {
		return fmt.Errorf("error reading location end: %v", err)
	}

	l.field = i.postings.sb.fieldsInv[fieldIdx]
	l.pos, l.start, l.end = int(position), int(startOff), int(endOff)

	return nil
}

// Next returns the next posting on the postings list, or nil at the end.
// The returned posting is reused by subsequent calls on this iterator.
func (i *PostingsIterator) Next() (segment.Posting, error) {
	return i.nextAtOrAfter(0)
}

// Advance returns the posting at the specified docNum or, if it is not
// present, the next posting after it; nil is returned when the end of the
// postings list is reached.
func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) {
	return i.nextAtOrAfter(docNum)
}

// locSliceGrowth is the capacity multiplier applied when the reusable
// location slices have to be reallocated.
const locSliceGrowth = 2

// nextAtOrAfter returns the next posting whose docNum is >= atOrAfter, or
// nil at the end of the postings list.
//
// The returned posting (and its locations) live in storage owned by the
// iterator and are overwritten by the next call. Freq and norm are only
// decoded when the iterator was created with freq, norm or locations
// requested; locations are only decoded when requested and present.
//
// An error is returned when the underlying streams cannot be decoded, or
// when the location data holds more locations than the posting's freq.
func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) {
	docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter)
	if err != nil || !exists {
		return nil, err
	}

	i.next = Posting{} // clear the struct
	rv := &i.next
	rv.docNum = docNum

	if !i.includeFreqNorm {
		return rv, nil
	}

	var normBits uint64
	var hasLocs bool

	rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
	if err != nil {
		return nil, err
	}

	rv.norm = math.Float32frombits(uint32(normBits))

	if i.includeLocs && hasLocs {
		// prepare locations into reused slices, where we assume
		// rv.freq >= "number of locs", since in a composite field,
		// some component fields might have their IncludeTermVector
		// flags disabled while other component fields are enabled
		if cap(i.nextLocs) >= rv.freq {
			i.nextLocs = i.nextLocs[0:rv.freq]
		} else {
			i.nextLocs = make([]Location, rv.freq, rv.freq*locSliceGrowth)
		}
		if cap(i.nextSegmentLocs) < rv.freq {
			i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*locSliceGrowth)
		}
		rv.locs = i.nextSegmentLocs[:0]

		numLocsBytes, err := i.locReader.readUvarint()
		if err != nil {
			return nil, fmt.Errorf("error reading location numLocsBytes: %v", err)
		}

		j := 0
		startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader
		// keep reading locations until numLocsBytes bytes have been consumed
		for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) {
			if j >= len(i.nextLocs) {
				// the data breaks the freq >= "number of locs" assumption;
				// report it instead of indexing past the slice
				return nil, fmt.Errorf("error reading locations: more than %d locations for docNum %d",
					rv.freq, docNum)
			}
			err := i.readLocation(&i.nextLocs[j])
			if err != nil {
				return nil, err
			}
			rv.locs = append(rv.locs, &i.nextLocs[j])
			j++
		}
	}

	return rv, nil
}

// nextDocNumAtOrAfter returns the next docNum on the postings list that is
// >= atOrAfter, and also sets up the currChunk / loc related fields of the
// iterator so the posting's details can be read next.
//
// exists is false when no such docNum remains. For a "1-hit" iterator the
// single docNum is consumed by the call whether or not it matched.
// docNums held in the bitmaps are 32-bit, so a target above math.MaxUint32
// can never match; the iterator is exhausted and exists is false.
func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (docNum uint64, exists bool, err error) {
	if i.normBits1Hit != 0 {
		if i.docNum1Hit == docNum1HitFinished {
			return 0, false, nil
		}
		if i.docNum1Hit < atOrAfter {
			// advanced past our 1-hit
			i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum
			return 0, false, nil
		}
		docNum := i.docNum1Hit
		i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum
		return docNum, true, nil
	}

	if i.Actual == nil || !i.Actual.HasNext() {
		return 0, false, nil
	}

	if atOrAfter > math.MaxUint32 {
		// Converting atOrAfter to uint32 below would silently wrap and
		// could yield a docNum smaller than the target. Nothing can match,
		// so drain the iterator the way an unsuccessful advance would.
		i.Actual.AdvanceIfNeeded(math.MaxUint32)
		if i.Actual.HasNext() {
			i.Actual.Next()
		}
		return 0, false, nil
	}

	if i.postings == nil || i.postings.postings == i.ActualBM {
		return i.nextDocNumAtOrAfterClean(atOrAfter)
	}

	i.Actual.AdvanceIfNeeded(uint32(atOrAfter))

	if !i.Actual.HasNext() {
		// couldn't find anything
		return 0, false, nil
	}

	n := i.Actual.Next()
	allN := i.all.Next()

	nChunk := n / uint32(i.postings.chunkSize)

	// when allN becomes >= to here, then allN is in the same chunk as nChunk.
	allNReachesNChunk := nChunk * uint32(i.postings.chunkSize)

	// n is the next actual hit (excluding some postings), and
	// allN is the next hit in the full postings, and
	// if they don't match, move 'all' forwards until they do
	for allN != n {
		// we've reached same chunk, so move the freq/norm/loc decoders forward
		if i.includeFreqNorm && allN >= allNReachesNChunk {
			err := i.currChunkNext(nChunk)
			if err != nil {
				return 0, false, err
			}
		}

		allN = i.all.Next()
	}

	if i.includeFreqNorm && (i.currChunk != nChunk || i.freqNormReader.isNil()) {
		err := i.loadChunk(int(nChunk))
		if err != nil {
			return 0, false, fmt.Errorf("error loading chunk: %v", err)
		}
	}

	return uint64(n), true, nil
}

// nextDocNumAtOrAfterClean is an optimization for when the postings list
// is "clean" (e.g., no updates & no deletions), where the all bitmap is
// the same as the actual bitmap.
//
// It returns the next docNum >= atOrAfter, or exists == false when none
// remains. The freq/norm path calls i.Actual.Next() without checking
// HasNext(); the visible caller, nextDocNumAtOrAfter, performs that check
// before dispatching here.
func (i *PostingsIterator) nextDocNumAtOrAfterClean(
	atOrAfter uint64) (docNum uint64, exists bool, err error) {
	if !i.includeFreqNorm {
		// no per-posting details to keep in step, so let the bitmap
		// iterator skip ahead directly
		// NOTE(review): atOrAfter is narrowed to uint32 here; values above
		// math.MaxUint32 wrap — confirm callers guard against that.
		i.Actual.AdvanceIfNeeded(uint32(atOrAfter))

		if !i.Actual.HasNext() {
			return 0, false, nil // couldn't find anything
		}

		return uint64(i.Actual.Next()), true, nil
	}

	// freq-norm's needed, so maintain freq-norm chunk reader
	sameChunkNexts := 0 // # of times we called Next() in the same chunk
	n := i.Actual.Next()
	nChunk := n / uint32(i.postings.chunkSize)

	// step forward one docNum at a time, counting how many docNums were
	// passed over inside the chunk we end up in
	for uint64(n) < atOrAfter && i.Actual.HasNext() {
		n = i.Actual.Next()

		nChunkPrev := nChunk
		nChunk = n / uint32(i.postings.chunkSize)

		if nChunk != nChunkPrev {
			sameChunkNexts = 0
		} else {
			sameChunkNexts++
		}
	}

	if uint64(n) < atOrAfter {
		// couldn't find anything
		return 0, false, nil
	}

	// move the freq/norm (and loc) decoders past the docNums that were
	// skipped within the final chunk
	for j := 0; j < sameChunkNexts; j++ {
		err := i.currChunkNext(nChunk)
		if err != nil {
			return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err)
		}
	}

	// make sure the decoders are on n's chunk before the caller reads
	if i.currChunk != nChunk || i.freqNormReader.isNil() {
		err := i.loadChunk(int(nChunk))
		if err != nil {
			return 0, false, fmt.Errorf("error loading chunk: %v", err)
		}
	}

	return uint64(n), true, nil
}

// currChunkNext moves the decoders past one posting inside chunk nChunk
// without materializing it. The chunk is loaded first if it is not the
// current one or the freq/norm decoder reports isNil. The freq/norm entry
// is skipped and, when locations are enabled and present, the location
// bytes of that posting are skipped too.
func (i *PostingsIterator) currChunkNext(nChunk uint32) error {
	if i.currChunk != nChunk || i.freqNormReader.isNil() {
		if err := i.loadChunk(int(nChunk)); err != nil {
			return fmt.Errorf("error loading chunk: %v", err)
		}
	}

	// read off freq/offsets even though we don't care about them
	hasLocs, err := i.skipFreqNormReadHasLocs()
	if err != nil {
		return err
	}

	if !i.includeLocs || !hasLocs {
		return nil
	}

	locBytes, err := i.locReader.readUvarint()
	if err != nil {
		return fmt.Errorf("error reading location numLocsBytes: %v", err)
	}

	// skip over all the location bytes
	i.locReader.SkipBytes(int(locBytes))

	return nil
}

// DocNum1Hit returns the docNum and true when this iterator is "1-hit"
// optimized and its single docNum has not been consumed or excluded yet;
// otherwise it returns 0 and false.
func (i *PostingsIterator) DocNum1Hit() (uint64, bool) {
	if i.normBits1Hit == 0 || i.docNum1Hit == docNum1HitFinished {
		return 0, false
	}
	return i.docNum1Hit, true
}

// ActualBitmap returns the underlying actual bitmap (ActualBM), which can
// be used up the stack for optimizations. The bitmap is returned without
// copying, and is nil when none has been set on the iterator.
func (i *PostingsIterator) ActualBitmap() *roaring.Bitmap {
	return i.ActualBM
}

// ReplaceActual replaces the ActualBM with the provided bitmap and resets
// Actual to a fresh iterator over it. The 'all' iterator is left untouched.
func (i *PostingsIterator) ReplaceActual(abm *roaring.Bitmap) {
	i.ActualBM = abm
	i.Actual = abm.Iterator()
}

// Count returns the number of postings in the list this iterator was
// created from (see PostingsList.Count). An iterator with no postings list
// attached, such as the shared empty iterator, reports 0.
func (i *PostingsIterator) Count() uint64 {
	if i.postings == nil {
		// emptyPostingsIterator has no list; avoid a nil dereference
		return 0
	}
	return i.postings.Count()
}

// Close is a no-op for this iterator and always returns nil.
func (i *PostingsIterator) Close() error {
	return nil
}

// Posting is a single entry in a postings list
type Posting struct {
	// docNum is the document number of this posting within the segment.
	docNum uint64
	// freq is the decoded term frequency; it stays 0 when the iterator
	// was created without freq/norm/locations.
	freq   int
	// norm is the normalization factor decoded from the stored norm bits.
	norm   float32
	// locs holds the decoded locations, when they were requested and present.
	locs   []segment.Location
}

// Size returns an estimate, in bytes, of the memory used by this posting:
// the static struct size plus the size reported by each of its locations.
func (p *Posting) Size() int {
	total := reflectStaticSizePosting
	for idx := range p.locs {
		total += p.locs[idx].Size()
	}
	return total
}

// Number returns the document number of this posting in this segment.
func (p *Posting) Number() uint64 {
	return p.docNum
}

// SetNumber sets the document number of this posting to n.
func (p *Posting) SetNumber(n uint64) {
	p.docNum = n
}

// Frequency returns the number of occurrences of this term in this
// doc/field.
func (p *Posting) Frequency() int {
	return p.freq
}

// Norm returns the normalization factor for this posting, widened from
// the stored float32 to float64.
func (p *Posting) Norm() float64 {
	return float64(p.norm)
}

// Locations returns the location information for each occurrence. The
// slice is returned without copying.
func (p *Posting) Locations() []segment.Location {
	return p.locs
}

// Location represents the location of a single occurrence
type Location struct {
	// field is the name of the field the occurrence came from.
	field string
	// pos is the position of the occurrence, as stored in the segment.
	pos   int
	// start and end are the offsets of the occurrence, as stored in the
	// segment.
	start int
	end   int
}

// Size returns an estimate, in bytes, of the memory used by this location:
// the static struct size plus the length of the field name.
func (l *Location) Size() int {
	fieldBytes := len(l.field)
	return reflectStaticSizeLocation + fieldBytes
}

// Field returns the name of the field (useful in composite fields to know
// which original field the value came from).
func (l *Location) Field() string {
	return l.field
}

// Start returns the start byte offset of this occurrence.
func (l *Location) Start() int {
	return l.start
}

// End returns the end byte offset of this occurrence.
func (l *Location) End() int {
	return l.end
}

// Pos returns the 1-based phrase position of this occurrence.
func (l *Location) Pos() int {
	return l.pos
}