FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio,
    boolean allowArrayArcs, int bytesPageBits) {
  this.inputType = inputType;
  this.outputs = outputs;
  this.allowArrayArcs = allowArrayArcs;
  version = VERSION_CURRENT;
  bytes = new BytesStore(bytesPageBits);
  // pad: ensure no node gets address 0 which is reserved to mean
  // the stop state w/ no arcs
  bytes.writeByte((byte) 0);
  NO_OUTPUT = outputs.getNoOutput();
  if (willPackFST) {
    nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio);
    inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio);
  } else {
    nodeAddress = null;
    inCounts = null;
  }

  emptyOutput = null;
  packed = false;
  nodeRefToAddress = null;
}
OrdinalsStore(int maxDoc, int startBitsPerValue, float acceptableOverheadRatio) {
  this.startBitsPerValue = startBitsPerValue;
  this.acceptableOverheadRatio = acceptableOverheadRatio;
  positions = new PagedGrowableWriter(maxDoc, PAGE_SIZE, startBitsPerValue, acceptableOverheadRatio);
  firstOrdinals = new GrowableWriter(startBitsPerValue, maxDoc, acceptableOverheadRatio);
  // over-allocate so we never need to worry about the array sizes;
  // 24 entries would allow storing several million ordinals per doc...
  ordinals = new PagedGrowableWriter[24];
  nextLevelSlices = new PagedGrowableWriter[24];
  sizes = new int[24];
  Arrays.fill(sizes, 1); // reserve the 1st slice on every level
}
/**
 * Loads the segment information at segment load time.
 *
 * @param indexEnum
 *          the term enum.
 * @param indexDivisor
 *          the index divisor.
 * @param tiiFileLength
 *          the size of the tii file, used to approximate the size of the
 *          buffer.
 * @param totalIndexInterval
 *          the total index interval.
 */
TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
    throws IOException {
  this.totalIndexInterval = totalIndexInterval;
  indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
  skipInterval = indexEnum.skipInterval;
  // this is only an initial size, it will be GCed once the build is complete
  long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
  PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
  PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();

  final int bitEstimate = 1 + MathUtil.log(tiiFileLength, 2);
  GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInts.DEFAULT);
  String currentField = null;
  List<String> fieldStrs = new ArrayList<>();
  int fieldCounter = -1;
  for (int i = 0; indexEnum.next(); i++) {
    Term term = indexEnum.term();
    if (currentField == null || !currentField.equals(term.field())) {
      currentField = term.field();
      fieldStrs.add(currentField);
      fieldCounter++;
    }
    TermInfo termInfo = indexEnum.termInfo();
    indexToTerms.set(i, dataOutput.getPosition());
    dataOutput.writeVInt(fieldCounter);
    dataOutput.writeString(term.text());
    dataOutput.writeVInt(termInfo.docFreq);
    if (termInfo.docFreq >= skipInterval) {
      dataOutput.writeVInt(termInfo.skipOffset);
    }
    dataOutput.writeVLong(termInfo.freqPointer);
    dataOutput.writeVLong(termInfo.proxPointer);
    dataOutput.writeVLong(indexEnum.indexPointer);
    for (int j = 1; j < indexDivisor; j++) {
      if (!indexEnum.next()) {
        break;
      }
    }
  }

  fields = new Term[fieldStrs.size()];
  for (int i = 0; i < fields.length; i++) {
    fields[i] = new Term(fieldStrs.get(i));
  }

  dataPagedBytes.freeze(true);
  dataInput = dataPagedBytes.getDataInput();
  indexToDataOffset = indexToTerms.getMutable();

  long ramBytesUsed = RamUsageEstimator.shallowSizeOf(fields);
  ramBytesUsed += RamUsageEstimator.shallowSizeOf(dataInput);
  ramBytesUsed += fields.length * RamUsageEstimator.shallowSizeOfInstance(Term.class);
  ramBytesUsed += dataPagedBytes.ramBytesUsed();
  ramBytesUsed += indexToDataOffset.ramBytesUsed();
  this.ramBytesUsed = ramBytesUsed;
}
GrowableWriterAndMinValue(GrowableWriter array, long minValue) {
  this.writer = array;
  this.minValue = minValue;
}
@Override
protected Accountable createValue(AtomicReader reader, CacheKey key, boolean setDocsWithField /* ignored */)
    throws IOException {

  final int maxDoc = reader.maxDoc();

  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final PagedBytes bytes = new PagedBytes(15);

  int startTermsBPV;

  final int termCountHardLimit;
  if (maxDoc == Integer.MAX_VALUE) {
    termCountHardLimit = Integer.MAX_VALUE;
  } else {
    termCountHardLimit = maxDoc+1;
  }

  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        numUniqueTerms = termCountHardLimit;
      }

      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }

  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?

  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docs = null;

    while (true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= termCountHardLimit) {
        break;
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.docs(null, docs, DocsEnum.FLAG_NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1+termOrd);
      }
      termOrd++;
    }
  }

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
/**
 * Loads the segment information at segment load time.
 *
 * @param indexEnum
 *          the term enum.
 * @param indexDivisor
 *          the index divisor.
 * @param tiiFileLength
 *          the size of the tii file, used to approximate the size of the
 *          buffer.
 * @param totalIndexInterval
 *          the total index interval.
 */
TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
    throws IOException {
  this.totalIndexInterval = totalIndexInterval;
  indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
  skipInterval = indexEnum.skipInterval;
  // this is only an initial size, it will be GCed once the build is complete
  long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
  PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
  PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();

  final int bitEstimate = 1 + MathUtil.log(tiiFileLength, 2);
  GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInts.DEFAULT);
  String currentField = null;
  List<String> fieldStrs = new ArrayList<String>();
  int fieldCounter = -1;
  for (int i = 0; indexEnum.next(); i++) {
    Term term = indexEnum.term();
    if (currentField == null || !currentField.equals(term.field())) {
      currentField = term.field();
      fieldStrs.add(currentField);
      fieldCounter++;
    }
    TermInfo termInfo = indexEnum.termInfo();
    indexToTerms.set(i, dataOutput.getPosition());
    dataOutput.writeVInt(fieldCounter);
    dataOutput.writeString(term.text());
    dataOutput.writeVInt(termInfo.docFreq);
    if (termInfo.docFreq >= skipInterval) {
      dataOutput.writeVInt(termInfo.skipOffset);
    }
    dataOutput.writeVLong(termInfo.freqPointer);
    dataOutput.writeVLong(termInfo.proxPointer);
    dataOutput.writeVLong(indexEnum.indexPointer);
    for (int j = 1; j < indexDivisor; j++) {
      if (!indexEnum.next()) {
        break;
      }
    }
  }

  fields = new Term[fieldStrs.size()];
  for (int i = 0; i < fields.length; i++) {
    fields[i] = new Term(fieldStrs.get(i));
  }

  dataPagedBytes.freeze(true);
  dataInput = dataPagedBytes.getDataInput();
  indexToDataOffset = indexToTerms.getMutable();
}
@Override
protected Object createValue(AtomicReader reader, CacheKey key, boolean setDocsWithField /* ignored */)
    throws IOException {

  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity

  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final int termCountHardLimit = maxDoc;

  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);

  int startBPV;

  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms*4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }

  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);

  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());

  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docs = null;
    while (true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }

      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.docs(null, docs, DocsEnum.FLAG_NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }

  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), docToOffset.getMutable());
}
public NodeHash(FST<T> fst, FST.BytesReader in) {
  table = new GrowableWriter(8, 16, PackedInts.COMPACT);
  mask = 15;
  this.fst = fst;
  this.in = in;
}
/**
 * Loads the segment information at segment load time.
 *
 * @param indexEnum
 *          the term enum.
 * @param indexDivisor
 *          the index divisor.
 * @param tiiFileLength
 *          the size of the tii file, used to approximate the size of the
 *          buffer.
 * @param totalIndexInterval
 *          the total index interval.
 */
TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
    throws IOException {
  this.totalIndexInterval = totalIndexInterval;
  indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
  skipInterval = indexEnum.skipInterval;
  // this is only an initial size, it will be GCed once the build is complete
  long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
  PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
  PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();

  final int bitEstimate = 1 + MathUtil.log(tiiFileLength, 2);
  GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInts.DEFAULT);
  String currentField = null;
  List<String> fieldStrs = new ArrayList<String>();
  int fieldCounter = -1;
  for (int i = 0; indexEnum.next(); i++) {
    Term term = indexEnum.term();
    if (currentField == null || !currentField.equals(term.field())) {
      currentField = term.field();
      fieldStrs.add(currentField);
      fieldCounter++;
    }
    TermInfo termInfo = indexEnum.termInfo();
    indexToTerms.set(i, dataOutput.getPosition());
    dataOutput.writeVInt(fieldCounter);
    dataOutput.writeString(term.text());
    dataOutput.writeVInt(termInfo.docFreq);
    if (termInfo.docFreq >= skipInterval) {
      dataOutput.writeVInt(termInfo.skipOffset);
    }
    dataOutput.writeVLong(termInfo.freqPointer);
    dataOutput.writeVLong(termInfo.proxPointer);
    dataOutput.writeVLong(indexEnum.indexPointer);
    for (int j = 1; j < indexDivisor; j++) {
      if (!indexEnum.next()) {
        break;
      }
    }
  }

  fields = new Term[fieldStrs.size()];
  for (int i = 0; i < fields.length; i++) {
    fields[i] = new Term(fieldStrs.get(i));
  }

  dataPagedBytes.freeze(true);
  dataInput = dataPagedBytes.getDataInput();
  indexToDataOffset = indexToTerms.getMutable();

  ramBytesUsed = fields.length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.shallowSizeOfInstance(Term.class))
      + dataPagedBytes.ramBytesUsed()
      + indexToDataOffset.ramBytesUsed();
}
@Override
protected Object createValue(AtomicReader reader, CacheKey key, boolean setDocsWithField /* ignored */)
    throws IOException {

  final int maxDoc = reader.maxDoc();

  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final PagedBytes bytes = new PagedBytes(15);

  int startTermsBPV;

  final int termCountHardLimit;
  if (maxDoc == Integer.MAX_VALUE) {
    termCountHardLimit = Integer.MAX_VALUE;
  } else {
    termCountHardLimit = maxDoc+1;
  }

  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        numUniqueTerms = termCountHardLimit;
      }

      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }

  MonotonicAppendingLongBuffer termOrdToBytesOffset = new MonotonicAppendingLongBuffer();
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?

  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docs = null;

    while (true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= termCountHardLimit) {
        break;
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.docs(null, docs, DocsEnum.FLAG_NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1+termOrd);
      }
      termOrd++;
    }
  }
  termOrdToBytesOffset.freeze();

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset, docToTermOrd.getMutable(), termOrd);
}