@Override public void startTerm(BytesRef term, int freq) throws IOException { final int prefix = StringHelper.bytesDifference(lastTerm.get(), term); final int suffix = term.length - prefix; tvf.writeVInt(prefix); tvf.writeVInt(suffix); tvf.writeBytes(term.bytes, term.offset + prefix, suffix); tvf.writeVInt(freq); lastTerm.copyBytes(term); lastPosition = lastOffset = 0; if (offsets && positions) { // we might need to buffer if its a non-bulk merge offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq); offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq); } bufferedIndex = 0; bufferedFreq = freq; payloadData.clear(); }
/**
 * Scans {@code in} line by line and records, for every line that starts with
 * {@code FIELD}, the field name (the UTF-8 text following the {@code FIELD}
 * marker) together with the file pointer reached right after that line.
 *
 * <p>Reading goes through a checksum-computing wrapper; once the {@code END}
 * line is seen the footer is validated via {@code SimpleTextUtil.checkFooter}.
 *
 * @param in the input to scan
 * @return field name to file pointer, sorted by field name
 * @throws IOException if reading fails or the footer check fails
 */
private TreeMap<String,Long> readFields(IndexInput in) throws IOException {
  final ChecksumIndexInput input = new BufferedChecksumIndexInput(in);
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final TreeMap<String,Long> fields = new TreeMap<>();

  for (;;) {
    SimpleTextUtil.readLine(input, scratch);
    if (scratch.get().equals(END)) {
      break;
    }
    if (StringHelper.startsWith(scratch.get(), FIELD)) {
      final int nameLength = scratch.length() - FIELD.length;
      final String fieldName =
          new String(scratch.bytes(), FIELD.length, nameLength, StandardCharsets.UTF_8);
      fields.put(fieldName, input.getFilePointer());
    }
  }

  SimpleTextUtil.checkFooter(input);
  return fields;
}
/** * Scans ({@code termsEnum.next()}) terms until a term is found that does * not start with curVNode's cell. If it finds a leaf cell or a cell at * level {@code scanDetailLevel} then it calls {@link * #visitScanned(org.apache.lucene.spatial.prefix.tree.Cell)}. */ protected void scan(int scanDetailLevel) throws IOException { for (; thisTerm != null && StringHelper.startsWith(thisTerm, curVNodeTerm);//TODO refactor to use method on curVNode.cell thisTerm = termsEnum.next()) { scanCell = grid.getCell(thisTerm.bytes, thisTerm.offset, thisTerm.length, scanCell); int termLevel = scanCell.getLevel(); if (termLevel < scanDetailLevel) { if (scanCell.isLeaf()) visitScanned(scanCell); } else if (termLevel == scanDetailLevel) { if (!scanCell.isLeaf())//LUCENE-5529 visitScanned(scanCell); } }//term loop }
@Override public void startTerm(BytesRef term, int freq) throws IOException { final int prefix = StringHelper.bytesDifference(lastTerm.get(), term); final int suffix = term.length - prefix; tvf.writeVInt(prefix); tvf.writeVInt(suffix); tvf.writeBytes(term.bytes, term.offset + prefix, suffix); tvf.writeVInt(freq); lastTerm.copyBytes(term); lastPosition = lastOffset = 0; if (offsets && positions) { // we might need to buffer if its a non-bulk merge offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq); offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq); offsetIndex = 0; offsetFreq = freq; } }
/** * Scans ({@code termsEnum.next()}) terms until a term is found that does * not start with curVNode's cell. If it finds a leaf cell or a cell at * level {@code scanDetailLevel} then it calls {@link * #visitScanned(org.apache.lucene.spatial.prefix.tree.Node, * com.spatial4j.core.shape.Shape)}. */ protected void scan(int scanDetailLevel) throws IOException { for (; thisTerm != null && StringHelper.startsWith(thisTerm, curVNodeTerm); thisTerm = termsEnum.next()) { scanCell = grid.getNode(thisTerm.bytes, thisTerm.offset, thisTerm.length, scanCell); int termLevel = scanCell.getLevel(); if (termLevel > scanDetailLevel) continue; if (termLevel == scanDetailLevel || scanCell.isLeaf()) { Shape cShape; //if this cell represents a point, use the cell center vs the box // (points never have isLeaf()) if (termLevel == grid.getMaxLevels() && !scanCell.isLeaf()) cShape = scanCell.getCenter(); else cShape = scanCell.getShape(); visitScanned(scanCell, cShape); } }//term loop }
@Override public void startTerm(BytesRef term, int freq) throws IOException { final int prefix = StringHelper.bytesDifference(lastTerm, term); final int suffix = term.length - prefix; tvf.writeVInt(prefix); tvf.writeVInt(suffix); tvf.writeBytes(term.bytes, term.offset + prefix, suffix); tvf.writeVInt(freq); lastTerm.copyBytes(term); lastPosition = lastOffset = 0; if (offsets && positions) { // we might need to buffer if its a non-bulk merge offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq); offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq); offsetIndex = 0; offsetFreq = freq; } }
@Override public void startTerm(BytesRef term, int freq) throws IOException { final int prefix = StringHelper.bytesDifference(lastTerm, term); final int suffix = term.length - prefix; tvf.writeVInt(prefix); tvf.writeVInt(suffix); tvf.writeBytes(term.bytes, term.offset + prefix, suffix); tvf.writeVInt(freq); lastTerm.copyBytes(term); lastPosition = lastOffset = 0; if (offsets && positions) { // we might need to buffer if its a non-bulk merge offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq); offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq); } bufferedIndex = 0; bufferedFreq = freq; payloadData.length = 0; }
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException { // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually. List<LeafReaderContext> leaves = reader.leaves(); for (LeafReaderContext leaf : leaves) { Terms _terms = leaf.reader().terms(field); if (_terms == null) { continue; } TermsEnum termsEnum = _terms.iterator(); TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes()); if (TermsEnum.SeekStatus.END == seekStatus) { continue; } for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) { if (!StringHelper.startsWith(term, prefix.bytes())) { break; } terms.add(new Term(field, BytesRef.deepCopyOf(term))); if (terms.size() >= maxExpansions) { return; } } } }
/**
 * Writes a sparse index over {@code values}.
 *
 * <p>For every value whose ordinal, masked with {@code REVERSE_INTERVAL_MASK},
 * is zero, a truncated copy of the value (its first
 * {@code StringHelper.sortKeyLength(priorTerm, value)} bytes) is appended to an
 * in-memory paged byte store, and the address it was stored at is added to a
 * monotonic packed writer over {@code data}. The value at the last position of
 * each interval is remembered as the prior term for the next index entry.
 *
 * <p>After all values are consumed, the start file pointer is written to
 * {@code meta}; the byte count and the collected bytes are written to
 * {@code data}.
 *
 * @param field     not read by this method
 * @param values    the values to index, in iteration order
 * @param maxLength initial capacity requested for the prior-term scratch buffer
 * @throws IOException if writing to {@code data} or {@code meta} fails
 */
private void addReverseTermIndex(FieldInfo field, final Iterable<BytesRef> values, int maxLength) throws IOException {
  final BytesRefBuilder priorTerm = new BytesRefBuilder();
  priorTerm.grow(maxLength);
  final BytesRef indexTerm = new BytesRef();

  final long startFP = data.getFilePointer();
  final PagedBytes pagedBytes = new PagedBytes(15);
  final MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);

  long ordinal = 0;
  for (BytesRef value : values) {
    final int positionInInterval = (int) (ordinal & REVERSE_INTERVAL_MASK);
    if (positionInInterval == 0) {
      // First value of an interval: index only its truncated form.
      indexTerm.bytes = value.bytes;
      indexTerm.offset = value.offset;
      indexTerm.length = StringHelper.sortKeyLength(priorTerm.get(), value);
      final long address = pagedBytes.copyUsingLengthPrefix(indexTerm);
      addresses.add(address);
    } else if (positionInInterval == REVERSE_INTERVAL_MASK) {
      // Last value of an interval: keep it to compare against the next indexed value.
      priorTerm.copyBytes(value);
    }
    ordinal++;
  }
  addresses.finish();

  final long numBytes = pagedBytes.getPointer();
  pagedBytes.freeze(true);
  final PagedBytesDataInput in = pagedBytes.getDataInput();

  meta.writeLong(startFP);
  data.writeVLong(numBytes);
  data.copyBytes(in, numBytes);
}
/**
 * Accepts a term while it starts with {@code prefixRef}; the first term that
 * does not yields {@code AcceptStatus.END}.
 */
@Override
protected AcceptStatus accept(BytesRef term) {
  return StringHelper.startsWith(term, prefixRef) ? AcceptStatus.YES : AcceptStatus.END;
}
/**
 * Returns true if <code>o</code> is equal to this: another {@code SortField}
 * with the same field, type, reverse flag and comparator source. If a
 * {@link FieldComparatorSource} or {@link FieldCache.Parser} was provided,
 * it must properly implement equals (unless a singleton is always used).
 */
@Override
public boolean equals(Object o) {
  if (o == this) {
    return true;
  }
  if (!(o instanceof SortField)) {
    return false;
  }

  final SortField that = (SortField) o;
  if (!StringHelper.equals(that.field, this.field)) {
    return false;
  }
  if (that.type != this.type) {
    return false;
  }
  if (that.reverse != this.reverse) {
    return false;
  }
  if (that.comparatorSource == null) {
    return this.comparatorSource == null;
  }
  return that.comparatorSource.equals(this.comparatorSource);
}
/**
 * Decides whether the term matches the automaton.
 *
 * <p>A term is run through the automaton only if there is no common suffix or
 * the term ends with it. A matching term yields {@code YES} in linear mode and
 * {@code YES_AND_SEEK} otherwise. A non-matching term yields {@code NO} when
 * in linear mode and the term compares below {@code linearUpperBound};
 * otherwise it yields {@code NO_AND_SEEK}.
 */
@Override
protected AcceptStatus accept(final BytesRef term) {
  final boolean suffixOk = commonSuffixRef == null || StringHelper.endsWith(term, commonSuffixRef);
  if (suffixOk && runAutomaton.run(term.bytes, term.offset, term.length)) {
    return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK;
  }

  // Rejected, either by the suffix pre-check or by the automaton itself.
  final boolean keepScanning = linear && termComp.compare(term, linearUpperBound) < 0;
  return keepScanning ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK;
}
private BytesRef setTerm() throws IOException { term = termsEnum.term(); //System.out.println(" setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix == null ? "null" : prefix.utf8ToString())); if (prefix != null && !StringHelper.startsWith(term, prefix)) { term = null; } return term; }
/**
 * Visits every term of {@code fieldName} that starts with {@code prefixRef}
 * and whose text after the first {@code prefix.length()} characters fully
 * matches {@code pattern}.
 *
 * <p>Enumeration starts at the first term at or after {@code prefixRef} and
 * stops at the first term that no longer starts with it. Nothing is visited
 * when the field has no terms.
 *
 * @param reader    the reader to take terms from
 * @param fieldName the field whose terms are enumerated
 * @param mtv       receives each matching term
 * @throws IOException if reading terms fails
 */
@Override
public void visitMatchingTerms(
    IndexReader reader,
    String fieldName,
    MatchingTermVisitor mtv) throws IOException {
  final int prefixLength = prefix.length();
  final Terms terms = MultiFields.getTerms(reader, fieldName);
  if (terms == null) {
    return;
  }

  final Matcher matcher = pattern.matcher("");
  try {
    final TermsEnum termsEnum = terms.iterator(null);
    final TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);

    final BytesRef first;
    if (status == TermsEnum.SeekStatus.FOUND) {
      first = prefixRef;
    } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
      first = termsEnum.term();
    } else {
      first = null;
    }

    for (BytesRef text = first;
         text != null && StringHelper.startsWith(text, prefixRef);
         text = termsEnum.next()) {
      final String textString = text.utf8ToString();
      matcher.reset(textString.substring(prefixLength));
      if (matcher.matches()) {
        mtv.visitMatchingTerm(new Term(fieldName, textString));
      }
    }
  } finally {
    matcher.reset();
  }
}
@Override public void visitMatchingTerms( IndexReader reader, String fieldName, MatchingTermVisitor mtv) throws IOException { /* inspired by PrefixQuery.rewrite(): */ Terms terms = MultiFields.getTerms(reader, fieldName); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); boolean skip = false; TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getPrefix())); if (status == TermsEnum.SeekStatus.FOUND) { mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName)); } else if (status == TermsEnum.SeekStatus.NOT_FOUND) { if (StringHelper.startsWith(termsEnum.term(), prefixRef)) { mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString())); } else { skip = true; } } else { // EOF skip = true; } if (!skip) { while(true) { BytesRef text = termsEnum.next(); if (text != null && StringHelper.startsWith(text, prefixRef)) { mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString())); } else { break; } } } } }
private static int hashCode(@Nullable Object value) { if (value == null) { return 0; } if (value instanceof BytesRef) { // since lucene 4.8 // BytesRef.hashCode() uses a random seed across different jvm // which causes the hashCode / routing to be different on each node // this breaks the group by redistribution logic - need to use a fixed seed here // to be consistent. return StringHelper.murmurhash3_x86_32(((BytesRef) value), 1); } return value.hashCode(); }
/** NOTE: if your codec does not sort in unicode code * point order, you must override this method, to simply * return indexedTerm.length. */ protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) { // As long as codec sorts terms in unicode codepoint // order, we can safely strip off the non-distinguishing // suffix to save RAM in the loaded terms index. return StringHelper.sortKeyLength(priorTerm, indexedTerm); }
/**
 * Validates the trailing checksum line of a SimpleText file.
 *
 * <p>The checksum accumulated by {@code input} so far is formatted as a
 * 20-digit, zero-padded decimal and compared with the text following the
 * {@code CHECKSUM} marker on the next line. The checksum line must also be the
 * last thing in the file.
 *
 * @param input the checksum-tracking input, positioned just before the checksum line
 * @throws CorruptIndexException if the next line is not a checksum line, the
 *         checksums differ, or data follows the checksum line
 * @throws IOException if reading fails
 */
public static void checkFooter(ChecksumIndexInput input) throws IOException {
  // Capture the running checksum before the checksum line itself is read.
  final String expectedChecksum = String.format(Locale.ROOT, "%020d", input.getChecksum());

  final BytesRefBuilder scratch = new BytesRefBuilder();
  SimpleTextUtil.readLine(input, scratch);
  if (!StringHelper.startsWith(scratch.get(), CHECKSUM)) {
    throw new CorruptIndexException("SimpleText failure: expected checksum line but got " + scratch.get().utf8ToString() + " (resource=" + input + ")");
  }

  final int checksumLength = scratch.length() - CHECKSUM.length;
  final String actualChecksum = new BytesRef(scratch.bytes(), CHECKSUM.length, checksumLength).utf8ToString();
  if (!actualChecksum.equals(expectedChecksum)) {
    throw new CorruptIndexException("SimpleText checksum failure: " + actualChecksum + " != " + expectedChecksum + " (resource=" + input + ")");
  }

  if (input.getFilePointer() != input.length()) {
    throw new CorruptIndexException("Unexpected stuff at the end of file, please be careful with your text editor! (resource=" + input + ")");
  }
}
@Override protected AcceptStatus accept(BytesRef term) { if (StringHelper.startsWith(term, prefixRef)) { // TODO: set BoostAttr based on distance of // searchTerm.text() and term().text() return regexImpl.match(term) ? AcceptStatus.YES : AcceptStatus.NO; } else { return AcceptStatus.NO; } }
/** * <p>The termCompare method in FuzzyTermEnum uses Levenshtein distance to * calculate the distance between the given term and the comparing term. * </p> * <p>If the minSimilarity is >= 1.0, this uses the maxEdits as the comparison. * Otherwise, this method uses the following logic to calculate similarity. * <pre> * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen))); * </pre> * where distance is the Levenshtein distance for the two words. * </p> * */ @Override protected final AcceptStatus accept(BytesRef term) { if (StringHelper.startsWith(term, prefixBytesRef)) { utf32.copyUTF8Bytes(term); final int distance = calcDistance(utf32.ints(), realPrefixLength, utf32.length() - realPrefixLength); //Integer.MIN_VALUE is the sentinel that Levenshtein stopped early if (distance == Integer.MIN_VALUE){ return AcceptStatus.NO; } //no need to calc similarity, if raw is true and distance > maxEdits if (raw == true && distance > maxEdits){ return AcceptStatus.NO; } final float similarity = calcSimilarity(distance, (utf32.length() - realPrefixLength), text.length); //if raw is true, then distance must also be <= maxEdits by now //given the previous if statement if (raw == true || (raw == false && similarity > minSimilarity)) { boostAtt.setBoost((similarity - minSimilarity) * scale_factor); return AcceptStatus.YES; } else { return AcceptStatus.NO; } } else { return AcceptStatus.END; } }
/**
 * Scans {@code in} line by line until the {@code END} line and records, for
 * every line that starts with {@code FIELD}, the field name (the UTF-8 text
 * following the {@code FIELD} marker) together with the file pointer reached
 * right after that line.
 *
 * @param in the input to scan
 * @return field name to file pointer, sorted by field name
 * @throws IOException if reading fails
 */
private TreeMap<String,Long> readFields(IndexInput in) throws IOException {
  final BytesRef scratch = new BytesRef(10);
  final TreeMap<String,Long> fields = new TreeMap<String,Long>();

  SimpleTextUtil.readLine(in, scratch);
  while (!scratch.equals(END)) {
    if (StringHelper.startsWith(scratch, FIELD)) {
      final int nameStart = scratch.offset + FIELD.length;
      final int nameLength = scratch.length - FIELD.length;
      final String fieldName = new String(scratch.bytes, nameStart, nameLength, "UTF-8");
      fields.put(fieldName, in.getFilePointer());
    }
    SimpleTextUtil.readLine(in, scratch);
  }
  return fields;
}
/**
 * The termCompare method in FuzzyTermEnum uses Levenshtein distance to
 * calculate the distance between the given term and the comparing term.
 *
 * <p>A term that does not start with the required prefix ends the
 * enumeration. Otherwise the term is accepted, with a boost of
 * {@code (similarity - minSimilarity) * scale_factor}, only when its
 * similarity exceeds {@code minSimilarity}.
 */
@Override
protected final AcceptStatus accept(BytesRef term) {
  if (!StringHelper.startsWith(term, prefixBytesRef)) {
    return AcceptStatus.END;
  }

  UnicodeUtil.UTF8toUTF32(term, utf32);
  final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
  if (similarity > minSimilarity) {
    boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
    return AcceptStatus.YES;
  }
  return AcceptStatus.NO;
}