public void testSortMetaField() throws Exception {
    createIndex("test");
    ensureGreen();
    final int numDocs = randomIntBetween(10, 20);
    IndexRequestBuilder[] indexReqs = new IndexRequestBuilder[numDocs];
    for (int i = 0; i < numDocs; ++i) {
        indexReqs[i] = client().prepareIndex("test", "type", Integer.toString(i))
            .setSource();
    }
    indexRandom(true, indexReqs);

    SortOrder order = randomFrom(SortOrder.values());
    SearchResponse searchResponse = client().prepareSearch()
        .setQuery(matchAllQuery())
        .setSize(randomIntBetween(1, numDocs + 5))
        .addSort("_uid", order)
        .execute().actionGet();
    assertNoFailures(searchResponse);
    SearchHit[] hits = searchResponse.getHits().getHits();
    BytesRef previous = order == SortOrder.ASC ? new BytesRef() : UnicodeUtil.BIG_TERM;
    for (int i = 0; i < hits.length; ++i) {
        final BytesRef uid = new BytesRef(Uid.createUid(hits[i].getType(), hits[i].getId()));
        assertThat(previous, order == SortOrder.ASC ? lessThan(uid) : greaterThan(uid));
        previous = uid;
    }
}
/** initialize levenshtein DFAs up to maxDistance, if possible */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
    final List<CompiledAutomaton> runAutomata = dfaAtt.automata();
    //System.out.println("cached automata size: " + runAutomata.size());
    if (runAutomata.size() <= maxDistance
        && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
        LevenshteinAutomata builder = new LevenshteinAutomata(
            UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength),
            transpositions);
        String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
        for (int i = runAutomata.size(); i <= maxDistance; i++) {
            Automaton a = builder.toAutomaton(i, prefix);
            //System.out.println("compute automaton n=" + i);
            runAutomata.add(new CompiledAutomaton(a, true, false));
        }
    }
    return runAutomata;
}
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
 * strings in UTF-8. These strings must be binary-sorted.
 */
public static Automaton build(Collection<BytesRef> input) {
    final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
    char[] chars = new char[0];
    CharsRef ref = new CharsRef();
    for (BytesRef b : input) {
        chars = ArrayUtil.grow(chars, b.length);
        final int len = UnicodeUtil.UTF8toUTF16(b, chars);
        ref.chars = chars;
        ref.length = len;
        builder.add(ref);
    }
    Automaton.Builder a = new Automaton.Builder();
    convert(a, builder.complete(), new IdentityHashMap<State, Integer>());
    return a.finish();
}
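/*
 * A minimal usage sketch for the build() method above (an illustration added
 * here, not part of the original source; the helper name is hypothetical).
 * BytesRef(String) stores the UTF-8 bytes, and BytesRef's natural order is
 * binary, so sorting first satisfies build()'s precondition:
 */
static Automaton buildFromStrings(String... strings) {
    List<BytesRef> terms = new ArrayList<>();
    for (String s : strings) {
        terms.add(new BytesRef(s)); // encodes the string as UTF-8
    }
    Collections.sort(terms); // binary (UTF-8) order, as build() requires
    return DaciukMihovAutomatonBuilder.build(terms);
}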
@Override
public BytesRef evaluate(Input<Object>... args) {
    Object stringValue = args[0].value();
    if (stringValue == null) {
        return null;
    }
    BytesRef inputByteRef = BytesRefs.toBytesRef(stringValue);
    char[] ref = new char[inputByteRef.length];
    int len = UnicodeUtil.UTF8toUTF16(inputByteRef.bytes, inputByteRef.offset, inputByteRef.length, ref);
    charUtils.toLowerCase(ref, 0, len);
    byte[] res = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * len];
    len = UnicodeUtil.UTF16toUTF8(ref, 0, len, res);
    return new BytesRef(res, 0, len);
}
@Override
public BytesRef evaluate(Input<Object>... args) {
    Object stringValue = args[0].value();
    if (stringValue == null) {
        return null;
    }
    BytesRef inputByteRef = BytesRefs.toBytesRef(stringValue);
    char[] ref = new char[inputByteRef.length];
    int len = UnicodeUtil.UTF8toUTF16(inputByteRef.bytes, inputByteRef.offset, inputByteRef.length, ref);
    charUtils.toUpperCase(ref, 0, len);
    byte[] res = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * len];
    len = UnicodeUtil.UTF16toUTF8(ref, 0, len, res);
    return new BytesRef(res, 0, len);
}
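/*
 * The two evaluate() methods above share one pattern: decode the UTF-8 bytes
 * to a char[] with UnicodeUtil.UTF8toUTF16, transform the chars in place, then
 * re-encode with UnicodeUtil.UTF16toUTF8, sizing the output buffer with
 * MAX_UTF8_BYTES_PER_CHAR. A standalone sketch of that round trip (the helper
 * name is hypothetical, and the per-char toLowerCase is a simplification of
 * the code-point-aware charUtils call):
 */
static BytesRef lowerCaseUtf8(BytesRef input) {
    // UTF-16 never needs more code units than the UTF-8 form has bytes
    char[] chars = new char[input.length];
    int len = UnicodeUtil.UTF8toUTF16(input.bytes, input.offset, input.length, chars);
    for (int i = 0; i < len; i++) {
        chars[i] = Character.toLowerCase(chars[i]);
    }
    byte[] out = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * len];
    len = UnicodeUtil.UTF16toUTF8(chars, 0, len, out);
    return new BytesRef(out, 0, len);
}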
@Override
public TermDocIndexKey decrementKey(TermDocIndexKey previousKey) {
    int termIndex = previousKey.index;
    BytesRef docId = previousKey.docId;
    do {
        while ((docId = decrementDocId(termIndex, docId)) != null) {
            int docIndex = acceptDoc(termIndex, docId);
            if (docIndex >= 0) {
                localDocIndex = docIndex;
                return termDocIndexKey = new TermDocIndexKey(termIndex, docId);
            }
        }
        docId = UnicodeUtil.BIG_TERM;
    } while ((termIndex = decrementTermIndex(termIndex)) >= 0);
    localDocIndex = -1;
    return termDocIndexKey = null;
}
@Override
public TermDocIndexKey targetKeyInit(boolean ascending) throws IOException {
    int termIndex = getTargetKeyIndexInit(ascending);
    if (termIndex < 0) {
        return null;
    }
    int rawTargetIdx = getTargetKeyIndex();
    BytesRef initTargetDoc = targetDoc;
    if (rawTargetIdx < termIndex) {
        initTargetDoc = null;
    } else if (rawTargetIdx > termIndex) {
        initTargetDoc = UnicodeUtil.BIG_TERM;
    }
    TermDocIndexKey ret = new TermDocIndexKey(termIndex, initTargetDoc);
    int docIndex = acceptDoc(termIndex, initTargetDoc);
    if (docIndex >= 0) {
        localDocIndex = docIndex;
        return termDocIndexKey = ret;
    } else if (ascending) {
        return incrementKey(ret);
    } else {
        return decrementKey(ret);
    }
}
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
    int numIterations = atLeast(10000);
    for (int i = 0; i < numIterations; i++) {
        if (VERBOSE) {
            System.out.println("\nTEST: iter=" + i);
        }
        String s = TestUtil.randomUnicodeString(random(), 100);
        TokenStream ts = analyzer.tokenStream("foo", s);
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                assertTrue(UnicodeUtil.validUTF16String(termAtt));
            }
            ts.end();
        } finally {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
}
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
    int numIterations = atLeast(1000);
    for (int i = 0; i < numIterations; i++) {
        String s = TestUtil.randomUnicodeString(random(), 100);
        TokenStream ts = analyzer.tokenStream("foo", s);
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                assertTrue(UnicodeUtil.validUTF16String(termAtt));
            }
            ts.end();
        } finally {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
}
@Override
public void setUp() throws Exception {
    super.setUp();
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("aa", "a");
    builder.add("bbb", "b");
    builder.add("cccc", "cc");
    builder.add("h", "i");
    builder.add("j", "jj");
    builder.add("k", "kkk");
    builder.add("ll", "llll");
    builder.add("empty", "");
    // supplementary character outside the BMP (encoded as a surrogate pair):
    builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
    builder.add("\uff01", "full-width-exclamation");
    normMap = builder.build();
}
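/*
 * A usage sketch for the map built above (hedged: this assumes Lucene's
 * MappingCharFilter, which applies the NormalizeCharMap rules to a Reader
 * before tokenization; the helper name is hypothetical):
 */
static String applyNormMap(NormalizeCharMap normMap, String text) throws IOException {
    Reader filtered = new MappingCharFilter(normMap, new StringReader(text));
    StringBuilder sb = new StringBuilder();
    int c;
    while ((c = filtered.read()) != -1) {
        sb.append((char) c);
    }
    return sb.toString(); // applyNormMap(normMap, "aabbbh") -> "abi"
}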
@Override
public void build(TermFreqIterator tfit) throws IOException {
    if (tfit.getComparator() != null) {
        // make sure it's unsorted
        // WTF - this could result in yet another sorted iteration....
        tfit = new UnsortedTermFreqIteratorWrapper(tfit);
    }
    trie = new JaspellTernarySearchTrie();
    trie.setMatchAlmostDiff(editDistance);
    BytesRef spare;
    final CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.next()) != null) {
        final long weight = tfit.weight();
        if (spare.length == 0) {
            continue;
        }
        charsSpare.grow(spare.length);
        UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
        trie.put(charsSpare.toString(), Long.valueOf(weight));
    }
}
@Override
public void build(TermFreqIterator tfit) throws IOException {
    root = new TernaryTreeNode();
    // buffer first
    if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
        // make sure it's sorted and the comparator uses UTF16 sort order
        tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator());
    }
    ArrayList<String> tokens = new ArrayList<String>();
    ArrayList<Number> vals = new ArrayList<Number>();
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.next()) != null) {
        charsSpare.grow(spare.length);
        UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
        tokens.add(charsSpare.toString());
        vals.add(Long.valueOf(tfit.weight()));
    }
    autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
    int numIterations = atLeast(10000);
    for (int i = 0; i < numIterations; i++) {
        if (VERBOSE) {
            System.out.println("\nTEST: iter=" + i);
        }
        String s = _TestUtil.randomUnicodeString(random(), 100);
        TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            assertTrue(UnicodeUtil.validUTF16String(termAtt));
        }
        // release the stream; the original snippet leaked it
        ts.end();
        ts.close();
    }
}
/** initialize levenshtein DFAs up to maxDistance, if possible */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
    final List<CompiledAutomaton> runAutomata = dfaAtt.automata();
    //System.out.println("cached automata size: " + runAutomata.size());
    if (runAutomata.size() <= maxDistance
        && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
        LevenshteinAutomata builder = new LevenshteinAutomata(
            UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength),
            transpositions);
        for (int i = runAutomata.size(); i <= maxDistance; i++) {
            Automaton a = builder.toAutomaton(i);
            //System.out.println("compute automaton n=" + i);
            // constant prefix
            if (realPrefixLength > 0) {
                Automaton prefix = BasicAutomata.makeString(
                    UnicodeUtil.newString(termText, 0, realPrefixLength));
                a = BasicOperations.concatenate(prefix, a);
            }
            runAutomata.add(new CompiledAutomaton(a, true, false));
        }
    }
    return runAutomata;
}
/**
 * Returns a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @return a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @throws IOException if an {@link IOException} occurs
 */
public StemmerOverrideMap build() throws IOException {
    ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
    org.apache.lucene.util.fst.Builder<BytesRef> builder =
        new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
    final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
    IntsRef intsSpare = new IntsRef();
    final int size = hash.size();
    for (int i = 0; i < size; i++) {
        int id = sort[i];
        BytesRef bytesRef = hash.get(id, spare);
        UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
        builder.add(intsSpare, new BytesRef(outputValues.get(id)));
    }
    return new StemmerOverrideMap(builder.finish(), ignoreCase);
}
@Override
public boolean collect(BytesRef term, int count) {
    if (count < mincount) {
        return false;
    }
    if (offset > 0) {
        offset--;
        return false;
    }
    if (limit > 0) {
        UnicodeUtil.UTF8toUTF16(term, spare);
        res.add(spare.toString(), count);
        limit--;
    }
    return limit <= 0;
}
private static TInfo parseTerm(FunctionQParser fp) throws SyntaxError {
    TInfo tinfo = new TInfo();
    tinfo.indexedField = tinfo.field = fp.parseArg();
    tinfo.val = fp.parseArg();
    tinfo.indexedBytes = new BytesRef();

    FieldType ft = fp.getReq().getSchema().getFieldTypeNoEx(tinfo.field);
    if (ft == null) ft = new StrField();

    if (ft instanceof TextField) {
        // need to do analysis on the term
        String indexedVal = tinfo.val;
        Query q = ft.getFieldQuery(fp, fp.getReq().getSchema().getFieldOrNull(tinfo.field), tinfo.val);
        if (q instanceof TermQuery) {
            Term term = ((TermQuery) q).getTerm();
            tinfo.indexedField = term.field();
            indexedVal = term.text();
        }
        UnicodeUtil.UTF16toUTF8(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes);
    } else {
        ft.readableToIndexed(tinfo.val, tinfo.indexedBytes);
    }

    return tinfo;
}
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator(null);
    final CharsRef spare = new CharsRef();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        final int freq = (int) termsEnum.totalTermFreq();
        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
/**
 * Adds an input string and its stemmer override output to this builder.
 *
 * @param input the input char sequence
 * @param output the stemmer override output char sequence
 * @return <code>false</code> if the input has already been added to this builder, otherwise <code>true</code>
 */
public boolean add(CharSequence input, CharSequence output) {
    final int length = input.length();
    if (ignoreCase) {
        // convert on the fly to lowercase
        charsSpare.grow(length);
        final char[] buffer = charsSpare.chars;
        for (int i = 0; i < length; ) {
            i += Character.toChars(
                Character.toLowerCase(
                    Character.codePointAt(input, i)), buffer, i);
        }
        UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
    } else {
        UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
    }
    if (hash.add(spare) >= 0) {
        outputValues.add(output);
        return true;
    }
    return false;
}
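/*
 * A usage sketch tying add() and the build() method shown earlier together
 * (hedged: this assumes Lucene's StemmerOverrideFilter.Builder API; the
 * helper name and example keys are illustrative only):
 */
static StemmerOverrideMap buildOverrides() throws IOException {
    StemmerOverrideFilter.Builder b = new StemmerOverrideFilter.Builder(true); // ignoreCase
    b.add("birds", "bird");  // returns true: first time this key is added
    b.add("BIRDS", "bird");  // returns false: lowercases to the same key as "birds"
    return b.build();
}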
/** finds the smallest Lev(n) DFA that accepts the term. */
@Override
protected AcceptStatus accept(BytesRef term) {
    //System.out.println("AFTE.accept term=" + term);
    int ed = matchers.length - 1;

    // we are wrapping either an intersect() TermsEnum or an AutomatonTermsEnum,
    // so we know the outer DFA always matches.
    // now compute exact edit distance
    while (ed > 0) {
        if (matches(term, ed - 1)) {
            ed--;
        } else {
            break;
        }
    }
    //System.out.println("CHECK term=" + term.utf8ToString() + " ed=" + ed);

    // scale to a boost and return (if similarity > minSimilarity)
    if (ed == 0) { // exact match
        boostAtt.setBoost(1.0F);
        //System.out.println("  yes");
        return AcceptStatus.YES;
    } else {
        final int codePointCount = UnicodeUtil.codePointCount(term);
        final float similarity = 1.0f - ((float) ed / (float) (Math.min(codePointCount, termLength)));
        if (similarity > minSimilarity) {
            boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
            //System.out.println("  yes");
            return AcceptStatus.YES;
        } else {
            return AcceptStatus.NO;
        }
    }
}
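/*
 * A worked instance of the scaling above (illustrative numbers, not from the
 * original source): for a 5-code-point term at edit distance ed = 1,
 * similarity = 1 - 1/5 = 0.8; with minSimilarity = 0.5 the boost becomes
 * (0.8 - 0.5) * scale_factor. The same arithmetic as a standalone sketch:
 */
static float fuzzyBoost(int ed, int codePointCount, int termLength,
                        float minSimilarity, float scaleFactor) {
    float similarity = 1.0f - ((float) ed / Math.min(codePointCount, termLength));
    // below the cutoff the term would be rejected; 0 stands in for "no boost"
    return similarity > minSimilarity ? (similarity - minSimilarity) * scaleFactor : 0f;
}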
/** Decompress the byte array previously returned by
 *  compressString back into a String */
public static String decompressString(byte[] value, int offset, int length) throws DataFormatException {
    final byte[] bytes = decompress(value, offset, length);
    final char[] result = new char[bytes.length];
    final int len = UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.length, result);
    return new String(result, 0, len);
}
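/*
 * A round-trip sketch for the decompress path above (hedged: this assumes
 * Lucene's CompressionTools, whose compressString/decompressString pair the
 * method mirrors). The char[] sized to bytes.length is always large enough
 * because UTF-16 never needs more code units than UTF-8 needs bytes.
 */
static boolean roundTrips(String s) throws DataFormatException {
    byte[] packed = CompressionTools.compressString(s);
    return s.equals(CompressionTools.decompressString(packed));
}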
static String brToString(BytesRef br) {
    if (br == null) {
        return "null";
    } else if (UnicodeUtil.BIG_TERM.bytesEquals(br)) {
        return "[UnicodeUtil.BIG_TERM]";
    } else {
        return br.utf8ToString();
    }
}
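/*
 * BIG_TERM recurs throughout these snippets (the sort test, decrementKey,
 * targetKeyInit) as an upper-bound sentinel: a BytesRef of bytes that never
 * occur in valid UTF-8, so it compares greater than any real term and
 * descending iteration can start "past the end". A quick illustrative check
 * (assumption: BytesRef's natural ordering is unsigned binary order):
 */
static void bigTermSentinelDemo() {
    assert UnicodeUtil.BIG_TERM.compareTo(new BytesRef("any ordinary term")) > 0;
}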
@Override
public int codePoint(int index) {
    // FIXME: is this the correct behaviour?
    // (note: this decodes only a single UTF-8 byte, so code points encoded
    // as multi-byte sequences cannot be recovered here)
    this.tmpByte[0] = this.contents.bytes[index];
    UnicodeUtil.UTF8toUTF16(this.tmpByte, 0, 1, this.tmpChar);
    return this.tmpChar[0] & 0xFFFF;
}
/**
 * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
 * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
 * <code>minSimilarity</code>.
 * <p>
 * After calling the constructor the enumeration is already pointing to the first
 * valid term if such a term exists.
 *
 * @throws IOException If there is a low-level I/O error.
 */
public LinearFuzzyTermsEnum() throws IOException {
    super(terms.iterator(null));

    this.text = new int[termLength - realPrefixLength];
    System.arraycopy(termText, realPrefixLength, text, 0, text.length);
    final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
    prefixBytesRef = new BytesRef(prefix);
    this.d = new int[this.text.length + 1];
    this.p = new int[this.text.length + 1];

    setInitialSeekTerm(prefixBytesRef);
}
/** Returns random string, including full unicode range. */
public static String randomRegexp(Random r) {
    while (true) {
        String regexp = randomRegexpString(r);
        // we will also generate some undefined unicode queries
        if (!UnicodeUtil.validUTF16String(regexp)) continue;
        try {
            new RegExp(regexp, RegExp.NONE);
            return regexp;
        } catch (Exception e) {
            // not a parseable regexp: try another
        }
    }
}
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
    if (!isValidUnicode) {
        return term.toString();
    } else if (inputMode == 0) {
        // utf8
        return toBytesRef(term).utf8ToString() + " " + term;
    } else {
        // utf32
        return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
    }
}
public void testAllUnicodeChars() throws Throwable {
    CharsRefBuilder utf16 = new CharsRefBuilder();
    char[] chars = new char[2];
    for (int ch = 0; ch < 0x0010FFFF; ch++) {
        if (ch == 0xd800)
            // Skip invalid code points
            ch = 0xe000;

        int len = 0;
        if (ch <= 0xffff) {
            chars[len++] = (char) ch;
        } else {
            chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
            chars[len++] = (char) (((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
        }

        BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));

        String s1 = new String(chars, 0, len);
        String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
        assertEquals("codepoint " + ch, s1, s2);

        utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
        assertEquals("codepoint " + ch, s1, utf16.toString());

        byte[] b = s1.getBytes(StandardCharsets.UTF_8);
        assertEquals(utf8.length, b.length);
        for (int j = 0; j < utf8.length; j++)
            assertEquals(utf8.bytes[j], b[j]);
    }
}
private void assertAutomaton(Automaton automaton) throws Exception {
    CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
    ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
    final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

    int num = atLeast(1000);
    for (int i = 0; i < num; i++) {
        final String string;
        if (random().nextBoolean()) {
            // likely not accepted
            string = TestUtil.randomUnicodeString(random());
        } else {
            // will be accepted
            int[] codepoints = ras.getRandomAcceptedString(random());
            try {
                string = UnicodeUtil.newString(codepoints, 0, codepoints.length);
            } catch (Exception e) {
                System.out.println(codepoints.length + " codepoints:");
                for (int j = 0; j < codepoints.length; j++) {
                    System.out.println("  " + Integer.toHexString(codepoints[j]));
                }
                throw e;
            }
        }
        byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
        assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length));
    }
}
@Override
public void writeStr(String name, String val, boolean needsEscaping) throws IOException {
    // serialized PHP strings don't need to be escaped at all, however the
    // string size reported needs to be the number of bytes rather than chars.
    utf8 = ArrayUtil.grow(utf8, val.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
    final int nBytes = UnicodeUtil.UTF16toUTF8(val, 0, val.length(), utf8);

    writer.write("s:");
    writer.write(Integer.toString(nBytes));
    writer.write(":\"");
    writer.write(val);
    writer.write("\";");
}
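/*
 * Why the byte count matters above (an illustration added here, not from the
 * original source): PHP's serialize() format counts bytes, so a 5-char string
 * like "héllo" (6 UTF-8 bytes) must be written as s:6:"héllo"; — writing s:5
 * would make PHP's unserialize() stop one byte short.
 */
static int phpStringLength(String val) {
    // the same count the method above gets back from UnicodeUtil.UTF16toUTF8
    return val.getBytes(StandardCharsets.UTF_8).length; // phpStringLength("héllo") == 6
}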