public void testCountPositions() throws IOException {
    // We're looking to make sure that we:
    Token t1 = new Token();      // Don't count tokens without an increment
    t1.setPositionIncrement(0);
    Token t2 = new Token();
    t2.setPositionIncrement(1);  // Count normal tokens with one increment
    Token t3 = new Token();
    t3.setPositionIncrement(2);  // Count funny tokens with more than one increment
    int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
    Token[] tokens = new Token[] {t1, t2, t3};
    Collections.shuffle(Arrays.asList(tokens), random());
    final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
    // TODO: we have no CannedAnalyzer?
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new MockTokenizer(), tokenStream);
        }
    };
    // 0 + 1 + 2 + 4 == 7
    assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}
@Override
public Token next(Token reusableToken) throws IOException {
    Token token = reusableToken;
    if (tokenizer() != null) {
        Token t = tokenizer().next(token);
        if (t != null) {
            return t;
        }
    }
    char[] sent = new char[] {};
    do {
        read = input.read(ioBuffer);
        if (read > 0) {
            sent = add(sent, ioBuffer, read);
        }
    } while (read != -1);
    if (sent.length == 0) {
        return null;
    }
    if (tokenizer() == null) {
        tokenizer = new Tknzr(sent);
    } else {
        tokenizer().reset(sent);
    }
    return tokenizer().next(token);
}
@Override
public Token next(Token reusableToken) throws IOException {
    Token token = reusableToken;
    if (elementTokenizer() != null) {
        Token t = elementTokenizer().next(token);
        if (t != null) {
            return t;
        }
    }
    char[] sent = new char[] {};
    do {
        read = input.read(ioBuffer);
        if (read > 0) {
            sent = add(sent, ioBuffer, read);
        }
    } while (read != -1);
    if (sent.length == 0) {
        return null;
    }
    if (elementTokenizer() == null) {
        elementTokenizer = new JsonSentenceParser(compressPayload);
    }
    elementTokenizer().parse(String.valueOf(sent));
    return elementTokenizer().next(token);
}
public Token next(Token token) {
    if (currentPos == 0) {
        return null;
    }
    if (tokenPos <= currentPos) {
        token.setTermBuffer(sentence, textPositions[2 * tokenPos],
                textPositions[2 * tokenPos + 1] - textPositions[2 * tokenPos]);
        // Pack the int payload for this token into four bytes.
        // Note the non-sequential byte order: b[0]=bits 16-23, b[1]=bits 24-31, b[2]=bits 8-15, b[3]=bits 0-7.
        Payload p = new Payload();
        byte[] b = new byte[4];
        b[0] = (byte) ((payloads[tokenPos] >>> 16) & 255);
        b[1] = (byte) ((payloads[tokenPos] >>> 24) & 255);
        b[2] = (byte) ((payloads[tokenPos] >>> 8) & 255);
        b[3] = (byte) (payloads[tokenPos] & 255);
        p.setData(b);
        token.setPayload(p);
        tokenPos++;
        return token;
    }
    return null;
}
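// Minimal companion sketch (not part of the original class): decodes a payload
// produced with the byte layout used in next(Token) above back into the original int.
static int decodePayload(byte[] b) {
    return ((b[1] & 0xFF) << 24)   // bits 24-31 were stored in b[1]
         | ((b[0] & 0xFF) << 16)   // bits 16-23 were stored in b[0]
         | ((b[2] & 0xFF) << 8)    // bits 8-15 were stored in b[2]
         |  (b[3] & 0xFF);         // bits 0-7 were stored in b[3]
}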
private void printlnToken(String txt, Analyzer analyzer) throws IOException {
    System.out.println("---------" + txt.length() + "\n" + txt);
    TokenStream ts = analyzer.tokenStream("text", new StringReader(txt));
    /* // Lucene 2.9 and earlier
    for (Token t = new Token(); (t = ts.next(t)) != null;) {
        System.out.println(t);
    }*/
    /*while (ts.incrementToken()) {
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
        System.out.println("(" + termAtt.term() + "," + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ",type=" + typeAtt.type() + ")");
    }*/
    for (Token t = new Token(); (t = TokenUtils.nextToken(ts, t)) != null;) {
        System.out.println(t);
    }
}
@SuppressWarnings("unused") SetDictionary(String words, Analyzer analyzer) throws IOException { wordSet = new HashSet<String>(); if (words != null) { TokenStream tokenStream = analyzer.tokenStream(NodeDocument.TEXT_FIELD, new StringReader(words)); Token reusableToken = new Token(); Token nextToken = null; //while ((nextToken = tokenStream.next(reusableToken)) != null) { //String term = nextToken.term(); //if (term != null) { //wordSet.add(term); //} //} } }
@Override
public Token next() throws IOException {
    if (segbuf == null) {
        while (segbuf == null || segbuf.length == 0) {
            String line = bufreader.readLine();
            if (line == null) {
                return null;
            }
            segbuf = segmentor.segment(line);
        }
        currentSeg = 0;
    }
    Token t = new Token(segbuf[currentSeg], currentPos, currentPos + segbuf[currentSeg].length());
    currentPos += segbuf[currentSeg].length();
    currentSeg++;
    if (currentSeg >= segbuf.length) {
        segbuf = null;
    }
    return t;
}
public void testGetToken() throws IOException {
    String content = "我们的生活\n很美好";
    String[] str = { "我们", "们的", "的生", "生活", "很美", "美好" };
    StringReader reader = new StringReader(content);
    WordSegment ws = new BigramWordSegment();
    WordChineseTokenizer tokenizer = new WordChineseTokenizer(ws, reader);
    LinkedList<Token> results = new LinkedList<Token>();
    Token t;
    while ((t = tokenizer.next()) != null) {
        results.add(t);
    }
    Assert.assertEquals(str.length, results.size());
    for (int i = 0; i < results.size(); i++) {
        Assert.assertEquals(str[i], results.get(i).termText());
    }
}
private List<LookupResult> getLookupResults(SpellingOptions options, Token currentToken) throws IOException {
    CharsRef scratch = new CharsRef();
    scratch.chars = currentToken.buffer();
    scratch.offset = 0;
    scratch.length = currentToken.length();
    boolean onlyMorePopular = (options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR)
        && !(lookup instanceof WFSTCompletionLookup)
        && !(lookup instanceof AnalyzingSuggester);
    List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count);
    if (suggestions == null || suggestions.size() == 0) {
        return null;
    }
    return suggestions;
}
@Override
public boolean incrementToken() throws IOException {
    if (index >= tokens.length) {
        return false;
    } else {
        clearAttributes();
        Token token = tokens[index++];
        termAtt.setEmpty().append(token);
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        posIncAtt.setPositionIncrement(token.getPositionIncrement());
        flagsAtt.setFlags(token.getFlags());
        typeAtt.setType(token.type());
        payloadAtt.setPayload(token.getPayload());
        return true;
    }
}
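// Usage sketch (assumed, not from the original source): a stream implementing
// incrementToken() as above is normally drained through its attributes.
static List<String> collectTerms(TokenStream ts) throws IOException {
    List<String> terms = new ArrayList<>();
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        terms.add(term.toString());
    }
    ts.end();
    ts.close();
    return terms;
}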
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    SpellingResult result = new SpellingResult();
    // just spit back out the results

    // sort the keys to make ordering predictable
    Iterator<String> iterator = options.customParams.getParameterNamesIterator();
    List<String> lst = new ArrayList<>();
    while (iterator.hasNext()) {
        lst.add(iterator.next());
    }
    Collections.sort(lst);

    int i = 0;
    for (String name : lst) {
        String value = options.customParams.get(name);
        result.add(new Token(name, i, i + 1), Collections.singletonList(value));
        i += 2;
    }
    return result;
}
protected void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare,
                                       int[] positionIncrements, String[] types) throws IOException {
    String[] text = new String[tokensToCompare.length];
    int[] startOffsets = new int[tokensToCompare.length];
    int[] endOffsets = new int[tokensToCompare.length];
    for (int i = 0; i < tokensToCompare.length; i++) {
        text[i] = new String(tokensToCompare[i].buffer(), 0, tokensToCompare[i].length());
        startOffsets[i] = tokensToCompare[i].startOffset();
        endOffsets[i] = tokensToCompare[i].endOffset();
    }
    assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
}
@Test
public void testUnicode() {
    SpellingQueryConverter converter = new SpellingQueryConverter();
    converter.init(new NamedList());
    converter.setAnalyzer(new WhitespaceAnalyzer());

    // Chinese text value
    Collection<Token> tokens = converter.convert("text_field:我购买了道具和服装。");
    assertTrue("tokens is null and it shouldn't be", tokens != null);
    assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());

    tokens = converter.convert("text_购field:我购买了道具和服装。");
    assertTrue("tokens is null and it shouldn't be", tokens != null);
    assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());

    tokens = converter.convert("text_field:我购xyz买了道具和服装。");
    assertTrue("tokens is null and it shouldn't be", tokens != null);
    assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
}
public void testLegalbutVeryLargePositions() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, 3);
    t1.setPositionIncrement(Integer.MAX_VALUE - 500);
    if (random().nextBoolean()) {
        t1.setPayload(new BytesRef(new byte[] { 0x1 }));
    }
    TokenStream overflowingTokenStream = new CannedTokenStream(new Token[] { t1 });
    Field field = new TextField("foo", overflowingTokenStream);
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
}
public void testLegalbutVeryLargeOffsets() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, Integer.MAX_VALUE - 500);
    if (random().nextBoolean()) {
        t1.setPayload(new BytesRef("test"));
    }
    Token t2 = new Token("foo", Integer.MAX_VALUE - 500, Integer.MAX_VALUE);
    TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", tokenStream, ft);
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
}
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
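// Illustrative call (hypothetical text, offset, and flags value): gather the
// analyzed tokens for one query string via the analyze(...) method above.
protected Collection<Token> analyzeExample() throws IOException {
    Collection<Token> out = new ArrayList<>();
    analyze(out, "lucene spel check", 0, 0);
    return out;
}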
public void testTextFieldString() throws Exception {
    Field[] fields = new Field[] {
        new TextField("foo", "bar", Field.Store.NO),
        new TextField("foo", "bar", Field.Store.YES)
    };

    for (Field field : fields) {
        field.setBoost(5f);
        trySetByteValue(field);
        trySetBytesValue(field);
        trySetBytesRefValue(field);
        trySetDoubleValue(field);
        trySetIntValue(field);
        trySetFloatValue(field);
        trySetLongValue(field);
        trySetReaderValue(field);
        trySetShortValue(field);
        field.setStringValue("baz");
        field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));

        assertEquals("baz", field.stringValue());
        assertEquals(5f, field.boost(), 0f);
    }
}
public void testTextFieldReader() throws Exception {
    Field field = new TextField("foo", new StringReader("bar"));
    field.setBoost(5f);
    trySetByteValue(field);
    trySetBytesValue(field);
    trySetBytesRefValue(field);
    trySetDoubleValue(field);
    trySetIntValue(field);
    trySetFloatValue(field);
    trySetLongValue(field);
    field.setReaderValue(new StringReader("foobar"));
    trySetShortValue(field);
    trySetStringValue(field);
    field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));

    assertNotNull(field.readerValue());
    assertEquals(5f, field.boost(), 0f);
}
@Test
public void testMultipleClauses() {
    SpellingQueryConverter converter = new SpellingQueryConverter();
    converter.init(new NamedList());
    converter.setAnalyzer(new WhitespaceAnalyzer());

    // two field:value pairs should give two tokens
    Collection<Token> tokens = converter.convert("买text_field:我购买了道具和服装。 field2:bar");
    assertTrue("tokens is null and it shouldn't be", tokens != null);
    assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());

    // a field:value pair and a search term should give two tokens
    tokens = converter.convert("text_field:我购买了道具和服装。 bar");
    assertTrue("tokens is null and it shouldn't be", tokens != null);
    assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());
}
public void testBasic() throws IOException {
    Index index = new Index("test", "_na_");
    String name = "ngr";
    Settings indexSettings = newAnalysisSettingsBuilder().build();
    IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
    Settings settings = newAnalysisSettingsBuilder().build();

    // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
        token("wtf", 1, 5, 0, 3),
        token("what", 0, 1, 0, 3),
        token("wow", 0, 3, 0, 3),
        token("the", 1, 1, 0, 3),
        token("fudge", 1, 3, 0, 3),
        token("that's", 1, 1, 0, 3),
        token("funny", 1, 1, 0, 3),
        token("happened", 1, 1, 4, 12)
    });

    TokenStream tokens = new FlattenGraphTokenFilterFactory(indexProperties, null, name, settings).create(in);

    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
    assertTokenStreamContents(tokens,
        new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
        new int[] {0, 0, 0, 0, 0, 0, 0, 4},
        new int[] {3, 3, 3, 3, 3, 3, 3, 12},
        new int[] {1, 0, 0, 1, 0, 1, 0, 1},
        new int[] {3, 1, 1, 1, 1, 1, 1, 1},
        12);
}
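// Assumed shape of the token(...) helper used in the test above (the helper itself
// is not shown in this snippet): term text, position increment, position length,
// and start/end offsets.
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
    Token t = new Token(term, startOffset, endOffset);
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
}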
/**
 * @param singleMatch   List<String>, the sequence of strings to match
 * @param replacement   List<Token>, the list of tokens to use on a match
 * @param includeOrig   sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
 * @param mergeExisting merge the replacement tokens with any other mappings that exist
 */
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
    SlowSynonymMap currMap = this;
    for (String str : singleMatch) {
        if (currMap.submap == null) {
            // for now hardcode at 4.0, as it's what the old code did.
            // would be nice to fix, but shouldn't store a version in each submap!!!
            currMap.submap = new CharArrayMap<>(Version.LUCENE_CURRENT, 1, ignoreCase());
        }

        SlowSynonymMap map = currMap.submap.get(str);
        if (map == null) {
            map = new SlowSynonymMap();
            map.flags |= flags & IGNORE_CASE;
            currMap.submap.put(str, map);
        }

        currMap = map;
    }

    if (currMap.synonyms != null && !mergeExisting) {
        throw new IllegalArgumentException("SynonymFilter: there is already a mapping for " + singleMatch);
    }
    List<Token> superset = currMap.synonyms == null ? replacement
        : mergeTokens(Arrays.asList(currMap.synonyms), replacement);
    currMap.synonyms = superset.toArray(new Token[superset.size()]);
    if (includeOrig) {
        currMap.flags |= INCLUDE_ORIG;
    }
}
/** Produces a List<Token> from a List<String> */
public static List<Token> makeTokens(List<String> strings) {
    List<Token> ret = new ArrayList<>(strings.size());
    for (String str : strings) {
        Token newTok = new Token(str, 0, 0, "SYNONYM");
        ret.add(newTok);
    }
    return ret;
}
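// Illustrative use (hypothetical input list): makeTokens yields one zero-offset
// "SYNONYM"-typed token per input string.
static void makeTokensExample() {
    List<Token> toks = makeTokens(Arrays.asList("i", "pod", "ipod"));
    assert toks.size() == 3;
    assert "SYNONYM".equals(toks.get(0).type());
}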
private void setCurrentToken(Token token) {
    if (token == null) {
        return;
    }
    clearAttributes();
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    posIncrAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    typeAtt.setType(token.type());
    payloadAtt.setPayload(token.getPayload());
}
private Token getNextPrefixInputToken(Token token) throws IOException {
    if (!prefix.incrementToken()) {
        return null;
    }
    token.copyBuffer(p_termAtt.buffer(), 0, p_termAtt.length());
    token.setPositionIncrement(p_posIncrAtt.getPositionIncrement());
    token.setFlags(p_flagsAtt.getFlags());
    token.setOffset(p_offsetAtt.startOffset(), p_offsetAtt.endOffset());
    token.setType(p_typeAtt.type());
    token.setPayload(p_payloadAtt.getPayload());
    return token;
}
private Token getNextSuffixInputToken(Token token) throws IOException {
    if (!suffix.incrementToken()) {
        return null;
    }
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setPositionIncrement(posIncrAtt.getPositionIncrement());
    token.setFlags(flagsAtt.getFlags());
    token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    return token;
}
public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) {
    super(suffix);
    prefix = new PrefixAwareTokenFilter(prefix, input) {
        @Override
        public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
            return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken);
        }
    };
    this.suffix = new PrefixAwareTokenFilter(prefix, suffix) {
        @Override
        public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
            return PrefixAndSuffixAwareTokenFilter.this.updateSuffixToken(suffixToken, lastInputToken);
        }
    };
}
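// Illustrative wiring (assumed marker tokens "^" and "$"): surround an input
// stream with single prefix and suffix tokens via SingleTokenTokenStream.
static TokenStream wrapWithMarkersExample(TokenStream input) {
    return new PrefixAndSuffixAwareTokenFilter(
            new SingleTokenTokenStream(new Token("^", 0, 0)),
            input,
            new SingleTokenTokenStream(new Token("$", 0, 0)));
}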
public SingleTokenTokenStream(Token token) {
    super(Token.TOKEN_ATTRIBUTE_FACTORY);

    assert token != null;
    this.singleToken = token.clone();

    tokenAtt = (AttributeImpl) addAttribute(CharTermAttribute.class);
    assert (tokenAtt instanceof Token);
}
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    nodesPosition++;
    if (nodesPosition < nodes.size()) {
        reusableToken.clear();
        Node node = nodes.get(nodesPosition);
        reusableToken.setTermBuffer(node.name);
        reusableToken.setPayload(node.getPayload());
        return reusableToken;
    }
    return null;
}
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    reusableToken.clear();
    if (spacePending) {
        return setReusableTokenFromLocal(reusableToken, processSpace(lastWordBeginPos));
    }
    int i = lastReadPosition;
    boolean closeFound = false;
    while (i < buffer.length) {
        char c = buffer[i];
        if ('(' == c) {
            if (encounteredOpen) {
                openPositions.add(cOpen.size() - 1);
            }
            cOpen.add(i);
            encounteredOpen = true;
        } else if (Character.isWhitespace(c)) {
            cSpace.add(i);
            lastWordBeginPos = cOpen.get(cOpen.size() - 1);
            spacePending = true;
            encounteredOpen = false;
        } else if (')' == c) {
            cClose.add(i);
            closeFound = true;
            encounteredOpen = false;
            break;
        }
        i++;
    }
    lastReadPosition = i;
    if (closeFound) {
        lastReadPosition++;
        return setReusableTokenFromLocal(reusableToken, processClose());
    }
    return null;
}
@Override
public Token next(Token reusableToken) throws IOException {
    Token t = actualParser.next(reusableToken);
    if (t != null) {
        return t;
    }
    int readSoFar = 0;
    int read;
    do {
        read = input.read(ioBuffer);
        if (read > 0) {
            while (readSoFar + read > sent.length) {
                char[] oldSent = sent;
                sent = new char[sent.length + 512];
                System.arraycopy(oldSent, 0, sent, 0, readSoFar);
            }
            System.arraycopy(ioBuffer, 0, sent, readSoFar, read);
            readSoFar += read;
        }
    } while (read != -1);
    if (readSoFar == 0) {
        return null;
    }
    try {
        actualParser.reset(new String(sent, 0, readSoFar));
    } catch (ParseException e) {
        return null;
    }
    return actualParser.next(reusableToken);
}
public void testLong() { String jsonString = "{\"n\":\"S\", \"i\":\"0_32_0_65\", \"c\":[{\"n\":\"NP\", \"i\":\"0_2_1_64\", \"c\":[{\"n\":\"NP\", \"i\":\"0_1_2_4\", \"c\":[{\"n\":\"NNP\", \"i\":\"0_1_3_1\", \"c\":[{\"n\":\"Arafat\", \"i\":\"0_1_4_0\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"1_2_2_4\", \"c\":[{\"n\":\"PRP\", \"i\":\"1_2_3_3\", \"c\":[{\"n\":\"himself\", \"i\":\"1_2_4_2\", \"c\":[]}]}]}]}, {\"n\":\"VP\", \"i\":\"2_30_1_64\", \"c\":[{\"n\":\"VBD\", \"i\":\"2_3_2_61\", \"c\":[{\"n\":\"said\", \"i\":\"2_3_3_5\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"3_30_2_61\", \"c\":[{\"n\":\"S\", \"i\":\"3_30_3_60\", \"c\":[{\"n\":\"NP\", \"i\":\"3_5_4_59\", \"c\":[{\"n\":\"DT\", \"i\":\"3_4_5_8\", \"c\":[{\"n\":\"the\", \"i\":\"3_4_6_6\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"4_5_5_8\", \"c\":[{\"n\":\"award\", \"i\":\"4_5_6_7\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"5_30_4_59\", \"c\":[{\"n\":\"VBD\", \"i\":\"5_6_5_58\", \"c\":[{\"n\":\"was\", \"i\":\"5_6_6_9\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"6_7_5_58\", \"c\":[{\"n\":\"not\", \"i\":\"6_7_6_10\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"7_30_5_58\", \"c\":[{\"n\":\"VBN\", \"i\":\"7_8_6_57\", \"c\":[{\"n\":\"granted\", \"i\":\"7_8_7_11\", \"c\":[]}]}, {\"n\":\"PP\", \"i\":\"8_30_6_57\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_7_56\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_8_12\", \"c\":[]}]}, {\"n\":\"TO\", \"i\":\"9_10_7_56\", \"c\":[{\"n\":\"to\", \"i\":\"9_10_8_13\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"10_30_7_56\", \"c\":[{\"n\":\"NP\", \"i\":\"10_11_8_55\", \"c\":[{\"n\":\"NN\", \"i\":\"10_11_9_15\", \"c\":[{\"n\":\"crown\", \"i\":\"10_11_10_14\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"11_26_8_55\", \"c\":[{\"n\":\"DT\", \"i\":\"11_12_9_44\", \"c\":[{\"n\":\"an\", \"i\":\"11_12_10_16\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"12_13_9_44\", \"c\":[{\"n\":\"endeavor\", \"i\":\"12_13_10_17\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"13_26_9_44\", \"c\":[{\"n\":\"IN\", \"i\":\"13_14_10_43\", \"c\":[{\"n\":\"that\", \"i\":\"13_14_11_18\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"14_26_10_43\", \"c\":[{\"n\":\"NP\", \"i\":\"14_15_11_42\", \"c\":[{\"n\":\"PRP\", \"i\":\"14_15_12_20\", \"c\":[{\"n\":\"we\", \"i\":\"14_15_13_19\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"15_26_11_42\", \"c\":[{\"n\":\"VP\", \"i\":\"15_17_12_41\", \"c\":[{\"n\":\"VBP\", \"i\":\"15_16_13_24\", \"c\":[{\"n\":\"have\", \"i\":\"15_16_14_21\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"16_17_13_24\", \"c\":[{\"n\":\"VBN\", \"i\":\"16_17_14_23\", \"c\":[{\"n\":\"completed\", \"i\":\"16_17_15_22\", \"c\":[]}]}]}]}, {\"n\":\"CC\", \"i\":\"17_18_12_41\", \"c\":[{\"n\":\"but\", \"i\":\"17_18_13_25\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"18_19_12_41\", \"c\":[{\"n\":\"rather\", \"i\":\"18_19_13_26\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"19_26_12_41\", \"c\":[{\"n\":\"TO\", \"i\":\"19_20_13_40\", \"c\":[{\"n\":\"to\", \"i\":\"19_20_14_27\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"20_26_13_40\", \"c\":[{\"n\":\"VB\", \"i\":\"20_21_14_39\", \"c\":[{\"n\":\"encourage\", \"i\":\"20_21_15_28\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"21_26_14_39\", \"c\":[{\"n\":\"NP\", \"i\":\"21_22_15_38\", \"c\":[{\"n\":\"PRP\", \"i\":\"21_22_16_30\", \"c\":[{\"n\":\"us\", \"i\":\"21_22_17_29\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"22_26_15_38\", \"c\":[{\"n\":\"TO\", \"i\":\"22_23_16_37\", \"c\":[{\"n\":\"to\", \"i\":\"22_23_17_31\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"23_26_16_37\", \"c\":[{\"n\":\"VB\", \"i\":\"23_24_17_36\", \"c\":[{\"n\":\"continue\", \"i\":\"23_24_18_32\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"24_26_17_36\", 
\"c\":[{\"n\":\"DT\", \"i\":\"24_25_18_35\", \"c\":[{\"n\":\"a\", \"i\":\"24_25_19_33\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"25_26_18_35\", \"c\":[{\"n\":\"road\", \"i\":\"25_26_19_34\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\"SBAR\", \"i\":\"26_30_8_55\", \"c\":[{\"n\":\"WHNP\", \"i\":\"26_27_9_54\", \"c\":[{\"n\":\"WDT\", \"i\":\"26_27_10_46\", \"c\":[{\"n\":\"which\", \"i\":\"26_27_11_45\", \"c\":[]}]}]}, {\"n\":\"S\", \"i\":\"27_30_9_54\", \"c\":[{\"n\":\"NP\", \"i\":\"27_28_10_53\", \"c\":[{\"n\":\"PRP\", \"i\":\"27_28_11_48\", \"c\":[{\"n\":\"we\", \"i\":\"27_28_12_47\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"28_30_10_53\", \"c\":[{\"n\":\"VBP\", \"i\":\"28_29_11_52\", \"c\":[{\"n\":\"have\", \"i\":\"28_29_12_49\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"29_30_11_52\", \"c\":[{\"n\":\"VBN\", \"i\":\"29_30_12_51\", \"c\":[{\"n\":\"started\", \"i\":\"29_30_13_50\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\".\", \"i\":\"30_31_1_64\", \"c\":[{\"n\":\".\", \"i\":\"30_31_2_62\", \"c\":[]}]}, {\"n\":\"''\", \"i\":\"31_32_1_64\", \"c\":[{\"n\":\"''\", \"i\":\"31_32_2_63\", \"c\":[]}]}]}"; JsonSentenceParser parser = new JsonSentenceParser(false); parser.parse(jsonString); Token token = new Token(); parser.next(token); assertNotNull(token); assertEquals("S", token.term()); assertEquals(0, token.getPayload().byteAt(0)); assertEquals(32, token.getPayload().byteAt(1)); assertEquals(0, token.getPayload().byteAt(2)); assertEquals(65, token.getPayload().byteAt(3)); parser.next(token); assertEquals("NP", token.term()); assertEquals(0, token.getPayload().byteAt(0)); assertEquals(2, token.getPayload().byteAt(1)); assertEquals(1, token.getPayload().byteAt(2)); assertEquals(64, token.getPayload().byteAt(3)); parser.next(token); assertEquals("NP", token.term()); assertEquals(0, token.getPayload().byteAt(0)); assertEquals(1, token.getPayload().byteAt(1)); assertEquals(2, token.getPayload().byteAt(2)); assertEquals(4, token.getPayload().byteAt(3)); parser.next(token); assertEquals("NNP", token.term()); assertEquals(0, token.getPayload().byteAt(0)); assertEquals(1, token.getPayload().byteAt(1)); assertEquals(3, token.getPayload().byteAt(2)); assertEquals(1, token.getPayload().byteAt(3)); }
public void testSentenceContainingEscapedDoubleQuotes() { String jsonString = "{\"n\":\"S\\\"\", \"i\":\"0_32_0_65\", \"c\":[{\"n\":\"NP\", \"i\":\"0_2_1_64\", \"c\":[{\"n\":\"NP\", \"i\":\"0_1_2_4\", \"c\":[{\"n\":\"NNP\", \"i\":\"0_1_3_1\", \"c\":[{\"n\":\"Arafat\", \"i\":\"0_1_4_0\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"1_2_2_4\", \"c\":[{\"n\":\"PRP\", \"i\":\"1_2_3_3\", \"c\":[{\"n\":\"himself\", \"i\":\"1_2_4_2\", \"c\":[]}]}]}]}, {\"n\":\"VP\", \"i\":\"2_30_1_64\", \"c\":[{\"n\":\"VBD\", \"i\":\"2_3_2_61\", \"c\":[{\"n\":\"said\", \"i\":\"2_3_3_5\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"3_30_2_61\", \"c\":[{\"n\":\"S\", \"i\":\"3_30_3_60\", \"c\":[{\"n\":\"NP\", \"i\":\"3_5_4_59\", \"c\":[{\"n\":\"DT\", \"i\":\"3_4_5_8\", \"c\":[{\"n\":\"the\", \"i\":\"3_4_6_6\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"4_5_5_8\", \"c\":[{\"n\":\"award\", \"i\":\"4_5_6_7\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"5_30_4_59\", \"c\":[{\"n\":\"VBD\", \"i\":\"5_6_5_58\", \"c\":[{\"n\":\"was\", \"i\":\"5_6_6_9\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"6_7_5_58\", \"c\":[{\"n\":\"not\", \"i\":\"6_7_6_10\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"7_30_5_58\", \"c\":[{\"n\":\"VBN\", \"i\":\"7_8_6_57\", \"c\":[{\"n\":\"granted\", \"i\":\"7_8_7_11\", \"c\":[]}]}, {\"n\":\"PP\", \"i\":\"8_30_6_57\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_7_56\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_8_12\", \"c\":[]}]}, {\"n\":\"TO\", \"i\":\"9_10_7_56\", \"c\":[{\"n\":\"to\", \"i\":\"9_10_8_13\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"10_30_7_56\", \"c\":[{\"n\":\"NP\", \"i\":\"10_11_8_55\", \"c\":[{\"n\":\"NN\", \"i\":\"10_11_9_15\", \"c\":[{\"n\":\"crown\", \"i\":\"10_11_10_14\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"11_26_8_55\", \"c\":[{\"n\":\"DT\", \"i\":\"11_12_9_44\", \"c\":[{\"n\":\"an\", \"i\":\"11_12_10_16\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"12_13_9_44\", \"c\":[{\"n\":\"endeavor\", \"i\":\"12_13_10_17\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"13_26_9_44\", \"c\":[{\"n\":\"IN\", \"i\":\"13_14_10_43\", \"c\":[{\"n\":\"that\", \"i\":\"13_14_11_18\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"14_26_10_43\", \"c\":[{\"n\":\"NP\", \"i\":\"14_15_11_42\", \"c\":[{\"n\":\"PRP\", \"i\":\"14_15_12_20\", \"c\":[{\"n\":\"we\", \"i\":\"14_15_13_19\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"15_26_11_42\", \"c\":[{\"n\":\"VP\", \"i\":\"15_17_12_41\", \"c\":[{\"n\":\"VBP\", \"i\":\"15_16_13_24\", \"c\":[{\"n\":\"have\", \"i\":\"15_16_14_21\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"16_17_13_24\", \"c\":[{\"n\":\"VBN\", \"i\":\"16_17_14_23\", \"c\":[{\"n\":\"completed\", \"i\":\"16_17_15_22\", \"c\":[]}]}]}]}, {\"n\":\"CC\", \"i\":\"17_18_12_41\", \"c\":[{\"n\":\"but\", \"i\":\"17_18_13_25\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"18_19_12_41\", \"c\":[{\"n\":\"rather\", \"i\":\"18_19_13_26\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"19_26_12_41\", \"c\":[{\"n\":\"TO\", \"i\":\"19_20_13_40\", \"c\":[{\"n\":\"to\", \"i\":\"19_20_14_27\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"20_26_13_40\", \"c\":[{\"n\":\"VB\", \"i\":\"20_21_14_39\", \"c\":[{\"n\":\"encourage\", \"i\":\"20_21_15_28\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"21_26_14_39\", \"c\":[{\"n\":\"NP\", \"i\":\"21_22_15_38\", \"c\":[{\"n\":\"PRP\", \"i\":\"21_22_16_30\", \"c\":[{\"n\":\"us\", \"i\":\"21_22_17_29\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"22_26_15_38\", \"c\":[{\"n\":\"TO\", \"i\":\"22_23_16_37\", \"c\":[{\"n\":\"to\", \"i\":\"22_23_17_31\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"23_26_16_37\", \"c\":[{\"n\":\"VB\", \"i\":\"23_24_17_36\", \"c\":[{\"n\":\"continue\", \"i\":\"23_24_18_32\", \"c\":[]}]}, 
{\"n\":\"NP\", \"i\":\"24_26_17_36\", \"c\":[{\"n\":\"DT\", \"i\":\"24_25_18_35\", \"c\":[{\"n\":\"a\", \"i\":\"24_25_19_33\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"25_26_18_35\", \"c\":[{\"n\":\"road\", \"i\":\"25_26_19_34\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\"SBAR\", \"i\":\"26_30_8_55\", \"c\":[{\"n\":\"WHNP\", \"i\":\"26_27_9_54\", \"c\":[{\"n\":\"WDT\", \"i\":\"26_27_10_46\", \"c\":[{\"n\":\"which\", \"i\":\"26_27_11_45\", \"c\":[]}]}]}, {\"n\":\"S\", \"i\":\"27_30_9_54\", \"c\":[{\"n\":\"NP\", \"i\":\"27_28_10_53\", \"c\":[{\"n\":\"PRP\", \"i\":\"27_28_11_48\", \"c\":[{\"n\":\"we\", \"i\":\"27_28_12_47\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"28_30_10_53\", \"c\":[{\"n\":\"VBP\", \"i\":\"28_29_11_52\", \"c\":[{\"n\":\"have\", \"i\":\"28_29_12_49\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"29_30_11_52\", \"c\":[{\"n\":\"VBN\", \"i\":\"29_30_12_51\", \"c\":[{\"n\":\"started\", \"i\":\"29_30_13_50\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\".\", \"i\":\"30_31_1_64\", \"c\":[{\"n\":\".\", \"i\":\"30_31_2_62\", \"c\":[]}]}, {\"n\":\"''\", \"i\":\"31_32_1_64\", \"c\":[{\"n\":\"''\", \"i\":\"31_32_2_63\", \"c\":[]}]}]}"; JsonSentenceParser parser = new JsonSentenceParser(false); parser.parse(jsonString); Token token = new Token(); parser.next(token); assertNotNull(token); assertEquals("S\"", token.term()); assertEquals(0, token.getPayload().byteAt(0)); assertEquals(32, token.getPayload().byteAt(1)); assertEquals(0, token.getPayload().byteAt(2)); assertEquals(65, token.getPayload().byteAt(3)); }
private void assertPayload(Token token, int right, int left, int depth, int parent) {
    Payload payload = token.getPayload();
    assertEquals(right, payload.byteAt(0));
    assertEquals(left, payload.byteAt(1));
    assertEquals(depth, payload.byteAt(2));
    assertEquals(parent, payload.byteAt(3));
}
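// Illustrative call, matching the payload asserted for the root "S" node
// (id "0_32_0_65") in the JSON parser tests above:
// assertPayload(token, 0, 32, 0, 65);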
public CutLetterDigitFilter(TokenStream input) {
    super(input);

    reusableToken = new Token();
    termAtt = (CharTermAttribute) addAttribute(CharTermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
    Token token = new Token(oriToken.buffer(), termBufferOffset, termBufferLength,
            oriToken.startOffset() + termBufferOffset,
            oriToken.startOffset() + termBufferOffset + termBufferLength);

    if (type == Character.DECIMAL_DIGIT_NUMBER) {
        token.setType(Word.TYPE_DIGIT);
    } else {
        token.setType(Word.TYPE_LETTER);
    }

    tokenQueue.offer(token);
}
public final boolean incrementToken() throws IOException {
    clearAttributes();
    Token token = nextToken(reusableToken);
    if (token != null) {
        termAtt.copyBuffer(token.buffer(), 0, token.length());
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        typeAtt.setType(token.type());
        return true;
    } else {
        end();
        return false;
    }
}
public static List<String> toWords(String txt, Analyzer analyzer) {
    List<String> words = new ArrayList<String>();
    TokenStream ts;
    try {
        ts = analyzer.tokenStream("text", new StringReader(txt));
        for (Token t = new Token(); (t = TokenUtils.nextToken(ts, t)) != null;) {
            words.add(t.toString());
        }
    } catch (IOException e) {
        // swallow the exception and return whatever has been collected so far
    }
    return words;
}
public Token next() throws IOException {
    Token nextToken = input.next();
    if (nextToken == null) {
        return null;
    }

    String term = nextToken.termText();
    term = LatCyrUtils.toLatinUnaccented(term);

    // Note: the returned token's offsets are reset to [0, term.length())
    // rather than carried over from nextToken.
    return new Token(term, 0, term.length());
}
void addToken(float score) {
    if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
        int termStartOffset = offsetAtt.startOffset();
        int termEndOffset = offsetAtt.endOffset();
        if (numTokens == 0) {
            startOffset = matchStartOffset = termStartOffset;
            endOffset = matchEndOffset = termEndOffset;
            tot += score;
        } else {
            startOffset = Math.min(startOffset, termStartOffset);
            endOffset = Math.max(endOffset, termEndOffset);
            if (score > 0) {
                if (tot == 0) {
                    matchStartOffset = offsetAtt.startOffset();
                    matchEndOffset = offsetAtt.endOffset();
                } else {
                    matchStartOffset = Math.min(matchStartOffset, termStartOffset);
                    matchEndOffset = Math.max(matchEndOffset, termEndOffset);
                }
                tot += score;
            }
        }
        Token token = new Token(termStartOffset, termEndOffset);
        token.setEmpty().append(termAtt);
        tokens[numTokens] = token;
        scores[numTokens] = score;
        numTokens++;
    }
}
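// Illustrative trace (assumed inputs): calling addToken(1.0f) for a token at
// offsets [5, 10) and then addToken(0.0f) for one at [11, 15) leaves
// startOffset=5, endOffset=15, but matchStartOffset=5, matchEndOffset=10,
// because only scoring (score > 0) tokens extend the match window.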