@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Tokenizer tok = new WhitespaceTokenizer(); tok.setReader(new StringReader("dark sea green sea green")); final SynonymMap.Builder builder = new SynonymMap.Builder(true); addSynonym("dark sea green", "color", builder); addSynonym("green", "color", builder); addSynonym("dark sea", "color", builder); addSynonym("sea green", "color", builder); final SynonymMap synMap = builder.build(); final TokenStream ts = new SynonymFilter(tok, synMap, true); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); ts.reset(); int pos = -1; while (ts.incrementToken()) { pos += posIncrAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength()); } ts.end(); ts.close(); }
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
    int lastPosition = -1;
    int lastOffset = 0;
    for (String text : request.text()) {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);

            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    lastPosition = lastPosition + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
                    lastOffset + offset.startOffset(), lastOffset + offset.endOffset(),
                    posLen.getPositionLength(), type.type(), null));
            }
            stream.end();

            // Carry positions and offsets across the request's texts, so token
            // numbering stays monotonic from one value to the next.
            lastOffset += offset.endOffset();
            lastPosition += posIncr.getPositionIncrement();
            lastPosition += analyzer.getPositionIncrementGap(field);
            lastOffset += analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return tokens;
}
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            // lastPosition, lastOffset and tokens are instance fields, so state
            // accumulates across successive calls to this method.
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
                lastOffset + offset.startOffset(), lastOffset + offset.endOffset(),
                posLen.getPositionLength(), type.type(),
                extractExtendedAttributes(stream, includeAttributes)));
        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();
        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
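This variant passes extended attributes through an extractExtendedAttributes helper that isn't shown. A minimal sketch of what such a helper can look like, using Lucene's AttributeSource.reflectWith to walk every attribute on the stream; the filtering convention here is an assumption, not Elasticsearch's exact logic:

    // Hypothetical sketch: collect every attribute that is not already reported
    // as a first-class field of AnalyzeToken, optionally filtered by name.
    private static Map<String, Object> extractExtendedAttributes(TokenStream stream, Set<String> includeAttributes) {
        Map<String, Object> extended = new TreeMap<>();
        stream.reflectWith((attClass, key, value) -> {
            if (CharTermAttribute.class.isAssignableFrom(attClass)
                    || PositionIncrementAttribute.class.isAssignableFrom(attClass)
                    || OffsetAttribute.class.isAssignableFrom(attClass)
                    || TypeAttribute.class.isAssignableFrom(attClass)
                    || PositionLengthAttribute.class.isAssignableFrom(attClass)) {
                return; // already surfaced on the AnalyzeToken itself
            }
            if (includeAttributes == null || includeAttributes.isEmpty()
                    || includeAttributes.contains(attClass.getSimpleName().toLowerCase(Locale.ROOT))) {
                extended.put(attClass.getName() + "#" + key, value);
            }
        });
        return extended;
    }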
/**
 * Creates Lucene43NGramTokenFilter with given min and max n-grams.
 * @param input {@link org.apache.lucene.analysis.TokenStream} holding the input to be tokenized
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public Lucene43NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
    super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
    this.charUtils = CharacterUtils.getJava4Instance();
    if (minGram < 1) {
        throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram) {
        throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    // Stub attributes: this legacy (pre-4.4) filter deliberately ignores position
    // increments and lengths, so writes are no-ops and reads always return 0.
    posIncAtt = new PositionIncrementAttribute() {
        @Override
        public void setPositionIncrement(int positionIncrement) {}

        @Override
        public int getPositionIncrement() {
            return 0;
        }
    };
    posLenAtt = new PositionLengthAttribute() {
        @Override
        public void setPositionLength(int positionLength) {}

        @Override
        public int getPositionLength() {
            return 0;
        }
    };
}
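A hypothetical wiring of this legacy filter; because of the stubbed attributes above, it does not maintain correct positions for the grams it emits:

    // Hypothetical usage: 2- and 3-grams of a single keyword token.
    Tokenizer tok = new KeywordTokenizer();
    tok.setReader(new StringReader("lucene"));
    TokenStream grams = new Lucene43NGramTokenFilter(tok, 2, 3);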
private void setAttributes() {
    charTermAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    posLenAtt = addAttribute(PositionLengthAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    posAtt = addAttribute(PartOfSpeechAttribute.class);
    semanticClassAtt = addAttribute(SemanticClassAttribute.class);
}
protected Matcher<TokenStream> positionLength(final int expectedLength) {
    return new TypeSafeMatcher<TokenStream>() {
        @Override
        public void describeTo(Description description) {
            description.appendText("positionLength=").appendValue(expectedLength);
        }

        @Override
        protected boolean matchesSafely(TokenStream stream) {
            PositionLengthAttribute attr = stream.addAttribute(PositionLengthAttribute.class);
            return attr.getPositionLength() == expectedLength;
        }
    };
}
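The matcher reads the attribute's current value, so it is meant to be applied after the stream has been advanced to the token under test, e.g. (hypothetical test fragment):

    stream.reset();
    assertTrue(stream.incrementToken());   // position the stream on a token
    assertThat(stream, positionLength(2)); // then assert its posLen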
/**
 * If inputText is non-null, and the TokenStream has offsets,
 * we include the surface form in each arc's label.
 */
public TokenStreamToDot(String inputText, TokenStream in, PrintWriter out) {
    this.in = in;
    this.out = out;
    this.inputText = inputText;
    termAtt = in.addAttribute(CharTermAttribute.class);
    posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    if (in.hasAttribute(OffsetAttribute.class)) {
        offsetAtt = in.addAttribute(OffsetAttribute.class);
    } else {
        offsetAtt = null;
    }
}
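A hypothetical use of this class, rendering a stream's token graph as Graphviz DOT (toDot() consumes the stream and writes the digraph to the supplied writer):

    StringWriter sw = new StringWriter();
    new TokenStreamToDot("dark sea green", ts, new PrintWriter(sw)).toDot();
    System.out.println(sw); // paste the output into Graphviz to visualize the graph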
public static void printResultOfTokenStream(PrintStream out, TokenStream ts) throws IOException {
    CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAttr = ts.getAttribute(TypeAttribute.class);
    OffsetAttribute offAttr = ts.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAttr = ts.getAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAttr = ts.getAttribute(PositionLengthAttribute.class);
    ts.reset();
    Table<String, String, String> contentTable = Tables.newCustomTable(
        new LinkedHashMap<String, Map<String, String>>(),
        new Supplier<Map<String, String>>() {
            @Override
            public Map<String, String> get() {
                return Maps.newLinkedHashMap();
            }
        });
    int lineNo = 1;
    int pos = 0;
    while (ts.incrementToken()) {
        String lineId = lineNo + ".";
        contentTable.put(lineId, "term", termAttr.toString());
        contentTable.put(lineId, "type", typeAttr.type());
        contentTable.put(lineId, "startOffset", offAttr.startOffset() + "");
        contentTable.put(lineId, "endOffset", offAttr.endOffset() + "");
        contentTable.put(lineId, "posInc", posIncAttr.getPositionIncrement() + "");
        contentTable.put(lineId, "posLen", posLenAttr.getPositionLength() + "");
        pos += posIncAttr.getPositionIncrement();
        contentTable.put(lineId, "pos", pos + "");
        lineNo++;
    }
    printTable(out, contentTable);
}
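A hypothetical invocation, reusing the SynonymFilter chain from the first example. Note that this method uses getAttribute rather than addAttribute, so it throws unless the chain already declares all five attributes; a SynonymFilter chain does.

    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green"));
    try (TokenStream ts = new SynonymFilter(tok, synMap, true)) { // synMap built as in the first example
        printResultOfTokenStream(System.out, ts);
    }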
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
    // convert the string to code points
    final int[] codePoints = toCodePoints(s);
    final int[] offsets = new int[codePoints.length + 1];
    for (int i = 0; i < codePoints.length; ++i) {
        offsets[i + 1] = offsets[i] + Character.charCount(codePoints[i]);
    }
    final TokenStream grams = new NGramTokenizer(Version.LATEST, new StringReader(s), minGram, maxGram, edgesOnly) {
        @Override
        protected boolean isTokenChar(int chr) {
            return nonTokenChars.indexOf(chr) < 0;
        }
    };
    final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
    grams.reset();
    for (int start = 0; start < codePoints.length; ++start) {
        nextGram:
        for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
            if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
                // not on an edge
                continue nextGram;
            }
            for (int j = start; j < end; ++j) {
                if (!isTokenChar(nonTokenChars, codePoints[j])) {
                    continue nextGram;
                }
            }
            assertTrue(grams.incrementToken());
            assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
            assertEquals(1, posIncAtt.getPositionIncrement());
            assertEquals(1, posLenAtt.getPositionLength());
            assertEquals(offsets[start], offsetAtt.startOffset());
            assertEquals(offsets[end], offsetAtt.endOffset());
        }
    }
    assertFalse(grams.incrementToken());
    grams.end();
    assertEquals(s.length(), offsetAtt.startOffset());
    assertEquals(s.length(), offsetAtt.endOffset());
}
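For instance, calls exercising both plain and edge n-grams over a string containing a non-token character might look like:

    testNGrams(1, 3, "abc de", " ", false); // all 1..3-grams, skipping the space
    testNGrams(2, 2, "abc de", " ", true);  // bigrams anchored at token edges only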
/**
 * Now the graph is more interesting! For each token (arc), the
 * PositionIncrementAttribute tells us how many positions (nodes) ahead this
 * arc starts from, while the new (as of 3.6.0) PositionLengthAttribute tells
 * us how many positions (nodes) ahead the arc arrives at.
 */
private static String getGraph(String input) throws IOException {
    final Tokenizer inputStream = new WhitespaceTokenizer();
    inputStream.setReader(new StringReader(input));
    // final TokenStream inputStream = new LowerCaseFilter(in);
    TokenStream tokenStream = new SynonymGraphFilter(inputStream, builder.build(), false);
    PositionIncrementAttribute posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = tokenStream.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);

    tokenStream.reset();
    int srcNode = -1;
    int destNode;
    StringBuilder b = new StringBuilder();
    b.append("digraph Automaton {\n");
    b.append("  initial [shape=plaintext,label=\"\"]\n");
    b.append("  initial -> 0\n");
    while (tokenStream.incrementToken()) {
        int posInc = posIncAtt.getPositionIncrement();
        if (posInc != 0) {
            // a non-zero increment starts a new source node
            srcNode += posInc;
            b.append("  ");
            b.append(srcNode);
            b.append(" [shape=circle,label=\"" + srcNode + "\"]\n");
        }
        // the position length tells us which node this arc arrives at
        destNode = srcNode + posLenAtt.getPositionLength();
        b.append("  ");
        b.append(srcNode);
        b.append(" -> ");
        b.append(destNode);
        b.append(" [label=\"");
        b.append(termAtt);
        b.append("\"");
        b.append("]\n");
    }
    tokenStream.end();
    tokenStream.close();
    b.append('}');
    return b.toString();
}
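getGraph reads a builder field that isn't shown. A minimal sketch, assuming a single-word rule (multi-word rules would join words with SynonymMap.WORD_SEPARATOR, as in the earlier helper):

    // Hypothetical field the method above relies on: maps "fast" -> "quick",
    // keeping the original token so the graph forks into parallel arcs.
    private static final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    static {
        builder.add(new CharsRef("fast"), new CharsRef("quick"), true);
    }

With that rule, getGraph("fast car") should yield a DOT digraph in which the fast and quick arcs share the same source and destination nodes.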