public void testMaxPosition3WithSynomyms() throws IOException { for (final boolean consumeAll : new boolean[]{true, false}) { MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false); // if we are consuming all tokens, we can use the checks, otherwise we can't tokenizer.setEnableChecks(consumeAll); SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("one"), new CharsRef("first"), true); builder.add(new CharsRef("one"), new CharsRef("alpha"), true); builder.add(new CharsRef("one"), new CharsRef("beguine"), true); CharsRefBuilder multiWordCharsRef = new CharsRefBuilder(); SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef); builder.add(new CharsRef("one"), multiWordCharsRef.get(), true); SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef); builder.add(new CharsRef("two"), multiWordCharsRef.get(), true); SynonymMap synonymMap = builder.build(); TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true); stream = new LimitTokenPositionFilter(stream, 3, consumeAll); // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3. assertTokenStreamContents(stream, new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"}, new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0}); } }
/**
 * Debugging entry point: runs the fixed text "dark sea green sea green" through a
 * {@link SynonymFilter} built from a handful of overlapping "color" rules and prints each
 * emitted term with its absolute position and position length.
 *
 * @param args ignored
 * @throws Exception if building the synonym map or consuming the token stream fails
 */
@SuppressWarnings("resource") // the tokenizer is released through the filter that wraps it
public static void main(String[] args) throws Exception {
  final Tokenizer tok = new WhitespaceTokenizer();
  tok.setReader(new StringReader("dark sea green sea green"));

  final SynonymMap.Builder builder = new SynonymMap.Builder(true);
  addSynonym("dark sea green", "color", builder);
  addSynonym("green", "color", builder);
  addSynonym("dark sea", "color", builder);
  addSynonym("sea green", "color", builder);
  final SynonymMap synMap = builder.build();

  // try-with-resources guarantees close() runs even when reset()/incrementToken()/end() throws;
  // previously close() was only reached on the success path.
  try (TokenStream ts = new SynonymFilter(tok, synMap, true)) {
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    // Starts at -1 so that adding the first token's position increment yields its absolute position.
    int pos = -1;
    while (ts.incrementToken()) {
      pos += posIncrAtt.getPositionIncrement();
      System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
  }
}
/**
 * Pushes random text through analyzers backed by randomly populated synonym maps. Each
 * iteration builds a fresh map and a fresh analyzer.
 */
public void testRandomStrings() throws Exception {
  final int iterations = atLeast(10);
  for (int iter = 0; iter < iterations; iter++) {
    // Builder flag is chosen at random for every iteration.
    SynonymMap.Builder mapBuilder = new SynonymMap.Builder(random().nextBoolean());

    final int entryCount = atLeast(10);
    for (int remaining = entryCount; remaining > 0; remaining--) {
      // Drawn in the same left-to-right order as the arguments they feed.
      String input = randomNonEmptyString();
      String output = randomNonEmptyString();
      boolean flag = random().nextBoolean();
      add(mapBuilder, input, output, flag);
    }

    final SynonymMap map = mapBuilder.build();
    final boolean ignoreCase = random().nextBoolean();

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
        TokenStream withSynonyms = new SynonymFilter(source, map, ignoreCase);
        TokenStream sink = new RemoveDuplicatesTokenFilter(withSynonyms);
        return new TokenStreamComponents(source, sink);
      }
    };

    checkRandomData(random(), analyzer, 200);
  }
}
public void testMaxPosition3WithSynomyms() throws IOException { MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false); tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("one"), new CharsRef("first"), true); builder.add(new CharsRef("one"), new CharsRef("alpha"), true); builder.add(new CharsRef("one"), new CharsRef("beguine"), true); CharsRef multiWordCharsRef = new CharsRef(); SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef); builder.add(new CharsRef("one"), multiWordCharsRef, true); SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef); builder.add(new CharsRef("two"), multiWordCharsRef, true); SynonymMap synonymMap = builder.build(); TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true); stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3. assertTokenStreamContents(stream, new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" }, new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 }); }
@Before public void createAnalyzers() throws Exception { queryAnalyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { // White space tokenizer, to lower case tokenizer. return new TokenStreamComponents(new MockTokenizer()); } }; SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("test"), new CharsRef("synonym1"), false); builder.add(new CharsRef("test"), new CharsRef("synonym2"), false); final SynonymMap synonyms = builder.build(); synonymAnalyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { // White space tokenizer, to lower case tokenizer. MockTokenizer tokenizer = new MockTokenizer(); // Filter for adding synonyms TokenStream result = new SynonymFilter(tokenizer, synonyms, true); // Filter all non-synonyms, because the synonym filter outputs the // original token too. result = new TypeTokenFilter(result, Collections.singleton(SynonymFilter.TYPE_SYNONYM), true); return new TokenStreamComponents(tokenizer, result); } }; }
/**
 * Checks that the factory can load the Solr-format synonyms file from the classpath and that
 * the filter it creates expands "GB" as expected.
 */
public void testSynonyms() throws Exception {
  Map<String,String> factoryArgs = new HashMap<String,String>();
  factoryArgs.put("synonyms", "synonyms.txt");

  SynonymFilterFactory factory = new SynonymFilterFactory();
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(factoryArgs);
  factory.inform(new ClasspathResourceLoader(getClass()));

  MockTokenizer source = new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false);
  TokenStream ts = factory.create(source);
  assertTrue(ts instanceof SynonymFilter);

  // The original token comes first; its synonyms follow with a position increment of 0.
  String[] expectedTerms = {"GB", "gib", "gigabyte", "gigabytes"};
  int[] expectedPosIncrements = {1, 0, 0, 0};
  assertTokenStreamContents(ts, expectedTerms, expectedPosIncrements);
}
@Override public TokenStream create(TokenStream tokenStream) { // fst is null means no synonyms return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase); }
@Override public TokenStream create(TokenStream input) { // if the fst is null, it means there's actually no synonyms... just return the original stream // as there is nothing to do here. return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase); }
/**
 * Wraps the given stream in a {@link SynonymFilter}, always passing {@code true} as the
 * filter's third constructor argument, or returns the stream untouched when the synonym map's
 * {@code fst} is null.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  if (synonymMap.fst == null) {
    return tokenStream;
  }
  return new SynonymFilter(tokenStream, synonymMap, true);
}