public void testMaxPosition3WithSynomyms() throws IOException { for (final boolean consumeAll : new boolean[]{true, false}) { MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false); // if we are consuming all tokens, we can use the checks, otherwise we can't tokenizer.setEnableChecks(consumeAll); SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("one"), new CharsRef("first"), true); builder.add(new CharsRef("one"), new CharsRef("alpha"), true); builder.add(new CharsRef("one"), new CharsRef("beguine"), true); CharsRefBuilder multiWordCharsRef = new CharsRefBuilder(); SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef); builder.add(new CharsRef("one"), multiWordCharsRef.get(), true); SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef); builder.add(new CharsRef("two"), multiWordCharsRef.get(), true); SynonymMap synonymMap = builder.build(); TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true); stream = new LimitTokenPositionFilter(stream, 3, consumeAll); // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3. assertTokenStreamContents(stream, new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"}, new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0}); } }
/**
 * Demo entry point: runs "dark sea green sea green" through a SynonymFilter that maps
 * several color phrases to "color" and prints each token with its position and length.
 */
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();

    final TokenStream stream = new SynonymFilter(tokenizer, synMap, true);
    final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt =
            stream.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt =
            stream.addAttribute(PositionLengthAttribute.class);

    stream.reset();
    int position = -1;
    while (stream.incrementToken()) {
        position += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + position
                + ", posLen=" + posLengthAtt.getPositionLength());
    }
    stream.end();
    stream.close();
}
/**
 * Hot-swaps the synonym data used by this filter with a freshly loaded map.
 *
 * <p>Note (translated from the original Chinese comment): all fields assigned here are
 * treated as "final-like" — they must only ever be reassigned inside this method,
 * otherwise the filter's internal state can become inconsistent and cause bugs.
 *
 * @param synonymMap the new synonym map; its FST must be non-null
 * @throws IllegalArgumentException if the map carries no FST
 */
@Override
public void update(SynonymMap synonymMap) {
    this.synonyms = synonymMap;
    this.fst = synonyms.fst;
    if (this.fst == null) {
        throw new IllegalArgumentException("fst must be non-null");
    }
    this.fstReader = this.fst.getBytesReader();
    // Roll buffers must hold the longest horizontal (multi-token) match plus one slot.
    this.rollBufferSize = 1 + synonyms.maxHorizontalContext;
    this.futureInputs = new DynamicSynonymFilter.PendingInput[this.rollBufferSize];
    this.futureOutputs = new DynamicSynonymFilter.PendingOutputs[this.rollBufferSize];
    for (int pos = 0; pos < this.rollBufferSize; ++pos) {
        this.futureInputs[pos] = new DynamicSynonymFilter.PendingInput();
        this.futureOutputs[pos] = new DynamicSynonymFilter.PendingOutputs();
    }
    // Diamond instead of the raw FST.Arc: avoids an unchecked-assignment warning and
    // matches the parameterized usage elsewhere in this codebase.
    this.scratchArc = new FST.Arc<>();
}
/**
 * Monitor task: checks whether the remote synonym file changed and, if so, reloads it
 * and pushes the new map into every registered dynamic synonym filter for this index.
 */
@Override
public void run() {
    try {
        if (!synonymFile.isNeedReloadSynonymMap()) {
            return;
        }
        SynonymMap newSynonymMap = synonymFile.reloadSynonymMap();
        if (newSynonymMap == null || newSynonymMap.fst == null) {
            // Message fixed: it previously said "non-null", inverting the meaning.
            logger.error("Monitor thread reload remote synonym returned null! indexName:{} path:{}",
                    indexName, synonymFile.getLocation());
            return;
        }
        synonymMap = newSynonymMap;
        for (SynonymDynamicSupport filter : dynamicSynonymFilters.get(indexName)) {
            filter.update(synonymMap);
            logger.info("success reload synonym! indexName:{} path:{}",
                    indexName, synonymFile.getLocation());
        }
    } catch (Exception e) {
        // Pass the exception as the last argument so the stack trace is logged
        // instead of being silently dropped.
        logger.error("Monitor thread reload remote synonym error! indexName:{} path:{}",
                indexName, synonymFile.getLocation(), e);
    }
}
/** Blast some random strings through a randomly built SynonymFilter analyzer. */
public void testRandomStrings() throws Exception {
    final int numIters = atLeast(10);
    for (int iter = 0; iter < numIters; iter++) {
        SynonymMap.Builder mapBuilder = new SynonymMap.Builder(random().nextBoolean());
        final int numEntries = atLeast(10);
        for (int entry = 0; entry < numEntries; entry++) {
            add(mapBuilder, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
        }
        final SynonymMap map = mapBuilder.build();
        final boolean ignoreCase = random().nextBoolean();

        final Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
                TokenStream stream = new SynonymFilter(tokenizer, map, ignoreCase);
                return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
            }
        };

        checkRandomData(random(), analyzer, 200);
    }
}
/**
 * Resets tokenizer state for reuse and, when the synonym loader reports an update,
 * swaps in the freshly loaded synonym map and its FST.
 */
@Override
public void reset() throws IOException {
    super.reset();
    // Clear all per-stream buffering state.
    block.setLength(0);
    prevToken = null;
    readBufferIndex = BUFFER_SIZE;
    readBufferLen = 0;
    ch = 0;
    blkStart = 0;
    nextBlkStart = 0;

    if (synonymLoader == null || !synonymLoader.isUpdate(lastModified)) {
        return;
    }
    lastModified = synonymLoader.getLastModified();
    final SynonymMap newMap = synonymLoader.getSynonymMap();
    if (newMap == null) {
        return;
    }
    synonymMap = newMap;
    fst = synonymMap.fst;
    if (fst == null) {
        throw new IllegalArgumentException("fst must be non-null");
    }
    fstReader = fst.getBytesReader();
    scratchArc = new FST.Arc<>();
    clearAttributes();
}
protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new NGramSynonymTokenizer(n, delimiters, expand, true, new SynonymLoader(null, null, expand, null) { @Override public SynonymMap getSynonymMap() { return synonyms; } @Override protected void createSynonymMap(boolean reload) { // nothing } }); return new TokenStreamComponents(source); }
/**
 * Load synonyms from the solr format, "format=solr".
 *
 * @throws IllegalArgumentException if the required "synonyms" argument is missing
 */
private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer)
        throws IOException, ParseException {
    final boolean expand = getBoolean("expand", true);
    String synonyms = args.get("synonyms");
    if (synonyms == null) {
        throw new IllegalArgumentException("Missing required argument 'synonyms'.");
    }

    // Strict UTF-8: malformed or unmappable bytes fail the parse instead of being replaced.
    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
    if (new File(synonyms).exists()) {
        // Single file on disk.
        decoder.reset();
        parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
        // Otherwise interpret the argument as a comma-separated list of resources.
        for (String fileName : splitFileNames(synonyms)) {
            decoder.reset();
            parser.add(new InputStreamReader(loader.openResource(fileName), decoder));
        }
    }
    return parser.build();
}
/**
 * Load synonyms from the wordnet format, "format=wordnet".
 *
 * @throws IllegalArgumentException if the required "synonyms" argument is missing
 */
private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer)
        throws IOException, ParseException {
    final boolean expand = getBoolean("expand", true);
    String synonyms = args.get("synonyms");
    if (synonyms == null) {
        throw new IllegalArgumentException("Missing required argument 'synonyms'.");
    }

    // Strict UTF-8 decoding: report rather than replace bad input.
    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
    if (new File(synonyms).exists()) {
        // Single file on disk.
        decoder.reset();
        parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
        // Comma-separated list of resource names.
        for (String fileName : splitFileNames(synonyms)) {
            decoder.reset();
            parser.add(new InputStreamReader(loader.openResource(fileName), decoder));
        }
    }
    return parser.build();
}
public void testMaxPosition3WithSynomyms() throws IOException { MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false); tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("one"), new CharsRef("first"), true); builder.add(new CharsRef("one"), new CharsRef("alpha"), true); builder.add(new CharsRef("one"), new CharsRef("beguine"), true); CharsRef multiWordCharsRef = new CharsRef(); SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef); builder.add(new CharsRef("one"), multiWordCharsRef, true); SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef); builder.add(new CharsRef("two"), multiWordCharsRef, true); SynonymMap synonymMap = builder.build(); TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true); stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3. assertTokenStreamContents(stream, new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" }, new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 }); }
/**
 * Load synonyms with the given {@link SynonymMap.Parser} class.
 *
 * @param cname fully qualified name of the parser class to instantiate via reflection
 */
private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup,
        Analyzer analyzer) throws IOException, ParseException {
    // Strict UTF-8: malformed bytes abort instead of being silently replaced.
    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    Class<? extends SynonymMap.Parser> parserClass =
            loader.findClass(cname, SynonymMap.Parser.class);
    SynonymMap.Parser parser;
    try {
        parser = parserClass
                .getConstructor(boolean.class, boolean.class, Analyzer.class)
                .newInstance(dedup, expand, analyzer);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    if (new File(synonyms).exists()) {
        decoder.reset();
        parser.parse(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
        for (String fileName : splitFileNames(synonyms)) {
            decoder.reset();
            parser.parse(new InputStreamReader(loader.openResource(fileName), decoder));
        }
    }
    return parser.build();
}
/** * 增加update逻辑,此方法中所有赋值的属性皆为final改造,注意只能在此方法中使用,否则可能导致bug * * @param synonymMap */ @Override public void update(SynonymMap synonymMap) { this.synonyms = synonymMap; this.fst = synonyms.fst; if(this.fst == null) { throw new IllegalArgumentException("fst must be non-null"); } else { this.fstReader = this.fst.getBytesReader(); this.scratchArc = new FST.Arc(); //this.ignoreCase = ignoreCase; } }
/**
 * Feeds every term returned by a distributed terms request into the field and term
 * synonym builders, one search field at a time.
 *
 * @param terms        raw terms response payload, keyed by field name
 * @param fieldBuilder receives field-name synonym mappings
 * @param termBuilder  receives term-value synonym mappings
 * @param searchFields fields whose terms should be indexed
 */
private void addTerms(NamedList<NamedList<Number>> terms,
        SynonymMap.Builder fieldBuilder,
        SynonymMap.Builder termBuilder,
        ArrayList<String> searchFields) throws IOException {
    TermsResponse termsResponse = new TermsResponse(terms);
    for (String fieldName : searchFields) {
        List<TermsResponse.Term> fieldTerms = termsResponse.getTerms(fieldName);
        if (fieldTerms == null) {
            continue;
        }
        CharsRef fieldChars = new CharsRef(fieldName);
        for (TermsResponse.Term termInfo : fieldTerms) {
            String term = termInfo.getTerm();
            Log.debug("Add distributed term: " + fieldName + " = " + term);
            addTerm(fieldChars, term, fieldBuilder, termBuilder);
        }
    }
}
private void buildFieldMap( ResponseBuilder rb ) throws IOException { Log.debug( "buildFieldMap" ); SolrIndexSearcher searcher = rb.req.getSearcher(); // build a synonym map from the SortedDocValues - // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true ); SynonymMap.Builder termBuilder = new SynonymMap.Builder( true ); ArrayList<String> searchFields = getStringFields( searcher ); for (String searchField : searchFields ) { Log.debug( "adding searchField " + searchField ); CharsRef fieldChars = new CharsRef( searchField ); SortedSetDocValues sdv = FieldCache.DEFAULT.getDocTermOrds( searcher.getAtomicReader( ), searchField ); if (sdv == null) continue; Log.debug( "got SortedSetDocValues for " + searchField ); TermsEnum te = sdv.termsEnum(); while (te.next() != null) { BytesRef term = te.term(); String fieldValue = term.utf8ToString( ); addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder ); } } addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields ); fieldMap = fieldBuilder.build( ); termMap = termBuilder.build( ); }
@Override public Object create(Random random) { SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean()); final int numEntries = atLeast(10); for (int j = 0; j < numEntries; j++) { addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean()); } try { return b.build(); } catch (Exception ex) { Rethrow.rethrow(ex); return null; // unreachable code } }
/**
 * Load synonyms with the given SynonymMap.Parser class.
 *
 * @param cname fully qualified name of the parser class to instantiate via reflection
 */
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup,
        Analyzer analyzer) throws IOException, ParseException {
    // Strict UTF-8 decoding: report rather than replace bad input.
    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    Class<? extends SynonymMap.Parser> parserClass =
            loader.findClass(cname, SynonymMap.Parser.class);
    SynonymMap.Parser parser;
    try {
        parser = parserClass
                .getConstructor(boolean.class, boolean.class, Analyzer.class)
                .newInstance(dedup, expand, analyzer);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    if (new File(synonyms).exists()) {
        decoder.reset();
        parser.parse(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
        for (String fileName : splitFileNames(synonyms)) {
            decoder.reset();
            parser.parse(new InputStreamReader(loader.openResource(fileName), decoder));
        }
    }
    return parser.build();
}
/**
 * Called once, during core initialization, to initialize any analysis components
 * that depend on the data managed by this resource. It is important that the
 * analysis component is only initialized once during core initialization so that
 * text analysis is consistent, especially in a distributed environment, as we
 * don't want one server applying a different set of stop words than other servers.
 */
@SuppressWarnings("unchecked")
@Override
public void onManagedResourceInitialized(NamedList<?> initArgs, final ManagedResource res)
    throws SolrException
{
    NamedList<Object> args = (NamedList<Object>)initArgs;
    // Fixed arguments for the delegate factory: the managed resource id stands in for the
    // usual "synonyms" file path; expansion is off and the solr format parser is assumed.
    args.add("synonyms", getResourceId());
    args.add("expand", "false");
    args.add("format", "solr");

    // FSTSynonymFilterFactory takes a Map<String,String>, so flatten the NamedList.
    Map<String,String> filtArgs = new HashMap<>();
    for (Map.Entry<String,?> entry : args) {
      filtArgs.put(entry.getKey(), entry.getValue().toString());
    }
    // create the actual filter factory that pulls the synonym mappings
    // from synonymMappings using a custom parser implementation
    delegate = new FSTSynonymFilterFactory(filtArgs) {
      @Override
      protected SynonymMap loadSynonyms
          (ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer)
          throws IOException, ParseException
      {
        // Instead of reading a file, the managed parser pulls mappings straight from
        // the SynonymManager resource captured above.
        ManagedSynonymParser parser =
            new ManagedSynonymParser((SynonymManager)res, dedup, analyzer);
        // null is safe here because there's no actual parsing done against a input Reader
        parser.parse(null);
        return parser.build();
      }
    };
    try {
      delegate.inform(res.getResourceLoader());
    } catch (IOException e) {
      throw new SolrException(ErrorCode.SERVER_ERROR, e);
    }
}
@Override public void reset() throws IOException { super.reset(); captureCount = 0; finished = false; inputSkipCount = 0; nextRead = nextWrite = 0; // In normal usage these resets would not be needed, // since they reset-as-they-are-consumed, but the app // may not consume all input tokens (or we might hit an // exception), in which case we have leftover state // here: for (final PendingInput input : futureInputs) { input.reset(); } for (final PendingOutputs output : futureOutputs) { output.reset(); } if (synonymLoader != null && synonymLoader.isUpdate(lastModified)) { lastModified = synonymLoader.getLastModified(); final SynonymMap map = synonymLoader.getSynonymMap(); if (map != null) { synonyms = map; fst = synonyms.fst; if (fst == null) { throw new IllegalArgumentException("fst must be non-null"); } fstReader = fst.getBytesReader(); scratchArc = new FST.Arc<>(); clearAttributes(); } } }
@Before public void createAnalyzers() throws Exception { queryAnalyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { // White space tokenizer, to lower case tokenizer. return new TokenStreamComponents(new MockTokenizer()); } }; SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("test"), new CharsRef("synonym1"), false); builder.add(new CharsRef("test"), new CharsRef("synonym2"), false); final SynonymMap synonyms = builder.build(); synonymAnalyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { // White space tokenizer, to lower case tokenizer. MockTokenizer tokenizer = new MockTokenizer(); // Filter for adding synonyms TokenStream result = new SynonymFilter(tokenizer, synonyms, true); // Filter all non-synonyms, because the synonym filter outputs the // original token too. result = new TypeTokenFilter(result, Collections.singleton(SynonymFilter.TYPE_SYNONYM), true); return new TokenStreamComponents(tokenizer, result); } }; }
/**
 * Utility entry point: builds a synonym map mapping several color phrases to "color"
 * and dumps its FST as a Graphviz dot file to d:/tmp/syns.dot.
 */
public static void main(String[] args) throws Exception {
    final CharsRef colorOutput = new CharsRef("color");
    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    for (String phrase : new String[] {"blue", "green", "pale green", "pale blue", "dark sea green"}) {
        builder.add(SynonymMap.Builder.join(phrase.split(" "), new CharsRefBuilder()),
                colorOutput, true);
    }
    final SynonymMap synMap = builder.build();

    try (PrintWriter writer = new PrintWriter("d:/tmp/syns.dot")) {
        Util.toDot(synMap.fst, writer, true, true);
    }
    System.out.println("Done!");
}
/**
 * Load synonyms with the given parser class, reading from a single file when it exists
 * on disk, or otherwise from a comma-separated list of loader resources.
 *
 * @param cname fully qualified name of the {@code SynonymMap.Parser} implementation
 */
private SynonymMap loadSynonyms(final ResourceLoader loader, final String cname,
        final boolean dedup, final Analyzer analyzer, final boolean expand,
        final String synonyms) throws IOException, ParseException {
    // Strict UTF-8 decoding: report rather than replace bad input.
    final CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    final Class<? extends SynonymMap.Parser> parserClass =
            loader.findClass(cname, SynonymMap.Parser.class);
    SynonymMap.Parser parser;
    try {
        parser = parserClass
                .getConstructor(boolean.class, boolean.class, Analyzer.class)
                .newInstance(dedup, expand, analyzer);
    } catch (final Exception e) {
        throw new RuntimeException(e);
    }

    if (new File(synonyms).exists()) {
        decoder.reset();
        parser.parse(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
        for (final String fileName : splitFileNames(synonyms)) {
            decoder.reset();
            parser.parse(new InputStreamReader(loader.openResource(fileName), decoder));
        }
    }
    return parser.build();
}
/**
 * Creates an n-gram tokenizer aware of synonyms.
 *
 * @param input      the character source
 * @param n          n-gram size
 * @param delimiters characters treated as token delimiters
 * @param expand     whether to expand synonyms
 * @param ignoreCase whether matching is case-insensitive
 * @param map        synonym map; may be null (no synonym matching), but if present
 *                   it must carry a non-null FST
 * @throws IllegalArgumentException if {@code map} is non-null but has no FST
 */
protected NGramSynonymTokenizer(final Reader input, final int n, final String delimiters,
        final boolean expand, final boolean ignoreCase, final SynonymMap map) {
    super(input);
    this.n = n;
    this.delimiters = delimiters;
    this.expand = expand;
    this.ignoreCase = ignoreCase;
    this.map = map;

    if (map != null) {
        fst = map.fst;
        if (fst == null) {
            throw new IllegalArgumentException("fst must be non-null");
        }
        fstReader = fst.getBytesReader();
        scratchArc = new FST.Arc<BytesRef>();
    }

    // Read-buffer starts "empty": index at capacity forces the first refill.
    ch = 0;
    readBuffer = new char[BUFFER_SIZE];
    readBufferIndex = BUFFER_SIZE;
    readBufferLen = 0;
    block = new StringBuilder();
    nextBlkStart = 0;
    queue = new PriorityQueue<NGramSynonymTokenizer.MyToken>(100, new MyTokensComparator());
    synonyms = new ArrayList<NGramSynonymTokenizer.MyToken>();
}
/**
 * Creates a test analyzer wrapping NGramSynonymTokenizer with a fixed synonym map.
 *
 * @param n          n-gram size
 * @param delimiters characters treated as token delimiters
 * @param expand     whether to expand synonyms
 * @param synonyms   the synonym map to use
 */
public NGramSynonymTokenizerTestAnalyzer(final int n, final String delimiters,
        final boolean expand, final SynonymMap synonyms) {
    this.n = n;
    this.delimiters = delimiters;
    this.expand = expand;
    this.synonyms = synonyms;
}
/**
 * Adds a mapping to the shared builder, splitting multi-word input/output phrases on
 * runs of whitespace, and logs the mapping for debugging.
 *
 * @param keepOrig whether the original tokens are kept alongside the synonym
 */
private static void add(String input, String output, boolean keepOrig) {
    System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
    CharsRefBuilder inputRef = new CharsRefBuilder();
    SynonymMap.Builder.join(input.split(" +"), inputRef);
    CharsRefBuilder outputRef = new CharsRefBuilder();
    SynonymMap.Builder.join(output.split(" +"), outputRef);
    builder.add(inputRef.get(), outputRef.get(), keepOrig);
}
@Test public void testSynonyms() throws Exception { String entrada = "ALCALDE KOOPER"; String salida = "FEDERICO KOOPER"; SynonymMap.Builder builder = new SynonymMap.Builder(true); CharsRef input = SynonymMap.Builder.join(entrada.split(" "), new CharsRefBuilder()); CharsRef output = SynonymMap.Builder.join(salida.split(" "), new CharsRefBuilder()); builder.add(input, output, true); SuggestAnalizer suggestAnalizer = new SuggestAnalizer(builder.build()); Analyzer.TokenStreamComponents components = suggestAnalizer.createComponents(entrada); final TokenStream tokenStream = components.getTokenStream(); CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { // if System.out.println(termAtt.toString()); } tokenStream.end(); tokenStream.close(); // assertTokenStreamContents(tokenStream, new String[]{ // "FEDERICO" // }); // // assertAnalyzesTo(suggestAnalizer, entrada, new String[]{ // "FEDERICO" // }); }
/**
 * Creates a graph synonym filter whose map can be hot-swapped later via update().
 *
 * @param input      the upstream token stream
 * @param synonyms   the initial synonym map
 * @param ignoreCase whether matching is case-insensitive
 */
public DynamicSynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
    super(input);
    this.ignoreCase = ignoreCase;
    // Delegate all map-derived state initialization to update().
    update(synonyms);
}
/**
 * Creates a synonym filter whose map can be hot-swapped later via update().
 *
 * @param input      the upstream token stream
 * @param synonyms   the initial synonym map
 * @param ignoreCase whether matching is case-insensitive
 */
public DynamicSynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
    super(input);
    this.ignoreCase = ignoreCase;
    // Delegate all map-derived state initialization to update().
    update(synonyms);
}