@Override
public void inform(ResourceLoader loader) throws IOException {
  InputStream stream = null;
  try {
    if (dictFile != null) { // the dictionary can be empty.
      dictionary = getWordSet(loader, dictFile, false);
    }
    // TODO: Broken, because we cannot resolve the real system id.
    // ResourceLoader should also supply a method, like ClassLoader's, to get a resource URL.
    stream = loader.openResource(hypFile);
    final InputSource is = new InputSource(stream);
    is.setEncoding(encoding); // if it's null, let the XML parser decide
    is.setSystemId(hypFile);
    if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)) {
      hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
    } else {
      hyphenator = Lucene43HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
    }
  } finally {
    IOUtils.closeWhileHandlingException(stream);
  }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (mapping != null) {
    List<String> wlist = null;
    File mappingFile = new File(mapping);
    if (mappingFile.exists()) {
      wlist = getLines(loader, mapping);
    } else {
      List<String> files = splitFileNames(mapping);
      wlist = new ArrayList<>();
      for (String file : files) {
        List<String> lines = getLines(loader, file.trim());
        wlist.addAll(lines);
      }
    }
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    parseRules(wlist, builder);
    normMap = builder.build();
    if (normMap.map == null) {
      // if the inner FST is null, it means it accepts nothing (e.g. the file is empty),
      // so just set the whole map to null
      normMap = null;
    }
  }
}
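// A minimal standalone sketch (not taken from the factory above) of what parseRules
// ultimately produces: a NormalizeCharMap built from match -> replacement pairs,
// consumable by a MappingCharFilter. The sample rule "æ" -> "ae" and the input text
// are assumptions for illustration only.
static Reader demoNormalizeCharMap() {
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("æ", "ae"); // equivalent to a rule line: "æ" => "ae"
  NormalizeCharMap exampleMap = builder.build();
  // reading from the returned Reader yields "laesehest"
  return new MappingCharFilter(exampleMap, new StringReader("læsehest"));
}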
@Override
public void inform(ResourceLoader loader) throws IOException {
  String[] dicts = dictionaryFiles.split(",");
  InputStream affix = null;
  List<InputStream> dictionaries = new ArrayList<>();
  try {
    for (String file : dicts) {
      dictionaries.add(loader.openResource(file));
    }
    affix = loader.openResource(affixFile);
    this.dictionary = new Dictionary(affix, dictionaries, ignoreCase);
  } catch (ParseException e) {
    // report the configured file names, not the stream objects
    throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryFiles
        + ",affix=" + affixFile + "]", e);
  } finally {
    IOUtils.closeWhileHandlingException(affix);
    IOUtils.closeWhileHandlingException(dictionaries);
  }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (dictionaryFiles != null) {
    assureMatchVersion();
    List<String> files = splitFileNames(dictionaryFiles);
    if (files.size() > 0) {
      StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
      for (String file : files) {
        List<String> list = getLines(loader, file.trim());
        for (String line : list) {
          // each line maps an input token to its stem, separated by a single tab
          String[] mapping = line.split("\t", 2);
          builder.add(mapping[0], mapping[1]);
        }
      }
      dictionary = builder.build();
    }
  }
}
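// Hypothetical illustration of the tab-separated override format the loop above
// parses; the sample token/stem pairs are assumptions, not shipped dictionary data.
static StemmerOverrideFilter.StemmerOverrideMap buildSampleOverrides() throws IOException {
  List<String> sampleLines = Arrays.asList("dogs\tdog", "abnormalities\tabnormality");
  StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true); // ignoreCase
  for (String line : sampleLines) {
    String[] mapping = line.split("\t", 2); // input TAB replacement
    builder.add(mapping[0], mapping[1]);
  }
  return builder.build();
}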
public static Dictionary getDict(String dicPath, ResourceLoader loader) {
  Dictionary dic = null;
  if (dicPath != null) {
    File f = new File(dicPath);
    if (!f.isAbsolute() && loader instanceof SolrResourceLoader) { // relative path
      SolrResourceLoader srl = (SolrResourceLoader) loader;
      dicPath = srl.getInstanceDir() + dicPath;
      f = new File(dicPath);
    }
    dic = Dictionary.getInstance(f);
  } else {
    dic = Dictionary.getInstance();
  }
  return dic;
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer token = new IKTokenizer(reader, useSmart);
  Map<String, String> paramsMap = new HashMap<String, String>();
  Configuration cfg = DefaultConfig.getInstance();
  paramsMap.put("luceneMatchVersion", luceneMatchVersion.toString());
  paramsMap.put("synonyms", cfg.getExtSynonymDictionarys().get(0));
  paramsMap.put("ignoreCase", "true");
  SynonymFilterFactory factory = new SynonymFilterFactory(paramsMap);
  ResourceLoader loader = new ClasspathResourceLoader();
  try {
    factory.inform(loader);
  } catch (IOException e) {
    // don't swallow the failure: an uninformed factory would produce a broken filter
    throw new RuntimeException("Unable to load synonym resources", e);
  }
  return new TokenStreamComponents(token, factory.create(token));
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (userDictionaryPath != null) {
    InputStream stream = loader.openResource(userDictionaryPath);
    String encoding = userDictionaryEncoding;
    if (encoding == null) {
      encoding = IOUtils.UTF_8;
    }
    // fail fast on malformed input rather than silently substituting characters
    CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    Reader reader = new InputStreamReader(stream, decoder);
    userDictionary = new UserDictionary(reader);
  } else {
    userDictionary = null;
  }
}
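// Standalone sketch (an assumption, not part of the factory above) showing why
// CodingErrorAction.REPORT matters: a strict decoder fails fast on malformed bytes
// instead of silently substituting U+FFFD. Uses java.nio.charset.{StandardCharsets,
// CharsetDecoder, CodingErrorAction, MalformedInputException}.
static void demoStrictDecoding() throws IOException {
  byte[] truncated = { (byte) 0xC3 }; // incomplete UTF-8 multi-byte sequence
  CharsetDecoder strict = StandardCharsets.UTF_8.newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);
  try (Reader r = new InputStreamReader(new ByteArrayInputStream(truncated), strict)) {
    r.read(); // throws MalformedInputException under REPORT
  } catch (MalformedInputException expected) {
    // a lenient decoder (CodingErrorAction.REPLACE) would have returned '\uFFFD' here
  }
}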
@Override
public void inform(ResourceLoader loader) throws IOException {
  assert tailored != null : "init must be called first!";
  if (tailored.isEmpty()) {
    config = new DefaultICUTokenizerConfig(cjkAsWords);
  } else {
    final BreakIterator[] breakers = new BreakIterator[UScript.CODE_LIMIT];
    for (Map.Entry<Integer, String> entry : tailored.entrySet()) {
      int code = entry.getKey();
      String resourcePath = entry.getValue();
      breakers[code] = parseRules(resourcePath, loader);
    }
    config = new DefaultICUTokenizerConfig(cjkAsWords) {
      @Override
      public BreakIterator getBreakIterator(int script) {
        if (breakers[script] != null) {
          // BreakIterators are stateful, so hand out a clone per call
          return (BreakIterator) breakers[script].clone();
        } else {
          return super.getBreakIterator(script);
        }
      }
      // TODO: we could also allow codes->types mapping
    };
  }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  clazz = registry.get(name.toUpperCase(Locale.ROOT));
  if (clazz == null) {
    clazz = resolveEncoder(name, loader);
  }
  if (maxCodeLength != null) {
    try {
      setMaxCodeLenMethod = clazz.getMethod("setMaxCodeLen", int.class);
    } catch (Exception e) {
      throw new IllegalArgumentException("Encoder " + name + " / " + clazz
          + " does not support " + MAX_CODE_LENGTH, e);
    }
  }
  getEncoder(); // trigger initialization so potential problems are thrown now
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (stopWordFiles != null) {
    if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
      stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
    } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
      stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
    } else {
      throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
    }
  } else {
    if (null != format) {
      throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
    }
    if (luceneMatchVersion == null) {
      stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    } else {
      stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    }
  }
}
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  assertTrue("loader is null and it shouldn't be", loader != null);
  KeepWordFilterFactory factory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord",
      "words", "keep-1.txt", "ignoreCase", "true");
  CharArraySet words = factory.getWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);

  factory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord",
      "words", "keep-1.txt, keep-2.txt", "ignoreCase", "true");
  words = factory.getWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
}
@Override
public RewriterFactory createRewriterFactory(NamedList<?> args, ResourceLoader resourceLoader)
    throws IOException {
  String rulesResourceName = (String) args.get("rules");
  if (rulesResourceName == null) {
    throw new IllegalArgumentException("Property 'rules' not configured");
  }
  Boolean ignoreCase = args.getBooleanArg("ignoreCase");
  // querqy parser for queries that are part of the instructions in the rules
  String rulesQuerqyParser = (String) args.get("querqyParser");
  QuerqyParserFactory querqyParser = null;
  if (rulesQuerqyParser != null) {
    rulesQuerqyParser = rulesQuerqyParser.trim();
    if (rulesQuerqyParser.length() > 0) {
      querqyParser = resourceLoader.newInstance(rulesQuerqyParser, QuerqyParserFactory.class);
    }
  }
  return new querqy.rewrite.commonrules.SimpleCommonRulesRewriterFactory(
      new InputStreamReader(resourceLoader.openResource(rulesResourceName), "UTF-8"),
      querqyParser, ignoreCase != null && ignoreCase);
}
private CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase)
    throws IOException {
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that big to start
    words = new CharArraySet(files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
    }
  }
  return words;
}
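// Hypothetical call site for the helper above; the file names are assumptions.
// splitFileNames accepts a comma-separated list, and each name is trimmed before
// being resolved through the loader.
CharArraySet stopWords = getWordSet(loader, "stop-1.txt, stop-2.txt", true);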
public static Dictionary getDict(String dicPath, ResourceLoader loader) {
  Dictionary dic = null;
  if (dicPath != null) {
    File f = new File(dicPath);
    if (!f.isAbsolute() && loader instanceof SolrResourceLoader) { // relative path
      SolrResourceLoader srl = (SolrResourceLoader) loader;
      dicPath = srl.getInstancePath().resolve(dicPath).toString();
      f = new File(dicPath);
    }
    dic = Dictionary.getInstance(f);
  } else {
    dic = Dictionary.getInstance();
  }
  return dic;
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  String stopTypesFiles = args.get("types");
  enablePositionIncrements = getBoolean("enablePositionIncrements", false);
  useWhitelist = getBoolean("useWhitelist", false);
  if (stopTypesFiles != null) {
    List<String> files = splitFileNames(stopTypesFiles);
    if (files.size() > 0) {
      stopTypes = new HashSet<String>();
      for (String file : files) {
        List<String> typesLines = getLines(loader, file.trim());
        stopTypes.addAll(typesLines);
      }
    }
  } else {
    throw new IllegalArgumentException("Missing required parameter: types.");
  }
}
/**
 * Ensure the factory works with hyphenation grammar+dictionary: using default options.
 */
public void testHyphenationWithDictionary() throws Exception {
  Reader reader = new StringReader("min veninde som er lidt af en læsehest");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory();
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  Map<String, String> args = new HashMap<String, String>();
  args.put("hyphenator", "da_UTF8.xml");
  args.put("dictionary", "da_compoundDictionary.txt");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream,
      new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 });
}
/**
 * Ensure the factory works with no dictionary: using hyphenation grammar only.
 * Also change the min/max subword sizes from the default. When using no dictionary,
 * it's generally necessary to tweak these, or you get lots of expansions.
 */
public void testHyphenationOnly() throws Exception {
  Reader reader = new StringReader("basketballkurv");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory();
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  Map<String, String> args = new HashMap<String, String>();
  args.put("hyphenator", "da_UTF8.xml");
  args.put("minSubwordSize", "2");
  args.put("maxSubwordSize", "4");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream,
      new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });
}
public void testDelim() throws Exception {
  Map<String, String> args = new HashMap<String, String>();
  args.put(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR, FloatEncoder.class.getName());
  args.put(DelimitedPayloadTokenFilterFactory.DELIMITER_ATTR, "*");
  DelimitedPayloadTokenFilterFactory factory = new DelimitedPayloadTokenFilterFactory();
  factory.init(args);
  ResourceLoader loader = new StringMockResourceLoader("solr/collection1");
  factory.inform(loader);
  TokenStream input = new MockTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1"),
      MockTokenizer.WHITESPACE, false);
  DelimitedPayloadTokenFilter tf = factory.create(input);
  tf.reset();
  while (tf.incrementToken()) {
    PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
    assertTrue("payAttr is null and it shouldn't be", payAttr != null);
    byte[] payData = payAttr.getPayload().bytes;
    assertTrue("payData is null and it shouldn't be", payData != null);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertTrue(payFloat + " does not equal: " + 0.1f, payFloat == 0.1f);
  }
}
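// The test above assumes a StringMockResourceLoader test utility. A minimal sketch
// of such a loader (the class name and constructor argument are assumptions; the
// three methods are Lucene's ResourceLoader contract):
class StringMockResourceLoader implements ResourceLoader {
  private final String text;

  StringMockResourceLoader(String text) {
    this.text = text;
  }

  @Override
  public InputStream openResource(String resource) {
    // serve the same fixed bytes no matter which resource name is requested
    return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
  }

  @Override
  public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) {
    try {
      return Class.forName(cname).asSubclass(expectedType);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public <T> T newInstance(String cname, Class<T> expectedType) {
    try {
      return findClass(cname, expectedType).getConstructor().newInstance();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
}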
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (stopWordFiles != null) {
    if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
      stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
    } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
      stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
    } else {
      throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
    }
  } else {
    if (null != format) {
      throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
    }
    stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
  }
}
/**
 * If no words are provided, then a set of English default stopwords is used.
 */
public void testDefaults() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(TestStopFilter.class);
  assertTrue("loader is null and it shouldn't be", loader != null);
  CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  Map<String, String> args = Collections.emptyMap();
  factory.init(args);
  factory.inform(loader);
  CharArraySet words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue(words.contains("the"));
  Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"),
      MockTokenizer.WHITESPACE, false);
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream, new String[] { "testing_the", "the_factory" });
}
@Test
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  TypeTokenFilterFactory factory = new TypeTokenFilterFactory();
  Map<String, String> args = new HashMap<String, String>();
  args.put("types", "stoptypes-1.txt");
  args.put("enablePositionIncrements", "true");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);
  Set<String> types = factory.getStopTypes();
  assertTrue("types is null and it shouldn't be", types != null);
  assertTrue("types Size: " + types.size() + " is not: " + 2, types.size() == 2);
  assertTrue("enablePositionIncrements was set to true but not correctly parsed",
      factory.isEnablePositionIncrements());

  factory = new TypeTokenFilterFactory();
  args.put("types", "stoptypes-1.txt, stoptypes-2.txt");
  args.put("enablePositionIncrements", "false");
  args.put("useWhitelist", "true");
  factory.init(args);
  factory.inform(loader);
  types = factory.getStopTypes();
  assertTrue("types is null and it shouldn't be", types != null);
  assertTrue("types Size: " + types.size() + " is not: " + 4, types.size() == 4);
  assertTrue("enablePositionIncrements was set to false but not correctly parsed",
      !factory.isEnablePositionIncrements());
}
/**
 * Loads the hunspell dictionary and affix files defined in the configuration.
 *
 * @param loader ResourceLoader used to load the files
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
  String[] dictionaryFiles = dictionaryArg.split(",");
  InputStream affix = null;
  List<InputStream> dictionaries = new ArrayList<InputStream>();
  try {
    for (String file : dictionaryFiles) {
      dictionaries.add(loader.openResource(file));
    }
    affix = loader.openResource(affixFile);
    this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion,
        ignoreCase, strictAffixParsing);
  } catch (ParseException e) {
    throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryArg
        + ",affix=" + affixFile + "]", e);
  } finally {
    IOUtils.closeWhileHandlingException(affix);
    IOUtils.closeWhileHandlingException(dictionaries);
  }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  InputStream stream = null;
  try {
    if (dictFile != null) { // the dictionary can be empty.
      dictionary = getWordSet(loader, dictFile, false);
    }
    // TODO: Broken, because we cannot resolve the real system id.
    // ResourceLoader should also supply a method, like ClassLoader's, to get a resource URL.
    stream = loader.openResource(hypFile);
    final InputSource is = new InputSource(stream);
    is.setEncoding(encoding); // if it's null, let the XML parser decide
    is.setSystemId(hypFile);
    hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
  } finally {
    IOUtils.closeWhileHandlingException(stream);
  }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (mapping != null) {
    List<String> wlist = null;
    File mappingFile = new File(mapping);
    if (mappingFile.exists()) {
      wlist = getLines(loader, mapping);
    } else {
      List<String> files = splitFileNames(mapping);
      wlist = new ArrayList<String>();
      for (String file : files) {
        List<String> lines = getLines(loader, file.trim());
        wlist.addAll(lines);
      }
    }
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    parseRules(wlist, builder);
    normMap = builder.build();
    if (normMap.map == null) {
      // if the inner FST is null, it means it accepts nothing (e.g. the file is empty),
      // so just set the whole map to null
      normMap = null;
    }
  }
}