public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  args.put("minGramSize", "2");
  args.put("maxGramSize", "2");
  TokenizerFactory tf = new NGramTokenizerFactory(args);

  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<>();
  rules.add("abcd=>efgh");
  synMap = new SlowSynonymMap(true);
  SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, tf);
  assertEquals(1, synMap.submap.size());
  assertEquals(1, getSubSynonymMap(synMap, "ab").submap.size());
  assertEquals(1, getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc").submap.size());
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "ef");
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "fg");
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "gh");
}
private void doTestTokenizer(String tokenizer) throws IOException {
  Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
  TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
  if (factory != null) {
    // we managed to fully create an instance. check a few more things:
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      assertNotNull(mtc);
      // it's not OK to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    }
    // beast it just a little, it shouldn't throw exceptions:
    // (it should have thrown them in initialize)
    checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);
  }
}
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  TokenizerFactory tf = new NGramTokenizerFactory();
  Map<String, String> args = new HashMap<String, String>();
  args.put("minGramSize", "2");
  args.put("maxGramSize", "2");
  tf.init(args);

  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<String>();
  rules.add("abcd=>efgh");
  synMap = new SlowSynonymMap(true);
  SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, tf);
  assertEquals(1, synMap.submap.size());
  assertEquals(1, getSubSynonymMap(synMap, "ab").submap.size());
  assertEquals(1, getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc").submap.size());
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "ef");
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "fg");
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "gh");
}
private void doTestTokenizer(String tokenizer) throws IOException {
  TokenizerFactory factory = TokenizerFactory.forName(tokenizer);
  if (initialize(factory)) {
    // we managed to fully create an instance. check a few more things:
    // if it implements MultiTermAware, sanity check its impl
    if (factory instanceof MultiTermAwareComponent) {
      AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
      assertNotNull(mtc);
      // it's not OK to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
      assertFalse(mtc instanceof CharFilterFactory);
    }
    // beast it just a little, it shouldn't throw exceptions:
    // (it should have thrown them in initialize)
    checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);
  }
}
public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<String, String>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  args.put("minGramSize", "2");
  args.put("maxGramSize", "2");
  TokenizerFactory tf = new NGramTokenizerFactory(args);

  // (ab)->(bc)->(cd)->[ef][fg][gh]
  List<String> rules = new ArrayList<String>();
  rules.add("abcd=>efgh");
  synMap = new SlowSynonymMap(true);
  SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, tf);
  assertEquals(1, synMap.submap.size());
  assertEquals(1, getSubSynonymMap(synMap, "ab").submap.size());
  assertEquals(1, getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc").submap.size());
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "ef");
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "fg");
  assertTokIncludes(getSubSynonymMap(getSubSynonymMap(synMap, "ab"), "bc"), "cd", "gh");
}
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
  // do NOT change the order of these method calls!
  // Codecs:
  PostingsFormat.reloadPostingsFormats(loader);
  DocValuesFormat.reloadDocValuesFormats(loader);
  Codec.reloadCodecs(loader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(loader);
  TokenFilterFactory.reloadTokenFilters(loader);
  TokenizerFactory.reloadTokenizers(loader);
}
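A minimal, hypothetical calling sketch for the method above (not part of the original source): the names registerExtraJars, pluginJarUrls and parentLoader are illustrative, and it assumes the call is made from the same class that declares reloadLuceneSPI.

// Hypothetical sketch: reload the SPI registries right after building a new classloader,
// so factories packaged in the extra jars become resolvable by name.
static void registerExtraJars(java.net.URL[] pluginJarUrls, ClassLoader parentLoader) {
  ClassLoader pluginLoader = java.net.URLClassLoader.newInstance(pluginJarUrls, parentLoader);
  reloadLuceneSPI(pluginLoader);
  // e.g. TokenizerFactory.availableTokenizers() now also reports implementations from the new jars
}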
public void testCreate() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); TokenizerFactory factory = new HanLPTokenizerFactory(args); Tokenizer tokenizer = factory.create(null); tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。")); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
/**
 * This method looks up a class by its fully qualified name (FQN), by its short
 * simple name, or by a package suffix, assuming "org.apache.lucene.analysis."
 * as the package prefix (e.g. "standard.ClassicTokenizerFactory" ->
 * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
 *
 * If className contains a period, the class is first looked up as-is, assuming that it
 * is an FQN. If this fails, lookup is retried after prepending the Lucene analysis
 * package prefix to the class name.
 *
 * If className does not contain a period, the analysis SPI *Factory.lookupClass()
 * methods are used to find the class.
 *
 * @param className The name or the short name of the class.
 * @param expectedType The superclass className is expected to extend
 * @return the loaded class.
 * @throws ClassNotFoundException if lookup fails
 */
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
    throws ClassNotFoundException {
  if (className.contains(".")) {
    try {
      // First, try className == FQN
      return Class.forName(className).asSubclass(expectedType);
    } catch (ClassNotFoundException e) {
      try {
        // Second, retry lookup after prepending the Lucene analysis package prefix
        return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
      } catch (ClassNotFoundException e1) {
        throw new ClassNotFoundException("Can't find class '" + className
            + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
      }
    }
  }
  // No dot - use analysis SPI lookup
  final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
  if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
    return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
    return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
    return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  }
  throw new ClassNotFoundException("Can't find class '" + className + "'");
}
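A hypothetical calling sketch for the three lookup forms described in the javadoc above; it assumes the call is made inside the class that declares lookupAnalysisClass, and the method name lookupExamples is illustrative.

void lookupExamples() throws ClassNotFoundException {
  // 1) fully qualified name
  Class<? extends TokenizerFactory> byFqn =
      lookupAnalysisClass("org.apache.lucene.analysis.standard.ClassicTokenizerFactory", TokenizerFactory.class);
  // 2) package suffix, resolved under the "org.apache.lucene.analysis." prefix
  Class<? extends TokenizerFactory> byPrefix =
      lookupAnalysisClass("standard.ClassicTokenizerFactory", TokenizerFactory.class);
  // 3) no dot: resolved via the analysis SPI after the factory suffix is stripped
  Class<? extends TokenizerFactory> bySpi =
      lookupAnalysisClass("Classic", TokenizerFactory.class);
}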
public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
                       TokenizerFactory tokenizerFactory,
                       List<TokenFilterFactory> tokenFilterFactories) {
  this.charFilterFactories = charFilterFactories;
  assert null != tokenizerFactory;
  this.tokenizerFactory = tokenizerFactory;
  this.tokenFilterFactories = tokenFilterFactories;
}
/** Test showing the behavior */
public void testSimple() throws Exception {
  Reader reader = new StringReader("我购买了道具和服装。");
  TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String, String>());
  Tokenizer tokenizer = factory.create(newAttributeFactory(), reader);
  // TODO: fix smart chinese to not emit punctuation tokens
  // at the moment: you have to clean up with WDF, or use the stoplist, etc
  assertTokenStreamContents(tokenizer,
      new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
}
public void test() throws IOException {
  for (String tokenizer : TokenizerFactory.availableTokenizers()) {
    doTestTokenizer(tokenizer);
  }
  for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
    doTestTokenFilter(tokenFilter);
  }
  for (String charFilter : CharFilterFactory.availableCharFilters()) {
    doTestCharFilter(charFilter);
  }
}
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after {@link #addToClassLoader(String,FileFilter,boolean)}
 * before using this ResourceLoader.
 */
void reloadLuceneSPI() {
  // Codecs:
  PostingsFormat.reloadPostingsFormats(this.classLoader);
  DocValuesFormat.reloadDocValuesFormats(this.classLoader);
  Codec.reloadCodecs(this.classLoader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(this.classLoader);
  TokenFilterFactory.reloadTokenFilters(this.classLoader);
  TokenizerFactory.reloadTokenizers(this.classLoader);
}
@Override
public void index(final IndexType indexType, final Collection<Song> songs) {
  executor.execute(new Runnable() {
    @Override
    public void run() {
      Stopwatch stopwatch = Stopwatch.createStarted();
      Directory directory = new RAMDirectory();
      try {
        LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
        LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());
        Analyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer("standard")
            .addTokenFilter("lowercase")
            .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25")
            .build();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        try (IndexWriter writer = new IndexWriter(directory, config)) {
          for (Song song : songs) {
            Document document = createDocument(song);
            writer.addDocument(document);
            songByUuid.put(song.getUUID(), song);
          }
        } catch (IOException e) {
          LOG.warn("couldn't index songs", e);
        }
      } catch (IOException e1) {
        LOG.warn("couldn't create analyzer", e1);
      } finally {
        putIndex(indexType, directory);
        stopwatch.stop();
        LOG.info("indexing songs in background thread took {}", stopwatch.toString());
      }
    }
  });
}
public void add(Object current) {
  if (!(current instanceof MultiTermAwareComponent)) return;
  AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent) current).getMultiTermComponent();
  if (newComponent instanceof TokenFilterFactory) {
    if (filters == null) {
      filters = new ArrayList<TokenFilterFactory>(2);
    }
    filters.add((TokenFilterFactory) newComponent);
  } else if (newComponent instanceof TokenizerFactory) {
    tokenizer = (TokenizerFactory) newComponent;
  } else if (newComponent instanceof CharFilterFactory) {
    if (charFilters == null) {
      charFilters = new ArrayList<CharFilterFactory>(1);
    }
    charFilters.add((CharFilterFactory) newComponent);
  } else {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
        "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
  }
}
private Dictionary getDictionaryByFieldType(String fieldTypeName) {
  FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
  Analyzer a = ft.getIndexAnalyzer();
  Assert.assertEquals(a.getClass(), TokenizerChain.class);

  TokenizerChain tc = (TokenizerChain) a;
  TokenizerFactory tf = tc.getTokenizerFactory();
  Assert.assertEquals(tf.getClass(), MMSegTokenizerFactory.class);

  MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;
  Assert.assertNotNull(mtf.dic);
  return mtf.dic;
}
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after {@link #addToClassLoader(String)}
 * and {@link #addToClassLoader(String,FileFilter)} before using
 * this ResourceLoader.
 */
void reloadLuceneSPI() {
  // Codecs:
  PostingsFormat.reloadPostingsFormats(this.classLoader);
  DocValuesFormat.reloadDocValuesFormats(this.classLoader);
  Codec.reloadCodecs(this.classLoader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(this.classLoader);
  TokenFilterFactory.reloadTokenFilters(this.classLoader);
  TokenizerFactory.reloadTokenizers(this.classLoader);
}
public SuggestFieldInfo(final List<String> fieldNameList,
                        final TokenizerFactory tokenizerFactory,
                        final SuggestReadingConverter suggestReadingConverter,
                        final SuggestNormalizer suggestNormalizer) {
  this.fieldNameList = fieldNameList;
  this.tokenizerFactory = tokenizerFactory;
  this.suggestReadingConverter = suggestReadingConverter;
  this.suggestNormalizer = suggestNormalizer;
}
public DocumentReader(final TokenizerFactory tokenizerFactory,
                      final SuggestReadingConverter suggestReadingConverter,
                      final SuggestNormalizer suggestNormalizer,
                      final SolrInputDocument solrInputDocument,
                      final List<String> targetFields,
                      final List<String> targetLabelFields,
                      final List<String> targetRoleFields,
                      final String expiresField,
                      final String segmentField,
                      final Set<String> badWordSet) {
  this.solrInputDocument = solrInputDocument;
  this.targetFields = targetFields;
  this.targetLabelFields = targetLabelFields;
  this.targetRoleFields = targetRoleFields;
  this.tokenizerFactory = tokenizerFactory;
  this.expiresField = expiresField;
  this.segmentField = segmentField;
  this.suggestReadingConverter = suggestReadingConverter;
  this.suggestNormalizer = suggestNormalizer;
  this.badWordSet = badWordSet;

  final Object expireObj = solrInputDocument.getFieldValue(expiresField);
  if (expireObj != null) {
    expire = expireObj.toString();
  } else {
    expire = DateUtil.getThreadLocalDateFormat().format(new Date());
  }
  final Object segmentObj = solrInputDocument.getFieldValue(segmentField);
  if (segmentObj != null) {
    segment = segmentObj.toString();
  } else {
    segment = StringUtil.EMPTY;
  }
}
public static TokenizerFactory getTokenizerFactory(final SuggestUpdateConfig config) {
  try {
    final Map<String, String> args = new HashMap<String, String>();
    final Class<?> cls = Class.forName(config.getFieldConfigList().get(0)
        .getTokenizerConfig().getClassName());
    final Constructor<?> constructor = cls.getConstructor(Map.class);
    final TokenizerFactory tokenizerFactory = (TokenizerFactory) constructor.newInstance(args);
    return tokenizerFactory;
  } catch (final Exception e) {
    e.printStackTrace();
    return null;
  }
}
private List<SuggestFieldInfo> getSuggestFieldInfoList(final SuggestUpdateConfig config, final boolean multi) {
  final List<SuggestFieldInfo> list = new ArrayList<SuggestFieldInfo>();

  final List<String> fieldNameList = new ArrayList<String>();
  fieldNameList.add("content");
  final TokenizerFactory tokenizerFactory = TestUtils.getTokenizerFactory(config);
  final SuggestReadingConverter suggestReadingConverter = TestUtils.createConverter();
  final SuggestNormalizer suggestNormalizer = TestUtils.createNormalizer();
  final SuggestFieldInfo suggestFieldInfo = new SuggestFieldInfo(
      fieldNameList, tokenizerFactory, suggestReadingConverter, suggestNormalizer);
  list.add(suggestFieldInfo);

  if (multi) {
    final List<String> fieldNameList2 = new ArrayList<String>();
    fieldNameList2.add("title");
    final SuggestReadingConverter suggestReadingConverter2 = TestUtils.createConverter();
    final SuggestNormalizer suggestNormalizer2 = TestUtils.createNormalizer();
    final SuggestFieldInfo suggestFieldInfo2 = new SuggestFieldInfo(
        fieldNameList2, null, suggestReadingConverter2, suggestNormalizer2);
    list.add(suggestFieldInfo2);
  }
  return list;
}
public void testTokenizers() {
  Set<String> missing = new TreeSet<String>(
      org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers());
  missing.removeAll(getTokenizers().keySet());
  assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(),
      missing.isEmpty());
}
FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter) {
  assert tokenizer != null;
  this.tokenizer = tokenizer;
  this.charFilter = charFilter;
  this.tokenfilter = tokenfilter;
}
public TokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
  this(null, tokenizer, filters);
}
public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
  this.charFilters = charFilters;
  this.tokenizer = tokenizer;
  this.filters = filters;
}
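A hypothetical wiring sketch for the two TokenizerChain constructors above, reusing a tokenizer factory that appears earlier in these examples; the method name buildExampleChain, the empty argument map, and the empty filter array are illustrative choices rather than part of the original snippet.

TokenizerChain buildExampleChain() {
  // tokenizer factory constructed with an empty argument map, as in the HMMChinese example above
  TokenizerFactory tok = new HMMChineseTokenizerFactory(new HashMap<String, String>());
  // no token filters; char filters are left null via the two-argument constructor
  return new TokenizerChain(tok, new TokenFilterFactory[0]);
}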