/**
 * Builds the factory from index-level analysis settings.
 *
 * <p>{@code output_unigrams} is read with {@code false} passed as its default. Bigramming is
 * switched on for han, hiragana, katakana and hangul, minus any script named in the
 * {@code ignored_scripts} array. The {@code environment} argument is not used here.
 */
public CJKBigramFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    outputUnigrams = settings.getAsBooleanLenientForPreEs6Indices(
        indexSettings.getIndexVersionCreated(), "output_unigrams", false, deprecationLogger);

    // Collect the scripts the user asked to skip; an absent array leaves the set empty.
    final String[] ignoredScripts = settings.getAsArray("ignored_scripts");
    final Set<String> ignored = new HashSet<>();
    if (ignoredScripts != null) {
        ignored.addAll(Arrays.asList(ignoredScripts));
    }

    // Every known script contributes its bit unless it was explicitly ignored.
    int scriptMask = 0;
    if (!ignored.contains("han")) {
        scriptMask |= CJKBigramFilter.HAN;
    }
    if (!ignored.contains("hiragana")) {
        scriptMask |= CJKBigramFilter.HIRAGANA;
    }
    if (!ignored.contains("katakana")) {
        scriptMask |= CJKBigramFilter.KATAKANA;
    }
    if (!ignored.contains("hangul")) {
        scriptMask |= CJKBigramFilter.HANGUL;
    }
    this.flags = scriptMask;
}
/**
 * Creates a new CJKBigramFilterFactory.
 *
 * <p>Reads the boolean arguments {@code han}, {@code hiragana}, {@code katakana} and
 * {@code hangul} (each with {@code true} passed as the default) to build the script flags,
 * and {@code outputUnigrams} (with {@code false} passed as the default).
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty once the arguments above
 *         have been read
 */
public CJKBigramFilterFactory(Map<String,String> args) {
    super(args);

    // One bit per script; a script contributes nothing when its argument reads as false.
    int scriptMask = getBoolean(args, "han", true) ? CJKBigramFilter.HAN : 0;
    scriptMask |= getBoolean(args, "hiragana", true) ? CJKBigramFilter.HIRAGANA : 0;
    scriptMask |= getBoolean(args, "katakana", true) ? CJKBigramFilter.KATAKANA : 0;
    scriptMask |= getBoolean(args, "hangul", true) ? CJKBigramFilter.HANGUL : 0;
    this.flags = scriptMask;

    this.outputUnigrams = getBoolean(args, "outputUnigrams", false);

    final boolean leftovers = !args.isEmpty();
    if (leftovers) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
/**
 * Injected constructor that configures the factory from the filter's settings.
 *
 * <p>{@code output_unigrams} is read with {@code false} passed as its default. Bigramming is
 * switched on for han, hiragana, katakana and hangul, minus any script named in the
 * {@code ignored_scripts} array.
 */
@Inject
public CJKBigramFilterFactory(Index index, IndexSettingsService indexSettingsService, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    outputUnigrams = settings.getAsBoolean("output_unigrams", false);

    // Collect the scripts the user asked to skip; an absent array leaves the set empty.
    final String[] ignoredScripts = settings.getAsArray("ignored_scripts");
    final Set<String> ignored = new HashSet<>();
    if (ignoredScripts != null) {
        ignored.addAll(Arrays.asList(ignoredScripts));
    }

    // Every known script contributes its bit unless it was explicitly ignored.
    int scriptMask = 0;
    if (!ignored.contains("han")) {
        scriptMask |= CJKBigramFilter.HAN;
    }
    if (!ignored.contains("hiragana")) {
        scriptMask |= CJKBigramFilter.HIRAGANA;
    }
    if (!ignored.contains("katakana")) {
        scriptMask |= CJKBigramFilter.KATAKANA;
    }
    if (!ignored.contains("hangul")) {
        scriptMask |= CJKBigramFilter.HANGUL;
    }
    this.flags = scriptMask;
}
/**
 * Initialises the factory from its argument map.
 *
 * <p>After delegating to the superclass, resets {@code flags} and sets one bit for each of
 * {@code han}, {@code hiragana}, {@code katakana} and {@code hangul} whose boolean argument
 * reads as true ({@code true} is passed as the default), then reads {@code outputUnigrams}
 * ({@code false} is passed as the default).
 *
 * @param args factory arguments, forwarded to the superclass
 */
@Override
public void init(Map<String,String> args) {
    super.init(args);

    // Start from a clean mask so repeated initialisation does not accumulate bits.
    flags = 0;
    flags |= getBoolean("han", true) ? CJKBigramFilter.HAN : 0;
    flags |= getBoolean("hiragana", true) ? CJKBigramFilter.HIRAGANA : 0;
    flags |= getBoolean("katakana", true) ? CJKBigramFilter.KATAKANA : 0;
    flags |= getBoolean("hangul", true) ? CJKBigramFilter.HANGUL : 0;

    outputUnigrams = getBoolean("outputUnigrams", false);
}
/**
 * Creates the two analyzers shared by the tests in this class.
 *
 * <p>{@code analyzer} chains an ICU tokenizer, a {@link CJKBigramFilter} and a
 * {@link StopFilter} with an empty stop set. {@code analyzer2} is the same chain with an
 * {@code nfkc_cf} ICU normalizer inserted between the tokenizer and the bigram filter.
 */
@BeforeClass
public static void setUp() throws Exception {
    analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer tokenizer = new IcuTokenizer(
                AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                new DefaultIcuTokenizerConfig(false, true));
            final TokenStream bigrams = new CJKBigramFilter(tokenizer);
            final TokenStream stopped = new StopFilter(bigrams, CharArraySet.EMPTY_SET);
            return new TokenStreamComponents(tokenizer, stopped);
        }
    };

    analyzer2 = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer tokenizer = new IcuTokenizer(
                AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                new DefaultIcuTokenizerConfig(false, true));
            // Normalise before bigramming so the bigram filter sees folded text.
            final TokenStream normalized = new IcuNormalizerFilter(
                tokenizer,
                Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            final TokenStream bigrams = new CJKBigramFilter(normalized);
            final TokenStream stopped = new StopFilter(bigrams, CharArraySet.EMPTY_SET);
            return new TokenStreamComponents(tokenizer, stopped);
        }
    };
}
/**
 * Wraps the given stream in a {@link CJKBigramFilter} configured with this factory's
 * {@code flags} and {@code outputUnigrams} values.
 *
 * @param tokenStream the stream to wrap
 * @return the newly created bigram filter
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final CJKBigramFilter bigramFilter = new CJKBigramFilter(tokenStream, flags, outputUnigrams);
    return bigramFilter;
}
/**
 * Wraps the given stream in a {@link CJKBigramFilter} configured with this factory's
 * {@code flags} and {@code outputUnigrams} values.
 *
 * @param input the stream to wrap
 * @return the newly created bigram filter
 */
@Override
public TokenStream create(TokenStream input) {
    final CJKBigramFilter bigramFilter = new CJKBigramFilter(input, flags, outputUnigrams);
    return bigramFilter;
}
/**
 * Builds the analysis chain: an ICU tokenizer over {@code reader}, followed by a
 * {@link CJKBigramFilter} and a {@link StopFilter} with an empty stop set.
 *
 * @param fieldName the field being analyzed (not consulted here)
 * @param reader the text source handed to the tokenizer
 * @return the components pairing the tokenizer with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new ICUTokenizer(
        newAttributeFactory(),
        reader,
        new DefaultICUTokenizerConfig(false));
    final TokenStream bigrams = new CJKBigramFilter(tokenizer);
    final TokenStream stopped = new StopFilter(bigrams, CharArraySet.EMPTY_SET);
    return new TokenStreamComponents(tokenizer, stopped);
}
/**
 * Builds the analysis chain: an ICU tokenizer over {@code reader}, followed by a
 * {@link CJKBigramFilter} and a {@link StopFilter} with an empty stop set.
 *
 * @param fieldName the field being analyzed (not consulted here)
 * @param reader the text source handed to the tokenizer
 * @return the components pairing the tokenizer with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new ICUTokenizer(reader);
    final TokenStream bigrams = new CJKBigramFilter(tokenizer);
    final TokenStream stopped = new StopFilter(TEST_VERSION_CURRENT, bigrams, CharArraySet.EMPTY_SET);
    return new TokenStreamComponents(tokenizer, stopped);
}