public PatternAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
    boolean lowercase =
        settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "lowercase", true, deprecationLogger);
    CharArraySet stopWords = Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, defaultStopwords);

    String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (sPattern == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }
    Pattern pattern = Regex.compile(sPattern, settings.get("flags"));

    analyzer = new PatternAnalyzer(pattern, lowercase, stopWords);
}
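// Hypothetical configuration sketch (not from the source above): these are the index-level
// settings keys the PatternAnalyzerProvider constructor reads; the values chosen here are
// illustrative only, and the IndexSettings/Environment wiring supplied by the node is omitted.
import org.elasticsearch.common.settings.Settings;

public class PatternAnalyzerSettingsSketch {
    public static Settings example() {
        return Settings.builder()
            .put("pattern", ",")                // tokenize on commas instead of the default \W+
            .put("flags", "CASE_INSENSITIVE")   // passed to Regex.compile as the flags string
            .put("lowercase", false)            // disable lowercasing of tokens
            .build();
    }
}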
public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
    String value = settings.get("stem_exclusion");
    if (value != null) {
        if ("_none_".equals(value)) {
            return CharArraySet.EMPTY_SET;
        } else {
            // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
            return new CharArraySet(Strings.commaDelimitedListToSet(value), false);
        }
    }
    String[] stemExclusion = settings.getAsArray("stem_exclusion", null);
    if (stemExclusion != null) {
        // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
        return new CharArraySet(Arrays.asList(stemExclusion), false);
    } else {
        return defaultStemExclusion;
    }
}
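// A minimal usage sketch (not part of the Analysis class above) showing how parseStemExclusion
// resolves the "stem_exclusion" setting; the class name and word values here are hypothetical,
// while the Settings and CharArraySet APIs are the same ones used above.
import org.apache.lucene.analysis.CharArraySet;
import org.elasticsearch.common.settings.Settings;

public class ParseStemExclusionSketch {
    public static void main(String[] args) {
        // "_none_" maps to the shared empty set
        Settings none = Settings.builder().put("stem_exclusion", "_none_").build();
        assert Analysis.parseStemExclusion(none, CharArraySet.EMPTY_SET).isEmpty();

        // a comma-delimited value becomes a case-sensitive CharArraySet
        Settings listed = Settings.builder().put("stem_exclusion", "running,swimming").build();
        CharArraySet set = Analysis.parseStemExclusion(listed, CharArraySet.EMPTY_SET);
        assert set.contains("running") && set.contains("swimming");

        // with no value configured, the supplied default is returned unchanged
        assert Analysis.parseStemExclusion(Settings.EMPTY, CharArraySet.EMPTY_SET) == CharArraySet.EMPTY_SET;
    }
}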
public static CharArraySet parseWords(Environment env, Settings settings, String name, CharArraySet defaultWords,
                                      Map<String, Set<?>> namedWords, boolean ignoreCase) {
    String value = settings.get(name);
    if (value != null) {
        if ("_none_".equals(value)) {
            return CharArraySet.EMPTY_SET;
        } else {
            return resolveNamedWords(Strings.commaDelimitedListToSet(value), namedWords, ignoreCase);
        }
    }
    List<String> pathLoadedWords = getWordList(env, settings, name);
    if (pathLoadedWords != null) {
        return resolveNamedWords(pathLoadedWords, namedWords, ignoreCase);
    }
    return defaultWords;
}
@Test
public void testOverlappingAtBeginning() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "new york", "new york city", "city of new york"), false);
    final String input = "new york city is great";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);
    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("new_york_city", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("is", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
}
@Test
public void testOverlappingAtEnd() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "new york", "new york city", "city of new york"), false);
    final String input = "the great city of new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);
    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("the", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("city_of_new_york", term.toString());
}
@Test
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "big apple", "new york city", "property tax", "three word phrase"), false);
    final String input = "some new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);
    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("some", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
}
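// A minimal sketch (assumed, not part of the test class above) of wiring AutoPhrasingTokenFilter
// into a reusable Analyzer so the same phrase set is applied during normal analysis;
// AutoPhrasingTokenFilter itself is assumed to be available on the classpath.
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

public class AutoPhrasingAnalyzerSketch extends Analyzer {
    private final CharArraySet phraseSets = new CharArraySet(
        Arrays.asList("new york", "new york city", "city of new york"), false);

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(source, phraseSets, false);
        filter.setReplaceWhitespaceWith('_');
        return new TokenStreamComponents(source, filter);
    }
}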
public RomanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new RomanianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, RomanianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public BasqueAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BasqueAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, BasqueAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public StandardHtmlStripAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
    CharArraySet stopWords = Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, defaultStopwords);
    analyzer = new StandardHtmlStripAnalyzer(stopWords);
    analyzer.setVersion(version);
}
public IndonesianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new IndonesianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, IndonesianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public ArabicAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    arabicAnalyzer = new ArabicAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, ArabicAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    arabicAnalyzer.setVersion(version);
}
public SnowballAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    String language = settings.get("language", settings.get("name", "English"));
    CharArraySet defaultStopwords = DEFAULT_LANGUAGE_STOPWORDS.getOrDefault(language, CharArraySet.EMPTY_SET);
    CharArraySet stopWords = Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, defaultStopwords);

    analyzer = new SnowballAnalyzer(language, stopWords);
    analyzer.setVersion(version);
}
public StopAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    CharArraySet stopWords = Analysis.parseStopWords(
        env, indexSettings.getIndexVersionCreated(), settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    this.stopAnalyzer = new StopAnalyzer(stopWords);
    this.stopAnalyzer.setVersion(version);
}
public SwedishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new SwedishAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, SwedishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public StandardAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
    CharArraySet stopWords = Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, defaultStopwords);
    int maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
    standardAnalyzer = new StandardAnalyzer(stopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(maxTokenLength);
}
public SpanishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new SpanishAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, SpanishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    // Sample format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }

    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // If set, causes "PowerShot" to become two tokens ("Power-Shot" remains two parts regardless)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // If set, causes "j2se" to become three tokens: "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);

    // If not null, this is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}
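// Hypothetical example (not from the source) of settings a user might supply to this factory;
// the keys are exactly the ones parsed above, while the chosen values are illustrative only.
import org.elasticsearch.common.settings.Settings;

public class WordDelimiterSettingsSketch {
    public static Settings example() {
        return Settings.builder()
            .put("generate_word_parts", true)   // "PowerShot" => "Power", "Shot"
            .put("catenate_words", true)        // "wi-fi" => "wifi"
            .put("preserve_original", true)     // also emit the unsplit original token
            .put("split_on_numerics", false)    // keep "j2se" as a single token
            .build();
    }
}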
public PortugueseAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new PortugueseAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, PortugueseAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public DanishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new DanishAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, DanishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public ArmenianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new ArmenianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, ArmenianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public CjkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    CharArraySet stopWords = Analysis.parseStopWords(
        env, indexSettings.getIndexVersionCreated(), settings, CJKAnalyzer.getDefaultStopSet());
    analyzer = new CJKAnalyzer(stopWords);
    analyzer.setVersion(version);
}
public GalicianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new GalicianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, GalicianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public GermanAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new GermanAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, GermanAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new EnglishAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, EnglishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public BrazilianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BrazilianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, BrazilianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public ItalianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new ItalianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, ItalianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    // Sample format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }

    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // If set, causes "PowerShot" to become two tokens ("Power-Shot" remains two parts regardless)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // If set, causes "j2se" to become three tokens: "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);

    // If not null, this is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}
public FrenchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new FrenchAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, FrenchAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public HungarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new HungarianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, HungarianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public CzechAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new CzechAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, CzechAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public NorwegianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new NorwegianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, NorwegianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public FinnishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new FinnishAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, FinnishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public DutchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new DutchAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, DutchAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public RussianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new RussianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, RussianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public BulgarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BulgarianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, BulgarianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public TurkishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new TurkishAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, TurkishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public CatalanAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new CatalanAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, CatalanAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public SoraniAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new SoraniAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, SoraniAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public HindiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new HindiAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, HindiAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
public LithuanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new LithuanianAnalyzer(
        Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, LithuanianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}