/** * Test stopwords in snowball format */ public void testSnowballListLoading() throws IOException { String s = "|comment\n" + // commented line " |comment\n" + // commented line with leading whitespace "\n" + // blank line " \t\n" + // line with only whitespace " |comment | comment\n" + // commented line with comment "ONE\n" + // stopword, in uppercase " two \n" + // stopword with leading/trailing space " three four five \n" + // multiple stopwords "six seven | comment\n"; //multiple stopwords + comment CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s)); assertEquals(7, wordset.size()); assertTrue(wordset.contains("ONE")); assertTrue(wordset.contains("two")); assertTrue(wordset.contains("three")); assertTrue(wordset.contains("four")); assertTrue(wordset.contains("five")); assertTrue(wordset.contains("six")); assertTrue(wordset.contains("seven")); }
/** * Test stopwords in snowball format */ public void testSnowballListLoading() throws IOException { String s = "|comment\n" + // commented line " |comment\n" + // commented line with leading whitespace "\n" + // blank line " \t\n" + // line with only whitespace " |comment | comment\n" + // commented line with comment "ONE\n" + // stopword, in uppercase " two \n" + // stopword with leading/trailing space " three four five \n" + // multiple stopwords "six seven | comment\n"; //multiple stopwords + comment CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s), TEST_VERSION_CURRENT); assertEquals(7, wordset.size()); assertTrue(wordset.contains("ONE")); assertTrue(wordset.contains("two")); assertTrue(wordset.contains("three")); assertTrue(wordset.contains("four")); assertTrue(wordset.contains("five")); assertTrue(wordset.contains("six")); assertTrue(wordset.contains("seven")); }
/**
 * Returns the default stop word set for a language.
 *
 * <p>The language code is matched case-insensitively. Codes that match none of
 * the branches below (including {@code null}) fall back to
 * {@link StandardAnalyzer#STOP_WORDS_SET}.
 *
 * @param language language code, e.g. {@code "en"}, {@code "it"}, {@code "de"}
 * @return the stop word set for the language
 * @throws RuntimeException if loading a stop word resource fails; the original
 *     failure is attached as the cause
 */
public static CharArraySet getDefaultStopSet(String language) {
  try {
    if ("en".equalsIgnoreCase(language)) {
      return StandardAnalyzer.STOP_WORDS_SET;
    } else if ("es".equalsIgnoreCase(language)) {
      return WordlistLoader.getSnowballWordSet(
          IOUtils.getDecodingReader(SnowballFilter.class, "spanish_stop.txt", StandardCharsets.UTF_8));
    } else if ("fr".equalsIgnoreCase(language)) {
      return WordlistLoader.getSnowballWordSet(
          IOUtils.getDecodingReader(SnowballFilter.class, "french_stop.txt", StandardCharsets.UTF_8));
    } else if ("de".equalsIgnoreCase(language)) {
      return WordlistLoader.getSnowballWordSet(
          IOUtils.getDecodingReader(SnowballFilter.class, "german_stop.txt", StandardCharsets.UTF_8));
    } else if ("pl".equalsIgnoreCase(language)) {
      return WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(PolishAnalyzer.class, "stopwords.txt", StandardCharsets.UTF_8), "#");
    } else if ("pt".equalsIgnoreCase(language) || "br".equalsIgnoreCase(language)) {
      // "br" shares the Portuguese list.
      return WordlistLoader.getSnowballWordSet(
          IOUtils.getDecodingReader(SnowballFilter.class, "portuguese_stop.txt", StandardCharsets.UTF_8));
    } else if ("it".equalsIgnoreCase(language)) {
      return WordlistLoader.getSnowballWordSet(
          IOUtils.getDecodingReader(SnowballFilter.class, "italian_stop.txt", StandardCharsets.UTF_8));
    } else if ("cz".equalsIgnoreCase(language) || "sk".equalsIgnoreCase(language)) {
      // "sk" shares the list shipped with CzechAnalyzer.
      return WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(CzechAnalyzer.class, "stopwords.txt", StandardCharsets.UTF_8), "#");
    } else if ("tr".equalsIgnoreCase(language)) {
      return TurkishAnalyzer.loadStopwordSet(false, TurkishAnalyzer.class, "stopwords.txt", "#");
    } else if ("ru".equalsIgnoreCase(language)) {
      return WordlistLoader.getSnowballWordSet(
          IOUtils.getDecodingReader(SnowballFilter.class, "russian_stop.txt", StandardCharsets.UTF_8));
    } else if ("ro".equalsIgnoreCase(language)) {
      return RomanianAnalyzer.loadStopwordSet(false, RomanianAnalyzer.class, "stopwords.txt", "#");
    } else if ("bg".equalsIgnoreCase(language)) {
      return BulgarianAnalyzer.loadStopwordSet(false, BulgarianAnalyzer.class, "stopwords.txt", "#");
    } else if ("nl".equalsIgnoreCase(language)) {
      return WordlistLoader.getSnowballWordSet(
          IOUtils.getDecodingReader(SnowballFilter.class, "dutch_stop.txt", StandardCharsets.UTF_8));
    }
  } catch (Exception e) {
    // Keep the underlying failure as the cause so the real reason
    // (missing resource, decoding error, ...) is not lost.
    throw new RuntimeException("Unable to load default stopword set", e);
  }
  // No branch matched: use the standard set.
  return StandardAnalyzer.STOP_WORDS_SET;
}
static CharArraySet loadDefaultStopWordSet() throws IOException { // make sure it is unmodifiable as we expose it in the outer class return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils .getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), STOPWORD_FILE_COMMENT, Version.LATEST)); }
/**
 * Loads the same three-word list through a plain reader and through an
 * already-buffered reader, and runs {@code checkSet} on each result.
 */
public void testWordlistLoading() throws IOException {
  final String words = "ONE\n two \nthree";

  // Plain, unbuffered reader.
  checkSet(WordlistLoader.getWordSet(new StringReader(words)));

  // Reader that is already wrapped in a BufferedReader.
  checkSet(WordlistLoader.getWordSet(new BufferedReader(new StringReader(words))));
}
/**
 * Loads a word list with "#" passed as the comment marker and verifies that
 * the trailing "#comment" line adds no entry to the set.
 */
public void testComments() throws Exception {
  final String input = "ONE\n two \nthree\n#comment";
  final CharArraySet loaded = WordlistLoader.getWordSet(new StringReader(input), "#");

  checkSet(loaded);

  // Neither the raw comment line nor its text may appear as a word.
  for (String unexpected : new String[] {"#comment", "comment"}) {
    assertFalse(loaded.contains(unexpected));
  }
}
/**
 * Opens the named resource and hands its stream, together with the charset,
 * to {@code WordlistLoader.getLines}.
 *
 * @param resource name of the resource, as accepted by {@code openResource}
 * @param charset charset passed on to {@code WordlistLoader.getLines}
 * @return the lines returned by {@code WordlistLoader.getLines}
 * @throws IOException if the resource cannot be opened or read
 * @throws SolrException with {@code SERVER_ERROR} when a
 *     {@link CharacterCodingException} is raised while loading
 */
public List<String> getLines(String resource, Charset charset) throws IOException {
  List<String> lines;
  try {
    lines = WordlistLoader.getLines(openResource(resource), charset);
  } catch (CharacterCodingException ex) {
    // Report the decoding failure as a server error that names the resource.
    throw new SolrException(
        SolrException.ErrorCode.SERVER_ERROR,
        "Error loading resource (wrong encoding?): " + resource,
        ex);
  }
  return lines;
}
static CharArraySet loadDefaultStopWordSet() throws IOException { // make sure it is unmodifiable as we expose it in the outer class return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils .getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT)); }
/**
 * Loads the same three-word list through a plain reader and through an
 * already-buffered reader (version-aware overload), and runs {@code checkSet}
 * on each result.
 */
public void testWordlistLoading() throws IOException {
  final String words = "ONE\n two \nthree";

  // Plain, unbuffered reader.
  checkSet(WordlistLoader.getWordSet(new StringReader(words), TEST_VERSION_CURRENT));

  // Reader that is already wrapped in a BufferedReader.
  checkSet(
      WordlistLoader.getWordSet(
          new BufferedReader(new StringReader(words)), TEST_VERSION_CURRENT));
}
/**
 * Loads a word list with "#" passed as the comment marker (version-aware
 * overload) and verifies that the trailing "#comment" line adds no entry.
 */
public void testComments() throws Exception {
  final String input = "ONE\n two \nthree\n#comment";
  final CharArraySet loaded =
      WordlistLoader.getWordSet(new StringReader(input), "#", TEST_VERSION_CURRENT);

  checkSet(loaded);

  // Neither the raw comment line nor its text may appear as a word.
  for (String unexpected : new String[] {"#comment", "comment"}) {
    assertFalse(loaded.contains(unexpected));
  }
}
/**
 * Populates {@code stopWords} from the "stopwords.txt" resource, decoded as
 * UTF-8, with "#" passed as the comment marker.
 *
 * <p>The resource is looked up via {@code getClass()}, i.e. relative to the
 * runtime class of this instance.
 *
 * @throws RuntimeException if the resource cannot be read; the underlying
 *     {@link IOException} is attached as the cause
 */
protected void initStopWords() {
  try {
    stopWords = WordlistLoader.getWordSet(
        IOUtils.getDecodingReader(getClass(), "stopwords.txt", StandardCharsets.UTF_8),
        "#",
        getLuceneMatchVersion());
  } catch (IOException ex) {
    // Chain the I/O failure so the real reason is visible in the stack trace.
    throw new RuntimeException("Unable to load default stopword set", ex);
  }
}
static CharArraySet loadDefaultStopWordSet() throws IOException { // make sure it is unmodifiable as we expose it in the outer class return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet( IOUtils.getDecodingReader(SimpleChineseAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), STOPWORD_FILE_COMMENT)); }
/**
 * Opens {@code resource} through the given loader and passes the stream to
 * {@code WordlistLoader.getLines} with UTF-8 as the charset.
 *
 * @param loader loader used to open the resource
 * @param resource name of the resource to open
 * @return the lines returned by {@code WordlistLoader.getLines}
 * @throws IOException if the resource cannot be opened or read
 */
private List<String> getLines(ResourceLoader loader, String resource) throws IOException {
  final List<String> lines =
      WordlistLoader.getLines(loader.openResource(resource), StandardCharsets.UTF_8);
  return lines;
}