public static void url(String id, URL u, String url_in, SpimeDB db, float pri) {
    DObject p = db.get(id);
    Long whenCached = p != null ? p.get("url_cached") : null;
    try {
        // re-crawl only when there is no cached copy or the remote resource is newer
        if (whenCached == null || whenCached < u.openConnection().getLastModified()) {
            String urlString = u.toString();
            Set<String> keywords = parseKeywords(new LowerCaseTokenizer(), urlString);
            MutableNObject n = new MutableNObject(id)
                    .withTags(keywords.toArray(new String[keywords.size()]))
                    .put("url_in", url_in)
                    .put("url", urlString);
            //logger.info("crawl {}", n);
            db.addAsync(pri, n);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
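// parseKeywords(...) is referenced above but not shown here. A minimal sketch of what such a
// helper could look like, assuming it drives the supplied Lucene Tokenizer over the URL string
// and collects the emitted terms; the signature is inferred from the call site, and everything
// else (HashSet, the Lucene 5+ setReader workflow) is an assumption, not the project's actual code.
static Set<String> parseKeywords(Tokenizer tokenizer, String text) throws IOException {
    Set<String> keywords = new HashSet<>();
    tokenizer.setReader(new StringReader(text)); // no-arg Tokenizer ctor above implies Lucene 5+ setReader usage
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        keywords.add(term.toString());
    }
    tokenizer.end();
    tokenizer.close();
    return keywords;
}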
public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(1, true);
    set.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new SetKeywordMarkerFilter(new LowerCaseTokenizer(new StringReader("Fischen Trinken")), set));
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(1, true);
    set.add("Brasília");
    BrazilianStemFilter filter = new BrazilianStemFilter(
        new SetKeywordMarkerFilter(new LowerCaseTokenizer(new StringReader("Brasília Brasilia")), set));
    assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
public void testReadSupplementaryChars() throws IOException {
    StringBuilder builder = new StringBuilder();
    // create random input
    int num = 1024 + random().nextInt(1024);
    num *= RANDOM_MULTIPLIER;
    for (int i = 1; i < num; i++) {
        builder.append("\ud801\udc1cabc");
        if ((i % 10) == 0)
            builder.append(" ");
    }
    // internal buffer size is 1024; make sure we have a surrogate pair right at the border
    builder.insert(1023, "\ud801\udc1c");
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
public void testExtendCharBuffer() throws IOException {
    for (int i = 0; i < 40; i++) {
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < 1 + i; j++) {
            builder.append("a");
        }
        builder.append("\ud801\udc1cabc");
        Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), new StringReader(builder.toString()));
        assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
    }
}
public void testMaxWordLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 255; i++) {
        builder.append("A");
    }
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {
        builder.toString().toLowerCase(Locale.ROOT),
        builder.toString().toLowerCase(Locale.ROOT)
    });
}
public void testMaxWordLengthWithSupplementary() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 254; i++) {
        builder.append("A");
    }
    builder.append("\ud801\udc1c");
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {
        builder.toString().toLowerCase(Locale.ROOT),
        builder.toString().toLowerCase(Locale.ROOT)
    });
}
public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Fischen Trinken")), set));
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("Brasília");
    BrazilianStemFilter filter = new BrazilianStemFilter(
        new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Brasília Brasilia")), set));
    assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
public void testReadSupplementaryChars() throws IOException {
    StringBuilder builder = new StringBuilder();
    // create random input
    int num = 1024 + random().nextInt(1024);
    num *= RANDOM_MULTIPLIER;
    for (int i = 1; i < num; i++) {
        builder.append("\ud801\udc1cabc");
        if ((i % 10) == 0)
            builder.append(" ");
    }
    // internal buffer size is 1024; make sure we have a surrogate pair right at the border
    builder.insert(1023, "\ud801\udc1c");
    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
public void testExtendCharBuffer() throws IOException {
    for (int i = 0; i < 40; i++) {
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < 1 + i; j++) {
            builder.append("a");
        }
        builder.append("\ud801\udc1cabc");
        Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
        assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
    }
}
public void testMaxWordLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 255; i++) {
        builder.append("A");
    }
    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {
        builder.toString().toLowerCase(Locale.ROOT),
        builder.toString().toLowerCase(Locale.ROOT)
    });
}
public void testMaxWordLengthWithSupplementary() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 254; i++) {
        builder.append("A");
    }
    builder.append("\ud801\udc1c");
    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {
        builder.toString().toLowerCase(Locale.ROOT),
        builder.toString().toLowerCase(Locale.ROOT)
    });
}
public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Fischen Trinken")), set));
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("Brasília");
    BrazilianStemFilter filter = new BrazilianStemFilter(
        new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Brasília Brasilia")), set));
    assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new LowerCaseTokenizer();
    TokenStream filter = new NGramTokenFilter(tokenizer, 1, 5);
    return new TokenStreamComponents(tokenizer, filter);
}
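// For illustration only: the createComponents override above can be exercised through an
// anonymous Analyzer as sketched here. The method name, field name "body", and sample text are
// made up, and a pre-8 Lucene API (no-arg LowerCaseTokenizer, 3-arg NGramTokenFilter) is assumed.
static void printNgramsOfHello() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new LowerCaseTokenizer();
            TokenStream filter = new NGramTokenFilter(tokenizer, 1, 5);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    try (TokenStream ts = analyzer.tokenStream("body", "Hello")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // 1..5-character substrings of "hello"
        }
        ts.end();
    }
}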
@Override
public Tokenizer create() {
    return new LowerCaseTokenizer();
}
public static List<String> getNgrams(String text, int N) throws IOException {
    List<String> tokens = new ArrayList<String>();
    Reader reader = new StringReader(text);

    // Tokenizer
    //StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_46, reader);
    LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_46, reader);

    // Filters
    LowerCaseFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_46, tokenizer);
    KStemFilter kStemFilter = new KStemFilter(lowerCaseFilter);
    CharArraySet stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    StopFilter stopFilter = new StopFilter(Version.LUCENE_46, kStemFilter, stopwords);

    TokenStream ts;
    if (N > 1) {
        PositionFilter positionFilter = new PositionFilter(stopFilter);
        //@SuppressWarnings("resource")
        //ShingleFilter shingleFilter = new ShingleFilter(positionFilter, N, N);
        //shingleFilter.setOutputUnigrams(false);
        @SuppressWarnings("resource")
        ShingleFilter shingleFilter = new ShingleFilter(positionFilter, 2, N);
        shingleFilter.setOutputUnigrams(true);
        ts = shingleFilter;
    } else {
        ts = stopFilter;
    }

    CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String token = charTermAtt.toString();
        if (token.length() > 1)
            tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
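// A small, hypothetical driver for getNgrams above (same Lucene 4.6-era API as the method itself).
// The exact tokens returned depend on the KStem stemmer and the default English stopword set,
// so any concrete output is only indicative.
public static void main(String[] args) throws IOException {
    // unigrams only
    System.out.println(getNgrams("The quick brown foxes were jumping", 1));
    // unigrams plus shingles of length 2..3
    System.out.println(getNgrams("The quick brown foxes were jumping", 3));
}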
@Override
protected TokenStreamComponents createComponents(String s) {
    Tokenizer source = new LowerCaseTokenizer();
    return new TokenStreamComponents(source, new PorterStemFilter(source));
}
@Override
public LowerCaseTokenizer create(Reader input) {
    return new LowerCaseTokenizer(luceneMatchVersion, input);
}
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
    return new TokenStreamComponents(source, new ASCIIFoldingFilter(source));
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new LowerCaseTokenizer(LUCENE_VERSION, reader);
    return new TokenStreamComponents(source, new PorterStemFilter(source));
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new LowerCaseTokenizer(reader);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    StopFilter stopFilter = new StopFilter(true, new LowerCaseTokenizer(reader), stopWords);
    stopFilter.setEnablePositionIncrements(true);
    return new PorterStemFilter(stopFilter);
}