Java class org.apache.lucene.analysis.core.StopFilter: usage examples from open-source projects
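
StopFilter removes stop words from a TokenStream, and the static helper StopFilter.makeStopSet builds the CharArraySet that most of the snippets below pass to it. For orientation, here is a minimal, self-contained sketch, assuming the Lucene 5.x/6.x-style API (no Version argument; the tokenizer is fed through setReader). Several snippets below instead use the older 4.x API that threads a Version through every call.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StopFilterDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the wizard of oz"));
    // Wrap the tokenizer so that "the" and "of" are dropped from the stream.
    TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the", "of"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                        // required before the first incrementToken()
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // prints "wizard", then "oz"
    }
    stream.end();
    stream.close();
  }
}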

Project: lams    File: AbstractAnalysisFactory.java
/**
 * Returns a {@link CharArraySet} from wordFiles, which
 * can be a comma-separated list of filenames
 */
protected final CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  assureMatchVersion();
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(luceneMatchVersion,
        files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
          ignoreCase));
    }
  }
  return words;
}
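
The factory method above sizes the set at files.size() * 10 entries and merges each word file in via StopFilter.makeStopSet, which honors ignoreCase. The same calls outside the factory plumbing look like this (a fragment in the 4.x style of the snippet above; the version constant and word list are placeholders):

CharArraySet words = new CharArraySet(Version.LUCENE_47, 20, true);  // ignoreCase = true
words.addAll(StopFilter.makeStopSet(Version.LUCENE_47, Arrays.asList("The", "OF"), true));
// With ignoreCase = true, words.contains("the") and words.contains("of") both hold.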
Project: lams    File: PatternAnalyzer.java
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 * 
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param reader
 *            the reader (e.g. a CharFilter) over the original text; can be null.
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
  // Ideally the Analyzer superclass should have a method with the same signature, 
  // with a default impl that simply delegates to the StringReader flavour. 
  if (reader == null) 
    reader = new FastStringReader(text);

  if (pattern == NON_WORD_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
  } else if (pattern == WHITESPACE_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
  }

  Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
  TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
  return new TokenStreamComponents(tokenizer, result);
}
Project: search    File: TestFreeTextSuggester.java
public void testEndingHole() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of oz", 50)
  );
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("wizard _ oz/1.00",
               toString(sug.lookup("wizard of", 10)));

  // Falls back to unigram model, with backoff 0.4 times
  // prob 0.5:
  assertEquals("oz/0.20",
               toString(sug.lookup("wizard o", 10)));
}
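
Two details of the assertions above: the "_" in "wizard _ oz" marks the hole left by the deleted stop word, and the 0.20 score is the backoff arithmetic spelled out in the comment, 0.4 × 0.5 = 0.20, because the suggester falls back to the unigram model when the context token was a stop word.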
Project: search    File: TestFreeTextSuggester.java
public void testTwoEndingHoles() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of of oz", 50)
  );
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("",
               toString(sug.lookup("wizard of of", 10)));
}
Project: search    File: TestSuggestStopFilter.java
public void testEndIsStopWord() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet("to");
  TokenStream stream = new MockTokenizer(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}
Project: search    File: TestSuggestStopFilter.java
public void testMidStopWord() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet("to");
  TokenStream stream = new MockTokenizer(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}
Project: search    File: TestSuggestStopFilter.java
public void testMultipleStopWords() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go", "school" },
                            new int[] {0, 12},
                            new int[] {2, 18},
                            null,
                            new int[] {1, 4},
                            null,
                            18,
                            new boolean[] {false, false},
                            true);
}
Project: search    File: TestSuggestStopFilter.java
public void testMultipleStopWordsEnd() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go", "the"},
                            new int[] {0, 8},
                            new int[] {2, 11},
                            null,
                            new int[] {1, 3},
                            null,
                            11,
                            new boolean[] {false, true},
                            true);
}
Project: search    File: TestSuggestStopFilter.java
public void testMultipleStopWordsEnd2() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            12,
                            new boolean[] {false},
                            true);
}
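
Taken together, these tests pin down how SuggestStopFilter differs from a plain StopFilter: interior stop words are always removed, but a stop word at the very end of the input is kept, with its keyword flag set (the trailing true in the boolean[] arrays), when no separator follows it, since the user may still be typing a longer word that merely begins with the stop word. A compact summary of the two boundary cases (an illustrative fragment, not project code):

CharArraySet stops = StopFilter.makeStopSet("to", "the", "a");
// "go to a the"   -> tokens {"go", "the"}: the trailing "the" survives, keyword flag = true
// "go to a the "  -> tokens {"go"}:        the trailing space completes "the", so it is dropped

testEndNotStopWord further down shows the same effect on "go to".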
Project: search    File: AbstractAnalysisFactory.java
/**
 * Returns a {@link CharArraySet} from wordFiles, which
 * can be a comma-separated list of filenames
 */
protected final CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  assureMatchVersion();
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(luceneMatchVersion,
        files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
          ignoreCase));
    }
  }
  return words;
}
Project: search    File: PatternAnalyzer.java
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 * 
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param reader
 *            the reader (e.g. a CharFilter) over the original text; can be null.
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
  // Ideally the Analyzer superclass should have a method with the same signature, 
  // with a default impl that simply delegates to the StringReader flavour. 
  if (reader == null) 
    reader = new FastStringReader(text);

  if (pattern == NON_WORD_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
  } else if (pattern == WHITESPACE_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
  }

  Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
  TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
  return new TokenStreamComponents(tokenizer, result);
}
Project: auto-phrase-tokenfilter    File: AutoPhrasingQParserPlugin.java
private CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
    }
  }
  return words;
}
Project: NYBC    File: AbstractAnalysisFactory.java
/**
 * Returns a {@link CharArraySet} from wordFiles, which
 * can be a comma-separated list of filenames
 */
protected CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  assureMatchVersion();
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(luceneMatchVersion,
        files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
          ignoreCase));
    }
  }
  return words;
}
Project: NYBC    File: PatternAnalyzer.java
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 * 
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param reader
 *            the reader (e.g. a CharFilter) over the original text; can be null.
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
  // Ideally the Analyzer superclass should have a method with the same signature, 
  // with a default impl that simply delegates to the StringReader flavour. 
  if (reader == null) 
    reader = new FastStringReader(text);

  if (pattern == NON_WORD_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
  } else if (pattern == WHITESPACE_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
  }

  Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
  TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
  return new TokenStreamComponents(tokenizer, result);
}
Project: read-open-source-code    File: AbstractAnalysisFactory.java
/**
 * Returns a {@link CharArraySet} from wordFiles, which
 * can be a comma-separated list of filenames
 */
protected final CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  assureMatchVersion();
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(luceneMatchVersion,
        files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
          ignoreCase));
    }
  }
  return words;
}
Project: read-open-source-code    File: PatternAnalyzer.java
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 * 
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param reader
 *            reader (e.g. charfilter) of the original text. can be null.
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
  // Ideally the Analyzer superclass should have a method with the same signature, 
  // with a default impl that simply delegates to the StringReader flavour. 
  if (reader == null) 
    reader = new FastStringReader(text);

  if (pattern == NON_WORD_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
  } else if (pattern == WHITESPACE_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
  }

  Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
  TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
  return new TokenStreamComponents(tokenizer, result);
}
Project: information-retrieval-adventure    File: SkippingNumbersPreservingChemicals.java
public static void main(String[] args) throws IOException {

    String theSentence =
        "this is the scientific article about chemicals like H20 C2H50H with concentration "
            + "of 3.99 kilograms and 0,123 micrograms also i have some CO2 gas n=3 x=45";
    StringReader reader = new StringReader(theSentence);
    Tokenizer whitespaceTokenizer = new WhitespaceTokenizer(reader);
    TokenStream tokenStream =
        new StopFilter(whitespaceTokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    tokenStream = new ScientificFiltering(tokenStream);

    final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
      System.out.println(charTermAttribute.toString());
    }

    tokenStream.end();
    tokenStream.close();
  }
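
This snippet is written against the Lucene 4.x API, where a tokenizer takes its Reader in the constructor; from Lucene 5.0 the Reader is supplied through setReader instead, so the equivalent setup would be (ScientificFiltering is the project's own filter and is unchanged):

Tokenizer whitespaceTokenizer = new WhitespaceTokenizer();
whitespaceTokenizer.setReader(reader);
TokenStream tokenStream = new StopFilter(whitespaceTokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET);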
Project: Maskana-Gestor-de-Conocimiento    File: TestFreeTextSuggester.java
public void testEndingHole() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
        return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = shuffle(
      new Input("wizard of oz", 50)
  );
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("wizard _ oz/1.00",
               toString(sug.lookup("wizard of", 10)));

  // Falls back to unigram model, with backoff 0.4 times
  // prob 0.5:
  assertEquals("oz/0.20",
               toString(sug.lookup("wizard o", 10)));
}
Project: Maskana-Gestor-de-Conocimiento    File: TestFreeTextSuggester.java
public void testTwoEndingHoles() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
        return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = shuffle(
      new Input("wizard of of oz", 50)
  );
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("",
               toString(sug.lookup("wizard of of", 10)));
}
Project: Maskana-Gestor-de-Conocimiento    File: TestSuggestStopFilter.java
public void testEndIsStopWord() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
  TokenStream stream = new MockTokenizer(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}
Project: Maskana-Gestor-de-Conocimiento    File: TestSuggestStopFilter.java
public void testMidStopWord() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
  TokenStream stream = new MockTokenizer(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}
Project: Maskana-Gestor-de-Conocimiento    File: TestSuggestStopFilter.java
public void testMultipleStopWords() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go", "school" },
                            new int[] {0, 12},
                            new int[] {2, 18},
                            null,
                            new int[] {1, 4},
                            null,
                            18,
                            new boolean[] {false, false},
                            true);
}
Project: Maskana-Gestor-de-Conocimiento    File: TestSuggestStopFilter.java
public void testMultipleStopWordsEnd() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go", "the"},
                            new int[] {0, 8},
                            new int[] {2, 11},
                            null,
                            new int[] {1, 3},
                            null,
                            11,
                            new boolean[] {false, true},
                            true);
}
Project: Maskana-Gestor-de-Conocimiento    File: TestSuggestStopFilter.java
public void testMultipleStopWordsEnd2() throws Exception {

  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            12,
                            new boolean[] {false},
                            true);
}
Project: Maskana-Gestor-de-Conocimiento    File: AbstractAnalysisFactory.java
/**
 * Returns a {@link CharArraySet} from wordFiles, which
 * can be a comma-separated list of filenames
 */
protected final CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  assureMatchVersion();
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(luceneMatchVersion,
        files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
          ignoreCase));
    }
  }
  return words;
}
Project: Maskana-Gestor-de-Conocimiento    File: PatternAnalyzer.java
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 * 
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param reader
 *            the reader (e.g. a CharFilter) over the original text; can be null.
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
  // Ideally the Analyzer superclass should have a method with the same signature, 
  // with a default impl that simply delegates to the StringReader flavour. 
  if (reader == null) 
    reader = new FastStringReader(text);

  if (pattern == NON_WORD_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
  } else if (pattern == WHITESPACE_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
  }

  Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
  TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
  return new TokenStreamComponents(tokenizer, result);
}
Project: lams    File: QueryAutoStopWordAnalyzer.java
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  Set<String> stopWords = stopWordsPerField.get(fieldName);
  if (stopWords == null) {
    return components;
  }
  StopFilter stopFilter = new StopFilter(components.getTokenStream(), 
      new CharArraySet(stopWords, false));
  return new TokenStreamComponents(components.getTokenizer(), stopFilter);
}
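
wrapComponents only rewrites the chain for fields that have an entry in stopWordsPerField; that map is filled when the analyzer is constructed against an index reader. A hedged construction sketch (5.x-style signature; the indexReader variable and the 0.4 threshold are placeholders):

Analyzer delegate = new StandardAnalyzer();
// Treat any term occurring in more than 40% of the index's documents as a stop word for queries.
QueryAutoStopWordAnalyzer analyzer = new QueryAutoStopWordAnalyzer(delegate, indexReader, 0.4f);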
Project: Elasticsearch    File: StopTokenFilterFactory.java
@Override
public TokenStream create(TokenStream tokenStream) {
    if (removeTrailing) {
        if (version.onOrAfter(Version.LUCENE_4_4)) {
            return new StopFilter(tokenStream, stopWords);
        } else {
            return new Lucene43StopFilter(enablePositionIncrements, tokenStream, stopWords);
        }
    } else {
        return new SuggestStopFilter(tokenStream, stopWords);
    }
}
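
Three behaviors are multiplexed here: indices created on Lucene 4.4 or later get the plain two-argument StopFilter; older indices get Lucene43StopFilter, which still honors the legacy enable_position_increments setting; and when the filter is configured with remove_trailing set to false, Elasticsearch substitutes SuggestStopFilter, so a stop word still being typed at the end of a suggest query survives (the behavior exercised by the TestSuggestStopFilter snippets above).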
Project: Indra    File: IndraAnalyzer.java
private TokenStream getStopFilter(String lang, Set<String> metadataStopWords, TokenStream stream) {
  if (metadataStopWords != null && !metadataStopWords.isEmpty()) {
    return new StopFilter(stream, new CharArraySet(metadataStopWords, false));
  } else {
    try {
      InputStream in = ClassLoader.getSystemResourceAsStream(lang.toLowerCase() + ".stopwords");
      if (in != null) {
        logger.debug("Loading Stop words for lang={}", lang);
        CharArraySet stopWords = new CharArraySet(30, true);
        try (BufferedReader bin = new BufferedReader(new InputStreamReader(in))) {
          String line;
          String[] parts;
          while ((line = bin.readLine()) != null) {
            parts = line.split(Pattern.quote("|"));
            line = parts[0].trim();
            if (line.length() > 0) {
              stopWords.add(line);
            }
          }
          return new StopFilter(stream, stopWords);
        }
      } else {
        logger.warn("No stop words found for lang={}", lang);
      }
    } catch (Exception e) {
      logger.error("Error creating stop filter for lang={}", lang, e);
    }
  }
  return stream;
}
Project: GeoCrawler    File: LuceneAnalyzerUtil.java
/**
 * Creates an analyzer instance based on user-provided stop words. If {@code addToDefault}
 * is true, the user-provided stop words are added to the Lucene default stop set.
 */
public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) {
  LuceneAnalyzerUtil.stemFilterType = stemFilterType;
  if(addToDefault) {
    stopSet.addAll(stopWords);
  }
  else {
    stopSet = StopFilter.makeStopSet(stopWords);
  }
}
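
Two quirks are worth noting in this constructor: makeStopSet(stopWords) is the Version-free 5.x+ variant of the API, and stemFilterType is written through the class name, i.e. it is a static field, so the stem filter chosen by the most recently constructed instance silently applies to all instances.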
Project: AdSearch_Endpoints    File: QueryParserImpl.java
@Override
public List<String> parseQuery(String queryStr) {
  // tokenize queryStr, remove stop word, stemming
  List<String> tokens = new ArrayList<String>();
  AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
  Tokenizer tokenizer = new StandardTokenizer(factory);
  tokenizer.setReader(new StringReader(queryStr));
  CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
  TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
  CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
  try {
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String term = charTermAttribute.toString();
      tokens.add(term);
    }
    tokenStream.end();
    tokenStream.close();
    tokenizer.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return tokens;
}
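
The comment at the top of parseQuery promises stemming, but the chain only tokenizes and removes stop words. If stemming is actually wanted, a Porter stemmer could be appended after the stop filter; this is a hedged addition, not part of the project's code:

TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
tokenStream = new PorterStemFilter(tokenStream);  // org.apache.lucene.analysis.en.PorterStemFilter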
Project: news-credibility    File: EgdeMain.java
public static void main(String[] args) throws IOException {
    System.out.println(NumberUtils.isDigits("12345"));
    System.out.println(NumberUtils.isDigits("12345.1"));
    System.out.println(NumberUtils.isDigits("12345,2"));

    System.out.println(NumberUtils.isNumber("12345"));
    System.out.println(NumberUtils.isNumber("12345.1"));
    System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
    System.out.println(NumberUtils.isNumber("12345,2"));
    StringReader input = new StringReader(
            "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
                    .replaceAll("br2n", ""));

    LetterTokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(input);

    TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
    TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);

    try (TokenFilter filter = ngrams) {

        CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            String word = termAtt.toString().replaceAll(",", "\\.").replaceAll("\n|\r", "");
            System.out.println(word);
        }
    }
}
Project: search    File: TestSuggestStopFilter.java
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  TokenStream stream = new MockTokenizer(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},
                            new int[] {0, 3},
                            new int[] {2, 5},
                            null,
                            new int[] {1, 1},
                            null,
                            5,
                            new boolean[] {false, true},
                            true);
}
Project: search    File: QueryAutoStopWordAnalyzer.java
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  Set<String> stopWords = stopWordsPerField.get(fieldName);
  if (stopWords == null) {
    return components;
  }
  StopFilter stopFilter = new StopFilter(components.getTokenStream(), 
      new CharArraySet(stopWords, false));
  return new TokenStreamComponents(components.getTokenizer(), stopFilter);
}
Project: search    File: ManagedStopFilterFactory.java
/**
 * Returns a StopFilter based on our managed stop word set.
 */
@Override
public TokenStream create(TokenStream input) {    
  if (stopWords == null) {
    throw new IllegalStateException("Managed stopwords not initialized correctly!");
  }
  return new StopFilter(input, stopWords);
}
Project: lucenelab    File: AnnotatorTokenFilterTest.java
@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
            TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) {
        stubAnnotator(TWO);
        tok.setReader(new StringReader(ONE_TWO));
        assertTokenInfos(f, new TokenInfo(TWO, 1));
    }
}
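
ONE, TWO, ONE_TWO, annotator, stubAnnotator and assertTokenInfos are the lucenelab project's own test fixtures. A self-contained variant of the same skip-over check, using only Lucene classes (5.x/6.x API assumed), makes visible the position increment of 2 that StopFilter leaves behind:

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class StopFilterSkipDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("one two"));
    TokenStream stop = new StopFilter(tok, new CharArraySet(Arrays.asList("one"), false));
    CharTermAttribute term = stop.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = stop.addAttribute(PositionIncrementAttribute.class);
    stop.reset();
    while (stop.incrementToken()) {
      // Prints "two posInc=2"; the increment of 2 records the hole left by "one".
      System.out.println(term + " posInc=" + posInc.getPositionIncrement());
    }
    stop.end();
    stop.close();
  }
}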