/**
 * Tokenizes a piece of text and records each term and its position in urlInfo.
 * @param text the text to tokenize
 */
private void segment(String text) {
    IKAnalyzer analyzer = new IKAnalyzer(true);
    StringReader reader = new StringReader(text);
    TokenStream tokenStream = analyzer.tokenStream("*", reader);
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    try {
        while (tokenStream.incrementToken()) {
            location++;
            String term = termAtt.term();
            urlInfo.putURLLocation(term, location);
        }
    } catch (IOException exp) {
        exp.printStackTrace();
    }
}
protected Set<String> getHighlightWords(final String searchString) {
    final Set<String> words = new HashSet<String>();
    try {
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
        final TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            final String term = termAtt.term();
            if (log.isDebugEnabled()) {
                log.debug(term);
            }
            words.add(term);
        }
    } catch (final IOException e) {
        log.error("", e);
    }
    return words;
}
public List<String> removeStopwordsAndSpecialChars(String value) {
    List<String> retVal = new ArrayList<String>();
    value = value.replaceAll(replaceExpr, "");
    StringReader sr = new StringReader(value);
    // The first argument to tokenStream() is the field name, not the text to analyze.
    TokenStream ts = analyzer.tokenStream("contents", sr);
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            retVal.add(termAtt.term());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return retVal;
}
private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            s.add(termAtt.term());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    map.put(e, s);
}
public void testJumps() throws Exception {
    TokenStream stream = synonymAnalyzer.tokenStream("contents",               // #A
                                                     new StringReader("jumps")); // #A
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int i = 0;
    String[] expected = new String[]{"jumps",   // #B
                                     "hops",    // #B
                                     "leaps"};  // #B
    while (stream.incrementToken()) {
        assertEquals(expected[i], term.term());
        int expectedPos;                        // #C
        if (i == 0) {                           // #C
            expectedPos = 1;                    // #C
        } else {                                // #C
            expectedPos = 0;                    // #C
        }                                       // #C
        assertEquals(expectedPos,               // #C
                     posIncr.getPositionIncrement()); // #C
        i++;
    }
    assertEquals(3, i);
}
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(
            new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
/**
 * Builds a bag-of-words representation of the given text, optionally stemming each term.
 * @param text the text to tokenize
 * @return a map from each term to its frequency in the text
 * @throws IOException if tokenization fails
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING,
                    "No stemmer for language {0}", language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    TermAttribute token = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        String term = token.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        Float c = bag.get(term);
        bag.put(term, c == null ? 1f : c + 1f);
    }
    return bag;
}
/**
 * Takes a gloss-like string and returns it tokenized, with:
 * - stopwords removed
 * - lower-casing
 * - Porter stemming
 */
protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter(true,
                    new LowerCaseTokenizer(new StringReader(s)),
                    stopWords, true));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        result.add(termAtt.term());
    }
    return result;
}
/**
 * Adds all words contained in toAnalyse to the words collection. Words are stemmed.
 * @param toAnalyse the string to be analysed
 * @param words the collection to which extracted words are added
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        while (tokenS.incrementToken()) {
            words.add(termAtt.term());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
public SynonymFilter(TokenStream in, SynonymEngine engine) {
    super(in);
    synonymStack = new Stack<String>(); // #1
    this.engine = engine;
    this.termAtt = addAttribute(TermAttribute.class);
    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
}
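// A minimal sketch of the incrementToken() that pairs with the constructor
// above, assuming a `private State current` field and a hypothetical helper
// addAliasesToStack() that pushes engine.getSynonyms(termAtt.term()) onto
// synonymStack; names are illustrative, not the canonical implementation.
public boolean incrementToken() throws IOException {
    if (synonymStack.size() > 0) {
        // Emit a buffered synonym at the same position as the original token.
        String syn = synonymStack.pop();
        restoreState(current);
        termAtt.setTermBuffer(syn);
        posIncrAtt.setPositionIncrement(0); // stack synonym on the same position
        return true;
    }
    if (!input.incrementToken()) {
        return false; // upstream stream is exhausted
    }
    addAliasesToStack(); // hypothetical helper: buffer synonyms for this term
    // Capture state so buffered synonyms can restore offsets/type later.
    current = captureState();
    return true;
}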
private static void analyze(String string, Analyzer analyzer) throws IOException {
    StringBuffer buffer = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) { //C
        buffer.append("[");
        buffer.append(term.term());
        buffer.append("] ");
    }
    String output = buffer.toString();

    Frame f = new Frame();
    f.setTitle(analyzer.getClass().getSimpleName() + " : " + string);
    f.setResizable(true);

    Font font = new Font(null, Font.PLAIN, 36);
    int width = getWidth(f.getFontMetrics(font), output);
    f.setSize((width < 250) ? 250 : width + 50, 75);

    // NOTE: if Label doesn't render the Chinese characters
    // properly, try using javax.swing.JLabel instead
    Label label = new Label(output); //D
    label.setSize(width, 75);
    label.setAlignment(Label.CENTER);
    label.setFont(font);

    f.add(label);
    f.setVisible(true);
}
protected MySynonymFilter(TokenStream input) {
    super(input);
    termAtt = addAttribute(TermAttribute.class);
    synonymMap.put("lucene", "information retrieval");
    synonymMap.put("c#", "csharp");
}
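// A minimal sketch of an incrementToken() for MySynonymFilter, assuming the
// filter simply replaces a matching term with its mapped synonym; since the
// constructor only registers a TermAttribute (no PositionIncrementAttribute),
// in-place replacement rather than token injection is the plausible behavior.
public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    String replacement = synonymMap.get(termAtt.term());
    if (replacement != null) {
        termAtt.setTermBuffer(replacement); // overwrite the term in place
    }
    return true;
}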
public static void testStandardAnalyzer() throws Exception {
    System.out.println("Standard Analyzer");
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream ts = analyzer.tokenStream("Field",
            new StringReader("The quick brown fox jumps over lazy dog"));
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println("token: " + termAtt.term());
    }
    ts.close();
}
public static void testSynonymAnalyzer() throws Exception {
    Analyzer analyzer = new SynonymAnalyzer();
    TokenStream ts = analyzer.tokenStream("Address",
            new StringReader("Expertise in C# and Lucene"));
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println("token: " + termAtt.term());
    }
    ts.close();
}
BulletinPayloadsFilter(TokenStream in, float warningBoost) {
    super(in);
    payloadAttr = addAttribute(PayloadAttribute.class);
    termAtt = addAttribute(TermAttribute.class);
    boostPayload = new Payload(PayloadHelper.encodeFloat(warningBoost));
}
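// A minimal sketch of the incrementToken() that pairs with this constructor,
// assuming the filter attaches boostPayload to a specific trigger term;
// "warning" is an illustrative guess at the trigger, not confirmed by the source.
public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    if ("warning".equals(termAtt.term())) {
        payloadAttr.setPayload(boostPayload); // decoded at search time via PayloadHelper.decodeFloat
    } else {
        payloadAttr.setPayload(null);         // clear any stale payload on the reused attribute
    }
    return true;
}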
public PositionalStopFilter(TokenStream in, CharArraySet stopWords) {
    super(in);
    this.stopWords = stopWords;
    posIncrAttr = addAttribute(PositionIncrementAttribute.class);
    termAttr = addAttribute(TermAttribute.class);
}
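// A minimal sketch of the incrementToken() this constructor implies: drop
// stopwords but carry their position increments forward, so phrase and span
// queries still see the gaps left by removed words.
public boolean incrementToken() throws IOException {
    int skipped = 0;
    while (input.incrementToken()) {
        if (!stopWords.contains(termAttr.term())) {
            // Fold the increments of any skipped stopwords into this token.
            posIncrAttr.setPositionIncrement(posIncrAttr.getPositionIncrement() + skipped);
            return true;
        }
        skipped += posIncrAttr.getPositionIncrement();
    }
    return false;
}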