/**
 * Gets the tokenizer currently configured for splitting tweets into tokens.
 *
 * @return the value of the {@code m_tokenizer} field
 */
@OptionMetadata(
    displayName = "tokenizer",
    description = "The tokenizing algorithm to use on the tweets. Uses the CMU TweetNLP tokenizer as default",
    commandLineParamName = "tokenizer",
    commandLineParamSynopsis = "-tokenizer <string>",
    displayOrder = 3)
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
/**
 * Gets the tokenizer currently configured for splitting tweets into tokens.
 *
 * @return the value of the {@code m_tokenizer} field
 */
@OptionMetadata(
    displayName = "tokenizer",
    description = "The tokenizing algorithm to use on the tweets. Uses the CMU TweetNLP tokenizer as default",
    commandLineParamName = "tokenizer",
    commandLineParamSynopsis = "-tokenizer <string>",
    displayOrder = 1)
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
/** * Tokenizes a String * @param content the content * @param toLowerCase true for lowercasing the content * @param standarizeUrlsUsers true for standarizing urls and users * @param reduceRepeatedLetters true for reduing repeated letters * @param tokenizer the tokenizer * @param stemmer the stemmer * @param stop the stopwords handler * @return a list of tokens */ static public List<String> tokenize(String content, boolean toLowerCase, boolean standarizeUrlsUsers, boolean reduceRepeatedLetters, Tokenizer tokenizer, Stemmer stemmer, StopwordsHandler stop) { if (toLowerCase) content = content.toLowerCase(); // if a letters appears two or more times it is replaced by only two // occurrences of it if (reduceRepeatedLetters) content = content.replaceAll("([a-z])\\1+", "$1$1"); List<String> tokens = new ArrayList<String>(); tokenizer.tokenize(content); for(;tokenizer.hasMoreElements();){ String token=tokenizer.nextElement(); if(!stop.isStopword(token)){ if (standarizeUrlsUsers) { // Replace URLs to a generic URL if (token.matches("http.*|ww\\..*|www\\..*")) { token="http://www.url.com"; } // Replaces user mentions to a generic user else if (token.matches("@.*")) { token="@user"; } } tokens.add(stemmer.stem(token)); } } return tokens; }
/**
 * Stores the given tokenizer in the {@code m_tokenizer} field.
 *
 * @param m_tokenizer the tokenizer to use; note that the parameter shadows the
 *          field of the same name, hence the explicit {@code this.} below
 */
public void setTokenizer(final Tokenizer m_tokenizer) {
  this.m_tokenizer = m_tokenizer;
}
/**
 * Sets the tokenizer algorithm to use.
 *
 * @param value the tokenizing algorithm to store in {@code m_tokenizer}
 */
public void setTokenizer(final Tokenizer value) {
  this.m_tokenizer = value;
}
/**
 * Gets the tokenizer algorithm currently in use.
 *
 * @return the value of the {@code m_tokenizer} field
 */
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
/**
 * Sets the tokenizer algorithm to use.
 *
 * @param value the tokenizing algorithm to store in {@code m_Tokenizer}
 */
public void setTokenizer(final Tokenizer value) {
  this.m_Tokenizer = value;
}
/**
 * Gets the tokenizer algorithm currently in use.
 *
 * @return the value of the {@code m_Tokenizer} field
 */
public Tokenizer getTokenizer() {
  return this.m_Tokenizer;
}