@OptionMetadata(displayName = "stopwordsHandler", description = "The stopwords handler to use (Null means no stopwords are used).", commandLineParamName = "stopwords-handler", commandLineParamSynopsis = "-stopwords-handler <string>", displayOrder = 5) public StopwordsHandler getStopwordsHandler() { return m_stopwordsHandler; }
/**
 * Tokenizes a String.
 *
 * @param content the content
 * @param toLowerCase true for lowercasing the content
 * @param standarizeUrlsUsers true for standardizing URLs and user mentions
 * @param reduceRepeatedLetters true for reducing repeated letters
 * @param tokenizer the tokenizer
 * @param stemmer the stemmer
 * @param stop the stopwords handler
 * @return a list of tokens
 */
static public List<String> tokenize(String content, boolean toLowerCase,
  boolean standarizeUrlsUsers, boolean reduceRepeatedLetters,
  Tokenizer tokenizer, Stemmer stemmer, StopwordsHandler stop) {

  if (toLowerCase)
    content = content.toLowerCase();

  // if a letter appears two or more times in a row it is replaced by only
  // two occurrences of it
  if (reduceRepeatedLetters)
    content = content.replaceAll("([a-z])\\1+", "$1$1");

  List<String> tokens = new ArrayList<String>();

  tokenizer.tokenize(content);
  for (; tokenizer.hasMoreElements();) {
    String token = tokenizer.nextElement();
    if (!stop.isStopword(token)) {
      if (standarizeUrlsUsers) {
        // Replace URLs with a generic URL
        if (token.matches("http.*|ww\\..*|www\\..*")) {
          token = "http://www.url.com";
        }
        // Replace user mentions with a generic user
        else if (token.matches("@.*")) {
          token = "@user";
        }
      }
      tokens.add(stemmer.stem(token));
    }
  }
  return tokens;
}
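A hedged usage sketch for the helper above. TextUtils is a stand-in for whatever class actually hosts the static method; WordTokenizer, NullStemmer, and the Null stopwords scheme are stock Weka classes.

import java.util.List;
import weka.core.stemmers.NullStemmer;
import weka.core.stopwords.Null;
import weka.core.tokenizers.WordTokenizer;

public class TokenizeSketch {
  public static void main(String[] args) throws Exception {
    // TextUtils is hypothetical: replace with the class that defines tokenize(...).
    List<String> tokens = TextUtils.tokenize(
        "Loooove this @someone",
        true,                 // toLowerCase
        true,                 // standarizeUrlsUsers
        true,                 // reduceRepeatedLetters
        new WordTokenizer(),  // default delimiter-based tokenizer
        new NullStemmer(),    // leave tokens unstemmed
        new Null());          // weka.core.stopwords.Null: nothing is a stopword
    // The repeated 'o's are squeezed to two and the mention is genericized,
    // so this prints: [loove, this, @user]
    System.out.println(tokens);
  }
}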
/**
 * Sets the stopwords handler.
 *
 * @param m_stopwordsHandler the stopwords handler to use
 */
public void setStopwordsHandler(StopwordsHandler m_stopwordsHandler) {
  this.m_stopwordsHandler = m_stopwordsHandler;
}
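A small self-contained sketch of such a setter/getter pair in use. Owner is a made-up stand-in for the enclosing class; the lambda compiles because StopwordsHandler declares a single isStopword(String) method.

import weka.core.stopwords.StopwordsHandler;

public class CustomHandlerSketch {
  // Minimal stand-in for the enclosing class, not part of the original source.
  static class Owner {
    private StopwordsHandler m_stopwordsHandler;
    public void setStopwordsHandler(StopwordsHandler m_stopwordsHandler) {
      this.m_stopwordsHandler = m_stopwordsHandler;
    }
    public StopwordsHandler getStopwordsHandler() {
      return m_stopwordsHandler;
    }
  }

  public static void main(String[] args) {
    Owner owner = new Owner();
    // Any single-method implementation works; here, short tokens are stopwords.
    owner.setStopwordsHandler(word -> word.length() <= 2);
    System.out.println(owner.getStopwordsHandler().isStopword("an")); // true
  }
}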
/**
 * Gets the stopwords handler.
 *
 * @return the stopwords handler
 */
public StopwordsHandler getStopwordsHandler() {
  return m_StopwordsHandler;
}
@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData,
    final int maxResult) {
  final String html = documentContentData.getContent();

  final Attribute input = new Attribute(HTML, (ArrayList<String>) null);
  final ArrayList<Attribute> inputVec = new ArrayList<>();
  inputVec.add(input);

  final Instances htmlInst = new Instances(HTML, inputVec, 1);
  htmlInst.add(new DenseInstance(1));
  htmlInst.instance(0).setValue(0, html);

  // Treat every token shorter than five characters as a stopword.
  final StopwordsHandler stopwordsHandler = new StopwordsHandler() {
    @Override
    public boolean isStopword(final String word) {
      return word.length() < 5;
    }
  };

  final NGramTokenizer tokenizer = new NGramTokenizer();
  tokenizer.setNGramMinSize(1);
  tokenizer.setNGramMaxSize(1);
  tokenizer.setDelimiters(TOKEN_DELIMITERS);

  final StringToWordVector filter = new StringToWordVector();
  filter.setTokenizer(tokenizer);
  filter.setStopwordsHandler(stopwordsHandler);
  filter.setLowerCaseTokens(true);
  filter.setOutputWordCounts(true);
  filter.setWordsToKeep(maxResult);

  final Map<String, Integer> result = new HashMap<>();
  try {
    filter.setInputFormat(htmlInst);
    final Instances dataFiltered = Filter.useFilter(htmlInst, filter);
    final Instance last = dataFiltered.lastInstance();
    final int numAttributes = last.numAttributes();
    for (int i = 0; i < numAttributes; i++) {
      result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
    }
  } catch (final Exception e) {
    LOGGER.warn("Problem calculating wordcount for: {}, exception: {}",
        documentContentData.getId(), e);
  }
  return result;
}
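The same word-count pipeline can be exercised without the surrounding service. Below is a hedged standalone sketch using only stock Weka classes; the class name WordCountSketch and the sample text are invented, and NGramTokenizer's default delimiters replace the TOKEN_DELIMITERS constant, which is not shown in the original.

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.tokenizers.NGramTokenizer;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;

public class WordCountSketch {
  public static void main(String[] args) throws Exception {
    // One-instance dataset holding a single string attribute.
    ArrayList<Attribute> attrs = new ArrayList<>();
    attrs.add(new Attribute("text", (ArrayList<String>) null));
    Instances data = new Instances("text", attrs, 1);
    data.add(new DenseInstance(1));
    data.instance(0).setValue(0, "the quick brown fox jumps over the lazy dog");

    // Unigrams only, mirroring the method above.
    NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);

    StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    // Same inline-handler pattern as above: tokens under five characters
    // are treated as stopwords and dropped.
    filter.setStopwordsHandler(word -> word.length() < 5);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setInputFormat(data);

    Instances counts = Filter.useFilter(data, filter);
    Instance last = counts.lastInstance();
    for (int i = 0; i < last.numAttributes(); i++) {
      // Prints quick, brown, and jumps with a count of 1 each.
      System.out.println(last.attribute(i).name() + " = " + (int) last.value(i));
    }
  }
}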