public static StringToWordVector getWordFilter(Instances data, ClassifierStructure struct) { StringToWordVector filter = new StringToWordVector(); try { filter.setWordsToKeep(20000000); WordTokenizer token = new WordTokenizer(); filter.setTokenizer(token); filter.setInputFormat(data); } catch (Exception e1) { e1.printStackTrace(); } return filter; }
/** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration<Option> listOptions() { Vector<Option> newVector = new Vector<Option>(); newVector.add(new Option("\tUse word frequencies instead of " + "binary bag of words.", "W", 0, "-W")); newVector.add(new Option("\tHow often to prune the dictionary " + "of low frequency words (default = 0, i.e. don't prune)", "P", 1, "-P <# instances>")); newVector.add(new Option("\tMinimum word frequency. Words with less " + "than this frequence are ignored.\n\tIf periodic pruning " + "is turned on then this is also used to determine which\n\t" + "words to remove from the dictionary (default = 3).", "M", 1, "-M <double>")); newVector.addElement(new Option( "\tNormalize document length (use in conjunction with -norm and " + "-lnorm)", "normalize", 0, "-normalize")); newVector.addElement(new Option( "\tSpecify the norm that each instance must have (default 1.0)", "norm", 1, "-norm <num>")); newVector.addElement( new Option("\tSpecify L-norm to use (default 2.0)", "lnorm", 1, "-lnorm <num>")); newVector.addElement(new Option("\tConvert all tokens to lowercase " + "before adding to the dictionary.", "lowercase", 0, "-lowercase")); newVector.addElement( new Option("\tThe stopwords handler to use (default Null).", "-stopwords-handler", 1, "-stopwords-handler")); newVector.addElement(new Option( "\tThe tokenizing algorihtm (classname plus parameters) to use.\n" + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>")); newVector.addElement(new Option( "\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>")); newVector.addAll(Collections.list(super.listOptions())); return newVector.elements(); }
/** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration<Option> listOptions() { Vector<Option> newVector = new Vector<Option>(); newVector.add(new Option("\tSet the loss function to minimize. 0 = " + "hinge loss (SVM), 1 = log loss (logistic regression)\n\t" + "(default = 0)", "F", 1, "-F")); newVector .add(new Option("\tOutput probabilities for SVMs (fits a logsitic\n\t" + "model to the output of the SVM)", "output-probs", 0, "-outputProbs")); newVector.add(new Option("\tThe learning rate (default = 0.01).", "L", 1, "-L")); newVector.add(new Option("\tThe lambda regularization constant " + "(default = 0.0001)", "R", 1, "-R <double>")); newVector.add(new Option("\tThe number of epochs to perform (" + "batch learning only, default = 500)", "E", 1, "-E <integer>")); newVector.add(new Option("\tUse word frequencies instead of " + "binary bag of words.", "W", 0, "-W")); newVector.add(new Option("\tHow often to prune the dictionary " + "of low frequency words (default = 0, i.e. don't prune)", "P", 1, "-P <# instances>")); newVector.add(new Option("\tMinimum word frequency. Words with less " + "than this frequence are ignored.\n\tIf periodic pruning " + "is turned on then this is also used to determine which\n\t" + "words to remove from the dictionary (default = 3).", "M", 1, "-M <double>")); newVector.add(new Option("\tMinimum absolute value of coefficients " + "in the model.\n\tIf periodic pruning is turned on then this\n\t" + "is also used to prune words from the dictionary\n\t" + "(default = 0.001", "min-coeff", 1, "-min-coeff <double>")); newVector.addElement(new Option( "\tNormalize document length (use in conjunction with -norm and " + "-lnorm)", "normalize", 0, "-normalize")); newVector.addElement(new Option( "\tSpecify the norm that each instance must have (default 1.0)", "norm", 1, "-norm <num>")); newVector.addElement(new Option("\tSpecify L-norm to use (default 2.0)", "lnorm", 1, "-lnorm <num>")); newVector.addElement(new Option("\tConvert all tokens to lowercase " + "before adding to the dictionary.", "lowercase", 0, "-lowercase")); newVector.addElement(new Option( "\tThe stopwords handler to use (default Null).", "-stopwords-handler", 1, "-stopwords-handler")); newVector.addElement(new Option( "\tThe tokenizing algorihtm (classname plus parameters) to use.\n" + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>")); newVector.addElement(new Option( "\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>")); newVector.addAll(Collections.list(super.listOptions())); return newVector.elements(); }
/** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration<Option> listOptions() { Vector<Option> newVector = new Vector<Option>(); newVector.add(new Option("\tUse word frequencies instead of " + "binary bag of words.", "W", 0, "-W")); newVector.add(new Option("\tHow often to prune the dictionary " + "of low frequency words (default = 0, i.e. don't prune)", "P", 1, "-P <# instances>")); newVector.add(new Option("\tMinimum word frequency. Words with less " + "than this frequence are ignored.\n\tIf periodic pruning " + "is turned on then this is also used to determine which\n\t" + "words to remove from the dictionary (default = 3).", "M", 1, "-M <double>")); newVector.addElement(new Option( "\tNormalize document length (use in conjunction with -norm and " + "-lnorm)", "normalize", 0, "-normalize")); newVector.addElement(new Option( "\tSpecify the norm that each instance must have (default 1.0)", "norm", 1, "-norm <num>")); newVector.addElement(new Option( "\tSpecify L-norm to use (default 2.0)", "lnorm", 1, "-lnorm <num>")); newVector.addElement(new Option("\tConvert all tokens to lowercase " + "before adding to the dictionary.", "lowercase", 0, "-lowercase")); newVector.addElement(new Option( "\tIgnore words that are in the stoplist.", "stoplist", 0, "-stoplist")); newVector.addElement(new Option( "\tA file containing stopwords to override the default ones.\n" + "\tUsing this option automatically sets the flag ('-stoplist') to use the\n" + "\tstoplist if the file exists.\n" + "\tFormat: one stopword per line, lines starting with '#'\n" + "\tare interpreted as comments and ignored.", "stopwords", 1, "-stopwords <file>")); newVector.addElement(new Option( "\tThe tokenizing algorihtm (classname plus parameters) to use.\n" + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>")); newVector.addElement(new Option( "\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>")); return newVector.elements(); }
/** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration<Option> listOptions() { Vector<Option> newVector = new Vector<Option>(); newVector.add(new Option("\tSet the loss function to minimize. 0 = " + "hinge loss (SVM), 1 = log loss (logistic regression)\n\t" + "(default = 0)", "F", 1, "-F")); newVector.add(new Option("\tOutput probabilities for SVMs (fits a logsitic\n\t" + "model to the output of the SVM)", "output-probs", 0, "-outputProbs")); newVector.add(new Option("\tThe learning rate (default = 0.01).", "L", 1, "-L")); newVector.add(new Option("\tThe lambda regularization constant " + "(default = 0.0001)", "R", 1, "-R <double>")); newVector.add(new Option("\tThe number of epochs to perform (" + "batch learning only, default = 500)", "E", 1, "-E <integer>")); newVector.add(new Option("\tUse word frequencies instead of " + "binary bag of words.", "W", 0, "-W")); newVector.add(new Option("\tHow often to prune the dictionary " + "of low frequency words (default = 0, i.e. don't prune)", "P", 1, "-P <# instances>")); newVector.add(new Option("\tMinimum word frequency. Words with less " + "than this frequence are ignored.\n\tIf periodic pruning " + "is turned on then this is also used to determine which\n\t" + "words to remove from the dictionary (default = 3).", "M", 1, "-M <double>")); newVector.addElement(new Option( "\tNormalize document length (use in conjunction with -norm and " + "-lnorm)", "normalize", 0, "-normalize")); newVector.addElement(new Option( "\tSpecify the norm that each instance must have (default 1.0)", "norm", 1, "-norm <num>")); newVector.addElement(new Option( "\tSpecify L-norm to use (default 2.0)", "lnorm", 1, "-lnorm <num>")); newVector.addElement(new Option("\tConvert all tokens to lowercase " + "before adding to the dictionary.", "lowercase", 0, "-lowercase")); newVector.addElement(new Option( "\tIgnore words that are in the stoplist.", "stoplist", 0, "-stoplist")); newVector.addElement(new Option( "\tA file containing stopwords to override the default ones.\n" + "\tUsing this option automatically sets the flag ('-stoplist') to use the\n" + "\tstoplist if the file exists.\n" + "\tFormat: one stopword per line, lines starting with '#'\n" + "\tare interpreted as comments and ignored.", "stopwords", 1, "-stopwords <file>")); newVector.addElement(new Option( "\tThe tokenizing algorihtm (classname plus parameters) to use.\n" + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>")); newVector.addElement(new Option( "\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>")); return newVector.elements(); }
/** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration<Option> listOptions() { Vector<Option> newVector = new Vector<Option>(); newVector.add(new Option("\tUse word frequencies instead of " + "binary bag of words.", "W", 0, "-W")); newVector.add(new Option("\tHow often to prune the dictionary " + "of low frequency words (default = 0, i.e. don't prune)", "P", 1, "-P <# instances>")); newVector.add(new Option("\tMinimum word frequency. Words with less " + "than this frequence are ignored.\n\tIf periodic pruning " + "is turned on then this is also used to determine which\n\t" + "words to remove from the dictionary (default = 3).", "M", 1, "-M <double>")); newVector.addElement(new Option( "\tNormalize document length (use in conjunction with -norm and " + "-lnorm)", "normalize", 0, "-normalize")); newVector.addElement(new Option( "\tSpecify the norm that each instance must have (default 1.0)", "norm", 1, "-norm <num>")); newVector.addElement(new Option("\tSpecify L-norm to use (default 2.0)", "lnorm", 1, "-lnorm <num>")); newVector.addElement(new Option("\tConvert all tokens to lowercase " + "before adding to the dictionary.", "lowercase", 0, "-lowercase")); newVector.addElement(new Option( "\tThe stopwords handler to use (default Null).", "-stopwords-handler", 1, "-stopwords-handler")); newVector.addElement(new Option( "\tThe tokenizing algorihtm (classname plus parameters) to use.\n" + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>")); newVector.addElement(new Option( "\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>")); newVector.addAll(Collections.list(super.listOptions())); return newVector.elements(); }