Java 类weka.core.tokenizers.WordTokenizer 实例源码

项目:TweetRetriever    文件:LiveClassifier.java   
/**
 * Creates a {@link StringToWordVector} filter that tokenizes with a
 * {@link WordTokenizer} and is initialised with the format of {@code data}.
 *
 * <p>Configuration is best-effort: if any setup call throws, the stack trace
 * is printed and the filter is still returned in whatever state it reached.
 *
 * @param data   the instances whose structure is handed to {@code setInputFormat}
 * @param struct not read by this method
 * @return the filter instance (never {@code null})
 */
public static StringToWordVector getWordFilter(Instances data, ClassifierStructure struct) {
    final StringToWordVector wordFilter = new StringToWordVector();
    try {
        // Upper bound on the number of words the filter retains.
        wordFilter.setWordsToKeep(20000000);
        wordFilter.setTokenizer(new WordTokenizer());
        wordFilter.setInputFormat(data);
    } catch (Exception configFailure) {
        configFailure.printStackTrace();
    }
    return wordFilter;
}
项目:repo.kmeanspp.silhouette_score    文件:NaiveBayesMultinomialText.java   
/**
 * Returns an enumeration describing the available options.
 *
 * <p>The options declared in this method come first, followed by every option
 * reported by {@code super.listOptions()}.
 *
 * @return an enumeration of all the available options.
 */
@Override
public Enumeration<Option> listOptions() {

  Vector<Option> newVector = new Vector<Option>();

  newVector.add(new Option("\tUse word frequencies instead of "
    + "binary bag of words.", "W", 0, "-W"));
  newVector.add(new Option("\tHow often to prune the dictionary "
    + "of low frequency words (default = 0, i.e. don't prune)", "P", 1,
    "-P <# instances>"));
  newVector.add(new Option("\tMinimum word frequency. Words with less "
    + "than this frequence are ignored.\n\tIf periodic pruning "
    + "is turned on then this is also used to determine which\n\t"
    + "words to remove from the dictionary (default = 3).", "M", 1,
    "-M <double>"));
  newVector.add(new Option(
    "\tNormalize document length (use in conjunction with -norm and "
      + "-lnorm)", "normalize", 0, "-normalize"));
  newVector.add(new Option(
    "\tSpecify the norm that each instance must have (default 1.0)", "norm",
    1, "-norm <num>"));
  newVector.add(
    new Option("\tSpecify L-norm to use (default 2.0)", "lnorm", 1,
      "-lnorm <num>"));
  newVector.add(new Option("\tConvert all tokens to lowercase "
    + "before adding to the dictionary.", "lowercase", 0, "-lowercase"));
  // The option name carries no leading dash (only the synopsis does), in line
  // with every other option declared in this method.
  newVector.add(
    new Option("\tThe stopwords handler to use (default Null).",
      "stopwords-handler", 1, "-stopwords-handler"));
  newVector.add(new Option(
    "\tThe tokenizing algorihtm (classname plus parameters) to use.\n"
      + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1,
    "-tokenizer <spec>"));
  newVector.add(new Option(
    "\tThe stemmering algorihtm (classname plus parameters) to use.",
    "stemmer", 1, "-stemmer <spec>"));

  // Append whatever the superclass exposes after the local options.
  newVector.addAll(Collections.list(super.listOptions()));

  return newVector.elements();
}
项目:repo.kmeanspp.silhouette_score    文件:SGDText.java   
/**
 * Returns an enumeration describing the available options.
 *
 * <p>The options declared in this method come first, followed by every option
 * reported by {@code super.listOptions()}.
 *
 * @return an enumeration of all the available options.
 */
@Override
public Enumeration<Option> listOptions() {

  Vector<Option> newVector = new Vector<Option>();
  newVector.add(new Option("\tSet the loss function to minimize. 0 = "
    + "hinge loss (SVM), 1 = log loss (logistic regression)\n\t"
    + "(default = 0)", "F", 1, "-F"));
  // The synopsis spells the flag exactly as the option name does.
  newVector
    .add(new Option("\tOutput probabilities for SVMs (fits a logsitic\n\t"
      + "model to the output of the SVM)", "output-probs", 0, "-output-probs"));
  newVector.add(new Option("\tThe learning rate (default = 0.01).", "L", 1,
    "-L"));
  newVector.add(new Option("\tThe lambda regularization constant "
    + "(default = 0.0001)", "R", 1, "-R <double>"));
  newVector.add(new Option("\tThe number of epochs to perform ("
    + "batch learning only, default = 500)", "E", 1, "-E <integer>"));
  newVector.add(new Option("\tUse word frequencies instead of "
    + "binary bag of words.", "W", 0, "-W"));
  newVector.add(new Option("\tHow often to prune the dictionary "
    + "of low frequency words (default = 0, i.e. don't prune)", "P", 1,
    "-P <# instances>"));
  newVector.add(new Option("\tMinimum word frequency. Words with less "
    + "than this frequence are ignored.\n\tIf periodic pruning "
    + "is turned on then this is also used to determine which\n\t"
    + "words to remove from the dictionary (default = 3).", "M", 1,
    "-M <double>"));

  // Help text closes the "(default = ..." parenthesis it opens.
  newVector.add(new Option("\tMinimum absolute value of coefficients " +
    "in the model.\n\tIf periodic pruning is turned on then this\n\t"
    + "is also used to prune words from the dictionary\n\t"
    + "(default = 0.001)", "min-coeff", 1, "-min-coeff <double>"));

  newVector.add(new Option(
    "\tNormalize document length (use in conjunction with -norm and "
      + "-lnorm)", "normalize", 0, "-normalize"));
  newVector.add(new Option(
    "\tSpecify the norm that each instance must have (default 1.0)", "norm",
    1, "-norm <num>"));
  newVector.add(new Option("\tSpecify L-norm to use (default 2.0)",
    "lnorm", 1, "-lnorm <num>"));
  newVector.add(new Option("\tConvert all tokens to lowercase "
    + "before adding to the dictionary.", "lowercase", 0, "-lowercase"));
  // The option name carries no leading dash (only the synopsis does), in line
  // with every other option declared in this method.
  newVector.add(new Option(
    "\tThe stopwords handler to use (default Null).",
    "stopwords-handler", 1, "-stopwords-handler"));
  newVector.add(new Option(
    "\tThe tokenizing algorihtm (classname plus parameters) to use.\n"
      + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1,
    "-tokenizer <spec>"));
  newVector.add(new Option(
    "\tThe stemmering algorihtm (classname plus parameters) to use.",
    "stemmer", 1, "-stemmer <spec>"));

  // Append whatever the superclass exposes after the local options.
  newVector.addAll(Collections.list(super.listOptions()));

  return newVector.elements();
}
项目:autoweka    文件:NaiveBayesMultinomialText.java   
/**
 * Lists the command-line options declared by this method.
 *
 * <p>Each entry pairs a help description with the option name, the number of
 * arguments it takes and its synopsis.
 *
 * @return an enumeration over the option descriptors, in declaration order.
 */
public Enumeration<Option> listOptions() {
  final Vector<Option> options = new Vector<Option>();

  // Dictionary construction and pruning.
  options.add(new Option(
    "\tUse word frequencies instead of binary bag of words.",
    "W", 0, "-W"));
  options.add(new Option(
    "\tHow often to prune the dictionary of low frequency words "
      + "(default = 0, i.e. don't prune)",
    "P", 1, "-P <# instances>"));
  options.add(new Option(
    "\tMinimum word frequency. Words with less than this frequence are ignored.\n"
      + "\tIf periodic pruning is turned on then this is also used to determine which\n"
      + "\twords to remove from the dictionary (default = 3).",
    "M", 1, "-M <double>"));

  // Document-length normalisation.
  options.add(new Option(
    "\tNormalize document length (use in conjunction with -norm and -lnorm)",
    "normalize", 0, "-normalize"));
  options.add(new Option(
    "\tSpecify the norm that each instance must have (default 1.0)",
    "norm", 1, "-norm <num>"));
  options.add(new Option(
    "\tSpecify L-norm to use (default 2.0)",
    "lnorm", 1, "-lnorm <num>"));

  // Token pre-processing.
  options.add(new Option(
    "\tConvert all tokens to lowercase before adding to the dictionary.",
    "lowercase", 0, "-lowercase"));
  options.add(new Option(
    "\tIgnore words that are in the stoplist.",
    "stoplist", 0, "-stoplist"));

  final String stopwordsHelp =
    "\tA file containing stopwords to override the default ones.\n"
      + "\tUsing this option automatically sets the flag ('-stoplist') to use the\n"
      + "\tstoplist if the file exists.\n"
      + "\tFormat: one stopword per line, lines starting with '#'\n"
      + "\tare interpreted as comments and ignored.";
  options.add(new Option(stopwordsHelp, "stopwords", 1, "-stopwords <file>"));

  final String tokenizerHelp =
    "\tThe tokenizing algorihtm (classname plus parameters) to use.\n"
      + "\t(default: " + WordTokenizer.class.getName() + ")";
  options.add(new Option(tokenizerHelp, "tokenizer", 1, "-tokenizer <spec>"));

  options.add(new Option(
    "\tThe stemmering algorihtm (classname plus parameters) to use.",
    "stemmer", 1, "-stemmer <spec>"));

  return options.elements();
}
项目:autoweka    文件:SGDText.java   
/**
 * Returns an enumeration describing the available options.
 *
 * <p>Each entry pairs a help description with the option name, the number of
 * arguments it takes and its synopsis.
 *
 * @return an enumeration of all the available options.
 */
public Enumeration<Option> listOptions() {

  Vector<Option> newVector = new Vector<Option>();
  newVector.add(new Option("\tSet the loss function to minimize. 0 = " +
      "hinge loss (SVM), 1 = log loss (logistic regression)\n\t" +
      "(default = 0)", "F", 1, "-F"));
  // The synopsis spells the flag exactly as the option name does.
  newVector.add(new Option("\tOutput probabilities for SVMs (fits a logsitic\n\t" +
        "model to the output of the SVM)", "output-probs",
      0, "-output-probs"));
  newVector.add(new Option("\tThe learning rate (default = 0.01).", "L", 1, "-L"));
  newVector.add(new Option("\tThe lambda regularization constant " +
              "(default = 0.0001)",
              "R", 1, "-R <double>"));
  newVector.add(new Option("\tThe number of epochs to perform (" +
              "batch learning only, default = 500)", "E", 1,
              "-E <integer>"));
  newVector.add(new Option("\tUse word frequencies instead of " +
        "binary bag of words.", "W", 0,
        "-W"));
  newVector.add(new Option("\tHow often to prune the dictionary " +
        "of low frequency words (default = 0, i.e. don't prune)",
        "P", 1, "-P <# instances>"));
  newVector.add(new Option("\tMinimum word frequency. Words with less " +
        "than this frequence are ignored.\n\tIf periodic pruning " +
        "is turned on then this is also used to determine which\n\t" +
        "words to remove from the dictionary (default = 3).",
        "M", 1, "-M <double>"));
  newVector.add(new Option(
      "\tNormalize document length (use in conjunction with -norm and " +
      "-lnorm)", "normalize", 0, "-normalize"));
  newVector.add(new Option(
      "\tSpecify the norm that each instance must have (default 1.0)",
      "norm", 1, "-norm <num>"));
  newVector.add(new Option(
      "\tSpecify L-norm to use (default 2.0)",
      "lnorm", 1, "-lnorm <num>"));
  newVector.add(new Option("\tConvert all tokens to lowercase " +
        "before adding to the dictionary.",
      "lowercase", 0, "-lowercase"));
  newVector.add(new Option(
      "\tIgnore words that are in the stoplist.",
      "stoplist", 0, "-stoplist"));
  newVector.add(new Option(
      "\tA file containing stopwords to override the default ones.\n"
      + "\tUsing this option automatically sets the flag ('-stoplist') to use the\n"
      + "\tstoplist if the file exists.\n"
      + "\tFormat: one stopword per line, lines starting with '#'\n"
      + "\tare interpreted as comments and ignored.",
      "stopwords", 1, "-stopwords <file>"));
  newVector.add(new Option(
      "\tThe tokenizing algorihtm (classname plus parameters) to use.\n"
      + "\t(default: " + WordTokenizer.class.getName() + ")",
      "tokenizer", 1, "-tokenizer <spec>"));
  newVector.add(new Option(
      "\tThe stemmering algorihtm (classname plus parameters) to use.",
      "stemmer", 1, "-stemmer <spec>"));

  return newVector.elements();
}
项目:umple    文件:NaiveBayesMultinomialText.java   
/**
 * Returns an enumeration describing the available options.
 *
 * <p>The options declared in this method come first, followed by every option
 * reported by {@code super.listOptions()}.
 *
 * @return an enumeration of all the available options.
 */
@Override
public Enumeration<Option> listOptions() {

  Vector<Option> newVector = new Vector<Option>();

  newVector.add(new Option("\tUse word frequencies instead of "
    + "binary bag of words.", "W", 0, "-W"));
  newVector.add(new Option("\tHow often to prune the dictionary "
    + "of low frequency words (default = 0, i.e. don't prune)", "P", 1,
    "-P <# instances>"));
  newVector.add(new Option("\tMinimum word frequency. Words with less "
    + "than this frequence are ignored.\n\tIf periodic pruning "
    + "is turned on then this is also used to determine which\n\t"
    + "words to remove from the dictionary (default = 3).", "M", 1,
    "-M <double>"));
  newVector.add(new Option(
    "\tNormalize document length (use in conjunction with -norm and "
      + "-lnorm)", "normalize", 0, "-normalize"));
  newVector.add(new Option(
    "\tSpecify the norm that each instance must have (default 1.0)", "norm",
    1, "-norm <num>"));
  newVector.add(new Option("\tSpecify L-norm to use (default 2.0)",
    "lnorm", 1, "-lnorm <num>"));
  newVector.add(new Option("\tConvert all tokens to lowercase "
    + "before adding to the dictionary.", "lowercase", 0, "-lowercase"));
  // The option name carries no leading dash (only the synopsis does), in line
  // with every other option declared in this method.
  newVector.add(new Option(
    "\tThe stopwords handler to use (default Null).",
    "stopwords-handler", 1, "-stopwords-handler"));
  newVector.add(new Option(
    "\tThe tokenizing algorihtm (classname plus parameters) to use.\n"
      + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1,
    "-tokenizer <spec>"));
  newVector.add(new Option(
    "\tThe stemmering algorihtm (classname plus parameters) to use.",
    "stemmer", 1, "-stemmer <spec>"));

  // Append whatever the superclass exposes after the local options.
  newVector.addAll(Collections.list(super.listOptions()));

  return newVector.elements();
}
项目:umple    文件:SGDText.java   
/**
 * Returns an enumeration describing the available options.
 *
 * <p>The options declared in this method come first, followed by every option
 * reported by {@code super.listOptions()}.
 *
 * @return an enumeration of all the available options.
 */
@Override
public Enumeration<Option> listOptions() {

  Vector<Option> newVector = new Vector<Option>();
  newVector.add(new Option("\tSet the loss function to minimize. 0 = "
    + "hinge loss (SVM), 1 = log loss (logistic regression)\n\t"
    + "(default = 0)", "F", 1, "-F"));
  // The synopsis spells the flag exactly as the option name does.
  newVector
    .add(new Option("\tOutput probabilities for SVMs (fits a logsitic\n\t"
      + "model to the output of the SVM)", "output-probs", 0, "-output-probs"));
  newVector.add(new Option("\tThe learning rate (default = 0.01).", "L", 1,
    "-L"));
  newVector.add(new Option("\tThe lambda regularization constant "
    + "(default = 0.0001)", "R", 1, "-R <double>"));
  newVector.add(new Option("\tThe number of epochs to perform ("
    + "batch learning only, default = 500)", "E", 1, "-E <integer>"));
  newVector.add(new Option("\tUse word frequencies instead of "
    + "binary bag of words.", "W", 0, "-W"));
  newVector.add(new Option("\tHow often to prune the dictionary "
    + "of low frequency words (default = 0, i.e. don't prune)", "P", 1,
    "-P <# instances>"));
  newVector.add(new Option("\tMinimum word frequency. Words with less "
    + "than this frequence are ignored.\n\tIf periodic pruning "
    + "is turned on then this is also used to determine which\n\t"
    + "words to remove from the dictionary (default = 3).", "M", 1,
    "-M <double>"));

  // Help text closes the "(default = ..." parenthesis it opens.
  newVector.add(new Option("\tMinimum absolute value of coefficients " +
    "in the model.\n\tIf periodic pruning is turned on then this\n\t"
    + "is also used to prune words from the dictionary\n\t"
    + "(default = 0.001)", "min-coeff", 1, "-min-coeff <double>"));

  newVector.add(new Option(
    "\tNormalize document length (use in conjunction with -norm and "
      + "-lnorm)", "normalize", 0, "-normalize"));
  newVector.add(new Option(
    "\tSpecify the norm that each instance must have (default 1.0)", "norm",
    1, "-norm <num>"));
  newVector.add(new Option("\tSpecify L-norm to use (default 2.0)",
    "lnorm", 1, "-lnorm <num>"));
  newVector.add(new Option("\tConvert all tokens to lowercase "
    + "before adding to the dictionary.", "lowercase", 0, "-lowercase"));
  // The option name carries no leading dash (only the synopsis does), in line
  // with every other option declared in this method.
  newVector.add(new Option(
    "\tThe stopwords handler to use (default Null).",
    "stopwords-handler", 1, "-stopwords-handler"));
  newVector.add(new Option(
    "\tThe tokenizing algorihtm (classname plus parameters) to use.\n"
      + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1,
    "-tokenizer <spec>"));
  newVector.add(new Option(
    "\tThe stemmering algorihtm (classname plus parameters) to use.",
    "stemmer", 1, "-stemmer <spec>"));

  // Append whatever the superclass exposes after the local options.
  newVector.addAll(Collections.list(super.listOptions()));

  return newVector.elements();
}