/**
 * Creates a MatchingRule that can be trained using the Weka library for
 * identity resolution.
 *
 * @param finalThreshold
 *          the confidence level that the classifier must exceed to
 *          classify a record pair as a match
 * @param classifierName
 *          the name of a specific classifier from the Weka library
 * @param parameters
 *          the parameters used to tune the classifier
 */
public WekaMatchingRule(double finalThreshold, String classifierName, String[] parameters) {
  super(finalThreshold);
  this.parameters = parameters;

  // create classifier
  try {
    this.classifier = (Classifier) Utils.forName(Classifier.class, classifierName, parameters);
  } catch (Exception e) {
    e.printStackTrace();
  }

  // create list for comparators
  this.comparators = new LinkedList<>();
}
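// Usage sketch, not from the original source: the threshold and classifier
// below are illustrative. The classifier name must be a fully qualified Weka
// class name, since Utils.forName resolves it via reflection.
//
//   WekaMatchingRule rule = new WekaMatchingRule(0.7,
//     "weka.classifiers.trees.J48", new String[0]);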
public SentenceType classifySentence(Sentence sentence) {
  SpeechActsClassifier.Features features = speechActsClassifier.classifyFeatures(sentence);

  Instance inst = new DenseInstance(6);
  inst.setDataset(dataSet);
  inst.setValue(0, features.getSentenceLength());
  inst.setValue(1, features.getNumberOfNouns());
  inst.setValue(2, (features.isEndingInNounOrAdjective() ? 1 : 0));
  inst.setValue(3, (features.isBeginningInVerb() ? 1 : 0));
  inst.setValue(4, features.getCountOfWhMarkers());
  inst.setValue(5, Utils.missingValue());

  try {
    return SentenceType.valueOf(classifier.classifyInstance(inst));
  } catch (Exception e) {
    throw new RuntimeException("Can't classify sentence", e);
  }
}
public QuestionType classifyQuestion(Sentence sentence) {
  if (!sentence.isQuestion()) {
    return QuestionType.NA;
  }

  QuestionTypeClassifier.Features features = questionTypeClassifier.classifyFeatures(sentence);

  Instance inst = new DenseInstance(5);
  inst.setDataset(dataSet);
  inst.setValue(0, features.getWhWord());
  inst.setValue(1, features.getWhWordPos());
  inst.setValue(2, features.getPosOfNext());
  inst.setValue(3, features.getRootPos());
  inst.setValue(4, Utils.missingValue());

  try {
    int ndx = (int) classifier.classifyInstance(inst);
    return QuestionType.valueOf(ndx);
  } catch (Exception e) {
    throw new RuntimeException("Can't classify question", e);
  }
}
/**
 * Returns an instance of the class used for generating plot instances for
 * displaying the cluster assignments.
 *
 * @return an instance of the class
 */
public static ClustererAssignmentsPlotInstances getClustererAssignmentsPlotInstances() {
  ClustererAssignmentsPlotInstances result;
  String classname;
  String[] options;

  try {
    options = Utils.splitOptions(get("ClustererAssignmentsPlotInstances",
      "weka.gui.explorer.ClustererAssignmentsPlotInstances"));
    classname = options[0];
    options[0] = "";
    result = (ClustererAssignmentsPlotInstances) Utils.forName(
      ClustererAssignmentsPlotInstances.class, classname, options);
  } catch (Exception e) {
    e.printStackTrace();
    result = new ClustererAssignmentsPlotInstances();
  }

  return result;
}
/**
 * returns a random index based on the given proportions
 *
 * @param proportionArray the proportions
 * @param random the random number generator to use
 * @return the random index
 */
protected int chooseRandomIndexBasedOnProportions(double[] proportionArray, Random random) {
  double probSum;
  double val;
  int index;
  double sum;

  probSum = Utils.sum(proportionArray);
  val = random.nextDouble() * probSum;
  index = 0;
  sum = 0.0;
  while ((sum <= val) && (index < proportionArray.length)) {
    sum += proportionArray[index++];
  }

  return index - 1;
}
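// Worked example (illustrative numbers): for proportionArray = {2.0, 3.0, 5.0},
// probSum = 10. A draw of random.nextDouble() = 0.37 gives val = 3.7; the loop
// accumulates sum = 2.0 (index becomes 1), then sum = 5.0 (index becomes 2),
// which exceeds val, so the method returns 2 - 1 = 1. Index i is thus chosen
// with probability proportionArray[i] / probSum.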
/**
 * Returns description of the bagged classifier.
 *
 * @return description of the bagged classifier as a string
 */
@Override
public String toString() {
  if (m_Classifiers == null) {
    return "Bagging: No model built yet.";
  }

  StringBuffer text = new StringBuffer();
  text.append("All the base classifiers: \n\n");
  for (int i = 0; i < m_Classifiers.length; i++) {
    text.append(m_Classifiers[i].toString() + "\n\n");
  }

  if (m_CalcOutOfBag) {
    text.append("Out of bag error: " + Utils.doubleToString(m_OutOfBagError, 4) + "\n\n");
  }

  return text.toString();
}
/**
 * Choose last index (i.e. choose rule).
 */
public final int chooseLastIndex() {
  int minIndex = 0;
  double estimated, min = Double.MAX_VALUE;

  if (!m_isLeaf) {
    for (int i = 0; i < m_sons.length; i++) {
      if (son(i) != null) {
        if (Utils.grOrEq(localModel().distribution().perBag(i), m_minNumObj)) {
          estimated = son(i).getSizeOfBranch();
          if (Utils.sm(estimated, min)) {
            min = estimated;
            minIndex = i;
          }
        }
      }
    }
  }

  return minIndex;
}
/**
 * Calculates the area under the learning curve (ALC).
 *
 * @param ds an array of values
 * @param xDelta the step
 * @return the area under the learning curve
 */
public static double getAreaUnderLearningCurve(double[] ds, double xDelta) {
  final int n = ds.length;
  if (n == 0) {
    return Double.NaN;
  }

  double area = 0;
  double total = 0;
  for (int i = n - 2; i >= 0; i--) {
    total += xDelta;
    area += (ds[i] * xDelta);
  }

  if (area == 0) {
    return Utils.missingValue();
  }

  return area / total;
}
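// Worked example (illustrative numbers): for ds = {0.6, 0.7, 0.8} and
// xDelta = 10, the loop visits i = 1 and i = 0 (the last point is excluded,
// making this a left Riemann sum): total = 20, area = 0.7*10 + 0.6*10 = 13,
// so the method returns 13 / 20 = 0.65.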
/**
 * Help method for computing the split entropy.
 */
private final double splitEnt(Distribution bags, double totalnoInst) {
  double returnValue = 0;
  double noUnknown;
  int i;

  noUnknown = totalnoInst - bags.total();
  if (Utils.gr(bags.total(), 0)) {
    for (i = 0; i < bags.numBags(); i++) {
      returnValue = returnValue - lnFunc(bags.perBag(i));
    }
    returnValue = returnValue - lnFunc(noUnknown);
    returnValue = returnValue + lnFunc(totalnoInst);
  }

  return returnValue / ContingencyTables.log2;
}
/**
 * returns a key for all the column names, for better readability in case
 * the names got cut off
 *
 * @return the key
 */
public String toStringKey() {
  String result;
  int i;

  result = "Key,\n";
  for (i = 0; i < getColCount(); i++) {
    if (getColHidden(i)) {
      continue;
    }

    result += LEFT_PARENTHESES + (i + 1) + RIGHT_PARENTHESES + ","
      + Utils.quote(removeFilterName(m_ColNames[i])) + "\n";
  }

  return result;
}
/**
 * Calculate the entropy of the prior distribution.
 *
 * @return the entropy of the prior distribution
 * @throws Exception if the class is not nominal
 */
public final double priorEntropy() throws Exception {
  if (!m_ClassIsNominal) {
    throw new Exception("Can't compute entropy of class prior: class numeric!");
  }

  if (m_NoPriors) {
    return Double.NaN;
  }

  double entropy = 0;
  for (int i = 0; i < m_NumClasses; i++) {
    entropy -= m_ClassPriors[i] / m_ClassPriorsSum
      * Utils.log2(m_ClassPriors[i] / m_ClassPriorsSum);
  }

  return entropy;
}
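// Worked example (illustrative priors): with class priors {50, 50} and
// m_ClassPriorsSum = 100 the prior entropy is
// -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1 bit; skewed priors {90, 10} give
// -(0.9*log2(0.9) + 0.1*log2(0.1)), roughly 0.469 bits.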
/**
 * Computes the class distribution for the given test instance. The instance
 * has to belong to a dataset when it's being classified.
 *
 * @param inst the instance to be classified
 * @return the predicted class probability distribution (a single-element
 *         array holding the predicted value if the class is numeric)
 * @exception Exception if an error occurred during the prediction
 */
public double[] distributionForInstance(Instance inst) throws Exception {
  if (!m_initialized) {
    mapToMiningSchema(inst.dataset());
  }

  double[] preds = null;
  if (m_miningSchema.getFieldsAsInstances().classAttribute().isNumeric()) {
    preds = new double[1];
  } else {
    preds = new double[m_miningSchema.getFieldsAsInstances().classAttribute().numValues()];
  }

  double[] incoming = m_fieldsMap.instanceToSchema(inst, m_miningSchema);
  preds = m_ruleSet.score(incoming, m_miningSchema.getFieldsAsInstances().classAttribute());

  if (m_miningSchema.getFieldsAsInstances().classAttribute().isNominal()) {
    Utils.normalize(preds);
  }

  return preds;
}
@Override
public void setup(Context context) throws IOException {
  m_task = new CSVToARFFHeaderReduceTask();

  Configuration conf = context.getConfiguration();
  String taskOpts = conf.get(CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS);
  if (taskOpts != null && taskOpts.length() > 0) {
    try {
      String[] options = Utils.splitOptions(taskOpts);
      m_estimateQuantiles = Utils.getFlag("compute-quartiles", options);
    } catch (Exception ex) {
      throw new IOException(ex);
    }
  }
}
/**
 * Pretty hokey heuristic to try and set the t2 distance automatically based
 * on the standard deviation.
 *
 * @param trainingBatch the training instances
 * @throws Exception if a problem occurs
 */
protected void setT2T1BasedOnStdDev(Instances trainingBatch) throws Exception {
  double normalizedStdDevSum = 0;

  for (int i = 0; i < trainingBatch.numAttributes(); i++) {
    if (trainingBatch.attribute(i).isNominal()) {
      normalizedStdDevSum += 0.25;
    } else if (trainingBatch.attribute(i).isNumeric()) {
      AttributeStats stats = trainingBatch.attributeStats(i);
      if (trainingBatch.numInstances() - stats.missingCount > 2) {
        double stdDev = stats.numericStats.stdDev;
        double min = stats.numericStats.min;
        double max = stats.numericStats.max;
        if (!Utils.isMissingValue(stdDev) && max - min > 0) {
          stdDev = 0.5 * stdDev / (max - min);
          normalizedStdDevSum += stdDev;
        }
      }
    }
  }

  normalizedStdDevSum = Math.sqrt(normalizedStdDevSum);
  if (normalizedStdDevSum > 0) {
    m_t2 = normalizedStdDevSum;
  }
}
/**
 * Gets the current settings of the data generator RDG1. Removal of
 * blacklisted options has to be done in the derived class that defines the
 * blacklist entry.
 *
 * @return an array of strings suitable for passing to setOptions
 * @see #removeBlacklist(String[])
 */
@Override
public String[] getOptions() {
  Vector<String> result = new Vector<String>();

  // to avoid endless loop
  if (!m_CreatingRelationName) {
    result.add("-r");
    result.add(Utils.quote(getRelationNameToUse()));
  }

  if (getDebug()) {
    result.add("-d");
  }

  result.add("-S");
  result.add("" + getSeed());

  return result.toArray(new String[result.size()]);
}
@Override
public void run(Object toRun, String[] args) throws IllegalArgumentException {
  if (!(toRun instanceof WekaClassifierEvaluationHadoopJob)) {
    throw new IllegalArgumentException(
      "Object to run is not a WekaClassifierEvaluationHadoopJob!");
  }

  try {
    WekaClassifierEvaluationHadoopJob wchej = (WekaClassifierEvaluationHadoopJob) toRun;
    if (Utils.getFlag('h', args)) {
      String help = DistributedJob.makeOptionsStr(wchej);
      System.err.println(help);
      System.exit(1);
    }

    wchej.setOptions(args);
    wchej.runJob();

    System.out.print(wchej.getText());
  } catch (Exception ex) {
    ex.printStackTrace();
  }
}
/**
 * Updates the options that the current classifier is using.
 */
protected void updateOptions() {
  if (m_Template instanceof OptionHandler) {
    m_ClassifierOptions = Utils.joinOptions(((OptionHandler) m_Template).getOptions());
  } else {
    m_ClassifierOptions = "";
  }

  if (m_Template instanceof Serializable) {
    ObjectStreamClass obs = ObjectStreamClass.lookup(m_Template.getClass());
    m_ClassifierVersion = "" + obs.getSerialVersionUID();
  } else {
    m_ClassifierVersion = "";
  }
}
/**
 * Returns class with highest frequency over all bags.
 */
public final int maxClass() {
  double maxCount = 0;
  int maxIndex = 0;
  int i;

  for (i = 0; i < m_perClass.length; i++) {
    if (Utils.gr(m_perClass[i], maxCount)) {
      maxCount = m_perClass[i];
      maxIndex = i;
    }
  }

  return maxIndex;
}
/**
 * Constructor that takes a precision argument.
 *
 * @param precision the precision to which numeric values are given. For
 *          example, if the precision is stated to be 0.1, the values in the
 *          interval (0.25, 0.35] are all treated as 0.3.
 */
public KernelEstimator(double precision) {
  m_Values = new double[50];
  m_Weights = new double[50];
  m_NumValues = 0;
  m_SumOfWeights = 0;
  m_AllWeightsOne = true;
  m_Precision = precision;

  // precision cannot be zero
  if (m_Precision < Utils.SMALL) {
    m_Precision = Utils.SMALL;
  }

  // m_StandardDev = 1e10 * m_Precision; // Set the standard deviation initially very wide
  m_StandardDev = m_Precision / (2 * 3);
}
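// Usage sketch, not from the original source (values are illustrative;
// assumes weka.estimators.KernelEstimator on the classpath): feed weighted
// observations into the estimator, then query the density at a point.
public static void kernelEstimatorDemo() {
  KernelEstimator ke = new KernelEstimator(0.01);
  ke.addValue(2.3, 1.0); // observed value, weight
  ke.addValue(2.7, 1.0);
  System.out.println("p(2.5) = " + ke.getProbability(2.5));
}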
@Override
public String[] getOptions() {
  Vector<String> result = new Vector<String>();

  result.add("-dest"); //$NON-NLS-1$
  result.add(getHDFSPath());

  if (!DistributedJobConfig.isEmpty(getDFSReplicationFactor())) {
    result.add("-dfs-replication"); //$NON-NLS-1$
    result.add(getDFSReplicationFactor());
  }

  // guard against a null delegate before dereferencing it
  if (m_delegate != null) {
    result.add("-saver"); //$NON-NLS-1$
    String saverSpec = m_delegate.getClass().getName() + " " //$NON-NLS-1$
      + Utils.joinOptions(((OptionHandler) m_delegate).getOptions());
    result.add(saverSpec);
  }

  for (String s : m_config.getOptions()) {
    result.add(s);
  }

  return result.toArray(new String[result.size()]);
}
/**
 * Computes an array that has a value for each element in the partition. (If
 * the base classifier supports this.)
 */
public double[] getMembershipValues(Instance inst) throws Exception {
  if (m_Classifier instanceof PartitionGenerator) {
    Instance newInstance = filterInstance(inst);
    if (newInstance == null) {
      double[] unclassified = new double[numElements()];
      for (int i = 0; i < unclassified.length; i++) {
        unclassified[i] = Utils.missingValue();
      }
      return unclassified;
    } else {
      return ((PartitionGenerator) m_Classifier).getMembershipValues(newInstance);
    }
  } else {
    throw new Exception("Classifier: " + getClassifierSpec()
      + " cannot generate a partition");
  }
}
@Override
public void setOptions(String[] options) throws Exception {
  super.setOptions(options);

  String host = Utils.getOption("hdfs-host", options);
  if (!isEmpty(host)) {
    setHDFSHost(host);
  } else {
    setHDFSHost(DEFAULT_HOST);
  }

  String port = Utils.getOption("hdfs-port", options);
  if (!isEmpty(port)) {
    setHDFSPort(port);
  } else {
    setHDFSPort(DEFAULT_PORT);
  }
}
/**
 * Returns index of subset instance is assigned to. Returns -1 if instance is
 * assigned to more than one subset.
 *
 * @exception Exception if something goes wrong
 */
@Override
public final int whichSubset(Instance instance) throws Exception {
  if (instance.isMissing(m_attIndex)) {
    return -1;
  } else {
    if (instance.attribute(m_attIndex).isNominal()) {
      return (int) instance.value(m_attIndex);
    } else if (Utils.smOrEq(instance.value(m_attIndex), m_splitPoint)) {
      return 0;
    } else {
      return 1;
    }
  }
}
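// Worked example (illustrative values): for a nominal attribute with values
// {red, green, blue}, an instance with value "green" is sent to subset 1;
// for a numeric attribute with m_splitPoint = 2.5, a value of 2.5 goes to
// subset 0 (<=) and 2.6 to subset 1; a missing value yields -1.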
/**
 * Sets split point to greatest value in given data smaller or equal to old
 * split point. (C4.5 does this for some strange reason.)
 */
public final void setSplitPoint(Instances allInstances) {
  double newSplitPoint = -Double.MAX_VALUE;
  double tempValue;
  Instance instance;

  if ((allInstances.attribute(m_attIndex).isNumeric()) && (m_numSubsets > 1)) {
    Enumeration<Instance> enu = allInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
      instance = enu.nextElement();
      if (!instance.isMissing(m_attIndex)) {
        tempValue = instance.value(m_attIndex);
        if (Utils.gr(tempValue, newSplitPoint) && Utils.smOrEq(tempValue, m_splitPoint)) {
          newSplitPoint = tempValue;
        }
      }
    }
    m_splitPoint = newSplitPoint;
  }
}
/**
 * Returns true if the supplied header already has quartile information
 * calculated and there are numeric attributes in the data.
 *
 * @param headerWithSummary the header to check
 * @return true if the supplied header has quartile information
 * @throws DistributedWekaException if a problem occurs
 */
public static boolean headerContainsQuartiles(Instances headerWithSummary)
  throws DistributedWekaException {

  Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);

  boolean hasQuartiles = false;
  for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
    if (headerNoSummary.attribute(i).isNumeric()) {
      Attribute summary = headerWithSummary
        .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
          + headerNoSummary.attribute(i).name());
      if (!Utils.isMissingValue(ArffSummaryNumericMetric.FIRSTQUARTILE
        .valueFromAttribute(summary))) {
        hasQuartiles = true;
        break;
      }
    }
  }

  return hasQuartiles;
}
/**
 * Output the cumulative margin distribution as a string suitable for input
 * to gnuplot or a similar package.
 *
 * @return the cumulative margin distribution
 * @throws Exception if the class attribute is not nominal
 */
public String toCumulativeMarginDistributionString() throws Exception {
  if (!m_ClassIsNominal) {
    throw new Exception("Class must be nominal for margin distributions");
  }

  String result = "";
  double cumulativeCount = 0;
  double margin;
  for (int i = 0; i <= k_MarginResolution; i++) {
    if (m_MarginCounts[i] != 0) {
      cumulativeCount += m_MarginCounts[i];
      margin = i * 2.0 / k_MarginResolution - 1.0;
      result = result + Utils.doubleToString(margin, 7, 3) + ' '
        + Utils.doubleToString(cumulativeCount * 100 / m_WithClass, 7, 3) + '\n';
    } else if (i == 0) {
      result = Utils.doubleToString(-1.0, 7, 3) + ' '
        + Utils.doubleToString(0, 7, 3) + '\n';
    }
  }

  return result;
}
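// Usage sketch, not from the original source (train, test, and classifier
// are assumed to exist; this relies on the standard weka.classifiers.Evaluation
// API): the output can be redirected to a file and plotted with gnuplot.
//
//   Evaluation eval = new Evaluation(train);
//   eval.evaluateModel(classifier, test);
//   System.out.println(eval.toCumulativeMarginDistributionString());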
@Override
public double[] getVotesForInstance(Instance inst) {
  // increase the number of seen instances
  totalSeenInstances++;

  // check if there are any rules that cover the instance
  ArrayList<Rule> coveredRules = RulesCoveredInstance(inst);
  // logger.debug("No. Rules cover instance: " + coveredRules.size());
  // logger.debug(inst);

  // return a prediction if there are rules that cover the instance
  if (coveredRules.size() > 0) {
    actualAttempts++;

    double[] classPrediction = new double[inst.numClasses()];
    // vote class labels from all available rules
    for (Rule rule : coveredRules) {
      classPrediction[(int) rule.classification]++;
      // logger.debug(rule.printRule());
    }

    // count the attempt as correct if the majority vote matches the true class
    if (Utils.maxIndex(classPrediction) == (int) inst.classValue()) {
      actualAttemptsCorrectlyClassified++;
    }

    return classPrediction;
  }

  // otherwise, return the majority class
  return observedClassDistribution.getArrayCopy();
}
public void selectFeatures() {
  AttributeSelection attSelection = new AttributeSelection();
  CfsSubsetEval eval = new CfsSubsetEval();
  BestFirst search = new BestFirst();
  attSelection.setEvaluator(eval);
  attSelection.setSearch(search);
  try {
    attSelection.SelectAttributes(iris);
    int[] attIndex = attSelection.selectedAttributes();
    System.out.println(Utils.arrayToString(attIndex));
  } catch (Exception e) {
    e.printStackTrace(); // don't swallow the exception silently
  }
}
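// Setup sketch, not from the original source: selectFeatures() assumes an
// `iris` Instances field populated elsewhere. One hypothetical way to load
// it, assuming an iris.arff file is available locally:
private void loadIris() throws Exception {
  iris = new weka.core.converters.ConverterUtils.DataSource("iris.arff").getDataSet();
  iris.setClassIndex(iris.numAttributes() - 1); // class is the last attribute
}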
private static M5P buildModel(Instances modelInstances, int numOfInstanceInLeaf)
  throws Exception {
  M5P retval = new M5P();
  retval.setSaveInstances(true);
  retval.setOptions(Utils.splitOptions("-N -L -M " + numOfInstanceInLeaf));
  retval.buildClassifier(modelInstances);
  return retval;
}
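// Usage sketch, not from the original source (dataset and leaf size are
// illustrative): build the tree and predict the first instance.
//
//   M5P model = buildModel(modelInstances, 4);
//   double prediction = model.classifyInstance(modelInstances.instance(0));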
/**
 * registers all the editors in Weka.
 */
public static void registerEditors() {
  Properties props;
  Enumeration<?> enm;
  String name;
  String value;

  if (m_EditorsRegistered) {
    return;
  }

  Logger.log(weka.core.logging.Logger.Level.INFO, "---Registering Weka Editors---");
  m_EditorsRegistered = true;

  // load properties
  try {
    props = Utils.readProperties(GUIEDITORS_PROPERTY_FILE);
  } catch (Exception e) {
    props = new Properties();
    e.printStackTrace();
  }

  // show the tool tip?
  m_ShowGlobalInfoToolTip = props.getProperty("ShowGlobalInfoToolTip", "true").equals("true");

  enm = props.propertyNames();
  while (enm.hasMoreElements()) {
    name = enm.nextElement().toString();
    value = props.getProperty(name, "");
    registerEditor(name, value);
  }
}
/**
 * Outputs one node for graph.
 *
 * @param text the buffer to append the output to
 * @param num the current node id
 * @param parent the parent of the node
 * @return the next node id
 * @throws Exception if something goes wrong
 */
protected int toGraph(StringBuffer text, int num, Tree parent) throws Exception {
  num++;
  if (m_Attribute == -1) {
    // note the space before shape=box, which keeps the DOT output valid
    text.append("N" + Integer.toHexString(Tree.this.hashCode())
      + " [label=\"" + num + Utils.backQuoteChars(leafString(parent)) + "\" "
      + "shape=box]\n");
  } else {
    text.append("N" + Integer.toHexString(Tree.this.hashCode())
      + " [label=\"" + num + ": "
      + Utils.backQuoteChars(m_Info.attribute(m_Attribute).name()) + "\"]\n");
    for (int i = 0; i < m_Successors.length; i++) {
      text.append("N" + Integer.toHexString(Tree.this.hashCode()) + "->" + "N"
        + Integer.toHexString(m_Successors[i].hashCode()) + " [label=\"");
      if (m_Info.attribute(m_Attribute).isNumeric()) {
        if (i == 0) {
          text.append(" < " + Utils.doubleToString(m_SplitPoint, 2));
        } else {
          text.append(" >= " + Utils.doubleToString(m_SplitPoint, 2));
        }
      } else {
        text.append(" = " + Utils.backQuoteChars(m_Info.attribute(m_Attribute).value(i)));
      }
      text.append("\"]\n");
      num = m_Successors[i].toGraph(text, num, this);
    }
  }

  return num;
}
/**
 * Prunes the end of the rule.
 */
protected void pruneEnd() throws Exception {
  double errorsLeaf, errorsTree;

  errorsTree = errorsForTree();
  errorsLeaf = errorsForLeaf();
  if (Utils.smOrEq(errorsLeaf, errorsTree)) {
    m_isLeaf = true;
    m_sons = null;
    m_localModel = new NoSplit(localModel().distribution());
  }
}
/**
 * Returns number of classes actually occurring in distribution.
 */
public final int actualNumClasses() {
  int returnValue = 0;
  int i;

  for (i = 0; i < m_perClass.length; i++) {
    if (Utils.gr(m_perClass[i], 0)) {
      returnValue++;
    }
  }

  return returnValue;
}
protected WekaClassifierMapTask setupIncrementalRegressor() {
  WekaClassifierMapTask task = new WekaClassifierMapTask();
  weka.classifiers.functions.SGD sgd = new weka.classifiers.functions.SGD();
  try {
    sgd.setOptions(Utils.splitOptions("-F 2")); // -F 2 selects squared loss (regression)
    task.setClassifier(sgd);
  } catch (Exception e) {
    e.printStackTrace();
  }

  return task;
}
/**
 * Return a textual description of this RegressionTable.
 */
public String toString() {
  Instances miningSchema = m_miningSchema.getFieldsAsInstances();
  StringBuffer temp = new StringBuffer();
  temp.append("Regression table:\n");
  temp.append(miningSchema.classAttribute().name());

  if (m_functionType == CLASSIFICATION) {
    temp.append("=" + miningSchema.classAttribute().value(m_targetCategory));
  }

  temp.append(" =\n\n");

  // do the predictors
  for (int i = 0; i < m_predictors.size(); i++) {
    temp.append(m_predictors.get(i).toString() + " +\n");
  }

  // do the predictor terms
  for (int i = 0; i < m_predictorTerms.size(); i++) {
    temp.append(m_predictorTerms.get(i).toString() + " +\n");
  }

  temp.append(Utils.doubleToString(m_intercept, 12, 4));
  temp.append("\n\n");

  return temp.toString();
}
/**
 * Computes the difference between two given attribute values.
 */
protected double difference(int index, double val1, double val2) {
  switch (m_instances.attribute(index).type()) {
  case Attribute.NOMINAL:
    // if attribute is nominal
    if (Utils.isMissingValue(val1) || Utils.isMissingValue(val2)
      || ((int) val1 != (int) val2)) {
      return 1;
    } else {
      return 0;
    }
  case Attribute.NUMERIC:
    // if attribute is numeric
    if (Utils.isMissingValue(val1) || Utils.isMissingValue(val2)) {
      if (Utils.isMissingValue(val1) && Utils.isMissingValue(val2)) {
        return 1;
      } else {
        double diff;
        if (Utils.isMissingValue(val2)) {
          diff = norm(val1, index);
        } else {
          diff = norm(val2, index);
        }
        if (diff < 0.5) {
          diff = 1.0 - diff;
        }
        return diff;
      }
    } else {
      return norm(val1, index) - norm(val2, index);
    }
  default:
    return 0;
  }
}
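// Worked example (illustrative values): for a numeric attribute on a
// [0, 10] range, val1 = 2 and val2 = 7 give
// norm(val1) - norm(val2) = 0.2 - 0.7 = -0.5. If exactly one value is
// missing, the known value's normalized distance is reflected so the result
// is always at least 0.5; if both are missing, the difference is 1.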
/**
 * sets a specific option/value of the generator (the option must be given
 * without the leading '-')
 *
 * @param generator the generator to set the option for
 * @param option the option to set
 * @param value the new value for the option
 */
protected void setGeneratorOption(BayesNetGenerator generator, String option, String value) {
  String[] options;
  Vector<String> list;
  int i;

  try {
    // get options and remove the specific option
    options = generator.getOptions();
    Utils.getOption(option, options);

    // add the option and set the new options
    list = new Vector<String>();
    for (i = 0; i < options.length; i++) {
      if (options[i].length() != 0) {
        list.add(options[i]);
      }
    }
    list.add("-" + option);
    list.add(value);

    setGeneratorOptions(generator, list);
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * Sets the options.
 *
 * @param options the options
 * @throws Exception if invalid option
 */
@Override
public void setOptions(String[] options) throws Exception {
  String tmpStr;

  super.setOptions(options);

  tmpStr = Utils.getOption('n', options);
  if (tmpStr.length() != 0) {
    setNumExamples(Integer.parseInt(tmpStr));
  } else {
    setNumExamples(defaultNumExamples());
  }
}
/**
 * calculates the mean of the given numeric column
 */
private void calcMean() {
  ArffSortedTableModel model;
  int i;
  double mean;

  // no column selected?
  if (m_CurrentCol == -1) {
    return;
  }

  model = (ArffSortedTableModel) m_TableArff.getModel();

  // not numeric?
  if (!model.getAttributeAt(m_CurrentCol).isNumeric()) {
    return;
  }

  mean = 0;
  for (i = 0; i < model.getRowCount(); i++) {
    // attribute indices are offset by one relative to table columns,
    // since the first table column displays the row number
    mean += model.getInstances().instance(i).value(m_CurrentCol - 1);
  }
  mean = mean / model.getRowCount();

  // show result
  ComponentHelper.showMessageBox(
    getParent(),
    "Mean for attribute...",
    "Mean for attribute '" + m_TableArff.getPlainColumnName(m_CurrentCol)
      + "':\n\t" + Utils.doubleToString(mean, 3),
    JOptionPane.OK_CANCEL_OPTION,
    JOptionPane.PLAIN_MESSAGE);
}