/**
 * Help method for computing the split entropy: the entropy of the split
 * on the attribute itself (ignoring class values), with instances that
 * have an unknown attribute value treated as one additional bag.
 */
private final double splitEnt(Distribution bags, double totalnoInst) {

  // Weight of instances whose attribute value is unknown.
  double unknownWeight = totalnoInst - bags.total();
  double sum = 0;

  if (Utils.gr(bags.total(), 0)) {
    for (int bag = 0; bag < bags.numBags(); bag++) {
      sum -= lnFunc(bags.perBag(bag));
    }
    sum -= lnFunc(unknownWeight);
    sum += lnFunc(totalnoInst);
  }
  return sum / ContingencyTables.log2;
}
/** * Computes entropy of test distribution with respect to training distribution. */ public final double splitCritValue(Distribution train, Distribution test) { double result = 0; int numClasses = 0; int i, j; // Find out relevant number of classes for (j = 0; j < test.numClasses(); j++) if (Utils.gr(train.perClass(j), 0) || Utils.gr(test.perClass(j), 0)) numClasses++; // Compute entropy of test data with respect to training data for (i = 0; i < test.numBags(); i++) if (Utils.gr(test.perBag(i),0)) { for (j = 0; j < test.numClasses(); j++) if (Utils.gr(test.perClassPerBag(i, j), 0)) result -= test.perClassPerBag(i, j)* Math.log(train.perClassPerBag(i, j) + 1); result += test.perBag(i) * Math.log(train.perBag(i) + numClasses); } return result / ContingencyTables.log2; }
/** * Help method for computing entropy. */ public final double lnFunc(double num) { // Constant hard coded for efficiency reasons if (num < 1e-6) return 0; else return ContingencyTables.lnFunc(num); }
/**
 * Computes entropy of distribution before splitting.
 */
public final double oldEnt(Distribution bags) {

  // Sum of n_c * ln(n_c) over all class counts.
  double classTermSum = 0;
  for (int c = 0; c < bags.numClasses(); c++) {
    classTermSum += lnFunc(bags.perClass(c));
  }
  return (lnFunc(bags.total()) - classTermSum) / ContingencyTables.log2;
}
/**
 * Computes entropy of distribution after splitting.
 */
public final double newEnt(Distribution bags) {

  double sum = 0;
  for (int bag = 0; bag < bags.numBags(); bag++) {
    for (int cl = 0; cl < bags.numClasses(); cl++) {
      sum += lnFunc(bags.perClassPerBag(bag, cl));
    }
    sum -= lnFunc(bags.perBag(bag));
  }
  return -(sum / ContingencyTables.log2);
}
/**
 * Computes entropy after splitting without considering the
 * class values.
 */
public final double splitEnt(Distribution bags) {

  // Sum of w_b * ln(w_b) over the per-bag weights.
  double bagTermSum = 0;
  for (int bag = 0; bag < bags.numBags(); bag++) {
    bagTermSum += lnFunc(bags.perBag(bag));
  }
  return (lnFunc(bags.total()) - bagTermSum) / ContingencyTables.log2;
}
/**
 * Method for choosing a subset to expand. Unexpanded subsets are scored
 * by their normalized class entropy; a subset with zero entropy is
 * returned immediately, otherwise the subset with the lowest score wins.
 * Subsets smaller than m_minNumObj are never chosen.
 */
public final int chooseIndex() {

  int bestIndex = -1;
  double best = Double.MAX_VALUE;

  for (int i = 0; i < m_sons.length; i++) {
    if (son(i) != null) {
      continue; // subset already expanded
    }
    double score;
    if (Utils.sm(localModel().distribution().perBag(i), m_minNumObj)) {
      // Too few instances: make this subset unattractive.
      score = Double.MAX_VALUE;
    } else {
      // Entropy of the class distribution within the bag, normalized by
      // the bag weight (and converted from nats to bits).
      score = 0;
      for (int j = 0; j < localModel().distribution().numClasses(); j++) {
        score -= m_splitCrit.lnFunc(localModel().distribution()
          .perClassPerBag(i, j));
      }
      score += m_splitCrit.lnFunc(localModel().distribution().perBag(i));
      score /= (localModel().distribution().perBag(i) * ContingencyTables.log2);
    }
    if (Utils.smOrEq(score, 0)) {
      // Pure subset: expand it right away.
      return i;
    }
    if (Utils.sm(score, best)) {
      best = score;
      bestIndex = i;
    }
  }
  return bestIndex;
}
/**
 * Finds best split for nominal attribute and nominal class
 * and returns value.
 *
 * Evaluates every "one value against the rest" binary split of the
 * nominal attribute and keeps the one with the lowest conditional
 * entropy. Side effects: m_SplitPoint is set to the winning value index
 * and m_Distribution is replaced by the best 3-row distribution (bag 0:
 * the chosen value, bag 1: all other values, bag 2: missing-value
 * instances, or the overall totals when nothing is missing).
 *
 * @param index attribute index
 * @return value of criterion for the best split
 * @throws Exception if something goes wrong
 */
protected double findSplitNominalNominal(int index) throws Exception {

  double bestVal = Double.MAX_VALUE, currVal;
  // One row per attribute value plus a final row for missing values.
  double[][] counts = new double[m_Instances.attribute(index).numValues()
    + 1][m_Instances.numClasses()];
  double[] sumCounts = new double[m_Instances.numClasses()];
  double[][] bestDist = new double[3][m_Instances.numClasses()];
  int numMissing = 0;

  // Compute counts for all the values
  for (int i = 0; i < m_Instances.numInstances(); i++) {
    Instance inst = m_Instances.instance(i);
    if (inst.isMissing(index)) {
      // Missing values accumulate in the extra (last) row.
      numMissing++;
      counts[m_Instances.attribute(index).numValues()]
        [(int)inst.classValue()] += inst.weight();
    } else {
      counts[(int)inst.value(index)][(int)inst.classValue()] += inst
        .weight();
    }
  }

  // Compute sum of counts
  for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) {
    for (int j = 0; j < m_Instances.numClasses(); j++) {
      sumCounts[j] += counts[i][j];
    }
  }

  // Make split counts for each possible split and evaluate
  System.arraycopy(counts[m_Instances.attribute(index).numValues()], 0,
    m_Distribution[2], 0, m_Instances.numClasses());
  for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) {
    for (int j = 0; j < m_Instances.numClasses(); j++) {
      // Bag 0: candidate value i; bag 1: all other values.
      m_Distribution[0][j] = counts[i][j];
      m_Distribution[1][j] = sumCounts[j] - counts[i][j];
    }
    currVal = ContingencyTables.entropyConditionedOnRows(m_Distribution);
    if (currVal < bestVal) {
      bestVal = currVal;
      m_SplitPoint = (double)i;
      for (int j = 0; j < 3; j++) {
        System.arraycopy(m_Distribution[j], 0, bestDist[j], 0,
          m_Instances.numClasses());
      }
    }
  }

  // No missing values in training data.
  if (numMissing == 0) {
    System.arraycopy(sumCounts, 0, bestDist[2], 0, m_Instances.numClasses());
  }

  m_Distribution = bestDist;
  return bestVal;
}
/** * Test using Fayyad and Irani's MDL criterion. * * @param priorCounts * @param bestCounts * @param numInstances * @param numCutPoints * @return true if the splits is acceptable */ private boolean FayyadAndIranisMDL(double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) { double priorEntropy, entropy, gain; double entropyLeft, entropyRight, delta; int numClassesTotal, numClassesRight, numClassesLeft; // Compute entropy before split. priorEntropy = ContingencyTables.entropy(priorCounts); // Compute entropy after split. entropy = ContingencyTables.entropyConditionedOnRows(bestCounts); // Compute information gain. gain = priorEntropy - entropy; // Number of classes occuring in the set numClassesTotal = 0; for (double priorCount : priorCounts) { if (priorCount > 0) { numClassesTotal++; } } // Number of classes occuring in the left subset numClassesLeft = 0; for (int i = 0; i < bestCounts[0].length; i++) { if (bestCounts[0][i] > 0) { numClassesLeft++; } } // Number of classes occuring in the right subset numClassesRight = 0; for (int i = 0; i < bestCounts[1].length; i++) { if (bestCounts[1][i] > 0) { numClassesRight++; } } // Entropy of the left and the right subsets entropyLeft = ContingencyTables.entropy(bestCounts[0]); entropyRight = ContingencyTables.entropy(bestCounts[1]); // Compute terms for MDL formula delta = Utils.log2(Math.pow(3, numClassesTotal) - 2) - ((numClassesTotal * priorEntropy) - (numClassesRight * entropyRight) - (numClassesLeft * entropyLeft)); // Check if split is to be accepted return (gain > (Utils.log2(numCutPoints) + delta) / numInstances); }
/** * Finds best split for nominal attribute and nominal class * and returns value. * * @param index attribute index * @return value of criterion for the best split * @throws Exception if something goes wrong */ private double findSplitNominalNominal(int index) throws Exception { double bestVal = Double.MAX_VALUE, currVal; double[][] counts = new double[m_Instances.attribute(index).numValues() + 1][m_Instances.numClasses()]; double[] sumCounts = new double[m_Instances.numClasses()]; double[][] bestDist = new double[3][m_Instances.numClasses()]; int numMissing = 0; // Compute counts for all the values for (int i = 0; i < m_Instances.numInstances(); i++) { Instance inst = m_Instances.instance(i); if (inst.isMissing(index)) { numMissing++; counts[m_Instances.attribute(index).numValues()] [(int)inst.classValue()] += inst.weight(); } else { counts[(int)inst.value(index)][(int)inst.classValue()] += inst .weight(); } } // Compute sum of counts for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) { for (int j = 0; j < m_Instances.numClasses(); j++) { sumCounts[j] += counts[i][j]; } } // Make split counts for each possible split and evaluate System.arraycopy(counts[m_Instances.attribute(index).numValues()], 0, m_Distribution[2], 0, m_Instances.numClasses()); for (int i = 0; i < m_Instances.attribute(index).numValues(); i++) { for (int j = 0; j < m_Instances.numClasses(); j++) { m_Distribution[0][j] = counts[i][j]; m_Distribution[1][j] = sumCounts[j] - counts[i][j]; } currVal = ContingencyTables.entropyConditionedOnRows(m_Distribution); if (currVal < bestVal) { bestVal = currVal; m_SplitPoint = (double)i; for (int j = 0; j < 3; j++) { System.arraycopy(m_Distribution[j], 0, bestDist[j], 0, m_Instances.numClasses()); } } } // No missing values in training data. if (numMissing == 0) { System.arraycopy(sumCounts, 0, bestDist[2], 0, m_Instances.numClasses()); } m_Distribution = bestDist; return bestVal; }
/**
 * Test using Fayyad and Irani's MDL criterion.
 *
 * Accepts the best split only if its information gain exceeds the MDL
 * coding cost of describing the split, averaged over the instances.
 *
 * @param priorCounts class counts before the split
 * @param bestCounts class counts of the two subsets of the best split
 * @param numInstances total number of instances
 * @param numCutPoints number of candidate cut points considered
 * @return true if the splits is acceptable
 */
private boolean FayyadAndIranisMDL(double[] priorCounts,
  double[][] bestCounts, double numInstances, int numCutPoints) {

  double priorEntropy, entropy, gain;
  double entropyLeft, entropyRight, delta;
  int numClassesTotal, numClassesRight, numClassesLeft;

  // Compute entropy before split.
  priorEntropy = ContingencyTables.entropy(priorCounts);

  // Compute entropy after split.
  entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);

  // Compute information gain.
  gain = priorEntropy - entropy;

  // Number of classes occuring in the set
  numClassesTotal = 0;
  for (int i = 0; i < priorCounts.length; i++) {
    if (priorCounts[i] > 0) {
      numClassesTotal++;
    }
  }

  // Number of classes occuring in the left subset
  numClassesLeft = 0;
  for (int i = 0; i < bestCounts[0].length; i++) {
    if (bestCounts[0][i] > 0) {
      numClassesLeft++;
    }
  }

  // Number of classes occuring in the right subset
  numClassesRight = 0;
  for (int i = 0; i < bestCounts[1].length; i++) {
    if (bestCounts[1][i] > 0) {
      numClassesRight++;
    }
  }

  // Entropy of the left and the right subsets
  entropyLeft = ContingencyTables.entropy(bestCounts[0]);
  entropyRight = ContingencyTables.entropy(bestCounts[1]);

  // Compute terms for MDL formula
  delta = Utils.log2(Math.pow(3, numClassesTotal) - 2)
    - (((double) numClassesTotal * priorEntropy)
      - (numClassesRight * entropyRight) - (numClassesLeft * entropyLeft));

  // Check if split is to be accepted
  return (gain > (Utils.log2(numCutPoints) + delta) / (double)numInstances);
}
/**
 * Computes value of splitting criterion before split.
 *
 * @param dist the distributions
 * @return the splitting criterion
 */
protected double priorVal(double[][] dist) {

  // Entropy of the class distribution, ignoring the split.
  double columnEntropy = ContingencyTables.entropyOverColumns(dist);
  return columnEntropy;
}
/**
 * Computes value of splitting criterion after split.
 *
 * @param dist the distributions
 * @param priorVal the splitting criterion
 * @return the gain after the split
 */
protected double gain(double[][] dist, double priorVal) {

  // Information gain: prior entropy minus post-split entropy.
  double postSplitEntropy = ContingencyTables.entropyConditionedOnRows(dist);
  return priorVal - postSplitEntropy;
}
/**
 * Computes value of splitting criterion before split.
 *
 * @param dist the class counts per bag
 * @return the splitting criterion
 */
protected double priorVal(double[][] dist) {

  // The criterion before splitting is the entropy over the columns.
  double prior = ContingencyTables.entropyOverColumns(dist);
  return prior;
}
/**
 * Computes value of splitting criterion after split.
 *
 * @param dist the class counts per bag
 * @param priorVal the splitting criterion
 * @return the gain after splitting
 */
protected double gain(double[][] dist, double priorVal) {

  // Gain is how much the split reduces entropy relative to the prior.
  double conditioned = ContingencyTables.entropyConditionedOnRows(dist);
  return priorVal - conditioned;
}
/**
 * Computes value of splitting criterion before split.
 *
 * @param dist
 *          the distributions
 * @return the splitting criterion
 */
protected double priorVal(double[][] dist) {

  // Class entropy of the undivided data.
  double beforeSplit = ContingencyTables.entropyOverColumns(dist);
  return beforeSplit;
}
/**
 * Computes value of splitting criterion after split.
 *
 * @param dist
 *          the distributions
 * @param priorVal
 *          the splitting criterion
 * @return the gain after the split
 */
protected double gain(double[][] dist, double priorVal) {

  // Entropy remaining after conditioning on the split bags.
  double afterSplit = ContingencyTables.entropyConditionedOnRows(dist);
  return priorVal - afterSplit;
}