/**
 * Calculates the metric value: the mean entropy over all nominal feature
 * attributes of the dataset.
 *
 * @param mlData Multi-label dataset for which to calculate the metric
 * @return Value of the metric (0.0 if the dataset has no nominal features)
 */
public double calculate(MultiLabelInstances mlData){
    double sum = 0.0;
    Instances instances = mlData.getDataSet();
    int countNominal = 0;
    int [] featureIndices = mlData.getFeatureIndices();

    for(int fIndex : featureIndices){
        AttributeStats attStats = instances.attributeStats(fIndex);
        // nominalCounts is only populated for nominal attributes
        if(attStats.nominalCounts != null){
            countNominal++;
            sum += Utils.entropy(attStats.nominalCounts);
        }
    }

    // Guard against division by zero: previously a dataset with no nominal
    // features produced 0.0/0 == NaN.
    this.value = (countNominal > 0) ? (sum / countNominal) : 0.0;
    return value;
}
/**
 * Pretty hokey heuristic to try and set t2 distance automatically based on
 * standard deviation
 *
 * @param trainingBatch the training instances
 * @throws Exception if a problem occurs
 */
protected void setT2T1BasedOnStdDev(Instances trainingBatch) throws Exception {
    double sum = 0;
    int numAtts = trainingBatch.numAttributes();
    for (int att = 0; att < numAtts; att++) {
        if (trainingBatch.attribute(att).isNominal()) {
            // fixed contribution for nominal attributes
            sum += 0.25;
            continue;
        }
        if (!trainingBatch.attribute(att).isNumeric()) {
            continue;
        }
        AttributeStats stats = trainingBatch.attributeStats(att);
        // need at least 3 non-missing values for a meaningful std dev
        if (trainingBatch.numInstances() - stats.missingCount <= 2) {
            continue;
        }
        double stdDev = stats.numericStats.stdDev;
        double range = stats.numericStats.max - stats.numericStats.min;
        if (!Utils.isMissingValue(stdDev) && range > 0) {
            // normalize the std dev by the attribute's observed range
            sum += 0.5 * stdDev / range;
        }
    }
    sum = Math.sqrt(sum);
    if (sum > 0) {
        m_t2 = sum;
    }
}
/**
 * Tells the panel to use a new set of instances.
 *
 * @param inst a set of Instances
 */
public void setInstances(Instances inst) {
    m_Instances = inst;
    m_AttributeStats = new AttributeStats[inst.numAttributes()];

    // reset all derived displays until an attribute is selected
    m_AttributeNameLab.setText(NO_SOURCE);
    m_AttributeTypeLab.setText(NO_SOURCE);
    m_MissingLab.setText(NO_SOURCE);
    m_UniqueLab.setText(NO_SOURCE);
    m_DistinctLab.setText(NO_SOURCE);
    m_StatsTable.setModel(new DefaultTableModel());

    m_allEqualWeights = true;
    if (m_Instances.numInstances() == 0) {
        return;
    }
    // weights are "all equal" iff every instance matches the first one's
    double firstWeight = m_Instances.instance(0).weight();
    for (int idx = 1; idx < m_Instances.numInstances(); idx++) {
        if (m_Instances.instance(idx).weight() != firstWeight) {
            m_allEqualWeights = false;
            break;
        }
    }
}
public void testTypical() { Instances result = useFilter(); // Number of attributes shouldn't change assertEquals(m_Instances.numAttributes(), result.numAttributes()); // Number of instances may change (if an instance has all missing values) // assertEquals(m_Instances.numInstances(), result.numInstances()); for (int j = 0; j < result.numAttributes(); j++) { if (j == m_Instances.classIndex() && m_Instances.attribute(j).isNumeric() == false) { continue; } AttributeStats currentStats = m_Instances.attributeStats(j); if (currentStats.distinctCount < 2) { continue; } assertTrue("All missing values except for those in nonnumeric class " + "attributes should be replaced.", result.attributeStats(j).missingCount == 0); } }
/**
 * Tells the panel to use a new set of instances.
 *
 * @param inst a set of Instances
 */
public void setInstances(Instances inst) {
    m_Instances = inst;
    m_AttributeStats = new AttributeStats [inst.numAttributes()];

    // reset all derived displays until an attribute is selected
    m_AttributeNameLab.setText(NO_SOURCE);
    m_AttributeTypeLab.setText(NO_SOURCE);
    m_MissingLab.setText(NO_SOURCE);
    m_UniqueLab.setText(NO_SOURCE);
    m_DistinctLab.setText(NO_SOURCE);
    m_StatsTable.setModel(new DefaultTableModel());

    m_allEqualWeights = true;
    // Guard against an empty dataset: instance(0) below would otherwise
    // fail when the dataset contains no instances.
    if (m_Instances.numInstances() == 0) {
        return;
    }
    double w = m_Instances.instance(0).weight();
    for (int i = 1; i < m_Instances.numInstances(); i++) {
        if (m_Instances.instance(i).weight() != w) {
            m_allEqualWeights = false;
            break;
        }
    }
}
private void testDistributionSpread_X(double factor) throws Exception { AttributeStats origs = m_Instances.attributeStats(1); assertNotNull(origs.nominalCounts); ((SpreadSubsample)m_Filter).setDistributionSpread(factor); Instances result = useFilter(); assertEquals(m_Instances.numAttributes(), result.numAttributes()); AttributeStats outs = result.attributeStats(1); // Check distributions are pretty similar assertNotNull(outs.nominalCounts); assertEquals(origs.nominalCounts.length, outs.nominalCounts.length); int min = outs.nominalCounts[0]; int max = outs.nominalCounts[0]; for (int i = 1; i < outs.nominalCounts.length; i++) { if (outs.nominalCounts[i] < min) { min = outs.nominalCounts[i]; } if (outs.nominalCounts[i] > max) { max = outs.nominalCounts[i]; } } assertTrue(max / factor <= min); }
public void testNoBias() throws Exception { m_Instances.setClassIndex(1); AttributeStats origs = m_Instances.attributeStats(1); assertNotNull(origs.nominalCounts); Instances result = useFilter(); assertEquals(m_Instances.numAttributes(), result.numAttributes()); AttributeStats outs = result.attributeStats(1); // Check distributions are pretty similar assertNotNull(outs.nominalCounts); assertEquals(origs.nominalCounts.length, outs.nominalCounts.length); for (int i = 0; i < origs.nominalCounts.length; i++) { int est = origs.nominalCounts[i] / 2 - 1; assertTrue("Counts for value:" + i + " orig:" + origs.nominalCounts[i] + " out50%:" + outs.nominalCounts[i], (est <= outs.nominalCounts[i]) && (outs.nominalCounts[i] <= (est + 3))); } }
public void testBiasToUniform() throws Exception { m_Instances.setClassIndex(1); AttributeStats origs = m_Instances.attributeStats(1); assertNotNull(origs.nominalCounts); ((Resample)m_Filter).setBiasToUniformClass(1.0); Instances result = useFilter(); assertEquals(m_Instances.numAttributes(), result.numAttributes()); AttributeStats outs = result.attributeStats(1); // Check distributions are pretty similar assertNotNull(outs.nominalCounts); assertEquals(origs.nominalCounts.length, outs.nominalCounts.length); int est = (origs.totalCount - origs.missingCount) / origs.distinctCount; est = est / 2 - 1; for (int i = 0; i < origs.nominalCounts.length; i++) { assertTrue("Counts for value:" + i + " orig:" + origs.nominalCounts[i] + " out50%:" + outs.nominalCounts[i] + " ~wanted:" + est, (est <= outs.nominalCounts[i]) && (outs.nominalCounts[i] <= (est + 3))); } }
/**
 * Calculate metric value
 *
 * @param mlData Multi-label dataset to which calculate the metric
 * @return Value of the metric
 */
public double calculate(MultiLabelInstances mlData){
    Instances instances = mlData.getDataSet();
    int nLabels = mlData.getNumLabels();
    int [] labelIndices = mlData.getLabelIndices();

    // entropy of each label attribute (0.0 when nominalCounts is absent)
    double [] entropies = new double[nLabels];
    for(int i = 0; i < nLabels; i++){
        AttributeStats attStats = instances.attributeStats(labelIndices[i]);
        if(attStats.nominalCounts != null){
            entropies[i] = Utils.entropy(attStats.nominalCounts);
        }
    }

    // select the smallest entropy among all labels
    double minEntropy = Double.MAX_VALUE;
    for(double entropy : entropies){
        if(entropy < minEntropy){
            minEntropy = entropy;
        }
    }

    this.value = minEntropy;
    return value;
}
/**
 * Calculates the metric value: the maximum entropy among all label
 * attributes of the dataset.
 *
 * @param mlData Multi-label dataset for which to calculate the metric
 * @return Value of the metric
 */
public double calculate(MultiLabelInstances mlData){
    Instances instances = mlData.getDataSet();
    int nLabels = mlData.getNumLabels();
    int [] labels = mlData.getLabelIndices();

    // entropy of each label attribute (0.0 when nominalCounts is absent)
    double [] entropies = new double[nLabels];
    for(int i = 0; i < nLabels; i++){
        AttributeStats attStats = instances.attributeStats(labels[i]);
        if(attStats.nominalCounts != null){
            entropies[i] = Utils.entropy(attStats.nominalCounts);
        }
    }

    // BUG FIX: Double.MIN_VALUE is the smallest POSITIVE double, not the
    // most negative one, so seeding the maximum with it returned MIN_VALUE
    // instead of 0.0 when every label had zero entropy. Seed with
    // NEGATIVE_INFINITY so any real entropy (>= 0) replaces it.
    double maxEntropy = Double.NEGATIVE_INFINITY;
    for(double e : entropies){
        if(e > maxEntropy){
            maxEntropy = e;
        }
    }

    this.value = maxEntropy;
    return value;
}
/**
 * Calculate metric value
 *
 * @param mlData Multi-label dataset to which calculate the metric
 * @return Value of the metric
 */
public double calculate(MultiLabelInstances mlData){
    Instances instances = mlData.getDataSet();
    int nLabels = mlData.getNumLabels();
    int [] labelIndices = mlData.getLabelIndices();

    // accumulate the entropy of each label attribute as it is computed
    double sum = 0.0;
    double [] entropies = new double[nLabels];
    for(int i = 0; i < nLabels; i++){
        AttributeStats attStats = instances.attributeStats(labelIndices[i]);
        if(attStats.nominalCounts != null){
            entropies[i] = Utils.entropy(attStats.nominalCounts);
        }
        sum += entropies[i];
    }

    // mean entropy over all labels
    this.value = sum / entropies.length;
    return value;
}
/**
 * Command-line driver: compares the incremental quantile estimate against
 * the exact quantile computed by sorting.
 *
 * Expects three arguments: an ARFF file, the quantile to estimate (0-1),
 * and a 1-based attribute index.
 *
 * @param args the command-line arguments
 */
public static void main(String[] args) {
    try {
        weka.core.Instances inst;
        // try-with-resources so the file handle is always released
        // (the original leaked the FileReader)
        try (java.io.Reader reader =
            new java.io.BufferedReader(new java.io.FileReader(args[0]))) {
            inst = new weka.core.Instances(reader);
        }
        double quantile = Double.parseDouble(args[1]);
        IncrementalQuantileEstimator ps = new IncrementalQuantileEstimator(quantile);
        int attIndex = Integer.parseInt(args[2]) - 1;

        // feed all non-missing values to the incremental estimator
        for (int i = 0; i < inst.numInstances(); i++) {
            if (!inst.instance(i).isMissing(attIndex)) {
                ps.add(inst.instance(i).value(attIndex));
            }
        }
        System.err.println("Estimated quantile (" + quantile + ") " + ps.getQuantile());

        // compute the exact quantile by sorting on the attribute
        inst.sort(attIndex);
        double actualQuant = 0;
        AttributeStats as = inst.attributeStats(attIndex);
        double pIndex = quantile * (inst.numInstances() - as.missingCount);
        double mean = as.numericStats.mean;
        if (pIndex - (int) pIndex > 0) {
            // fractional rank: take the value at the truncated index
            pIndex = (int) pIndex;
            actualQuant = inst.instance((int) pIndex).value(attIndex);
        } else {
            // integral rank: average the two neighbouring values
            double f = inst.instance((int) pIndex - 1).value(attIndex);
            double s = inst.instance((int) pIndex).value(attIndex);
            actualQuant = (f + s) / 2.0;
        }
        System.err.println("Actual quantile (" + quantile + ") " + actualQuant);
        System.err.println("Mean: " + mean);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
/**
 * Update attribute stats using the supplied instance.
 *
 * @param updateInstance the instance for updating
 * @param delete true if the values of the supplied instance are to be
 *          removed from the statistics
 */
protected void updateStats(Instance updateInstance, boolean delete) {
    // Lazily allocate one AttributeStats holder per attribute on first use.
    if (m_attStats == null) {
        m_attStats = new AttributeStats[m_numAttributes];
        for (int i = 0; i < m_numAttributes; i++) {
            m_attStats[i] = new AttributeStats();
            if (m_clusterInstances.attribute(i).isNominal()) {
                // nominal attributes track per-value counts
                m_attStats[i].nominalCounts = new int[m_clusterInstances.attribute(
                    i).numValues()];
            } else {
                // numeric attributes track running summary statistics
                m_attStats[i].numericStats = new Stats();
            }
        }
    }
    for (int i = 0; i < m_numAttributes; i++) {
        if (!updateInstance.isMissing(i)) {
            double value = updateInstance.value(i);
            if (m_clusterInstances.attribute(i).isNominal()) {
                // the instance weight is added on insert and subtracted on
                // delete, both for the observed value and the total
                m_attStats[i].nominalCounts[(int) value] += (delete) ? (-1.0 * updateInstance
                    .weight()) : updateInstance.weight();
                m_attStats[i].totalCount += (delete) ? (-1.0 * updateInstance
                    .weight()) : updateInstance.weight();
            } else {
                if (delete) {
                    m_attStats[i].numericStats.subtract(value, updateInstance.weight());
                } else {
                    m_attStats[i].numericStats.add(value, updateInstance.weight());
                }
            }
        }
    }
    // keep the (weighted) total number of instances in sync with the stats
    m_totalInstances += (delete) ? (-1.0 * updateInstance.weight())
        : (updateInstance.weight());
}
/** * Sets the instances for use * * @param newins a set of Instances */ public void setInstances(Instances newins) { m_attribIndex = 0; m_as = null; m_data = new Instances(newins); if (m_colorAttrib != null) { m_colorAttrib.removeAllItems(); m_colorAttrib.addItem("No class"); for (int i = 0; i < m_data.numAttributes(); i++) { String type = "(" + Attribute.typeToStringShort(m_data.attribute(i)) + ")"; m_colorAttrib.addItem(new String("Class: " + m_data.attribute(i).name() + " " + type)); } if (m_data.classIndex() >= 0) { m_colorAttrib.setSelectedIndex(m_data.classIndex() + 1); } else { m_colorAttrib.setSelectedIndex(m_data.numAttributes()); } // if (m_data.classIndex() >= 0) { // m_colorAttrib.setSelectedIndex(m_data.classIndex()); // } } if (m_data.classIndex() >= 0) { m_classIndex = m_data.classIndex(); } else { m_classIndex = m_data.numAttributes() - 1; } m_asCache = new AttributeStats[m_data.numAttributes()]; }
/**
 * Sets the gui elements for fields that are stored in the AttributeStats
 * structure.
 *
 * @param index the index of the attribute
 */
protected void setDerived(int index) {
    AttributeStats as = m_AttributeStats[index];

    // missing values, shown as count plus percentage of all values
    long pct = Math.round(100.0 * as.missingCount / as.totalCount);
    m_MissingLab.setText("" + as.missingCount + " (" + pct + "%)");

    // unique values, shown the same way
    pct = Math.round(100.0 * as.uniqueCount / as.totalCount);
    m_UniqueLab.setText("" + as.uniqueCount + " (" + pct + "%)");

    m_DistinctLab.setText("" + as.distinctCount);
    setTable(as, index);
}
/** * builds the classifier * * @throws Exception if something goes wrong */ @Override protected void build() throws Exception { AttributeStats stats; int i; // determine class distribution m_ClassDistribution = new double[2]; stats = m_Trainset.attributeStats(m_Trainset.classIndex()); for (i = 0; i < 2; i++) m_ClassDistribution[i] = stats.nominalCounts[i] / stats.totalCount; // the number of instances added to the training set in each iteration m_InstancesPerIteration = (double) m_Testset.numInstances() / getFolds(); if (getDebug()) System.out.println("InstancesPerIteration: " + m_InstancesPerIteration); // build classifier m_Random = new Random(getSeed()); for (i = 0; i <= getFolds(); i++) { if (getVerbose() || getDebug()) { if (getCutOff() > 0) System.out.println( "\nFold " + i + "/" + getFolds() + " (CutOff at " + getCutOff() + ")"); else System.out.println("\nFold " + i + "/" + getFolds()); } buildTrainSet(i); buildClassifier(); // cutoff of folds reached? if ( (i > 0) && (i == getCutOff()) ) break; } }
/**
 * sets the class probabilities based on the given data
 *
 * @param data the data to get the class probabilities from
 */
public void setClassProbabilities(Instances data) {
    AttributeStats stats = data.attributeStats(data.classIndex());
    // total number of non-missing class values
    int total = Utils.sum(stats.nominalCounts);

    // relative frequency of each class label
    m_ClassProbs = new double[data.classAttribute().numValues()];
    for (int i = 0; i < m_ClassProbs.length; i++) {
        m_ClassProbs[i] = (double) stats.nominalCounts[i] / (double) total;
    }
}
/** * randomly initializes the class labels in the given set according to the * class distribution in the training set * @param train the training instances to retrieve the class * distribution from * @param instances the instances to initialize * @param from the first instance to initialize * @param count the number of instances to initialize * @return the initialize instances * @throws Exception if something goes wrong */ public Instances initializeLabels( Instances train, Instances instances, int from, int count ) throws Exception { int i; AttributeStats stats; Attribute classAttr; double percentage; // reset flip count m_FlippedLabels = 0; // explicitly set labels to "missing" for (i = from; i < from + count; i++) instances.instance(i).setClassMissing(); // determining the percentage of the first class stats = train.attributeStats(train.classIndex()); percentage = (double) stats.nominalCounts[0] / (double) stats.totalCount; // set lables classAttr = instances.attribute(instances.classIndex()); for (i = from; i < from + count; i++) { // random class if (m_Random.nextDouble() < percentage) instances.instance(i).setClassValue(classAttr.value(0)); else instances.instance(i).setClassValue(classAttr.value(1)); } return instances; }
/**
 * Update attribute stats using the supplied instance.
 *
 * @param updateInstance the instance for updating
 * @param delete true if the values of the supplied instance are
 * to be removed from the statistics
 */
protected void updateStats(Instance updateInstance, boolean delete) {
    // First call: allocate per-attribute statistics holders.
    if (m_attStats == null) {
        m_attStats = new AttributeStats[m_numAttributes];
        for (int i = 0; i < m_numAttributes; i++) {
            m_attStats[i] = new AttributeStats();
            if (m_clusterInstances.attribute(i).isNominal()) {
                // one counter per possible nominal value
                m_attStats[i].nominalCounts = new int [m_clusterInstances.attribute(i).numValues()];
            } else {
                // running summary statistics for numeric attributes
                m_attStats[i].numericStats = new Stats();
            }
        }
    }
    for (int i = 0; i < m_numAttributes; i++) {
        if (!updateInstance.isMissing(i)) {
            double value = updateInstance.value(i);
            if (m_clusterInstances.attribute(i).isNominal()) {
                // weight is added on insert and subtracted on delete
                m_attStats[i].nominalCounts[(int)value] += (delete) ? (-1.0 * updateInstance.weight()) : updateInstance.weight();
                m_attStats[i].totalCount += (delete) ? (-1.0 * updateInstance.weight()) : updateInstance.weight();
            } else {
                if (delete) {
                    m_attStats[i].numericStats.subtract(value, updateInstance.weight());
                } else {
                    m_attStats[i].numericStats.add(value, updateInstance.weight());
                }
            }
        }
    }
    // keep the weighted instance total in sync with the stats
    m_totalInstances += (delete) ? (-1.0 * updateInstance.weight()) : (updateInstance.weight());
}
/** * Sets the instances for use * * @param newins a set of Instances */ public void setInstances(Instances newins) { m_attribIndex = 0; m_as = null; m_data = new Instances(newins); if(m_colorAttrib!=null) { m_colorAttrib.removeAllItems(); m_colorAttrib.addItem("No class"); for(int i=0; i<m_data.numAttributes(); i++) { String type = "(" + Attribute.typeToStringShort(m_data.attribute(i)) + ")"; m_colorAttrib.addItem(new String("Class: " + m_data.attribute(i).name() + " " + type)); } if (m_data.classIndex() >= 0) { m_colorAttrib.setSelectedIndex(m_data.classIndex() + 1); } else { m_colorAttrib.setSelectedIndex(m_data.numAttributes()); } //if (m_data.classIndex() >= 0) { // m_colorAttrib.setSelectedIndex(m_data.classIndex()); //} } if (m_data.classIndex() >= 0) { m_classIndex = m_data.classIndex(); } else { m_classIndex = m_data.numAttributes()-1; } m_asCache = new AttributeStats[m_data.numAttributes()]; }
/** * Signify that this batch of input to the filter is finished. * If the filter requires all instances prior to filtering, * output() may now be called to retrieve the filtered instances. * * @return true if there are instances pending output * @throws IllegalStateException if no input structure has been defined */ public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_attStats == null) { Instances input = getInputFormat(); m_attStats = new AttributeStats [input.numAttributes()]; for (int i = 0; i < input.numAttributes(); i++) { if (input.attribute(i).isNumeric() && (input.classIndex() != i)) { m_attStats[i] = input.attributeStats(i); } } // Convert pending input instances for(int i = 0; i < input.numInstances(); i++) { convertInstance(input.instance(i)); } } // Free memory flushInput(); m_NewBatch = true; return (numPendingOutput() != 0); }
/** * Signify that this batch of input to the filter is finished. If the filter * requires all instances prior to filtering, output() may now be called to * retrieve the filtered instances. * * @return true if there are instances pending output * @throws IllegalStateException if no input structure has been defined */ @Override public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_attStats == null) { Instances input = getInputFormat(); m_attStats = new AttributeStats[input.numAttributes()]; for (int i = 0; i < input.numAttributes(); i++) { if (input.attribute(i).isNumeric() && (input.classIndex() != i)) { m_attStats[i] = input.attributeStats(i); } } // Convert pending input instances for (int i = 0; i < input.numInstances(); i++) { convertInstance(input.instance(i)); } } // Free memory flushInput(); m_NewBatch = true; return (numPendingOutput() != 0); }
/**
 * Tells the panel to use a new set of instances.
 *
 * @param inst a set of Instances
 */
public void setInstances(Instances inst) {
    m_Instances = inst;
    // one (lazily filled) stats slot per attribute
    m_AttributeStats = new AttributeStats[inst.numAttributes()];

    // blank out every derived display until an attribute is selected
    m_AttributeTypeLab.setText(NO_SOURCE);
    m_AttributeNameLab.setText(NO_SOURCE);
    m_DistinctLab.setText(NO_SOURCE);
    m_UniqueLab.setText(NO_SOURCE);
    m_MissingLab.setText(NO_SOURCE);
    m_StatsTable.setModel(new DefaultTableModel());
}
/**
 * Update attribute stats using the supplied instance.
 *
 * @param updateInstance the instance for updating
 * @param delete true if the values of the supplied instance are
 * to be removed from the statistics
 */
protected void updateStats(Instance updateInstance, boolean delete) {
    // Allocate the per-attribute stats holders on the first call.
    if (m_attStats == null) {
        m_attStats = new AttributeStats[m_numAttributes];
        for (int i = 0; i < m_numAttributes; i++) {
            m_attStats[i] = new AttributeStats();
            if (m_clusterInstances.attribute(i).isNominal()) {
                // per-value counters for nominal attributes
                m_attStats[i].nominalCounts = new int[m_clusterInstances.attribute(i).numValues()];
            } else {
                // running summary statistics for numeric attributes
                m_attStats[i].numericStats = new Stats();
            }
        }
    }
    for (int i = 0; i < m_numAttributes; i++) {
        if (!updateInstance.isMissing(i)) {
            double value = updateInstance.value(i);
            if (m_clusterInstances.attribute(i).isNominal()) {
                // instance weight is added on insert, subtracted on delete
                m_attStats[i].nominalCounts[(int) value] += (delete) ? (-1.0 * updateInstance.weight()) : updateInstance.weight();
                m_attStats[i].totalCount += (delete) ? (-1.0 * updateInstance.weight()) : updateInstance.weight();
            } else {
                if (delete) {
                    m_attStats[i].numericStats.subtract(value, updateInstance.weight());
                } else {
                    m_attStats[i].numericStats.add(value, updateInstance.weight());
                }
            }
        }
    }
    // keep the (weighted) total number of instances in sync with the stats
    m_totalInstances += (delete) ? (-1.0 * updateInstance.weight()) : (updateInstance.weight());
}
/**
 * Removes columns that are all missing from the data
 *
 * @param instances the instances
 * @return a new set of instances with all missing columns removed
 * @throws Exception if something goes wrong
 */
protected Instances removeMissingColumns(Instances instances) throws Exception {
    int numInstances = instances.numInstances();
    // 1-based attribute indices to delete, built as a Remove range string
    StringBuffer deleteString = new StringBuffer();
    int removeCount = 0;
    boolean first = true;
    // largest single-value count observed over all attributes
    int maxCount = 0;

    for (int i = 0; i < instances.numAttributes(); i++) {
        AttributeStats as = instances.attributeStats(i);
        if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {
            // see if we can decrease this by looking for the most frequent value
            // NOTE(review): nominalCounts is null for numeric attributes - this
            // assumes all attributes are nominal; confirm callers guarantee it.
            int[] counts = as.nominalCounts;
            if (counts[Utils.maxIndex(counts)] > maxCount) {
                maxCount = counts[Utils.maxIndex(counts)];
            }
        }
        if (as.missingCount == numInstances) {
            // attribute is missing in every instance - schedule for removal
            if (first) {
                deleteString.append((i + 1));
                first = false;
            } else {
                deleteString.append("," + (i + 1));
            }
            removeCount++;
        }
    }
    if (m_verbose) {
        System.err.println("Removed : " + removeCount
            + " columns with all missing " + "values.");
    }
    // tighten the upper bound on minimum support to the largest observed
    // relative value frequency
    if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {
        m_upperBoundMinSupport = (double) maxCount / (double) numInstances;
        if (m_verbose) {
            System.err.println("Setting upper bound min support to : "
                + m_upperBoundMinSupport);
        }
    }

    if (deleteString.toString().length() > 0) {
        // actually delete the scheduled all-missing columns
        Remove af = new Remove();
        af.setAttributeIndices(deleteString.toString());
        af.setInvertSelection(false);
        af.setInputFormat(instances);
        Instances newInst = Filter.useFilter(instances, af);

        return newInst;
    }
    return instances;
}
/**
 * Signify that this batch of input to the filter is finished.
 *
 * @return true if there are instances pending output
 * @throws Exception if no input format defined
 */
@Override
public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
        throw new IllegalStateException("No input instance format defined");
    }

    if (m_removeFilter == null) {
        // establish attributes to remove from first batch
        Instances toFilter = getInputFormat();
        int[] attsToDelete = new int[toFilter.numAttributes()];
        int numToDelete = 0;
        for (int i = 0; i < toFilter.numAttributes(); i++) {
            if (i == toFilter.classIndex()) {
                continue; // skip class
            }
            AttributeStats stats = toFilter.attributeStats(i);
            if (stats.missingCount == toFilter.numInstances()) {
                // attribute is missing in every instance
                attsToDelete[numToDelete++] = i;
            } else if (stats.distinctCount < 2) {
                // remove constant attributes
                attsToDelete[numToDelete++] = i;
            } else if (toFilter.attribute(i).isNominal()) {
                // remove nominal attributes that vary too much
                double variancePercent = (double) stats.distinctCount
                    / (double) (stats.totalCount - stats.missingCount) * 100.0;
                if (variancePercent > m_maxVariancePercentage) {
                    attsToDelete[numToDelete++] = i;
                }
            }
        }

        // shrink the index array to the number actually scheduled
        int[] finalAttsToDelete = new int[numToDelete];
        System.arraycopy(attsToDelete, 0, finalAttsToDelete, 0, numToDelete);

        // configure and prime the internal Remove filter with the first batch
        m_removeFilter = new Remove();
        m_removeFilter.setAttributeIndicesArray(finalAttsToDelete);
        m_removeFilter.setInvertSelection(false);
        m_removeFilter.setInputFormat(toFilter);

        for (int i = 0; i < toFilter.numInstances(); i++) {
            m_removeFilter.input(toFilter.instance(i));
        }
        m_removeFilter.batchFinished();

        Instance processed;
        Instances outputDataset = m_removeFilter.getOutputFormat();

        // restore old relation name to hide attribute filter stamp
        outputDataset.setRelationName(toFilter.relationName());

        setOutputFormat(outputDataset);
        // push the already-filtered first batch onto this filter's output queue
        while ((processed = m_removeFilter.output()) != null) {
            processed.setDataset(outputDataset);
            push(processed);
        }
    }
    flushInput();

    m_NewBatch = true;
    return (numPendingOutput() != 0);
}
/**
 * determines the values to retain, it is always at least 1 and up to the
 * maximum number of distinct values
 *
 * @param inst the Instances to determine the values from which are kept
 */
public void determineValues(Instances inst) {
    int i;
    AttributeStats stats;
    int attIdx;
    int min;
    int max;
    int count;

    m_AttIndex.setUpper(inst.numAttributes() - 1);
    attIdx = m_AttIndex.getIndex();

    // init names
    m_Values = new HashSet<String>();

    // number of values to retain
    stats = inst.attributeStats(attIdx);
    if (m_Invert) {
        // keep the complement of the requested number of values
        count = stats.nominalCounts.length - m_NumValues;
    } else {
        count = m_NumValues;
    }

    // out of bounds? -> fix
    if (count < 1) {
        count = 1; // at least one value!
    }
    if (count > stats.nominalCounts.length) {
        count = stats.nominalCounts.length; // at max the existing values
    }

    // determine min/max occurences
    // NOTE: this sorts stats.nominalCounts in place, destroying the
    // value <-> count correspondence; stats is deliberately re-fetched
    // further below for exactly that reason.
    Arrays.sort(stats.nominalCounts);
    if (m_LeastValues) {
        // the "count" smallest occurrence counts
        min = stats.nominalCounts[0];
        max = stats.nominalCounts[count - 1];
    } else {
        // the "count" largest occurrence counts
        min = stats.nominalCounts[(stats.nominalCounts.length - 1) - count + 1];
        max = stats.nominalCounts[stats.nominalCounts.length - 1];
    }

    // add values if they are inside min/max (incl. borders) and not more than
    // count
    stats = inst.attributeStats(attIdx);
    for (i = 0; i < stats.nominalCounts.length; i++) {
        if ((stats.nominalCounts[i] >= min) && (stats.nominalCounts[i] <= max)
            && (m_Values.size() < count)) {
            m_Values.add(inst.attribute(attIdx).value(i));
        }
    }
}
/** * here initialization and building, possible iterations will happen * * @throws Exception if something goes wrong */ @Override protected void build() throws Exception { AttributeStats stats; int i; // determine number of features to be selected m_KValue = getNumFeatures(); if (m_KValue < 1) m_KValue = (int) Utils.log2(m_Trainset.numAttributes()) + 1; // determine class distribution m_ClassDistribution = new double[2]; stats = m_Trainset.attributeStats(m_Trainset.classIndex()); for (i = 0; i < 2; i++) { if (stats.totalCount > 0) m_ClassDistribution[i] = stats.nominalCounts[i] / stats.totalCount; else m_ClassDistribution[i] = 0; } // the number of instances added to the training set in each iteration m_InstancesPerIteration = (double) m_Testset.numInstances() / getFolds(); if (getDebug()) System.out.println("InstancesPerIteration: " + m_InstancesPerIteration); // build list of sorted test instances m_List = new RankedList(m_Testset, m_ClassDistribution); // build classifier m_Random = new Random(getSeed()); for (i = 0; i <= getFolds(); i++) { if (getVerbose()) { if (getCutOff() > 0) System.out.println( "\nFold " + i + "/" + getFolds() + " (CutOff at " + getCutOff() + ")"); else System.out.println("\nFold " + i + "/" + getFolds()); } buildTrainSet(i); buildClassifier(); // cutoff of folds reached? if ( (i > 0) && (i == getCutOff()) ) break; } }