/** * Constructor by default. * * @param objectiveColumns The objetive colums * @param rows The rows * @param distanceFunction The distance function used to calculate the * similarity */ public AcumulativeDistanceContainer(Instances objectiveColumns, Instances rows, NormalizableDistance distanceFunction) { this.distanceFunction = distanceFunction; indexesChanges = new int[rows.numInstances()]; for (int i = 0; i < indexesChanges.length; i++) { indexesChanges[i] = i; } size = indexesChanges.length; acumulativeValue = new double[rows.numInstances()]; for (int i = 0; i < acumulativeValue.length; i++) { for (int j = 0; j < objectiveColumns.numInstances(); j++) { acumulativeValue[i] += distanceFunction.distance(rows.instance(i), objectiveColumns.instance(j)); } } }
/**
 * Creates the strategy with the given distance function. The strategy is
 * configured as non-maximal (lower utility values are preferred).
 *
 * @param typeOfDistance type of distance used
 */
public DensityDiversityQueryStrategy(NormalizableDistance typeOfDistance) {
  super();
  setMaximal(false);
  this.typeOfDistance = typeOfDistance;
}
/**
 * Utility function to examine the attribute ranges in a bunch of distance
 * functions and return a two instance dataset with the global mins/maxes of
 * numeric attributes set. This can be used to "prime" a distance function.
 *
 * @param distanceFuncs a list of distance functions (where each potentially
 *          has only seen part of the overall dataset)
 * @param headerNoSummary the header of the data that the distance functions
 *          have been seeing
 * @return a priming data set with global min and max values for numeric
 *         attributes
 * @throws DistributedWekaException if a problem occurs
 */
public static Instances computeDistancePrimingDataFromDistanceFunctions(
  List<NormalizableDistance> distanceFuncs, Instances headerNoSummary)
  throws DistributedWekaException {

  Instances prime = null;

  double[] mins = new double[headerNoSummary.numAttributes()];
  double[] maxes = new double[headerNoSummary.numAttributes()];

  try {
    for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
      if (headerNoSummary.attribute(i).isNumeric()) {
        mins[i] = Double.MAX_VALUE;
        // BUGFIX: Double.MIN_VALUE is the smallest *positive* double, so an
        // attribute whose values are all negative would never update the max.
        // -Double.MAX_VALUE is the true lower sentinel.
        maxes[i] = -Double.MAX_VALUE;
      } else {
        // non-numeric attributes have no meaningful range
        mins[i] = Utils.missingValue();
        maxes[i] = Utils.missingValue();
      }
    }

    // fold each partial distance function's ranges into the global min/max
    for (NormalizableDistance d : distanceFuncs) {
      double[][] ranges = d.getRanges();
      for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
        if (ranges[i][NormalizableDistance.R_MIN] < mins[i]) {
          mins[i] = ranges[i][NormalizableDistance.R_MIN];
        }
        if (ranges[i][NormalizableDistance.R_MAX] > maxes[i]) {
          maxes[i] = ranges[i][NormalizableDistance.R_MAX];
        }
      }
    }
  } catch (Exception ex) {
    throw new DistributedWekaException(ex);
  }

  // two-instance dataset: row 0 = global minimums, row 1 = global maximums
  prime = new Instances(headerNoSummary, 2);
  prime.add(new DenseInstance(1.0, mins));
  prime.add(new DenseInstance(1.0, maxes));

  return prime;
}
/**
 * Returns the widest dimension. The width of each dimension (for the points
 * inside the node) is normalized, if m_NormalizeDimWidths is set to true.
 *
 * @param nodeRanges The attributes' range of the points inside the node that
 *          is to be split.
 * @param universe The attributes' range for the whole point-space.
 * @return The index of the attribute/dimension in which the points of the
 *         node have widest spread, or -1 if no dimension has positive width.
 */
protected int widestDim(double[][] nodeRanges, double[][] universe) {
  final int classIdx = m_Instances.classIndex();
  double widest = 0.0;
  int w = -1;
  // Single loop (the original duplicated it per normalization mode); the
  // class attribute is never a candidate for splitting.
  for (int i = 0; i < nodeRanges.length; i++) {
    if (i == classIdx) {
      continue;
    }
    double width = nodeRanges[i][NormalizableDistance.R_WIDTH];
    if (m_NormalizeDimWidths) {
      // normalize by the width of this dimension over the whole point-space
      width /= universe[i][NormalizableDistance.R_WIDTH];
    }
    if (width > widest) {
      widest = width;
      w = i;
    }
  }
  return w;
}
/** * Make the final PreconstructedKMeans clusterer to wrap the centroids and * stats found during map-reduce. * * @param best the best result from the runs of k-means that were performed in * parallel * @param preprocess any pre-processing filters applied * @param initialStartingPoints the initial starting centroids * @param finalNumIterations the final number of iterations performed * @return a final clusterer object * @throws DistributedWekaException if a problem occurs */ protected Clusterer makeFinalClusterer(KMeansReduceTask best, Filter preprocess, Instances initialStartingPoints, int finalNumIterations) throws DistributedWekaException { Clusterer finalClusterer = null; PreconstructedKMeans finalKMeans = new PreconstructedKMeans(); // global priming data for the distance function (this will be in // the transformed space if we're using preprocessing filters) Instances globalPrimingData = best.getGlobalDistanceFunctionPrimingData(); NormalizableDistance dist = new EuclideanDistance(); dist.setInstances(globalPrimingData); finalKMeans.setClusterCentroids(best.getCentroidsForRun()); finalKMeans.setFinalNumberOfIterations(finalNumIterations + 1); if (initialStartingPoints != null) { finalKMeans.setInitialStartingPoints(initialStartingPoints); } try { finalKMeans.setDistanceFunction(dist); finalKMeans.setClusterStats(best.getAggregatedCentroidSummaries()); } catch (Exception e) { throw new DistributedWekaException(e); } if (!getInitWithRandomCentroids()) { finalKMeans.setInitializationMethod(new SelectedTag( SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION)); } finalKMeans.setDisplayStdDevs(getDisplayCentroidStdDevs()); finalClusterer = finalKMeans; if (preprocess != null) { PreconstructedFilteredClusterer fc = new PreconstructedFilteredClusterer(); fc.setFilter(preprocess); fc.setClusterer(finalKMeans); finalClusterer = fc; } return finalClusterer; }
/**
 * Get the distance function in use.
 *
 * @return the distance function
 */
public NormalizableDistance getDistanceFunction() {
  return m_distanceFunction;
}
/** * Initializes the final distance function using range information in the * distance functions of the individual Canopy clusterers. We use this * initialization when there is more than just a missing values filter being * used because, in this case, the min/max info in the global attribute * summary info is not applicable (i.e. filter(s) might transform or create * new attributes for which we don't have summary information for in the * global ARFF header). * * @param clist the list of individual Canopy clusterers * @param finalDistance the distance function to initialize * @throws Exception if a problem occurs */ protected void initFinalDistanceFunctionFiltersInPlay(List<Canopy> clist, NormalizableDistance finalDistance) throws Exception { Instances filteredStructure = new Instances(((ECanopy) clist.get(0)).getDistanceFunction() .getInstances(), 0); double[] globalMax = new double[filteredStructure.numAttributes()]; double[] globalMin = new double[filteredStructure.numAttributes()]; double[][] ranges = ((ECanopy) clist.get(0)).getDistanceFunction().getRanges(); for (int i = 0; i < filteredStructure.numAttributes(); i++) { globalMin[i] = ranges[i][NormalizableDistance.R_MIN]; globalMax[i] = ranges[i][NormalizableDistance.R_MAX]; } for (int i = 1; i < clist.size(); i++) { ECanopy currentC = ((ECanopy) clist.get(i)); ranges = currentC.getDistanceFunction().getRanges(); for (int k = 0; k < filteredStructure.numAttributes(); k++) { if (ranges[k][NormalizableDistance.R_MIN] < globalMin[k]) { globalMin[k] = ranges[k][NormalizableDistance.R_MIN]; } if (ranges[k][NormalizableDistance.R_MAX] > globalMax[k]) { globalMax[k] = ranges[k][NormalizableDistance.R_MAX]; } } } for (int i = 0; i < filteredStructure.numAttributes(); i++) { if (filteredStructure.attribute(i).isNominal()) { // doesn't matter for non-numeric globalMin[i] = Utils.missingValue(); globalMax[i] = Utils.missingValue(); } } filteredStructure.add(new DenseInstance(1.0, globalMin)); filteredStructure.add(new 
DenseInstance(1.0, globalMax)); finalDistance.setInstances(filteredStructure); }
/**
 * Creates the strategy with the given distance function. The strategy is
 * configured as non-maximal.
 *
 * @param typeOfDistance type of distance used
 */
public MultiLabelDensityDiversityQueryStrategy(NormalizableDistance typeOfDistance) {
  super();
  this.setMaximal(false);
  this.setTypeOfDistance(typeOfDistance);
}
/** * Constructor * * @param instances * The instances * @param distanceFunction * The distance function to be used * @param k * The number of k nearest neighbors */ public KNearestDistanceContainer(Instances instances, NormalizableDistance distanceFunction, int k) { super(instances, distanceFunction); accumulativeDistanceKNearest = new double[size]; kNearest = new HashSet[size]; this.k = k; // Compute the k-nearest neighbors of each instance for (int i = 0; i < size; i++) { ArrayList<Container> array = new ArrayList<Container>(k); kNearest[i] = new HashSet<Integer>(); for (int j = 0; j < size; j++) { if (i != j) array.add(new Container<Integer>(getDistance(i, j), j)); } OrderUtils.mergeSort(array, false); // Fill the distances of the k-nearest neighbors of i for (int pos = 0; pos < k; pos++) { accumulativeDistanceKNearest[i] += array.get(pos).getKey(); kNearest[i].add(Integer.parseInt(array.get(pos).getValue().toString())); } } }
/**
 * Constructor.
 *
 * @param initialSketch the initial starting point (typically one randomly
 *          chosen instance for the k-means|| algorithm)
 * @param distanceFunction the distance function to use
 * @param size the size of the reservoir (i.e. how many points to consider
 *          adding to the sketch at each iteration)
 * @param seed the seed for random number generation
 */
public CentroidSketch(Instances initialSketch,
  NormalizableDistance distanceFunction, int size, int seed) {
  m_currentSketch = initialSketch;
  m_distanceFunction = distanceFunction;
  m_size = size;
  m_seed = seed;
  // reservoir used to sample candidate points, weighted by distance
  m_weightedCenterSample = new WeightedReservoirSample(m_size, m_seed);
}
/**
 * Get the distance function used.
 *
 * @return The type of similarity (distance function).
 */
public NormalizableDistance getTypeOfDistance() {
  return typeOfDistance;
}
/**
 * Set the distance function to use.
 *
 * @param typeOfDistance
 *            The type of similarity. Used in Density Diversity.
 */
public void setTypeOfDistance(NormalizableDistance typeOfDistance) {
  this.typeOfDistance = typeOfDistance;
}
/**
 * Set the distance function to use. Kept private: the distance function is
 * fixed at construction time in this class.
 *
 * @param typeOfDistance
 *            The type of similarity. Used in Density Diversity.
 */
private void setTypeOfDistance(NormalizableDistance typeOfDistance) {
  this.typeOfDistance = typeOfDistance;
}
/**
 * Default constructor. Computes the pairwise distance between every pair of
 * instances into an in-memory triangular matrix (row i holds distances to
 * instances i+1..size-1), tracks the global min/max distance, and finally
 * scales the stored distances via {@link #scaleMinMax()}.
 *
 * @param instances
 *            The set of instances
 * @param distanceFunction
 *            The distance function used to calculate the distance among two
 *            instances
 */
public DistanceContainer(Instances instances, NormalizableDistance distanceFunction) {
  size = instances.numInstances();
  indexesChanges = new int[size];
  acumulativeValue = new double[size];
  numAttributes = instances.numAttributes();
  // Only size-1 rows are needed for the strict upper triangle
  int m = size - 1;
  distance = new double[m][];
  int temp;
  double valueTemp;
  // NOTE(review): Double.MIN_VALUE is the smallest *positive* double — if
  // every pairwise distance is exactly 0 the recorded max stays at
  // ~4.9e-324 rather than 0; confirm scaleMinMax() tolerates this.
  maxDistance = Double.MIN_VALUE;
  minDistance = Double.MAX_VALUE;
  for (int i = 0; i < m; ++i) {
    distance[i] = new double[size - i - 1];
    // In the beginning the index and the value are equal
    indexesChanges[i] = i;
    for (int j = i + 1; j < size; ++j) {
      // Column offset of instance j within row i of the triangular matrix
      temp = j - i - 1;
      valueTemp = distanceFunction.distance(instances.instance(i), instances.instance(j));
      if (valueTemp > maxDistance)
        maxDistance = valueTemp;
      if (valueTemp < minDistance)
        minDistance = valueTemp;
      setStoreDistance(i, temp, valueTemp);
      // accumulative distance
      // acumulativeValue[i] += valueTemp;
      // acumulativeValue[j] += valueTemp;
    }
  }
  indexesChanges[size - 1] = size - 1;
  // Normalize the stored distances using the observed min/max
  scaleMinMax();
}
/**
 * Constructor by default. Same pairwise-distance computation as the
 * two-argument constructor, but the triangular matrix may optionally be
 * backed by a file instead of main memory.
 *
 * @param instances
 *            dataset
 * @param distanceFunction
 *            The distance function used to calculate the distance
 * @param matrixOverFile
 *            Whether the matrix will be stored into a file
 * @throws java.lang.Exception
 *             The exception that will be launched
 */
public DistanceContainer(Instances instances, NormalizableDistance distanceFunction,
  boolean matrixOverFile) throws Exception {
  this.matrixOverFile = matrixOverFile;
  size = instances.numInstances();
  indexesChanges = new int[size];
  acumulativeValue = new double[size];
  numAttributes = instances.numAttributes();
  // Only size-1 rows are needed for the strict upper triangle
  int m = size - 1;
  if (matrixOverFile) {
    // file-backed storage for large datasets
    distanceMatrix = new Matrix(size, size, true);
  } else {
    distance = new double[m][];
  }
  // NOTE(review): Double.MIN_VALUE is the smallest *positive* double — if
  // every pairwise distance is exactly 0 the recorded max stays at
  // ~4.9e-324 rather than 0; confirm scaleMinMax() tolerates this.
  maxDistance = Double.MIN_VALUE;
  minDistance = Double.MAX_VALUE;
  int temp;
  double valueTemp;
  for (int i = 0; i < m; ++i) {
    if (!matrixOverFile) {
      distance[i] = new double[size - i - 1];
    }
    // In the beginning the index and the value are equal
    indexesChanges[i] = i;
    for (int j = i + 1; j < size; ++j) {
      // Column offset of instance j within row i of the triangular matrix
      temp = j - i - 1;
      valueTemp = distanceFunction.distance(instances.instance(i), instances.instance(j));
      if (valueTemp > maxDistance)
        maxDistance = valueTemp;
      if (valueTemp < minDistance)
        minDistance = valueTemp;
      setStoreDistance(i, temp, valueTemp);
      // acumulative distance
      // acumulativeValue[i] += valueTemp;
      // acumulativeValue[j] += valueTemp;
    }
  }
  indexesChanges[size - 1] = size - 1;
  // Normalize the stored distances using the observed min/max
  scaleMinMax();
}
/**
 * Get the distance function being used.
 *
 * @return the distance function
 */
public NormalizableDistance getDistanceFunction() {
  return m_distanceFunction;
}
/**
 * Set the distance function to use.
 *
 * @param distFunc the distance function to use
 */
public void setDistanceFunction(NormalizableDistance distFunc) {
  m_distanceFunction = distFunc;
}
/**
 * Get the distance function in use.
 *
 * @return the distance function in use
 */
public NormalizableDistance getDistanceFunction() {
  return m_distanceFunction;
}
/**
 * Constructor. Delegates entirely to the superclass, which builds the full
 * pairwise distance matrix (optionally file-backed).
 *
 * NOTE(review): unlike the (instances, distanceFunction, k) constructor,
 * this one does not initialize k, kNearest or accumulativeDistanceKNearest
 * — confirm that callers using this constructor set them elsewhere.
 *
 * @param instances
 *            The instances
 * @param distanceFunction
 *            The distance function to be used
 * @param matrixOverFile
 *            It indicates whether the matrix is stored over a file or the
 *            main memory
 * @throws Exception
 *             Launch an exception in case that an error occurs.
 */
public KNearestDistanceContainer(Instances instances,
  NormalizableDistance distanceFunction, boolean matrixOverFile) throws Exception {
  super(instances, distanceFunction, matrixOverFile);
}