public static void main(String[] args) {
    try {
        BookDecisionTree decisionTree = new BookDecisionTree("books.arff");
        J48 tree = decisionTree.performTraining();
        System.out.println(tree.toString());

        Instance testInstance = decisionTree.getTestInstance("Leather", "yes", "historical");
        int result = (int) tree.classifyInstance(testInstance);
        String results = decisionTree.trainingData.attribute(3).value(result);
        System.out.println("Test with: " + testInstance + " Result: " + results);

        testInstance = decisionTree.getTestInstance("Paperback", "no", "historical");
        result = (int) tree.classifyInstance(testInstance);
        results = decisionTree.trainingData.attribute(3).value(result);
        System.out.println("Test with: " + testInstance + " Result: " + results);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
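The performTraining() call above is not shown in this collection; a minimal sketch, assuming it simply fits a Weka J48 tree on the class's trainingData field (the "-C" option value is illustrative, not taken from the original):

// Hypothetical sketch of the performTraining() helper used in main above:
// builds a J48 decision tree on the trainingData field that getTestInstance
// also uses. The confidence-factor option is a placeholder.
private J48 performTraining() {
    J48 j48 = new J48();
    String[] options = {"-C", "0.01"};
    try {
        j48.setOptions(options);
        j48.buildClassifier(trainingData);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return j48;
}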
@Override
protected Tuple fetchNext() throws DbException, TransactionAbortedException {
    // Block, adding all of the child tuples to the buffer.
    while (child.hasNext()) {
        buffer.add(child.next());
    }

    // Do the impute, if we have not already.
    if (imputedInstances == null) {
        doImpute();
    }

    // We've imputed values, so we can actually return them now.
    if (nextTupleIndex < buffer.size()) {
        Tuple original = buffer.get(nextTupleIndex);
        Instance inst = imputedInstances.get(nextTupleIndex);
        Tuple imputed = new Tuple(original);
        mergeInstanceIntoTuple(inst, imputed);
        nextTupleIndex++;
        return imputed;
    } else {
        return null;
    }
}
public Main() {
    try {
        BufferedReader datafile;
        datafile = readDataFile("camping.txt");

        Instances data = new Instances(datafile);
        data.setClassIndex(data.numAttributes() - 1);

        Instances trainingData = new Instances(data, 0, 14);
        Instances testingData = new Instances(data, 14, 5);
        Evaluation evaluation = new Evaluation(trainingData);

        SMO smo = new SMO();
        // Build the model on the training split only, so the held-out
        // testing instances are not seen during training.
        smo.buildClassifier(trainingData);
        evaluation.evaluateModel(smo, testingData);
        System.out.println(evaluation.toSummaryString());

        // Test instance
        Instance instance = new DenseInstance(3);
        instance.setValue(data.attribute("age"), 78);
        instance.setValue(data.attribute("income"), 125700);
        instance.setValue(data.attribute("camps"), 1);
        instance.setDataset(data);
        System.out.println("The instance: " + instance);
        System.out.println(smo.classifyInstance(instance));
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
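The readDataFile("camping.txt") helper is likewise not shown; a minimal sketch, assuming it only wraps the data file in a BufferedReader for the Instances constructor:

// Hypothetical helper assumed by the constructor above: opens the data file
// and returns a reader that the Instances(Reader) constructor can consume.
public static BufferedReader readDataFile(String filename) {
    BufferedReader inputReader = null;
    try {
        inputReader = new BufferedReader(new FileReader(filename));
    } catch (FileNotFoundException ex) {
        System.err.println("File not found: " + filename);
    }
    return inputReader;
}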
private boolean isClassTheMajortiy(ArrayList<Instance> instances, double classification) {
    List<Instance> instancesList = new ArrayList<>(instances);
    TreeMap<Double, Double> classificationProbability = new TreeMap<>();
    Attribute classAttribute = instances.get(0).classAttribute();

    for (double i = 0; i < classAttribute.numValues(); i++) {
        int matchedClassCount = 0;
        for (Instance instance : instancesList) {
            if (instance.classValue() == i) {
                matchedClassCount++;
            }
        }
        classificationProbability.put(((double) matchedClassCount / (double) instancesList.size()), i);
    }

    return (classificationProbability.lastEntry().getValue() == classification);
}
public Instance createInstance(List<String> featureNames, I key) {
    double[] vals = new double[featureNames.size()];
    for (int i = 0; i < featureNames.size(); i++) {
        Feature<Object> f = this.featureValues.get(key).get(featureNames.get(i));
        if (f != null) {
            vals[i] = f.getDoubleValue();
        } else {
            Class<Object> type = features.get(featureNames.get(i));
            if (type.equals(Double.class) || type.equals(Float.class) || type.equals(Integer.class)) {
                vals[i] = Double.NaN;
            }
            if (type.equals(Boolean.class) || type.equals(String.class)) {
                vals[i] = 0;
            }
        }
    }
    return new DenseInstance(1.0, vals);
}
/**
 * testTransactionLookup
 */
@Test
public void testTransactionLookup() throws Exception {
    int txn_id_idx = FeatureExtractor.TXNID_ATTRIBUTE_IDX;
    assertEquals(workload.getTransactionCount(), data.numInstances());
    List<TransactionTrace> txns = new ArrayList<TransactionTrace>(workload.getTransactions());
    // System.err.println(StringUtil.join("\n", txns));
    // System.err.println();

    for (int i = 0, cnt = data.numInstances(); i < cnt; i++) {
        Instance inst = data.instance(i);
        assertNotNull(inst);

        String value = inst.stringValue(txn_id_idx);
        // System.err.println("VALUE: " + value);
        Long txn_id = Long.valueOf(value);
        assertNotNull(txn_id);

        TransactionTrace txn_trace = workload.getTransaction(txn_id);
        TransactionTrace expected = txns.get(i);
        // System.err.println("EXPECTED: " + expected.getTransactionId());
        // System.err.println("FOUND: " + txn_id);
        assertNotNull(String.format("[%05d] Failed to find txn #%d", i, txn_id), txn_trace);
        assertEquals(expected.getTransactionId(), txn_trace.getTransactionId());
    } // FOR
}
/**
 * testCreateMarkovAttributeSetFilter
 */
@Test
public void testCreateMarkovAttributeSetFilter() throws Exception {
    // Test that we can create a filter from a MarkovAttributeSet
    MarkovAttributeSet aset = new MarkovAttributeSet(data, FeatureUtil.getFeatureKeyPrefix(ParamArrayLengthFeature.class));
    assertEquals(CatalogUtil.getArrayProcParameters(catalog_proc).size(), aset.size());

    Filter filter = aset.createFilter(data);
    Instances newData = Filter.useFilter(data, filter);
    for (int i = 0, cnt = newData.numInstances(); i < cnt; i++) {
        Instance processed = newData.instance(i);
        // System.err.println(processed);
        assertEquals(aset.size(), processed.numAttributes());
    } // FOR
    assertEquals(data.numInstances(), newData.numInstances());
    // System.err.println("MarkovAttributeSet: " + aset);
}
/**
 * Create a new Tuple by extracting the values from the Instance inst and using the TupleDesc td
 * @param inst Instance
 * @param td TupleDesc
 * @return new Tuple
 */
public static Tuple instanceToTuple(Instance inst, TupleDesc td) {
    Tuple t = new Tuple(td);
    for (int i = 0; i < td.numFields(); i++) {
        double value = inst.value(i);
        Type type = td.getFieldType(i);
        Field field = null;
        if (type.equals(Type.INT_TYPE)) {
            field = new IntField((int) value);
        } else if (type.equals(Type.DOUBLE_TYPE)) {
            field = new DoubleField(value);
        } else if (type.equals(Type.STRING_TYPE)) {
            throw new UnsupportedOperationException();
            // field = new StringField(value);
        }
        t.setField(i, field);
    }
    return t;
}
public SentenceType classifySentence(Sentence sentence) {
    SpeechActsClassifier.Features features = speechActsClassifier.classifyFeatures(sentence);

    Instance inst = new DenseInstance(6);
    inst.setDataset(dataSet);
    inst.setValue(0, features.getSentenceLength());
    inst.setValue(1, features.getNumberOfNouns());
    inst.setValue(2, (features.isEndingInNounOrAdjective() ? 1 : 0));
    inst.setValue(3, (features.isBeginningInVerb() ? 1 : 0));
    inst.setValue(4, features.getCountOfWhMarkers());
    inst.setValue(5, Utils.missingValue());

    try {
        return SentenceType.valueOf(classifier.classifyInstance(inst));
    } catch (Exception e) {
        throw new RuntimeException("Can't classify");
    }
}
public QuestionType classifyQuestion(Sentence sentence) {
    if (!sentence.isQuestion()) {
        return QuestionType.NA;
    }

    QuestionTypeClassifier.Features features = questionTypeClassifier.classifyFeatures(sentence);

    Instance inst = new DenseInstance(5);
    inst.setDataset(dataSet);
    inst.setValue(0, features.getWhWord());
    inst.setValue(1, features.getWhWordPos());
    inst.setValue(2, features.getPosOfNext());
    inst.setValue(3, features.getRootPos());
    inst.setValue(4, Utils.missingValue());

    try {
        int ndx = (int) classifier.classifyInstance(inst);
        return QuestionType.valueOf(ndx);
    } catch (Exception e) {
        throw new RuntimeException("Not classified");
    }
}
protected List<FeatureWeight> getTopNegativeWekaFeaturesInReport(Instance reportInstance,
        HashMap<String, Integer> featureIndexMap, String[][] featureWeightTable, int topKwords) throws Exception {
    List<FeatureWeight> topNegativeFeatureList = new ArrayList<>();

    int iFeature = 0;
    double weight;
    FeatureWeight featureWeight;
    while (topNegativeFeatureList.size() < topKwords && iFeature < featureIndexMap.size()) {
        weight = Double.parseDouble(featureWeightTable[iFeature][1]);
        // reportID is the first att in reportInstance
        if (weight < 0 && reportInstance.value(featureIndexMap.get(featureWeightTable[iFeature][0]) + 1) == 1) {
            featureWeight = new FeatureWeight();
            featureWeight.setTerm(featureWeightTable[iFeature][0]);
            featureWeight.setWeight(weight);
            topNegativeFeatureList.add(featureWeight);
        }
        iFeature++;
    }

    return topNegativeFeatureList;
}
protected List<FeatureWeight> getTopPositiveWekaFeaturesInReport(Instance reportInstance,
        HashMap<String, Integer> featureIndexMap, String[][] featureWeightTable, int topKwords) throws Exception {
    List<FeatureWeight> topPositiveFeatureList = new ArrayList<>();

    int iFeature = 0;
    double weight;
    FeatureWeight featureWeight;
    while (topPositiveFeatureList.size() < topKwords && iFeature < featureIndexMap.size()) {
        weight = Double.parseDouble(featureWeightTable[iFeature][1]);
        // reportID is the first att in reportInstance
        if (weight > 0 && reportInstance.value(featureIndexMap.get(featureWeightTable[iFeature][0]) + 1) == 1) {
            featureWeight = new FeatureWeight();
            featureWeight.setTerm(featureWeightTable[iFeature][0]);
            featureWeight.setWeight(weight);
            topPositiveFeatureList.add(featureWeight);
        }
        iFeature++;
    }

    return topPositiveFeatureList;
}
public static Instance getInstanceObject(String[] instanceText, String[] globalFeatureVector, String docID,
        String classValue, Instances ds) throws Exception {
    FeatureVector instanceFeatureVector = getInstanceFeatureVector(instanceText, globalFeatureVector, docID);

    Instance instance = new Instance(globalFeatureVector.length + 2);
    instance.setDataset(ds);
    instance.setValue(0, docID);
    for (int i = 0; i < globalFeatureVector.length; i++) {
        double value = 0;
        if (instanceFeatureVector.m_FeatureVector[0].containsKey(i)) {
            value = instanceFeatureVector.m_FeatureVector[0].get(i);
        }
        instance.setValue(i + 1, value);
    }
    instance.setValue(globalFeatureVector.length + 1, classValue);

    return new SparseInstance(instance);
}
/**
 * Calculate support value of a given rule on the dataset
 *
 * @param dataset the dataset
 * @param bodySide left-side or BODY part of the rule
 * @return support value for the rule on the given dataset
 */
public double calculateSupport(Instances dataset, List<Term> bodySide) {
    Iterator<Instance> datasetIterator = dataset.iterator();
    int supportCount = 0;
    while (datasetIterator.hasNext()) {
        Instance anInstance = datasetIterator.next();
        if (instanceCoveredByTermsList(anInstance, bodySide)) {
            supportCount++;
        }
    }
    return !dataset.isEmpty() ? (double) supportCount / (double) dataset.size() : 0.0d;
}
/**
 * Calculate confidence value of a given rule on the dataset
 *
 * @param dataset the dataset
 * @param bodySide left-side or BODY part of the rule
 * @param HeadSide right-side or HEAD part of the rule
 * @return confidence value for the rule on the given dataset
 */
public double calcualteConfidence(Instances dataset, List<Term> bodySide, List<Term> HeadSide) {
    Iterator<Instance> datasetIterator = dataset.iterator();
    int confidenceCount = 0;
    int supportCount = 0;
    while (datasetIterator.hasNext()) {
        Instance anInstance = datasetIterator.next();
        if (instanceCoveredByTermsList(anInstance, bodySide)) {
            supportCount++;
            if (instanceCoveredByTermsList(anInstance, HeadSide)) {
                confidenceCount++;
            }
        }
    }
    // Guard against division by zero when no instance is covered by the BODY.
    return supportCount != 0 ? (double) confidenceCount / (double) supportCount : 0.0d;
}
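For reference, a hedged sketch of how the two measures above could be combined to filter candidate rules; the method itself, the thresholds, and the candidates list are illustrative, while getLeftSide()/getRightSdie() follow the Rule accessors used elsewhere in this class:

// Illustrative only: keep rules whose support and confidence on the dataset
// meet assumed minimum thresholds. Not part of the original code.
public List<Rule> filterRules(Instances dataset, List<Rule> candidates,
        double minSupport, double minConfidence) {
    List<Rule> kept = new ArrayList<>();
    for (Rule rule : candidates) {
        double support = calculateSupport(dataset, rule.getLeftSide());
        double confidence = calcualteConfidence(dataset, rule.getLeftSide(), rule.getRightSdie());
        if (support >= minSupport && confidence >= minConfidence) {
            kept.add(rule);
        }
    }
    return kept;
}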
public static DMatrix instancesToDMatrix(Instances instances) throws XGBoostError {
    long[] rowHeaders = new long[instances.size() + 1];
    rowHeaders[0] = 0;
    List<Float> dataList = new ArrayList<>();
    List<Integer> colList = new ArrayList<>();
    float[] labels = new float[instances.size()];

    for (int i = 0; i < instances.size(); i++) {
        Instance instance = instances.get(i);
        rowHeaders[i] = dataList.size();
        processInstance(instance, dataList, colList);
        labels[i] = (float) instance.classValue();
    }
    rowHeaders[rowHeaders.length - 1] = dataList.size();

    int colNum = instances.numAttributes() - 1;
    DMatrix dMatrix = createDMatrix(rowHeaders, dataList, colList, colNum);
    dMatrix.setLabel(labels);
    return dMatrix;
}
public static DMatrix instanceToDenseDMatrix(Instance instance) throws XGBoostError {
    Attribute classAttribute = instance.classAttribute();
    int classAttrIndex = classAttribute.index();

    int colNum = instance.numAttributes() - 1;
    int rowNum = 1;
    float[] data = new float[colNum * rowNum];

    Enumeration<Attribute> attributeEnumeration = instance.enumerateAttributes();
    int dataIndex = 0;
    while (attributeEnumeration.hasMoreElements()) {
        Attribute attribute = attributeEnumeration.nextElement();
        int attrIndex = attribute.index();
        if (attrIndex == classAttrIndex) {
            continue;
        }
        data[dataIndex] = (float) instance.value(attribute);
        dataIndex++;
    }

    return new DMatrix(data, rowNum, colNum);
}
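A hedged sketch of how these two conversion helpers might be wired into training and prediction, assuming xgboost4j's XGBoost.train(DMatrix, Map, int, Map, obj, eval) entry point and Booster.predict(DMatrix); the objective, learning rate, and round count are placeholders, and the helpers are assumed to be in scope:

// Illustrative usage of instancesToDMatrix / instanceToDenseDMatrix above.
// The xgboost4j calls and parameter values are assumptions, not from the source.
public static float scoreWithXGBoost(Instances trainData, Instance testInstance) throws XGBoostError {
    DMatrix trainMatrix = instancesToDMatrix(trainData);

    Map<String, Object> params = new HashMap<>();
    params.put("objective", "binary:logistic"); // placeholder objective
    params.put("eta", 0.1);                     // placeholder learning rate

    Map<String, DMatrix> watches = new HashMap<>();
    watches.put("train", trainMatrix);

    Booster booster = XGBoost.train(trainMatrix, params, 50, watches, null, null);

    float[][] predictions = booster.predict(instanceToDenseDMatrix(testInstance));
    return predictions[0][0];
}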
@TimeThis(task="prediction") protected void predictExamples(ProcessingContext<Corpus> ctx, Classifier classifier, IdentifiedInstances<Element> devSet, Corpus corpus) throws Exception { ElementClassifierResolvedObjects resObj = getResolvedObjects(); RelationDefinition relationDefinition = resObj.getRelationDefinition(); Evaluator examples = resObj.getExamples(); String predictedClassFeatureKey = getPredictedClassFeatureKey(); TargetStream evaluationFile = getEvaluationFile(); boolean withId = evaluationFile != null; String[] classes = getClasses(devSet); getLogger(ctx).info("predicting class for each example"); EvaluationContext evalCtx = new EvaluationContext(getLogger(ctx)); for (Element example : Iterators.loop(getExamples(corpus, examples, evalCtx))) { Instance inst = relationDefinition.addExample(devSet, evalCtx, example, withId, withId); double prediction = classifier.classifyInstance(inst); example.addFeature(predictedClassFeatureKey, classes[(int) prediction]); if (!withId) devSet.delete(); } }
private void mergeInstanceIntoTuple(Instance instance, Tuple tuple) throws DbException {
    // Merge the tuple and the instance. Only values in dropFields will
    // change; the other values of the tuple will be unchanged by the
    // imputation.
    Iterator<Entry<Integer, Integer>> indexIt = this.dropFieldsIndicesMap.entrySet().iterator();
    while (indexIt.hasNext()) {
        Entry<Integer, Integer> map = indexIt.next();
        int k = map.getKey();   // index in Instance
        int v = map.getValue(); // index in Tuple

        double value = instance.value(k);
        if (td.getFieldType(v) == Type.INT_TYPE) {
            tuple.setField(v, new IntField((int) value));
        } else if (td.getFieldType(v) == Type.DOUBLE_TYPE) {
            tuple.setField(v, new DoubleField(value));
        } else {
            throw new DbException("Field type not implemented.");
        }
    }
}
public static void getBestPerfFrom(String path) {
    try {
        BestConf bestconf = new BestConf();
        Instances trainingSet = DataIOFile.loadDataFromArffFile(path);
        Instance best = trainingSet.firstInstance();

        // set the best configuration to the cluster
        Map<Attribute, Double> attsmap = new HashMap<Attribute, Double>();
        for (int i = 0; i < best.numAttributes() - 1; i++) {
            attsmap.put(best.attribute(i), best.value(i));
        }

        double bestPerf = bestconf.setOptimal(attsmap, "getBestPerfFrom");
        System.out.println("=========================================");
        System.err.println("The actual performance for the best point is : " + bestPerf);
        System.out.println("=========================================");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
private Instance getTestInstance(String binding, String multicolor, String genre) {
    Instance instance = new DenseInstance(3);
    instance.setDataset(trainingData);
    instance.setValue(trainingData.attribute(0), binding);
    instance.setValue(trainingData.attribute(1), multicolor);
    instance.setValue(trainingData.attribute(2), genre);
    return instance;
}
@Override
public double[] getVotesForInstance(Instance inst) {
    // increase no. of seen instances
    totalSeenInstances++;

    // check if there are any rules that cover the instance
    ArrayList<Rule> coveredRules = RulesCoveredInstance(inst);
    // logger.debug("No. Rules cover instance: " + coveredRules.size());
    // logger.debug(inst);

    // return prediction if there are rules that cover the instance
    if (coveredRules.size() > 0) {
        actualAttempts++;

        double[] classPrediction = new double[inst.numClasses()];
        // vote class labels from all available rules
        for (Rule rule : coveredRules) {
            classPrediction[(int) rule.classification]++;
            // logger.debug(rule.printRule());
        }

        // actual attempt
        if (Utils.maxIndex(classPrediction) == (int) inst.classValue()) {
            actualAttemptsCorrectlyClassified++;
        }
        return classPrediction;
    }

    // otherwise, return the majority class
    return observedClassDistribution.getArrayCopy();
}
public ArrayList<Rule> RulesCoveredInstance(Instance instance) {
    ArrayList<Rule> coveredRule = new ArrayList<>();
    for (Rule rule : rulesList) {
        if (rule.coveredByRule(instance)) {
            coveredRule.add(rule);
        }
    }
    return coveredRule;
}
private ArrayList<Instance> instancesCoveredByRuleTerm(ArrayList<Instance> instances, RuleTerm ruleTerm) {
    List<Instance> instancesList = new ArrayList<>(instances);
    List<Instance> instancesCoveredList = new ArrayList<>();
    for (Instance instance : instancesList) {
        if (ruleTerm.coveredByRuleTerm(instance)) {
            instancesCoveredList.add(instance);
        }
    }
    return (ArrayList<Instance>) instancesCoveredList;
}
private boolean containOtherClasses(ArrayList<Instance> instances, double classification) {
    // Returns true only when every instance carries the given class value;
    // returns false as soon as an instance with a different class is found.
    List<Instance> instancesList = new ArrayList<>(instances);
    for (Instance instance : instancesList) {
        if (instance.classValue() != classification) {
            return false;
        }
    }
    return true;
}
public static Map<Attribute, Double> instanceToMap(Instance ins) {
    HashMap<Attribute, Double> retval = new HashMap<Attribute, Double>();
    Enumeration<Attribute> enu = ins.enumerateAttributes();
    while (enu.hasMoreElements()) {
        Attribute temp = enu.nextElement();
        retval.put(temp, ins.value(temp));
    }
    return retval;
}
private Map<Attribute, Map<Double, NormalDistribution>> initialiseGaussianDistributionForNumericAttribute(Instance instanceInfo, ArrayList<Instance> instancesList) {
    Map<Attribute, Map<Double, NormalDistribution>> numericAttributeClassGaussDistributions = new HashMap<>();

    // go through each numeric attribute
    for (Attribute attribute : Collections.list(instanceInfo.enumerateAttributes())) {
        // check whether the attribute is numeric
        if (attribute.isNumeric()) {
            // for each class label
            HashMap<Double, NormalDistribution> classLabelDistribution = new HashMap<>();
            for (int classLabelNo = 0; classLabelNo < instanceInfo.numClasses(); classLabelNo++) {
                // go through all instances in the dataset to create a normal distribution
                SummaryStatistics summaryStatistics = new SummaryStatistics();
                for (Instance instance : instancesList) {
                    summaryStatistics.addValue(instance.value(attribute));
                }

                // create normal distribution for this attribute with corresponding class label
                NormalDistribution normalDistribution = new NormalDistribution(
                        summaryStatistics.getMean(),
                        summaryStatistics.getStandardDeviation());

                // map to hold classLabel and distribution
                classLabelDistribution.put((double) classLabelNo, normalDistribution);
            }
            // put it into the map
            numericAttributeClassGaussDistributions.put(attribute, classLabelDistribution);
        }
    }

    return numericAttributeClassGaussDistributions;
}
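For reference, a sketch of how the returned map might be consulted when scoring an attribute value, assuming Apache Commons Math's NormalDistribution.density(double); the method name and parameters are illustrative, not part of the original class:

// Illustrative only: look up the Gaussian stored for (attribute, class label)
// and evaluate the likelihood of a value, e.g. inside a naive Bayes score.
private double numericLikelihood(Map<Attribute, Map<Double, NormalDistribution>> distributions,
        Attribute attribute, double classLabel, double attributeValue) {
    NormalDistribution dist = distributions.get(attribute).get(classLabel);
    // density() is org.apache.commons.math3.distribution.NormalDistribution#density
    return dist.density(attributeValue);
}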
public boolean coveredByRule(Instance instance) {
    // check whether all rule terms are satisfied
    for (RuleTerm ruleTerm : listOfRuleTerm) {
        if (!ruleTerm.coveredByRuleTerm(instance)) {
            return false;
        }
    }
    return true;
}
@Override
public double[][] distributionsForInstances(Instances batch) {
    double[][] dists = new double[batch.numInstances()][2];
    for (int i = 0; i < batch.numInstances(); i++) {
        Instance ins = batch.instance(i);
        dists[i] = new double[2];
        dists[i][1] = this.scoreInstance(ins);
    }
    return dists;
}
public static Instances convertToArff(List<Document> dataSet, List<String> vocabulary, String fileName) {
    int dataSetSize = dataSet.size();

    /* Create features */
    ArrayList<Attribute> attributes = new ArrayList<>();
    for (int i = 0; i < vocabulary.size(); i++) {
        attributes.add(new Attribute("word_" + i));
    }
    Attribute classAttribute = new Attribute("Class");
    attributes.add(classAttribute);

    /* Add examples */
    System.out.println("Building instances...");
    Instances trainingDataSet = new Instances(fileName, attributes, 0);
    for (int k = 0; k < dataSetSize; k++) {
        Document document = dataSet.get(k);
        Instance example = new DenseInstance(attributes.size());
        for (int i = 0; i < vocabulary.size(); i++) {
            String word = vocabulary.get(i);
            example.setValue(i, Collections.frequency(document.getTerms(), word));
        }
        example.setValue(classAttribute, document.getDocumentClass());
        trainingDataSet.add(example);

        int progress = (int) ((k * 100.0) / dataSetSize);
        System.out.printf("\rPercent completed: %3d%%", progress);
    }
    trainingDataSet.setClass(classAttribute);
    System.out.println();

    System.out.println("Writing to file ...");
    try {
        ArffSaver saver = new ArffSaver();
        saver.setInstances(trainingDataSet);
        saver.setFile(new File(fileName));
        saver.writeBatch();
    } catch (IOException e) {
        e.printStackTrace();
    }

    return trainingDataSet;
}
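As a follow-up, a small sketch of reloading the ARFF written above via Weka's ConverterUtils.DataSource; the method name loadArff is illustrative:

// Illustrative only: reload the ARFF produced by convertToArff(...) and print
// a summary. Assumes "Class" is still the last attribute, as built above.
public static Instances loadArff(String fileName) throws Exception {
    Instances loaded = ConverterUtils.DataSource.read(fileName);
    loaded.setClassIndex(loaded.numAttributes() - 1);
    System.out.println(loaded.toSummaryString());
    return loaded;
}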
public static String getMD5(Instance ins) {
    StringBuffer name = new StringBuffer("");
    for (int i = 0; i < ins.numAttributes() - 2; i++) {
        name.append(Math.round(ins.value(ins.attribute(i))) + ",");
    }
    return getMD5(name.toString());
}
private double scoreInstance(Instance instance) {
    // bias
    double score = 1 * this.weights[0];
    // ignore id and topic and class label
    for (int i = 2; i < instance.numAttributes() - 1; i++) {
        score += this.weights[i - 1] * instance.value(i);
    }
    return score;
}
/**
 * check if all instances from a given dataset are covered/satisfied by both BODY and HEAD
 *
 * @param dataset instances to be checked
 * @param leftHandSide these rule term(s) represent the BODY
 * @param rightHandside these rule term(s) represent the HEAD
 * @return true if all instances are covered by the rule, false otherwise
 */
public boolean allInstancesCoveredByTermsBothSides(Instances dataset, List<Term> leftHandSide, List<Term> rightHandside) {
    if (rightHandside.isEmpty() || leftHandSide.isEmpty()) {
        return false;
    }

    Iterator<Instance> datasetIterator = dataset.iterator();
    while (datasetIterator.hasNext()) {
        Instance anInstance = datasetIterator.next();

        // 1st check if the instance is totally covered by the right hand-side
        for (int termI = 0; termI < rightHandside.size(); termI++) {
            if (rightHandside.get(termI).coveredInstance(anInstance) == false) {
                return false;
            }
        }

        // 2nd check if the instance is totally covered by the left hand-side
        for (int termI = 0; termI < leftHandSide.size(); termI++) {
            if (leftHandSide.get(termI).coveredInstance(anInstance) == false) {
                return false;
            }
        }
    }
    return true;
}
/**
 * Calculate the false positive rate,
 * computed as no. of false positives / total no. of negatives
 *
 * @param dataset the dataset that will be used for the calculation
 * @param selectedRules the rules that are used for the calculation
 * @return false positive rate for the given rules on the input dataset
 */
public double calcuateFalsePositiveRate(Instances dataset, List<Rule> selectedRules) {
    // count false positives and true negatives over the whole dataset
    int falsePositive = 0;
    int trueNegative = 0;
    Iterator<Instance> instancesIterator;

    // go through each selected rule
    for (Rule aRule : selectedRules) {
        // get HEAD and BODY from the rule
        List<Term> HeadTerms = aRule.getRightSdie();
        List<Term> bodyTerm = aRule.getLeftSide();

        // instance covered by the BODY but not the HEAD -> false positive;
        // instance covered by neither BODY nor HEAD -> true negative
        instancesIterator = dataset.iterator();
        while (instancesIterator.hasNext()) {
            Instance anInstance = instancesIterator.next();
            if (instanceCoveredByTermsList(anInstance, bodyTerm) == true
                    && instanceCoveredByTermsList(anInstance, HeadTerms) == false) {
                falsePositive++;
            }
            if (instanceCoveredByTermsList(anInstance, bodyTerm) == false
                    && instanceCoveredByTermsList(anInstance, HeadTerms) == false) {
                trueNegative++;
            }
        }
    }

    double falsePositiveRate = (double) falsePositive / (double) (falsePositive + trueNegative);
    return falsePositiveRate;
}