public static List<Float> getWSErrorRates(LangDescriptor language, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures) throws Exception { LeaveOneOutValidator validator = new LeaveOneOutValidator(language.corpusDir, language); Triple<List<Formatter>,List<Float>,List<Float>> results = validator.validateDocuments(injectWSFeatures, alignmentFeatures, false, null); List<Formatter> formatters = results.a; List<Float> wsErrorRates = new ArrayList<>(); // don't include align errors for (Formatter formatter : formatters) { ClassificationAnalysis analysis = new ClassificationAnalysis(formatter.testDoc, formatter.getAnalysisPerToken()); wsErrorRates.add(analysis.getWSErrorRate()); } // System.out.println(results.c); // System.out.println("vs"); // System.out.println(wsErrorRates); return wsErrorRates; }
public static List<Float> getAlignmentErrorRates(LangDescriptor language, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures) throws Exception { LeaveOneOutValidator validator = new LeaveOneOutValidator(language.corpusDir, language); Triple<List<Formatter>,List<Float>,List<Float>> results = validator.validateDocuments(injectWSFeatures, alignmentFeatures, false, null); List<Formatter> formatters = results.a; List<Float> alignErrorRates = new ArrayList<>(); // don't include align errors for (Formatter formatter : formatters) { ClassificationAnalysis analysis = new ClassificationAnalysis(formatter.testDoc, formatter.getAnalysisPerToken()); alignErrorRates.add(analysis.getAlignmentErrorRate()); } // System.out.println(results.c); // System.out.println("vs"); // System.out.println(alignErrorRates); return alignErrorRates; }
/**
 * Run an external command in the given working directory and capture its
 * exit code, stdout, and stderr.
 *
 * Fix: the original called {@code process.waitFor()} BEFORE draining the
 * child's output streams. A child that writes more than the OS pipe buffer
 * holds then blocks forever on write, and waitFor() never returns
 * (classic Process deadlock). We drain both streams first, then wait.
 *
 * @param cmd        command and arguments, one element each
 * @param workingDir directory the child process runs in
 * @return (exit code, stdout text, stderr text)
 */
protected Triple<Integer, String, String> exec(String[] cmd, String workingDir) throws IOException, InterruptedException {
	ProcessBuilder pb = new ProcessBuilder();
	pb.command(Arrays.asList(cmd)).directory(new File(workingDir));
	Process process = pb.start();
	// Drain output before waiting; reading stdout fully before stderr can
	// still stall for a child that floods stderr first — acceptable for the
	// small tool invocations this harness makes (cc, indent, diff).
	String stdout = dump(process.getInputStream());
	String stderr = dump(process.getErrorStream());
	int resultCode = process.waitFor();
	return new Triple<>(resultCode, stdout, stderr);
}
/**
 * End-to-end check of one Java sample: translate it to C, compile with cc,
 * run the binary, and compare its stdout against the expected .txt capture
 * stored next to the sample. Fails on any compiler/program output on
 * stderr or a nonzero exit code.
 */
public void checkCExec(String filename) throws Exception {
	URL testFolderURL = TestCGen.class.getClassLoader().getResource(SAMPLES_DIR);
	String testFolder = testFolderURL.getPath();
	String workingDir = getWorkingDir();
	String J_pathToFile = testFolder+"/"+filename;
	String C_filename = basename(filename)+".c";
	JTran jTran = new JTran();
	// translate the Java sample to C and drop it into the working dir
	String C_code = jTran.translate(J_pathToFile, C_filename, false, false);
	Utils.writeFile(workingDir+"/"+C_filename, C_code);

	// compile: cc must be completely silent and exit 0
	String[] cc = {"cc", "-o", basename(filename), C_filename};
	Triple<Integer, String, String> cc_result = exec(cc, getWorkingDir());
	int execCode = cc_result.a;
	String stdout = cc_result.b;
	String stderr = cc_result.c;
	assertEquals("", stdout);
	assertEquals("", stderr);
	assertEquals(0, execCode);

	// execute the compiled binary from the working dir
	String[] exec_cmd = {"./"+basename(filename)};
	Triple<Integer, String, String> result = exec(exec_cmd, getWorkingDir());
	execCode = result.a;
	stdout = result.b;
	stderr = result.c;
	// expected output is the recorded <basename>.txt alongside the sample
	String expected_output_filename = basename(filename)+".txt";
	String expected_output = readFile(testFolder+"/"+expected_output_filename);
	assertEquals(expected_output, stdout);
	assertEquals("", stderr);
	assertEquals(0, execCode);
}
/** Return error rate for each document using leave-one-out validation. */
public List<Float> scoreDocuments() throws Exception {
	List<String> allFiles = getFilenames(new File(rootDir), language.fileRegex);
	List<InputDocument> documents = Tool.load(allFiles, language);
	List<Float> errorRates = new ArrayList<>();
	// hold each document out in turn; results.c is that document's error rate
	for (InputDocument doc : documents) {
		Triple<Formatter,Float,Float> result =
			validate(language, documents, doc.fileName, k, null, false, false);
		errorRates.add(result.c);
	}
	return errorRates;
}
public static List<Float> checkStability(LangDescriptor language) throws Exception { List<Float> errorRates = new ArrayList<>(); // format the corpus into tmp dir LeaveOneOutValidator validator0 = new LeaveOneOutValidator(language.corpusDir, language); Triple<List<Formatter>, List<Float>, List<Float>> results0 = validator0.validateDocuments(false, "/tmp/stability/1"); errorRates.add( BuffUtils.median(results0.c) ); List<Formatter> formatters0 = results0.a; // now try formatting it over and over for (int i = 1; i<=STAGES; i++) { String inputDir = "/tmp/stability/"+i; String outputDir = "/tmp/stability/"+(i+1); LeaveOneOutValidator validator = new LeaveOneOutValidator(inputDir, language); Triple<List<Formatter>, List<Float>, List<Float>> results = validator.validateDocuments(false, outputDir); List<Formatter> formatters = results.a; List<Float> distances = new ArrayList<>(); for (int j = 0; j<formatters.size(); j++) { Formatter f0 = formatters0.get(j); Formatter f = formatters.get(j); float editDistance = normalizedLevenshteinDistance(f.getOutput(), f0.getOutput()); distances.add(editDistance); } errorRates.add( BuffUtils.median(distances) ); } return errorRates; }
public static Triple<Formatter,Float,Float> validate(LangDescriptor language, List<InputDocument> documents, InputDocument testDoc, boolean saveOutput, boolean computeEditDistance) throws Exception { // kNNClassifier.resetCache(); Corpus corpus = new Corpus(documents, language); corpus.train(); // System.out.printf("%d feature vectors\n", corpus.featureVectors.size()); Formatter formatter = new Formatter(corpus, language.indentSize); String output = formatter.format(testDoc, false); float editDistance = 0; if ( computeEditDistance ) { editDistance = normalizedLevenshteinDistance(testDoc.content, output); } ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.getAnalysisPerToken()); // System.out.println(testDoc.fileName+": edit distance = "+editDistance+", error rate = "+analysis.getErrorRate()); if ( saveOutput ) { File dir = new File(outputDir+"/"+language.name); if ( saveOutput ) { dir = new File(outputDir+"/"+language.name); dir.mkdir(); } Utils.writeFile(dir.getPath()+"/"+new File(testDoc.fileName).getName(), output); } return new Triple<>(formatter, editDistance, analysis.getErrorRate()); }
/**
 * Leave-one-out validation of a single corpus document: load the whole
 * corpus, then train on everything except {@code fileToExclude} and format
 * that excluded document with the default k.
 */
public Triple<Formatter,Float,Float> validateOneDocument(String fileToExclude,
                                                         String outputDir,
                                                         boolean collectAnalysis)
	throws Exception
{
	List<String> corpusFiles = getFilenames(new File(rootDir), language.fileRegex);
	List<InputDocument> corpusDocs = Tool.load(corpusFiles, language);
	// computeEditDistance=false: callers of this entry point only want error rates
	return validate(language, corpusDocs, fileToExclude, Formatter.DEFAULT_K,
	                outputDir, false, collectAnalysis);
}
/**
 * Convenience overload: leave-one-out validation over all corpus documents
 * using the trainer's default whitespace-injection and alignment feature sets.
 *
 * @param computeEditDistance when true, also compute per-document edit distance
 * @param outputDir           where formatted output is written; null to skip
 */
public Triple<List<Formatter>,List<Float>,List<Float>> validateDocuments(boolean computeEditDistance, String outputDir) throws Exception {
	return validateDocuments(Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS, computeEditDistance, outputDir);
}
/**
 * Convenience overload: validate one held-out document using the trainer's
 * default whitespace-injection and alignment feature sets.
 */
public Triple<Formatter,Float,Float> validate(LangDescriptor language, List<InputDocument> documents, String fileToExclude, int k, String outputDir, boolean computeEditDistance, boolean collectAnalysis) throws Exception {
	return validate(language, documents, fileToExclude, k, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS, outputDir, computeEditDistance, collectAnalysis);
}
/**
 * From {@code (A)?} build either:
 *
 * <pre>
 *  o--A->o
 *  |     ^
 *  o---->|
 * </pre>
 *
 * or, if {@code A} is a block, just add an empty alt to the end of the
 * block
 */
@Override
public Handle optional(GrammarAST optAST, Handle blk) {
	BlockStartState blkStart = (BlockStartState)blk.left;
	ATNState blkEnd = blk.right;
	// record block bounds for the later check that flags optional blocks
	// able to match the empty string
	preventEpsilonOptionalBlocks.add(new Triple<Rule, ATNState, ATNState>(currentRule, blkStart, blkEnd));

	boolean greedy = ((QuantifierAST)optAST).isGreedy();
	blkStart.nonGreedy = !greedy;
	// bypass edge from block start straight to block end; the third argument
	// presumably prioritizes the bypass alt for non-greedy ? — confirm
	// against epsilon()'s definition
	epsilon(blkStart, blk.right, !greedy);

	optAST.atnState = blk.left;
	return blk;
}
/**
 * From {@code (blk)+} build
 *
 * <pre>
 *     |---------|
 *     v         |
 *  [o-blk-o]->o->o
 * </pre>
 *
 * We add a decision for loop back node to the existing one at {@code blk}
 * start.
 */
@Override
public Handle plus(GrammarAST plusAST, Handle blk) {
	PlusBlockStartState blkStart = (PlusBlockStartState)blk.left;
	BlockEndState blkEnd = (BlockEndState)blk.right;
	// record block bounds for the epsilon-closure (empty-match) check
	preventEpsilonClosureBlocks.add(new Triple<Rule, ATNState, ATNState>(currentRule, blkStart, blkEnd));

	PlusLoopbackState loop = newState(PlusLoopbackState.class, plusAST);
	loop.nonGreedy = !((QuantifierAST)plusAST).isGreedy();
	atn.defineDecisionState(loop); // the loop-back node is itself a decision
	LoopEndState end = newState(LoopEndState.class, plusAST);
	blkStart.loopBackState = loop;
	end.loopBackState = loop;

	plusAST.atnState = loop;
	epsilon(blkEnd, loop);		// blk can see loop back

	BlockAST blkAST = (BlockAST)plusAST.getChild(0);
	if ( ((QuantifierAST)plusAST).isGreedy() ) {
		if (expectNonGreedy(blkAST)) {
			// greedy quantifier on a wildcard-only block is almost always a mistake
			g.tool.errMgr.grammarError(ErrorType.EXPECTED_NON_GREEDY_WILDCARD_BLOCK, g.fileName, plusAST.getToken(), plusAST.getToken().getText());
		}

		epsilon(loop, blkStart);	// loop back to start
		epsilon(loop, end);			// or exit
	}
	else { // if not greedy, priority to exit branch; make it first
		epsilon(loop, end);			// exit
		epsilon(loop, blkStart);	// loop back to start
	}

	return new Handle(blkStart, end);
}
/**
 * From {@code (blk)*} build {@code ( blk+ )?} with *two* decisions, one for
 * entry and one for choosing alts of {@code blk}.
 *
 * <pre>
 *     |-------------|
 *     v             |
 *  o--[o-blk-o]->o  o
 *  |                ^
 *  -----------------|
 * </pre>
 *
 * Note that the optional bypass must jump outside the loop as
 * {@code (A|B)*} is not the same thing as {@code (A|B|)+}.
 */
@Override
public Handle star(GrammarAST starAST, Handle elem) {
	StarBlockStartState blkStart = (StarBlockStartState)elem.left;
	BlockEndState blkEnd = (BlockEndState)elem.right;
	// record block bounds for the epsilon-closure (empty-match) check
	preventEpsilonClosureBlocks.add(new Triple<Rule, ATNState, ATNState>(currentRule, blkStart, blkEnd));

	StarLoopEntryState entry = newState(StarLoopEntryState.class, starAST);
	entry.nonGreedy = !((QuantifierAST)starAST).isGreedy();
	atn.defineDecisionState(entry); // entry decides enter-loop vs bypass
	LoopEndState end = newState(LoopEndState.class, starAST);
	StarLoopbackState loop = newState(StarLoopbackState.class, starAST);
	entry.loopBackState = loop;
	end.loopBackState = loop;

	BlockAST blkAST = (BlockAST)starAST.getChild(0);
	if ( ((QuantifierAST)starAST).isGreedy() ) {
		if (expectNonGreedy(blkAST)) {
			// greedy quantifier on a wildcard-only block is almost always a mistake
			g.tool.errMgr.grammarError(ErrorType.EXPECTED_NON_GREEDY_WILDCARD_BLOCK, g.fileName, starAST.getToken(), starAST.getToken().getText());
		}

		epsilon(entry, blkStart);	// loop enter edge (alt 1)
		epsilon(entry, end);		// bypass loop edge (alt 2)
	}
	else { // if not greedy, priority to exit branch; make it first
		epsilon(entry, end);		// bypass loop edge (alt 1)
		epsilon(entry, blkStart);	// loop enter edge (alt 2)
	}

	epsilon(blkEnd, loop);		// block end hits loop back
	epsilon(loop, entry);		// loop back to entry/exit decision

	starAST.atnState = entry;	// decision is to enter/exit; blk is its own decision
	return new Handle(entry, end);
}
/**
 * Generation check of one Java sample: translate it to C, normalize both
 * the generated C and the expected C reference with GNU indent (same flag
 * set for both, so formatting differences cannot cause spurious failures),
 * then diff them. The test passes only when diff is silent and exits 0.
 */
public void checkCGen(String filename) throws Exception {
	URL testFolderURL = TestCGen.class.getClassLoader().getResource(SAMPLES_DIR);
	String testFolder = testFolderURL.getPath();
	String workingDir = getWorkingDir();
	String J_pathToFile = testFolder+"/"+filename;
	String C_filename = basename(filename)+".c";
	JTran jTran = new JTran();
	String C_code = jTran.translate(J_pathToFile, C_filename, false, false);
	Utils.writeFile(workingDir+"/"+C_filename, C_code);

	String[] indent_result_cmd = {
		"indent",
		"-bap", "-bad", "-br", "-nce", "-ncs", "-nprs", "-npcs", "-sai", "-saw",
		"-di1", "-brs", "-blf", "--indent-level4", "-nut", "-sob", "-l200",
		C_filename,
		"-o", C_filename // write on top of itself
	};
	// normalize generated code
	exec(indent_result_cmd, workingDir);

	// format the expected file as well (same indent flags as above)
	String expected_C_CodeFilename = testFolder+"/"+C_filename;
	String[] indent_expected_cmd = {
		"indent",
		"-bap", "-bad", "-br", "-nce", "-ncs", "-nprs", "-npcs", "-sai", "-saw",
		"-di1", "-brs", "-blf", "--indent-level4", "-nut", "-sob", "-l200",
		expected_C_CodeFilename,
		"-o", "expected_"+C_filename
	};
	exec(indent_expected_cmd, workingDir);

	// compare with expected c file; diff prints nothing and exits 0 on a match
	String[] diff_cmd = {
		"diff", "expected_"+C_filename, C_filename
	};
	Triple<Integer, String, String> result = exec(diff_cmd, workingDir);
	int execCode = result.a;
	String stdout = result.b;
	String stderr = result.c;
	assertEquals("", stdout);
	assertEquals("", stderr);
	assertEquals(0, execCode);
}
/** Leave-one-out validation over the Java 8 corpus; prints edit distances then error rates. */
public static void main(String[] args) throws Exception {
	LeaveOneOutValidator loo = new LeaveOneOutValidator(JAVA8_DESCR.corpusDir, JAVA8_DESCR);
	Triple<List<Formatter>,List<Float>,List<Float>> r = loo.validateDocuments(false, "output");
	System.out.println(r.b); // per-document edit distances
	System.out.println(r.c); // per-document error rates
}
/** Leave-one-out validation over the ANTLR4 corpus; prints edit distances then error rates. */
public static void main(String[] args) throws Exception {
	LeaveOneOutValidator loo = new LeaveOneOutValidator(ANTLR4_DESCR.corpusDir, ANTLR4_DESCR);
	Triple<List<Formatter>,List<Float>,List<Float>> r = loo.validateDocuments(false, "output");
	System.out.println(r.b); // per-document edit distances
	System.out.println(r.c); // per-document error rates
}
/** Leave-one-out validation over the Guava corpus; prints edit distances then error rates. */
public static void main(String[] args) throws Exception {
	LeaveOneOutValidator loo = new LeaveOneOutValidator(JAVA_GUAVA_DESCR.corpusDir, JAVA_GUAVA_DESCR);
	Triple<List<Formatter>,List<Float>,List<Float>> r = loo.validateDocuments(false, "output");
	System.out.println(r.b); // per-document edit distances
	System.out.println(r.c); // per-document error rates
}
/** Leave-one-out validation over the cleaned SQLite corpus; prints edit distances then error rates. */
public static void main(String[] args) throws Exception {
	LeaveOneOutValidator loo = new LeaveOneOutValidator(SQLITE_CLEAN_DESCR.corpusDir, SQLITE_CLEAN_DESCR);
	Triple<List<Formatter>,List<Float>,List<Float>> r = loo.validateDocuments(false, "output");
	System.out.println(r.b); // per-document edit distances
	System.out.println(r.c); // per-document error rates
}
/** Leave-one-out validation over the Java corpus; prints edit distances then error rates. */
public static void main(String[] args) throws Exception {
	LeaveOneOutValidator loo = new LeaveOneOutValidator(JAVA_DESCR.corpusDir, JAVA_DESCR);
	Triple<List<Formatter>,List<Float>,List<Float>> r = loo.validateDocuments(false, "output");
	System.out.println(r.b); // per-document edit distances
	System.out.println(r.c); // per-document error rates
}
/** Leave-one-out validation over the Quorum corpus; prints edit distances then error rates. */
public static void main(String[] args) throws Exception {
	LeaveOneOutValidator loo = new LeaveOneOutValidator(QUORUM_DESCR.corpusDir, QUORUM_DESCR);
	Triple<List<Formatter>,List<Float>,List<Float>> r = loo.validateDocuments(false, "output");
	System.out.println(r.b); // per-document edit distances
	System.out.println(r.c); // per-document error rates
}
public static float[] getMedianErrorRates(LangDescriptor language, int maxNumFiles, int trials) throws Exception { SubsetValidator validator = new SubsetValidator(language.corpusDir, language); List<InputDocument> documents = Tool.load(validator.allFiles, language); float[] medians = new float[Math.min(documents.size(),maxNumFiles)+1]; int ncpu = Runtime.getRuntime().availableProcessors(); if ( FORCE_SINGLE_THREADED ) { ncpu = 2; } ExecutorService pool = Executors.newFixedThreadPool(ncpu-1); List<Callable<Void>> jobs = new ArrayList<>(); for (int i = 1; i<=Math.min(validator.allFiles.size(), maxNumFiles); i++) { // i is corpus subset size final int corpusSubsetSize = i; Callable<Void> job = () -> { try { List<Float> errorRates = new ArrayList<>(); for (int trial = 1; trial<=trials; trial++) { // multiple trials per subset size Pair<InputDocument, List<InputDocument>> sample = validator.selectSample(documents, corpusSubsetSize); Triple<Formatter, Float, Float> results = validate(language, sample.b, sample.a, true, false); // System.out.println(sample.a.fileName+" n="+corpusSubsetSize+": error="+results.c); // System.out.println("\tcorpus =\n\t\t"+Utils.join(sample.b.iterator(), "\n\t\t")); errorRates.add(results.c); } Collections.sort(errorRates); int n = errorRates.size(); float median = errorRates.get(n/2); System.out.println("median "+language.name+" error rate for n="+corpusSubsetSize+" is "+median); medians[corpusSubsetSize] = median; } catch (Throwable t) { t.printStackTrace(System.err); } return null; }; jobs.add(job); } pool.invokeAll(jobs); pool.shutdown(); boolean terminated = pool.awaitTermination(60, TimeUnit.MINUTES); return medians; }
/**
 * Leave-one-out validation over every parsable document in rootDir:
 * one parallel job per document trains on all the others and formats the
 * held-out one. Returns (formatters, edit distances, error rates) in
 * completion order (the three lists are parallel to each other, not to
 * the input file order, because jobs append as they finish).
 *
 * @param computeEditDistance when true, also compute per-document edit distance
 * @param outputDir           where formatted output is written; null to skip
 */
public Triple<List<Formatter>,List<Float>,List<Float>> validateDocuments(FeatureMetaData[] injectWSFeatures,
                                                                        FeatureMetaData[] alignmentFeatures,
                                                                        boolean computeEditDistance,
                                                                        String outputDir)
	throws Exception
{
	// synchronized lists: appended to concurrently by the worker jobs below
	List<Formatter> formatters = Collections.synchronizedList(new ArrayList<>());
	List<Float> distances = Collections.synchronizedList(new ArrayList<>());
	List<Float> errors = Collections.synchronizedList(new ArrayList<>());
	long start = System.nanoTime();
	try {
		List<String> allFiles = getFilenames(new File(rootDir), language.fileRegex);
		final List<InputDocument> documents = Tool.load(allFiles, language);
		// drop documents whose parse failed (no tree)
		final List<InputDocument> parsableDocuments = filter(documents, d -> d.tree!=null);
		long stop = System.nanoTime();
		System.out.printf("Load/parse all docs from %s time %d ms\n", rootDir, (stop-start)/1_000_000);

		int ncpu = Runtime.getRuntime().availableProcessors();
		if ( FORCE_SINGLE_THREADED ) {
			ncpu = 2; // ncpu-1 below then yields a single worker thread
		}
		ExecutorService pool = Executors.newFixedThreadPool(ncpu-1);
		List<Callable<Void>> jobs = new ArrayList<>();
		for (int i = 0; i<parsableDocuments.size(); i++) {
			final String fileName = parsableDocuments.get(i).fileName;
			// one job per document: hold fileName out, train on the rest, format it
			Callable<Void> job = () -> {
				try {
					Triple<Formatter, Float, Float> results =
						validate(language, parsableDocuments, fileName,
						         Formatter.DEFAULT_K, injectWSFeatures, alignmentFeatures,
						         outputDir, computeEditDistance, false);
					formatters.add(results.a);
					float editDistance = results.b;
					distances.add(editDistance);
					Float errorRate = results.c;
					errors.add(errorRate);
				}
				catch (Throwable t) {
					t.printStackTrace(System.err); // log and keep the other jobs running
				}
				return null;
			};
			jobs.add(job);
		}

		pool.invokeAll(jobs);
		pool.shutdown();
		pool.awaitTermination(60, TimeUnit.MINUTES);
	}
	finally {
		// timing summary is printed even if loading/validation throws
		long final_stop = System.nanoTime();
		Double medianTrainingTime = median(trainingTimes);
		double medianFormattingPerMS = median(formattingTokensPerMS);
		System.out.printf("Total time %dms\n", (final_stop-start)/1_000_000);
		System.out.printf("Median training time %dms\n",
		                  medianTrainingTime.intValue());
		System.out.printf("Median formatting time tokens per ms %5.4fms, min %5.4f max %5.4f\n",
		                  medianFormattingPerMS,
		                  BuffUtils.min(formattingTokensPerMS), BuffUtils.max(formattingTokensPerMS));
	}
	return new Triple<>(formatters,distances,errors);
}
public Triple<Formatter,Float,Float> validate(LangDescriptor language, List<InputDocument> documents, String fileToExclude, int k, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, String outputDir, boolean computeEditDistance, boolean collectAnalysis) throws Exception { final String path = new File(fileToExclude).getAbsolutePath(); List<InputDocument> others = filter(documents, d -> !d.fileName.equals(path)); List<InputDocument> excluded = filter(documents, d -> d.fileName.equals(path)); assert others.size() == documents.size() - 1; // kNNClassifier.resetCache(); if ( excluded.size()==0 ) { System.err.println("Doc not in corpus: "+path); return null; } InputDocument testDoc = excluded.get(0); long start = System.nanoTime(); Corpus corpus = new Corpus(others, language); corpus.train(); long stop = System.nanoTime(); Formatter formatter = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures); InputDocument originalDoc = testDoc; long format_start = System.nanoTime(); String output = formatter.format(testDoc, collectAnalysis); long format_stop = System.nanoTime(); float editDistance = 0; if ( computeEditDistance ) { editDistance = normalizedLevenshteinDistance(testDoc.content, output); } ClassificationAnalysis analysis = new ClassificationAnalysis(originalDoc, formatter.getAnalysisPerToken()); System.out.println(testDoc.fileName+": edit distance = "+editDistance+", error rate = "+analysis.getErrorRate()); if ( outputDir!=null ) { File dir = new File(outputDir+"/"+language.name+"/"+Tool.version); if ( !dir.exists() ) { dir.mkdirs(); } Utils.writeFile(dir.getPath()+"/"+new File(testDoc.fileName).getName(), output); } long tms = (stop - start) / 1_000_000; long fms = (format_stop - format_start) / 1_000_000; trainingTimes.add((double)tms); float tokensPerMS = testDoc.tokens.size() / (float) fms; formattingTokensPerMS.add((double)tokensPerMS); System.out.printf("Training time = %d ms, formatting %d ms, %5.3f tokens/ms 
(%d tokens)\n", tms, fms, tokensPerMS, testDoc.tokens.size()); // System.out.printf("classify calls %d, hits %d rate %f\n", // kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits, // kNNClassifier.nClassifyCacheHits/(float) kNNClassifier.nClassifyCalls); // System.out.printf("kNN calls %d, hits %d rate %f\n", // kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits, // kNNClassifier.nNNCacheHits/(float) kNNClassifier.nNNCalls); return new Triple<>(formatter, editDistance, analysis.getErrorRate()); }
/** Leave-one-out validation over the cleaned T-SQL corpus; prints edit distances then error rates. */
public static void main(String[] args) throws Exception {
	LeaveOneOutValidator loo = new LeaveOneOutValidator(TSQL_CLEAN_DESCR.corpusDir, TSQL_CLEAN_DESCR);
	Triple<List<Formatter>,List<Float>,List<Float>> r = loo.validateDocuments(false, "output");
	System.out.println(r.b); // per-document edit distances
	System.out.println(r.c); // per-document error rates
}