private static StringDistance resolveDistance(String distanceVal) { distanceVal = distanceVal.toLowerCase(Locale.US); if ("internal".equals(distanceVal)) { return DirectSpellChecker.INTERNAL_LEVENSHTEIN; } else if ("damerau_levenshtein".equals(distanceVal) || "damerauLevenshtein".equals(distanceVal)) { return new LuceneLevenshteinDistance(); } else if ("levenstein".equals(distanceVal)) { return new LevensteinDistance(); // TODO Jaro and Winkler are 2 people - so apply same naming logic // as damerau_levenshtein } else if ("jarowinkler".equals(distanceVal)) { return new JaroWinklerDistance(); } else if ("ngram".equals(distanceVal)) { return new NGramDistance(); } else { throw new IllegalArgumentException("Illegal distance option " + distanceVal); } }
/**
 * Builds a square matrix of pairwise n-gram scores between the labels of all
 * entities in the given documents.
 *
 * <p>Labels are collected in document order, then entity order. The diagonal is
 * set to 1, and every off-diagonal cell (i, j) holds the value returned by
 * {@code NGramDistance(3).getDistance(labels[i], labels[j])}, mirrored into (j, i).
 *
 * @param documents the documents whose entity labels are compared
 * @return a matrix with one row and one column per entity
 */
public static Matrix getEntityLabelSimMatrix(TokenizedDocument[] documents) {
    // First pass: count the entities so the label array can be sized exactly.
    int total = 0;
    for (TokenizedDocument document : documents) {
        total += document.entities.length;
    }

    // Second pass: flatten all entity labels into one array.
    String[] labels = new String[total];
    int next = 0;
    for (TokenizedDocument document : documents) {
        for (int e = 0; e < document.entities.length; e++) {
            labels[next++] = document.entities[e].label;
        }
    }

    Matrix result = new Basic2DMatrix(next, next);
    NGramDistance trigramDistance = new NGramDistance(3);
    for (int row = 0; row < labels.length; row++) {
        result.set(row, row, 1);
        // Only the upper triangle is computed; each value is mirrored below the diagonal.
        for (int col = row + 1; col < labels.length; col++) {
            final double score = trigramDistance.getDistance(labels[row], labels[col]);
            result.set(row, col, score);
            result.set(col, row, score);
        }
    }
    return result;
}
/**
 * Creates a candidate utility configured from a properties resource.
 *
 * <p>The resource is looked up through the current thread's context class loader.
 * The properties {@code nodeType}, {@code ngramDistance}, {@code context},
 * {@code popularity}, {@code acronym}, {@code commonEntities} and {@code algorithm}
 * are read from it. The same {@code file} name is also passed to {@code TripleIndex}.
 *
 * @param file name of the properties resource on the classpath
 * @throws IOException if the resource cannot be found or cannot be read
 */
public CandidateUtil(final String file) throws IOException {
    final ClassLoader loader = Thread.currentThread().getContextClassLoader();
    final Properties prop = new Properties();
    // try-with-resources closes the stream even when loading fails.
    try (InputStream is = loader.getResourceAsStream(file)) {
        if (is == null) {
            // getResourceAsStream returns null for a missing resource; report it
            // explicitly instead of letting Properties.load fail with a bare NPE.
            throw new IOException("Properties resource not found on classpath: " + file);
        }
        prop.load(is);
    }

    nodeType = prop.getProperty("nodeType");
    nGramDistance = new NGramDistance(Integer.parseInt(prop.getProperty("ngramDistance")));
    index = new TripleIndex(file);
    context = Boolean.valueOf(prop.getProperty("context"));
    if (context) { // in case the index by context exist
        index2 = new TripleIndexContext();
    }
    corporationAffixCleaner = new CorporationAffixCleaner();
    domainWhiteLister = new DomainWhiteLister(index);
    popularity = Boolean.valueOf(prop.getProperty("popularity"));
    acronym = Boolean.valueOf(prop.getProperty("acronym"));
    commonEntities = Boolean.valueOf(prop.getProperty("commonEntities"));
    algorithm = prop.getProperty("algorithm");
}
public CandidateUtil() throws IOException { Properties prop = new Properties(); InputStream input = CandidateUtil.class.getResourceAsStream("/config/agdistis.properties"); prop.load(input); String envNodeType = System.getenv("AGDISTIS_NODE_TYPE"); this.nodeType = envNodeType != null ? envNodeType : prop.getProperty("nodeType"); String envNgramDistance = System.getenv("AGDISTIS_NGRAM_DISTANCE"); this.nGramDistance = new NGramDistance( Integer.valueOf(envNgramDistance != null ? envNgramDistance : prop.getProperty("ngramDistance"))); this.index = new TripleIndex(); String envContext = System.getenv("AGDISTIS_CONTEXT"); this.context = Boolean.valueOf(envContext != null ? envContext : prop.getProperty("context")); if (context == true) { // in case the index by context exist this.index2 = new TripleIndexContext(); } this.corporationAffixCleaner = new CorporationAffixCleaner(); this.domainWhiteLister = new DomainWhiteLister(index); String envPopularity = System.getenv("AGDISTIS_POPULARITY"); this.popularity = Boolean.valueOf(envPopularity != null ? envPopularity : prop.getProperty("popularity")); String envAcronym = System.getenv("AGDISTIS_ACRONYM"); this.acronym = Boolean.valueOf(envAcronym != null ? envAcronym : prop.getProperty("acronym")); String envCommonEntities = System.getenv("AGDISTIS_COMMON_ENTITIES"); this.commonEntities = Boolean .valueOf(envCommonEntities != null ? envCommonEntities : prop.getProperty("commonEntities")); String envAlgorithm = System.getenv("AGDISTIS_ALGORITHM"); this.algorithm = envAlgorithm != null ? envAlgorithm : prop.getProperty("algorithm"); }
/**
 * Looks up the rdfs:label and skos:altLabel triples of the Barack Obama resource
 * in the index and checks that the 3-gram distance between the first label and
 * every alternative label is non-negative.
 */
@Test
public void testSurfaceFormsDistance() {
    final String candidateURL = "http://dbpedia.org/resource/Barack_Obama";
    final String labelPredicate = "http://www.w3.org/2000/01/rdf-schema#label";
    final String altLabelPredicate = "http://www.w3.org/2004/02/skos/core#altLabel";

    List<Triple> label = index.search(candidateURL, labelPredicate, null);
    List<Triple> surfaceForms = index.search(candidateURL, altLabelPredicate, null);
    log.debug(" * " + surfaceForms.size());

    NGramDistance trigram = new NGramDistance(3);
    for (Triple surfaceForm : surfaceForms) {
        // The label is fetched inside the loop so that an empty label list is
        // only touched when there is at least one surface form to compare.
        final String labelText = label.get(0).getObject();
        final String formText = surfaceForm.getObject();
        log.debug(labelText + " " + formText + " : " + trigram.getDistance(labelText, formText));
        assertTrue(trigram.getDistance(labelText, formText) >= 0);
    }
}
/**
 * Creates the Lucene string distance for this option.
 *
 * @return a new {@code NGramDistance} built with its no-argument constructor
 */
@Override
public StringDistance toLucene() {
    final StringDistance ngram = new NGramDistance();
    return ngram;
}