public void testBigramTokenizer() throws Exception {
  SlowSynonymMap synMap;

  // prepare bi-gram tokenizer factory
  Map<String, String> args = new HashMap<>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
  args.put("minGramSize", "2");
  args.put("maxGramSize", "2");
  TokenizerFactory tf = new NGramTokenizerFactory(args);

  // rule "abcd=>efgh" under the bi-gram tokenizer:
  // the match path (ab)->(bc)->(cd) maps to the replacement tokens [ef][fg][gh]
  List<String> rules = new ArrayList<>();
  rules.add( "abcd=>efgh" );
  synMap = new SlowSynonymMap( true );
  SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf );
  assertEquals( 1, synMap.submap.size() );
  assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
  assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
  assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
}