@Override public Object create(Random random) { NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); // we can't add duplicate keys, or NormalizeCharMap gets angry Set<String> keys = new HashSet<>(); int num = random.nextInt(5); //System.out.println("NormalizeCharMap="); for (int i = 0; i < num; i++) { String key = TestUtil.randomSimpleString(random); if (!keys.contains(key) && key.length() > 0) { String value = TestUtil.randomSimpleString(random); builder.add(key, value); keys.add(key); //System.out.println("mapping: '" + key + "' => '" + value + "'"); } } return builder.build(); }
@Override public Object create(Random random) { NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); // we can't add duplicate keys, or NormalizeCharMap gets angry Set<String> keys = new HashSet<String>(); int num = random.nextInt(5); //System.out.println("NormalizeCharMap="); for (int i = 0; i < num; i++) { String key = _TestUtil.randomSimpleString(random); if (!keys.contains(key) && key.length() > 0) { String value = _TestUtil.randomSimpleString(random); builder.add(key, value); keys.add(key); //System.out.println("mapping: '" + key + "' => '" + value + "'"); } } return builder.build(); }
/**
 * Reads the mapping rules named by {@code mapping} and builds a {@link NormalizeCharMap}.
 *
 * <p>If {@code mapping} names an existing file, its lines are read directly.
 * Otherwise it is split with {@code splitFileNames} and the lines of every
 * (trimmed) entry are concatenated.
 *
 * @param loader  resource loader passed through to {@code getLines}
 * @param mapping a file name or a list of file names; may be {@code null}
 * @return the built map, or {@code null} when {@code mapping} is {@code null}
 *         or no rule lines were read
 * @throws IOException declared for the resource reads
 */
public NormalizeCharMap inform(ResourceLoader loader, String mapping) throws IOException {
  if (mapping == null) {
    return null;
  }

  final List<String> ruleLines;
  if (new File(mapping).exists()) {
    ruleLines = getLines(loader, mapping);
  } else {
    List<String> fileNames = splitFileNames(mapping);
    ruleLines = new ArrayList<String>();
    for (String fileName : fileNames) {
      ruleLines.addAll(getLines(loader, fileName.trim()));
    }
  }

  if (ruleLines.isEmpty()) {
    return null;
  }

  final NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  parseRules(ruleLines, mapBuilder);
  return mapBuilder.build();
}
/**
 * Creates the factory and builds its normalize char map from the word list
 * that {@code Analysis.getWordList} returns for the {@code "mappings"} setting.
 *
 * @throws IllegalArgumentException if no rule list is configured
 */
public MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
  super(indexSettings, name);

  final List<String> mappingRules = Analysis.getWordList(env, settings, "mappings");
  if (null == mappingRules) {
    throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
  }

  final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  parseRules(mappingRules, builder);
  normMap = builder.build();
}
/**
 * Parses a list of MappingCharFilter-style rules and adds each one to the given builder.
 *
 * <p>Every rule must match {@code rulePattern}; its two captured groups are
 * trimmed and decoded with {@code parseString} before being registered.
 *
 * @param rules the raw rule lines
 * @param map   the builder that receives one mapping per rule
 * @throws RuntimeException if a rule does not match the pattern, or either
 *         side decodes to {@code null}
 */
private void parseRules(List<String> rules, NormalizeCharMap.Builder map) {
  for (String rule : rules) {
    Matcher ruleMatcher = rulePattern.matcher(rule);
    if (ruleMatcher.find() == false) {
      throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
    }

    String source = parseString(ruleMatcher.group(1).trim());
    String target = parseString(ruleMatcher.group(2).trim());
    boolean bothDecoded = source != null && target != null;
    if (!bothDecoded) {
      throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal mapping.");
    }

    map.add(source, target);
  }
}
public final static NormalizeCharMap getTibNormalizeCharMap() { NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); // The non-breaking tsheg is replaced by the normal one builder.add("\u0f0C", "\u0F0B"); // Characters to delete: the markers found under selected syllables builder.add("\u0F35", ""); // ༵ builder.add("\u0F37", ""); // ༷ // Characters to decompose builder.add("\u0F00", "\u0F68\u0F7C\u0F7E"); // ༀ builder.add("\u0F02", "\u0F60\u0F70\u0F82"); // ༂ builder.add("\u0F03", "\u0F60\u0F70\u0F14"); // ༃ builder.add("\u0F43", "\u0F42\u0FB7"); // གྷ builder.add("\u0F48", "\u0F47\u0FB7"); // builder.add("\u0F4D", "\u0F4C\u0FB7"); // ཌྷ builder.add("\u0F52", "\u0F51\u0FB7"); // དྷ builder.add("\u0F57", "\u0F56\u0FB7"); // བྷ builder.add("\u0F5C", "\u0F5B\u0FB7"); // ཛྷ builder.add("\u0F69", "\u0F40\u0FB5"); // ཀྵ builder.add("\u0F73", "\u0F71\u0F72"); // ཱི builder.add("\u0F75", "\u0F71\u0F74"); // ཱུ builder.add("\u0F76", "\u0FB2\u0F80"); // ྲྀ builder.add("\u0F77", "\u0FB2\u0F71\u0F80"); // ཷ builder.add("\u0F78", "\u0FB3\u0F80"); // ླྀ builder.add("\u0F79", "\u0FB3\u0F71\u0F80"); // ཹ builder.add("\u0F81", "\u0F71\u0F80"); // ཱྀ builder.add("\u0F93", "\u0F92\u0FB7"); // ྒྷ builder.add("\u0F9D", "\u0F9C\u0FB7"); // ྜྷ builder.add("\u0FA2", "\u0FA1\u0FB7"); // ྡྷ builder.add("\u0FA7", "\u0FA6\u0FB7"); // ྦྷ builder.add("\u0FAC", "\u0FAB\u0FB7"); // ྫྷ builder.add("\u0FB9", "\u0F90\u0FB5"); // ྐྵ return builder.build(); }
/**
 * Creates the factory and builds its normalize char map from the word list
 * that {@code Analysis.getWordList} returns for the {@code "mappings"} setting.
 *
 * @throws IllegalArgumentException if no rule list is configured
 */
@Inject
public MappingCharFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env,
                                @Assisted String name, @Assisted Settings settings) {
  super(index, indexSettingsService.getSettings(), name);

  final List<String> mappingRules = Analysis.getWordList(env, settings, "mappings");
  if (null == mappingRules) {
    throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
  }

  final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  parseRules(mappingRules, builder);
  normMap = builder.build();
}
/**
 * Checks that a Windows-style path whose backslashes are mapped to forward
 * slashes by a {@link MappingCharFilter} is tokenized by
 * {@link PathHierarchyTokenizer} into its slash-separated prefixes.
 */
public void testNormalizeWinDelimToLinuxDelim() throws Exception {
  final String path = "c:\\a\\b\\c";

  // char filter rewriting every backslash to a forward slash
  NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  mapBuilder.add("\\", "/");
  NormalizeCharMap slashMap = mapBuilder.build();
  Reader filtered = new MappingCharFilter(slashMap, new StringReader(path));

  PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer(
      newAttributeFactory(), filtered, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);

  String[] expectedTerms = {"c:", "c:/a", "c:/a/b", "c:/a/b/c"};
  int[] expectedStartOffsets = {0, 0, 0, 0};
  int[] expectedEndOffsets = {2, 4, 6, 8};
  int[] expectedPosIncrements = {1, 0, 0, 0};
  assertTokenStreamContents(
      tokenizer,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      path.length());
}
/**
 * Wraps {@code reader} in a {@link MappingCharFilter} built from every entry
 * of {@code customMappings}.
 *
 * @param fieldName not used by this implementation
 * @param reader    the reader to wrap
 * @return a {@code MappingCharFilter} over {@code reader}
 */
@Override
protected Reader wrapReader(String fieldName, Reader reader) {
  final NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  for (Map.Entry<String, String> mappingEntry : customMappings.entrySet()) {
    String source = mappingEntry.getKey();
    String replacement = mappingEntry.getValue();
    mapBuilder.add(source, replacement);
  }
  NormalizeCharMap charMap = mapBuilder.build();
  return new MappingCharFilter(charMap, reader);
}
@Override public void inform(ResourceLoader loader) throws IOException { mapping = args.get("mapping"); if (mapping != null) { List<String> wlist = null; File mappingFile = new File(mapping); if (mappingFile.exists()) { wlist = getLines(loader, mapping); } else { List<String> files = splitFileNames(mapping); wlist = new ArrayList<String>(); for (String file : files) { List<String> lines = getLines(loader, file.trim()); wlist.addAll(lines); } } final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); parseRules(wlist, builder); normMap = builder.build(); if (normMap.map == null) { // if the inner FST is null, it means it accepts nothing (e.g. the file is empty) // so just set the whole map to null normMap = null; } } }
/**
 * Parses mapping rules and adds each one to the given builder.
 *
 * <p>Every rule must match the pattern {@code p}; its two captured groups are
 * decoded with {@code parseString} and registered as a mapping.
 *
 * @param rules   the raw rule lines
 * @param builder the builder that receives one mapping per rule
 * @throws IllegalArgumentException if a rule does not match the pattern
 */
protected void parseRules(List<String> rules, NormalizeCharMap.Builder builder) {
  for (String rule : rules) {
    Matcher ruleMatcher = p.matcher(rule);
    if (!ruleMatcher.find()) {
      throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
    }
    String source = parseString(ruleMatcher.group(1));
    String replacement = parseString(ruleMatcher.group(2));
    builder.add(source, replacement);
  }
}
/**
 * Checks that a Windows-style path whose backslashes are mapped to forward
 * slashes by a {@link MappingCharFilter} is tokenized by
 * {@link PathHierarchyTokenizer} into its slash-separated prefixes.
 */
public void testNormalizeWinDelimToLinuxDelim() throws Exception {
  final String path = "c:\\a\\b\\c";

  // char filter rewriting every backslash to a forward slash
  NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  mapBuilder.add("\\", "/");
  NormalizeCharMap slashMap = mapBuilder.build();
  Reader filtered = new MappingCharFilter(slashMap, new StringReader(path));

  PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer(filtered);

  String[] expectedTerms = {"c:", "c:/a", "c:/a/b", "c:/a/b/c"};
  int[] expectedStartOffsets = {0, 0, 0, 0};
  int[] expectedEndOffsets = {2, 4, 6, 8};
  int[] expectedPosIncrements = {1, 0, 0, 0};
  assertTokenStreamContents(
      tokenizer,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      path.length());
}
/**
 * Reads the {@code "mapping"} argument, splits it on {@code PTN_STAGE_DELIMITER}
 * and loads one char map per resulting part.
 *
 * <p>Each part has matches of {@code PTN_REMOVE_ESCAPE_CHAR} removed before it
 * is handed to {@code inform(loader, fileNames)}. Only non-null maps are
 * appended to {@code normMapList}. Nothing happens when the argument is absent.
 *
 * @param loader resource loader passed through to the per-part load
 * @throws IOException declared for the resource reads
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
  final String mapping = getOriginalArgs().get("mapping");
  if (mapping == null) {
    return;
  }

  final String[] stages = mapping.split(PTN_STAGE_DELIMITER);
  for (int stage = 0; stage < stages.length; stage++) {
    String stageFileNames = stages[stage].replaceAll(PTN_REMOVE_ESCAPE_CHAR, "");
    NormalizeCharMap stageMap = inform(loader, stageFileNames);
    if (stageMap == null) {
      continue;
    }
    normMapList.add(stageMap);
  }
}
/**
 * Chains one {@link MappingCharFilter} per non-null entry of {@code normMapList}
 * around {@code input}, in list order.
 *
 * @param input the reader to wrap
 * @return the outermost filter, or {@code input} itself when no map applies
 */
@Override
public Reader create(Reader input) {
  Reader chained = input;
  for (NormalizeCharMap charMap : normMapList) {
    if (charMap != null) {
      // each filter reads from the previously built one
      chained = new MappingCharFilter(charMap, chained);
    }
  }
  return chained;
}