/** * parses a list of MappingCharFilter style rules into a custom byte[] type table */ static byte[] parseTypes(Collection<String> rules) { SortedMap<Character, Byte> typeMap = new TreeMap<>(); for (String rule : rules) { Matcher m = typePattern.matcher(rule); if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]"); String lhs = parseString(m.group(1).trim()); Byte rhs = parseType(m.group(2).trim()); if (lhs.length() != 1) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); typeMap.put(lhs.charAt(0), rhs); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; for (int i = 0; i < types.length; i++) types[i] = WordDelimiterIterator.getType(i); for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) types[mapping.getKey()] = mapping.getValue(); return types; }
/** * parses a list of MappingCharFilter style rules into a custom byte[] type table */ private byte[] parseTypes(Collection<String> rules) { SortedMap<Character, Byte> typeMap = new TreeMap<>(); for (String rule : rules) { Matcher m = typePattern.matcher(rule); if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]"); String lhs = parseString(m.group(1).trim()); Byte rhs = parseType(m.group(2).trim()); if (lhs.length() != 1) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); typeMap.put(lhs.charAt(0), rhs); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; for (int i = 0; i < types.length; i++) types[i] = WordDelimiterIterator.getType(i); for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) types[mapping.getKey()] = mapping.getValue(); return types; }
private byte[] parseTypes(List<String> rules) { SortedMap<Character,Byte> typeMap = new TreeMap<Character,Byte>(); for( String rule : rules ){ Matcher m = typePattern.matcher(rule); if( !m.find() ) throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]"); String lhs = parseString(m.group(1).trim()); Byte rhs = parseType(m.group(2).trim()); if (lhs.length() != 1) throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); if (rhs == null) throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); typeMap.put(lhs.charAt(0), rhs); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance byte types[] = new byte[Math.max(typeMap.lastKey()+1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; for (int i = 0; i < types.length; i++) types[i] = WordDelimiterIterator.getType(i); for (Map.Entry<Character,Byte> mapping : typeMap.entrySet()) types[mapping.getKey()] = mapping.getValue(); return types; }
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }
@Inject public WordDelimiterTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.charTypeTable = parseTypes(charTypeTableValues); } int flags = 0; // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If set, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If set, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; }
private byte[] parseTypes(List<String> rules) { SortedMap<Character, Byte> typeMap = new TreeMap<>(); for (String rule : rules) { Matcher m = typePattern.matcher(rule); if (!m.find()) { throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]"); } String lhs = parseString(m.group(1).trim()); Byte rhs = parseType(m.group(2).trim()); if (lhs.length() != 1) { throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); } if (rhs == null) { throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); } typeMap.put(lhs.charAt(0), rhs); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; for (int i = 0; i < types.length; i++) { types[i] = WordDelimiterIterator.getType(i); } for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) { types[mapping.getKey()] = mapping.getValue(); } return types; }
private byte[] parseTypes(List<String> rules) { SortedMap<Character, Byte> typeMap = newTreeMap(); for (String rule : rules) { Matcher m = typePattern.matcher(rule); if (!m.find()) { throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]"); } String lhs = parseString(m.group(1).trim()); Byte rhs = parseType(m.group(2).trim()); if (lhs.length() != 1) { throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); } if (rhs == null) { throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); } typeMap.put(lhs.charAt(0), rhs); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; for (int i = 0; i < types.length; i++) { types[i] = WordDelimiterIterator.getType(i); } for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) { types[mapping.getKey()] = mapping.getValue(); } return types; }
@Override public WordDelimiterFilter create(TokenStream input) { return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable, flags, protectedWords); }
public WordDelimiterFilter2Factory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(environment, settings, "type_table"); if (charTypeTableValues == null) { this.typeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.typeTable = parseTypes(charTypeTableValues); } // If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If 1, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If 1, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If 1, causes generated subwords to stick at the same position, they otherwise take a new position flags |= getFlag(ALL_PARTS_AT_SAME_POSITION, settings, "all_parts_at_same_position", false); // If not null is the set of tokens to protect from being delimited List<String> protoWords = Analysis.getWordList(environment, settings, "protected_words"); protectedWords = protoWords == null ? null : new HashSet<>(protoWords); }
@Inject public WordDelimiterFilter2Factory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); // Sample Format for the type table: // $ => DIGIT // % => DIGIT // . => DIGIT // \u002C => DIGIT // \u200D => ALPHANUM List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table"); if (charTypeTableValues == null) { this.typeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; } else { this.typeTable = parseTypes(charTypeTableValues); } // If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot" flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true); // If 1, causes number subwords to be generated: "500-42" => "500" "42" flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true); // If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi" flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false); // If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042" flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false); // If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false); // If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true); // If 1, causes "j2se" to be three tokens; "j" "2" "se" flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true); // If 1, includes original words in subwords: "500-42" => "500" "42" "500-42" flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false); // If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If 1, causes generated subwords to stick at the same position, they otherwise take a new position flags |= getFlag(ALL_PARTS_AT_SAME_POSITION, settings, "all_parts_at_same_position", false); // If not null is the set of tokens to protect from being delimited List<String> protoWords = Analysis.getWordList(env, settings, "protected_words"); protectedWords = protoWords == null ? null : newHashSet(protoWords); }