@Override
public Object getMultiTermComponent() {
    if (preserveOriginal == false) {
        return this;
    } else {
        // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return ASCIIFoldingTokenFilterFactory.this.name();
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ASCIIFoldingFilter(tokenStream, false);
            }
        };
    }
}
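The factory above hands multi-term queries a variant with preserveOriginal disabled because the flag makes the filter emit two tokens per folded term, which multi-term expansion cannot handle. A minimal sketch of that double emission, assuming Lucene 5+ (where a Tokenizer takes its Reader via setReader) and the standard org.apache.lucene.analysis.miscellaneous package:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PreserveOriginalDemo {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("café"));
        // preserveOriginal = true: the filter emits the folded token first,
        // then the original token at the same position
        TokenStream stream = new ASCIIFoldingFilter(tokenizer, true);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // prints "cafe", then "café"
        }
        stream.end();
        stream.close();
    }
}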
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);
    char[] chars = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        if (Character.isLetter(upper.charAt(i))) {
            if (SOUND_2_ACCENTUATED_CHARS.contains(upper.charAt(i))) {
                chars[count++] = '2';
            } else {
                chars[count++] = upper.charAt(i);
            }
        }
    }
    // foldToASCII may emit up to 4 output chars per input char
    char[] res = new char[count * 4];
    int finalSize = ASCIIFoldingFilter.foldToASCII(chars, 0, res, 0, count);
    // return the folded buffer, not the pre-fold input
    return new String(res, 0, finalSize);
}
/**
 * Removes accents from a string.
 *
 * @param text the string to clean
 * @return the string without accents
 * @see org.apache.lucene.analysis.ASCIIFoldingFilter
 */
public static String removeAccents(String text) {
    if (text == null) {
        return text;
    }
    int length = text.length();
    char[] input = text.toCharArray();
    char[] output = new char[256];
    // Worst-case length required: foldToASCII may emit up to 4 chars per input char
    final int maxSizeNeeded = 4 * length;
    if (output.length < maxSizeNeeded) {
        output = new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)];
    }
    int outputPos = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, length);
    return new String(output, 0, outputPos);
}
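The heavy lifting in removeAccents is the static ASCIIFoldingFilter.foldToASCII call, which can be used directly without building a token stream. A self-contained sketch (the package name assumes Lucene 4+):

import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

public class FoldDemo {
    public static void main(String[] args) {
        String text = "déjà vu";
        char[] input = text.toCharArray();
        // worst case: each input char folds to up to 4 output chars
        char[] output = new char[input.length * 4];
        int outLen = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, input.length);
        System.out.println(new String(output, 0, outLen)); // prints "deja vu"
    }
}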
public void testInvalidOffset() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new WordTokenFilter(filters);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };

    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mosfellsbaer" },
        new int[] { 0 },
        new int[] { 11 });
}
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new NGramTokenFilter(filters, 2, 2);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };

    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
        new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
}
/**
 * Takes a Unicode string and tries to decompose characters outside the 7-bit
 * ASCII range (the Unicode Basic Latin block) into 7-bit ASCII equivalents.
 * For example, the string 'âåäöốở' is turned into 'aaaooo'.
 * Note that it doesn't always succeed for some of the more complicated
 * characters (e.g. 'µ'), and occasionally a single character ends up as two
 * characters when the ASCIIFoldingFilter is used.
 * Perhaps we want to adopt this library:
 * http://www.ippatsuman.com/projects/junidecode/
 */
public static String decomposeToAscii(final String s) {
    /* pure Java version, doesn't work all the time:
    String normalized = Normalizer.normalize(s, Normalizer.Form.NFD);
    return normalized.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    */
    // this works in more cases
    final char[] input = new char[s.length()];
    s.getChars(0, s.length(), input, 0);
    final char[] output = new char[input.length * 4];
    final int numChars = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, input.length);
    // now remove anything not in the printable US-ASCII range, but keep newlines
    final StringBuilder sb = new StringBuilder(numChars);
    for (int i = 0; i < numChars; i++) {
        final char c = output[i];
        // printable US-ASCII is from 32 to 126
        if ((32 <= c && c <= 126) || '\n' == c) {
            sb.append(c);
        }
    }
    return sb.toString();
}
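To see why the commented-out Normalizer approach falls short, compare it with foldToASCII on a character like 'ø', which has no canonical decomposition and therefore survives NFD untouched. A small sketch (same Lucene package assumption as above):

import java.text.Normalizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

public class NfdVsFoldDemo {
    public static void main(String[] args) {
        String s = "Søren";
        // NFD only splits off combining marks; 'ø' is a single letter with
        // no decomposition, so the pure-Java version leaves it alone:
        String nfd = Normalizer.normalize(s, Normalizer.Form.NFD)
                .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
        System.out.println(nfd); // prints "Søren"

        // foldToASCII uses an explicit mapping table and knows 'ø' -> 'o':
        char[] output = new char[s.length() * 4];
        int outLen = ASCIIFoldingFilter.foldToASCII(s.toCharArray(), 0, output, 0, s.length());
        System.out.println(new String(output, 0, outLen)); // prints "Soren"
    }
}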
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };

    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
        new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
}
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new EdgeNGramTokenFilter(Version.LUCENE_4_3, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };

    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
/**
 * Applies the same filtering as the "custom" analyzer. Lowercasing is done by
 * the QueryParser for fuzzy search.
 *
 * @param word the word to escape
 * @return the escaped word
 */
private String escapeForFuzzy(String word) {
    int length = word.length();
    char[] tmp = new char[length * 4];
    length = ASCIIFoldingFilter.foldToASCII(word.toCharArray(), 0, tmp, 0, length);
    return new String(tmp, 0, length);
}
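A usage sketch for this helper: folding the raw user term before appending Lucene's fuzzy operator keeps the query term consistent with a folded index. The surrounding class and the query syntax shown are assumptions for illustration, not part of the snippet:

import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

public class FuzzyEscapeDemo {
    // same folding as the snippet above
    private static String escapeForFuzzy(String word) {
        char[] tmp = new char[word.length() * 4];
        int length = ASCIIFoldingFilter.foldToASCII(word.toCharArray(), 0, tmp, 0, word.length());
        return new String(tmp, 0, length);
    }

    public static void main(String[] args) {
        // "café~" would not match an index of folded terms; "cafe~" does
        System.out.println(escapeForFuzzy("café") + "~"); // prints "cafe~"
    }
}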
/**
 * Calculates the mapping of positions in the transformed string back to
 * positions in the input string.
 *
 * @param inputStr the input string
 * @param html if true, treat HTML tags as gaps in the position map and skip over them
 * @return array of positions; the index is a position in the output string
 *         and the value is the corresponding position in the input string
 */
private static int[] getPositionMap(String inputStr, boolean html) {
    char[] input = inputStr.toCharArray();
    int length = input.length;
    // Worst-case length required: foldToASCII may emit up to 4 chars per input char
    final int maxSizeNeeded = 4 * length;
    char[] output = new char[maxSizeNeeded];
    int[] posTransitions = new int[maxSizeNeeded];
    int outputPos = 0;
    Map<Integer, Integer> htmlTagLength = new Hashtable<>();
    if (html) {
        Matcher m = SIMPLE_TAG_PATTERN.matcher(inputStr);
        while (m.find()) {
            int mPos = m.start();
            int mLen = m.end() - m.start();
            htmlTagLength.put(mPos, mLen);
        }
    }
    for (int inputPos = 0; inputPos < length; inputPos++) {
        int nextInputPos = inputPos + 1;
        if (htmlTagLength.containsKey(inputPos)) {
            // Cursor is on an HTML tag - skip over it
            int htmlLen = htmlTagLength.get(inputPos);
            inputPos += htmlLen - 1;
            continue;
        }
        // Fold a single input character; it may expand to several output characters
        int nextOutputPos = ASCIIFoldingFilter.foldToASCII(input, inputPos, output, outputPos, 1);
        for (int curOutputPos = outputPos + 1; curOutputPos <= nextOutputPos; curOutputPos++) {
            posTransitions[curOutputPos] = nextInputPos;
        }
        outputPos = nextOutputPos;
    }
    return posTransitions;
}
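The loop above folds one character at a time because a single input character can expand to several output characters, which is exactly what shifts the positions. A short illustration of that expansion:

import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

public class ExpansionDemo {
    public static void main(String[] args) {
        char[] input = "bær".toCharArray(); // 3 input chars
        char[] output = new char[input.length * 4];
        int outLen = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, input.length);
        System.out.println(new String(output, 0, outLen)); // prints "baer"
        System.out.println(outLen); // prints 4: 'æ' folded to "ae"
        // Both 'a' and 'e' in the output come from the single 'æ' in the input;
        // that many-to-one relationship is what getPositionMap records.
    }
}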
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new EdgeNGramTokenFilter(filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };

    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new NGramTokenFilter(filters, 2, 2);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };

    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
private static String getFoldedString(String text) {
    char[] textChar = text.toCharArray();
    char[] output = new char[textChar.length * 4];
    int outputPos = ASCIIFoldingFilter.foldToASCII(textChar, 0, output, 0, textChar.length);
    return new String(output, 0, outputPos);
}
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };

    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
@Override
public TokenStream create(TokenStream tokenStream) {
    return new ASCIIFoldingFilter(tokenStream, preserveOriginal);
}
@Override
public ASCIIFoldingFilter create(TokenStream input) {
    return new ASCIIFoldingFilter(input, preserveOriginal);
}
@Override
public ASCIIFoldingFilter create(TokenStream input) {
    return new ASCIIFoldingFilter(input);
}
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
    return new TokenStreamComponents(source, new ASCIIFoldingFilter(source));
}
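A hypothetical driver for an analyzer built like the snippet above, assuming a Lucene 4.x setup (matchVersion-style constructors and the Reader-based tokenStream overload); LowerCaseTokenizer splits on non-letters and lowercases, so "Fjörd" comes out as a single folded token:

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class FoldingAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                // Version.LUCENE_43 is an assumed matchVersion for this sketch
                Tokenizer source = new LowerCaseTokenizer(Version.LUCENE_43, reader);
                return new TokenStreamComponents(source, new ASCIIFoldingFilter(source));
            }
        };
        TokenStream ts = analyzer.tokenStream("field", new StringReader("Fjörd"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // prints "fjord"
        }
        ts.end();
        ts.close();
    }
}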