Java 类org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter 实例源码

项目:elasticsearch_my    文件:ASCIIFoldingTokenFilterFactory.java   
@Override
public Object getMultiTermComponent() {
    if (preserveOriginal) {
        // Multi-term analysis must not emit the original token alongside the
        // folded one, so hand back a variant with preserveOriginal forced off.
        // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning.
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return ASCIIFoldingTokenFilterFactory.this.name();
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ASCIIFoldingFilter(tokenStream, false);
            }
        };
    }
    // Already folding-only: this factory is safe for multi-term use as-is.
    return this;
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:french-phonetic-analyser    文件:FrenchPhonetic.java   
/**
 * Normalizes a French string for phonetic encoding: upper-cases it,
 * keeps only letters, maps the "sound 2" accented letters to '2',
 * and ASCII-folds the remaining accented letters.
 *
 * @param str input string (may be null or empty)
 * @return the cleaned, ASCII-folded string; null/empty input is returned unchanged
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    int len = str.length();
    String upper = str.toUpperCase(Locale.FRENCH);

    // Keep only letters; accented letters pronounced like "2" become '2'.
    char[] letters = new char[len];
    int count = 0;
    for (int i = 0; i < len; i++) {
        char c = upper.charAt(i);
        if (Character.isLetter(c)) {
            letters[count++] = SOUND_2_ACCENTUATED_CHARS.contains(c) ? '2' : c;
        }
    }
    // foldToASCII may expand a single char into up to 4 output chars
    // (e.g. 'Œ' -> "OE"); sizing the buffer as `count` could overflow.
    char[] folded = new char[count * 4];
    int foldedLen = ASCIIFoldingFilter.foldToASCII(letters, 0, folded, 0, count);
    // Fix: return the folded buffer — the original returned the unfolded
    // input buffer, discarding the fold result entirely.
    return new String(folded, 0, foldedLen);
}
项目:owsi-core-parent    文件:StringUtils.java   
/**
 * Strips accents from a string by folding every non-ASCII character
 * to its closest ASCII equivalent.
 *
 * @param text the string to clean; may be null
 * @return the accent-free string, or null if {@code text} was null
 * @see org.apache.lucene.analysis.ASCIIFoldingFilter
 */
public static String removeAccents(String text) {
    if (text == null) {
        return text;
    }

    final int length = text.length();
    final char[] input = text.toCharArray();

    // Folding can expand each input char to at most 4 output chars;
    // small inputs use a fixed 256-char scratch buffer.
    final int maxSizeNeeded = 4 * length;
    final char[] output = maxSizeNeeded <= 256
            ? new char[256]
            : new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)];

    final int written = ASCIIFoldingFilter.foldToASCII(input, 0, output, 0, length);
    return new String(output, 0, written);
}
项目:search    文件:TestSmartChineseAnalyzer.java   
public void testInvalidOffset() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> word segmentation.
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new WordTokenFilter(new ASCIIFoldingFilter(source));
      return new TokenStreamComponents(source, chain);
    }
  };

  // 'æ' folds to "ae": token text grows, but offsets still span the
  // original 11-character input.
  assertAnalyzesTo(analyzer, "mosfellsbær", 
      new String[] { "mosfellsbaer" },
      new int[]    { 0 },
      new int[]    { 11 });
}
项目:search    文件:NGramTokenFilterTest.java   
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> 2-gram filtering.
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new NGramTokenFilter(new ASCIIFoldingFilter(source), 2, 2);
      return new TokenStreamComponents(source, chain);
    }
  };
  // 'æ' folds to "ae", so n-grams come from a longer string than the
  // input; every gram keeps the full original offset span (0, 11).
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
      new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 },
      new int[]    {     1,   0,    0,    0,    0,    0,    0,    0,    0,    0,    0  });
}
项目:NYBC    文件:TestSmartChineseAnalyzer.java   
public void testInvalidOffset() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> word segmentation.
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new WordTokenFilter(new ASCIIFoldingFilter(source));
      return new TokenStreamComponents(source, chain);
    }
  };

  // 'æ' folds to "ae": token text grows, but offsets still span the
  // original 11-character input.
  assertAnalyzesTo(analyzer, "mosfellsbær", 
      new String[] { "mosfellsbaer" },
      new int[]    { 0 },
      new int[]    { 11 });
}
项目:t3as-snomedct-service    文件:MetaMap.java   
/**
 * Decomposes non-7bit-ASCII (Unicode Basic Latin) characters of a string
 * into 7bit ASCII where possible; e.g. 'âåäöốở' becomes 'aaaooo'.
 * Some complicated characters (e.g. 'µ') are not handled, and a few fold
 * into two output characters. A library such as
 * http://www.ippatsuman.com/projects/junidecode/ might cover more cases.
 */
public static String decomposeToAscii(final String s) {
    /* A pure-java alternative (Normalizer.normalize(s, NFD) followed by
       stripping \p{InCombiningDiacriticalMarks}) does not work all the
       time, so Lucene's folding table is used instead. */
    final char[] input = s.toCharArray();
    // Folding may expand a single char into up to four output chars.
    final char[] folded = new char[input.length * 4];
    final int foldedLen = ASCIIFoldingFilter.foldToASCII(input, 0, folded, 0, input.length);

    // Keep only printable US-ASCII (32..126) plus newlines.
    final StringBuilder out = new StringBuilder(foldedLen);
    for (int i = 0; i < foldedLen; i++) {
        final char c = folded[i];
        if (('\n' == c) || (c >= 32 && c <= 126)) {
            out.append(c);
        }
    }
    return out.toString();
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestSmartChineseAnalyzer.java   
public void testInvalidOffset() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> word segmentation.
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new WordTokenFilter(new ASCIIFoldingFilter(source));
      return new TokenStreamComponents(source, chain);
    }
  };

  // 'æ' folds to "ae": token text grows, but offsets still span the
  // original 11-character input.
  assertAnalyzesTo(analyzer, "mosfellsbær", 
      new String[] { "mosfellsbaer" },
      new int[]    { 0 },
      new int[]    { 11 });
}
项目:Maskana-Gestor-de-Conocimiento    文件:NGramTokenFilterTest.java   
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> 2-gram filtering.
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new NGramTokenFilter(TEST_VERSION_CURRENT, new ASCIIFoldingFilter(source), 2, 2);
      return new TokenStreamComponents(source, chain);
    }
  };
  // 'æ' folds to "ae", so n-grams come from a longer string than the
  // input; every gram keeps the full original offset span (0, 11).
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
      new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 },
      new int[]    {     1,   0,    0,    0,    0,    0,    0,    0,    0,    0,    0  });
}
项目:search    文件:EdgeNGramTokenFilterTest.java   
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> front edge n-grams (2..15).
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new EdgeNGramTokenFilter(Version.LUCENE_4_3, new ASCIIFoldingFilter(source), EdgeNGramTokenFilter.Side.FRONT, 2, 15);
      return new TokenStreamComponents(source, chain);
    }
  };
  // 'æ' folds to "ae", so grams come from the 12-char folded form while
  // offsets keep spanning the original 11-character input.
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
      new int[]    {    0,     0,      0,       0,        0,         0,          0,           0,            0,             0,              0 },
      new int[]    {   11,    11,     11,      11,       11,        11,         11,          11,           11,            11,             11 });
}
项目:javaee-lab    文件:DefaultLuceneQueryBuilder.java   
/**
 * Applies the same filtering as the "custom" analyzer so fuzzy queries
 * match analyzed terms. Lowercasing is handled by the QueryParser.
 *
 * @param word word to fold
 * @return ASCII-folded word
 */
private String escapeForFuzzy(String word) {
    // foldToASCII may expand each char into up to 4 output chars.
    final char[] folded = new char[word.length() * 4];
    final int foldedLength =
            ASCIIFoldingFilter.foldToASCII(word.toCharArray(), 0, folded, 0, word.length());
    return new String(folded, 0, foldedLength);
}
项目:InstaTrie    文件:StringWordSplitter.java   
/**
 * Calculates a mapping from positions in the ASCII-folded output string
 * back to positions in the input string.
 * 
 * @param inputStr
 *            input string
 * @param html
 *            if true, treat HTML tags as gaps in the position map
 *            (tag characters are skipped and produce no output positions)
 * @return array of positions. The index represents the positions in the
 *         output string and the value is the corresponding position in the
 *         input string
 */
private static int[] getPositionMap(String inputStr, boolean html) {
    char[] input = inputStr.toCharArray();
    int length = input.length;

    // Worst-case length required:
    final int maxSizeNeeded = 4 * length;
    char[] output = new char[maxSizeNeeded];
    int[] posTransitions = new int[maxSizeNeeded];

    int outputPos = 0;

    // Pre-scan for HTML tags: start position -> tag length.
    Map<Integer, Integer> htmlTagLength = new Hashtable<>();
    if (html) {
        Matcher m = SIMPLE_TAG_PATTERN.matcher(inputStr);
        while (m.find()) {
            int mPos = m.start();
            int mLen = m.end() - m.start();
            htmlTagLength.put(mPos, mLen);
        }
    }

    for (int inputPos = 0; inputPos < length; inputPos++) {
        int nextInputPos = inputPos + 1;
        if (htmlTagLength.containsKey(inputPos)) {
            // Cursor is on HTML tag - skip over it
            int htmlLen = htmlTagLength.get(inputPos);
            inputPos += htmlLen - 1;
            continue;
        }
        // Fold exactly one input char; it may emit 0..4 output chars, so
        // foldToASCII returns the new end position of the output buffer.
        int nextOutputPos = ASCIIFoldingFilter.foldToASCII(input, inputPos, output, outputPos, 1);
        // Every output position produced by this input char maps back to
        // the position *after* it (transition point), matching the
        // "transition" semantics described in the javadoc.
        for (int curOutputPos = outputPos + 1; curOutputPos <= nextOutputPos; curOutputPos++) {
            posTransitions[curOutputPos] = nextInputPos;
        }
        outputPos = nextOutputPos;
    }

    // NOTE(review): trailing entries of posTransitions stay 0 when the
    // output is shorter than maxSizeNeeded — callers presumably bound
    // reads by the folded length; verify against call sites.
    return posTransitions;
}
项目:NYBC    文件:EdgeNGramTokenFilterTest.java   
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> front edge n-grams (2..15).
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new EdgeNGramTokenFilter(new ASCIIFoldingFilter(source), EdgeNGramTokenFilter.Side.FRONT, 2, 15);
      return new TokenStreamComponents(source, chain);
    }
  };
  // 'æ' folds to "ae", so grams come from the 12-char folded form while
  // offsets keep spanning the original 11-character input.
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
      new int[]    {    0,     0,      0,       0,        0,         0,          0,           0,            0,             0,              0 },
      new int[]    {   11,    11,     11,      11,       11,        11,         11,          11,           11,            11,             11 });
}
项目:NYBC    文件:NGramTokenFilterTest.java   
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> 2-gram filtering.
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new NGramTokenFilter(new ASCIIFoldingFilter(source), 2, 2);
      return new TokenStreamComponents(source, chain);
    }
  };
  // 'æ' folds to "ae", so n-grams come from a longer string than the
  // input; every gram keeps the full original offset span (0, 11).
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
      new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 });
}
项目:lumongo    文件:LumongoSegment.java   
/** ASCII-folds the given text (e.g. accented chars to their ASCII base form). */
private static String getFoldedString(String text) {
    // Folding may expand each char into up to 4 output chars.
    final char[] source = text.toCharArray();
    final char[] folded = new char[source.length * 4];
    final int foldedLength = ASCIIFoldingFilter.foldToASCII(source, 0, folded, 0, source.length);
    return new String(folded, 0, foldedLength);
}
项目:elasticsearch-analysis-edgengram2    文件:EdgeNGram2TokenFilterTest.java   
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // Whitespace tokenization -> ASCII folding -> front edge n-grams (2..15).
            Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            TokenFilter chain = new EdgeNGramTokenFilter(new ASCIIFoldingFilter(source), EdgeNGramTokenFilter.Side.FRONT, 2, 15);
            return new TokenStreamComponents(source, chain);
        }
    };
    // 'æ' folds to "ae", so grams come from the 12-char folded form while
    // offsets keep spanning the original 11-character input.
    assertAnalyzesTo(analyzer, "mosfellsbær",
            new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
            new int[]    {    0,     0,      0,       0,        0,         0,          0,           0,            0,             0,              0 },
            new int[]    {   11,    11,     11,      11,       11,        11,         11,          11,           11,            11,             11 });
}
项目:Maskana-Gestor-de-Conocimiento    文件:EdgeNGramTokenFilterTest.java   
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // Whitespace tokenization -> ASCII folding -> front edge n-grams (2..15).
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new EdgeNGramTokenFilter(Version.LUCENE_43, new ASCIIFoldingFilter(source), EdgeNGramTokenFilter.Side.FRONT, 2, 15);
      return new TokenStreamComponents(source, chain);
    }
  };
  // 'æ' folds to "ae", so grams come from the 12-char folded form while
  // offsets keep spanning the original 11-character input.
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
      new int[]    {    0,     0,      0,       0,        0,         0,          0,           0,            0,             0,              0 },
      new int[]    {   11,    11,     11,      11,       11,        11,         11,          11,           11,            11,             11 });
}
项目:elasticsearch_my    文件:ASCIIFoldingTokenFilterFactory.java   
@Override
public TokenStream create(TokenStream tokenStream) {
    // Wrap the stream in an ASCIIFoldingFilter; preserveOriginal controls
    // whether the unfolded token is also emitted alongside the folded one.
    return new ASCIIFoldingFilter(tokenStream, preserveOriginal);
}
项目:lams    文件:ASCIIFoldingFilterFactory.java   
@Override
public ASCIIFoldingFilter create(TokenStream input) {
  // preserveOriginal (factory setting) controls whether the unfolded
  // token is emitted alongside the folded one.
  return new ASCIIFoldingFilter(input, preserveOriginal);
}
项目:Elasticsearch    文件:ASCIIFoldingTokenFilterFactory.java   
@Override
public TokenStream create(TokenStream tokenStream) {
    // Wrap the stream in an ASCIIFoldingFilter; preserveOriginal controls
    // whether the unfolded token is also emitted alongside the folded one.
    return new ASCIIFoldingFilter(tokenStream, preserveOriginal);
}
项目:search    文件:ASCIIFoldingFilterFactory.java   
@Override
public ASCIIFoldingFilter create(TokenStream input) {
  // preserveOriginal (factory setting) controls whether the unfolded
  // token is emitted alongside the folded one.
  return new ASCIIFoldingFilter(input, preserveOriginal);
}
项目:NYBC    文件:ASCIIFoldingFilterFactory.java   
@Override
public ASCIIFoldingFilter create(TokenStream input) {
  // Single-arg constructor: folded tokens only, originals are not preserved.
  return new ASCIIFoldingFilter(input);
}
项目:read-open-source-code    文件:ASCIIFoldingFilterFactory.java   
@Override
public ASCIIFoldingFilter create(TokenStream input) {
  // preserveOriginal (factory setting) controls whether the unfolded
  // token is emitted alongside the folded one.
  return new ASCIIFoldingFilter(input, preserveOriginal);
}
项目:read-open-source-code    文件:ASCIIFoldingFilterFactory.java   
@Override
public ASCIIFoldingFilter create(TokenStream input) {
  // preserveOriginal (factory setting) controls whether the unfolded
  // token is emitted alongside the folded one.
  return new ASCIIFoldingFilter(input, preserveOriginal);
}
项目:read-open-source-code    文件:ASCIIFoldingFilterFactory.java   
@Override
public ASCIIFoldingFilter create(TokenStream input) {
  // preserveOriginal (factory setting) controls whether the unfolded
  // token is emitted alongside the folded one.
  return new ASCIIFoldingFilter(input, preserveOriginal);
}
项目:AGDISTIS    文件:LiteralAnalyzer.java   
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
    return new TokenStreamComponents(source, new ASCIIFoldingFilter(source));

}
项目:Maskana-Gestor-de-Conocimiento    文件:ASCIIFoldingFilterFactory.java   
@Override
public ASCIIFoldingFilter create(TokenStream input) {
  // Single-arg constructor: folded tokens only, originals are not preserved.
  return new ASCIIFoldingFilter(input);
}