Java 类java.lang.Character.UnicodeBlock 实例源码

项目:FlexibleRichTextView    文件:DefaultTeXFont.java   
/**
 * Loads the TeX font description for a group of alphabets, unless at least
 * one of them has already been registered in {@code loadedAlphabets}.
 *
 * <p>{@code TeXParser.isLoading} is raised while the description is being
 * read and is always lowered again, even when loading fails with an
 * exception, so that a failed load cannot leave the parser stuck in
 * "loading" mode.
 *
 * @param base     passed through unchanged to {@code addTeXFontDescription}
 * @param alphabet the Unicode blocks covered by the font description
 * @param language asset path of the font description to open
 * @throws ResourceParseException propagated from {@code addTeXFontDescription}
 * @throws IOException            if the asset cannot be opened or read
 */
private static void addAlphabet(Object base,
        Character.UnicodeBlock[] alphabet, String language)
        throws ResourceParseException, IOException {
    // Nothing to do as soon as one of the blocks is known to be loaded.
    for (Character.UnicodeBlock block : alphabet) {
        if (loadedAlphabets.contains(block)) {
            return;
        }
    }

    TeXParser.isLoading = true;
    try {
        addTeXFontDescription(base,
                AjLatexMath.getAssetManager().open(language), language);
        for (Character.UnicodeBlock block : alphabet) {
            loadedAlphabets.add(block);
        }
    } finally {
        // Reset the flag on both the normal and the exceptional path.
        TeXParser.isLoading = false;
    }
}
项目:bibliome-java-utils    文件:Strings.java   
/**
 * Strips diacritical marks from a string.
 *
 * <p>The input is first decomposed (NFD); every character that belongs to the
 * "Combining Diacritical Marks" or "Combining Diacritical Marks Supplement"
 * block is then dropped. The output buffer is only allocated once the first
 * mark is found, so strings without marks come back as the decomposed string.
 *
 * @param s the string to process
 * @return the NFD-normalized form of {@code s} without combining diacritical marks
 */
public static final String removeDiacritics(String s) {
    final String decomposed = Normalizer.normalize(s, Form.NFD);
    final int length = decomposed.length();
    StringBuilder stripped = null;
    for (int pos = 0; pos < length; ++pos) {
        final char current = decomposed.charAt(pos);
        final UnicodeBlock block = UnicodeBlock.of(current);
        final boolean diacritic = block == UnicodeBlock.COMBINING_DIACRITICAL_MARKS
                || block == UnicodeBlock.COMBINING_DIACRITICAL_MARKS_SUPPLEMENT;
        if (!diacritic) {
            // Only copy once a buffer exists; before that the prefix is still shared.
            if (stripped != null) {
                stripped.append(current);
            }
            continue;
        }
        if (stripped == null) {
            // First mark found: start the buffer with everything seen so far.
            stripped = new StringBuilder(length).append(decomposed, 0, pos);
        }
    }
    return stripped == null ? decomposed : stripped.toString();
}
项目:jasperreports    文件:JRPdfExporter.java   
/**
 * Rebuilds {@code glyphRendererBlocks} from the report properties.
 *
 * <p>Every property under {@code PROPERTY_PREFIX_GLYPH_RENDERER_BLOCKS} holds a
 * comma-separated list of block names; each name that
 * {@link #resolveUnicodeBlock(String)} can resolve is added to the set.
 */
protected void initGlyphRenderer() 
{
    glyphRendererBlocks = new HashSet<Character.UnicodeBlock>();
    List<PropertySuffix> blockProperties = propertiesUtil.getAllProperties(
            getCurrentJasperPrint(),
            PdfReportConfiguration.PROPERTY_PREFIX_GLYPH_RENDERER_BLOCKS);
    for (PropertySuffix blockProperty : blockProperties)
    {
        String[] blockNames = blockProperty.getValue().split(",");
        for (String blockName : blockNames)
        {
            UnicodeBlock resolved = resolveUnicodeBlock(blockName);
            if (resolved == null)
            {
                // blank or unknown name
                continue;
            }

            if (log.isDebugEnabled())
            {
                log.debug("glyph renderer block " + resolved);
            }
            glyphRendererBlocks.add(resolved);
        }
    }
}
项目:jasperreports    文件:JRPdfExporter.java   
/**
 * Looks up a Unicode block by name, ignoring surrounding whitespace.
 *
 * @param name the block name as accepted by {@link UnicodeBlock#forName(String)}
 * @return the matching block, or {@code null} when the name is blank or unknown
 *         (an unknown name is also logged as a warning)
 */
protected UnicodeBlock resolveUnicodeBlock(String name)
{
    String trimmedName = name.trim();
    if (trimmedName.isEmpty())
    {
        return null;
    }

    UnicodeBlock resolved = null;
    try 
    {
        resolved = UnicodeBlock.forName(trimmedName);
    } 
    catch (IllegalArgumentException e) 
    {
        // the warning reports the name exactly as it was passed in
        log.warn("Could not resolve \"" + name + "\" to a Unicode block");
    } 
    return resolved;
}
项目:KOMORAN    文件:KoreanUnitParser.java   
/**
 * Splits a string into units and tags each unit with its type.
 *
 * <p>A character from the Hangul Syllables block is decomposed arithmetically
 * (offset from U+AC00, 21 medials x 28 finals per initial) into an entry from
 * {@code ChoSung}, an entry from {@code JungSung} and, when the final index is
 * non-zero, an entry from {@code JongSung}. Any other character is emitted
 * unchanged with type {@code OTHER}.
 *
 * @param str the text to decompose
 * @return the units in input order, each paired with its {@code UnitType}
 */
public List<Pair<Character, UnitType>> parseWithType(String str) {
    List<Pair<Character, UnitType>> units = new ArrayList<>();

    for (char ch : str.toCharArray()) {
        if (Character.UnicodeBlock.of(ch) != UnicodeBlock.HANGUL_SYLLABLES) {
            units.add(new Pair<>(ch, UnitType.OTHER));
            continue;
        }

        int syllableIndex = ch - 0xAC00;
        int cho = syllableIndex / (21 * 28);
        int withinInitial = syllableIndex % (21 * 28);
        int jung = withinInitial / 28;
        int jong = withinInitial % 28;

        units.add(new Pair<>(ChoSung[cho], UnitType.CHOSUNG));
        units.add(new Pair<>(JungSung[jung], UnitType.JUNGSUNG));
        // index 0 means the syllable has no final
        if (jong != 0) {
            units.add(new Pair<>(JongSung[jong], UnitType.JONGSUNG));
        }
    }
    return units;
}
项目:textokit-core    文件:WordUtils.java   
/**
 * Decides whether a token looks like a Russian word by inspecting its last letter.
 *
 * @param token the token to classify
 * @return {@code true} if the token contains at least one letter and the last
 *         letter belongs to the Cyrillic block; {@code false} otherwise
 */
public static boolean isRussianWord(String token) {
    // Walk backwards; the first letter met is the last letter of the token.
    for (int pos = token.length() - 1; pos >= 0; pos--) {
        char candidate = token.charAt(pos);
        if (Character.isLetter(candidate)) {
            return UnicodeBlock.of(candidate).equals(UnicodeBlock.CYRILLIC);
        }
    }
    // empty token, or no letter at all
    return false;
}
项目:Android-Studio-Translate-Tool    文件:WordsTransfer.java   
/**
 * Converts a string into an ASCII-only form using backslash-u escapes.
 *
 * <ul>
 * <li>Basic Latin characters are copied unchanged.</li>
 * <li>Full-width ASCII variants (U+FF01 to U+FF5E) are folded to their ASCII
 * counterparts by subtracting 0xFEE0. Other members of the Halfwidth and
 * Fullwidth Forms block have no ASCII counterpart and are escaped.</li>
 * <li>Every other character is written as a backslash, the letter u and exactly
 * four lower-case hex digits.</li>
 * </ul>
 *
 * <p>The escape is built from the unsigned char value, so characters at or
 * above U+8000 no longer pick up a sign-extended "ffff" prefix, and a literal
 * "ffff" in the input is no longer stripped from the result.
 *
 * @param inStr the text to convert
 * @return the converted text
 */
public static String utf8ToUnicode(String inStr) {
    StringBuilder sb = new StringBuilder(inStr.length());
    for (int i = 0; i < inStr.length(); i++) {
        char c = inStr.charAt(i);
        if (UnicodeBlock.of(c) == UnicodeBlock.BASIC_LATIN) {
            sb.append(c);
        } else if (c >= 0xFF01 && c <= 0xFF5E) {
            // full-width ASCII variant -> plain ASCII
            sb.append((char) (c - 0xFEE0));
        } else {
            String hex = Integer.toHexString(c);
            sb.append("\\u");
            // pad to the four digits a unicode escape requires
            for (int pad = hex.length(); pad < 4; pad++) {
                sb.append('0');
            }
            sb.append(hex);
        }
    }
    return sb.toString();
}
项目:donkirkby    文件:CharacterClassifier.java   
/**
     * Tells whether a character is a Chinese ideograph.
     *
     * <p>Only the CJK Unified Ideographs block is accepted. Extension,
     * compatibility, radical and punctuation blocks are deliberately not
     * treated as Chinese here.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} lies in the CJK Unified Ideographs block
     */
    public boolean isChinese(char c) {
        // Direct comparison: no need to build a one-element set on every call.
        return UnicodeBlock.of(c) == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
    }
项目:mathosphere    文件:UnicodeUtils.java   
/**
 * Normalizes a single code point to a string.
 *
 * <p>Code points from the Mathematical Alphanumeric Symbols block go through
 * {@code normalizeMath}, those from the Letterlike Symbols block through
 * {@code normalizeLetterLike}; everything else is converted with
 * {@code codePointToString}.
 *
 * @param codePoint the code point to normalize
 * @return the normalized text, or an empty string for an invalid code point
 */
public static String normalizeCharacter(int codePoint) {
  if (Character.isValidCodePoint(codePoint)) {
    // TODO: long search? maybe replace with own implementation
    UnicodeBlock containingBlock = Character.UnicodeBlock.of(codePoint);

    if (containingBlock == Character.UnicodeBlock.MATHEMATICAL_ALPHANUMERIC_SYMBOLS) {
      return normalizeMath(codePoint);
    } else if (containingBlock == Character.UnicodeBlock.LETTERLIKE_SYMBOLS) {
      return normalizeLetterLike(codePoint);
    }
    return codePointToString(codePoint);
  }

  return "";
}
项目:sqlapp    文件:UnicodeBlockTokenizer.java   
/**
 * Splits text into maximal runs of characters that share a Unicode block.
 *
 * <p>The text is scanned by code point, so a supplementary character
 * (encoded as a surrogate pair) is classified by its real block and is never
 * cut in half. Characters without any block form runs of their own, including
 * at the very start of the text.
 *
 * @param text the text to split
 * @return the runs in input order; empty for empty text
 */
@Override
public List<String> tokenize(String text) {
    List<String> tokens = new LinkedList<>();
    int tokenStart = -1;
    UnicodeBlock current = null;
    int pos = 0;
    while (pos < text.length()) {
        int codePoint = text.codePointAt(pos);
        UnicodeBlock block = UnicodeBlock.of(codePoint);
        // tokenStart < 0 covers a first character whose block is null
        if (tokenStart < 0 || block != current) {
            if (tokenStart >= 0) {
                tokens.add(text.substring(tokenStart, pos));
            }
            tokenStart = pos;
            current = block;
        }
        pos += Character.charCount(codePoint);
    }
    if (tokenStart >= 0) {
        tokens.add(text.substring(tokenStart));
    }
    return tokens;
}
项目:sqlapp    文件:CharTypeValidator.java   
/**
 * Character class check: "full-width hiragana".
 *
 * <p>NOTE(review): {@code of} is presumably the statically imported
 * {@code UnicodeBlock.of(int)} - confirm against the file's imports.
 *
 * @param codePoint the character to test (must be given as a code point).
 * @return true if the character counts as full-width hiragana, false otherwise.
 */
public static boolean isFullHiragana(int codePoint) {
    // based on Unicode 3.2
    if (of(codePoint) == UnicodeBlock.HIRAGANA) { // U+3040 - U+309F
        return true;
    }
    switch (codePoint) {
        // borrowed from the KATAKANA block (U+30A0 - U+30FF);
        // U+30FD and U+30FE are katakana iteration marks and are not accepted
        case 0x30A0: // '゠' (not in Win31J)
        case 0x30FB: // '・'
        case 0x30FC: // 'ー'
        case 0x30FF: // 'ヿ' (not in Win31J)
        // shared punctuation
        case 0x3001: // '、'
        case 0x3002: // '。'
        case 0x300C: // '「'
        case 0x300D: // '」'
        case 0x300E: // '『'
        case 0x300F: // '』'
            return true;
        default:
            return false;
    }
}
项目:sqlapp    文件:CharTypeValidator.java   
/**
 * Character class check: "full-width katakana".
 *
 * <p>NOTE(review): {@code of} is presumably the statically imported
 * {@code UnicodeBlock.of(int)} - confirm against the file's imports.
 *
 * @param codePoint the character to test (must be given as a code point).
 * @return true if the character counts as full-width katakana, false otherwise.
 */
public static boolean isFullKatakana(int codePoint) {
    // based on Unicode 3.2
    final UnicodeBlock block = of(codePoint);
    if (block == UnicodeBlock.KATAKANA // U+30A0 - U+30FF
            || block == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) { // U+31F0 - U+31FF
        return true;
    }
    switch (codePoint) {
        // borrowed from the HIRAGANA block (U+3040 - U+309F);
        // U+3040, U+3097 and U+3098 are reserved, U+309D and U+309E are
        // hiragana iteration marks and are not accepted
        case 0x3099: // combining mark (not in Win31J)
        case 0x309A: // combining mark (not in Win31J)
        case 0x309B: // '゛'
        case 0x309C: // '゜'
        case 0x309F: // 'ゟ' (not in Win31J)
        // shared punctuation
        case 0x3001: // '、'
        case 0x3002: // '。'
        case 0x300C: // '「'
        case 0x300D: // '」'
        case 0x300E: // '『'
        case 0x300F: // '』'
            return true;
        default:
            return false;
    }
}
项目:common_utils    文件:UnicodeHelper.java   
/**
 * Converts a string into an ASCII-only form using backslash-u escapes.
 *
 * <ul>
 * <li>Basic Latin characters (English letters, digits and so on) are copied
 * unchanged.</li>
 * <li>Full-width ASCII variants (U+FF01 to U+FF5E) are folded to their ASCII
 * counterparts by subtracting 0xFEE0. Other members of the Halfwidth and
 * Fullwidth Forms block have no ASCII counterpart and are escaped.</li>
 * <li>Every other character (Chinese characters, for instance) is written as a
 * backslash, the letter u and exactly four lower-case hex digits.</li>
 * </ul>
 *
 * <p>The escape is built from the unsigned char value, so characters at or
 * above U+8000 are not prefixed with a sign-extended "ffff".
 *
 * @param inStr the text to convert
 * @return the converted text
 * @author fanhui
 * 2007-3-15
 */
public static String utf8ToUnicode(String inStr) {
    StringBuilder sb = new StringBuilder(inStr.length());
    for (int i = 0; i < inStr.length(); i++) {
        char c = inStr.charAt(i);
        if (UnicodeBlock.of(c) == UnicodeBlock.BASIC_LATIN) {
            // English letters, digits, etc.
            sb.append(c);
        } else if (c >= 0xFF01 && c <= 0xFF5E) {
            // full-width ASCII variant -> plain ASCII
            sb.append((char) (c - 0xFEE0));
        } else {
            // Chinese characters and everything else
            String hex = Integer.toHexString(c);
            sb.append("\\u");
            // pad to the four digits a unicode escape requires
            for (int pad = hex.length(); pad < 4; pad++) {
                sb.append('0');
            }
            sb.append(hex);
        }
    }
    return sb.toString();
}
项目:jpexs-decompiler    文件:TTUnicodeRange.java   
/**
 * Finds the range whose block contains the given character.
 *
 * <p>The value is narrowed to {@code int} before the block lookup, then
 * {@code s_list} is searched linearly for an entry with an equal block.
 *
 * @param a_unicode the character value to look up
 * @return the matching range, or {@code null} when the character has no block
 *         or no listed range uses that block
 */
static public TTUnicodeRange of(long a_unicode) {
    initList();

    UnicodeBlock block = UnicodeBlock.of((int) a_unicode);
    if (block != null) {
        for (int index = 0; index < s_list.size(); index++) {
            TTUnicodeRange candidate = s_list.get(index);
            if (candidate.m_block.equals(block)) {
                return candidate;
            }
        }
    }

    return null;
}
项目:springapp    文件:UnicodeBlockTokenizer.java   
/**
 * Splits text into maximal runs of characters that share a Unicode block.
 *
 * <p>The text is scanned by code point, so a supplementary character
 * (encoded as a surrogate pair) is classified by its real block and is never
 * cut in half. Characters without any block form runs of their own, including
 * at the very start of the text.
 *
 * @param text the text to split
 * @return the runs in input order; empty for empty text
 */
@Override
public List<String> tokenize(String text) {
    List<String> tokens = new LinkedList<>();
    int tokenStart = -1;
    UnicodeBlock current = null;
    int pos = 0;
    while (pos < text.length()) {
        int codePoint = text.codePointAt(pos);
        UnicodeBlock block = UnicodeBlock.of(codePoint);
        // tokenStart < 0 covers a first character whose block is null
        if (tokenStart < 0 || block != current) {
            if (tokenStart >= 0) {
                tokens.add(text.substring(tokenStart, pos));
            }
            tokenStart = pos;
            current = block;
        }
        pos += Character.charCount(codePoint);
    }
    if (tokenStart >= 0) {
        tokens.add(text.substring(tokenStart));
    }
    return tokens;
}
项目:springapp    文件:CharTypeValidator.java   
/**
 * Character class check: "full-width hiragana".
 *
 * <p>NOTE(review): {@code of} is presumably the statically imported
 * {@code UnicodeBlock.of(int)} - confirm against the file's imports.
 *
 * @param codePoint the character to test (must be given as a code point).
 * @return true if the character counts as full-width hiragana, false otherwise.
 */
public static boolean isFullHiragana(int codePoint) {
    // based on Unicode 3.2
    if (of(codePoint) == UnicodeBlock.HIRAGANA) { // U+3040 - U+309F
        return true;
    }
    switch (codePoint) {
        // borrowed from the KATAKANA block (U+30A0 - U+30FF);
        // U+30FD and U+30FE are katakana iteration marks and are not accepted
        case 0x30A0: // '゠' (not in Win31J)
        case 0x30FB: // '・'
        case 0x30FC: // 'ー'
        case 0x30FF: // 'ヿ' (not in Win31J)
        // shared punctuation
        case 0x3001: // '、'
        case 0x3002: // '。'
        case 0x300C: // '「'
        case 0x300D: // '」'
        case 0x300E: // '『'
        case 0x300F: // '』'
            return true;
        default:
            return false;
    }
}
项目:springapp    文件:CharTypeValidator.java   
/**
 * Character class check: "full-width katakana".
 *
 * <p>NOTE(review): {@code of} is presumably the statically imported
 * {@code UnicodeBlock.of(int)} - confirm against the file's imports.
 *
 * @param codePoint the character to test (must be given as a code point).
 * @return true if the character counts as full-width katakana, false otherwise.
 */
public static boolean isFullKatakana(int codePoint) {
    // based on Unicode 3.2
    final UnicodeBlock block = of(codePoint);
    if (block == UnicodeBlock.KATAKANA // U+30A0 - U+30FF
            || block == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) { // U+31F0 - U+31FF
        return true;
    }
    switch (codePoint) {
        // borrowed from the HIRAGANA block (U+3040 - U+309F);
        // U+3040, U+3097 and U+3098 are reserved, U+309D and U+309E are
        // hiragana iteration marks and are not accepted
        case 0x3099: // combining mark (not in Win31J)
        case 0x309A: // combining mark (not in Win31J)
        case 0x309B: // '゛'
        case 0x309C: // '゜'
        case 0x309F: // 'ゟ' (not in Win31J)
        // shared punctuation
        case 0x3001: // '、'
        case 0x3002: // '。'
        case 0x300C: // '「'
        case 0x300D: // '」'
        case 0x300E: // '『'
        case 0x300F: // '』'
            return true;
        default:
            return false;
    }
}
项目:breakout    文件:GlyphProducers.java   
/**
 * Picks the {@link GlyphProducer} implementation for a range of characters.
 *
 * <p>The Basic Latin block gets an {@code AsciiGlyphProducer}; any other
 * block, including {@code null}, gets a {@code UnicodeGlyphProducer}.
 *
 * @param font Style of text
 * @param rd Controller of rendering details
 * @param frc Details on how fonts are rendered
 * @param ub Range of characters to support, may be null
 * @return Correct glyph producer for unicode block, not null
 * @throws NullPointerException if font, render delegate, or render context is null
 */
/*@Nonnull*/
public static GlyphProducer get(/*@Nonnull*/ final Font font,
                                /*@Nonnull*/ final RenderDelegate rd,
                                /*@Nonnull*/ final FontRenderContext frc,
                                /*@CheckForNull*/ final UnicodeBlock ub) {

    Check.notNull(font, "Font cannot be null");
    Check.notNull(rd, "Render delegate cannot be null");
    Check.notNull(frc, "Font render context cannot be null");

    final boolean basicLatinOnly = (ub == UnicodeBlock.BASIC_LATIN);
    if (basicLatinOnly) {
        return new AsciiGlyphProducer(font, rd, frc);
    }
    return new UnicodeGlyphProducer(font, rd, frc);
}
项目:silent-contacts-android    文件:NameSplitter.java   
/**
 * Narrows a CJK name down to Japanese or Korean where possible.
 *
 * <p>Scans the name by code point from {@code offset}. The first letter whose
 * block is accepted by {@code isJapanesePhoneticUnicodeBlock} or
 * {@code isKoreanUnicodeBlock} decides the result (Japanese is tested first).
 *
 * @param name   the name to inspect
 * @param offset char index at which scanning starts
 * @return {@code FullNameStyle.JAPANESE}, {@code FullNameStyle.KOREAN}, or
 *         {@code FullNameStyle.CJK} when no letter settles the question
 */
private int guessCJKNameStyle(String name, int offset) {
    final int end = name.length();
    int pos = offset;
    while (pos < end) {
        final int codePoint = Character.codePointAt(name, pos);
        // advance by one or two chars depending on the code point
        pos += Character.charCount(codePoint);

        if (!Character.isLetter(codePoint)) {
            continue;
        }
        final UnicodeBlock block = UnicodeBlock.of(codePoint);
        if (isJapanesePhoneticUnicodeBlock(block)) {
            return FullNameStyle.JAPANESE;
        }
        if (isKoreanUnicodeBlock(block)) {
            return FullNameStyle.KOREAN;
        }
    }

    return FullNameStyle.CJK;
}
项目:FlexibleRichTextView    文件:DefaultTeXFont.java   
/**
 * Registers the font description, symbols and symbol mappings of an alphabet,
 * unless the alphabet is already recorded in {@code loadedAlphabets}.
 *
 * @param alphabet   the Unicode block the resources belong to
 * @param inlanguage stream of the font description
 * @param language   name passed along with {@code inlanguage}
 * @param insymbols  stream of the symbol definitions
 * @param symbols    name passed along with {@code insymbols}
 * @param inmappings stream of the symbol mappings
 * @param mappings   name passed along with {@code inmappings}
 * @throws ResourceParseException propagated from the loaders
 * @throws IOException            propagated from the loaders
 */
public static void addAlphabet(Character.UnicodeBlock alphabet,
        InputStream inlanguage, String language, InputStream insymbols,
        String symbols, InputStream inmappings, String mappings)
        throws ResourceParseException, IOException {
    if (loadedAlphabets.contains(alphabet)) {
        // already registered, nothing to load
        return;
    }

    addTeXFontDescription(inlanguage, language);
    SymbolAtom.addSymbolAtom(insymbols, symbols);
    TeXFormula.addSymbolMappings(inmappings, mappings);
    // recorded only after all three resources loaded without an exception
    loadedAlphabets.add(alphabet);
}
项目:FlexibleRichTextView    文件:DefaultTeXFont.java   
/**
 * Registers an alphabet from its conventional resource locations.
 *
 * <p>The three resources are looked up under {@code fonts/<name>/}: the
 * language file through the asset manager, the symbols and mappings files as
 * class resources of {@code TeXFormula}.
 *
 * @param alphabet the Unicode block the resources belong to
 * @param name     the alphabet name used to build the resource paths
 * @throws ResourceParseException propagated from the loaders
 * @throws IOException            if the language asset cannot be opened
 */
public static void addAlphabet(Character.UnicodeBlock alphabet, String name)
        throws ResourceParseException, IOException {
    final String languagePath = "fonts/" + name + "/language_" + name + ".xml";
    final String symbolsPath = "fonts/" + name + "/symbols_" + name + ".xml";
    final String mappingsPath = "fonts/" + name + "/mappings_" + name + ".xml";

    try {
        DefaultTeXFont.addAlphabet(
                alphabet,
                AjLatexMath.getAssetManager().open(languagePath),
                languagePath,
                TeXFormula.class.getResourceAsStream(symbolsPath),
                symbolsPath,
                TeXFormula.class.getResourceAsStream(mappingsPath),
                mappingsPath);
    } catch (FontAlreadyLoadedException ignored) {
        // an already loaded font is not treated as an error here
    }
}
项目:FlexibleRichTextView    文件:TeXFormula.java   
/**
 * Returns the external font registered for a Unicode block.
 *
 * <p>When nothing is registered yet, a default entry ("SansSerif" / "Serif")
 * is created, stored in {@code externalFontMap} and returned.
 *
 * @param block the Unicode block to look up
 * @return the font information for the block, never {@code null}
 */
public static FontInfos getExternalFont(Character.UnicodeBlock block) {
    FontInfos registered = externalFontMap.get(block);
    if (registered != null) {
        return registered;
    }

    FontInfos defaults = new FontInfos("SansSerif", "Serif");
    externalFontMap.put(block, defaults);
    return defaults;
}
项目:FlexibleRichTextView    文件:TeXFormula.java   
/**
 * Registers, replaces or removes the external font of a Unicode block.
 *
 * <p>Passing {@code null} for both font names removes the registration.
 * Otherwise the entry is stored, and when the block is Basic Latin the
 * predefined TeX formulas are cleared.
 *
 * @param block     the Unicode block to configure
 * @param sansserif the sans-serif font name, may be {@code null}
 * @param serif     the serif font name, may be {@code null}
 */
public static void registerExternalFont(Character.UnicodeBlock block,
        String sansserif, String serif) {
    final boolean unregister = (sansserif == null && serif == null);
    if (unregister) {
        externalFontMap.remove(block);
    } else {
        externalFontMap.put(block, new FontInfos(sansserif, serif));
        if (block.equals(Character.UnicodeBlock.BASIC_LATIN)) {
            predefinedTeXFormulas.clear();
        }
    }
}
项目:g2p    文件:LangUtil.java   
/**
 * Guesses whether a text is Tamil or Kannada by counting characters per block.
 *
 * <p>Characters from neither block are checked for being digits. With no
 * Tamil or Kannada character at all, a text containing digits yields
 * {@code PREVIOUS_LANGUAGE} and any other text {@code LANGUAGE_UNKNOWN}.
 * Otherwise the script with more characters wins (Kannada on a tie) and the
 * result is stored in {@code PREVIOUS_LANGUAGE}.
 *
 * @param text the text to classify
 * @return one of the language constants described above
 */
public static int recognizeLanguage(String text) {
    int kannadaChars = 0;
    int tamilChars = 0;
    int digits = 0;
    for (char c : text.toCharArray()) {
        UnicodeBlock block = UnicodeBlock.of(c);
        if (block == UnicodeBlock.KANNADA) {
            kannadaChars++;
        } else if (block == UnicodeBlock.TAMIL) {
            tamilChars++;
        } else if (Character.isDigit(c)) {
            digits++;
        }
    }

    final boolean noIndicScript = (kannadaChars == 0 && tamilChars == 0);
    if (noIndicScript) {
        if (digits > 0) {
            return PREVIOUS_LANGUAGE;
        }
        return LANGUAGE_UNKNOWN;
    }

    if (tamilChars > kannadaChars) {
        PREVIOUS_LANGUAGE = LANGUAGE_TAMIL;
        return LANGUAGE_TAMIL;
    }
    PREVIOUS_LANGUAGE = LANGUAGE_KANNADA;
    return LANGUAGE_KANNADA;
}
项目:fastcatsearch3    文件:TypeTokenizerTest.java   
/**
 * Prints every character in the range 0..999 that {@code TypeTokenizer}
 * reports as {@code UNCATEGORIZED}, together with its Unicode block.
 * The method makes no assertions.
 */
@Test
public void testAll(){

    for (int code = 0; code < 1000; code++) {
        char ch = (char) code;
        String type = TypeTokenizer.getType(ch);
        // reference comparison, as in the original check
        if (type != TypeTokenizer.UNCATEGORIZED) {
            continue;
        }
        UnicodeBlock block = UnicodeBlock.of(ch);
        System.out.println(code + " : " + ch + " : " + type + " : " + block);
    }
}
项目:jasperreports    文件:JRPdfExporter.java   
/**
 * Decides whether a text element has to be drawn with the glyph renderer.
 *
 * @param text the text element to inspect
 * @return {@code true} if the truncated text contains at least one character
 *         from a block listed in {@code glyphRendererBlocks}; {@code false}
 *         when there is no text, no configured block, or no such character
 */
protected boolean toUseGlyphRenderer(JRPrintText text)
{
    String value = styledTextUtil.getTruncatedText(text);
    if (value == null || glyphRendererBlocks.isEmpty())
    {
        return false;
    }

    for (int index = 0; index < value.length(); index++)
    {
        UnicodeBlock block = UnicodeBlock.of(value.charAt(index));
        if (!glyphRendererBlocks.contains(block))
        {
            continue;
        }

        if (log.isTraceEnabled())
        {
            log.trace("found character in block " + block + ", using the glyph renderer");
        }
        return true;
    }

    return false;
}
项目:jasperreports    文件:SimpleTextLineWrapper.java   
/**
 * Checks whether a run of characters may need complex text layout.
 *
 * <p>Only characters between {@code COMPEX_LAYOUT_START_CHAR} and
 * {@code COMPEX_LAYOUT_END_CHAR} (inclusive) are examined. Such a character
 * makes the text complex when it has no Unicode block or when its block is
 * not listed in {@code simpleLayoutBlocks}.
 *
 * @param chars the characters to inspect
 * @return {@code true} if complex layout may be required
 */
protected boolean hasComplexLayout(char[] chars)
{
    UnicodeBlock lastCheckedBlock = null;
    for (char ch : chars)
    {
        if (ch < COMPEX_LAYOUT_START_CHAR || ch > COMPEX_LAYOUT_END_CHAR)
        {
            // outside the range of interest
            continue;
        }

        //FIXME use icu4j or CharPredicateCache
        UnicodeBlock chBlock = Character.UnicodeBlock.of(ch);
        if (chBlock == null)
        {
            // being conservative
            return true;
        }

        // a block equal to the previous one was already found to be simple,
        // so the set lookup can be skipped
        if (chBlock == lastCheckedBlock)
        {
            continue;
        }
        lastCheckedBlock = chBlock;

        if (!simpleLayoutBlocks.contains(chBlock))
        {
            return true;
        }
    }
    return false;
}
项目:libphonenumber-android    文件:PhoneNumberMatcher.java   
/**
 * Tells whether a character is a Latin-script letter. Combining marks also
 * count, on the assumption that they were attached to a preceding Latin
 * character.
 *
 * @param letter the character to classify
 * @return true for letters and non-spacing marks located in one of the Latin
 *         blocks or in the Combining Diacritical Marks block
 */
// @VisibleForTesting
static boolean isLatinLetter(char letter) {
  // Combining marks are a subset of non-spacing-mark.
  boolean letterOrMark =
      Character.isLetter(letter) || Character.getType(letter) == Character.NON_SPACING_MARK;
  if (!letterOrMark) {
    return false;
  }

  UnicodeBlock block = UnicodeBlock.of(letter);
  if (block.equals(UnicodeBlock.BASIC_LATIN)
      || block.equals(UnicodeBlock.LATIN_1_SUPPLEMENT)
      || block.equals(UnicodeBlock.LATIN_EXTENDED_A)) {
    return true;
  }
  if (block.equals(UnicodeBlock.LATIN_EXTENDED_ADDITIONAL)
      || block.equals(UnicodeBlock.LATIN_EXTENDED_B)) {
    return true;
  }
  return block.equals(UnicodeBlock.COMBINING_DIACRITICAL_MARKS);
}
项目:qpid-proton-j    文件:StringTypeTest.java   
/**
 * Loop over all the code points in the given {@link UnicodeBlock}s and return a
 * {@link Set <String>} containing every one of them as a {@link String}.
 *
 * <p>Supplementary code points are encoded as a surrogate pair by
 * {@link Character#toChars(int)}, so each string holds one or two chars.
 *
 * @param blocks the {@link UnicodeBlock}s to loop over
 * @return a {@link Set <String>} containing all the possible values as
 * {@link String} values
 */
private static Set<String> getAllStringsFromUnicodeBlocks(final UnicodeBlock... blocks)
{
    final Set<UnicodeBlock> blockSet = new HashSet<UnicodeBlock>(Arrays.asList(blocks));
    final Set<String> strings = new HashSet<String>();
    for (int codePoint = Character.MIN_CODE_POINT; codePoint <= Character.MAX_CODE_POINT; codePoint++)
    {
        if (blockSet.contains(UnicodeBlock.of(codePoint)))
        {
            // toChars yields one char for the BMP and a surrogate pair above it
            strings.add(new String(Character.toChars(codePoint)));
        }
    }
    return strings;
}
项目:qpid-proton-j    文件:StringTypeTest.java   
/**
 * Builds the strings used as test input: every code point of a fixed
 * selection of Unicode blocks, followed by two-character combinations of
 * Halfwidth/Fullwidth forms with Box Drawing characters.
 *
 * @return the generated strings
 */
private static List<String> generateTestData()
{
    // a plain list instead of an anonymous subclass with an initializer block
    final List<String> testData = new LinkedList<String>();

    // non-surrogate pair blocks
    testData.addAll(getAllStringsFromUnicodeBlocks(UnicodeBlock.BASIC_LATIN,
                                                  UnicodeBlock.LATIN_1_SUPPLEMENT,
                                                  UnicodeBlock.GREEK,
                                                  UnicodeBlock.LETTERLIKE_SYMBOLS));
    // blocks with surrogate pairs
    //TODO: restore others when Java 7 is baseline
    testData.addAll(getAllStringsFromUnicodeBlocks(UnicodeBlock.LINEAR_B_SYLLABARY,
                                                  /*UnicodeBlock.MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS,*/
                                                  UnicodeBlock.MUSICAL_SYMBOLS,
                                                  /*UnicodeBlock.EMOTICONS,*/
                                                  /*UnicodeBlock.PLAYING_CARDS,*/
                                                  UnicodeBlock.BOX_DRAWING,
                                                  UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
                                                  UnicodeBlock.PRIVATE_USE_AREA,
                                                  UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A,
                                                  UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B));
    // some additional combinations of characters that could cause problems to the encoder
    String[] boxDrawing = getAllStringsFromUnicodeBlocks(UnicodeBlock.BOX_DRAWING).toArray(new String[0]);
    String[] halfFullWidthForms = getAllStringsFromUnicodeBlocks(UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS).toArray(new String[0]);
    for (int i = 0; i < halfFullWidthForms.length; i++)
    {
        // cycle through the box drawing characters
        testData.add(halfFullWidthForms[i] + boxDrawing[i % boxDrawing.length]);
    }
    return testData;
}
项目:KOMORAN    文件:KoreanUnitParser.java   
/**
 * Decomposes every Hangul syllable of a string into its units.
 *
 * <p>A character from the Hangul Syllables block is split arithmetically
 * (offset from U+AC00, 21 medials x 28 finals per initial) into an entry from
 * {@code ChoSung}, an entry from {@code JungSung} and, when the final index is
 * non-zero, an entry from {@code JongSung}. Other characters are copied as is.
 *
 * @param str the text to decompose
 * @return the decomposed text
 */
@Override
public String parse(String str) {

    StringBuilder decomposed = new StringBuilder();

    for (char ch : str.toCharArray()) {
        if (Character.UnicodeBlock.of(ch) != UnicodeBlock.HANGUL_SYLLABLES) {
            decomposed.append(ch);
            continue;
        }

        int syllableIndex = ch - 0xAC00;
        int cho = syllableIndex / (21 * 28);
        int withinInitial = syllableIndex % (21 * 28);
        int jung = withinInitial / 28;
        int jong = withinInitial % 28;

        decomposed.append(ChoSung[cho]);
        decomposed.append(JungSung[jung]);
        // index 0 means the syllable has no final
        if (jong != 0) {
            decomposed.append(JongSung[jong]);
        }
    }
    return decomposed.toString();
}
项目:KOMORAN    文件:CorpusBuilder.java   
/**
 * Adds entries to the irregular dictionary.
 *
 * <p>Nothing happens unless {@code isIrregular} accepts the pair. Otherwise
 * problem and answer are converted to jaso (letter) units, irregular rules
 * are extracted from them, and every rule that survives the filters below is
 * appended to {@code irrDic} in combined form.
 *
 * @param paPair the problem / answer pair to process
 */
private void appendIrregularDictionary(ProblemAnswerPair paPair) {
    if (!this.isIrregular(paPair.getProblem(), paPair.getAnswerList())) {
        return;
    }

    // convert to jaso units and extract the irregular patterns
    List<Pair<String, String>> irrRuleList = irrParser.parse(
            this.convertJaso(paPair.getProblem()),
            this.convertJaso(paPair.getAnswerList()));

    for (Pair<String, String> rule : irrRuleList) {
        String analyzed = rule.getSecond();
        // skip: caused by errors in the training set (Sejong corpus)
        if (analyzed.trim().length() == 0) {
            continue;
        }

        String surface = rule.getFirst();
        // skip rules listed in the exclusion set (key: surface, tab, answer up to the last '/')
        String exclusionKey = surface + "\t" + analyzed.substring(0, analyzed.lastIndexOf("/"));
        if (this.irrExclusiveSet.contains(exclusionKey)) {
            continue;
        }

        // skip when the combined surface still contains a bare compatibility jamo
        String combinedSurface = this.unitParser.combine(surface);
        boolean containsBareJamo = false;
        for (int pos = 0; pos < combinedSurface.length() && !containsBareJamo; pos++) {
            containsBareJamo = StringUtil.getUnicodeBlock(combinedSurface.charAt(pos))
                    == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
        }
        if (containsBareJamo) {
            continue;
        }

        // skip cases such as 놓으 -> 놓+으시
        // and 않으 -> 않+으시
        if (surface.endsWith("ㅇㅡ") && analyzed.endsWith("ㅇㅡㅅㅣ/EP")) {
            continue;
        }

        irrDic.append(this.unitParser.combine(surface), this.unitParser.combine(analyzed));
    }
}
项目:KOMORAN    文件:CorpusBuilder.java   
/**
     * Adds morpheme / part-of-speech pairs to the word dictionary.
     *
     * <p>Two kinds of pairs are left out: a morpheme that trims to a single
     * Hangul compatibility jamo whose tag contains "NN", and any pair tagged
     * exactly "SH", "SN" or "SL".
     *
     * @param answerList the morpheme / tag pairs to add
     */
    private void appendWordDictionary(List<Pair<String, String>> answerList) {
        for (Pair<String, String> entry : answerList) {
            String morpheme = entry.getFirst();
            String tag = entry.getSecond();

            String trimmed = morpheme.trim();
            boolean loneJamoNoun = trimmed.length() == 1
                    && StringUtil.getUnicodeBlock(trimmed.charAt(0)) == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
                    && tag.contains("NN");
            if (loneJamoNoun) {
                continue;
            }

            boolean excludedTag = tag.equals("SH") || tag.equals("SN") || tag.equals("SL");
            if (excludedTag) {
                continue;
            }

            // This depends on the analyzer, so the rule parser would have to contain the same handling.
            // But would doing this actually make it any faster?
//          if(tag.equals("SF")    // period, question mark, exclamation mark . ? !
//                  || tag.equals("SP")    // comma, middle dot, colon, slash , / ; :
//                  || tag.equals("SS")    // quotes, brackets, dashes " ' ` - < > { } [ ] ( )
//                  || tag.equals("SO")    // linking marks (tilde, hidden, omitted) ~
//                  ){  // ellipsis ...
//              continue;
//          }

            wordDic.append(morpheme, tag);
        }
    }
项目:analyzers-ja    文件:ProlongedSoundMarkCharFilter.java   
/**
 * Replaces hyphen-like characters that directly follow a kana character.
 *
 * <p>The characters named by the {@code U002D} ... {@code U30FC} constants
 * are candidates. A candidate is replaced by {@code replacement} when the
 * preceding input character belongs to the Hiragana, Katakana or Katakana
 * Phonetic Extensions block; in every other case it is copied unchanged.
 *
 * @param input the text to filter
 * @return the filtered text
 */
@Override
protected CharSequence processInput(final CharSequence input) {
    final int length = input.length();
    final StringBuilder output = new StringBuilder(length);
    char previous = 0;
    for (int index = 0; index < length; index++) {
        final char current = input.charAt(index);
        switch (current) {
        case U002D:
        case UFF0D:
        case U2010:
        case U2011:
        case U2012:
        case U2013:
        case U2014:
        case U2015:
        case U207B:
        case U208B:
        case U30FC: {
            // no lookup when there is no previous character (or it is NUL)
            final UnicodeBlock previousBlock = previous == 0 ? null : UnicodeBlock.of(previous);
            final boolean afterKana = previousBlock == UnicodeBlock.HIRAGANA
                    || previousBlock == UnicodeBlock.KATAKANA
                    || previousBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS;
            if (afterKana) {
                output.append(replacement);
            } else {
                output.append(current);
            }
            break;
        }
        default:
            output.append(current);
            break;
        }
        // always the original input character, not the replacement
        previous = current;
    }
    return output;
}
项目:JotaTextEditor    文件:ArrowKeyMovementMethod.java   
/**
 * Moves backwards from {@code start} to the beginning of the surrounding word.
 *
 * <p>When the character at {@code start} is Basic Latin, a word consists of
 * apostrophes, letters (upper, lower, title case, modifier) and decimal
 * digits. Otherwise a word is a run of characters from the same Unicode block
 * as the character at {@code start}.
 *
 * @param text  the text to scan
 * @param start the index to start from
 * @return the index of the first character of the word, or {@code start}
 *         itself when it is not smaller than the text length
 */
private static int findWordStart(CharSequence text, int start) {
    if (start >= text.length()) {
        return start;
    }

    final UnicodeBlock anchorBlock = UnicodeBlock.of(text.charAt(start));
    final boolean latinAnchor = (anchorBlock == UnicodeBlock.BASIC_LATIN);

    int pos = start;
    while (pos > 0) {
        final char before = text.charAt(pos - 1);
        if (latinAnchor) {
            final int type = Character.getType(before);
            final boolean wordChar = before == '\''
                    || type == Character.UPPERCASE_LETTER
                    || type == Character.LOWERCASE_LETTER
                    || type == Character.TITLECASE_LETTER
                    || type == Character.MODIFIER_LETTER
                    || type == Character.DECIMAL_DIGIT_NUMBER;
            if (!wordChar) {
                break;
            }
        } else if (UnicodeBlock.of(before) != anchorBlock) {
            break;
        }
        pos--;
    }

    return pos;
}
项目:JotaTextEditor    文件:ArrowKeyMovementMethod.java   
/**
 * Moves forwards from {@code end} to the end of the surrounding word.
 *
 * <p>When the character at {@code end} is Basic Latin, a word consists of
 * apostrophes, letters (upper, lower, title case, modifier) and decimal
 * digits. Otherwise a word is a run of characters from the same Unicode block
 * as the character at {@code end}.
 *
 * @param text the text to scan
 * @param end  the index to start from
 * @return the index just past the last character of the word, or {@code end}
 *         itself when it is not smaller than the text length
 */
private static int findWordEnd(CharSequence text, int end) {
    final int length = text.length();
    if (end >= length) {
        return end;
    }

    final UnicodeBlock anchorBlock = UnicodeBlock.of(text.charAt(end));
    final boolean latinAnchor = (anchorBlock == UnicodeBlock.BASIC_LATIN);

    int pos = end;
    while (pos < length) {
        final char current = text.charAt(pos);
        if (latinAnchor) {
            final int type = Character.getType(current);
            final boolean wordChar = current == '\''
                    || type == Character.UPPERCASE_LETTER
                    || type == Character.LOWERCASE_LETTER
                    || type == Character.TITLECASE_LETTER
                    || type == Character.MODIFIER_LETTER
                    || type == Character.DECIMAL_DIGIT_NUMBER;
            if (!wordChar) {
                break;
            }
        } else if (UnicodeBlock.of(current) != anchorBlock) {
            break;
        }
        pos++;
    }

    return pos;
}
项目:FanFictionReader    文件:CategoryMenuLoaders.java   
/**
 * Fills {@code list} with the category menu items found in the document.
 *
 * <p>The parsed items are cached in {@code mCache}. The cache is only
 * published once the whole document has been parsed, so a page that is
 * rejected half-way cannot leave a truncated cache behind for later calls.
 *
 * @param document the page to parse when nothing is cached yet
 * @param list     the list that receives the items; {@code filter} is applied
 *                 to it afterwards
 * @return {@code false} if the page has no fandom entries or an entry has no
 *         child element; {@code true} otherwise
 */
@Override
protected boolean load(Document document, List<CategoryMenuItem> list) {

    if (mCache == null) {
        Elements fandoms = document.select("div#main > ol.fandom ul li");
        if (fandoms.isEmpty()) { return false; }
        // parse into a local list first; see the method comment
        ArrayList<CategoryMenuItem> parsed = new ArrayList<>(fandoms.size());

        for (Element fandom : fandoms) {
            Elements children = fandom.children();
            if (children.size() == 0) { return false; }

            // Remove non Latin characters from the beginning in order
            // to improve section indexing
            String title = children.get(0).ownText();
            if (!title.isEmpty()
                    && !CONSERVE_CHARACTERS.contains(Character.UnicodeBlock.of(title.charAt(0)))) {
                int separator = title.lastIndexOf("| ");
                if (separator >= 0) {
                    title = title.substring(separator + 2);
                }
                // the title may be empty after stripping the prefix
                if (!title.isEmpty()) {
                    title = Character.toUpperCase(title.charAt(0)) + title.substring(1);
                }
            }

            // keep only the digits of the view counter before parsing it
            String views = fandom.ownText();
            views = views.replaceAll("[\\D]", "");
            final int viewsAsInt = Integer.parseInt(views);
            views = mFormatter.format(viewsAsInt);
            views = String.format(mFormatString, views);

            Uri url = Uri.parse(children.get(0).absUrl("href"));

            CategoryMenuItem item = new CategoryMenuItem(title, views, url);
            parsed.add(item);
        }

        mCache = parsed;
    }

    list.addAll(mCache);
    filter(list);

    return true;
}
项目:donkirkby    文件:CharacterClassifier.java   
/**
 * Tells whether a character is Japanese kana.
 *
 * @param c the character to classify
 * @return {@code true} if {@code c} lies in the Katakana or Hiragana block
 */
public boolean isJapanese(char c) {
    // Direct comparison: no need to build a set on every call.
    UnicodeBlock block = UnicodeBlock.of(c);
    return block == UnicodeBlock.KATAKANA || block == UnicodeBlock.HIRAGANA;
}
项目:inutils4j    文件:MyStringUtils.java   
/**
 * Checks whether a string contains at least one character whose Unicode
 * block is listed in {@code JAPANESE_BLOCKS}.
 *
 * @param str the string to inspect
 * @return {@code true} as soon as such a character is found, {@code false} otherwise
 */
public static boolean hasJapaneseCharacter(String str) {
  final char[] chars = str.toCharArray();
  for (int i = 0; i < chars.length; i++) {
    UnicodeBlock block = UnicodeBlock.of(chars[i]);
    if (JAPANESE_BLOCKS.contains(block)) {
      return true;
    }
  }
  return false;
}