/**
 * Loads the font description for {@code language} unless any block of the
 * alphabet has already been registered in {@code loadedAlphabets}.
 *
 * @param base     base object forwarded to {@code addTeXFontDescription}
 * @param alphabet Unicode blocks that make up the alphabet
 * @param language asset name of the language font description
 * @throws ResourceParseException if the description cannot be parsed
 * @throws IOException            if the asset cannot be opened
 */
private static void addAlphabet(Object base, Character.UnicodeBlock[] alphabet, String language) throws ResourceParseException, IOException {
    // Already-loaded check: a single known block means the whole alphabet was loaded.
    boolean alreadyLoaded = false;
    for (int i = 0; !alreadyLoaded && i < alphabet.length; i++) {
        alreadyLoaded = loadedAlphabets.contains(alphabet[i]);
    }
    if (!alreadyLoaded) {
        TeXParser.isLoading = true;
        try {
            addTeXFontDescription(base, AjLatexMath.getAssetManager().open(language), language);
            for (int i = 0; i < alphabet.length; i++) {
                loadedAlphabets.add(alphabet[i]);
            }
        } finally {
            // Reset the flag even when loading throws; the original left it stuck on true.
            TeXParser.isLoading = false;
        }
    }
}
/**
 * Removes diacritics from the specified string.
 *
 * @param s the input string
 * @return a copy of the specified string (NFD-normalized) with all combining
 *         diacritical marks removed
 */
public static final String removeDiacritics(String s) {
    String decomposed = Normalizer.normalize(s, Form.NFD);
    StringBuilder out = null;
    for (int i = 0; i < decomposed.length(); ++i) {
        char ch = decomposed.charAt(i);
        UnicodeBlock block = UnicodeBlock.of(ch);
        boolean isMark = UnicodeBlock.COMBINING_DIACRITICAL_MARKS.equals(block)
                || UnicodeBlock.COMBINING_DIACRITICAL_MARKS_SUPPLEMENT.equals(block);
        if (isMark) {
            if (out == null) {
                // Lazily copy the clean prefix; no allocation at all for mark-free input.
                out = new StringBuilder(decomposed.length());
                out.append(decomposed, 0, i);
            }
        } else if (out != null) {
            out.append(ch);
        }
    }
    return out == null ? decomposed : out.toString();
}
/**
 * Builds the set of Unicode blocks for which the glyph renderer should be used,
 * from the comma-separated values of the configured glyph-renderer-block properties.
 */
protected void initGlyphRenderer() {
    glyphRendererBlocks = new HashSet<Character.UnicodeBlock>();
    List<PropertySuffix> props = propertiesUtil.getAllProperties(getCurrentJasperPrint(),
            PdfReportConfiguration.PROPERTY_PREFIX_GLYPH_RENDERER_BLOCKS);
    for (PropertySuffix prop : props) {
        String blocks = prop.getValue();
        if (blocks == null) {
            // Guard: a property declared without a value would NPE on split().
            continue;
        }
        for (String blockToken : blocks.split(",")) {
            UnicodeBlock block = resolveUnicodeBlock(blockToken);
            if (block != null) {
                if (log.isDebugEnabled()) {
                    log.debug("glyph renderer block " + block);
                }
                glyphRendererBlocks.add(block);
            }
        }
    }
}
/**
 * Resolves a Unicode block by name.
 *
 * @param name block name, possibly surrounded by whitespace
 * @return the matching block, or {@code null} if the name is blank or unknown
 */
protected UnicodeBlock resolveUnicodeBlock(String name) {
    // Trim once instead of twice as in the original.
    String trimmed = name.trim();
    if (trimmed.isEmpty()) {
        return null;
    }
    try {
        return UnicodeBlock.forName(trimmed);
    } catch (IllegalArgumentException e) {
        log.warn("Could not resolve \"" + name + "\" to a Unicode block");
        return null;
    }
}
/**
 * Decomposes each Hangul syllable in {@code str} into its jamo, tagging every
 * produced character with its unit type; non-Hangul characters pass through
 * tagged as {@code OTHER}.
 */
public List<Pair<Character, UnitType>> parseWithType(String str) {
    List<Pair<Character, UnitType>> result = new ArrayList<>();
    for (int i = 0, n = str.length(); i < n; i++) {
        char ch = str.charAt(i);
        if (Character.UnicodeBlock.of(ch) == UnicodeBlock.HANGUL_SYLLABLES) {
            // Precomposed syllables start at U+AC00; 21 medials x 28 finals per initial.
            int offset = ch - 0xAC00;
            int cho = offset / (21 * 28);
            int jung = (offset % (21 * 28)) / 28;
            int jong = offset % 28;
            result.add(new Pair<>(ChoSung[cho], UnitType.CHOSUNG));
            result.add(new Pair<>(JungSung[jung], UnitType.JUNGSUNG));
            if (jong != 0) {
                // Index 0 means the syllable has no final consonant.
                result.add(new Pair<>(JongSung[jong], UnitType.JONGSUNG));
            }
        } else {
            result.add(new Pair<>(ch, UnitType.OTHER));
        }
    }
    return result;
}
public static boolean isRussianWord(String token) { if (token.isEmpty()) { return false; } Character lastLetter = null; // find last letter for (int i = token.length() - 1; i >= 0; i--) { char ch = token.charAt(i); if (Character.isLetter(ch)) { lastLetter = ch; break; } } if (lastLetter == null) { return false; } // check is it cyrillic return UnicodeBlock.of(lastLetter).equals(UnicodeBlock.CYRILLIC); }
/**
 * Converts a string to an escaped form: ASCII passes through, fullwidth forms
 * are mapped to their halfwidth counterparts, and everything else becomes a
 * {@code \\uXXXX} escape sequence.
 *
 * @param inStr the input string
 * @return the converted string
 */
public static String utf8ToUnicode(String inStr) {
    StringBuilder sb = new StringBuilder(inStr.length());
    for (int i = 0; i < inStr.length(); i++) {
        char c = inStr.charAt(i);
        UnicodeBlock ub = UnicodeBlock.of(c);
        if (ub == UnicodeBlock.BASIC_LATIN) {
            // ASCII letters, digits, punctuation pass through unchanged.
            sb.append(c);
        } else if (ub == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
            // Fullwidth -> halfwidth: the two ranges differ by a fixed offset of 65248.
            sb.append((char) (c - 65248));
        } else {
            // Zero-padded 4-digit lowercase escape. The original's (short) cast
            // sign-extended characters at or above U+8000 into 8 hex digits and then
            // patched the output with replaceAll("ffff", ""), which corrupted any
            // legitimate "ffff" sequence; it also emitted un-padded 3-digit escapes
            // for code points below U+1000.
            sb.append(String.format("\\u%04x", (int) c));
        }
    }
    return sb.toString();
}
public boolean isChinese(char c) { Set<UnicodeBlock> chineseUnicodeBlocks = new HashSet<UnicodeBlock>(); chineseUnicodeBlocks.add(UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS); // add(UnicodeBlock.CJK_COMPATIBILITY); // add(UnicodeBlock.CJK_COMPATIBILITY_FORMS); // add(UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS); // add(UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT); // add(UnicodeBlock.CJK_RADICALS_SUPPLEMENT); // add(UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION); // add(UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS); // add(UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A); // add(UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B); // add(UnicodeBlock.KANGXI_RADICALS); // add(UnicodeBlock.IDEOGRAPHIC_DESCRIPTION_CHARACTERS); UnicodeBlock block = UnicodeBlock.of(c); return chineseUnicodeBlocks.contains(block); }
public static String normalizeCharacter(int codePoint) { if (!Character.isValidCodePoint(codePoint)) { return ""; } // TODO: long search? maybe replace with own implementation UnicodeBlock block = Character.UnicodeBlock.of(codePoint); if (block == Character.UnicodeBlock.MATHEMATICAL_ALPHANUMERIC_SYMBOLS) { return normalizeMath(codePoint); } if (block == Character.UnicodeBlock.LETTERLIKE_SYMBOLS) { return normalizeLetterLike(codePoint); } return codePointToString(codePoint); }
/**
 * Splits the text into maximal runs of characters that share the same
 * Unicode block.
 */
@Override
public List<String> tokenize(String text) {
    List<String> tokens = new LinkedList<>();
    int runStart = -1;
    UnicodeBlock runBlock = null;
    for (int i = 0, n = text.length(); i < n; i++) {
        UnicodeBlock block = UnicodeBlock.of(text.charAt(i));
        if (block != runBlock) {
            // Block changed: close the current run (if any) and start a new one.
            if (runStart >= 0) {
                tokens.add(text.substring(runStart, i));
            }
            runStart = i;
            runBlock = block;
        }
    }
    if (runStart >= 0) {
        // Flush the trailing run.
        tokens.add(text.substring(runStart));
    }
    return tokens;
}
/** * 文字種判別「全角ひらがな」。 * * @param codePoint 対象文字 (コードポイントで指定すること)。 * @return 対象文字が「全角ひらがな」であれば真(true)、さもなくば、偽(false)。 */ public static boolean isFullHiragana(int codePoint) { // based on Unicode 3.2 return of(codePoint) == UnicodeBlock.HIRAGANA || // \u3040 - \u309F // import from KATAKANA (\u30A0 - \u30FF) codePoint == '\u30A0' || // '゠' from KATAKANA (not in Win31J) codePoint == '\u30FB' || // '・' from KATAKANA codePoint == '\u30FC' || // 'ー' from KATAKANA // \u30FD 'ヽ' and \u30FE 'ヾ' if iteration mark for KATAKANA codePoint == '\u30FF' || // 'ヿ' from KATAKANA (not in Win31J) codePoint == '\u3001' || // '、' codePoint == '\u3002' || // '。' codePoint == '\u300C' || // '「' codePoint == '\u300D' || // '」' codePoint == '\u300E' || // '『' codePoint == '\u300F'; // '』' }
/** * 文字種判別「全角カタカナ」。 * * @param codePoint 対象文字 (コードポイントで指定すること)。 * @return 対象文字が「全角カタカナ」であれば真(true)、さもなくば、偽(false)。 */ public static boolean isFullKatakana(int codePoint) { // based on Unicode 3.2 return of(codePoint) == UnicodeBlock.KATAKANA || // \u30A0 - \u30FF of(codePoint) == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS || // \u31F0-\u31FF // import from HIRAGANA (\u3040 - \u309F) // \u3040, \u3097, \u3098 is reserved codePoint == '\u3099' || // MARK from HIRAGANA (not in Win31J) codePoint == '\u309A' || // MARK from HIRAGANA (not in Win31J) codePoint == '\u309B' || // '゛' from HIRAGANA codePoint == '\u309C' || // '゜' from HIRAGANA // \u309D 'ゝ' and \u309E 'ゞ' is iteration mark for HIRAGANA codePoint == '\u309F' || // 'ゟ' from HIRAGANA (not in Win31J) codePoint == '\u3001' || // '、' codePoint == '\u3002' || // '。' codePoint == '\u300C' || // '「' codePoint == '\u300D' || // '」' codePoint == '\u300E' || // '『' codePoint == '\u300F'; // '』' }
/** * utf-8 转换成 unicode * * @param inStr * @return * @author fanhui * 2007-3-15 */ public static String utf8ToUnicode(String inStr) { char[] myBuffer = inStr.toCharArray(); StringBuffer sb = new StringBuffer(); for (int i = 0; i < inStr.length(); i++) { UnicodeBlock ub = UnicodeBlock.of(myBuffer[i]); if (ub == UnicodeBlock.BASIC_LATIN) { //英文及数字等 sb.append(myBuffer[i]); } else if (ub == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { //全角半角字符 int j = (int) myBuffer[i] - 65248; sb.append((char) j); } else { //汉字 short s = (short) myBuffer[i]; String hexS = Integer.toHexString(s); String unicode = "\\u" + hexS; sb.append(unicode.toLowerCase()); } } return sb.toString(); }
/**
 * Maps a code point to its {@code TTUnicodeRange}, or {@code null} when the
 * code point has no Unicode block or no registered range matches.
 */
static public TTUnicodeRange of(long a_unicode) {
    initList();
    UnicodeBlock block = UnicodeBlock.of((int) a_unicode);
    if (block == null) {
        return null;
    }
    // Linear scan is fine: the range list is small and built once.
    for (TTUnicodeRange range : s_list) {
        if (range.m_block.equals(block)) {
            return range;
        }
    }
    return null;
}
/**
 * Creates a {@link GlyphProducer} based on a range of characters.
 *
 * @param font Style of text
 * @param rd Controller of rendering details
 * @param frc Details on how fonts are rendered
 * @param ub Range of characters to support
 * @return Correct glyph producer for unicode block, not null
 * @throws NullPointerException if font, render delegate, or render context is null
 */
/*@Nonnull*/
public static GlyphProducer get(/*@Nonnull*/ final Font font, /*@Nonnull*/ final RenderDelegate rd, /*@Nonnull*/ final FontRenderContext frc, /*@CheckForNull*/ final UnicodeBlock ub) {
    Check.notNull(font, "Font cannot be null");
    Check.notNull(rd, "Render delegate cannot be null");
    Check.notNull(frc, "Font render context cannot be null");
    // ASCII-only text gets the cheaper producer; everything else the general one.
    return ub == UnicodeBlock.BASIC_LATIN
            ? new AsciiGlyphProducer(font, rd, frc)
            : new UnicodeGlyphProducer(font, rd, frc);
}
/**
 * Guesses the CJK name style by scanning the letters of {@code name} starting
 * at {@code offset}: the first Japanese phonetic or Korean letter decides;
 * otherwise the generic CJK style is returned.
 */
private int guessCJKNameStyle(String name, int offset) {
    final int length = name.length();
    while (offset < length) {
        final int codePoint = Character.codePointAt(name, offset);
        if (Character.isLetter(codePoint)) {
            final UnicodeBlock block = UnicodeBlock.of(codePoint);
            if (isJapanesePhoneticUnicodeBlock(block)) {
                return FullNameStyle.JAPANESE;
            }
            if (isKoreanUnicodeBlock(block)) {
                return FullNameStyle.KOREAN;
            }
        }
        // Advance by the char count to stay correct for supplementary code points.
        offset += Character.charCount(codePoint);
    }
    return FullNameStyle.CJK;
}
/**
 * Registers the alphabet from the given language, symbol, and mapping streams.
 * Each alphabet is registered at most once; subsequent calls are no-ops.
 */
public static void addAlphabet(Character.UnicodeBlock alphabet, InputStream inlanguage, String language, InputStream insymbols, String symbols, InputStream inmappings, String mappings) throws ResourceParseException, IOException {
    if (loadedAlphabets.contains(alphabet)) {
        return;
    }
    addTeXFontDescription(inlanguage, language);
    SymbolAtom.addSymbolAtom(insymbols, symbols);
    TeXFormula.addSymbolMappings(inmappings, mappings);
    loadedAlphabets.add(alphabet);
}
/**
 * Registers the alphabet bundle shipped under {@code fonts/<name>/} by loading
 * its language, symbol, and mapping XML descriptions.
 *
 * @param alphabet the Unicode block covered by the font bundle
 * @param name     the bundle name used to build the asset paths
 * @throws ResourceParseException if an XML description cannot be parsed
 * @throws IOException            if an asset cannot be opened
 */
public static void addAlphabet(Character.UnicodeBlock alphabet, String name) throws ResourceParseException, IOException {
    String lg = "fonts/" + name + "/language_" + name + ".xml";
    String sym = "fonts/" + name + "/symbols_" + name + ".xml";
    String map = "fonts/" + name + "/mappings_" + name + ".xml";
    try {
        DefaultTeXFont.addAlphabet(alphabet,
                AjLatexMath.getAssetManager().open(lg), lg,
                TeXFormula.class.getResourceAsStream(sym), sym,
                TeXFormula.class.getResourceAsStream(map), map);
    } catch (FontAlreadyLoadedException e) {
        // Intentionally ignored: the alphabet was registered by an earlier call.
    }
}
/**
 * Returns the external font mapping for a Unicode block, lazily registering a
 * default SansSerif/Serif pair the first time a block is requested.
 */
public static FontInfos getExternalFont(Character.UnicodeBlock block) {
    FontInfos infos = externalFontMap.get(block);
    if (infos != null) {
        return infos;
    }
    // First request for this block: install and return the defaults.
    FontInfos defaults = new FontInfos("SansSerif", "Serif");
    externalFontMap.put(block, defaults);
    return defaults;
}
/**
 * Registers (or, when both font names are null, unregisters) the external
 * fonts to use for a Unicode block. Changing the Basic Latin fonts clears the
 * predefined formula cache, since those formulas were laid out with the old fonts.
 */
public static void registerExternalFont(Character.UnicodeBlock block, String sansserif, String serif) {
    if (sansserif == null && serif == null) {
        externalFontMap.remove(block);
        return;
    }
    externalFontMap.put(block, new FontInfos(sansserif, serif));
    if (block.equals(Character.UnicodeBlock.BASIC_LATIN)) {
        predefinedTeXFormulas.clear();
    }
}
/**
 * Recognizes whether the text is Kannada or Tamil by counting characters of
 * each script. Digit-only text keeps the previously recognized language;
 * text with no script characters and no digits is unknown.
 */
public static int recognizeLanguage(String text) {
    int kannada = 0;
    int tamil = 0;
    int digits = 0;
    for (int i = 0; i < text.length(); i++) {
        final char c = text.charAt(i);
        final UnicodeBlock block = UnicodeBlock.of(c);
        if (block == UnicodeBlock.KANNADA) {
            kannada++;
        } else if (block == UnicodeBlock.TAMIL) {
            tamil++;
        } else if (Character.isDigit(c)) {
            digits++;
        }
    }
    if (kannada == 0 && tamil == 0) {
        // Digits alone carry no script information; fall back to the last language.
        return digits > 0 ? PREVIOUS_LANGUAGE : LANGUAGE_UNKNOWN;
    }
    final int detected = tamil > kannada ? LANGUAGE_TAMIL : LANGUAGE_KANNADA;
    PREVIOUS_LANGUAGE = detected;
    return detected;
}
/**
 * Scans the first 1000 code points and prints any that the tokenizer leaves
 * uncategorized, together with their Unicode block (diagnostic, not an assertion).
 */
@Test
public void testAll() {
    for (int i = 0; i < 1000; i++) {
        char ch = (char) i;
        UnicodeBlock block = UnicodeBlock.of(ch);
        String type = TypeTokenizer.getType(ch);
        // Compare by value: the original used ==, which only works while getType
        // happens to return the interned constant itself.
        if (TypeTokenizer.UNCATEGORIZED.equals(type)) {
            System.out.println(i + " : " + ch + " : " + type + " : " + block);
        }
    }
}
/**
 * Decides whether the glyph renderer should be used for the given text:
 * true as soon as any character falls in one of the configured glyph
 * renderer blocks.
 */
protected boolean toUseGlyphRenderer(JRPrintText text) {
    String value = styledTextUtil.getTruncatedText(text);
    if (value == null || glyphRendererBlocks.isEmpty()) {
        return false;
    }
    for (int i = 0; i < value.length(); i++) {
        UnicodeBlock block = UnicodeBlock.of(value.charAt(i));
        if (glyphRendererBlocks.contains(block)) {
            if (log.isTraceEnabled()) {
                log.trace("found character in block " + block + ", using the glyph renderer");
            }
            return true;
        }
    }
    return false;
}
/**
 * Returns true when any character in the given range requires complex text
 * layout, i.e. lies in the complex-layout character range but not in one of
 * the known simple-layout Unicode blocks.
 */
protected boolean hasComplexLayout(char[] chars) {
    // Cache of the last block seen, so runs of characters from the same block
    // skip the hash-set membership test.
    UnicodeBlock prevBlock = null;
    for (int i = 0; i < chars.length; i++) {
        char ch = chars[i];
        if (ch >= COMPEX_LAYOUT_START_CHAR && ch <= COMPEX_LAYOUT_END_CHAR) {
            //FIXME use icu4j or CharPredicateCache
            UnicodeBlock chBlock = Character.UnicodeBlock.of(ch);
            if (chBlock == null) {
                // being conservative: no block known for this char, assume complex
                return true;
            }
            // if the same block as the previous block, avoid going to the hash set
            // this could offer some speed improvement
            if (prevBlock != chBlock) {
                prevBlock = chBlock;
                if (!simpleLayoutBlocks.contains(chBlock)) {
                    return true;
                }
            }
        }
    }
    return false;
}
/** * Helper method to determine if a character is a Latin-script letter or not. For our purposes, * combining marks should also return true since we assume they have been added to a preceding * Latin character. */ // @VisibleForTesting static boolean isLatinLetter(char letter) { // Combining marks are a subset of non-spacing-mark. if (!Character.isLetter(letter) && Character.getType(letter) != Character.NON_SPACING_MARK) { return false; } UnicodeBlock block = UnicodeBlock.of(letter); return block.equals(UnicodeBlock.BASIC_LATIN) || block.equals(UnicodeBlock.LATIN_1_SUPPLEMENT) || block.equals(UnicodeBlock.LATIN_EXTENDED_A) || block.equals(UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) || block.equals(UnicodeBlock.LATIN_EXTENDED_B) || block.equals(UnicodeBlock.COMBINING_DIACRITICAL_MARKS); }
/** * Loop over all the chars in given {@link UnicodeBlock}s and return a * {@link Set <String>} containing all the possible values as their * {@link String} values. * * @param blocks the {@link UnicodeBlock}s to loop over * @return a {@link Set <String>} containing all the possible values as * {@link String} values */ private static Set<String> getAllStringsFromUnicodeBlocks(final UnicodeBlock... blocks) { final Set<UnicodeBlock> blockSet = new HashSet<UnicodeBlock>(Arrays.asList(blocks)); final Set<String> strings = new HashSet<String>(); for (int codePoint = 0; codePoint <= Character.MAX_CODE_POINT; codePoint++) { if (blockSet.contains(UnicodeBlock.of(codePoint))) { final int charCount = Character.charCount(codePoint); final StringBuilder sb = new StringBuilder( charCount); if (charCount == 1) { sb.append(String.valueOf((char) codePoint)); } else if (charCount == 2) { //TODO: use Character.highSurrogate(codePoint) and Character.lowSurrogate(codePoint) when Java 7 is baseline char highSurrogate = (char) ((codePoint >>> 10) + ('\uD800' - (0x010000 >>> 10))); char lowSurrogate = (char) ((codePoint & 0x3ff) + '\uDC00'); sb.append(highSurrogate); sb.append(lowSurrogate); } else { throw new IllegalArgumentException("Character.charCount of " + charCount + " not supported."); } strings.add(sb.toString()); } } return strings; }
private static List<String> generateTestData() { return new LinkedList<String>() { private static final long serialVersionUID = 7331717267070233454L; { // non-surrogate pair blocks addAll(getAllStringsFromUnicodeBlocks(UnicodeBlock.BASIC_LATIN, UnicodeBlock.LATIN_1_SUPPLEMENT, UnicodeBlock.GREEK, UnicodeBlock.LETTERLIKE_SYMBOLS)); // blocks with surrogate pairs //TODO: restore others when Java 7 is baseline addAll(getAllStringsFromUnicodeBlocks(UnicodeBlock.LINEAR_B_SYLLABARY, /*UnicodeBlock.MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS,*/ UnicodeBlock.MUSICAL_SYMBOLS, /*UnicodeBlock.EMOTICONS,*/ /*UnicodeBlock.PLAYING_CARDS,*/ UnicodeBlock.BOX_DRAWING, UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS, UnicodeBlock.PRIVATE_USE_AREA, UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A, UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B)); // some additional combinations of characters that could cause problems to the encoder String[] boxDrawing = getAllStringsFromUnicodeBlocks(UnicodeBlock.BOX_DRAWING).toArray(new String[0]); String[] halfFullWidthForms = getAllStringsFromUnicodeBlocks(UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS).toArray(new String[0]); for (int i = 0; i < halfFullWidthForms.length; i++) { add(halfFullWidthForms[i] + boxDrawing[i % boxDrawing.length]); } } }; }
/**
 * Decomposes every Hangul syllable in the string into its jamo characters;
 * non-Hangul characters are copied through unchanged.
 *
 * @param str the input string
 * @return the decomposed string
 */
@Override
public String parse(String str) {
    // StringBuilder instead of StringBuffer: the buffer is method-local, so
    // synchronization buys nothing.
    StringBuilder result = new StringBuilder(str.length());
    for (int i = 0; i < str.length(); i++) {
        char ch = str.charAt(i);
        if (Character.UnicodeBlock.of(ch) == UnicodeBlock.HANGUL_SYLLABLES) {
            // Precomposed syllables start at U+AC00; 21 medials x 28 finals per initial.
            int tmp = ch - 0xAC00;
            int cho = tmp / (21 * 28);
            tmp = tmp % (21 * 28);
            int jung = tmp / 28;
            int jong = tmp % 28;
            result.append(ChoSung[cho]);
            result.append(JungSung[jung]);
            if (jong != 0) {
                // Index 0 means the syllable has no final consonant.
                result.append(JongSung[jong]);
            }
        } else {
            result.append(ch);
        }
    }
    return result.toString();
}
/**
 * Adds an entry to the irregular-conjugation dictionary when the given
 * problem/answer pair exhibits irregular behavior.
 *
 * @param paPair problem/answer pair from the training corpus
 */
private void appendIrregularDictionary(ProblemAnswerPair paPair) {
    if (this.isIrregular(paPair.getProblem(), paPair.getAnswerList())) {
        // Convert to jaso (letter) units and extract the irregular patterns.
        List<Pair<String, String>> irrRuleList = irrParser.parse(
                this.convertJaso(paPair.getProblem()),
                this.convertJaso(paPair.getAnswerList()));
        for (Pair<String, String> pair : irrRuleList) {
            // Skip entries caused by errors in the training set (based on the Sejong corpus).
            if (pair.getSecond().trim().length() == 0) {
                ;
            } else {
                // Skip when the irregular target is in the jaso-level exclusion set.
                if (this.irrExclusiveSet.contains(pair.getFirst() + "\t" + pair.getSecond().substring(0, pair.getSecond().lastIndexOf("/")))) {
                    continue;
                }
                // Skip when the recombined surface form still contains bare
                // compatibility jamo (i.e. could not be fully recomposed).
                boolean hasJamoProblem = false;
                String tmpProblem = this.unitParser.combine(pair.getFirst());
                for (int i = 0; i < tmpProblem.length(); i++) {
                    if (StringUtil.getUnicodeBlock(tmpProblem.charAt(i)) == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) {
                        hasJamoProblem = true;
                        break;
                    }
                }
                if (hasJamoProblem) continue;
                // Skip cases like 놓으 -> 놓+으시 and 않으 -> 않+으시.
                if (pair.getFirst().endsWith("ㅇㅡ") && pair.getSecond().endsWith("ㅇㅡㅅㅣ/EP")) {
                    continue;
                }
                irrDic.append(this.unitParser.combine(pair.getFirst()),
                        this.unitParser.combine(pair.getSecond()));
                // irrDic.append(pair.getFirst(), pair.getSecond());
            }
        }
    }
}
/**
 * Adds morpheme/POS-tag pairs to the word dictionary.
 *
 * @param answerList list of (morpheme, POS tag) pairs
 */
private void appendWordDictionary(List<Pair<String, String>> answerList) {
    for (Pair<String, String> pair : answerList) {
        // Skip single compatibility-jamo characters tagged as nouns (NN*).
        if (pair.getFirst().trim().length() == 1) {
            if (StringUtil.getUnicodeBlock(pair.getFirst().trim().charAt(0)) == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
                    && pair.getSecond().contains("NN")) {
                continue;
            }
        }
        // Skip Chinese characters (SH), numbers (SN), and foreign words (SL).
        if (pair.getSecond().equals("SH") || pair.getSecond().equals("SN") || pair.getSecond().equals("SL")) {
            continue;
        }
        // Because of the dependency on the analyzer, the rule parser must already
        // contain this content. Would skipping these actually make things faster?
        // if(pair.getSecond().equals("SF") // sentence-final punctuation . ? !
        // || pair.getSecond().equals("SP") // comma, middle dot, colon, slash , / ; :
        // || pair.getSecond().equals("SS") // quotes, brackets, dashes " ' ` - < > { } [ ] ( )
        // || pair.getSecond().equals("SO") // connecting marks (tilde, omission) ~
        // ){ // ellipsis ...
        // continue;
        // }
        wordDic.append(pair.getFirst(), pair.getSecond());
    }
}
/**
 * Normalizes hyphen/dash-like characters: each one is replaced with
 * {@code replacement} only when the immediately preceding character is
 * Japanese kana (hiragana, katakana, or katakana phonetic extensions);
 * otherwise it is kept as-is.
 */
@Override
protected CharSequence processInput(final CharSequence input) {
    final StringBuilder buf = new StringBuilder(input.length());
    char prev = 0; // 0 = no previous character yet (start of input)
    for (int pos = 0; pos < input.length(); pos++) {
        final char c = input.charAt(pos);
        switch (c) {
            // The hyphen/dash-like code points this filter normalizes.
            case U002D:
            case UFF0D:
            case U2010:
            case U2011:
            case U2012:
            case U2013:
            case U2014:
            case U2015:
            case U207B:
            case U208B:
            case U30FC:
                if (prev != 0) {
                    final UnicodeBlock block = UnicodeBlock.of(prev);
                    if (block == UnicodeBlock.HIRAGANA
                            || block == UnicodeBlock.KATAKANA
                            || block == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
                        // Dash follows kana: treat it as a prolonged-sound mark.
                        buf.append(replacement);
                    } else {
                        buf.append(c);
                    }
                } else {
                    // A dash at the very start of the input is left untouched.
                    buf.append(c);
                }
                break;
            default:
                buf.append(c);
                break;
        }
        prev = c;
    }
    return buf;
}
/**
 * Walks backwards from {@code start} to the beginning of the word containing
 * it. For Basic Latin anchors a word is letters, digits, and apostrophes;
 * for any other script the word extends over the anchor's Unicode block.
 */
private static int findWordStart(CharSequence text, int start) {
    if (text.length() <= start) {
        return start;
    }
    final UnicodeBlock anchor = UnicodeBlock.of(text.charAt(start));
    while (start > 0) {
        final char c = text.charAt(start - 1);
        if (anchor == UnicodeBlock.BASIC_LATIN) {
            final int type = Character.getType(c);
            final boolean wordChar = c == '\''
                    || type == Character.UPPERCASE_LETTER
                    || type == Character.LOWERCASE_LETTER
                    || type == Character.TITLECASE_LETTER
                    || type == Character.MODIFIER_LETTER
                    || type == Character.DECIMAL_DIGIT_NUMBER;
            if (!wordChar) {
                break;
            }
        } else if (anchor != UnicodeBlock.of(c)) {
            break;
        }
        start--;
    }
    return start;
}
/**
 * Walks forwards from {@code end} to the end of the word containing it.
 * For Basic Latin anchors a word is letters, digits, and apostrophes;
 * for any other script the word extends over the anchor's Unicode block.
 */
private static int findWordEnd(CharSequence text, int end) {
    final int len = text.length();
    if (len <= end) {
        return end;
    }
    final UnicodeBlock anchor = UnicodeBlock.of(text.charAt(end));
    while (end < len) {
        final char c = text.charAt(end);
        if (anchor == UnicodeBlock.BASIC_LATIN) {
            final int type = Character.getType(c);
            final boolean wordChar = c == '\''
                    || type == Character.UPPERCASE_LETTER
                    || type == Character.LOWERCASE_LETTER
                    || type == Character.TITLECASE_LETTER
                    || type == Character.MODIFIER_LETTER
                    || type == Character.DECIMAL_DIGIT_NUMBER;
            if (!wordChar) {
                break;
            }
        } else if (anchor != UnicodeBlock.of(c)) {
            break;
        }
        end++;
    }
    return end;
}
@Override protected boolean load(Document document, List<CategoryMenuItem> list) { if (mCache == null) { Elements fandoms = document.select("div#main > ol.fandom ul li"); if (fandoms.isEmpty()) { return false; } mCache = new ArrayList<>(fandoms.size()); for (Element fandom : fandoms) { Elements children = fandom.children(); if (children.size() == 0) { return false; } // Remove non Latin characters from the beginning in order // to improve section indexing String title = children.get(0).ownText(); if (!CONSERVE_CHARACTERS.contains(Character.UnicodeBlock.of(title.charAt(0)))) { title = title.contains("| ") ? title.substring(title.lastIndexOf("| ") + 2) : title; title = Character.toUpperCase(title.charAt(0)) + title.substring(1); } String views = fandom.ownText(); views = views.replaceAll("[\\D]", ""); final int viewsAsInt = Integer.parseInt(views); views = mFormatter.format(viewsAsInt); views = String.format(mFormatString, views); Uri url = Uri.parse(children.get(0).absUrl("href")); CategoryMenuItem item = new CategoryMenuItem(title, views, url); mCache.add(item); } } list.addAll(mCache); filter(list); return true; }
/**
 * Tests whether a character is Japanese kana (hiragana or katakana).
 * Kanji are not counted, since the CJK ideograph blocks are shared with Chinese.
 *
 * @param c the character to test
 * @return {@code true} if the character is hiragana or katakana
 */
public boolean isJapanese(char c) {
    // Direct comparisons replace the two-element HashSet that was rebuilt on
    // every call; UnicodeBlock constants are singletons.
    UnicodeBlock block = UnicodeBlock.of(c);
    return block == UnicodeBlock.KATAKANA || block == UnicodeBlock.HIRAGANA;
}
/**
 * Returns true as soon as any character of the string falls in one of the
 * Unicode blocks listed in {@code JAPANESE_BLOCKS}.
 */
public static boolean hasJapaneseCharacter(String str) {
    for (int i = 0; i < str.length(); i++) {
        if (JAPANESE_BLOCKS.contains(UnicodeBlock.of(str.charAt(i)))) {
            return true;
        }
    }
    return false;
}