/** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s * The string the encode (assuming ASCII characters only) * @param e * A character that does not require encoding if found in the * string. */ private static String encode_UTF8(String s, char e) { // TODO: Normalizer requires Java 6! String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = UTF8.encode(n); // URI encode StringBuffer sb = new StringBuffer(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isUnreserved(b) || b == e) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); }
/** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s * The string the encode (assuming ASCII characters only) */ private static String minimalEncode_UTF8(String s) { // TODO: Normalizer requires Java 6! String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = UTF8.encode(n); // URI encode StringBuffer sb = new StringBuffer(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isLegal(b)) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); }
/**
 * Translates native transfer bytes into a Java object.
 * <p>
 * Two formats are handled here before delegating to the superclass:
 * <ul>
 * <li>{@code CF_URL} requested as a {@link URL}: the bytes are decoded to a
 * string and parsed as a URL. The charset is the default text charset,
 * unless the transferable supports {@code javaTextEncodingFlavor}, in which
 * case that flavor's data (read as UTF-8) names the charset.</li>
 * <li>{@code CF_STRING}: the bytes are read as UTF-8, normalized to NFC and
 * re-encoded as UTF-8.</li>
 * </ul>
 *
 * @param bytes        the native data
 * @param flavor       the requested flavor
 * @param format       the native format identifier
 * @param transferable the transferable the data came from; may be {@code null}
 * @return the translated object
 * @throws IOException if decoding, URL parsing or the superclass translation fails
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor, long format,
        Transferable transferable) throws IOException {
    final boolean urlRequested =
            format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        String charset = getDefaultTextCharset();
        final boolean hasEncodingHint = transferable != null
                && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (hasEncodingHint) {
            try {
                final byte[] charsetName =
                        (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
                charset = new String(charsetName, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // Support for the flavor was checked just above; keep the default.
            }
        }
        final String spec = new String(bytes, charset);
        return new URL(spec);
    }

    byte[] payload = bytes;
    if (format == CF_STRING) {
        // Bring the text into composed form (NFC) before the generic translation.
        final String decoded = new String(bytes, "UTF8");
        payload = Normalizer.normalize(decoded, Form.NFC).getBytes("UTF8");
    }
    return super.translateBytes(payload, flavor, format, transferable);
}
/**
 * Removes diacritics from the specified string.
 * <p>
 * The string is decomposed (NFD) and every character belonging to the
 * "Combining Diacritical Marks" or "Combining Diacritical Marks Supplement"
 * Unicode block is dropped. When nothing is dropped, the decomposed string
 * itself is returned.
 *
 * @param s the string to process
 * @return a copy of the specified string with diacritics removed.
 */
public static final String removeDiacritics(String s) {
    final String decomposed = Normalizer.normalize(s, Form.NFD);
    final int total = decomposed.length();
    final StringBuilder kept = new StringBuilder(total);

    for (int pos = 0; pos < total; pos++) {
        final char ch = decomposed.charAt(pos);
        final UnicodeBlock block = UnicodeBlock.of(ch);
        final boolean isMark = UnicodeBlock.COMBINING_DIACRITICAL_MARKS.equals(block)
                || UnicodeBlock.COMBINING_DIACRITICAL_MARKS_SUPPLEMENT.equals(block);
        if (!isMark) {
            kept.append(ch);
        }
    }

    // Nothing was dropped: hand back the decomposed text as is.
    return kept.length() == total ? decomposed : kept.toString();
}
/**
 * Computes an iterated, optionally salted digest of a text.
 * <p>
 * The text is normalized to NFC and encoded as UTF-8. The salt, if present,
 * is fed into the digest before the text. The resulting hash is then
 * re-digested {@code iterations - 1} more times (never fewer than zero),
 * and the final bytes are passed to {@code toHexString}.
 *
 * @param iterations total number of digest rounds requested
 * @param salt       salt bytes, or {@code null} for none
 * @param text       the text to hash
 * @param algorithm  the {@link MessageDigest} algorithm name
 * @return the result of {@code toHexString} on the final hash
 * @throws NoSuchAlgorithmException if the algorithm is not available
 */
private static String getHash(int iterations, byte[] salt, String text, String algorithm)
        throws NoSuchAlgorithmException {
    // The first digest call below already counts as one round.
    final int extraRounds = Math.max(iterations - 1, 0);

    byte[] hash;
    try {
        final MessageDigest md = MessageDigest.getInstance(algorithm);
        final String canonical = Normalizer.normalize(text, Form.NFC);
        if (salt != null) {
            md.update(salt);
        }
        hash = md.digest(canonical.getBytes("UTF-8"));
        for (int remaining = extraRounds; remaining > 0; remaining--) {
            hash = md.digest(hash);
        }
    } catch (UnsupportedEncodingException e) {
        throw new MCRException("Could not get " + algorithm + " checksum", e);
    }
    return toHexString(hash);
}
String toString(List<List<TextPosition>> words) { StringBuilder stringBuilder = new StringBuilder(); boolean first = true; for (List<TextPosition> word : words) { if (first) first = false; else stringBuilder.append(' '); for (TextPosition textPosition : word) { stringBuilder.append(textPosition.getUnicode()); } } // cf. http://stackoverflow.com/a/7171932/1729265 return Normalizer.normalize(stringBuilder, Form.NFKC); }
/**
 * Computes an iterated, optionally salted digest of a string.
 * <p>
 * The string is normalized to NFC and encoded as UTF-8. The salt, if
 * present, is fed into the digest before the text. The resulting hash is
 * then re-digested {@code iterations - 1} more times (never fewer than
 * zero), and the final bytes are passed to {@code Hash.toHexString}.
 *
 * @param iterations total number of digest rounds requested
 * @param salt       salt bytes, or {@code null} for none
 * @param str        the string to hash
 * @param algorithm  the {@link MessageDigest} algorithm name
 * @return the result of {@code Hash.toHexString} on the final hash
 * @throws NoSuchAlgorithmException     if the algorithm is not available
 * @throws UnsupportedEncodingException if UTF-8 is not supported
 */
private static String getHash(int iterations, byte[] salt, String str, String algorithm)
        throws NoSuchAlgorithmException, UnsupportedEncodingException {
    // The first digest call below already counts as one round.
    final int extraRounds = Math.max(iterations - 1, 0);

    final MessageDigest md = MessageDigest.getInstance(algorithm);
    final String canonical = Normalizer.normalize(str, Form.NFC);
    if (salt != null) {
        md.update(salt);
    }

    byte[] hash = md.digest(canonical.getBytes("UTF-8"));
    for (int remaining = extraRounds; remaining > 0; remaining--) {
        hash = md.digest(hash);
    }
    return Hash.toHexString(hash);
}
/**
 * Hashes data with a salt using 1001 digest rounds.
 * <p>
 * The data is normalized to NFC and encoded as UTF-8. The first round
 * digests {@code salt || data}; each of the following 1000 rounds digests
 * {@code salt || previousDigest}. The digest instance comes from
 * {@code createDigest()}.
 *
 * @param salt the salt, encoded as UTF-8
 * @param data the data to hash
 * @return the final digest, Base64 encoded
 */
public static String hashIt ( final String salt, String data )
{
    final byte[] payload = Normalizer.normalize ( data, Form.NFC ).getBytes ( StandardCharsets.UTF_8 );
    final byte[] saltBytes = salt.getBytes ( StandardCharsets.UTF_8 );
    final int saltLength = saltBytes.length;

    // round 0: salt followed by the payload
    final byte[] seed = new byte[saltLength + payload.length];
    System.arraycopy ( saltBytes, 0, seed, 0, saltLength );
    System.arraycopy ( payload, 0, seed, saltLength, payload.length );

    final MessageDigest md = createDigest ();
    byte[] hash = md.digest ( seed );

    // rounds 1..1000: salt followed by the previous hash, reusing one buffer
    final byte[] buffer = new byte[saltLength + hash.length];
    int remaining = 1000;
    while ( remaining-- > 0 )
    {
        System.arraycopy ( saltBytes, 0, buffer, 0, saltLength );
        System.arraycopy ( hash, 0, buffer, saltLength, hash.length );
        hash = md.digest ( buffer );
    }

    return Base64.getEncoder ().encodeToString ( hash );
}
/**
 * Expands this node into an {@code ACCENT} fragment.
 * <p>
 * The first character of the inner text is looked up in {@code charMap} to
 * obtain an accent string; the second character is the base. Base and
 * accent are concatenated and composed with NFC. When the lookup yields
 * nothing, U+FFFD is used as the accent and a {@code char.accent.unknown}
 * message is logged.
 *
 * @return the wrapped, composed character
 */
@Override
public Fragment expanded() {
    final char[] chars = super.innerText().toCharArray();
    final String key = innerText().substring(0, 1);

    String accent = charMap.get(key);
    if (accent == null) {
        accent = "\uFFFD";
        final Message unknown = Message.builder("char.accent.unknown")
                .fromNode(this)
                .addNote("Character " + text + " cannot be expanded.")
                .build();
        Log.getInstance().add(unknown);
    }

    // base character first, accent second, then compose
    final String composed = Normalizer.normalize(String.valueOf(chars[1]) + accent, Form.NFC);
    return wrap("ACCENT", composed);
}
/** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s The string the encode (assuming ASCII characters only) * @param e A character that does not require encoding if found in the string. */ private static String encodeUTF8(String s, char e) { String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = StandardCharsets.UTF_8.encode(n); // URI encode StringBuilder sb = new StringBuilder(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isUnreserved(b) || b == e) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); }
/** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s The string the encode (assuming ASCII characters only) */ private static String minimalEncodeUTF8(String s) { String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = StandardCharsets.UTF_8.encode(n); // URI encode StringBuilder sb = new StringBuilder(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isLegal(b)) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); }
/** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s The string the encode (assuming ASCII characters only) * @param e A character that does not require encoding if found in the string. */ private static String encode_UTF8(String s, char e) { // TODO: Normalizer requires Java 6! String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = UTF8.encode(n); // URI encode StringBuffer sb = new StringBuffer(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isUnreserved(b) || b == e) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); }
/** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s The string the encode (assuming ASCII characters only) */ private static String minimalEncode_UTF8(String s) { // TODO: Normalizer requires Java 6! String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = UTF8.encode(n); // URI encode StringBuffer sb = new StringBuffer(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isLegal(b)) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); }
/** * 获取字符串的Slug. * @param str - 待获取Slug的字符串 * @return 字符串对应的Slug */ public static String getSlug(String str) { if ( str == null ) { return ""; } // Rid of White Spaces String noWhiteSpace = WHITESPACE.matcher(str.trim()).replaceAll("-"); // Processing Non-ASCII Characters try { noWhiteSpace = URLEncoder.encode(noWhiteSpace, "UTF-8"); } catch (UnsupportedEncodingException e) { // Never reach here } // Slugify String String normalized = Normalizer.normalize(noWhiteSpace, Form.NFD); return normalized.toLowerCase(); }
static String NormalizeAccents(String regularString) { if (!g_bNormalize) return regularString; // leave the accents String normalizedString = regularString.replace("é", "e"); normalizedString = Normalizer.normalize(normalizedString, Form.NFD); StringBuilder sb = new StringBuilder(normalizedString); for (int i = 0; i < sb.length(); i++) { if (Character.getType(sb.charAt(i)) == Character.NON_SPACING_MARK) { sb.delete(i, 1); } } regularString = sb.toString(); return regularString; }
public void apply(TokenStream stream) throws TokenizerException { if (stream == null) return; stream.reset(); while (stream.hasNext()) { String token = stream.next(); // String tmp = Normalizer.normalize(token, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+",""); String tmp = Normalizer.normalize(token, Form.NFD); tmp = tmp.replaceAll("[\\p{InCombiningDiacriticalMarks}]", ""); // .replaceAll("\\p{InCombiningDiacriticalMarks}+",""); if(!token.equals(tmp)) { stream.previous(); stream.set(tmp); stream.next(); } } }
/**
 * Reduces a string to ASCII letters, digits, dashes and underscores.
 * <p>
 * The value is decomposed (NFD), combining diacritical marks are removed,
 * then every character other than {@code - _ a-z A-Z 0-9} and whitespace is
 * removed, and finally space characters are removed. The value is logged at
 * debug level before and after.
 *
 * @param value the text to convert
 * @return the converted text
 */
public static String convertToAlphaNumerics(String value) {
    logger.debug("Before : " + value);

    final String decomposed = Normalizer.normalize(value, Form.NFD);
    final String converted = decomposed
            .replaceAll("[\\p{InCombiningDiacriticalMarks}]", "")
            .replaceAll("[^-_a-zA-Z0-9\\s]", "")
            .replace(" ", "");

    logger.debug("After : " + converted);
    return converted;
}
/**
 * Removes accents from a string.
 * <p>
 * The value is decomposed (NFD) and every run of characters from the
 * Combining Diacritical Marks block is deleted. The value is logged at
 * debug level before and after.
 *
 * @param value the text to process
 * @return the text without combining diacritical marks
 */
public static String deAccent(String value) {
    logger.debug("Before : " + value);

    final String decomposed = Normalizer.normalize(value, Form.NFD);
    final String unaccented = decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

    logger.debug("After : " + unaccented);
    return unaccented;
}
/**
 * Reduces a string to ASCII letters and digits, with a numbered fallback.
 * <p>
 * The value's ISO-8859-1 bytes are re-read as UTF-8, the result is
 * decomposed (NFD), all non-ASCII characters are removed, then everything
 * except ASCII letters, digits and whitespace, and finally space
 * characters. When nothing is left, {@code "default" + countApp} is
 * returned instead.
 *
 * @param value    the text to convert
 * @param countApp number appended to {@code "default"} for the fallback
 * @return the converted text, or the fallback when it would be empty
 * @throws UnsupportedEncodingException if a required charset is missing
 */
public static String convertToAlphaNumerics(String value, Integer countApp)
        throws UnsupportedEncodingException {
    final byte[] latin1Bytes = value.getBytes("ISO-8859-1");
    final String reinterpreted = new String(latin1Bytes, "UTF-8");

    final String cleaned = Normalizer.normalize(reinterpreted, Form.NFD)
            .replaceAll("[^\\p{ASCII}]", "")
            .replaceAll("[^a-zA-Z0-9\\s]", "")
            .replace(" ", "");

    if (cleaned.isEmpty()) {
        return "default" + countApp;
    }
    return cleaned;
}
public static String makeSlug(String input, boolean transliterate) { String origInput = input; // Validate the input if (input == null) { ProjectLogger.log("Provided input value is null"); return input; } // Remove extra spaces input = input.trim(); // Remove URL encoding input = urlDecode(input); // If transliterate is required if (transliterate) { // Tranlisterate & cleanup String transliterated = transliterate(input); // transliterated = removeDuplicateChars(transliterated); input = transliterated; } // Replace all whitespace with dashes input = WHITESPACE.matcher(input).replaceAll("-"); // Remove all accent chars input = Normalizer.normalize(input, Form.NFD); // Remove all non-latin special characters input = NONLATIN.matcher(input).replaceAll(""); // Remove any consecutive dashes input = normalizeDashes(input); // Validate before returning validateResult(input, origInput); // Slug is always lowercase return input.toLowerCase(Locale.ENGLISH); }
/**
 * Attempts a single composition step at the start of the input.
 * <p>
 * The leading span measured by {@code countChars(input, 0, 2)} (the first
 * two characters) is normalized with NFC. If that changes the span, the
 * composed span followed by the untouched remainder is returned.
 *
 * @param input the text whose leading characters should be composed
 * @return the input with its leading span composed, or {@code null} if the
 *         first two characters cannot be further composed
 */
private static String composeOneStep(String input) {
    final int prefixLength = countChars(input, 0, 2);
    final String prefix = input.substring(0, prefixLength);
    final String composed = Normalizer.normalize(prefix, Normalizer.Form.NFC);

    if (composed.equals(prefix)) {
        return null;
    }
    return composed + input.substring(prefixLength);
}
/**
 * Tries to match one (possibly multi-code-point) character at index
 * {@code i} against {@code predicate}, then continues with {@code next}.
 * <p>
 * Starting at {@code i}, code points are gathered up to the first position
 * where {@code Grapheme.isBoundary} reports a boundary (or {@code matcher.to}
 * is reached). A span of one code point is tested directly. A longer span is
 * normalized to NFC and tested only when it composes to exactly one code
 * point; on failure the span is shortened by one code point from the end and
 * retried, until only the first code point would remain.
 *
 * @param matcher the matcher state; {@code to} is read and {@code hitEnd}
 *                may be set
 * @param i       index in {@code seq} at which to match
 * @param seq     the input being matched
 * @return {@code true} if a span at {@code i} satisfied the predicate and
 *         {@code next} matched after it
 */
boolean match(Matcher matcher, int i, CharSequence seq) {
    if (i < matcher.to) {
        int ch0 = Character.codePointAt(seq, i);
        int n = Character.charCount(ch0);
        int j = i + n;
        // Extend j over following code points until a boundary is reported.
        while (j < matcher.to) {
            int ch1 = Character.codePointAt(seq, j);
            if (Grapheme.isBoundary(ch0, ch1))
                break;
            ch0 = ch1;
            j += Character.charCount(ch1);
        }
        if (i + n == j) {    // single, assume nfc cp
            if (predicate.is(ch0))
                return next.match(matcher, j, seq);
        } else {
            // Multi-code-point span: try the longest span first, then
            // drop one trailing code point per iteration.
            while (i + n < j) {
                String nfc = Normalizer.normalize(
                    seq.toString().substring(i, j), Normalizer.Form.NFC);
                // Only a span that composes to a single code point can be
                // tested against the predicate.
                if (nfc.codePointCount(0, nfc.length()) == 1) {
                    if (predicate.is(nfc.codePointAt(0)) &&
                        next.match(matcher, j, seq)) {
                        return true;
                    }
                }

                ch0 = Character.codePointBefore(seq, j);
                j -= Character.charCount(ch0);
            }
        }
        // Failed before reaching matcher.to: plain mismatch, hitEnd untouched.
        if (j < matcher.to)
            return false;
    }
    // Either i was already at/after matcher.to or the span ran up to it.
    matcher.hitEnd = true;
    return false;
}
@Override public Object translateBytes(byte[] bytes, DataFlavor flavor, long format, Transferable transferable) throws IOException { if (format == CF_URL && URL.class.equals(flavor.getRepresentationClass())) { String charset = Charset.defaultCharset().name(); if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) { try { charset = new String((byte[]) transferable.getTransferData(javaTextEncodingFlavor), StandardCharsets.UTF_8); } catch (UnsupportedFlavorException cannotHappen) { } } String xml = new String(bytes, charset); // macosx pasteboard returns a property list that consists of one URL // let's extract it. return new URL(extractURL(xml)); } if(isUriListFlavor(flavor) && format == CF_FILE) { // dragQueryFile works fine with files and url, // it parses and extracts values from property list. // maxosx always returns property list for // CF_URL and CF_FILE String[] strings = dragQueryFile(bytes); if(strings == null) { return null; } bytes = String.join(System.getProperty("line.separator"), strings).getBytes(); // now we extracted uri from xml, now we should treat it as // regular string that allows to translate data to target represantation // class by base method format = CF_STRING; } else if (format == CF_STRING) { bytes = Normalizer.normalize(new String(bytes, "UTF8"), Form.NFC).getBytes("UTF8"); } return super.translateBytes(bytes, flavor, format, transferable); }
/** * provides the slug name for the parsed input * @param input * @return */ //from https://stackoverflow.com/questions/1657193/java-code-library-for-generating-slugs-for-use-in-pretty-urls public static String toSlug(String input) { String nowhitespace = WHITESPACE.matcher(input).replaceAll("-"); String normalized = Normalizer.normalize(nowhitespace, Form.NFD); String slug = NONLATIN.matcher(normalized).replaceAll(""); return slug.toLowerCase(Locale.ROOT); }
public static String normalizeText(String text) { text = text.toLowerCase(Locale.getDefault()); text = new MCRHyphenNormalizer().normalize(text).replace("-", " "); text = Normalizer.normalize(text, Form.NFD).replaceAll("\\p{M}", ""); //canonical decomposition, remove accents text = text.replace("ue", "u").replace("oe", "o").replace("ae", "a").replace("ß", "s").replace("ss", "s"); text = text.replaceAll("[^a-z0-9]\\s]", ""); //remove all non-alphabetic characters // text = text.replaceAll("\\b.{1,3}\\b", " ").trim(); // remove all words with fewer than four characters text = text.replaceAll("\\p{Punct}", " ").trim(); // remove all punctuation text = text.replaceAll("\\s+", " "); // normalize whitespace return text; }
private String normalize(String nameFragment) { String text = nameFragment.toLowerCase(Locale.getDefault()); text = new MCRHyphenNormalizer().normalize(text).replace("-", " "); text = Normalizer.normalize(text, Form.NFD).replaceAll("\\p{M}", ""); // canonical decomposition, then remove accents text = text.replace("ue", "u").replace("oe", "o").replace("ae", "a").replace("ß", "s").replace("ss", "s"); text = text.replaceAll("[^a-z0-9]\\s]", ""); //remove all non-alphabetic characters text = text.replaceAll("\\p{Punct}", " ").trim(); // remove all punctuation text = text.replaceAll("\\s+", " "); // normalize whitespace return text.trim(); }
/**
 * Tells whether a string consists solely of digits.
 * <p>
 * The string is normalized with NFKC first, so compatibility forms of the
 * ASCII digits (such as full-width digits) are accepted. It must then match
 * {@code \d+}, which means at least one digit and nothing else.
 *
 * @param str the string to test; may be {@code null}
 * @return {@code true} if the normalized string is one or more digits;
 *         {@code false} otherwise, including for {@code null}
 */
public static boolean isNumber(String str) {
    return str != null && Normalizer.normalize(str, Form.NFKC).matches("\\d+");
}
/**
 * Transforms any string into a slug.
 * <p>
 * Matches of {@code Constant.WHITESPACE} become dashes, the text is
 * decomposed (NFD), matches of {@code Constant.NONLATIN} are deleted, and
 * the result is lower-cased with {@link Locale#ENGLISH}.
 *
 * @param input string to convert into a slug
 * @return slug string
 */
public static String toSlug(String input) {
    final String dashed = Constant.WHITESPACE.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    return Constant.NONLATIN.matcher(decomposed).replaceAll("").toLowerCase(Locale.ENGLISH);
}
/**
 * Strips combining diacritical marks from a property value, in place.
 * <p>
 * The string obtained through {@code getStringValue} is decomposed (NFD),
 * runs of characters from the Combining Diacritical Marks block are
 * removed, and the result is set back on the value. Empty or missing text
 * leaves the value untouched.
 *
 * @param propertyValue the value to process; must not be {@code null}
 * @return the same {@code propertyValue} instance
 */
@Override
public <T extends Object> PrismPropertyValue<T> apply(PrismPropertyValue<T> propertyValue) {
    Validate.notNull(propertyValue, "Node must not be null.");

    final String original = getStringValue(propertyValue);
    if (!StringUtils.isEmpty(original)) {
        final String decomposed = Normalizer.normalize(original, Form.NFD);
        final String stripped = decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
        propertyValue.setValue((T) stripped);
    }
    return propertyValue;
}
/**
 * Returns the suggested slug, or derives one from the title when no usable
 * suggestion is given.
 * <p>
 * A suggestion that is non-null and not blank is returned unchanged.
 * Otherwise the title is lower-cased, decomposed (NFD), stripped of
 * combining diacritical marks and of everything that is neither a word
 * character nor whitespace, and its whitespace runs are turned into single
 * dashes.
 * <p>
 * Lower-casing uses {@link java.util.Locale#ROOT}: with the default locale
 * a Turkish JVM maps {@code "I"} to dotless {@code "\u0131"}, which the
 * filter then removes, so {@code "TITLE"} used to become {@code "ttle"}.
 *
 * @param suggestedSlug slug proposed by the caller; may be {@code null} or blank
 * @param newTitle      title to derive the slug from when there is no suggestion
 * @return the suggested slug, or the slug derived from the title
 */
final String generateSlug(final String suggestedSlug, final String newTitle) {
    if (suggestedSlug != null && !suggestedSlug.trim().isEmpty()) {
        return suggestedSlug;
    }

    final String lowered = newTitle.toLowerCase(java.util.Locale.ROOT);
    return Normalizer.normalize(lowered, Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}|[^\\w\\s]", "")
            .replaceAll("[\\s-]+", " ")
            .trim()
            .replaceAll("\\s", "-");
}
/**
 * Generates a slug from the input and appends a six-character random
 * suffix.
 * <p>
 * Whitespace characters become dashes, the text is decomposed (NFD),
 * everything that is neither a word character nor a dash is removed, and
 * the result is lower-cased with {@link Locale#ENGLISH}. The suffix is the
 * first six base-32 digits of a 130-bit number drawn from a
 * {@link SecureRandom} that was given the bytes of
 * {@code createdAt.toString()} as seed material.
 *
 * @param input     the text to slugify
 * @param createdAt date whose string form is passed to the random generator
 * @return {@code <slug>-<suffix>}
 */
public static String generateSlug(String input, Date createdAt) {
    final Pattern nonSlugChars = Pattern.compile("[^\\w-]");
    final Pattern whitespace = Pattern.compile("[\\s]");
    final SecureRandom rng = new SecureRandom(createdAt.toString().getBytes());

    final String dashed = whitespace.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    final String base = nonSlugChars.matcher(decomposed).replaceAll("").toLowerCase(Locale.ENGLISH);

    final String suffix = new BigInteger(130, rng).toString(32).substring(0, 6);
    return base + "-" + suffix;
}