/** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s * The string the encode (assuming ASCII characters only) * @param e * A character that does not require encoding if found in the * string. */ private static String encode_UTF8(String s, char e) { // TODO: Normalizer requires Java 6! String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = UTF8.encode(n); // URI encode StringBuffer sb = new StringBuffer(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isUnreserved(b) || b == e) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); }
/** * Encodes a string containing non ASCII characters using an UTF-8 encoder. * * @param s * The string the encode (assuming ASCII characters only) */ private static String minimalEncode_UTF8(String s) { // TODO: Normalizer requires Java 6! String n = (Normalizer.isNormalized(s, Form.NFKC)) ? s : Normalizer.normalize(s, Form.NFKC); // convert String to UTF-8 ByteBuffer bb = UTF8.encode(n); // URI encode StringBuffer sb = new StringBuffer(); while (bb.hasRemaining()) { int b = bb.get() & 0xff; if (isLegal(b)) { sb.append((char) b); } else { appendEscape(sb, (byte) b); } } return sb.toString(); }
/**
 * Tests whether a string is in the given normalization form.
 *
 * The result is {@code true} exactly when the quick check of the selected
 * form reports {@code YES} for the whole string.
 * NOTE(review): the previous documentation promised a definitive answer that
 * never treats "maybe" as a result; this code maps anything other than
 * {@code YES} to {@code false} - confirm that is intended.
 *
 * @param str the input string to be checked to see if it is normalized
 * @param form the normalization form
 * @param options the optional features to be enabled.
 * @return {@code true} if the quick check for {@code form} returns {@code YES}
 * @throws IllegalArgumentException if {@code form} is not one of the four known forms
 */
public static boolean isNormalized(String str, Normalizer.Form form, int options) {
    final boolean normalized;
    switch (form) {
    case NFC:
        normalized = NFC.quickCheck(str.toCharArray(), 0, str.length(), false,
                NormalizerImpl.getNX(options)) == YES;
        break;
    case NFD:
        normalized = NFD.quickCheck(str.toCharArray(), 0, str.length(), false,
                NormalizerImpl.getNX(options)) == YES;
        break;
    case NFKC:
        normalized = NFKC.quickCheck(str.toCharArray(), 0, str.length(), false,
                NormalizerImpl.getNX(options)) == YES;
        break;
    case NFKD:
        normalized = NFKD.quickCheck(str.toCharArray(), 0, str.length(), false,
                NormalizerImpl.getNX(options)) == YES;
        break;
    default:
        throw new IllegalArgumentException("Unexpected normalization form: " + form);
    }
    return normalized;
}
/**
 * Translates native clipboard/drag-and-drop bytes into a Java object.
 *
 * A {@code CF_URL} payload requested as a {@link URL} is decoded with the
 * charset advertised by the transferable (falling back to the default text
 * charset). A {@code CF_STRING} payload is re-encoded as NFC-normalized UTF-8
 * before being handed to the superclass.
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor, long format,
        Transferable transferable) throws IOException {
    boolean urlRequested = format == CF_URL
            && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        String charset = getDefaultTextCharset();
        boolean hasEncodingFlavor = transferable != null
                && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (hasEncodingFlavor) {
            try {
                byte[] encodingName = (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
                charset = new String(encodingName, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // Support for the flavor was checked just above; keep the default charset.
            }
        }
        return new URL(new String(bytes, charset));
    }

    byte[] payload = bytes;
    if (format == CF_STRING) {
        String text = new String(bytes, "UTF8");
        payload = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
    }
    return super.translateBytes(payload, flavor, format, transferable);
}
/**
 * Stores the raw value and derives the search flags from it.
 *
 * A leading backslash marks the value as exact, a trailing asterisk marks it
 * as a wildcard; in both cases the marker is removed from
 * {@code slimZoekenWaarde}. Unless the exact flag is set, the value is also
 * flagged case-insensitive (no upper-case A-Z and a string attribute) and
 * flagged as containing diacritics when its NFD form has combining marks.
 * Flags are only ever switched on here, never reset.
 *
 * @param waarde the value to set, may be {@code null}
 */
public void setWaarde(final String waarde) {
    this.waarde = waarde;
    if (waarde == null) {
        this.slimZoekenWaarde = null;
        return;
    }

    if (waarde.startsWith("\\")) {
        this.exact = true;
        this.slimZoekenWaarde = waarde.substring(1);
    } else if (waarde.endsWith("*")) {
        this.wildcard = true;
        this.slimZoekenWaarde = waarde.substring(0, waarde.length() - 1);
    } else {
        this.slimZoekenWaarde = waarde;
    }

    if (this.exact) {
        return;
    }

    boolean hasUpperCase = waarde.matches(".*[A-Z].*");
    if (!hasUpperCase && attribuut.isString()) {
        this.caseInsensitive = true;
    }

    String decomposed = Normalizer.normalize(waarde, Normalizer.Form.NFD);
    boolean hasDiacritics =
            Pattern.compile("\\p{InCombiningDiacriticalMarks}+").matcher(decomposed).find();
    if (hasDiacritics) {
        this.diakriet = true;
    }
}
/** * Converteer een naam naar een java enumeratie naam. * @param javaNameBase naam * @return enumeratie naam */ public static String convertToJavaEnumName(final String javaNameBase) { if (javaNameBase.startsWith(LITERAL)) { return StringEscapeUtils.unescapeJava(javaNameBase.replaceAll(String.format("^%s", LITERAL), "")); } else { String result = javaNameBase; // Unaccent result = Normalizer.normalize(result, Normalizer.Form.NFD); // Replace whitespace with underscore result = result.replaceAll("(\\s|-)", "_"); // Uppercase result = result.toUpperCase(); // Remove unsupported characters result = result.replaceAll("[^A-Z0-9_]", ""); // Remove duplicate seperators result = result.replaceAll("_{2,}", "_"); return result; } }
/**
 * Removes diacritics from the specified string.
 *
 * The string is decomposed to NFD and every character belonging to the
 * "Combining Diacritical Marks" or "Combining Diacritical Marks Supplement"
 * Unicode block is dropped.
 *
 * @param s the string to process
 * @return the NFD form of the specified string with diacritics removed.
 */
public static final String removeDiacritics(String s) {
    final String decomposed = Normalizer.normalize(s, Form.NFD);
    final StringBuilder kept = new StringBuilder(decomposed.length());
    for (char ch : decomposed.toCharArray()) {
        UnicodeBlock block = UnicodeBlock.of(ch);
        boolean combiningMark = UnicodeBlock.COMBINING_DIACRITICAL_MARKS.equals(block)
                || UnicodeBlock.COMBINING_DIACRITICAL_MARKS_SUPPLEMENT.equals(block);
        if (!combiningMark) {
            kept.append(ch);
        }
    }
    return kept.toString();
}
/**
 * Converts the specified string to its slug representation, which can be used to generate
 * readable and SEO-friendly URLs.
 *
 * Steps: transliterate, replace {@code WHITESPACE} matches with dashes, NFD-normalize, remove
 * {@code NONLATIN} matches, remove {@code EDGESDHASHES} matches, lower-case (English locale).
 *
 * @param input string, which will be converted.
 * @return slug representation of the string.
 */
public static String toSlug(String input) {
    String slug = transliterator.transform(input);
    slug = WHITESPACE.matcher(slug).replaceAll("-");
    slug = Normalizer.normalize(slug, Normalizer.Form.NFD);
    slug = NONLATIN.matcher(slug).replaceAll("");
    slug = EDGESDHASHES.matcher(slug).replaceAll("");
    return slug.toLowerCase(Locale.ENGLISH);
}
/**
 * Converts the specified string to its slug representation, which can be used to generate
 * readable and SEO-friendly URLs.
 *
 * @param input string, which will be converted.
 * @return slug representation of the string: transliterated, whitespace turned into dashes,
 *         NFD-normalized, stripped of {@code NONLATIN} and {@code EDGESDHASHES} matches and
 *         lower-cased with the English locale.
 */
public String toSlug(String input) {
    final String dashed =
            WHITESPACE.matcher(transliterator.transform(input)).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Normalizer.Form.NFD);
    final String latinOnly = NONLATIN.matcher(decomposed).replaceAll("");
    final String trimmed = EDGESDHASHES.matcher(latinOnly).replaceAll("");
    return trimmed.toLowerCase(Locale.ENGLISH);
}
private String processData(String input) { // to extract all alphabets from string String withoutAccent = Normalizer.normalize(input, Normalizer.Form.NFD); String output = withoutAccent.replaceAll("[^a-zA-Z ]", ""); return output; //return s.replaceAll("[^A-Za-z]+", ""); }
/**
 * Strips accents and special characters from a value.
 *
 * The value is NFD-decomposed, combining diacritical marks are removed, then every
 * character other than {@code -}, {@code _}, ASCII letters, digits and whitespace is
 * dropped, and finally all space characters are removed. The value is logged at debug
 * level before and after the conversion.
 *
 * @param value the text to convert
 * @return the converted text
 */
public static String convertToAlphaNumerics(String value) {
    logger.debug("Before : " + value);
    String decomposed = Normalizer.normalize(value, Form.NFD);
    String cleaned = decomposed
            .replaceAll("[\\p{InCombiningDiacriticalMarks}]", "")
            .replaceAll("[^-_a-zA-Z0-9\\s]", "")
            .replace(" ", "");
    logger.debug("After : " + cleaned);
    return cleaned;
}
/**
 * Removes accents from a value by NFD-decomposing it and deleting all runs of
 * combining diacritical marks. The value is logged at debug level before and after.
 *
 * @param value the text to process
 * @return the text without combining diacritical marks
 */
public static String deAccent(String value) {
    logger.debug("Before : " + value);
    String decomposed = Normalizer.normalize(value, Form.NFD);
    String stripped = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
            .matcher(decomposed)
            .replaceAll("");
    logger.debug("After : " + stripped);
    return stripped;
}
/**
 * Sets the tag after reducing it to lower-case ASCII letters and digits.
 *
 * A non-null tag is lower-cased, NFD-decomposed, stripped of combining diacritical
 * marks and then of every character outside {@code a-z0-9}. {@code null} is stored as is.
 *
 * @param tag the raw tag, may be {@code null}
 */
public void setTag(String tag) {
    if (tag == null) {
        this.tag = null;
        return;
    }
    // Locale.ROOT keeps the result independent of the JVM default locale; with a Turkish
    // locale 'I' would become dotless U+0131 and be deleted by the filter below.
    String normalized = Normalizer.normalize(
            tag.toLowerCase(java.util.Locale.ROOT), Normalizer.Form.NFD);
    normalized = normalized.replaceAll("[\\p{InCombiningDiacriticalMarks}]", "");
    normalized = normalized.replaceAll("[^a-z0-9]", "");
    this.tag = normalized;
}
/**
 * Sets the full tag after reducing it to lower-case ASCII letters, digits and dashes.
 *
 * A non-null value is lower-cased, NFD-decomposed, stripped of combining diacritical
 * marks and then of every character outside {@code a-z0-9-}. {@code null} is stored as is.
 *
 * @param fullTag the raw full tag, may be {@code null}
 */
public void setFullTag(String fullTag) {
    if (fullTag == null) {
        this.fullTag = null;
        return;
    }
    // Locale.ROOT keeps the result independent of the JVM default locale; with a Turkish
    // locale 'I' would become dotless U+0131 and be deleted by the filter below.
    String normalized = Normalizer.normalize(
            fullTag.toLowerCase(java.util.Locale.ROOT), Normalizer.Form.NFD);
    normalized = normalized.replaceAll("[\\p{InCombiningDiacriticalMarks}]", "");
    normalized = normalized.replaceAll("[^a-z0-9-]", "");
    this.fullTag = normalized;
}
/**
 * Converts a value to plain ASCII letters and digits.
 *
 * The value's ISO-8859-1 bytes are re-read as UTF-8, the result is NFD-decomposed,
 * non-ASCII characters are removed, then everything except ASCII letters, digits and
 * whitespace, and finally all space characters. If nothing is left, the fallback
 * {@code "default" + countApp} is returned.
 *
 * @param value the text to convert
 * @param countApp number appended to the fallback name
 * @return the converted text, or the fallback when the result would be empty
 * @throws UnsupportedEncodingException if one of the named charsets is not available
 */
public static String convertToAlphaNumerics(String value, Integer countApp)
        throws UnsupportedEncodingException {
    String reinterpreted = new String(value.getBytes("ISO-8859-1"), "UTF-8");
    String decomposed = Normalizer.normalize(reinterpreted, Form.NFD);
    String cleaned = decomposed
            .replaceAll("[^\\p{ASCII}]", "")
            .replaceAll("[^a-zA-Z0-9\\s]", "")
            .replace(" ", "");
    if (cleaned.isEmpty()) {
        return "default" + countApp;
    }
    return cleaned;
}
/**
 * Normalizes a String to Unicode form NFKC.
 *
 * @param value the value to normalize
 * @return the given instance when it is already in NFKC, otherwise its NFKC form
 */
public static String normalize( String value )
{
    return Normalizer.isNormalized( value, Normalizer.Form.NFKC )
        ? value
        : Normalizer.normalize( value, Normalizer.Form.NFKC );
}
private String cleanseName(String name) { // Replace whitespace with _ String result= name.trim().toLowerCase().replaceAll("\\s+", "_"); // Remove accents from characters and strips out non-alphanumeric chars. return Normalizer.normalize(result, Normalizer.Form.NFD).replaceAll("[^a-zA-z0-9_]+", ""); }
/**
 * Attempts to convert text in a given character set to a Unicode string and
 * normalize it to NFC.
 *
 * @param text ByteBuffer containing the character array to convert.
 * @param charsetName Character set the text is encoded in.
 * @return the NFC-normalized Unicode string, or null if the conversion returned null.
 */
@CalledByNative
private static String convertToUnicodeAndNormalize(
        ByteBuffer text,
        String charsetName) {
    String decoded = convertToUnicode(text, charsetName);
    return decoded == null
            ? null
            : Normalizer.normalize(decoded, Normalizer.Form.NFC);
}
/**
 * Normalizes the input for use in a slug.
 *
 * The input is NFKD-decomposed, matches of {@code PATTERN_NORMALIZE_NON_ASCII} are removed,
 * matches of {@code PATTERN_NORMALIZE_SEPARATOR} are replaced by "_" or "-" depending on
 * {@code underscoreSeparator}, and matches of {@code PATTERN_NORMALIZE_TRIM_DASH} are removed.
 *
 * @param input the text to normalize
 * @return the normalized text
 */
private String normalize(final String input) {
    final String decomposed = Normalizer.normalize(input, Normalizer.Form.NFKD);
    final String asciiOnly = PATTERN_NORMALIZE_NON_ASCII.matcher(decomposed).replaceAll(EMPTY);

    final String separator;
    if (underscoreSeparator) {
        separator = "_";
    } else {
        separator = "-";
    }
    final String separated = PATTERN_NORMALIZE_SEPARATOR.matcher(asciiOnly).replaceAll(separator);
    return PATTERN_NORMALIZE_TRIM_DASH.matcher(separated).replaceAll(EMPTY);
}
/**
 * Derives a hex-encoded value from a message and a usage name with HKDF (HMAC-SHA512).
 *
 * Both strings are converted to bytes in NFKD form; the message is used as input keying
 * material and the usage name as the info parameter, together with the configured
 * {@code salt} and {@code outLength}.
 *
 * @param providedMessage the message to derive from, must not be null
 * @param usageName the usage name, must not be null
 * @return the derived bytes as a hex string
 */
@Override
public String derive(String providedMessage, String usageName) {
    Objects.requireNonNull(providedMessage);
    Objects.requireNonNull(usageName);

    final byte[] keyingMaterial = Bytes.from(providedMessage, Normalizer.Form.NFKD).array();
    final byte[] info = Bytes.from(usageName, Normalizer.Form.NFKD).array();
    final byte[] derived =
            HKDF.fromHmacSha512().extractAndExpand(salt, keyingMaterial, info, outLength);
    return Bytes.wrap(derived).encodeHex();
}
/**
 * Derives the key for a content entry with HKDF (HMAC-SHA512).
 *
 * The input keying material is the concatenation of the fingerprint, the content salt,
 * the NFKD bytes of the content key and, when a password is given, the stretched password.
 * The preference salt is used as the HKDF salt and the derived key is
 * {@code keyLengthBit / 8} bytes long.
 *
 * @param contentKey key of the content entry
 * @param fingerprint fingerprint bytes
 * @param contentSalt salt of the content entry, also used for password stretching
 * @param preferenceSalt salt passed to HKDF
 * @param password optional user password, may be {@code null}
 * @return the derived key bytes
 */
private byte[] keyDerivationFunction(String contentKey, byte[] fingerprint, byte[] contentSalt,
        byte[] preferenceSalt, @Nullable char[] password) {
    Bytes ikm = Bytes.wrap(fingerprint)
            .append(contentSalt)
            .append(Bytes.from(contentKey, Normalizer.Form.NFKD));

    if (password != null) {
        // Keep the instance returned by append(): previously the result was discarded, so
        // with an immutable Bytes the stretched password never reached the key material.
        // NOTE(review): if Bytes is immutable, keys for password-protected entries change
        // with this fix - confirm whether stored data needs a migration path.
        ikm = ikm.append(keyStretchingFunction.stretch(contentSalt, password, 32));
    }

    // Explicit charset so the info bytes do not depend on the platform default.
    byte[] info = "DefaultEncryptionProtocol".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    return HKDF.fromHmacSha512().extractAndExpand(preferenceSalt, ikm.array(), info, keyLengthBit / 8);
}
/**
 * Strips accents by NFD-decomposing the string and removing every non-ASCII character.
 *
 * @param s the text to process, may be {@code null}
 * @return the ASCII-only text, or {@code null} when {@code s} is {@code null}
 */
public static String stripAccents(String s) {
    if (s == null) {
        return null;
    }
    return Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("[^\\p{ASCII}]", "");
}
/**
 * Computes the length of a tweet text.
 *
 * The text is NFC-normalized and counted in code points. For every URL found by the
 * extractor, the URL's own span is subtracted and the configured short-URL length
 * (https or plain) is added instead.
 *
 * @param text the tweet text
 * @return the computed length
 */
public int getTweetLength(String text) {
    final String normalized = Normalizer.normalize(text, Normalizer.Form.NFC);
    int length = normalized.codePointCount(0, normalized.length());

    for (Extractor.Entity urlEntity : extractor.extractURLsWithIndices(normalized)) {
        length += urlEntity.start - urlEntity.end;
        if (urlEntity.value.toLowerCase().startsWith("https://")) {
            length += shortUrlLengthHttps;
        } else {
            length += shortUrlLength;
        }
    }
    return length;
}
/**
 * Decrypts the key with the given passphrase.
 *
 * The passphrase is NFC-normalized and passed to the EC-multiply or the plain decryption
 * routine depending on {@code ecMultiply}. The first four bytes of the double SHA-256 of
 * the resulting key's address string are then compared with the stored address hash.
 *
 * @param passphrase the passphrase
 * @return the decrypted key
 * @throws BadPassphraseException if the address hash of the decrypted key does not match
 */
public ECKey decrypt(String passphrase) throws BadPassphraseException {
    final String normalizedPassphrase = Normalizer.normalize(passphrase, Normalizer.Form.NFC);

    final ECKey key;
    if (ecMultiply) {
        key = decryptEC(normalizedPassphrase);
    } else {
        key = decryptNoEC(normalizedPassphrase);
    }

    byte[] addressBytes = key.toAddress(params).toString().getBytes(Charsets.US_ASCII);
    byte[] actualAddressHash =
            Arrays.copyOfRange(Sha256Hash.twiceOf(addressBytes).getBytes(), 0, 4);
    if (Arrays.equals(actualAddressHash, addressHash)) {
        return key;
    }
    throw new BadPassphraseException();
}
public CharSequence normalize(final CharSequence name) { if(!Normalizer.isNormalized(name, Normalizer.Form.NFC)) { // Canonical decomposition followed by canonical composition (default) final String normalized = Normalizer.normalize(name, Normalizer.Form.NFC); if(log.isDebugEnabled()) { log.debug(String.format("Normalized string %s to %s", name, normalized)); } return normalized; } return name; }
/**
 * Asserts that {@code Bytes} encodes the string like the JDK does for the given charset.
 *
 * For UTF-8 it additionally checks the NFKD-normalizing factory; for any other charset it
 * additionally checks the charset-less factory against plain UTF-8 encoding.
 *
 * @param string the text to encode
 * @param charset the charset under test
 */
private void checkString(String string, Charset charset) {
    Bytes encoded = Bytes.from(string, charset);
    assertArrayEquals(string.getBytes(charset), encoded.array());
    assertEquals(new String(string.getBytes(charset), charset), encoded.encodeCharset(charset));

    if (charset == StandardCharsets.UTF_8) {
        Bytes normalized = Bytes.from(string, Normalizer.Form.NFKD);
        byte[] expected = Normalizer.normalize(string, Normalizer.Form.NFKD).getBytes(charset);
        assertArrayEquals(expected, normalized.array());
        return;
    }

    Bytes utf8 = Bytes.from(string);
    assertArrayEquals(string.getBytes(StandardCharsets.UTF_8), utf8.array());
    assertEquals(
            new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8),
            utf8.encodeUtf8());
}
public static void main(String[] args) { String s = "São Paulo"; System.out.println(Normalizer.isNormalized(s, Normalizer.Form.NFKD)); System.out.println(s); s = Normalizer.normalize(s, Normalizer.Form.NFKD); System.out.println(Normalizer.isNormalized(s, Normalizer.Form.NFKD)); System.out.println(s); // TODO: how can I print the difference? }
@Override public boolean isNameCompatible(String cn, JavaFileObject.Kind kind) { cn.getClass(); // null check if (kind == Kind.OTHER && getKind() != kind) { return false; } String n = cn + kind.extension; if (name.equals(n)) { return true; } if (isMacOS && Normalizer.isNormalized(name, Normalizer.Form.NFD) && Normalizer.isNormalized(n, Normalizer.Form.NFC)) { // On Mac OS X it is quite possible to file name and class // name normalized in a different way - in that case we have to normalize file name // to the Normal Form Compised (NFC) String normName = Normalizer.normalize(name, Normalizer.Form.NFC); if (normName.equals(n)) { this.name = normName; return true; } } if (name.equalsIgnoreCase(n)) { try { // allow for Windows return file.getCanonicalFile().getName().equals(n); } catch (IOException e) { } } return false; }
/** * The pattern is converted to normalizedD form and then a pure group * is constructed to match canonical equivalences of the characters. */ private void normalize() { boolean inCharClass = false; int lastCodePoint = -1; // Convert pattern into normalizedD form normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD); patternLength = normalizedPattern.length(); // Modify pattern to match canonical equivalences StringBuilder newPattern = new StringBuilder(patternLength); for(int i=0; i<patternLength; ) { int c = normalizedPattern.codePointAt(i); StringBuilder sequenceBuffer; if ((Character.getType(c) == Character.NON_SPACING_MARK) && (lastCodePoint != -1)) { sequenceBuffer = new StringBuilder(); sequenceBuffer.appendCodePoint(lastCodePoint); sequenceBuffer.appendCodePoint(c); while(Character.getType(c) == Character.NON_SPACING_MARK) { i += Character.charCount(c); if (i >= patternLength) break; c = normalizedPattern.codePointAt(i); sequenceBuffer.appendCodePoint(c); } String ea = produceEquivalentAlternation( sequenceBuffer.toString()); newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint)); newPattern.append("(?:").append(ea).append(")"); } else if (c == '[' && lastCodePoint != '\\') { i = normalizeCharClass(newPattern, i); } else { newPattern.appendCodePoint(c); } lastCodePoint = c; i += Character.charCount(c); } normalizedPattern = newPattern.toString(); }
/**
 * Attempts to compose the first character of the input with the first
 * combining mark following it.
 *
 * @param input the sequence to compose
 * @return the NFC composition of the first two characters followed by the
 *         rest of the input, or null if the first two characters cannot be
 *         further composed.
 */
private String composeOneStep(String input) {
    final int prefixLength = countChars(input, 0, 2);
    final String prefix = input.substring(0, prefixLength);
    final String composed = Normalizer.normalize(prefix, Normalizer.Form.NFC);

    if (composed.equals(prefix)) {
        return null;
    }
    return composed + input.substring(prefixLength);
}
/**
 * Normalizes a <code>String</code> using the given normalization form.
 *
 * Strings consisting only of characters up to 127 are returned unchanged
 * for every known form.
 *
 * @param str the input string to be normalized.
 * @param form the normalization form
 * @param options the optional features to be enabled.
 * @return the normalized string
 * @throws IllegalArgumentException if {@code form} is not one of the four known forms
 */
public static String normalize(String str, Normalizer.Form form, int options) {
    final int length = str.length();

    // Find the first character above 127; short strings are scanned in place,
    // longer ones through a char array copy.
    int scanned = 0;
    if (length < 80) {
        while (scanned < length && str.charAt(scanned) <= 127) {
            scanned++;
        }
    } else {
        char[] chars = str.toCharArray();
        while (scanned < length && chars[scanned] <= 127) {
            scanned++;
        }
    }
    final boolean asciiOnly = scanned == length;

    final String normalized;
    switch (form) {
    case NFC:
        normalized = asciiOnly ? str : NFC.normalize(str, options);
        break;
    case NFD:
        normalized = asciiOnly ? str : NFD.normalize(str, options);
        break;
    case NFKC:
        normalized = asciiOnly ? str : NFKC.normalize(str, options);
        break;
    case NFKD:
        normalized = asciiOnly ? str : NFKD.normalize(str, options);
        break;
    default:
        throw new IllegalArgumentException("Unexpected normalization form: " + form);
    }
    return normalized;
}
/**
 * Splits a sentence into cleaned, stemmed words.
 *
 * Each token (split on {@code REGEX_SPACE}) is lower-cased, NFD-decomposed, stripped of
 * {@code REGEX_ALPHANUM} matches and stemmed; empty results and stop words are dropped.
 *
 * @param sentence the sentence to split
 * @return a stream of the remaining words
 */
private Stream<String> getWords(final String sentence) {
    final String[] tokens = sentence.split(REGEX_SPACE);
    return Arrays.stream(tokens)
            .map(token -> Normalizer.normalize(token.toLowerCase(), Normalizer.Form.NFD)
                    .replaceAll(REGEX_ALPHANUM, ""))
            .map(word -> stemmed(word))
            .filter(word -> !word.isEmpty())
            .filter(word -> !StopWords.match(word));
}
public static String makeSlug(String input, boolean transliterate) { String origInput = input; // Validate the input if (input == null) { ProjectLogger.log("Provided input value is null"); return input; } // Remove extra spaces input = input.trim(); // Remove URL encoding input = urlDecode(input); // If transliterate is required if (transliterate) { // Tranlisterate & cleanup String transliterated = transliterate(input); // transliterated = removeDuplicateChars(transliterated); input = transliterated; } // Replace all whitespace with dashes input = WHITESPACE.matcher(input).replaceAll("-"); // Remove all accent chars input = Normalizer.normalize(input, Form.NFD); // Remove all non-latin special characters input = NONLATIN.matcher(input).replaceAll(""); // Remove any consecutive dashes input = normalizeDashes(input); // Validate before returning validateResult(input, origInput); // Slug is always lowercase return input.toLowerCase(Locale.ENGLISH); }
/**
 * Removes accents and line/tab control characters from a string.
 *
 * Carriage returns, tabs and line feeds are deleted, every {@code &} becomes {@code E},
 * whitespace between {@code >} and {@code <} is removed, and the result is
 * NFKD-decomposed with all combining diacritical marks stripped.
 *
 * @param str the text to clean, may be {@code null}
 * @return the cleaned text; an empty string when {@code str} is {@code null}
 */
public static String removeAcentos(String str) {
    // Handle null before touching the string: the former null check ran only after
    // str had already been dereferenced, so null input threw a NullPointerException.
    if (str == null) {
        return "";
    }
    // Literal replacements do not need the regex engine.
    String cleaned = str.replace("\r", "")
            .replace("\t", "")
            .replace("\n", "")
            .replace("&", "E")
            .replaceAll(">\\s+<", "><");
    return Normalizer.normalize(cleaned, Normalizer.Form.NFKD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
}
/**
 * Wraps a word list so that every word is returned in NFKD form.
 *
 * @param wordList the word list to wrap
 * @return a view that delegates to {@code wordList}, normalizing each word to NFKD and
 *         passing the space character through unchanged
 */
private static WordList normalizeNFKD(WordList wordList) {
    return new WordList() {
        @Override
        public char getSpace() {
            return wordList.getSpace();
        }

        @Override
        public String getWord(int index) {
            String word = wordList.getWord(index);
            return Normalizer.normalize(word, Normalizer.Form.NFKD);
        }
    };
}