@Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { String reading = readingAttr.getReading(); if (useRomaji) { if (reading == null) { // if its an OOV term, just try the term text buffer.setLength(0); ToStringUtil.getRomanization(buffer, termAttr); termAttr.setEmpty().append(buffer); } else { ToStringUtil.getRomanization(termAttr.setEmpty(), reading); } } else { // just replace the term text with the reading, if it exists if (reading != null) { termAttr.setEmpty().append(reading); } } return true; } else { return false; } }
/**
 * Reports the reading and the pronunciation to {@code reflector}, each together with its
 * romanized "(en)" counterpart. A {@code null} value is reported as {@code null} under both
 * of its keys.
 *
 * @param reflector receiver of the four key/value pairs
 */
@Override
public void reflectWith(AttributeReflector reflector) {
  final String reading = getReading();
  String readingEN = null;
  if (reading != null) {
    readingEN = ToStringUtil.getRomanization(reading);
  }

  final String pronunciation = getPronunciation();
  String pronunciationEN = null;
  if (pronunciation != null) {
    pronunciationEN = ToStringUtil.getRomanization(pronunciation);
  }

  reflector.reflect(ReadingAttribute.class, "reading", reading);
  reflector.reflect(ReadingAttribute.class, "reading (en)", readingEN);
  reflector.reflect(ReadingAttribute.class, "pronunciation", pronunciation);
  reflector.reflect(ReadingAttribute.class, "pronunciation (en)", pronunciationEN);
}
/**
 * Reports the part-of-speech tag to {@code reflector}, followed by its translation from
 * {@code ToStringUtil.getPOSTranslation}. A {@code null} tag is reported as {@code null}
 * under both keys.
 *
 * @param reflector receiver of the two key/value pairs
 */
@Override
public void reflectWith(AttributeReflector reflector) {
  final String partOfSpeech = getPartOfSpeech();

  String partOfSpeechEN = null;
  if (partOfSpeech != null) {
    partOfSpeechEN = ToStringUtil.getPOSTranslation(partOfSpeech);
  }

  reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech", partOfSpeech);
  reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech (en)", partOfSpeechEN);
}
/**
 * Reports the inflection type and then the inflection form to {@code reflector}, each
 * followed by its translated "(en)" counterpart. A {@code null} value is reported as
 * {@code null} under both of its keys.
 *
 * @param reflector receiver of the four key/value pairs
 */
@Override
public void reflectWith(AttributeReflector reflector) {
  // inflection type first
  final String type = getInflectionType();
  String typeEN = null;
  if (type != null) {
    typeEN = ToStringUtil.getInflectionTypeTranslation(type);
  }
  reflector.reflect(InflectionAttribute.class, "inflectionType", type);
  reflector.reflect(InflectionAttribute.class, "inflectionType (en)", typeEN);

  // then the inflection form
  final String form = getInflectionForm();
  String formEN = null;
  if (form != null) {
    formEN = ToStringUtil.getInflectedFormTranslation(form);
  }
  reflector.reflect(InflectionAttribute.class, "inflectionForm", form);
  reflector.reflect(InflectionAttribute.class, "inflectionForm (en)", formEN);
}
/** enumerates the entire FST/lookup data and just does basic sanity checks */ public void testEnumerateAll() throws Exception { // just for debugging int numTerms = 0; int numWords = 0; int lastWordId = -1; int lastSourceId = -1; TokenInfoDictionary tid = TokenInfoDictionary.getInstance(); ConnectionCosts matrix = ConnectionCosts.getInstance(); FST<Long> fst = tid.getFST().getInternalFST(); IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst); InputOutput<Long> mapping; IntsRef scratch = new IntsRef(); while ((mapping = fstEnum.next()) != null) { numTerms++; IntsRef input = mapping.input; char chars[] = new char[input.length]; for (int i = 0; i < chars.length; i++) { chars[i] = (char)input.ints[input.offset+i]; } assertTrue(UnicodeUtil.validUTF16String(new String(chars))); Long output = mapping.output; int sourceId = output.intValue(); // we walk in order, terms, sourceIds, and wordIds should always be increasing assertTrue(sourceId > lastSourceId); lastSourceId = sourceId; tid.lookupWordIds(sourceId, scratch); for (int i = 0; i < scratch.length; i++) { numWords++; int wordId = scratch.ints[scratch.offset+i]; assertTrue(wordId > lastWordId); lastWordId = wordId; String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length); assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm)); String inflectionForm = tid.getInflectionForm(wordId); assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm)); if (inflectionForm != null) { // check that its actually an ipadic inflection form assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm)); } String inflectionType = tid.getInflectionType(wordId); assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType)); if (inflectionType != null) { // check that its actually an ipadic inflection type assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType)); } int leftId = tid.getLeftId(wordId); int rightId = tid.getRightId(wordId); 
matrix.get(rightId, leftId); tid.getWordCost(wordId); String pos = tid.getPartOfSpeech(wordId); assertNotNull(pos); assertTrue(UnicodeUtil.validUTF16String(pos)); // check that its actually an ipadic pos tag assertNotNull(ToStringUtil.getPOSTranslation(pos)); String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length); assertNotNull(pronunciation); assertTrue(UnicodeUtil.validUTF16String(pronunciation)); String reading = tid.getReading(wordId, chars, 0, chars.length); assertNotNull(reading); assertTrue(UnicodeUtil.validUTF16String(reading)); } } if (VERBOSE) { System.out.println("checked " + numTerms + " terms, " + numWords + " words."); } }
/**
 * Romanizes katakana with modified Hepburn.
 *
 * @param text the text to romanize; passed as-is to {@code ToStringUtil.getRomanization}
 * @return whatever {@code ToStringUtil.getRomanization} returns for {@code text}
 */
public static String katakanaToRomaji(String text) {
  final String romaji = ToStringUtil.getRomanization(text);
  return romaji;
}
/**
 * Translates a part-of-speech tag through {@code ToStringUtil.getPOSTranslation}.
 *
 * @param partOfSpeech the tag to translate
 * @return the translation, or {@code partOfSpeech} itself when the lookup returns {@code null}
 */
public static String translatePartOfSpeech(String partOfSpeech) {
  final String translated = ToStringUtil.getPOSTranslation(partOfSpeech);
  if (translated == null) {
    // no translation available: fall back to the original tag
    return partOfSpeech;
  }
  return translated;
}
/**
 * Translates an inflected-form label through {@code ToStringUtil.getInflectedFormTranslation}.
 *
 * @param inflectedForm the label to translate
 * @return the translation, or {@code inflectedForm} itself when the lookup returns {@code null}
 */
public static String translateInflectedForm(String inflectedForm) {
  final String translated = ToStringUtil.getInflectedFormTranslation(inflectedForm);
  if (translated == null) {
    // no translation available: fall back to the original label
    return inflectedForm;
  }
  return translated;
}
/**
 * Translates an inflection-type label through {@code ToStringUtil.getInflectionTypeTranslation}.
 *
 * @param inflectionType the label to translate
 * @return the translation, or {@code inflectionType} itself when the lookup returns {@code null}
 */
public static String translateInflectionType(String inflectionType) {
  final String translated = ToStringUtil.getInflectionTypeTranslation(inflectionType);
  if (translated == null) {
    // no translation available: fall back to the original label
    return inflectionType;
  }
  return translated;
}
/** enumerates the entire FST/lookup data and just does basic sanity checks */ public void testEnumerateAll() throws Exception { // just for debugging int numTerms = 0; int numWords = 0; int lastWordId = -1; int lastSourceId = -1; TokenInfoDictionary tid = TokenInfoDictionary.getInstance(); ConnectionCosts matrix = ConnectionCosts.getInstance(); FST<Long> fst = tid.getFST().getInternalFST(); IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst); InputOutput<Long> mapping; IntsRef scratch = new IntsRef(); while ((mapping = fstEnum.next()) != null) { numTerms++; IntsRef input = mapping.input; char chars[] = new char[input.length]; for (int i = 0; i < chars.length; i++) { chars[i] = (char)input.ints[input.offset+i]; } assertTrue(UnicodeUtil.validUTF16String(new String(chars))); Long output = mapping.output; int sourceId = output.intValue(); // we walk in order, terms, sourceIds, and wordIds should always be increasing assertTrue(sourceId > lastSourceId); lastSourceId = sourceId; tid.lookupWordIds(sourceId, scratch); for (int i = 0; i < scratch.length; i++) { numWords++; int wordId = scratch.ints[scratch.offset+i]; assertTrue(wordId > lastWordId); lastWordId = wordId; String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length); assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm)); String inflectionForm = tid.getInflectionForm(wordId); assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm)); if (inflectionForm != null) { // check that its actually an ipadic inflection form assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm)); } String inflectionType = tid.getInflectionType(wordId); assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType)); if (inflectionType != null) { // check that its actually an ipadic inflection type assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType)); } int leftId = tid.getLeftId(wordId); int rightId = tid.getRightId(wordId); 
matrix.get(rightId, leftId); tid.getWordCost(wordId); String pos = tid.getPartOfSpeech(wordId); assertNotNull(pos); assertTrue(UnicodeUtil.validUTF16String(pos)); // check that its actually an ipadic pos tag assertNotNull(ToStringUtil.getPOSTranslation(pos)); String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length); assertNotNull(pronunciation); assertTrue(UnicodeUtil.validUTF16String(pronunciation)); String reading = tid.getReading(wordId, chars, 0, chars.length); assertNotNull(reading); assertTrue(UnicodeUtil.validUTF16String(reading)); } } if (VERBOSE) { System.out.println("checked " + numTerms + " terms, " + numWords + " words."); } }
/**
 * Romanizes katakana with modified Hepburn by delegating to
 * {@code ToStringUtil.getRomanization(builder, s)}.
 *
 * @param builder the {@link Appendable} handed to the romanization call (presumably the
 *     destination the romanized text is appended to)
 * @param s the character sequence to romanize
 * @throws IOException if the romanization call throws it
 */
public static void katakanaToRomaji(Appendable builder, CharSequence s) throws IOException {
  ToStringUtil.getRomanization(builder, s);
}