/** enumerates the entire FST/lookup data and just does basic sanity checks */ public void testEnumerateAll() throws Exception { // just for debugging int numTerms = 0; int numWords = 0; int lastWordId = -1; int lastSourceId = -1; TokenInfoDictionary tid = TokenInfoDictionary.getInstance(); ConnectionCosts matrix = ConnectionCosts.getInstance(); FST<Long> fst = tid.getFST().getInternalFST(); IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst); InputOutput<Long> mapping; IntsRef scratch = new IntsRef(); while ((mapping = fstEnum.next()) != null) { numTerms++; IntsRef input = mapping.input; char chars[] = new char[input.length]; for (int i = 0; i < chars.length; i++) { chars[i] = (char)input.ints[input.offset+i]; } assertTrue(UnicodeUtil.validUTF16String(new String(chars))); Long output = mapping.output; int sourceId = output.intValue(); // we walk in order, terms, sourceIds, and wordIds should always be increasing assertTrue(sourceId > lastSourceId); lastSourceId = sourceId; tid.lookupWordIds(sourceId, scratch); for (int i = 0; i < scratch.length; i++) { numWords++; int wordId = scratch.ints[scratch.offset+i]; assertTrue(wordId > lastWordId); lastWordId = wordId; String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length); assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm)); String inflectionForm = tid.getInflectionForm(wordId); assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm)); if (inflectionForm != null) { // check that its actually an ipadic inflection form assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm)); } String inflectionType = tid.getInflectionType(wordId); assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType)); if (inflectionType != null) { // check that its actually an ipadic inflection type assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType)); } int leftId = tid.getLeftId(wordId); int rightId = tid.getRightId(wordId); matrix.get(rightId, leftId); tid.getWordCost(wordId); String pos = tid.getPartOfSpeech(wordId); assertNotNull(pos); assertTrue(UnicodeUtil.validUTF16String(pos)); // check that its actually an ipadic pos tag assertNotNull(ToStringUtil.getPOSTranslation(pos)); String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length); assertNotNull(pronunciation); assertTrue(UnicodeUtil.validUTF16String(pronunciation)); String reading = tid.getReading(wordId, chars, 0, chars.length); assertNotNull(reading); assertTrue(UnicodeUtil.validUTF16String(reading)); } } if (VERBOSE) { System.out.println("checked " + numTerms + " terms, " + numWords + " words."); } }
/** enumerates the entire FST/lookup data and just does basic sanity checks */ public void testEnumerateAll() throws Exception { // just for debugging int numTerms = 0; int numWords = 0; int lastWordId = -1; int lastSourceId = -1; TokenInfoDictionary tid = TokenInfoDictionary.getInstance(); ConnectionCosts matrix = ConnectionCosts.getInstance(); FST<Long> fst = tid.getFST().getInternalFST(); IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst); InputOutput<Long> mapping; IntsRef scratch = new IntsRef(); while ((mapping = fstEnum.next()) != null) { numTerms++; IntsRef input = mapping.input; char chars[] = new char[input.length]; for (int i = 0; i < chars.length; i++) { chars[i] = (char)input.ints[input.offset+i]; } assertTrue(UnicodeUtil.validUTF16String(new String(chars))); Long output = mapping.output; int sourceId = output.intValue(); // we walk in order, terms, sourceIds, and wordIds should always be increasing assertTrue(sourceId > lastSourceId); lastSourceId = sourceId; tid.lookupWordIds(sourceId, scratch); for (int i = 0; i < scratch.length; i++) { numWords++; int wordId = scratch.ints[scratch.offset+i]; assertTrue(wordId > lastWordId); lastWordId = wordId; String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length); assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm)); String inflectionForm = tid.getInflectionForm(wordId); assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm)); if (inflectionForm != null) { // check that its actually an ipadic inflection form assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm)); } String inflectionType = tid.getInflectionType(wordId); assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType)); if (inflectionType != null) { // check that its actually an ipadic inflection type assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType)); } int leftId = tid.getLeftId(wordId); int rightId = tid.getRightId(wordId); matrix.get(rightId, leftId); tid.getWordCost(wordId); String pos = tid.getPartOfSpeech(wordId); assertNotNull(pos); assertTrue(UnicodeUtil.validUTF16String(pos)); // check that its actually an ipadic pos tag assertNotNull(ToStringUtil.getPOSTranslation(pos)); String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length); assertNotNull(pronunciation); assertTrue(UnicodeUtil.validUTF16String(pronunciation)); String reading = tid.getReading(wordId, chars, 0, chars.length); assertNotNull(reading); assertTrue(UnicodeUtil.validUTF16String(reading)); } } if (VERBOSE) { System.out.println("checked " + numTerms + " terms, " + numWords + " words."); } }