/**
 * Serializes this character definition to a file below {@code baseDir}.
 *
 * <p>The target path is {@code baseDir}, followed by the fully qualified name of
 * {@link CharacterDefinition} with dots replaced by the platform file separator, followed by
 * {@code CharacterDefinition.FILENAME_SUFFIX}. Missing parent directories are created.
 *
 * <p>Written content, in order: the codec header, the whole {@code characterCategoryMap}, and
 * then one flag byte per character class (bit 0 from {@code invokeMap}, bit 1 from
 * {@code groupMap}).
 *
 * @param baseDir directory under which the output file is created
 * @throws IOException if the file cannot be opened, written or closed
 */
public void write(String baseDir) throws IOException {
  final String classPath = CharacterDefinition.class.getName().replace('.', File.separatorChar);
  final String filename = baseDir + File.separator + classPath + CharacterDefinition.FILENAME_SUFFIX;
  new File(filename).getParentFile().mkdirs();

  OutputStream os = new FileOutputStream(filename);
  try {
    // Rebind to the buffered wrapper so that close() in the finally block flushes it.
    os = new BufferedOutputStream(os);
    final DataOutput out = new OutputStreamDataOutput(os);
    CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
    out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);

    for (int classId = 0; classId < CharacterDefinition.CLASS_COUNT; classId++) {
      int flags = 0x00;
      if (invokeMap[classId]) {
        flags |= 0x01;
      }
      if (groupMap[classId]) {
        flags |= 0x02;
      }
      out.writeByte((byte) flags);
    }
  } finally {
    os.close();
  }
}
/** * Put mapping from unicode code point to character class. * * @param codePoint * code point * @param characterClassName character class name */ public void putCharacterCategory(int codePoint, String characterClassName) { characterClassName = characterClassName.split(" ")[0]; // use first // category // class // Override Nakaguro if (codePoint == 0x30FB) { characterClassName = "SYMBOL"; } characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName); }
public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding) throws IOException { UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024); FileInputStream inputStream = new FileInputStream(filename); Charset cs = Charset.forName(encoding); CharsetDecoder decoder = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); InputStreamReader streamReader = new InputStreamReader(inputStream, decoder); LineNumberReader lineReader = new LineNumberReader(streamReader); dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY)); List<String[]> lines = new ArrayList<>(); String line = null; while ((line = lineReader.readLine()) != null) { // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation, // even though the unknown dictionary returns hardcoded null here. final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry lines.add(parsed); } Collections.sort(lines, new Comparator<String[]>() { public int compare(String[] left, String[] right) { int leftId = CharacterDefinition.lookupCharacterClass(left[0]); int rightId = CharacterDefinition.lookupCharacterClass(right[0]); return leftId - rightId; } }); for (String[] entry : lines) { dictionary.put(entry); } return dictionary; }
@Override public int put(String[] entry) { // Get wordId of current entry int wordId = buffer.position(); // Put entry int result = super.put(entry); // Put entry in targetMap int characterId = CharacterDefinition.lookupCharacterClass(entry[0]); addMapping(characterId, wordId); return result; }
public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding) throws IOException { UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024); FileInputStream inputStream = new FileInputStream(filename); Charset cs = Charset.forName(encoding); CharsetDecoder decoder = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); InputStreamReader streamReader = new InputStreamReader(inputStream, decoder); LineNumberReader lineReader = new LineNumberReader(streamReader); dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY)); List<String[]> lines = new ArrayList<String[]>(); String line = null; while ((line = lineReader.readLine()) != null) { // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation, // even though the unknown dictionary returns hardcoded null here. final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry lines.add(parsed); } Collections.sort(lines, new Comparator<String[]>() { public int compare(String[] left, String[] right) { int leftId = CharacterDefinition.lookupCharacterClass(left[0]); int rightId = CharacterDefinition.lookupCharacterClass(right[0]); return leftId - rightId; } }); for (String[] entry : lines) { dictionary.put(entry); } return dictionary; }
/**
 * Creates a writer used while building the character definition.
 *
 * <p>Every slot of {@code characterCategoryMap} starts out as
 * {@code CharacterDefinition.DEFAULT}.
 *
 * <p>TODO: remove write access
 */
public CharacterDefinitionWriter() {
  // Start from the default class everywhere.
  Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT);
}
public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) { final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName); invokeMap[characterClass] = invoke == 1; groupMap[characterClass] = group == 1; // TODO: length def ignored }