From a245d15da5d295af21ead9a01583c64796a31ad7 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Thu, 12 Dec 2013 15:08:10 +0900 Subject: Have dicttool use the native library to generate v4 dicts. Yay ! Change-Id: Iea8ced9e81031b9ab7eff05ad9ef7215be248de9 --- .../latin/AbstractDictionaryWriter.java | 4 +- .../inputmethod/latin/BinaryDictionary.java | 3 +- .../latin/makedict/DynamicBinaryDictIOUtils.java | 14 +- .../inputmethod/latin/makedict/FormatSpec.java | 4 +- .../latin/makedict/Ver2DictDecoder.java | 264 +++++++++++++ .../latin/makedict/Ver2DictEncoder.java | 255 ++++++++++++ .../latin/makedict/Ver2DictUpdater.java | 82 ++++ .../latin/makedict/Ver3DictDecoder.java | 271 ------------- .../latin/makedict/Ver3DictEncoder.java | 255 ------------ .../latin/makedict/Ver3DictUpdater.java | 82 ---- .../latin/makedict/Ver4DictEncoder.java | 427 +++------------------ 11 files changed, 664 insertions(+), 997 deletions(-) create mode 100644 java/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java create mode 100644 java/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java create mode 100644 java/src/com/android/inputmethod/latin/makedict/Ver2DictUpdater.java delete mode 100644 java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java delete mode 100644 java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java delete mode 100644 java/src/com/android/inputmethod/latin/makedict/Ver3DictUpdater.java (limited to 'java/src') diff --git a/java/src/com/android/inputmethod/latin/AbstractDictionaryWriter.java b/java/src/com/android/inputmethod/latin/AbstractDictionaryWriter.java index e6fb9807e..1aee22baf 100644 --- a/java/src/com/android/inputmethod/latin/AbstractDictionaryWriter.java +++ b/java/src/com/android/inputmethod/latin/AbstractDictionaryWriter.java @@ -21,7 +21,7 @@ import android.util.Log; import com.android.inputmethod.latin.makedict.DictEncoder; import com.android.inputmethod.latin.makedict.UnsupportedFormatException; 
-import com.android.inputmethod.latin.makedict.Ver3DictEncoder; +import com.android.inputmethod.latin.makedict.Ver2DictEncoder; import java.io.File; import java.io.IOException; @@ -64,7 +64,7 @@ abstract public class AbstractDictionaryWriter { final String tempFilePath = file.getAbsolutePath() + ".temp"; final File tempFile = new File(tempFilePath); try { - final DictEncoder dictEncoder = new Ver3DictEncoder(tempFile); + final DictEncoder dictEncoder = new Ver2DictEncoder(tempFile); writeDictionary(dictEncoder, attributeMap); tempFile.renameTo(file); } catch (IOException e) { diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index db4234c63..95ac3e203 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -367,6 +367,7 @@ public final class BinaryDictionary extends Dictionary { public static class LanguageModelParam { public final int[] mWord0; public final int[] mWord1; + // TODO: this needs to be a list of shortcuts public final int[] mShortcutTarget; public final int mUnigramProbability; public final int mBigramProbability; @@ -375,7 +376,7 @@ public final class BinaryDictionary extends Dictionary { public final boolean mIsBlacklisted; public final int mTimestamp; - // Constructor for unigram. + // Constructor for unigram. 
TODO: support shortcuts public LanguageModelParam(final String word, final int unigramProbability, final int timestamp) { mWord0 = null; diff --git a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java index ff03190a3..97ad667a6 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java @@ -55,7 +55,7 @@ public final class DynamicBinaryDictIOUtils { * @param newParentAddress the absolute address of the parent. * @param formatOptions file format options. */ - private static void updateParentAddress(final Ver3DictUpdater dictUpdater, + private static void updateParentAddress(final Ver2DictUpdater dictUpdater, final int ptNodeOriginAddress, final int newParentAddress, final FormatOptions formatOptions) { final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); @@ -88,7 +88,7 @@ public final class DynamicBinaryDictIOUtils { * @param newParentAddress the address to be written. * @param formatOptions file format options. */ - private static void updateParentAddresses(final Ver3DictUpdater dictUpdater, + private static void updateParentAddresses(final Ver2DictUpdater dictUpdater, final int ptNodeOriginAddress, final int newParentAddress, final FormatOptions formatOptions) { final int originalPosition = dictUpdater.getPosition(); @@ -114,7 +114,7 @@ public final class DynamicBinaryDictIOUtils { * @param newChildrenAddress the absolute address of the child. * @param formatOptions file format options. 
*/ - private static void updateChildrenAddress(final Ver3DictUpdater dictUpdater, + private static void updateChildrenAddress(final Ver2DictUpdater dictUpdater, final int ptNodeOriginAddress, final int newChildrenAddress, final FormatOptions formatOptions) { final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); @@ -134,7 +134,7 @@ public final class DynamicBinaryDictIOUtils { * Helper method to move a PtNode to the tail of the file. */ private static int movePtNode(final OutputStream destination, - final Ver3DictUpdater dictUpdater, final PtNodeInfo info, + final Ver2DictUpdater dictUpdater, final PtNodeInfo info, final int nodeArrayOriginAddress, final int oldNodeAddress, final FormatOptions formatOptions) throws IOException { final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); @@ -151,7 +151,7 @@ public final class DynamicBinaryDictIOUtils { } @SuppressWarnings("unused") - private static void updateForwardLink(final Ver3DictUpdater dictUpdater, + private static void updateForwardLink(final Ver2DictUpdater dictUpdater, final int nodeArrayOriginAddress, final int newNodeArrayAddress, final FormatOptions formatOptions) { final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); @@ -199,7 +199,7 @@ public final class DynamicBinaryDictIOUtils { final int length, final int flags, final int frequency, final int parentAddress, final ArrayList shortcutTargets, final ArrayList bigrams, final OutputStream destination, - final Ver3DictUpdater dictUpdater, final int oldPtNodeArrayOrigin, + final Ver2DictUpdater dictUpdater, final int oldPtNodeArrayOrigin, final int oldPtNodeOrigin, final FormatOptions formatOptions) throws IOException { int size = 0; final int newPtNodeOrigin = fileEndAddress + 1; @@ -252,7 +252,7 @@ public final class DynamicBinaryDictIOUtils { // TODO: Support batch insertion. // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. 
@UsedForTesting - public static void insertWord(final Ver3DictUpdater dictUpdater, + public static void insertWord(final Ver2DictUpdater dictUpdater, final OutputStream destination, final String word, final int frequency, final ArrayList bigramStrings, final ArrayList shortcuts, final boolean isNotAWord, diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index 20ddba836..f23fe4656 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -401,7 +401,7 @@ public final class FormatSpec { if (dictFile.isDirectory()) { return new Ver4DictDecoder(dictFile, bufferType); } else if (dictFile.isFile()) { - return new Ver3DictDecoder(dictFile, bufferType); + return new Ver2DictDecoder(dictFile, bufferType); } return null; } @@ -411,7 +411,7 @@ public final class FormatSpec { if (dictFile.isDirectory()) { return new Ver4DictDecoder(dictFile, factory); } else if (dictFile.isFile()) { - return new Ver3DictDecoder(dictFile, factory); + return new Ver2DictDecoder(dictFile, factory); } return null; } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java new file mode 100644 index 000000000..e9667ab0b --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.utils.JniUtils; + +import android.util.Log; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * An implementation of DictDecoder for version 2 binary dictionary. 
+ */ +@UsedForTesting +public class Ver2DictDecoder extends AbstractDictDecoder { + private static final String TAG = Ver2DictDecoder.class.getSimpleName(); + + protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { + private static int readFrequency(final DictBuffer dictBuffer) { + return dictBuffer.readUnsignedByte(); + } + } + + protected final File mDictionaryBinaryFile; + private final DictionaryBufferFactory mBufferFactory; + protected DictBuffer mDictBuffer; + + /* package */ Ver2DictDecoder(final File file, final int factoryFlag) { + mDictionaryBinaryFile = file; + mDictBuffer = null; + + if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) { + mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); + } else if ((factoryFlag & MASK_DICTBUFFER) == USE_BYTEARRAY) { + mBufferFactory = new DictionaryBufferFromByteArrayFactory(); + } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) { + mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory(); + } else { + mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); + } + } + + /* package */ Ver2DictDecoder(final File file, final DictionaryBufferFactory factory) { + mDictionaryBinaryFile = file; + mBufferFactory = factory; + } + + @Override + public void openDictBuffer() throws FileNotFoundException, IOException { + mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile); + } + + @Override + public boolean isDictBufferOpen() { + return mDictBuffer != null; + } + + /* package */ DictBuffer getDictBuffer() { + return mDictBuffer; + } + + @UsedForTesting + /* package */ DictBuffer openAndGetDictBuffer() throws FileNotFoundException, IOException { + openDictBuffer(); + return getDictBuffer(); + } + + @Override + public FileHeader readHeader() throws IOException, UnsupportedFormatException { + if (mDictBuffer == null) { + openDictBuffer(); + } + final FileHeader header = super.readHeader(mDictBuffer); + final int 
version = header.mFormatOptions.mVersion; + if (!(version >= 2 && version <= 3)) { + throw new UnsupportedFormatException("File header has a wrong version : " + version); + } + return header; + } + + // TODO: Make this buffer multi thread safe. + private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; + @Override + public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions options) { + int addressPointer = ptNodePos; + final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); + addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; + + final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options); + if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { + addressPointer += FormatSpec.PARENT_ADDRESS_SIZE; + } + + final int characters[]; + if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { + int index = 0; + int character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + while (FormatSpec.INVALID_CHARACTER != character) { + // FusionDictionary is making sure that the length of the word is smaller than + // MAX_WORD_LENGTH. + // So we'll never write past the end of mCharacterBuffer. 
+ mCharacterBuffer[index++] = character; + character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + } + characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); + } else { + final int character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + characters = new int[] { character }; + } + final int frequency; + if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { + frequency = PtNodeReader.readFrequency(mDictBuffer); + addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE; + } else { + frequency = PtNode.NOT_A_TERMINAL; + } + int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); + if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + childrenAddress += addressPointer; + } + addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); + final ArrayList shortcutTargets; + if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { + // readShortcut will add shortcuts to shortcutTargets. 
+ shortcutTargets = new ArrayList(); + addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets); + } else { + shortcutTargets = null; + } + + final ArrayList bigrams; + if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { + bigrams = new ArrayList(); + addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams, + addressPointer); + if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() + + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); + } + } else { + bigrams = null; + } + return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency, + parentAddress, childrenAddress, shortcutTargets, bigrams); + } + + @Override + public FusionDictionary readDictionaryBinary(final FusionDictionary dict, + final boolean deleteDictIfBroken) + throws FileNotFoundException, IOException, UnsupportedFormatException { + if (mDictBuffer == null) { + openDictBuffer(); + } + try { + return BinaryDictDecoderUtils.readDictionaryBinary(this, dict); + } catch (IOException e) { + Log.e(TAG, "The dictionary " + mDictionaryBinaryFile.getName() + " is broken.", e); + if (deleteDictIfBroken && !mDictionaryBinaryFile.delete()) { + Log.e(TAG, "Failed to delete the broken dictionary."); + } + throw e; + } catch (UnsupportedFormatException e) { + Log.e(TAG, "The dictionary " + mDictionaryBinaryFile.getName() + " is broken.", e); + if (deleteDictIfBroken && !mDictionaryBinaryFile.delete()) { + Log.e(TAG, "Failed to delete the broken dictionary."); + } + throw e; + } + } + + @Override + public void setPosition(int newPos) { + mDictBuffer.position(newPos); + } + + @Override + public int getPosition() { + return mDictBuffer.position(); + } + + @Override + public int readPtNodeCount() { + return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer); + } + + @Override + public boolean readAndFollowForwardLink() { + final int nextAddress = mDictBuffer.readUnsignedInt24(); + 
if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) { + mDictBuffer.position(nextAddress); + return true; + } + return false; + } + + @Override + public boolean hasNextPtNodeArray() { + return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS; + } + + @Override + public void skipPtNode(final FormatOptions formatOptions) { + final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); + PtNodeReader.readParentAddress(mDictBuffer, formatOptions); + BinaryDictIOUtils.skipString(mDictBuffer, + (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readFrequency(mDictBuffer); + if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { + final int shortcutsSize = mDictBuffer.readUnsignedShort(); + mDictBuffer.position(mDictBuffer.position() + shortcutsSize + - FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); + } + if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + final int bigramFlags = mDictBuffer.readUnsignedByte(); + switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: + mDictBuffer.readUnsignedByte(); + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: + mDictBuffer.readUnsignedShort(); + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: + mDictBuffer.readUnsignedInt24(); + break; + } + if ((bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) == 0) break; + } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + throw new RuntimeException("Too many bigrams in a PtNode."); + } + } + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java new file mode 100644 index 000000000..665544228 --- /dev/null +++ 
b/java/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java @@ -0,0 +1,255 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Iterator; + +/** + * An implementation of DictEncoder for version 2 binary dictionary. + */ +public class Ver2DictEncoder implements DictEncoder { + + private final File mDictFile; + private OutputStream mOutStream; + private byte[] mBuffer; + private int mPosition; + + public Ver2DictEncoder(final File dictFile) { + mDictFile = dictFile; + mOutStream = null; + mBuffer = null; + } + + // This constructor is used only by BinaryDictOffdeviceUtilsTests. + // If you want to use this in the production code, you should consider keeping consistency of + // the interface of Ver2DictDecoder by using factory. 
+ public Ver2DictEncoder(final OutputStream outStream) { + mDictFile = null; + mOutStream = outStream; + } + + private void openStream() throws FileNotFoundException { + mOutStream = new FileOutputStream(mDictFile); + } + + private void close() throws IOException { + if (mOutStream != null) { + mOutStream.close(); + mOutStream = null; + } + } + + @Override + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + if (formatOptions.mVersion > FormatSpec.VERSION3) { + throw new UnsupportedFormatException( + "The given format options has wrong version number : " + + formatOptions.mVersion); + } + + if (mOutStream == null) { + openStream(); + } + BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions); + + // Addresses are limited to 3 bytes, but since addresses can be relative to each node + // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding + // the order of the PtNode arrays becomes a quite complicated problem, because though the + // dictionary itself does not have a size limit, each node array must still be within 16MB + // of all its children and parents. As long as this is ensured, the dictionary file may + // grow to any size. + + // Leave the choice of the optimal node order to the flattenTree function. + MakedictLog.i("Flattening the tree..."); + ArrayList flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); + + MakedictLog.i("Computing addresses..."); + BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions); + MakedictLog.i("Checking PtNode array..."); + if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); + + // Create a buffer that matches the final dictionary size. 
+ final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); + final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; + mBuffer = new byte[bufferSize]; + + MakedictLog.i("Writing file..."); + + for (PtNodeArray nodeArray : flatNodes) { + BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions); + } + if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); + mOutStream.write(mBuffer, 0, mPosition); + + MakedictLog.i("Done"); + close(); + } + + @Override + public void setPosition(final int position) { + if (mBuffer == null || position < 0 || position >= mBuffer.length) return; + mPosition = position; + } + + @Override + public int getPosition() { + return mPosition; + } + + @Override + public void writePtNodeCount(final int ptNodeCount) { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); + } + final int encodedPtNodeCount = (countSize == 2) ? 
+ (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount, + countSize); + } + + private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos, formatOptions), + FormatSpec.PTNODE_FLAGS_SIZE); + } + + private void writeParentPosition(final int parentPosition, final PtNode ptNode, + final FormatOptions formatOptions) { + if (parentPosition == FormatSpec.NO_PARENT_ADDRESS) { + mPosition = BinaryDictEncoderUtils.writeParentAddress(mBuffer, mPosition, + parentPosition, formatOptions); + } else { + mPosition = BinaryDictEncoderUtils.writeParentAddress(mBuffer, mPosition, + parentPosition - ptNode.mCachedAddressAfterUpdate, formatOptions); + } + } + + private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars) { + mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition); + if (hasSeveralChars) { + mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + } + } + + private void writeFrequency(final int frequency) { + if (frequency >= 0) { + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, + FormatSpec.PTNODE_FREQUENCY_SIZE); + } + } + + private void writeChildrenPosition(final PtNode ptNode, final FormatOptions formatOptions) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); + if (formatOptions.supportsDynamicUpdate()) { + mPosition += BinaryDictEncoderUtils.writeSignedChildrenPosition(mBuffer, mPosition, + childrenPos); + } else { + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + childrenPos); + } + } + + /** + * Write a shortcut attributes list 
to mBuffer. + * + * @param shortcuts the shortcut attributes list. + */ + private void writeShortcuts(final ArrayList shortcuts) { + if (null == shortcuts || shortcuts.isEmpty()) return; + + final int indexOfShortcutByteSize = mPosition; + mPosition += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; + final Iterator shortcutIterator = shortcuts.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = shortcutIterator.next(); + final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( + shortcutIterator.hasNext(), + target.mFrequency); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, shortcutFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + final int shortcutShift = CharEncoding.writeString(mBuffer, mPosition, target.mWord); + mPosition += shortcutShift; + } + final int shortcutByteSize = mPosition - indexOfShortcutByteSize; + if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { + throw new RuntimeException("Shortcut list too large"); + } + BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, indexOfShortcutByteSize, shortcutByteSize, + FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); + } + + /** + * Write a bigram attributes list to mBuffer. + * + * @param bigrams the bigram attributes list. + * @param dict the dictionary the node array is a part of (for relative offsets). 
+ */ + private void writeBigrams(final ArrayList bigrams, + final FusionDictionary dict) { + if (bigrams == null) return; + + final Iterator bigramIterator = bigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int addressOfBigram = target.mCachedAddressAfterUpdate; + final int unigramFrequencyForThisWord = target.mFrequency; + final int offset = addressOfBigram + - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), + offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + Math.abs(offset)); + } + } + + @Override + public void writeForwardLinkAddress(final int forwardLinkAddress) { + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, forwardLinkAddress, + FormatSpec.FORWARD_LINK_ADDRESS_SIZE); + } + + @Override + public void writePtNode(final PtNode ptNode, final int parentPosition, + final FormatOptions formatOptions, final FusionDictionary dict) { + writePtNodeFlags(ptNode, formatOptions); + writeParentPosition(parentPosition, ptNode, formatOptions); + writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); + writeFrequency(ptNode.mFrequency); + writeChildrenPosition(ptNode, formatOptions); + writeShortcuts(ptNode.mShortcutTargets); + writeBigrams(ptNode.mBigrams, dict); + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver2DictUpdater.java b/java/src/com/android/inputmethod/latin/makedict/Ver2DictUpdater.java new file mode 100644 index 000000000..6419340ff --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/Ver2DictUpdater.java @@ 
-0,0 +1,82 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; + +/** + * An implementation of DictUpdater for version 2 binary dictionary. + */ +@UsedForTesting +public class Ver2DictUpdater extends Ver2DictDecoder implements DictUpdater { + private OutputStream mOutStream; + + @UsedForTesting + public Ver2DictUpdater(final File dictFile, final int factoryType) { + // DictUpdater must have an updatable DictBuffer. + super(dictFile, ((factoryType & MASK_DICTBUFFER) == USE_BYTEARRAY) + ? 
USE_BYTEARRAY : USE_WRITABLE_BYTEBUFFER); + mOutStream = null; + } + + private void openStreamAndBuffer() throws FileNotFoundException, IOException { + super.openDictBuffer(); + mOutStream = new FileOutputStream(mDictionaryBinaryFile, true /* append */); + } + + private void close() throws IOException { + if (mOutStream != null) { + mOutStream.close(); + mOutStream = null; + } + } + + @Override @UsedForTesting + public void deleteWord(final String word) throws IOException, UnsupportedFormatException { + if (mOutStream == null) openStreamAndBuffer(); + mDictBuffer.position(0); + readHeader(); + final int wordPos = getTerminalPosition(word); + if (wordPos != FormatSpec.NOT_VALID_WORD) { + mDictBuffer.position(wordPos); + final int flags = mDictBuffer.readUnsignedByte(); + mDictBuffer.position(wordPos); + mDictBuffer.put((byte) DynamicBinaryDictIOUtils.markAsDeleted(flags)); + } + close(); + } + + @Override @UsedForTesting + public void insertWord(final String word, final int frequency, + final ArrayList bigramStrings, + final ArrayList shortcuts, + final boolean isNotAWord, final boolean isBlackListEntry) + throws IOException, UnsupportedFormatException { + if (mOutStream == null) openStreamAndBuffer(); + DynamicBinaryDictIOUtils.insertWord(this, mOutStream, word, frequency, bigramStrings, + shortcuts, isNotAWord, isBlackListEntry); + close(); + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java deleted file mode 100644 index acab4f8a5..000000000 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.makedict; - -import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; -import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; -import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; -import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; -import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; -import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import com.android.inputmethod.latin.utils.JniUtils; - -import android.util.Log; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; - -/** - * An implementation of DictDecoder for version 3 binary dictionary. 
- */ -@UsedForTesting -public class Ver3DictDecoder extends AbstractDictDecoder { - private static final String TAG = Ver3DictDecoder.class.getSimpleName(); - - static { - JniUtils.loadNativeLibrary(); - } - - // TODO: implement something sensical instead of just a phony method - private static native int doNothing(); - - protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { - private static int readFrequency(final DictBuffer dictBuffer) { - return dictBuffer.readUnsignedByte(); - } - } - - protected final File mDictionaryBinaryFile; - private final DictionaryBufferFactory mBufferFactory; - protected DictBuffer mDictBuffer; - - /* package */ Ver3DictDecoder(final File file, final int factoryFlag) { - mDictionaryBinaryFile = file; - mDictBuffer = null; - - if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) { - mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); - } else if ((factoryFlag & MASK_DICTBUFFER) == USE_BYTEARRAY) { - mBufferFactory = new DictionaryBufferFromByteArrayFactory(); - } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) { - mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory(); - } else { - mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); - } - } - - /* package */ Ver3DictDecoder(final File file, final DictionaryBufferFactory factory) { - mDictionaryBinaryFile = file; - mBufferFactory = factory; - } - - @Override - public void openDictBuffer() throws FileNotFoundException, IOException { - mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile); - } - - @Override - public boolean isDictBufferOpen() { - return mDictBuffer != null; - } - - /* package */ DictBuffer getDictBuffer() { - return mDictBuffer; - } - - @UsedForTesting - /* package */ DictBuffer openAndGetDictBuffer() throws FileNotFoundException, IOException { - openDictBuffer(); - return getDictBuffer(); - } - - @Override - public FileHeader readHeader() throws 
IOException, UnsupportedFormatException { - if (mDictBuffer == null) { - openDictBuffer(); - } - final FileHeader header = super.readHeader(mDictBuffer); - final int version = header.mFormatOptions.mVersion; - if (!(version >= 2 && version <= 3)) { - throw new UnsupportedFormatException("File header has a wrong version : " + version); - } - return header; - } - - // TODO: Make this buffer multi thread safe. - private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; - @Override - public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions options) { - int addressPointer = ptNodePos; - final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; - - final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options); - if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { - addressPointer += FormatSpec.PARENT_ADDRESS_SIZE; - } - - final int characters[]; - if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { - int index = 0; - int character = CharEncoding.readChar(mDictBuffer); - addressPointer += CharEncoding.getCharSize(character); - while (FormatSpec.INVALID_CHARACTER != character) { - // FusionDictionary is making sure that the length of the word is smaller than - // MAX_WORD_LENGTH. - // So we'll never write past the end of mCharacterBuffer. 
- mCharacterBuffer[index++] = character; - character = CharEncoding.readChar(mDictBuffer); - addressPointer += CharEncoding.getCharSize(character); - } - characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); - } else { - final int character = CharEncoding.readChar(mDictBuffer); - addressPointer += CharEncoding.getCharSize(character); - characters = new int[] { character }; - } - final int frequency; - if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { - frequency = PtNodeReader.readFrequency(mDictBuffer); - addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE; - } else { - frequency = PtNode.NOT_A_TERMINAL; - } - int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); - if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { - childrenAddress += addressPointer; - } - addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); - final ArrayList shortcutTargets; - if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { - // readShortcut will add shortcuts to shortcutTargets. 
- shortcutTargets = new ArrayList(); - addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets); - } else { - shortcutTargets = null; - } - - final ArrayList bigrams; - if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { - bigrams = new ArrayList(); - addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams, - addressPointer); - if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() - + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); - } - } else { - bigrams = null; - } - return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency, - parentAddress, childrenAddress, shortcutTargets, bigrams); - } - - @Override - public FusionDictionary readDictionaryBinary(final FusionDictionary dict, - final boolean deleteDictIfBroken) - throws FileNotFoundException, IOException, UnsupportedFormatException { - if (mDictBuffer == null) { - openDictBuffer(); - } - try { - return BinaryDictDecoderUtils.readDictionaryBinary(this, dict); - } catch (IOException e) { - Log.e(TAG, "The dictionary " + mDictionaryBinaryFile.getName() + " is broken.", e); - if (deleteDictIfBroken && !mDictionaryBinaryFile.delete()) { - Log.e(TAG, "Failed to delete the broken dictionary."); - } - throw e; - } catch (UnsupportedFormatException e) { - Log.e(TAG, "The dictionary " + mDictionaryBinaryFile.getName() + " is broken.", e); - if (deleteDictIfBroken && !mDictionaryBinaryFile.delete()) { - Log.e(TAG, "Failed to delete the broken dictionary."); - } - throw e; - } - } - - @Override - public void setPosition(int newPos) { - mDictBuffer.position(newPos); - } - - @Override - public int getPosition() { - return mDictBuffer.position(); - } - - @Override - public int readPtNodeCount() { - return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer); - } - - @Override - public boolean readAndFollowForwardLink() { - final int nextAddress = mDictBuffer.readUnsignedInt24(); - 
if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) { - mDictBuffer.position(nextAddress); - return true; - } - return false; - } - - @Override - public boolean hasNextPtNodeArray() { - return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS; - } - - @Override - public void skipPtNode(final FormatOptions formatOptions) { - final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - PtNodeReader.readParentAddress(mDictBuffer, formatOptions); - BinaryDictIOUtils.skipString(mDictBuffer, - (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); - PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions); - if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readFrequency(mDictBuffer); - if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { - final int shortcutsSize = mDictBuffer.readUnsignedShort(); - mDictBuffer.position(mDictBuffer.position() + shortcutsSize - - FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); - } - if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { - int bigramCount = 0; - while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - final int bigramFlags = mDictBuffer.readUnsignedByte(); - switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: - mDictBuffer.readUnsignedByte(); - break; - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: - mDictBuffer.readUnsignedShort(); - break; - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: - mDictBuffer.readUnsignedInt24(); - break; - } - if ((bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) == 0) break; - } - if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - throw new RuntimeException("Too many bigrams in a PtNode."); - } - } - } -} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java deleted file mode 100644 index 92eb861d6..000000000 --- 
a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.makedict; - -import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; -import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; -import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; -import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; -import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Iterator; - -/** - * An implementation of DictEncoder for version 3 binary dictionary. - */ -public class Ver3DictEncoder implements DictEncoder { - - private final File mDictFile; - private OutputStream mOutStream; - private byte[] mBuffer; - private int mPosition; - - public Ver3DictEncoder(final File dictFile) { - mDictFile = dictFile; - mOutStream = null; - mBuffer = null; - } - - // This constructor is used only by BinaryDictOffdeviceUtilsTests. - // If you want to use this in the production code, you should consider keeping consistency of - // the interface of Ver3DictDecoder by using factory. 
- public Ver3DictEncoder(final OutputStream outStream) { - mDictFile = null; - mOutStream = outStream; - } - - private void openStream() throws FileNotFoundException { - mOutStream = new FileOutputStream(mDictFile); - } - - private void close() throws IOException { - if (mOutStream != null) { - mOutStream.close(); - mOutStream = null; - } - } - - @Override - public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) - throws IOException, UnsupportedFormatException { - if (formatOptions.mVersion > FormatSpec.VERSION3) { - throw new UnsupportedFormatException( - "The given format options has wrong version number : " - + formatOptions.mVersion); - } - - if (mOutStream == null) { - openStream(); - } - BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions); - - // Addresses are limited to 3 bytes, but since addresses can be relative to each node - // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding - // the order of the PtNode arrays becomes a quite complicated problem, because though the - // dictionary itself does not have a size limit, each node array must still be within 16MB - // of all its children and parents. As long as this is ensured, the dictionary file may - // grow to any size. - - // Leave the choice of the optimal node order to the flattenTree function. - MakedictLog.i("Flattening the tree..."); - ArrayList flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); - - MakedictLog.i("Computing addresses..."); - BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions); - MakedictLog.i("Checking PtNode array..."); - if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); - - // Create a buffer that matches the final dictionary size. 
- final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); - final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; - mBuffer = new byte[bufferSize]; - - MakedictLog.i("Writing file..."); - - for (PtNodeArray nodeArray : flatNodes) { - BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions); - } - if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); - mOutStream.write(mBuffer, 0, mPosition); - - MakedictLog.i("Done"); - close(); - } - - @Override - public void setPosition(final int position) { - if (mBuffer == null || position < 0 || position >= mBuffer.length) return; - mPosition = position; - } - - @Override - public int getPosition() { - return mPosition; - } - - @Override - public void writePtNodeCount(final int ptNodeCount) { - final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); - if (countSize != 1 && countSize != 2) { - throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); - } - final int encodedPtNodeCount = (countSize == 2) ? 
- (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; - mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount, - countSize); - } - - private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) { - final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); - mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, - BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos, formatOptions), - FormatSpec.PTNODE_FLAGS_SIZE); - } - - private void writeParentPosition(final int parentPosition, final PtNode ptNode, - final FormatOptions formatOptions) { - if (parentPosition == FormatSpec.NO_PARENT_ADDRESS) { - mPosition = BinaryDictEncoderUtils.writeParentAddress(mBuffer, mPosition, - parentPosition, formatOptions); - } else { - mPosition = BinaryDictEncoderUtils.writeParentAddress(mBuffer, mPosition, - parentPosition - ptNode.mCachedAddressAfterUpdate, formatOptions); - } - } - - private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars) { - mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition); - if (hasSeveralChars) { - mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; - } - } - - private void writeFrequency(final int frequency) { - if (frequency >= 0) { - mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, - FormatSpec.PTNODE_FREQUENCY_SIZE); - } - } - - private void writeChildrenPosition(final PtNode ptNode, final FormatOptions formatOptions) { - final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); - if (formatOptions.supportsDynamicUpdate()) { - mPosition += BinaryDictEncoderUtils.writeSignedChildrenPosition(mBuffer, mPosition, - childrenPos); - } else { - mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, - childrenPos); - } - } - - /** - * Write a shortcut attributes list 
to mBuffer. - * - * @param shortcuts the shortcut attributes list. - */ - private void writeShortcuts(final ArrayList shortcuts) { - if (null == shortcuts || shortcuts.isEmpty()) return; - - final int indexOfShortcutByteSize = mPosition; - mPosition += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; - final Iterator shortcutIterator = shortcuts.iterator(); - while (shortcutIterator.hasNext()) { - final WeightedString target = shortcutIterator.next(); - final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( - shortcutIterator.hasNext(), - target.mFrequency); - mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, shortcutFlags, - FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - final int shortcutShift = CharEncoding.writeString(mBuffer, mPosition, target.mWord); - mPosition += shortcutShift; - } - final int shortcutByteSize = mPosition - indexOfShortcutByteSize; - if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { - throw new RuntimeException("Shortcut list too large"); - } - BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, indexOfShortcutByteSize, shortcutByteSize, - FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); - } - - /** - * Write a bigram attributes list to mBuffer. - * - * @param bigrams the bigram attributes list. - * @param dict the dictionary the node array is a part of (for relative offsets). 
- */ - private void writeBigrams(final ArrayList bigrams, - final FusionDictionary dict) { - if (bigrams == null) return; - - final Iterator bigramIterator = bigrams.iterator(); - while (bigramIterator.hasNext()) { - final WeightedString bigram = bigramIterator.next(); - final PtNode target = - FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); - final int addressOfBigram = target.mCachedAddressAfterUpdate; - final int unigramFrequencyForThisWord = target.mFrequency; - final int offset = addressOfBigram - - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), - offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); - mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, - FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, - Math.abs(offset)); - } - } - - @Override - public void writeForwardLinkAddress(final int forwardLinkAddress) { - mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, forwardLinkAddress, - FormatSpec.FORWARD_LINK_ADDRESS_SIZE); - } - - @Override - public void writePtNode(final PtNode ptNode, final int parentPosition, - final FormatOptions formatOptions, final FusionDictionary dict) { - writePtNodeFlags(ptNode, formatOptions); - writeParentPosition(parentPosition, ptNode, formatOptions); - writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); - writeFrequency(ptNode.mFrequency); - writeChildrenPosition(ptNode, formatOptions); - writeShortcuts(ptNode.mShortcutTargets); - writeBigrams(ptNode.mBigrams, dict); - } -} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictUpdater.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictUpdater.java deleted file mode 100644 index 07adda625..000000000 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictUpdater.java +++ /dev/null 
@@ -1,82 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.makedict; - -import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.util.ArrayList; - -/** - * An implementation of DictUpdater for version 3 binary dictionary. - */ -@UsedForTesting -public class Ver3DictUpdater extends Ver3DictDecoder implements DictUpdater { - private OutputStream mOutStream; - - @UsedForTesting - public Ver3DictUpdater(final File dictFile, final int factoryType) { - // DictUpdater must have an updatable DictBuffer. - super(dictFile, ((factoryType & MASK_DICTBUFFER) == USE_BYTEARRAY) - ? 
USE_BYTEARRAY : USE_WRITABLE_BYTEBUFFER); - mOutStream = null; - } - - private void openStreamAndBuffer() throws FileNotFoundException, IOException { - super.openDictBuffer(); - mOutStream = new FileOutputStream(mDictionaryBinaryFile, true /* append */); - } - - private void close() throws IOException { - if (mOutStream != null) { - mOutStream.close(); - mOutStream = null; - } - } - - @Override @UsedForTesting - public void deleteWord(final String word) throws IOException, UnsupportedFormatException { - if (mOutStream == null) openStreamAndBuffer(); - mDictBuffer.position(0); - readHeader(); - final int wordPos = getTerminalPosition(word); - if (wordPos != FormatSpec.NOT_VALID_WORD) { - mDictBuffer.position(wordPos); - final int flags = mDictBuffer.readUnsignedByte(); - mDictBuffer.position(wordPos); - mDictBuffer.put((byte) DynamicBinaryDictIOUtils.markAsDeleted(flags)); - } - close(); - } - - @Override @UsedForTesting - public void insertWord(final String word, final int frequency, - final ArrayList bigramStrings, - final ArrayList shortcuts, - final boolean isNotAWord, final boolean isBlackListEntry) - throws IOException, UnsupportedFormatException { - if (mOutStream == null) openStreamAndBuffer(); - DynamicBinaryDictIOUtils.insertWord(this, mOutStream, word, frequency, bigramStrings, - shortcuts, isNotAWord, isBlackListEntry); - close(); - } -} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java index 8b80ebe63..a746f9945 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -1,4 +1,3 @@ -/* /* * Copyright (C) 2013 The Android Open Source Project * @@ -18,25 +17,15 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; 
-import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; +import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.Dictionary; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; -import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; -import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import com.android.inputmethod.latin.utils.CollectionUtils; -import com.android.inputmethod.latin.utils.FileUtils; +import com.android.inputmethod.latin.utils.LocaleUtils; import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; /** * An implementation of DictEncoder for version 4 binary dictionary. 
@@ -44,197 +33,19 @@ import java.util.Iterator; @UsedForTesting public class Ver4DictEncoder implements DictEncoder { private final File mDictPlacedDir; - private byte[] mTrieBuf; - private int mTriePos; - private OutputStream mTrieOutStream; - private OutputStream mHeaderOutStream; - private OutputStream mFreqOutStream; - private OutputStream mUnigramTimestampOutStream; - private OutputStream mTerminalAddressTableOutStream; - private File mDictDir; - private String mBaseFilename; - private BigramContentWriter mBigramWriter; - private ShortcutContentWriter mShortcutWriter; @UsedForTesting public Ver4DictEncoder(final File dictPlacedDir) { mDictPlacedDir = dictPlacedDir; } - private static class BigramContentWriter extends SparseTableContentWriter { - private final boolean mWriteTimestamp; - - public BigramContentWriter(final String name, final int initialCapacity, - final File baseDir, final boolean writeTimestamp) { - super(name + FormatSpec.BIGRAM_FILE_EXTENSION, initialCapacity, - FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir, - getContentFilenames(name, writeTimestamp), getContentIds(writeTimestamp)); - mWriteTimestamp = writeTimestamp; - } - - private static String[] getContentFilenames(final String name, - final boolean writeTimestamp) { - final String[] contentFilenames; - if (writeTimestamp) { - contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION, - name + FormatSpec.BIGRAM_FILE_EXTENSION }; - } else { - contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION }; - } - return contentFilenames; - } - - private static String[] getContentIds(final boolean writeTimestamp) { - final String[] contentIds; - if (writeTimestamp) { - contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID, - FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID }; - } else { - contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID }; - } - return contentIds; - } - - public void writeBigramsForOneWord(final int terminalId, final int 
bigramCount, - final Iterator bigramIterator, final FusionDictionary dict) - throws IOException { - write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, - new SparseTableContentWriterInterface() { - @Override - public void write(final OutputStream outStream) throws IOException { - writeBigramsForOneWordInternal(outStream, bigramIterator, dict); - }}); - if (mWriteTimestamp) { - write(FormatSpec.BIGRAM_TIMESTAMP_CONTENT_INDEX, terminalId, - new SparseTableContentWriterInterface() { - @Override - public void write(final OutputStream outStream) throws IOException { - initBigramTimestampsCountersAndLevelsForOneWordInternal(outStream, - bigramCount); - }}); - } - } - - private void writeBigramsForOneWordInternal(final OutputStream outStream, - final Iterator bigramIterator, final FusionDictionary dict) - throws IOException { - while (bigramIterator.hasNext()) { - final WeightedString bigram = bigramIterator.next(); - final PtNode target = - FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); - final int unigramFrequencyForThisWord = target.mFrequency; - final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags( - bigramIterator.hasNext(), 0, bigram.mFrequency, - unigramFrequencyForThisWord, bigram.mWord); - BinaryDictEncoderUtils.writeUIntToStream(outStream, bigramFlags, - FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - BinaryDictEncoderUtils.writeUIntToStream(outStream, target.mTerminalId, - FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE); - } - } - - private void initBigramTimestampsCountersAndLevelsForOneWordInternal( - final OutputStream outStream, final int bigramCount) throws IOException { - for (int i = 0; i < bigramCount; ++i) { - // TODO: Figure out what initial values should be. 
- BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */, - FormatSpec.BIGRAM_TIMESTAMP_SIZE); - BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */, - FormatSpec.BIGRAM_COUNTER_SIZE); - BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */, - FormatSpec.BIGRAM_LEVEL_SIZE); - } - } - } - - private static class ShortcutContentWriter extends SparseTableContentWriter { - public ShortcutContentWriter(final String name, final int initialCapacity, - final File baseDir) { - super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, initialCapacity, - FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, - new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, - new String[] { FormatSpec.SHORTCUT_CONTENT_ID }); - } - - public void writeShortcutForOneWord(final int terminalId, - final Iterator shortcutIterator) throws IOException { - write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, - new SparseTableContentWriterInterface() { - @Override - public void write(final OutputStream outStream) throws IOException { - writeShortcutForOneWordInternal(outStream, shortcutIterator); - } - }); - } - - private void writeShortcutForOneWordInternal(final OutputStream outStream, - final Iterator shortcutIterator) throws IOException { - while (shortcutIterator.hasNext()) { - final WeightedString target = shortcutIterator.next(); - final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( - shortcutIterator.hasNext(), target.mFrequency); - BinaryDictEncoderUtils.writeUIntToStream(outStream, shortcutFlags, - FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - CharEncoding.writeString(outStream, target.mWord); - } - } - } - - private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) - throws FileNotFoundException, IOException { - final FileHeader header = new FileHeader(0, dictOptions, formatOptions); - mBaseFilename = header.getId() + "." 
+ header.getVersion(); - mDictDir = new File(mDictPlacedDir, mBaseFilename); - final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION); - final File headerFile = new File(mDictDir, - mBaseFilename + FormatSpec.HEADER_FILE_EXTENSION); - final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION); - final File timestampFile = new File(mDictDir, - mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION); - final File terminalAddressTableFile = new File(mDictDir, - mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); - if (!mDictDir.isDirectory()) { - if (mDictDir.exists()) { - FileUtils.deleteRecursively(mDictDir); - } - mDictDir.mkdirs(); - } - mTrieOutStream = new FileOutputStream(trieFile); - mHeaderOutStream = new FileOutputStream(headerFile); - mFreqOutStream = new FileOutputStream(freqFile); - mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile); - if (formatOptions.mHasTimestamp) { - mUnigramTimestampOutStream = new FileOutputStream(timestampFile); - } - } - - private void close() throws IOException { - try { - if (mTrieOutStream != null) { - mTrieOutStream.close(); - } - if (mHeaderOutStream != null) { - mHeaderOutStream.close(); - } - if (mFreqOutStream != null) { - mFreqOutStream.close(); - } - if (mTerminalAddressTableOutStream != null) { - mTerminalAddressTableOutStream.close(); - } - if (mUnigramTimestampOutStream != null) { - mUnigramTimestampOutStream.close(); - } - } finally { - mTrieOutStream = null; - mHeaderOutStream = null; - mFreqOutStream = null; - mTerminalAddressTableOutStream = null; - } - } - + // TODO: This builds a FusionDictionary first and iterates it to add words to the binary + // dictionary. However, it is possible to just add words directly to the binary dictionary + // instead. + // In the long run, when we stop supporting version 2, FusionDictionary will become deprecated + // and we can remove it. 
Then we'll be able to just call BinaryDictionary directly. @Override - public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + public void writeDictionary(FusionDictionary dict, FormatOptions formatOptions) throws IOException, UnsupportedFormatException { if (formatOptions.mVersion != FormatSpec.VERSION4) { throw new UnsupportedFormatException("File header has a wrong version number : " @@ -243,208 +54,70 @@ public class Ver4DictEncoder implements DictEncoder { if (!mDictPlacedDir.isDirectory()) { throw new UnsupportedFormatException("Given path is not a directory."); } - - if (mTrieOutStream == null) { - openStreams(formatOptions, dict.mOptions); - } - - BinaryDictEncoderUtils.writeDictionaryHeader(mHeaderOutStream, dict, formatOptions); - - MakedictLog.i("Flattening the tree..."); - ArrayList flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); - int terminalCount = 0; - final ArrayList nodes = CollectionUtils.newArrayList(); - for (final PtNodeArray array : flatNodes) { - for (final PtNode node : array.mData) { - if (node.isTerminal()) { - nodes.add(node); - node.mTerminalId = terminalCount++; + if (!BinaryDictionary.createEmptyDictFile(mDictPlacedDir.getAbsolutePath(), + FormatSpec.VERSION4, dict.mOptions.mAttributes)) { + throw new IOException("Cannot create dictionary file"); + } + final BinaryDictionary binaryDict = new BinaryDictionary(mDictPlacedDir.getAbsolutePath(), + 0l, mDictPlacedDir.length(), true /* useFullEditDistance */, + LocaleUtils.constructLocaleFromString(dict.mOptions.mAttributes.get( + FormatSpec.FileHeader.DICTIONARY_LOCALE_ATTRIBUTE)), + Dictionary.TYPE_USER /* Dictionary type. 
Does not matter for us */, + true /* isUpdatable */); + if (!binaryDict.isValidDictionary()) { + // Somehow createEmptyDictFile returned true, but the file was not created correctly + throw new IOException("Cannot create dictionary file"); + } + for (final Word word : dict) { + // TODO: switch to addMultipleDictionaryEntries when they support shortcuts + if (null == word.mShortcutTargets || word.mShortcutTargets.isEmpty()) { + binaryDict.addUnigramWord(word.mWord, word.mFrequency, + null /* shortcutTarget */, 0 /* shortcutProbability */, + word.mIsNotAWord, word.mIsBlacklistEntry, 0 /* timestamp */); + } else { + for (final WeightedString shortcutTarget : word.mShortcutTargets) { + binaryDict.addUnigramWord(word.mWord, word.mFrequency, + shortcutTarget.mWord, shortcutTarget.mFrequency, + word.mIsNotAWord, word.mIsBlacklistEntry, 0 /* timestamp */); } } - } - Collections.sort(nodes, new Comparator() { - @Override - public int compare(final PtNode lhs, final PtNode rhs) { - if (lhs.mFrequency != rhs.mFrequency) { - return lhs.mFrequency < rhs.mFrequency ? 
-1 : 1; - } - if (lhs.mTerminalId < rhs.mTerminalId) return -1; - if (lhs.mTerminalId > rhs.mTerminalId) return 1; - return 0; + if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) { + binaryDict.flushWithGC(); } - }); - int count = 0; - for (final PtNode node : nodes) { - node.mTerminalId = count++; - } - - MakedictLog.i("Computing addresses..."); - BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions); - if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); - - writeTerminalData(flatNodes, terminalCount); - if (formatOptions.mHasTimestamp) { - initUnigramTimestamps(terminalCount); - } - mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir, - formatOptions.mHasTimestamp); - writeBigrams(flatNodes, dict); - mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir); - writeShortcuts(flatNodes); - - final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); - final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; - mTrieBuf = new byte[bufferSize]; - - MakedictLog.i("Writing file..."); - for (PtNodeArray nodeArray : flatNodes) { - BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions); } - if (MakedictLog.DBG) { - BinaryDictEncoderUtils.showStatistics(flatNodes); - MakedictLog.i("has " + terminalCount + " terminals."); + for (final Word word0 : dict) { + if (null == word0.mBigrams) continue; + for (final WeightedString word1 : word0.mBigrams) { + binaryDict.addBigramWords(word0.mWord, word1.mWord, word1.mFrequency, + 0 /* timestamp */); + } + if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) { + binaryDict.flushWithGC(); + } } - mTrieOutStream.write(mTrieBuf); - - MakedictLog.i("Done"); - close(); + binaryDict.flushWithGC(); + binaryDict.close(); } @Override public void setPosition(int position) { - if (mTrieBuf == null || position < 0 || position > mTrieBuf.length) return; - mTriePos = 
position; } @Override public int getPosition() { - return mTriePos; + return 0; } @Override public void writePtNodeCount(int ptNodeCount) { - final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); - // ptNodeCount must fit on one byte or two bytes. - // Please see comments in FormatSpec - if (countSize != 1 && countSize != 2) { - throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); - } - final int encodedPtNodeCount = (countSize == 2) ? - (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; - mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, encodedPtNodeCount, - countSize); - } - - private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) { - final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); - mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, - BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos, formatOptions), - FormatSpec.PTNODE_FLAGS_SIZE); - } - - private void writeParentPosition(int parentPos, final PtNode ptNode, - final FormatOptions formatOptions) { - if (parentPos != FormatSpec.NO_PARENT_ADDRESS) { - parentPos -= ptNode.mCachedAddressAfterUpdate; - } - mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos, - formatOptions); - } - - private void writeCharacters(final int[] characters, final boolean hasSeveralChars) { - mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos); - if (hasSeveralChars) { - mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; - } - } - - private void writeTerminalId(final int terminalId) { - mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId, - FormatSpec.PTNODE_TERMINAL_ID_SIZE); - } - - private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) { - final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, 
formatOptions); - if (formatOptions.supportsDynamicUpdate()) { - mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf, - mTriePos, childrenPos); - } else { - mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf, - mTriePos, childrenPos); - } - } - - private void writeBigrams(final ArrayList flatNodes, final FusionDictionary dict) - throws IOException { - mBigramWriter.openStreams(); - for (final PtNodeArray nodeArray : flatNodes) { - for (final PtNode ptNode : nodeArray.mData) { - if (ptNode.mBigrams != null) { - mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, ptNode.mBigrams.size(), - ptNode.mBigrams.iterator(), dict); - } - } - } - mBigramWriter.closeStreams(); - } - - private void writeShortcuts(final ArrayList flatNodes) throws IOException { - mShortcutWriter.openStreams(); - for (final PtNodeArray nodeArray : flatNodes) { - for (final PtNode ptNode : nodeArray.mData) { - if (ptNode.mShortcutTargets != null && !ptNode.mShortcutTargets.isEmpty()) { - mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId, - ptNode.mShortcutTargets.iterator()); - } - } - } - mShortcutWriter.closeStreams(); } @Override public void writeForwardLinkAddress(int forwardLinkAddress) { - mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, - forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE); } @Override - public void writePtNode(final PtNode ptNode, final int parentPosition, - final FormatOptions formatOptions, final FusionDictionary dict) { - writePtNodeFlags(ptNode, formatOptions); - writeParentPosition(parentPosition, ptNode, formatOptions); - writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); - if (ptNode.isTerminal()) { - writeTerminalId(ptNode.mTerminalId); - } - writeChildrenPosition(ptNode, formatOptions); - } - - private void writeTerminalData(final ArrayList flatNodes, - final int terminalCount) throws IOException { - final byte[] freqBuf = new byte[terminalCount * 
FormatSpec.FREQUENCY_AND_FLAGS_SIZE]; - final byte[] terminalAddressTableBuf = - new byte[terminalCount * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE]; - for (final PtNodeArray nodeArray : flatNodes) { - for (final PtNode ptNode : nodeArray.mData) { - if (ptNode.isTerminal()) { - BinaryDictEncoderUtils.writeUIntToBuffer(freqBuf, - ptNode.mTerminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE, - ptNode.mFrequency, FormatSpec.FREQUENCY_AND_FLAGS_SIZE); - BinaryDictEncoderUtils.writeUIntToBuffer(terminalAddressTableBuf, - ptNode.mTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, - ptNode.mCachedAddressAfterUpdate, - FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); - } - } - } - mFreqOutStream.write(freqBuf); - mTerminalAddressTableOutStream.write(terminalAddressTableBuf); - } - - private void initUnigramTimestamps(final int terminalCount) throws IOException { - // Initial value of time stamps for each word is 0. - final byte[] unigramTimestampBuf = - new byte[terminalCount * FormatSpec.UNIGRAM_TIMESTAMP_SIZE]; - mUnigramTimestampOutStream.write(unigramTimestampBuf); + public void writePtNode( + PtNode ptNode, int parentPosition, FormatOptions formatOptions, FusionDictionary dict) { } } -- cgit v1.2.3-83-g751a