aboutsummaryrefslogtreecommitdiffstats
path: root/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java
diff options
context:
space:
mode:
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java')
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java726
1 files changed, 721 insertions, 5 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java
index 3d8f186ba..91d9cf345 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java
@@ -17,29 +17,133 @@
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
+import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
+import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
+import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
+import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
+import com.android.inputmethod.latin.utils.CollectionUtils;
+
+import android.util.Log;
import java.io.File;
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.OutputStream;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
/**
* An implementation of DictUpdater for version 4 binary dictionary.
*/
@UsedForTesting
public class Ver4DictUpdater extends Ver4DictDecoder implements DictUpdater {
+ private static final String TAG = Ver4DictUpdater.class.getSimpleName();
+
+ private OutputStream mDictStream;
+ private final File mFrequencyFile;
@UsedForTesting
- public Ver4DictUpdater(final File dictDirectory, final int factoryType) {
+ public Ver4DictUpdater(final File dictDirectory, final int factoryType)
+ throws UnsupportedFormatException {
// DictUpdater must have an updatable DictBuffer.
super(dictDirectory, ((factoryType & MASK_DICTBUFFER) == USE_BYTEARRAY)
? USE_BYTEARRAY : USE_WRITABLE_BYTEBUFFER);
+ mFrequencyFile = getFile(FILETYPE_FREQUENCY);
+ }
+
+ private static class BigramContentUpdater extends SparseTableContentUpdater {
+ private final boolean mHasTimestamp;
+
+ public BigramContentUpdater(final String name, final File baseDir,
+ final boolean hasTimestamp) {
+ super(name + FormatSpec.BIGRAM_FILE_EXTENSION,
+ FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
+ BigramContentReader.getContentFilenames(name, hasTimestamp),
+ BigramContentReader.getContentIds(hasTimestamp),
+ new DictionaryBufferFromWritableByteBufferFactory());
+ mHasTimestamp = hasTimestamp;
+ }
+
+ public void insertBigramEntries(final int terminalId, final int frequency,
+ final ArrayList<PendingAttribute> entries) throws IOException {
+ if (terminalId < 0) {
+ throw new RuntimeException("Invalid terminal id : " + terminalId);
+ }
+ openStreamsAndBuffers();
+
+ if (entries == null || entries.isEmpty()) {
+ setContentValue(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
+ SparseTable.NOT_EXIST);
+ return;
+ }
+ final int positionOfEntries =
+ (int) mContentFiles[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX].length();
+ setContentValue(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, positionOfEntries);
+
+ final Iterator<PendingAttribute> bigramIterator = entries.iterator();
+ while (bigramIterator.hasNext()) {
+ final PendingAttribute entry = bigramIterator.next();
+ final int flags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
+ 0 /* offset */, entry.mFrequency, frequency, "" /* word */);
+ BinaryDictEncoderUtils.writeUIntToStream(
+ mContentOutStreams[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX], flags,
+ FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
+ BinaryDictEncoderUtils.writeUIntToStream(
+ mContentOutStreams[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX], entry.mAddress,
+ FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
+ }
+ close();
+ }
+ }
+
+ private static class ShortcutContentUpdater extends SparseTableContentUpdater {
+ public ShortcutContentUpdater(final String name, final File baseDir) {
+ super(name + FormatSpec.SHORTCUT_FILE_EXTENSION,
+ FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
+ new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
+ new String[] { FormatSpec.SHORTCUT_CONTENT_ID },
+ new DictionaryBufferFromWritableByteBufferFactory());
+ }
+
+ public void insertShortcuts(final int terminalId,
+ final ArrayList<WeightedString> shortcuts) throws IOException {
+ if (terminalId < 0) {
+ throw new RuntimeException("Invalid terminal id : " + terminalId);
+ }
+ openStreamsAndBuffers();
+ if (shortcuts == null || shortcuts.isEmpty()) {
+ setContentValue(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId,
+ SparseTable.NOT_EXIST);
+ return;
+ }
+
+ final int positionOfShortcuts =
+ (int) mContentFiles[FormatSpec.SHORTCUT_CONTENT_INDEX].length();
+ setContentValue(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, positionOfShortcuts);
+
+ final Iterator<WeightedString> shortcutIterator = shortcuts.iterator();
+ while (shortcutIterator.hasNext()) {
+ final WeightedString target = shortcutIterator.next();
+ final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
+ shortcutIterator.hasNext(), target.mFrequency);
+ BinaryDictEncoderUtils.writeUIntToStream(
+ mContentOutStreams[FormatSpec.SHORTCUT_CONTENT_INDEX], shortcutFlags,
+ FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
+ CharEncoding.writeString(mContentOutStreams[FormatSpec.SHORTCUT_CONTENT_INDEX],
+ target.mWord);
+ }
+ close();
+ }
}
@Override
public void deleteWord(final String word) throws IOException, UnsupportedFormatException {
- if (mDictBuffer == null) openDictBuffer();
- readHeader();
+ if (mDictBuffer == null) {
+ openDictBuffer();
+ readHeader();
+ }
final int wordPos = getTerminalPosition(word);
if (wordPos != FormatSpec.NOT_VALID_WORD) {
mDictBuffer.position(wordPos);
@@ -49,11 +153,623 @@ public class Ver4DictUpdater extends Ver4DictDecoder implements DictUpdater {
}
}
- @Override
+ private int getNewTerminalId() {
+ // The size of frequency file is FormatSpec.FREQUENCY_AND_FLAGS_SIZE * number of terminals
+ // because each terminal always has a frequency.
+ // So we can get a fresh terminal id by this logic.
+ // CAVEAT: we are reading the file size from the disk each time: beware of race conditions,
+ // even on one thread.
+ return (int) (mFrequencyFile.length() / FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
+ }
+
+ private void updateParentPosIfNotMoved(final int nodePos, final int newParentPos,
+ final FormatOptions formatOptions) {
+ final int originalPos = getPosition();
+ setPosition(nodePos);
+ final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
+ if (!BinaryDictIOUtils.isMovedPtNode(flags, formatOptions)) {
+ final int parentOffset = newParentPos - nodePos;
+ BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, parentOffset);
+ }
+ setPosition(originalPos);
+ }
+
+ private void updateParentPositions(final int nodeArrayPos, final int newParentPos,
+ final FormatOptions formatOptions) {
+ final int originalPos = mDictBuffer.position();
+ mDictBuffer.position(nodeArrayPos);
+ int jumpCount = 0;
+ do {
+ final int count = readPtNodeCount();
+ for (int i = 0; i < count; ++i) {
+ updateParentPosIfNotMoved(getPosition(), newParentPos, formatOptions);
+ skipPtNode(formatOptions);
+ }
+ if (!readAndFollowForwardLink()) break;
+ } while (jumpCount++ < DynamicBinaryDictIOUtils.MAX_JUMPS);
+ setPosition(originalPos);
+ }
+
+ private void updateChildrenPos(final int nodePos, final int newChildrenPos,
+ final FormatOptions options) {
+ final int originalPos = getPosition();
+ setPosition(nodePos);
+ final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
+ PtNodeReader.readParentAddress(mDictBuffer, options);
+ BinaryDictIOUtils.skipString(mDictBuffer,
+ (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
+ if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readTerminalId(mDictBuffer);
+ final int basePos = getPosition();
+ BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newChildrenPos - basePos);
+ setPosition(originalPos);
+ }
+
+ private void updateTerminalPosition(final int terminalId, final int position) {
+ if (terminalId == PtNode.NOT_A_TERMINAL
+ || terminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE
+ >= mTerminalAddressTableBuffer.limit()) return;
+ mTerminalAddressTableBuffer.position(terminalId
+ * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
+ BinaryDictEncoderUtils.writeUIntToDictBuffer(mTerminalAddressTableBuffer, position,
+ FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
+ }
+
+ private void updateForwardLink(final int nodeArrayPos, final int newForwardLink,
+ final FormatOptions formatOptions) {
+ final int originalPos = getPosition();
+ setPosition(nodeArrayPos);
+ int jumpCount = 0;
+ while (jumpCount++ < DynamicBinaryDictIOUtils.MAX_JUMPS) {
+ final int ptNodeCount = readPtNodeCount();
+ for (int i = 0; i < ptNodeCount; ++i) {
+ skipPtNode(formatOptions);
+ }
+ final int forwardLinkPos = getPosition();
+ if (!readAndFollowForwardLink()) {
+ setPosition(forwardLinkPos);
+ BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newForwardLink - forwardLinkPos);
+ break;
+ }
+ }
+ setPosition(originalPos);
+ }
+
+ private void markPtNodeAsMoved(final int nodePos, final int newNodePos,
+ final FormatOptions options) {
+ final int originalPos = getPosition();
+ updateParentPosIfNotMoved(nodePos, newNodePos, options);
+ setPosition(nodePos);
+ final int currentFlags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
+ setPosition(nodePos);
+ mDictBuffer.put((byte) (FormatSpec.FLAG_IS_MOVED
+ | (currentFlags & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG))));
+ final int offset = newNodePos - nodePos;
+ BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, offset);
+ setPosition(originalPos);
+ }
+
+ /**
+ * Writes a PtNode to an output stream from a Ver4PtNodeInfo.
+ *
+ * @param nodePos the position of the head of the PtNode.
+ * @param info the PtNode info to be written.
+ * @return the size written, in bytes.
+ */
+ private int writePtNode(final int nodePos, final Ver4PtNodeInfo info) throws IOException {
+ int written = 0;
+
+ // Write flags.
+ mDictStream.write((byte) (info.mFlags & 0xFF));
+ written += FormatSpec.PTNODE_FLAGS_SIZE;
+
+ // Write the parent position.
+ final int parentOffset = info.mParentPos == FormatSpec.NO_PARENT_ADDRESS ?
+ FormatSpec.NO_PARENT_ADDRESS : info.mParentPos - nodePos;
+ BinaryDictIOUtils.writeSInt24ToStream(mDictStream, parentOffset);
+ written += FormatSpec.PARENT_ADDRESS_SIZE;
+
+ // Write a string.
+ if (((info.mFlags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0)
+ != (info.mEndIndexOfCharacters - info.mStartIndexOfCharacters > 1)) {
+ throw new RuntimeException("Inconsistent flags : hasMultipleChars = "
+ + ((info.mFlags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0) + ", length = "
+ + (info.mEndIndexOfCharacters - info.mStartIndexOfCharacters));
+ }
+ written += CharEncoding.writeCodePoints(mDictStream, info.mCharacters,
+ info.mStartIndexOfCharacters, info.mEndIndexOfCharacters);
+
+ // Write the terminal id.
+ if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
+ BinaryDictEncoderUtils.writeUIntToStream(mDictStream, info.mTerminalId,
+ FormatSpec.PTNODE_TERMINAL_ID_SIZE);
+ written += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
+ }
+
+ // Write the children position.
+ final int childrenOffset = info.mChildrenPos == FormatSpec.NO_CHILDREN_ADDRESS
+ ? 0 : info.mChildrenPos - (nodePos + written);
+ BinaryDictIOUtils.writeSInt24ToStream(mDictStream, childrenOffset);
+ written += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
+
+ return written;
+ }
+
+ /**
+ * Helper method to split and move PtNode.
+ *
+ * @param ptNodeArrayPos the position of PtNodeArray which contains the split and moved PtNode.
+ * @param splittedPtNodeToMovePos the position of the split and moved PtNode.
+ * @param newParent the parent PtNode after splitting.
+ * @param newChildren the children PtNodes after splitting.
+ * @param newParentStartPos where to write the new parent.
+ * @param formatOptions the format options.
+ */
+ private void writeSplittedPtNodes(final int ptNodeArrayPos, final int splittedPtNodeToMovePos,
+ final Ver4PtNodeInfo newParent, final Ver4PtNodeInfo[] newChildren,
+ final int newParentStartPos,
+ final FormatOptions formatOptions) throws IOException {
+ updateTerminalPosition(newParent.mTerminalId,
+ newParentStartPos + 1 /* size of PtNodeCount */);
+ int written = writePtNodeArray(newParentStartPos, new Ver4PtNodeInfo[] { newParent },
+ FormatSpec.NO_FORWARD_LINK_ADDRESS);
+ final int childrenStartPos = newParentStartPos + written;
+ writePtNodeArray(childrenStartPos, newChildren, FormatSpec.NO_FORWARD_LINK_ADDRESS);
+ int childrenNodePos = childrenStartPos + 1 /* size of PtNodeCount */;
+ for (final Ver4PtNodeInfo info : newChildren) {
+ updateTerminalPosition(info.mTerminalId, childrenNodePos);
+ childrenNodePos += computePtNodeSize(info.mCharacters, info.mStartIndexOfCharacters,
+ info.mEndIndexOfCharacters,
+ (info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0);
+ }
+
+ // Mark as moved.
+ markPtNodeAsMoved(splittedPtNodeToMovePos, newParentStartPos + 1 /* size of PtNodeCount */,
+ formatOptions);
+ updateForwardLink(ptNodeArrayPos, newParentStartPos, formatOptions);
+ }
+
+ /**
+ * Writes a node array to the stream.
+ *
+ * @param nodeArrayPos the position of the head of the node array.
+ * @param infos an array of Ver4PtNodeInfo to be written.
+ * @return the written length in bytes.
+ */
+ private int writePtNodeArray(final int nodeArrayPos, final Ver4PtNodeInfo[] infos,
+ final int forwardLink) throws IOException {
+ int written = BinaryDictIOUtils.writePtNodeCount(mDictStream, infos.length);
+ for (int i = 0; i < infos.length; ++i) {
+ written += writePtNode(nodeArrayPos + written, infos[i]);
+ }
+ BinaryDictIOUtils.writeSInt24ToStream(mDictStream, forwardLink);
+ written += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
+ return written;
+ }
+
+ private int computePtNodeSize(final int[] codePoints, final int startIndex, final int endIndex,
+ final boolean isTerminal) {
+ return FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ + CharEncoding.getCharArraySize(codePoints, startIndex, endIndex)
+ + (endIndex - startIndex > 1 ? FormatSpec.PTNODE_TERMINATOR_SIZE : 0)
+ + (isTerminal ? FormatSpec.PTNODE_TERMINAL_ID_SIZE : 0)
+ + FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
+ }
+
+ private void writeNewSinglePtNodeWithAttributes(final int[] codePoints,
+ final boolean hasShortcuts, final int terminalId, final boolean hasBigrams,
+ final boolean isNotAWord, final boolean isBlackListEntry, final int parentPos,
+ final FormatOptions formatOptions) throws IOException {
+ final int newNodeArrayPos = mDictBuffer.limit();
+ final int newNodeFlags = BinaryDictEncoderUtils.makePtNodeFlags(codePoints.length > 1,
+ terminalId != PtNode.NOT_A_TERMINAL, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts,
+ hasBigrams, isNotAWord, isBlackListEntry, formatOptions);
+ final Ver4PtNodeInfo info = new Ver4PtNodeInfo(newNodeFlags, codePoints, terminalId,
+ FormatSpec.NO_CHILDREN_ADDRESS, parentPos, 0 /* nodeSize */);
+ writePtNodeArray(newNodeArrayPos, new Ver4PtNodeInfo[] { info },
+ FormatSpec.NO_FORWARD_LINK_ADDRESS);
+ }
+
+ private int setMultipleCharsInFlags(final int currentFlags, final boolean hasMultipleChars) {
+ final int flags;
+ if (hasMultipleChars) {
+ flags = currentFlags | FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
+ } else {
+ flags = currentFlags & (~FormatSpec.FLAG_HAS_MULTIPLE_CHARS);
+ }
+ return flags;
+ }
+
+ private int setIsNotAWordInFlags(final int currentFlags, final boolean isNotAWord) {
+ final int flags;
+ if (isNotAWord) {
+ flags = currentFlags | FormatSpec.FLAG_IS_NOT_A_WORD;
+ } else {
+ flags = currentFlags & (~FormatSpec.FLAG_IS_NOT_A_WORD);
+ }
+ return flags;
+ }
+
+ private int setIsBlackListEntryInFlags(final int currentFlags, final boolean isBlackListEntry) {
+ final int flags;
+ if (isBlackListEntry) {
+ flags = currentFlags | FormatSpec.FLAG_IS_BLACKLISTED;
+ } else {
+ flags = currentFlags & (~FormatSpec.FLAG_IS_BLACKLISTED);
+ }
+ return flags;
+ }
+
+ /**
+ * Splits a PtNode.
+ *
+ * abcd - ef
+ *
+ * -> inserting "abc"
+ *
+ * abc - d - ef
+ *
+ * @param nodeArrayToSplitPos the position of PtNodeArray which contains the PtNode to split.
+ * @param nodeToSplitPos the position of the PtNode to split.
+ * @param nodeToSplitInfo the information of the PtNode to split.
+ * @param indexToSplit the index where to split in the code points array.
+ * @param parentOfNodeToSplitPos the absolute position of a parent of the node to split.
+ * @param newTerminalId the terminal id of the inserted node (corresponds to "d").
+ * @param hasShortcuts whether the inserted word should have shortcuts.
+ * @param hasBigrams whether the inserted word should have bigrams.
+ * @param isNotAWord whether the inserted word should be not a word.
+ * @param isBlackListEntry whether the inserted word should be a black list entry.
+ * @param formatOptions the format options.
+ */
+ private void splitOnly(final int nodeArrayToSplitPos, final int nodeToSplitPos,
+ final Ver4PtNodeInfo nodeToSplitInfo, final int indexToSplit,
+ final int parentOfNodeToSplitPos, final int newTerminalId, final boolean hasShortcuts,
+ final boolean hasBigrams, final boolean isNotAWord, final boolean isBlackListEntry,
+ final FormatOptions formatOptions) throws IOException {
+ final int parentNodeArrayStartPos = mDictBuffer.limit();
+ final int parentNodeStartPos = parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
+ final int parentFlags = BinaryDictEncoderUtils.makePtNodeFlags(indexToSplit > 1,
+ true /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, formatOptions);
+ final Ver4PtNodeInfo parentInfo = new Ver4PtNodeInfo(parentFlags,
+ nodeToSplitInfo.mCharacters, newTerminalId, parentNodeStartPos
+ + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, true)
+ + FormatSpec.FORWARD_LINK_ADDRESS_SIZE,
+ parentOfNodeToSplitPos, 0 /* nodeSize */);
+ parentInfo.mStartIndexOfCharacters = 0;
+ parentInfo.mEndIndexOfCharacters = indexToSplit;
+
+ // Write the child.
+ final int childrenFlags = setMultipleCharsInFlags(nodeToSplitInfo.mFlags,
+ nodeToSplitInfo.mCharacters.length - indexToSplit > 1);
+ final Ver4PtNodeInfo childrenInfo = new Ver4PtNodeInfo(childrenFlags,
+ nodeToSplitInfo.mCharacters, nodeToSplitInfo.mTerminalId,
+ nodeToSplitInfo.mChildrenPos, parentNodeStartPos, 0 /* nodeSize */);
+ childrenInfo.mStartIndexOfCharacters = indexToSplit;
+ childrenInfo.mEndIndexOfCharacters = nodeToSplitInfo.mCharacters.length;
+ if (nodeToSplitInfo.mChildrenPos != FormatSpec.NO_CHILDREN_ADDRESS) {
+ updateParentPositions(nodeToSplitInfo.mChildrenPos,
+ parentInfo.mChildrenPos + 1 /* size of PtNodeCount */, formatOptions);
+ }
+
+ writeSplittedPtNodes(nodeArrayToSplitPos, nodeToSplitPos, parentInfo,
+ new Ver4PtNodeInfo[] { childrenInfo }, parentNodeArrayStartPos, formatOptions);
+ }
+
+ /**
+ * Split and branch a PtNode.
+ *
+ * ab - cd
+ *
+ * -> inserting "ac"
+ *
+ * a - b - cd
+ * |
+ * - c
+ *
+ * @param nodeArrayToSplitPos the position of PtNodeArray which contains the PtNode to split.
+ * @param nodeToSplitPos the position of the PtNode to split.
+ * @param nodeToSplitInfo the information of the PtNode to split.
+ * @param indexToSplit the index where to split in the code points array.
+ * @param parentOfNodeToSplitPos the absolute position of parent of the node to split.
+ * @param newWordSuffixCodePoints the suffix of the newly inserted word (corresponds to "c").
+ * @param startIndexOfNewWordSuffixCodePoints the start index in newWordSuffixCodePoints where
+ * the suffix starts.
+ * @param newTerminalId the terminal id of the inserted node (correspond to "c").
+ * @param hasShortcuts whether the inserted word should have shortcuts.
+ * @param hasBigrams whether the inserted word should have bigrams.
+ * @param isNotAWord whether the inserted word should be not a word.
+ * @param isBlackListEntry whether the inserted word should be a black list entry.
+ * @param formatOptions the format options.
+ */
+ private void splitAndBranch(final int nodeArrayToSplitPos, final int nodeToSplitPos,
+ final Ver4PtNodeInfo nodeToSplitInfo, final int indexToSplit,
+ final int parentOfNodeToSplitPos, final int[] newWordSuffixCodePoints,
+ final int startIndexOfNewWordSuffixCodePoints,
+ final int newTerminalId,
+ final boolean hasShortcuts, final boolean hasBigrams, final boolean isNotAWord,
+ final boolean isBlackListEntry, final FormatOptions formatOptions) throws IOException {
+ final int parentNodeArrayStartPos = mDictBuffer.limit();
+ final int parentNodeStartPos = parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
+ final int parentFlags = BinaryDictEncoderUtils.makePtNodeFlags(
+ indexToSplit > 1,
+ false /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED,
+ false /* hasShortcut */, false /* hasBigrams */,
+ false /* isNotAWord */, false /* isBlackListEntry */, formatOptions);
+ final Ver4PtNodeInfo parentInfo = new Ver4PtNodeInfo(parentFlags,
+ nodeToSplitInfo.mCharacters, PtNode.NOT_A_TERMINAL,
+ parentNodeStartPos
+ + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, false)
+ + FormatSpec.FORWARD_LINK_ADDRESS_SIZE,
+ parentOfNodeToSplitPos, 0 /* nodeSize */);
+ parentInfo.mStartIndexOfCharacters = 0;
+ parentInfo.mEndIndexOfCharacters = indexToSplit;
+
+ final int childrenNodeArrayStartPos = parentNodeStartPos
+ + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, false)
+ + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
+ final int firstChildrenFlags = BinaryDictEncoderUtils.makePtNodeFlags(
+ newWordSuffixCodePoints.length - startIndexOfNewWordSuffixCodePoints > 1,
+ true /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, formatOptions);
+ final Ver4PtNodeInfo firstChildrenInfo = new Ver4PtNodeInfo(firstChildrenFlags,
+ newWordSuffixCodePoints, newTerminalId,
+ FormatSpec.NO_CHILDREN_ADDRESS, parentNodeStartPos,
+ 0 /* nodeSize */);
+ firstChildrenInfo.mStartIndexOfCharacters = startIndexOfNewWordSuffixCodePoints;
+ firstChildrenInfo.mEndIndexOfCharacters = newWordSuffixCodePoints.length;
+
+ final int secondChildrenStartPos = childrenNodeArrayStartPos + 1 /* size of ptNodeCount */
+ + computePtNodeSize(newWordSuffixCodePoints, startIndexOfNewWordSuffixCodePoints,
+ newWordSuffixCodePoints.length, true /* isTerminal */);
+ final int secondChildrenFlags = setMultipleCharsInFlags(nodeToSplitInfo.mFlags,
+ nodeToSplitInfo.mCharacters.length - indexToSplit > 1);
+ final Ver4PtNodeInfo secondChildrenInfo = new Ver4PtNodeInfo(secondChildrenFlags,
+ nodeToSplitInfo.mCharacters, nodeToSplitInfo.mTerminalId,
+ nodeToSplitInfo.mChildrenPos, parentNodeStartPos, 0 /* nodeSize */);
+ secondChildrenInfo.mStartIndexOfCharacters = indexToSplit;
+ secondChildrenInfo.mEndIndexOfCharacters = nodeToSplitInfo.mCharacters.length;
+ if (nodeToSplitInfo.mChildrenPos != FormatSpec.NO_CHILDREN_ADDRESS) {
+ updateParentPositions(nodeToSplitInfo.mChildrenPos, secondChildrenStartPos,
+ formatOptions);
+ }
+
+ writeSplittedPtNodes(nodeArrayToSplitPos, nodeToSplitPos, parentInfo,
+ new Ver4PtNodeInfo[] { firstChildrenInfo, secondChildrenInfo },
+ parentNodeArrayStartPos, formatOptions);
+ }
+
+ /**
+ * Inserts a word into the trie file and returns the position of inserted terminal node.
+ * If the insertion is failed, returns FormatSpec.NOT_VALID_WORD.
+ */
+ @UsedForTesting
+ private int insertWordToTrie(final String word, final int newTerminalId,
+ final boolean isNotAWord, final boolean isBlackListEntry, final boolean hasBigrams,
+ final boolean hasShortcuts) throws IOException, UnsupportedFormatException {
+ setPosition(0);
+ final FileHeader header = readHeader();
+
+ final int[] codePoints = FusionDictionary.getCodePoints(word);
+ final int wordLen = codePoints.length;
+
+ int wordPos = 0;
+ for (int depth = 0; depth < FormatSpec.MAX_WORD_LENGTH; /* nop */) {
+ final int nodeArrayPos = getPosition();
+ final int ptNodeCount = readPtNodeCount();
+ boolean goToChildren = false;
+ int parentPos = FormatSpec.NO_PARENT_ADDRESS;
+ for (int i = 0; i < ptNodeCount; ++i) {
+ final int nodePos = getPosition();
+ final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(nodePos, header.mFormatOptions);
+ if (BinaryDictIOUtils.isMovedPtNode(nodeInfo.mFlags, header.mFormatOptions)) {
+ continue;
+ }
+ if (nodeInfo.mParentPos != FormatSpec.NO_PARENT_ADDRESS) {
+ parentPos = nodePos + nodeInfo.mParentPos;
+ }
+
+ final boolean firstCharacterMatched =
+ codePoints[wordPos] == nodeInfo.mCharacters[0];
+ boolean allCharactersMatched = true;
+ int firstDifferentCharacterIndex = -1;
+ for (int p = 0; p < nodeInfo.mCharacters.length; ++p) {
+ if (wordPos + p >= codePoints.length) break;
+ if (codePoints[wordPos + p] != nodeInfo.mCharacters[p]) {
+ if (firstDifferentCharacterIndex == -1) {
+ firstDifferentCharacterIndex = p;
+ }
+ allCharactersMatched = false;
+ }
+ }
+
+ if (!firstCharacterMatched) {
+ // Go to the next sibling node.
+ continue;
+ }
+
+ if (!allCharactersMatched) {
+ final int parentNodeArrayStartPos = mDictBuffer.limit();
+ splitAndBranch(nodeArrayPos, nodePos, nodeInfo, firstDifferentCharacterIndex,
+ parentPos, codePoints, wordPos + firstDifferentCharacterIndex,
+ newTerminalId, hasShortcuts, hasBigrams, isNotAWord,
+ isBlackListEntry, header.mFormatOptions);
+
+ return parentNodeArrayStartPos + computePtNodeSize(codePoints, wordPos,
+ wordPos + firstDifferentCharacterIndex, false)
+ + FormatSpec.FORWARD_LINK_ADDRESS_SIZE + 1 /* size of PtNodeCount */;
+ }
+
+ if (wordLen - wordPos < nodeInfo.mCharacters.length) {
+ final int parentNodeArrayStartPos = mDictBuffer.limit();
+ splitOnly(nodeArrayPos, nodePos, nodeInfo, wordLen - wordPos, parentPos,
+ newTerminalId, hasShortcuts, hasBigrams, isNotAWord, isBlackListEntry,
+ header.mFormatOptions);
+
+ // Return the position of the inserted word.
+ return parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
+ }
+
+ wordPos += nodeInfo.mCharacters.length;
+ if (wordPos == wordLen) {
+ // This dictionary already contains the word.
+ Log.e(TAG, "Something went wrong. If the word is already contained, "
+ + " there is no need to insert new PtNode.");
+ return FormatSpec.NOT_VALID_WORD;
+ }
+ if (nodeInfo.mChildrenPos == FormatSpec.NO_CHILDREN_ADDRESS) {
+ // There are no children.
+ // We need to add a new node as a child of this node.
+ final int newNodeArrayPos = mDictBuffer.limit();
+ final int[] newNodeCodePoints = Arrays.copyOfRange(codePoints, wordPos,
+ codePoints.length);
+ writeNewSinglePtNodeWithAttributes(newNodeCodePoints, hasShortcuts,
+ newTerminalId, hasBigrams, isNotAWord, isBlackListEntry, nodePos,
+ header.mFormatOptions);
+ updateChildrenPos(nodePos, newNodeArrayPos, header.mFormatOptions);
+ return newNodeArrayPos + 1 /* size of PtNodeCount */;
+ } else {
+ // Found the matched node.
+ // Go to the children of this node.
+ setPosition(nodeInfo.mChildrenPos);
+ goToChildren = true;
+ depth++;
+ break;
+ }
+ }
+
+ if (goToChildren) continue;
+ if (!readAndFollowForwardLink()) {
+ // Add a new node that contains [wordPos, word.length()-1].
+ // and update the forward link.
+ final int newNodeArrayPos = mDictBuffer.limit();
+ final int[] newCodePoints = Arrays.copyOfRange(codePoints, wordPos,
+ codePoints.length);
+ writeNewSinglePtNodeWithAttributes(newCodePoints, hasShortcuts, newTerminalId,
+ hasBigrams, isNotAWord, isBlackListEntry, parentPos, header.mFormatOptions);
+ updateForwardLink(nodeArrayPos, newNodeArrayPos, header.mFormatOptions);
+ return newNodeArrayPos + 1 /* size of PtNodeCount */;
+ }
+ }
+ return FormatSpec.NOT_VALID_WORD;
+ }
+
+ private void updateFrequency(final int terminalId, final int frequency) {
+ mFrequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
+ BinaryDictEncoderUtils.writeUIntToDictBuffer(mFrequencyBuffer, frequency,
+ FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
+ }
+
+ private void insertFrequency(final int frequency) throws IOException {
+ final OutputStream frequencyStream = new FileOutputStream(mFrequencyFile,
+ true /* append */);
+ BinaryDictEncoderUtils.writeUIntToStream(frequencyStream, frequency,
+ FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
+ frequencyStream.close();
+ }
+
+ private void insertTerminalPosition(final int posOfTerminal) throws IOException,
+ UnsupportedFormatException {
+ final OutputStream terminalPosStream = new FileOutputStream(
+ getFile(FILETYPE_TERMINAL_ADDRESS_TABLE), true /* append */);
+ BinaryDictEncoderUtils.writeUIntToStream(terminalPosStream, posOfTerminal,
+ FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
+ terminalPosStream.close();
+ }
+
+ private void insertBigrams(final int terminalId, final int frequency,
+ final ArrayList<PendingAttribute> bigramAddresses)
+ throws IOException, UnsupportedFormatException {
+ openDictBuffer();
+ final BigramContentUpdater updater = new BigramContentUpdater(mDictDirectory.getName(),
+ mDictDirectory, false);
+
+ // Convert addresses to terminal ids.
+ final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList();
+ mDictBuffer.position(0);
+ final FileHeader header = readHeader();
+ for (PendingAttribute attr : bigramAddresses) {
+ mDictBuffer.position(attr.mAddress);
+ final Ver4PtNodeInfo info = readVer4PtNodeInfo(attr.mAddress, header.mFormatOptions);
+ if (info.mTerminalId == PtNode.NOT_A_TERMINAL) {
+ throw new RuntimeException("We can't have a bigram target that's not a terminal.");
+ }
+ bigrams.add(new PendingAttribute(frequency, info.mTerminalId));
+ }
+ updater.insertBigramEntries(terminalId, frequency, bigrams);
+ close();
+ }
+
+ private void insertShortcuts(final int terminalId, final ArrayList<WeightedString> shortcuts)
+ throws IOException {
+ final ShortcutContentUpdater updater = new ShortcutContentUpdater(mDictDirectory.getName(),
+ mDictDirectory);
+ updater.insertShortcuts(terminalId, shortcuts);
+ }
+
+ private void openBuffersAndStream() throws IOException, UnsupportedFormatException {
+ openDictBuffer();
+ mDictStream = new FileOutputStream(getFile(FILETYPE_TRIE), true /* append */);
+ }
+
+ private void close() throws IOException {
+ if (mDictStream != null) {
+ mDictStream.close();
+ mDictStream = null;
+ }
+ mDictBuffer = null;
+ mFrequencyBuffer = null;
+ mTerminalAddressTableBuffer = null;
+ }
+
+ private void updateAttributes(final int posOfWord, final int frequency,
+ final ArrayList<WeightedString> bigramStrings,
+ final ArrayList<WeightedString> shortcuts, final boolean isNotAWord,
+ final boolean isBlackListEntry) throws IOException, UnsupportedFormatException {
+ mDictBuffer.position(0);
+ final FileHeader header = readHeader();
+ mDictBuffer.position(posOfWord);
+ final Ver4PtNodeInfo info = readVer4PtNodeInfo(posOfWord, header.mFormatOptions);
+ final int terminalId = info.mTerminalId;
+
+ // Update the flags.
+ final int newFlags = setIsNotAWordInFlags(
+ setIsBlackListEntryInFlags(info.mFlags, isBlackListEntry), isNotAWord);
+ mDictBuffer.position(posOfWord);
+ mDictBuffer.put((byte) newFlags);
+
+ updateFrequency(terminalId, frequency);
+ insertBigrams(terminalId, frequency,
+ DynamicBinaryDictIOUtils.resolveBigramPositions(this, bigramStrings));
+ insertShortcuts(terminalId, shortcuts);
+ }
+
+ @Override @UsedForTesting
public void insertWord(final String word, final int frequency,
final ArrayList<WeightedString> bigramStrings, final ArrayList<WeightedString> shortcuts,
final boolean isNotAWord, final boolean isBlackListEntry)
throws IOException, UnsupportedFormatException {
- // TODO: Implement this method.
+ final int newTerminalId = getNewTerminalId();
+
+ openBuffersAndStream();
+ final int posOfWord = getTerminalPosition(word);
+ if (posOfWord != FormatSpec.NOT_VALID_WORD) {
+ // The word is already contained in the dictionary.
+ updateAttributes(posOfWord, frequency, bigramStrings, shortcuts, isNotAWord,
+ isBlackListEntry);
+ close();
+ return;
+ }
+
+ // Insert new PtNode into trie.
+ final int posOfTerminal = insertWordToTrie(word, newTerminalId, isNotAWord,
+ isBlackListEntry, bigramStrings != null && !bigramStrings.isEmpty(),
+ shortcuts != null && !shortcuts.isEmpty());
+ insertFrequency(frequency);
+ insertTerminalPosition(posOfTerminal);
+ close();
+
+ insertBigrams(newTerminalId, frequency,
+ DynamicBinaryDictIOUtils.resolveBigramPositions(this, bigramStrings));
+ insertShortcuts(newTerminalId, shortcuts);
}
}