From e9a0e66716dab4dd3184d009d8920de1961efdfa Mon Sep 17 00:00:00 2001 From: Amin Bandali Date: Mon, 16 Dec 2024 21:45:41 -0500 Subject: Rename to Kelar Keyboard (org.kelar.inputmethod.latin) --- .../inputmethod/latin/BinaryDictionaryTests.java | 913 +++++++++++++++++++++ 1 file changed, 913 insertions(+) create mode 100644 tests/src/org/kelar/inputmethod/latin/BinaryDictionaryTests.java (limited to 'tests/src/org/kelar/inputmethod/latin/BinaryDictionaryTests.java') diff --git a/tests/src/org/kelar/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/org/kelar/inputmethod/latin/BinaryDictionaryTests.java new file mode 100644 index 000000000..a44c0bce7 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/BinaryDictionaryTests.java @@ -0,0 +1,913 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import android.text.TextUtils; +import android.util.Pair; + +import androidx.test.InstrumentationRegistry; +import androidx.test.filters.LargeTest; +import androidx.test.runner.AndroidJUnit4; + +import org.kelar.inputmethod.latin.NgramContext.WordInfo; +import org.kelar.inputmethod.latin.common.CodePointUtils; +import org.kelar.inputmethod.latin.common.FileUtils; +import org.kelar.inputmethod.latin.makedict.DictionaryHeader; +import org.kelar.inputmethod.latin.makedict.FormatSpec; +import org.kelar.inputmethod.latin.makedict.WeightedString; +import org.kelar.inputmethod.latin.makedict.WordProperty; +import org.kelar.inputmethod.latin.utils.BinaryDictionaryUtils; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Locale; +import java.util.Random; + +@LargeTest +@RunWith(AndroidJUnit4.class) +public class BinaryDictionaryTests { + private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; + private static final String TEST_LOCALE = "test"; + private static final String DICTIONARY_ID = "TestBinaryDictionary"; + + private HashSet mDictFilesToBeDeleted = new HashSet<>(); + + @Before + public void setUp() throws Exception { + mDictFilesToBeDeleted.clear(); + } + + @After + public void tearDown() throws Exception { + for (final File dictFile : mDictFilesToBeDeleted) { + dictFile.delete(); + } + mDictFilesToBeDeleted.clear(); + } + + private File createEmptyDictionaryAndGetFile(final int formatVersion) { + return createEmptyDictionaryWithAttributesAndGetFile(formatVersion, + new HashMap()); + } + + private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, + final HashMap attributeMap) { + try { + final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion, + attributeMap); + mDictFilesToBeDeleted.add(dictFile); + return dictFile; + } catch (final IOException e) { + fail(e.toString()); + } + return null; + } + + private File createEmptyVer4DictionaryAndGetFile(final int formatVersion, + final HashMap attributeMap) throws IOException { + final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION, + InstrumentationRegistry.getTargetContext().getCacheDir()); + file.delete(); + file.mkdir(); + if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion, + Locale.ENGLISH, attributeMap)) { + return file; + } + throw new IOException("Empty dictionary " + file.getAbsolutePath() + + " cannot be created. Format version: " + formatVersion); + } + + private static BinaryDictionary getBinaryDictionary(final File dictFile) { + return new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + } + + private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) { + final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); + return new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + } + + @Test + public void testIsValidDictionary() { + final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); + BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); + assertTrue("binaryDictionary must be valid for existing valid dictionary file.", + binaryDictionary.isValidDictionary()); + binaryDictionary.close(); + assertFalse("binaryDictionary must be invalid after closing.", + binaryDictionary.isValidDictionary()); + FileUtils.deleteRecursively(dictFile); + binaryDictionary = getBinaryDictionary(dictFile); + assertFalse("binaryDictionary must be invalid for not existing dictionary file.", + binaryDictionary.isValidDictionary()); + binaryDictionary.close(); + } + + @Test + public void testConstructingDictionaryOnMemory() { + final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); + FileUtils.deleteRecursively(dictFile); + assertFalse(dictFile.exists()); + final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, + FormatSpec.VERSION403, new HashMap()); + assertTrue(binaryDictionary.isValidDictionary()); + assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); + final int probability = 100; + addUnigramWord(binaryDictionary, "word", probability); + assertEquals(probability, binaryDictionary.getFrequency("word")); + assertFalse(dictFile.exists()); + binaryDictionary.flush(); + assertTrue(dictFile.exists()); + assertTrue(binaryDictionary.isValidDictionary()); + assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); + assertEquals(probability, binaryDictionary.getFrequency("word")); + binaryDictionary.close(); + } + + @Test + public void testAddTooLongWord() { + final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); + final StringBuffer stringBuilder = new StringBuffer(); + for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) { + stringBuilder.append('a'); + } + final String validLongWord = stringBuilder.toString(); + stringBuilder.append('a'); + final String invalidLongWord = stringBuilder.toString(); + final int probability = 100; + addUnigramWord(binaryDictionary, "aaa", probability); + addUnigramWord(binaryDictionary, validLongWord, probability); + addUnigramWord(binaryDictionary, invalidLongWord, probability); + // Too long short cut. + binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */, + false /* isNotAWord */, false /* isPossiblyOffensive */, + BinaryDictionary.NOT_A_VALID_TIMESTAMP); + addUnigramWord(binaryDictionary, "abc", probability); + final int updatedProbability = 200; + // Update. + addUnigramWord(binaryDictionary, validLongWord, updatedProbability); + addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability); + addUnigramWord(binaryDictionary, "abc", updatedProbability); + + assertEquals(probability, binaryDictionary.getFrequency("aaa")); + assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord)); + assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord)); + assertEquals(updatedProbability, binaryDictionary.getFrequency("abc")); + } + + private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word, + final int probability) { + binaryDictionary.addUnigramEntry(word, probability, + false /* isBeginningOfSentence */, false /* isNotAWord */, + false /* isPossiblyOffensive */, + BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); + } + + private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0, + final String word1, final int probability) { + binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability, + BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); + } + + private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, + final String word1, final String word2, final int probability) { + binaryDictionary.addNgramEntry( + new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2, + probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); + } + + private static boolean isValidBigram(final BinaryDictionary binaryDictionary, + final String word0, final String word1) { + return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1); + } + + private static int getBigramProbability(final BinaryDictionary binaryDictionary, + final String word0, final String word1) { + return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1); + } + + private static int getTrigramProbability(final BinaryDictionary binaryDictionary, + final String word0, final String word1, final String word2) { + return binaryDictionary.getNgramProbability( + new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2); + } + + @Test + public void testAddUnigramWord() { + final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); + final int probability = 100; + addUnigramWord(binaryDictionary, "aaa", probability); + // Reallocate and create. + addUnigramWord(binaryDictionary, "aab", probability); + // Insert into children. + addUnigramWord(binaryDictionary, "aac", probability); + // Make terminal. + addUnigramWord(binaryDictionary, "aa", probability); + // Create children. + addUnigramWord(binaryDictionary, "aaaa", probability); + // Reallocate and make termianl. + addUnigramWord(binaryDictionary, "a", probability); + + final int updatedProbability = 200; + // Update. + addUnigramWord(binaryDictionary, "aaa", updatedProbability); + + assertEquals(probability, binaryDictionary.getFrequency("aab")); + assertEquals(probability, binaryDictionary.getFrequency("aac")); + assertEquals(probability, binaryDictionary.getFrequency("aa")); + assertEquals(probability, binaryDictionary.getFrequency("aaaa")); + assertEquals(probability, binaryDictionary.getFrequency("a")); + assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); + } + + @Test + public void testRandomlyAddUnigramWord() { + final int wordCount = 1000; + final int codePointSetSize = 50; + final long seed = System.currentTimeMillis(); + final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); + + final HashMap probabilityMap = new HashMap<>(); + // Test a word that isn't contained within the dictionary. + final Random random = new Random(seed); + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + for (int i = 0; i < wordCount; ++i) { + final String word = CodePointUtils.generateWord(random, codePointSet); + probabilityMap.put(word, random.nextInt(0xFF)); + } + for (String word : probabilityMap.keySet()) { + addUnigramWord(binaryDictionary, word, probabilityMap.get(word)); + } + for (String word : probabilityMap.keySet()) { + assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); + } + } + + @Test + public void testAddBigramWords() { + final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); + + final int unigramProbability = 100; + final int bigramProbability = 150; + final int updatedBigramProbability = 200; + addUnigramWord(binaryDictionary, "aaa", unigramProbability); + addUnigramWord(binaryDictionary, "abb", unigramProbability); + addUnigramWord(binaryDictionary, "bcc", unigramProbability); + addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); + addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); + addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); + addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); + + assertTrue(isValidBigram(binaryDictionary, "aaa", "abb")); + assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc")); + assertTrue(isValidBigram(binaryDictionary, "abb", "aaa")); + assertTrue(isValidBigram(binaryDictionary, "abb", "bcc")); + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); + + addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability); + assertEquals(updatedBigramProbability, + getBigramProbability(binaryDictionary, "aaa", "abb")); + + assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); + assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); + assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); + assertEquals(Dictionary.NOT_A_PROBABILITY, + getBigramProbability(binaryDictionary, "bcc", "aaa")); + assertEquals(Dictionary.NOT_A_PROBABILITY, + getBigramProbability(binaryDictionary, "bcc", "bbc")); + assertEquals(Dictionary.NOT_A_PROBABILITY, + getBigramProbability(binaryDictionary, "aaa", "aaa")); + + // Testing bigram link. + addUnigramWord(binaryDictionary, "abcde", unigramProbability); + addUnigramWord(binaryDictionary, "fghij", unigramProbability); + addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability); + addUnigramWord(binaryDictionary, "fgh", unigramProbability); + addUnigramWord(binaryDictionary, "abc", unigramProbability); + addUnigramWord(binaryDictionary, "f", unigramProbability); + + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij")); + assertEquals(Dictionary.NOT_A_PROBABILITY, + getBigramProbability(binaryDictionary, "abcde", "fgh")); + addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability); + assertEquals(updatedBigramProbability, + getBigramProbability(binaryDictionary, "abcde", "fghij")); + } + + @Test + public void testRandomlyAddBigramWords() { + final int wordCount = 100; + final int bigramCount = 1000; + final int codePointSetSize = 50; + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); + + final ArrayList words = new ArrayList<>(); + final ArrayList> bigramWords = new ArrayList<>(); + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + final HashMap unigramProbabilities = new HashMap<>(); + final HashMap, Integer> bigramProbabilities = new HashMap<>(); + + for (int i = 0; i < wordCount; ++i) { + final String word = CodePointUtils.generateWord(random, codePointSet); + words.add(word); + final int unigramProbability = random.nextInt(0xFF); + unigramProbabilities.put(word, unigramProbability); + addUnigramWord(binaryDictionary, word, unigramProbability); + } + + for (int i = 0; i < bigramCount; i++) { + final String word0 = words.get(random.nextInt(wordCount)); + final String word1 = words.get(random.nextInt(wordCount)); + if (TextUtils.equals(word0, word1)) { + continue; + } + final Pair bigram = new Pair<>(word0, word1); + bigramWords.add(bigram); + final int unigramProbability = unigramProbabilities.get(word1); + final int bigramProbability = + unigramProbability + random.nextInt(0xFF - unigramProbability); + bigramProbabilities.put(bigram, bigramProbability); + addBigramWords(binaryDictionary, word0, word1, bigramProbability); + } + + for (final Pair bigram : bigramWords) { + final int bigramProbability = bigramProbabilities.get(bigram); + assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, + isValidBigram(binaryDictionary, bigram.first, bigram.second)); + assertEquals(bigramProbability, + getBigramProbability(binaryDictionary, bigram.first, bigram.second)); + } + } + + @Test + public void testAddTrigramWords() { + final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); + final int unigramProbability = 100; + final int trigramProbability = 150; + final int updatedTrigramProbability = 200; + addUnigramWord(binaryDictionary, "aaa", unigramProbability); + addUnigramWord(binaryDictionary, "abb", unigramProbability); + addUnigramWord(binaryDictionary, "bcc", unigramProbability); + + addBigramWords(binaryDictionary, "abb", "bcc", 10); + addBigramWords(binaryDictionary, "abb", "aaa", 10); + + addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability); + addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability); + + assertEquals(trigramProbability, + getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc")); + assertEquals(trigramProbability, + getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); + assertFalse(isValidBigram(binaryDictionary, "aaa", "abb")); + + addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability); + assertEquals(updatedTrigramProbability, + getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); + } + + @Test + public void testFlushDictionary() { + final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); + BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); + + final int probability = 100; + addUnigramWord(binaryDictionary, "aaa", probability); + addUnigramWord(binaryDictionary, "abcd", probability); + // Close without flushing. + binaryDictionary.close(); + + binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + + assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa")); + assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd")); + + addUnigramWord(binaryDictionary, "aaa", probability); + addUnigramWord(binaryDictionary, "abcd", probability); + binaryDictionary.flush(); + binaryDictionary.close(); + + binaryDictionary = getBinaryDictionary(dictFile); + assertEquals(probability, binaryDictionary.getFrequency("aaa")); + assertEquals(probability, binaryDictionary.getFrequency("abcd")); + addUnigramWord(binaryDictionary, "bcde", probability); + binaryDictionary.flush(); + binaryDictionary.close(); + + binaryDictionary = getBinaryDictionary(dictFile); + assertEquals(probability, binaryDictionary.getFrequency("bcde")); + binaryDictionary.close(); + } + + @Test + public void testFlushWithGCDictionary() { + final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); + BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); + final int unigramProbability = 100; + final int bigramProbability = 150; + addUnigramWord(binaryDictionary, "aaa", unigramProbability); + addUnigramWord(binaryDictionary, "abb", unigramProbability); + addUnigramWord(binaryDictionary, "bcc", unigramProbability); + addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); + addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); + addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); + addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); + binaryDictionary.flushWithGC(); + binaryDictionary.close(); + + binaryDictionary = getBinaryDictionary(dictFile); + assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa")); + assertEquals(unigramProbability, binaryDictionary.getFrequency("abb")); + assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc")); + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); + assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); + assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); + assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); + assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); + binaryDictionary.flushWithGC(); + binaryDictionary.close(); + } + + @Test + public void testAddBigramWordsAndFlashWithGC() { + final int wordCount = 100; + final int bigramCount = 1000; + final int codePointSetSize = 30; + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + + final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); + BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); + + final ArrayList words = new ArrayList<>(); + final ArrayList> bigramWords = new ArrayList<>(); + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + final HashMap unigramProbabilities = new HashMap<>(); + final HashMap, Integer> bigramProbabilities = new HashMap<>(); + + for (int i = 0; i < wordCount; ++i) { + final String word = CodePointUtils.generateWord(random, codePointSet); + words.add(word); + final int unigramProbability = random.nextInt(0xFF); + unigramProbabilities.put(word, unigramProbability); + addUnigramWord(binaryDictionary, word, unigramProbability); + } + + for (int i = 0; i < bigramCount; i++) { + final String word0 = words.get(random.nextInt(wordCount)); + final String word1 = words.get(random.nextInt(wordCount)); + if (TextUtils.equals(word0, word1)) { + continue; + } + final Pair bigram = new Pair<>(word0, word1); + bigramWords.add(bigram); + final int unigramProbability = unigramProbabilities.get(word1); + final int bigramProbability = + unigramProbability + random.nextInt(0xFF - unigramProbability); + bigramProbabilities.put(bigram, bigramProbability); + addBigramWords(binaryDictionary, word0, word1, bigramProbability); + } + + binaryDictionary.flushWithGC(); + binaryDictionary.close(); + binaryDictionary = getBinaryDictionary(dictFile); + + for (final Pair bigram : bigramWords) { + final int bigramProbability = bigramProbabilities.get(bigram); + assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, + isValidBigram(binaryDictionary, bigram.first, bigram.second)); + assertEquals(bigramProbability, + getBigramProbability(binaryDictionary, bigram.first, bigram.second)); + } + } + + @Test + public void testRandomOperationsAndFlashWithGC() { + final int maxUnigramCount = 5000; + final int maxBigramCount = 10000; + final HashMap attributeMap = new HashMap<>(); + attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); + attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); + + final int flashWithGCIterationCount = 50; + final int operationCountInEachIteration = 200; + final int initialUnigramCount = 100; + final float addUnigramProb = 0.5f; + final float addBigramProb = 0.8f; + final int codePointSetSize = 30; + + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, + attributeMap); + BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); + + final ArrayList words = new ArrayList<>(); + final ArrayList> bigramWords = new ArrayList<>(); + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + final HashMap unigramProbabilities = new HashMap<>(); + final HashMap, Integer> bigramProbabilities = new HashMap<>(); + for (int i = 0; i < initialUnigramCount; ++i) { + final String word = CodePointUtils.generateWord(random, codePointSet); + words.add(word); + final int unigramProbability = random.nextInt(0xFF); + unigramProbabilities.put(word, unigramProbability); + addUnigramWord(binaryDictionary, word, unigramProbability); + } + binaryDictionary.flushWithGC(); + binaryDictionary.close(); + + for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) { + binaryDictionary = getBinaryDictionary(dictFile); + for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) { + // Add unigram. + if (random.nextFloat() < addUnigramProb) { + final String word = CodePointUtils.generateWord(random, codePointSet); + words.add(word); + final int unigramProbability = random.nextInt(0xFF); + unigramProbabilities.put(word, unigramProbability); + addUnigramWord(binaryDictionary, word, unigramProbability); + } + // Add bigram. + if (random.nextFloat() < addBigramProb && words.size() > 2) { + final int word0Index = random.nextInt(words.size()); + int word1Index = random.nextInt(words.size() - 1); + if (word0Index <= word1Index) { + word1Index++; + } + final String word0 = words.get(word0Index); + final String word1 = words.get(word1Index); + if (TextUtils.equals(word0, word1)) { + continue; + } + final int unigramProbability = unigramProbabilities.get(word1); + final int bigramProbability = + unigramProbability + random.nextInt(0xFF - unigramProbability); + final Pair bigram = new Pair<>(word0, word1); + bigramWords.add(bigram); + bigramProbabilities.put(bigram, bigramProbability); + addBigramWords(binaryDictionary, word0, word1, bigramProbability); + } + } + + // Test whether the all unigram operations are collectlly handled. + for (int i = 0; i < words.size(); i++) { + final String word = words.get(i); + final int unigramProbability = unigramProbabilities.get(word); + assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); + } + // Test whether the all bigram operations are collectlly handled. + for (int i = 0; i < bigramWords.size(); i++) { + final Pair bigram = bigramWords.get(i); + final int probability; + if (bigramProbabilities.containsKey(bigram)) { + probability = bigramProbabilities.get(bigram); + } else { + probability = Dictionary.NOT_A_PROBABILITY; + } + + assertEquals(probability, + getBigramProbability(binaryDictionary, bigram.first, bigram.second)); + assertEquals(probability != Dictionary.NOT_A_PROBABILITY, + isValidBigram(binaryDictionary, bigram.first, bigram.second)); + } + binaryDictionary.flushWithGC(); + binaryDictionary.close(); + } + } + + @Test + public void testAddManyUnigramsAndFlushWithGC() { + final int flashWithGCIterationCount = 3; + final int codePointSetSize = 50; + + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + + final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); + + final ArrayList words = new ArrayList<>(); + final HashMap unigramProbabilities = new HashMap<>(); + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + + BinaryDictionary binaryDictionary; + for (int i = 0; i < flashWithGCIterationCount; i++) { + binaryDictionary = getBinaryDictionary(dictFile); + while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { + final String word = CodePointUtils.generateWord(random, codePointSet); + words.add(word); + final int unigramProbability = random.nextInt(0xFF); + unigramProbabilities.put(word, unigramProbability); + addUnigramWord(binaryDictionary, word, unigramProbability); + } + + for (int j = 0; j < words.size(); j++) { + final String word = words.get(j); + final int unigramProbability = unigramProbabilities.get(word); + assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); + } + + binaryDictionary.flushWithGC(); + binaryDictionary.close(); + } + } + + @Test + public void testUnigramAndBigramCount() { + final int maxUnigramCount = 5000; + final int maxBigramCount = 10000; + final HashMap attributeMap = new HashMap<>(); + attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); + attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); + + final int flashWithGCIterationCount = 10; + final int codePointSetSize = 50; + final int unigramCountPerIteration = 1000; + final int bigramCountPerIteration = 2000; + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, + attributeMap); + + final ArrayList words = new ArrayList<>(); + final HashSet> bigrams = new HashSet<>(); + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + + BinaryDictionary binaryDictionary; + for (int i = 0; i < flashWithGCIterationCount; i++) { + binaryDictionary = getBinaryDictionary(dictFile); + for (int j = 0; j < unigramCountPerIteration; j++) { + final String word = CodePointUtils.generateWord(random, codePointSet); + words.add(word); + final int unigramProbability = random.nextInt(0xFF); + addUnigramWord(binaryDictionary, word, unigramProbability); + } + for (int j = 0; j < bigramCountPerIteration; j++) { + final String word0 = words.get(random.nextInt(words.size())); + final String word1 = words.get(random.nextInt(words.size())); + if (TextUtils.equals(word0, word1)) { + continue; + } + bigrams.add(new Pair<>(word0, word1)); + final int bigramProbability = random.nextInt(0xF); + addBigramWords(binaryDictionary, word0, word1, bigramProbability); + } + assertEquals(new HashSet<>(words).size(), Integer.parseInt( + binaryDictionary.getPropertyForGettingStats( + BinaryDictionary.UNIGRAM_COUNT_QUERY))); + assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( + binaryDictionary.getPropertyForGettingStats( + BinaryDictionary.BIGRAM_COUNT_QUERY))); + binaryDictionary.flushWithGC(); + assertEquals(new HashSet<>(words).size(), Integer.parseInt( + binaryDictionary.getPropertyForGettingStats( + BinaryDictionary.UNIGRAM_COUNT_QUERY))); + assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( + binaryDictionary.getPropertyForGettingStats( + BinaryDictionary.BIGRAM_COUNT_QUERY))); + binaryDictionary.close(); + } + } + + @Test + public void testGetWordProperties() { + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + final int UNIGRAM_COUNT = 1000; + final int BIGRAM_COUNT = 1000; + final int codePointSetSize = 20; + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); + final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); + + final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", + false /* isBeginningOfSentence */); + assertFalse(invalidWordProperty.isValid()); + + final ArrayList words = new ArrayList<>(); + final HashMap wordProbabilities = new HashMap<>(); + final HashMap> bigrams = new HashMap<>(); + final HashMap, Integer> bigramProbabilities = new HashMap<>(); + + for (int i = 0; i < UNIGRAM_COUNT; i++) { + final String word = CodePointUtils.generateWord(random, codePointSet); + final int unigramProbability = random.nextInt(0xFF); + final boolean isNotAWord = random.nextBoolean(); + final boolean isPossiblyOffensive = random.nextBoolean(); + // TODO: Add tests for historical info. + binaryDictionary.addUnigramEntry(word, unigramProbability, + false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive, + BinaryDictionary.NOT_A_VALID_TIMESTAMP); + if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + words.add(word); + wordProbabilities.put(word, unigramProbability); + final WordProperty wordProperty = binaryDictionary.getWordProperty(word, + false /* isBeginningOfSentence */); + assertEquals(word, wordProperty.mWord); + assertTrue(wordProperty.isValid()); + assertEquals(isNotAWord, wordProperty.mIsNotAWord); + assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive); + assertEquals(false, wordProperty.mHasNgrams); + assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability); + } + + for (int i = 0; i < BIGRAM_COUNT; i++) { + final int word0Index = random.nextInt(wordProbabilities.size()); + final int word1Index = random.nextInt(wordProbabilities.size()); + if (word0Index == word1Index) { + continue; + } + final String word0 = words.get(word0Index); + final String word1 = words.get(word1Index); + final int unigramProbability = wordProbabilities.get(word1); + final int bigramProbability = + unigramProbability + random.nextInt(0xFF - unigramProbability); + addBigramWords(binaryDictionary, word0, word1, bigramProbability); + if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + if (!bigrams.containsKey(word0)) { + final HashSet bigramWord1s = new HashSet<>(); + bigrams.put(word0, bigramWord1s); + } + bigrams.get(word0).add(word1); + bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability); + } + + for (int i = 0; i < words.size(); i++) { + final String word0 = words.get(i); + if (!bigrams.containsKey(word0)) { + continue; + } + final HashSet bigramWord1s = bigrams.get(word0); + final WordProperty wordProperty = binaryDictionary.getWordProperty(word0, + false /* isBeginningOfSentence */); + assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size()); + // TODO: Support ngram. + for (final WeightedString bigramTarget : wordProperty.getBigrams()) { + final String word1 = bigramTarget.mWord; + assertTrue(bigramWord1s.contains(word1)); + final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1)); + assertEquals(bigramProbability, bigramTarget.getProbability()); + } + } + } + + @Test + public void testIterateAllWords() { + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + final int UNIGRAM_COUNT = 1000; + final int BIGRAM_COUNT = 1000; + final int codePointSetSize = 20; + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); + + final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", + false /* isBeginningOfSentence */); + assertFalse(invalidWordProperty.isValid()); + + final ArrayList words = new ArrayList<>(); + final HashMap wordProbabilitiesToCheckLater = new HashMap<>(); + final HashMap> bigrams = new HashMap<>(); + final HashMap, Integer> bigramProbabilitiesToCheckLater = + new HashMap<>(); + + for (int i = 0; i < UNIGRAM_COUNT; i++) { + final String word = CodePointUtils.generateWord(random, codePointSet); + final int unigramProbability = random.nextInt(0xFF); + addUnigramWord(binaryDictionary, word, unigramProbability); + if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + words.add(word); + wordProbabilitiesToCheckLater.put(word, unigramProbability); + } + + for (int i = 0; i < BIGRAM_COUNT; i++) { + final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size()); + final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size()); + if (word0Index == word1Index) { + continue; + } + final String word0 = words.get(word0Index); + final String word1 = words.get(word1Index); + final int unigramProbability = wordProbabilitiesToCheckLater.get(word1); + final int bigramProbability = + unigramProbability + random.nextInt(0xFF - unigramProbability); + addBigramWords(binaryDictionary, word0, word1, bigramProbability); + if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + if (!bigrams.containsKey(word0)) { + final HashSet bigramWord1s = new HashSet<>(); + bigrams.put(word0, bigramWord1s); + } + bigrams.get(word0).add(word1); + bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability); + } + + final HashSet wordSet = new HashSet<>(words); + final HashSet> bigramSet = + new HashSet<>(bigramProbabilitiesToCheckLater.keySet()); + int token = 0; + do { + final BinaryDictionary.GetNextWordPropertyResult result = + binaryDictionary.getNextWordProperty(token); + final WordProperty wordProperty = result.mWordProperty; + final String word0 = wordProperty.mWord; + assertEquals((int)wordProbabilitiesToCheckLater.get(word0), + wordProperty.mProbabilityInfo.mProbability); + wordSet.remove(word0); + final HashSet bigramWord1s = bigrams.get(word0); + // TODO: Support ngram. + if (wordProperty.mHasNgrams) { + for (final WeightedString bigramTarget : wordProperty.getBigrams()) { + final String word1 = bigramTarget.mWord; + assertTrue(bigramWord1s.contains(word1)); + final Pair bigram = new Pair<>(word0, word1); + final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram); + assertEquals(bigramProbability, bigramTarget.getProbability()); + bigramSet.remove(bigram); + } + } + token = result.mNextToken; + } while (token != 0); + assertTrue(wordSet.isEmpty()); + assertTrue(bigramSet.isEmpty()); + } + + @Test + public void testPossiblyOffensiveAttributeMaintained() { + final BinaryDictionary binaryDictionary = + getEmptyBinaryDictionary(FormatSpec.VERSION403); + binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0); + WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false); + assertEquals(true, wordProperty.mIsPossiblyOffensive); + } + + @Test + public void testBeginningOfSentence() { + final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); + final int dummyProbability = 0; + final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE; + final int bigramProbability = 200; + addUnigramWord(binaryDictionary, "aaa", dummyProbability); + binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, + BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); + assertEquals(bigramProbability, + binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); + binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, + BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); + addUnigramWord(binaryDictionary, "bbb", dummyProbability); + binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability, + BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); + binaryDictionary.flushWithGC(); + assertEquals(bigramProbability, + binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); + assertEquals(bigramProbability, + binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb")); + } +} -- cgit v1.2.3-83-g751a