diff --git a/src/libime/core/historybigram.cpp b/src/libime/core/historybigram.cpp
index 2df8801..93deff0 100644
--- a/src/libime/core/historybigram.cpp
+++ b/src/libime/core/historybigram.cpp
@@ -571,4 +571,14 @@ void HistoryBigram::fillPredict(std::unordered_set<std::string> &words,
         pool.fillPredict(words, lookup, maxSize);
     });
 }
+
+bool HistoryBigram::containsBigram(std::string_view prev,
+                                   std::string_view cur) const {
+    FCITX_D();
+    return std::ranges::any_of(d->pools_,
+                               [&prev, &cur](const HistoryBigramPool &pool) {
+                                   return pool.bigramFreq(prev, cur) > 0;
+                               });
+}
+
 } // namespace libime
diff --git a/src/libime/core/historybigram.h b/src/libime/core/historybigram.h
index e7b03a7..4c29bdf 100644
--- a/src/libime/core/historybigram.h
+++ b/src/libime/core/historybigram.h
@@ -57,6 +57,8 @@ class LIBIMECORE_EXPORT HistoryBigram {
                      const std::vector<std::string> &sentence,
                      size_t maxSize) const;
 
+    bool containsBigram(std::string_view prev, std::string_view cur) const;
+
 private:
     std::unique_ptr<HistoryBigramPrivate> d_ptr;
     FCITX_DECLARE_PRIVATE(HistoryBigram);
diff --git a/src/libime/core/languagemodel.cpp b/src/libime/core/languagemodel.cpp
index 72cc204..feb3865 100644
--- a/src/libime/core/languagemodel.cpp
+++ b/src/libime/core/languagemodel.cpp
@@ -5,6 +5,7 @@
  */
 
 #include "languagemodel.h"
+#include <algorithm>
 #include
 #include
 #include
@@ -27,9 +28,11 @@
 #include "lm/config.hh"
 #include "lm/lm_exception.hh"
 #include "lm/model.hh"
+#include "lm/return.hh"
 #include "lm/state.hh"
 #include "lm/word_index.hh"
 #include "util/string_piece.hh"
+#include "utils.h"
 
 namespace libime {
 
@@ -72,6 +75,8 @@ const DATrie<float> &StaticLanguageModelFile::predictionTrie() const {
 static_assert(sizeof(void *) + sizeof(lm::ngram::State) <= StateSize, "Size");
 
+LanguageModelBase::~LanguageModelBase() {}
+
 bool LanguageModelBase::isNodeUnknown(const LatticeNode &node) const {
     return isUnknown(node.idx(), node.word());
 }
 
@@ -217,6 +222,32 @@ bool LanguageModel::isUnknown(WordIndex idx, std::string_view /*word*/) const {
     return idx == unknown();
 }
 
+unsigned int
+LanguageModel::maxNgramLength(const std::vector<std::string> &words) const {
+    FCITX_D();
+    if (!d->model()) {
+        return 0;
+    }
+    State state = nullState();
+    State outState;
+
+    unsigned int maxNgramLength = 0;
+    std::vector<WordIndex> nodes;
+    for (const auto &word : words) {
+        const auto idx = index(word);
+        lm::FullScoreReturn full =
+            d->model()->FullScore(lmState(state), idx, lmState(outState));
+        unsigned int ngramLength = full.ngram_length;
+        if (ngramLength == 1 && idx == unknown()) {
+            ngramLength = 0;
+        }
+
+        maxNgramLength = std::max(maxNgramLength, ngramLength);
+        state = outState;
+    }
+    return maxNgramLength;
+}
+
 void LanguageModel::setUnknownPenalty(float unknown) {
     FCITX_D();
     d->unknown_ = unknown;
diff --git a/src/libime/core/languagemodel.h b/src/libime/core/languagemodel.h
index d16960e..a0e01aa 100644
--- a/src/libime/core/languagemodel.h
+++ b/src/libime/core/languagemodel.h
@@ -32,7 +32,7 @@ class LanguageModelResolverPrivate;
 
 class LIBIMECORE_EXPORT LanguageModelBase {
 public:
-    virtual ~LanguageModelBase() {}
+    virtual ~LanguageModelBase();
 
     virtual WordIndex beginSentence() const = 0;
     virtual WordIndex endSentence() const = 0;
@@ -89,6 +89,8 @@ class LIBIMECORE_EXPORT LanguageModel : public LanguageModelBase {
     void setUnknownPenalty(float unknown);
     float unknownPenalty() const;
 
+    unsigned int maxNgramLength(const std::vector<std::string> &words) const;
+
 private:
     std::unique_ptr<LanguageModelPrivate> d_ptr;
     FCITX_DECLARE_PRIVATE(LanguageModel);
diff --git a/src/libime/core/userlanguagemodel.cpp b/src/libime/core/userlanguagemodel.cpp
index 76e7f5b..a25814f 100644
--- a/src/libime/core/userlanguagemodel.cpp
+++ b/src/libime/core/userlanguagemodel.cpp
@@ -9,10 +9,13 @@
 #include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include "constants.h"
 #include "historybigram.h"
@@ -150,4 +153,21 @@ bool UserLanguageModel::useOnlyUnigram() const {
     FCITX_D();
     return d->useOnlyUnigram_;
 }
+
+bool UserLanguageModel::containsNonUnigram(
+    const std::vector<std::string> &words) const {
+    FCITX_D();
+    if (words.size() <= 1 || d->useOnlyUnigram_) {
+        return false;
+    }
+
+    for (auto iter = words.begin(); iter != std::prev(words.end()); ++iter) {
+        if (d->history_.containsBigram(*iter, *(std::next(iter)))) {
+            return true;
+        }
+    }
+
+    return LanguageModel::maxNgramLength(words) > 1;
+}
+
 } // namespace libime
diff --git a/src/libime/core/userlanguagemodel.h b/src/libime/core/userlanguagemodel.h
index 5b13b61..1c767b3 100644
--- a/src/libime/core/userlanguagemodel.h
+++ b/src/libime/core/userlanguagemodel.h
@@ -9,7 +9,9 @@
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
@@ -44,6 +46,8 @@ class LIBIMECORE_EXPORT UserLanguageModel : public LanguageModel {
                 State &out) const override;
     bool isUnknown(WordIndex idx, std::string_view view) const override;
 
+    bool containsNonUnigram(const std::vector<std::string> &words) const;
+
 private:
     std::unique_ptr<UserLanguageModelPrivate> d_ptr;
     FCITX_DECLARE_PRIVATE(UserLanguageModel);
diff --git a/src/libime/pinyin/pinyincontext.cpp b/src/libime/pinyin/pinyincontext.cpp
index 605f8d7..0fe8859 100644
--- a/src/libime/pinyin/pinyincontext.cpp
+++ b/src/libime/pinyin/pinyincontext.cpp
@@ -251,9 +251,17 @@ class PinyinContextPrivate : public fcitx::QPtrHolder<PinyinContext> {
                 totalPinyinLength += item.encodedPinyin_.size() / 2;
             }
         }
-        if (!isAllSingleWord && !hasCustom && totalPinyinLength > 4) {
-            return LearnWordResult::Ignored;
+
+        FCITX_Q();
+        if (!hasCustom) {
+            if ((!isAllSingleWord && totalPinyinLength > 4)) {
+                return LearnWordResult::Ignored;
+            }
+            if (ime_->model()->containsNonUnigram(q->selectedWords())) {
+                return LearnWordResult::Ignored;
+            }
         }
+
         for (auto &s : selected_) {
             for (auto &item : s) {
                 if (item.type_ == SelectedPinyinType::Separator) {
diff --git a/test/testpinyincontext.cpp b/test/testpinyincontext.cpp
index e1205a3..7d36e9a 100644
--- a/test/testpinyincontext.cpp
+++ b/test/testpinyincontext.cpp
@@ -285,5 +285,27 @@
         }
     }
 
+    {
+        c.clear();
+        c.clearContextWords();
+        FCITX_ASSERT(!ime.model()->history().containsBigram("他", "爱"));
+        c.type("taai");
+        size_t i = 0;
+        for (const auto &candidate : c.candidatesToCursor()) {
+            if (candidate.toString() == "他爱") {
+                break;
+            }
+            i++;
+        }
+        FCITX_ASSERT(i < c.candidatesToCursor().size());
+        c.selectCandidatesToCursor(i);
+
+        FCITX_ASSERT(c.selected());
+        FCITX_ASSERT(c.selectedSentence() == "他爱");
+        c.learn();
+        c.clear();
+        FCITX_ASSERT(ime.model()->history().containsBigram("他", "爱"));
+    }
+
     return 0;
 }
diff --git a/test/testpinyinime_unit.cpp b/test/testpinyinime_unit.cpp
index f3b9d3b..473d5a4 100644
--- a/test/testpinyinime_unit.cpp
+++ b/test/testpinyinime_unit.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include "libime/core/historybigram.h"
 #include "libime/core/userlanguagemodel.h"
 #include "libime/pinyin/pinyincontext.h"
 #include "libime/pinyin/pinyincorrectionprofile.h"
@@ -55,8 +56,8 @@ int main() {
                                          "ni'hao'zhong'guo", "你好中国"));
     c.select(std::distance(c.candidates().begin(), iter));
     c.learn();
-    FCITX_ASSERT(ime.dict()->lookupWord(PinyinDictionary::UserDict,
-                                        "ni'hao'zhong'guo", "你好中国"));
+    FCITX_ASSERT(ime.model()->history().containsBigram("你", "好"));
+    FCITX_ASSERT(ime.model()->history().containsBigram("好", "中国"));
     c.setUseShuangpin(true);
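
For context, a minimal sketch of how the new HistoryBigram::containsBigram() query behaves; this example is not part of the patch. It assumes the in-tree include path used by the tests above and the pre-existing HistoryBigram::add(const std::vector<std::string> &) overload to seed the history; only containsBigram() itself comes from this diff.

#include <iostream>
#include <string>
#include <vector>
#include "libime/core/historybigram.h"

int main() {
    libime::HistoryBigram history;

    // Record one committed, already-segmented sentence in the history.
    std::vector<std::string> sentence = {"他", "爱"};
    history.add(sentence);

    // containsBigram() is directional: it reports (prev, cur) pairs that were
    // seen adjacent in that order in some recorded sentence.
    std::cout << std::boolalpha
              << history.containsBigram("他", "爱") << "\n"  // expected: true
              << history.containsBigram("爱", "他") << "\n"; // expected: false
    return 0;
}

This is the same property the updated tests assert after c.learn(), and it is the history-side signal that UserLanguageModel::containsNonUnigram() combines with LanguageModel::maxNgramLength() to skip learning sentences the model already covers.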