emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/pyim 19a18b414b: 优化候选词排序机制。


From: ELPA Syncer
Subject: [elpa] externals/pyim 19a18b414b: 优化候选词排序机制。
Date: Sun, 2 Jan 2022 20:57:41 -0500 (EST)

branch: externals/pyim
commit 19a18b414bc60e5f58f4d800b0d6994164df4baf
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>

    优化候选词排序机制。
    
            * pyim-dregcache.el (pyim-dregcache-sort-words): Add iword2count
and count-weight-table arguments.
    
            * pyim-dhashcache.el (pyim-dhashcache-sort-words): Add 
count-weight-table.
    
            * pyim-candidates.el (pyim-candidates-create-weight-table): New 
function.
            (pyim-candidates-sort): Add weight-table argument.
            (pyim-candidates-create:xingma, pyim-candidates-create-quanpin): 
Update.
    
        * tests/pyim-tests.el (pyim-tests-pyim-dhashcache-sort-words): New test.
---
 pyim-candidates.el  | 33 ++++++++++++++++++++++++++-------
 pyim-dhashcache.el  | 17 ++++++++++++-----
 pyim-dregcache.el   | 24 ++++++++++++++++--------
 tests/pyim-tests.el | 24 ++++++++++++++++++++++++
 4 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/pyim-candidates.el b/pyim-candidates.el
index 32e425a94e..063c733a4d 100644
--- a/pyim-candidates.el
+++ b/pyim-candidates.el
@@ -59,9 +59,22 @@
  '(pyim-candidates pyim-candidate-position))
 
 ;; ** 获取备选词列表
-(defun pyim-candidates-sort (candidates)
+(defun pyim-candidates-create-weight-table (words)
+  "Create a count weight table for candidate sorting from the order of WORDS.
+Count and count weight together determine word order; returns a hash table."
+  (let ((table (make-hash-table :test #'equal))
+        ;; FIXME: These weights are guessed numbers; the author does not yet know
+        ;; what reasonable weights are and hopes real-world use suggests better ones.
+        (weights (list 1.3 1.2 1.1)))
+    (dolist (weight weights)
+      (let ((word (pop words))) ; consume WORDS in order, one word per weight
+        (when word              ; WORDS may run out before WEIGHTS does
+          (puthash word weight table))))
+    table))
+
+(defun pyim-candidates-sort (candidates &optional weight-table)
   "对 CANDIDATES 进行排序。"
-  (pyim-dcache-call-api 'sort-words candidates))
+  (pyim-dcache-call-api 'sort-words candidates nil weight-table)) ; nil presumably lands in the backends' optional IWORD2COUNT slot -- confirm
 
 (defun pyim-candidates-create (imobjs scheme-name &optional async)
   "按照 SCHEME-NAME 对应的输入法方案, 从输入法内部对象列表:
@@ -144,12 +157,14 @@ IMOBJS 获得候选词条。"
                 ;; NOTE: 下面这种策略是否合理?
                 ;; 1. 第一个词选择公共词库中的第一个词。
                 ;; 2. 剩下的分成常用字和词,常用字优先排,字和词各按 count 大小排序。
-                (let* ((personal-words
+                (let* ((personal-words (pyim-dcache-get last-code 
'(icode2word)))
+                       (weight-table (pyim-candidates-create-weight-table 
personal-words))
+                       (personal-words
                         (pyim-candidates-sort
-                         (pyim-dcache-get last-code '(icode2word))))
+                         personal-words weight-table))
                        (common-words (pyim-dcache-get last-code '(code2word)))
                        (chief-word (pyim-candidates-get-chief scheme-name 
personal-words common-words))
-                       (common-words (pyim-candidates-sort common-words))
+                       (common-words (pyim-candidates-sort common-words 
weight-table))
                        (other-words (pyim-dcache-get last-code 
'(shortcode2word))))
                   (mapcar (lambda (word)
                             (concat prefix word))
@@ -215,7 +230,10 @@ IMOBJS 获得候选词条。"
 
 (defun pyim-candidates-create-quanpin (imobjs scheme-name &optional 
fast-search)
   "`pyim-candidates-create:quanpin' 内部使用的函数。"
-  (let (jianpin-words znabc-words personal-words common-words pinyin-chars-1 
pinyin-chars-2 chief-word)
+  (let ( jianpin-words znabc-words
+         personal-words common-words
+         pinyin-chars-1 pinyin-chars-2
+         chief-word weight-table)
     ;; 智能ABC模式,得到尽可能的拼音组合,查询这些组合,得到的词条做为联想词。
     (let ((codes (mapcar (lambda (x)
                            (pyim-subconcat x "-"))
@@ -296,7 +314,8 @@ IMOBJS 获得候选词条。"
     ;; 个人词条排序:使用词频信息对个人词库得到的候选词排序,第一个词条的位置
     ;; 比较特殊,不参与排序,具体原因请参考 `pyim-page-select-word' 中的
     ;; comment.
-    (setq personal-words (pyim-candidates-sort personal-words))
+    (setq weight-table (pyim-candidates-create-weight-table personal-words))
+    (setq personal-words (pyim-candidates-sort personal-words weight-table))
     (setq chief-word (pyim-candidates-get-chief scheme-name personal-words))
 
     ;; 调试输出
diff --git a/pyim-dhashcache.el b/pyim-dhashcache.el
index 40766bd956..14d42c02de 100644
--- a/pyim-dhashcache.el
+++ b/pyim-dhashcache.el
@@ -52,16 +52,23 @@
 (defvar pyim-dhashcache-update-icode2word-p nil)
 (defvar pyim-dhashcache-update-code2word-running-p nil)
 
-(defun pyim-dhashcache-sort-words (words-list &optional iword2count)
+(defun pyim-dhashcache-sort-words (words-list &optional iword2count count-weight-table)
   "对 WORDS-LIST 排序,词频大的排在前面.

 如果 IWORD2COUNT 为 nil, 排序将使用 `pyim-dhashcache-iword2count'
-中记录的词频信息"
-  (let ((iword2count (or iword2count pyim-dhashcache-iword2count)))
+中记录的词频信息
+
+COUNT-WEIGHT-TABLE 是一个哈希表,保存词条的 count 权重,在排序过
+程中, ‘count * 权重’ 的取值决定了排序先后顺序, 权重是一个不小于1
+的数字。"
+  (let ((iword2count (or iword2count pyim-dhashcache-iword2count))
+        (count-weight-table (or count-weight-table (make-hash-table :test #'equal))))
     (sort words-list
           (lambda (a b)
-            (> (or (gethash a iword2count) 0)
-               (or (gethash b iword2count) 0))))))
+            (> (* (or (gethash a iword2count) 0)          ; weighted count of A
+                  (or (gethash a count-weight-table) 1))  ; no weight recorded => 1
+               (* (or (gethash b iword2count) 0)          ; weighted count of B, same formula
+                  (or (gethash b count-weight-table) 1)))))))
 
 (defun pyim-dhashcache-get-shortcodes (code)
   "获取 CODE 所有的 shortcodes.
diff --git a/pyim-dregcache.el b/pyim-dregcache.el
index 09a561bea7..4a7825163c 100644
--- a/pyim-dregcache.el
+++ b/pyim-dregcache.el
@@ -67,16 +67,24 @@
         (insert-file-contents file)
         (buffer-string)))))
 
-(defun pyim-dregcache-sort-words (words-list)
+(defun pyim-dregcache-sort-words (words-list &optional iword2count count-weight-table)
   "对 WORDS-LIST 排序,词频大的排在前面.

-排序使用 `pyim-dregcache-iword2count' 中记录的词频信息"
-  (sort words-list
-        (lambda (a b)
-          (let ((a (car (split-string a ":")))
-                (b (car (split-string b ":"))))
-            (> (or (gethash a pyim-dregcache-iword2count) 0)
-               (or (gethash b pyim-dregcache-iword2count) 0))))))
+如果 IWORD2COUNT 为 nil, 排序将使用 `pyim-dregcache-iword2count'
+中记录的词频信息
+
+COUNT-WEIGHT-TABLE 是一个哈希表,保存词条的 count 权重,在排序过
+程中, ‘count * 权重’ 的取值决定了排序先后顺序。"
+  (let ((iword2count (or iword2count pyim-dregcache-iword2count))
+        (count-weight-table (or count-weight-table (make-hash-table :test #'equal))))
+    (sort words-list
+          (lambda (a b)
+            (let ((a (car (split-string a ":")))          ; look up by the text before the first ":"
+                  (b (car (split-string b ":"))))
+              (> (* (or (gethash a iword2count) 0)        ; weighted count of A
+                    (or (gethash a count-weight-table) 1))
+                 (* (or (gethash b iword2count) 0)        ; weighted count of B, same formula
+                    (or (gethash b count-weight-table) 1))))))))
 
 (defun pyim-dregcache-sort-icode2word ()
   "对个人词库排序."
diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el
index 51f713646f..2bec8f6707 100644
--- a/tests/pyim-tests.el
+++ b/tests/pyim-tests.el
@@ -858,6 +858,30 @@ yin-xing 因行
     (should (equal (gethash "n-h" pyim-dhashcache-ishortcode2word)
                    '("你慌" "你好" "你坏")))))
 
+(ert-deftest pyim-tests-pyim-dhashcache-sort-words ()
+  (let ((pyim-dhashcache-iword2count (make-hash-table :test #'equal))
+        (weight-table (make-hash-table :test #'equal))
+        words)
+    (puthash "你好" 3 pyim-dhashcache-iword2count)
+    (puthash "呢耗" 2 pyim-dhashcache-iword2count)
+    (puthash "你豪" 1 pyim-dhashcache-iword2count)
+
+    (puthash "你好" 0.1 weight-table) ; 3 * 0.1 = 0.3
+    (puthash "呢耗" 0.3 weight-table) ; 2 * 0.3 = 0.6
+    (puthash "你豪" 5   weight-table) ; 1 * 5   = 5
+
+    (setq words (list "呢耗" "你豪" "你好")) ; case 1: no tables passed, counts come from the let-bound table
+    (should (equal (pyim-dhashcache-sort-words words)
+                   '("你好" "呢耗" "你豪")))
+
+    (setq words (list "呢耗" "你豪" "你好")) ; case 2: same counts, table passed explicitly
+    (should (equal (pyim-dhashcache-sort-words words 
pyim-dhashcache-iword2count)
+                   '("你好" "呢耗" "你豪")))
+
+    (setq words (list "呢耗" "你豪" "你好")) ; case 3: order by count * weight
+    (should  (equal (pyim-dhashcache-sort-words words nil weight-table)
+                    '("你豪" "呢耗" "你好")))))
+
 ;; ** pyim-dregcache 相关单元测试
 (ert-deftest pyim-tests-pyim-general ()
   (let ((pyim-dcache-backend 'pyim-dregcache))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]