[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/pyim 12435095d3 08/12: Sort dcache, dhashcache, dregcache
From: ELPA Syncer
Subject: [elpa] externals/pyim 12435095d3 08/12: Sort dcache, dhashcache, dregcache
Date: Thu, 9 Jun 2022 10:57:53 -0400 (EDT)
branch: externals/pyim
commit 12435095d326c868aff514a1c8f312d943d25598
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>
Sort dcache, dhashcache, dregcache
---
pyim-dcache.el | 156 ++++-----
pyim-dhashcache.el | 904 +++++++++++++++++++++++++++--------------------------
pyim-dregcache.el | 581 +++++++++++++++++-----------------
3 files changed, 833 insertions(+), 808 deletions(-)
diff --git a/pyim-dcache.el b/pyim-dcache.el
index e812d3972c..8f0848f8f0 100644
--- a/pyim-dcache.el
+++ b/pyim-dcache.el
@@ -65,15 +65,7 @@ pyim 对资源的消耗。
2. 自动更新功能无法正常工作,用户通过手工从其他机器上拷贝
dcache 文件的方法让 pyim 正常工作。")
-;; ** Dcache 变量处理相关功能
-(cl-defgeneric pyim-dcache-init-variables ()
- "初始化 dcache 缓存相关变量."
- nil)
-
-(cl-defmethod pyim-dcache-init-variables :before ()
- (unless (featurep pyim-dcache-backend)
- (require pyim-dcache-backend)))
-
+;; ** Dcache 变量初始化相关函数
(defmacro pyim-dcache-init-variable (variable &optional fallback-value)
"初始化 VARIABLE.
@@ -85,18 +77,22 @@ dcache 文件的方法让 pyim 正常工作。")
,fallback-value
(make-hash-table :test #'equal)))))
-(defmacro pyim-dcache-reload-variable (variable)
- "从 `pyim-dcache-directory' 重新读取并设置 VARIABLE 的值."
- `(when (symbolp ',variable)
- (setq ,variable (or (pyim-dcache-get-value ',variable)
- (make-hash-table :test #'equal)))))
-
(defun pyim-dcache-get-value (variable)
"从 `pyim-dcache-directory' 中读取与 VARIABLE 对应的文件中保存的值."
(let ((file (expand-file-name (url-hexify-string (symbol-name variable))
pyim-dcache-directory)))
(pyim-dcache-get-value-from-file file)))
+(defun pyim-dcache-get-value-from-file (file)
+ "读取保存到 FILE 里面的 value."
+ (when (and (> (length file) 0)
+ (file-exists-p file))
+ (with-temp-buffer
+ (insert-file-contents file)
+ (ignore-errors
+ (read (current-buffer))))))
+
+;; ** Dcache 保存变量相关函数
(defun pyim-dcache-save-variable (variable value &optional
auto-backup-threshold)
"将 VARIABLE 变量的取值保存到 `pyim-dcache-directory' 中对应文件中.
@@ -106,14 +102,6 @@ dcache 文件的方法让 pyim 正常工作。")
pyim-dcache-directory)))
(pyim-dcache-save-value-to-file value file auto-backup-threshold)))
-(defun pyim-dcache-value-length (value)
- "获取 VALUE 的某个可以作为长度的值."
- (or (ignore-errors
- (if (hash-table-p value)
- (hash-table-count value)
- (length value)))
- 0))
-
(defun pyim-dcache-save-value-to-file (value file &optional
auto-backup-threshold)
"将 VALUE 保存到 FILE 文件中.
@@ -144,16 +132,14 @@ AUTO-BACKUP-THRESHOLD 倍, 那么原值将自动备份到 FILE 对应的备份
(insert ";; End:")
(pyim-dcache-write-file file)))))
-(defun pyim-dcache-get-value-from-file (file)
- "读取保存到 FILE 里面的 value."
- (when (and (> (length file) 0)
- (file-exists-p file))
- (with-temp-buffer
- (insert-file-contents file)
- (ignore-errors
- (read (current-buffer))))))
+(defun pyim-dcache-value-length (value)
+ "获取 VALUE 的某个可以作为长度的值."
+ (or (ignore-errors
+ (if (hash-table-p value)
+ (hash-table-count value)
+ (length value)))
+ 0))
-;; ** Dcache 文件处理功能
(defun pyim-dcache-write-file (filename &optional confirm)
"A helper function to write dcache files."
(let ((coding-system-for-write 'utf-8-unix)
@@ -170,30 +156,52 @@ AUTO-BACKUP-THRESHOLD 倍, 那么原值将自动备份到 FILE 对应的备份
(write-region (point-min) (point-max) filename nil :silent)
(message "Saving file %s..." filename)))
-(cl-defgeneric pyim-dcache-save-caches ()
- "保存 dcache.
+;; ** Dcache 重新加载变量相关函数
+(defmacro pyim-dcache-reload-variable (variable)
+ "从 `pyim-dcache-directory' 重新读取并设置 VARIABLE 的值."
+ `(when (symbolp ',variable)
+ (setq ,variable (or (pyim-dcache-get-value ',variable)
+ (make-hash-table :test #'equal)))))
-将用户选择过的词生成的缓存和词频缓存的取值
-保存到它们对应的文件中.")
+;; ** Dcache 初始化功能接口
+(cl-defgeneric pyim-dcache-init-variables ()
+ "初始化 dcache 缓存相关变量."
+ nil)
-;; ** Dcache 导出功能
-(cl-defgeneric pyim-dcache-export-words-and-counts (file &optional confirm ignore-counts)
- "将个人词条以及词条对应的词频信息导出到文件 FILE.
+(cl-defmethod pyim-dcache-init-variables :before ()
+ (unless (featurep pyim-dcache-backend)
+ (require pyim-dcache-backend)))
-如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为
-non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式")
+;; ** Dcache 检索词条功能接口
+(cl-defgeneric pyim-dcache-get (_code &optional _from)
+ "从 FROM 对应的 dcache 中搜索 CODE, 得到对应的词条.
-(cl-defgeneric pyim-dcache-export-personal-words (file &optional confirm)
- "将用户的个人词条导出为 pyim 词库文件.
+当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个
+code 对应的中文词条了."
+ nil)
-如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 non-nil,
-文件存在时将会提示用户是否覆盖,默认为覆盖模式。")
+(cl-defmethod pyim-dcache-get :before (_code &optional _from)
+ (unless (featurep pyim-dcache-backend)
+ (require pyim-dcache-backend)))
-;; ** Dcache 更新功能
-(cl-defgeneric pyim-dcache-update (&optional force)
- "读取并加载所有相关词库 dcache.
+;; ** Dcache 代码反查功能接口
+(cl-defgeneric pyim-dcache-search-word-code (word)
+ "从 dcache 中搜索 WROD 对应的 code.")
+
+;; ** Dcache 加词功能接口
+(cl-defgeneric pyim-dcache-insert-word (word code prepend)
+ "将词条 WORD 插入到 dcache 中。
+
+如果 PREPEND 为 non-nil, 词条将放到已有词条的最前面。
+内部函数会根据 CODE 来确定插入对应的 hash key.")
+
+;; ** Dcache 删词功能
+(cl-defgeneric pyim-dcache-delete-word (word)
+ "将中文词条 WORD 从个人词库中删除")
-如果 FORCE 为真,强制加载。")
+;; ** Dcache 更新功能接口
+(cl-defgeneric pyim-dcache-update (&optional force)
+ "读取并加载所有相关词库 dcache, 如果 FORCE 为真,强制加载。")
(defun pyim-dcache-create-files-md5 (files)
"为 FILES 生成 md5 字符串。"
@@ -204,6 +212,7 @@ non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式"
(list version file (nth 5 (file-attributes file 'string))))
files)))))
+;; ** Dcache 更新词条统计量功能接口
(cl-defgeneric pyim-dcache-update-wordcount (word &optional wordcount-handler)
"保存 WORD 词频.
@@ -212,42 +221,35 @@ non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式"
2. 如果 WORDCOUNT-HANDLER 是一个数值:那么这个数值直接作为词频保存。
3. 如果 WORDCOUNT-HANDLER 为其他值:词频不变.")
-;; ** Dcache 加词功能
-(cl-defgeneric pyim-dcache-insert-word (word code prepend)
- "将词条 WORD 插入到 dcache 中。
-
-如果 PREPEND 为 non-nil, 词条将放到已有词条的最前面。
-内部函数会根据 CODE 来确定插入对应的 hash key.")
-
-;; ** Dcache 升级功能
+;; ** Dcache 升级功能接口
(cl-defgeneric pyim-dcache-upgrade ()
"升级词库缓存.")
-;; ** Dcache 删词功能
-(cl-defgeneric pyim-dcache-delete-word (word)
- "将中文词条 WORD 从个人词库中删除")
+;; ** Dcache 排序功能接口
+(cl-defgeneric pyim-dcache-sort-words (words)
+ "对 WORDS 进行排序。"
+ words)
-;; ** Dcache 检索功能
-(cl-defgeneric pyim-dcache-get (code &optional from)
- "从 FROM 对应的 dcache 中搜索 CODE, 得到对应的词条.
+;; ** Dcache 保存功能接口
+(cl-defgeneric pyim-dcache-save-caches ()
+ "保存 dcache.
-当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个
-code 对应的中文词条了."
- ;; Fix compile warn
- (ignore code from)
- nil)
+将用户选择过的词生成的缓存和词频缓存的取值
+保存到它们对应的文件中.")
-(cl-defmethod pyim-dcache-get :before (_code &optional _from)
- (unless (featurep pyim-dcache-backend)
- (require pyim-dcache-backend)))
+;; ** Dcache 导出功能接口
+(cl-defgeneric pyim-dcache-export-words-and-counts (file &optional confirm ignore-counts)
+ "将个人词条以及词条对应的词频信息导出到文件 FILE.
-(cl-defgeneric pyim-dcache-search-word-code (word)
- "从 dcache 中搜索 WROD 对应的 code.")
+如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为
+non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式")
+
+(cl-defgeneric pyim-dcache-export-personal-words (file &optional confirm)
+ "将用户的个人词条导出为 pyim 词库文件.
+
+如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 non-nil,
+文件存在时将会提示用户是否覆盖,默认为覆盖模式。")
-;; ** Dcache 排序功能
-(cl-defgeneric pyim-dcache-sort-words (words)
- "对 WORDS 进行排序。"
- words)
;; * Footer
(provide 'pyim-dcache)
diff --git a/pyim-dhashcache.el b/pyim-dhashcache.el
index fc8b43cf06..f07cdf7811 100644
--- a/pyim-dhashcache.el
+++ b/pyim-dhashcache.el
@@ -79,6 +79,161 @@
(defvar pyim-dhashcache-update-iword2priority-p nil)
(defvar pyim-dhashcache-update-code2word-running-p nil)
+;; ** 初始化 dhashcache 相关函数
+(cl-defmethod pyim-dcache-init-variables
+ (&context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "初始化 dcache 缓存相关变量."
+ (when (and (not pyim-dhashcache-icode2word)
+ pyim-dcache-directory
+ (file-directory-p pyim-dcache-directory)
+ (directory-files pyim-dcache-directory nil "-backup-"))
+ (message "PYIM: 在 %S 目录中发现备份文件的存在,可能是词库缓存文件损坏导致,请抓紧检查处理!!!"
+ pyim-dcache-directory))
+ (pyim-dhashcache-init-count-and-priority-variables)
+ (pyim-dcache-init-variable pyim-dhashcache-code2word)
+ (pyim-dcache-init-variable pyim-dhashcache-word2code)
+ (pyim-dcache-init-variable pyim-dhashcache-shortcode2word)
+ (pyim-dcache-init-variable pyim-dhashcache-icode2word)
+ (pyim-dcache-init-variable pyim-dhashcache-ishortcode2word))
+
+(defun pyim-dhashcache-init-count-and-priority-variables ()
+ "初始化 count 相关的变量。"
+ (pyim-dcache-init-variable pyim-dhashcache-iword2count)
+ (pyim-dcache-init-variable pyim-dhashcache-iword2count-log)
+ (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-10-words)
+ (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-50-words)
+ (pyim-dcache-init-variable pyim-dhashcache-iword2priority))
+
+;; ** 从 dhashcache 搜索词条相关函数
+(cl-defmethod pyim-dcache-get
+ (code &context (pyim-dcache-backend (eql pyim-dhashcache))
+ &optional from)
+ "从 FROM 对应的 dcaches 中搜索 CODE, 得到对应的词条.
+
+当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个
+code 对应的中文词条了。
+
+如果 FROM 为 nil, 则默认搜索 `pyim-dhashcache-icode2word' 和
+`pyim-dhashcache-code2word' 两个 dcache."
+ (when code
+ (let* ((caches (mapcar (lambda (x)
+ (intern (concat "pyim-dhashcache-" (symbol-name x))))
+ (or (and from
+ (if (listp from)
+ from
+ (list from)))
+ '(icode2word code2word))))
+ result)
+ (dolist (cache caches)
+ (let* ((cache (ignore-errors (symbol-value cache)))
+ (value (and cache (gethash code cache))))
+ ;; 处理 iword2count.
+ (unless (listp value)
+ (setq value (list value)))
+ (when value
+ (setq result (append result value)))))
+ result)))
+
+;; ** 从 dhashcache 搜索代码相关函数
+(cl-defmethod pyim-dcache-search-word-code
+ (string &context (pyim-dcache-backend (eql pyim-dhashcache)))
+ (gethash string pyim-dhashcache-word2code))
+
+;; ** 给 dhashcache 添加词条相关函数
+(cl-defmethod pyim-dcache-insert-word
+ (word code prepend
+ &context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "将词条 WORD 插入到下面两个词库缓存中。
+
+1. `pyim-dhashcache-icode2word'
+2. `pyim-dhashcache-insert-word-into-ishortcode2word'."
+ (pyim-dhashcache-insert-word-into-icode2word word code prepend)
+ ;; NOTE: 保存词条到 icode2word 词库缓存的同时,也在 ishortcode2word 词库缓存中
+ ;; 临时写入一份,供当前 Emacs session 使用,但退出时 pyim 不会保存
+ ;; ishortcode2word 词库缓存到文件,因为下次启动 Emacs 的时候,ishortcode2word
+ ;; 词库缓存会从 icode2word 再次重建。
+ (pyim-dhashcache-insert-word-into-ishortcode2word word code prepend))
+
+(defmacro pyim-dhashcache-put (cache code &rest body)
+ "将 BODY 的返回值保存到 CACHE 对应的 CODE 中。
+
+注意事项:这个宏是一个指代宏,其中 orig-value 在这个宏中有特殊含
+义,代表原来 code 对应的取值。"
+ (declare (indent 0))
+ (let ((key (make-symbol "key"))
+ (table (make-symbol "table"))
+ (new-value (make-symbol "new-value")))
+ `(let* ((,key ,code)
+ (,table ,cache)
+ (orig-value (gethash ,key ,table))
+ ,new-value)
+ (setq ,new-value (progn ,@body))
+ (puthash ,key ,new-value ,table))))
+
+(defun pyim-dhashcache-insert-word-into-icode2word (word code prepend)
+ "将词条 WORD 插入到 icode2word 词库缓存 CODE 键对应的位置.
+
+默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放
+到已有词条的最前面。"
+ (pyim-dhashcache-put
+ pyim-dhashcache-icode2word code
+ (if prepend
+ `(,word ,@(remove word orig-value))
+ `(,@(remove word orig-value) ,word))))
+
+(defun pyim-dhashcache-insert-word-into-ishortcode2word (word code prepend)
+ "将词条 WORD 插入到 ishortcode2word 词库缓存 CODE 首字母字符串对应的位置.
+
+默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放
+到已有词条的最前面。"
+ (dolist (newcode (pyim-dhashcache-get-ishortcodes code))
+ (pyim-dhashcache-put
+ pyim-dhashcache-ishortcode2word
+ newcode
+ (if prepend
+ `(,word ,@(remove word orig-value))
+ `(,@(remove word orig-value) ,word)))))
+
+(defun pyim-dhashcache-get-ishortcodes (code)
+ "获取CODE 所有的简写 ishortcodes.
+
+比如: ni-hao -> (n-h)
+
+注意事项:这个函数用于全拼输入法。"
+ (when (and (> (length code) 0)
+ (not (pyim-string-match-p "/" code))
+ (not (pyim-string-match-p "[^a-z-]" code)))
+ (list (mapconcat
+ (lambda (x)
+ (substring x 0 1))
+ (split-string code "-") "-"))))
+
+;; ** 从 dhashcache 删除词条相关函数
+(cl-defmethod pyim-dcache-delete-word
+ (word &context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "将中文词条 WORD 从个人词库中删除"
+ (maphash
+ (lambda (key value)
+ (when (member word value)
+ (let ((new-value (remove word value)))
+ (if new-value
+ (puthash key new-value pyim-dhashcache-icode2word)
+ (remhash key pyim-dhashcache-icode2word)))))
+ pyim-dhashcache-icode2word)
+ (maphash
+ (lambda (key value)
+ (when (member word value)
+ (print value)
+ (let ((new-value (remove word value)))
+ (if new-value
+ (puthash key new-value pyim-dhashcache-ishortcode2word)
+ (remhash key pyim-dhashcache-ishortcode2word)))))
+ pyim-dhashcache-ishortcode2word)
+ (remhash word pyim-dhashcache-iword2count)
+ (remhash word pyim-dhashcache-iword2count-log)
+ (remhash word pyim-dhashcache-iword2priority))
+
+;; ** 更新 dhashcache 相关函数
(cl-defmethod pyim-dcache-update
(&context (pyim-dcache-backend (eql pyim-dhashcache)) &optional force)
"读取并加载所有相关词库 dcache.
@@ -92,23 +247,57 @@
(dicts-md5 (pyim-dcache-create-files-md5 dict-files)))
(pyim-dhashcache-update-code2word dict-files dicts-md5 force))))
-(cl-defmethod pyim-dcache-sort-words
- (words-list &context (pyim-dcache-backend (eql pyim-dhashcache)))
- "对 WORDS-LIST 排序"
- (let ((iword2count pyim-dhashcache-iword2count)
- (iword2priority pyim-dhashcache-iword2priority))
- (sort words-list
- (lambda (a b)
- (let ((p1 (gethash a iword2priority))
- (p2 (gethash b iword2priority)))
- (cond
- ((and (listp p1)
- (listp p2)
- (not (equal p1 p2)))
- (pyim-numbers> p1 p2))
- (t (let ((n1 (or (gethash a iword2count) 0))
- (n2 (or (gethash b iword2count) 0)))
- (> n1 n2)))))))))
+(defun pyim-dhashcache-update-iword2priority (&optional force)
+ "更新词条优先级表,如果 FORCE 为真,强制更新。"
+ (interactive)
+ (when (or force (not pyim-dhashcache-update-iword2priority-p))
+ ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
+ ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
+ (setq pyim-dhashcache-update-iword2priority-p t)
+ (async-start
+ `(lambda ()
+ ,@(pyim-dhashcache-async-inject-variables)
+ (require 'pyim-dhashcache)
+ (pyim-dhashcache-init-count-and-priority-variables)
+ (maphash
+ (lambda (key value)
+ (puthash key
+ (pyim-dhashcache-calculate-priority
+ (pyim-dhashcache-get-counts-from-log
+ value))
+ pyim-dhashcache-iword2priority))
+ pyim-dhashcache-iword2count-log)
+ (pyim-dcache-save-variable
+ 'pyim-dhashcache-iword2priority
+ pyim-dhashcache-iword2priority)
+ nil)
+ (lambda (_)
+ (pyim-dcache-reload-variable pyim-dhashcache-iword2priority)))))
+
+(defun pyim-dhashcache-async-inject-variables ()
+ "pyim's async-inject-variables."
+ (list (async-inject-variables "^load-path$")
+ (async-inject-variables "^exec-path$")
+ (async-inject-variables "^pyim-.+?directory$")))
+
+(defun pyim-dhashcache-calculate-priority (counts-info)
+ "根据 COUNTS-INFO 计算优先级(优先级是多个数字组成的一个列表),
+用于对词条进行排序。COUNTS-INFO 是一个 alist, 其结构类似:
+
+ ((day n1 n2 n3 ...))
+
+其中 (n1 n2 n3 ...) 代表从当前日期逐日倒推,每日 count 所组成的列表。"
+ (mapcar (lambda (x)
+ (let* ((label (car x))
+ (plist (cdr x))
+ (weights (plist-get plist :weights))
+ (factor (plist-get plist :factor)))
+ (round (* (apply #'+ (cl-mapcar (lambda (a b)
+ (* (or a 0) b))
+ (cdr (assoc label counts-info))
+ weights))
+ factor))))
+ pyim-dhashcache-count-types))
(defun pyim-dhashcache-get-counts-from-log (log-info &optional time)
"从 LOG-INFO 中获取所有的 count 值。
@@ -133,64 +322,37 @@
`(,label ,@(reverse output))))
pyim-dhashcache-count-types))
-(defun pyim-dhashcache-calculate-priority (counts-info)
- "根据 COUNTS-INFO 计算优先级(优先级是多个数字组成的一个列表),
-用于对词条进行排序。COUNTS-INFO 是一个 alist, 其结构类似:
-
- ((day n1 n2 n3 ...))
-
-其中 (n1 n2 n3 ...) 代表从当前日期逐日倒推,每日 count 所组成的列表。"
- (mapcar (lambda (x)
- (let* ((label (car x))
- (plist (cdr x))
- (weights (plist-get plist :weights))
- (factor (plist-get plist :factor)))
- (round (* (apply #'+ (cl-mapcar (lambda (a b)
- (* (or a 0) b))
- (cdr (assoc label counts-info))
- weights))
- factor))))
- pyim-dhashcache-count-types))
-
-(defun pyim-dhashcache-get-shortcodes (code)
- "获取 CODE 所有的 shortcodes.
-
-比如:wubi/aaaa -> (wubi/aaa wubi/aa)
-
-注意事项:这个函数目前只用于五笔等型码输入法,不用于拼音输入法,
-因为拼音输入法词库太大,这样处理之后,会生成一个特别大的哈希表,
-占用太多内存资源,拼音输入法使用 ishortcode 机制。"
- (when (and (pyim-string-match-p "/" code)
- (not (pyim-string-match-p "-" code)))
- (let* ((x (split-string code "/"))
- (prefix (concat (nth 0 x) "/"))
- (code1 (nth 1 x))
- (n (length code1))
- results)
- (dotimes (i n)
- (when (> i 1)
- (push (concat prefix (substring code1 0 i)) results)))
- results)))
-
-(defun pyim-dhashcache-get-ishortcodes (code)
- "获取CODE 所有的简写 ishortcodes.
-
-比如: ni-hao -> (n-h)
+(defun pyim-dhashcache-update-personal-words (&optional force)
+ (pyim-dhashcache-update-icode2word force))
-注意事项:这个函数用于全拼输入法。"
- (when (and (> (length code) 0)
- (not (pyim-string-match-p "/" code))
- (not (pyim-string-match-p "[^a-z-]" code)))
- (list (mapconcat
- (lambda (x)
- (substring x 0 1))
- (split-string code "-") "-"))))
+(defun pyim-dhashcache-update-icode2word (&optional force)
+ "对 personal 缓存中的词条进行排序,加载排序后的结果.
-(defun pyim-dhashcache-async-inject-variables ()
- "pyim's async-inject-variables."
- (list (async-inject-variables "^load-path$")
- (async-inject-variables "^exec-path$")
- (async-inject-variables "^pyim-.+?directory$")))
+在这个过程中使用了 `pyim-dhashcache-iword2count' 中记录的词频信息。
+如果 FORCE 为真,强制排序。"
+ (interactive)
+ (when (or force (not pyim-dhashcache-update-icode2word-p))
+ ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
+ ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
+ (setq pyim-dhashcache-update-icode2word-p t)
+ (async-start
+ `(lambda ()
+ ,@(pyim-dhashcache-async-inject-variables)
+ (require 'pyim-dhashcache)
+ (pyim-dcache-init-variable pyim-dhashcache-icode2word)
+ (pyim-dhashcache-init-count-and-priority-variables)
+ (maphash
+ (lambda (key value)
+ (puthash key (pyim-dcache-sort-words value)
+ pyim-dhashcache-icode2word))
+ pyim-dhashcache-icode2word)
+ (pyim-dcache-save-variable
+ 'pyim-dhashcache-icode2word
+ pyim-dhashcache-icode2word)
+ nil)
+ (lambda (_)
+ (pyim-dcache-reload-variable pyim-dhashcache-icode2word)
+ (pyim-dhashcache-update-ishortcode2word force)))))
(defun pyim-dhashcache-update-ishortcode2word (&optional force)
"读取 `pyim-dhashcache-icode2word' 中的词库,创建 *简拼* 缓存,然后加载这个缓存.
@@ -233,54 +395,62 @@
ishortcode2word)
ishortcode2word))
-(defun pyim-dhashcache-update-shortcode2word (&optional force)
- "使用 `pyim-dhashcache-code2word' 中的词条,创建简写 code 词库缓存并加载.
+(defun pyim-dhashcache-update-code2word (dict-files dicts-md5 &optional force)
+ "读取并加载词库.
-如果 FORCE 为真,强制运行。"
+读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。
+
+如果 FORCE 为真,强制加载。"
(interactive)
- (when (or force (not pyim-dhashcache-update-shortcode2word-p))
- ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
- ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
- (setq pyim-dhashcache-update-shortcode2word-p t)
- (async-start
- `(lambda ()
- ,@(pyim-dhashcache-async-inject-variables)
- (require 'pyim-dhashcache)
- (pyim-dcache-init-variable pyim-dhashcache-code2word)
- (pyim-dhashcache-init-count-and-priority-variables)
- (pyim-dcache-save-variable
- 'pyim-dhashcache-shortcode2word
- (pyim-dhashcache-update-shortcode2word-1
- pyim-dhashcache-code2word)))
- (lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-shortcode2word)))))
+ (let* ((code2word-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word))
+ (word2code-file (pyim-dhashcache-get-path 'pyim-dhashcache-word2code))
+ (code2word-md5-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word-md5)))
+ (when (or force (and (not (equal dicts-md5 (pyim-dcache-get-value-from-file code2word-md5-file)))
+ (not pyim-dhashcache-update-code2word-running-p)))
+ (setq pyim-dhashcache-update-code2word-running-p t)
+ ;; use hashtable
+ (async-start
+ `(lambda ()
+ ,@(pyim-dhashcache-async-inject-variables)
+ (require 'pyim-dhashcache)
+ (let ((dcache (pyim-dhashcache-generate-dcache-file ',dict-files ,code2word-file)))
+ (pyim-dhashcache-generate-word2code-dcache-file dcache ,word2code-file))
+ (pyim-dcache-save-value-to-file ',dicts-md5 ,code2word-md5-file))
+ (lambda (_)
+ (pyim-dcache-reload-variable pyim-dhashcache-code2word)
+ (pyim-dcache-reload-variable pyim-dhashcache-word2code)
+ (pyim-dhashcache-update-shortcode2word force)
+ (setq pyim-dhashcache-update-code2word-running-p nil))))))
-(defun pyim-dhashcache-update-shortcode2word-1 (code2word)
- "`pyim-dhashcache-update-shortcode2word' 的内部函数"
- (let ((shortcode2word (make-hash-table :test #'equal)))
- (maphash
- (lambda (key value)
- (dolist (x (pyim-dhashcache-get-shortcodes key))
- (puthash x
- (mapcar
- (lambda (word)
- ;; 这个地方的代码用于实现五笔 code 自动提示功能,
- ;; 比如输入 'aa' 后得到选词框:
- ;; ----------------------
- ;; | 1. 莁aa 2.匶wv ... |
- ;; ----------------------
- (if (get-text-property 0 :comment word)
- word
- (propertize word :comment (substring key (length x)))))
- (delete-dups `(,@(gethash x shortcode2word) ,@value)))
- shortcode2word)))
- code2word)
- (maphash
- (lambda (key value)
- (puthash key (pyim-dcache-sort-words value)
- shortcode2word))
- shortcode2word)
- shortcode2word))
+(defun pyim-dhashcache-generate-word2code-dcache-file (dcache file)
+ "从 DCACHE 生成一个 word -> code 的反向查询表.
+DCACHE 是一个 code -> words 的 hashtable.
+并将生成的表保存到 FILE 中."
+ (when (hash-table-p dcache)
+ (let ((hashtable (make-hash-table :size 1000000 :test #'equal)))
+ (maphash
+ (lambda (code words)
+ ;; 这里主要考虑五笔仓颉等形码输入法,也就是 code-prefix 中包含 "/" 的输
+ ;; 入法,全拼输入法反查功能主要使用 pymap 实现,不使用这个表。
+ (when (pyim-string-match-p "/" code)
+ (dolist (word words)
+ (let ((value (gethash word hashtable))
+ ;; NOTE: 这里使用 `cl-copy-seq', 可以让保存的文件内容类似:
+ ;;
+ ;; "呵" ("he" "a")
+ ;;
+ ;; 而不是:
+ ;;
+ ;; "呵" (#9="he" #2#)
+ ;;
+ (code (cl-copy-seq code)))
+ (puthash word
+ (if value
+ `(,code ,@value)
+ (list code))
+ hashtable)))))
+ dcache)
+ (pyim-dcache-save-value-to-file hashtable file))))
(defun pyim-dhashcache-get-path (variable)
"获取保存 VARIABLE 取值的文件的路径."
@@ -318,263 +488,76 @@ pyim 使用的词库文件是简单的文本文件,编码 *强制* 为 \\='utf
(pyim-dcache-save-value-to-file hashtable dcache-file)
hashtable))
-(defun pyim-dhashcache-generate-word2code-dcache-file (dcache file)
- "从 DCACHE 生成一个 word -> code 的反向查询表.
-DCACHE 是一个 code -> words 的 hashtable.
-并将生成的表保存到 FILE 中."
- (when (hash-table-p dcache)
- (let ((hashtable (make-hash-table :size 1000000 :test #'equal)))
- (maphash
- (lambda (code words)
- ;; 这里主要考虑五笔仓颉等形码输入法,也就是 code-prefix 中包含 "/" 的输
- ;; 入法,全拼输入法反查功能主要使用 pymap 实现,不使用这个表。
- (when (pyim-string-match-p "/" code)
- (dolist (word words)
- (let ((value (gethash word hashtable))
- ;; NOTE: 这里使用 `cl-copy-seq', 可以让保存的文件内容类似:
- ;;
- ;; "呵" ("he" "a")
- ;;
- ;; 而不是:
- ;;
- ;; "呵" (#9="he" #2#)
- ;;
- (code (cl-copy-seq code)))
- (puthash word
- (if value
- `(,code ,@value)
- (list code))
- hashtable)))))
- dcache)
- (pyim-dcache-save-value-to-file hashtable file))))
-
-(defun pyim-dhashcache-update-code2word (dict-files dicts-md5 &optional force)
- "读取并加载词库.
-
-读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。
-
-如果 FORCE 为真,强制加载。"
- (interactive)
- (let* ((code2word-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word))
- (word2code-file (pyim-dhashcache-get-path 'pyim-dhashcache-word2code))
- (code2word-md5-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word-md5)))
- (when (or force (and (not (equal dicts-md5 (pyim-dcache-get-value-from-file code2word-md5-file)))
- (not pyim-dhashcache-update-code2word-running-p)))
- (setq pyim-dhashcache-update-code2word-running-p t)
- ;; use hashtable
- (async-start
- `(lambda ()
- ,@(pyim-dhashcache-async-inject-variables)
- (require 'pyim-dhashcache)
- (let ((dcache (pyim-dhashcache-generate-dcache-file ',dict-files ,code2word-file)))
- (pyim-dhashcache-generate-word2code-dcache-file dcache ,word2code-file))
- (pyim-dcache-save-value-to-file ',dicts-md5 ,code2word-md5-file))
- (lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-code2word)
- (pyim-dcache-reload-variable pyim-dhashcache-word2code)
- (pyim-dhashcache-update-shortcode2word force)
- (setq pyim-dhashcache-update-code2word-running-p nil))))))
-
-(defun pyim-dhashcache-export (dcache file &optional confirm)
- "将一个 pyim DCACHE 导出为文件 FILE.
-
-如果 CONFIRM 为 non-nil,文件存在时将会提示用户是否覆盖,
-默认为覆盖模式"
- (with-temp-buffer
- (insert ";;; -*- coding: utf-8-unix -*-\n")
- (maphash
- (lambda (key value)
- (let ((value (cl-remove-if
- (lambda (x)
- ;; 如果某个词条的 text 属性 :noexport 设置为 t, 在导出的
- ;; 时候自动忽略这个词条。
- (and (stringp x)
- (get-text-property 0 :noexport x)))
- (if (listp value)
- value
- (list value)))))
- (when value
- (insert (format "%s %s\n" key (mapconcat #'identity value " "))))))
- dcache)
- (pyim-dcache-write-file file confirm)))
-
-(cl-defmethod pyim-dcache-get
- (code &context (pyim-dcache-backend (eql pyim-dhashcache))
- &optional from)
- "从 FROM 对应的 dcaches 中搜索 CODE, 得到对应的词条.
-
-当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个
-code 对应的中文词条了。
-
-如果 FROM 为 nil, 则默认搜索 `pyim-dhashcache-icode2word' 和
-`pyim-dhashcache-code2word' 两个 dcache."
- (when code
- (let* ((caches (mapcar (lambda (x)
- (intern (concat "pyim-dhashcache-" (symbol-name x))))
- (or (and from
- (if (listp from)
- from
- (list from)))
- '(icode2word code2word))))
- result)
- (dolist (cache caches)
- (let* ((cache (ignore-errors (symbol-value cache)))
- (value (and cache (gethash code cache))))
- ;; 处理 iword2count.
- (unless (listp value)
- (setq value (list value)))
- (when value
- (setq result (append result value)))))
- result)))
-
-(defun pyim-dhashcache-update-icode2word (&optional force)
- "对 personal 缓存中的词条进行排序,加载排序后的结果.
+(defun pyim-dhashcache-update-shortcode2word (&optional force)
+ "使用 `pyim-dhashcache-code2word' 中的词条,创建简写 code 词库缓存并加载.
-在这个过程中使用了 `pyim-dhashcache-iword2count' 中记录的词频信息。
-如果 FORCE 为真,强制排序。"
+如果 FORCE 为真,强制运行。"
(interactive)
- (when (or force (not pyim-dhashcache-update-icode2word-p))
+ (when (or force (not pyim-dhashcache-update-shortcode2word-p))
;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
- (setq pyim-dhashcache-update-icode2word-p t)
+ (setq pyim-dhashcache-update-shortcode2word-p t)
(async-start
`(lambda ()
,@(pyim-dhashcache-async-inject-variables)
(require 'pyim-dhashcache)
- (pyim-dcache-init-variable pyim-dhashcache-icode2word)
+ (pyim-dcache-init-variable pyim-dhashcache-code2word)
(pyim-dhashcache-init-count-and-priority-variables)
- (maphash
- (lambda (key value)
- (puthash key (pyim-dcache-sort-words value)
- pyim-dhashcache-icode2word))
- pyim-dhashcache-icode2word)
(pyim-dcache-save-variable
- 'pyim-dhashcache-icode2word
- pyim-dhashcache-icode2word)
- nil)
+ 'pyim-dhashcache-shortcode2word
+ (pyim-dhashcache-update-shortcode2word-1
+ pyim-dhashcache-code2word)))
(lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-icode2word)
- (pyim-dhashcache-update-ishortcode2word force)))))
-
-(cl-defmethod pyim-dcache-upgrade (&context (pyim-dcache-backend (eql pyim-dhashcache)))
- "升级词库缓存.
-
-当前已有的功能:
-1. 基于 :code-prefix-history 信息,升级为新的 code-prefix。"
- (pyim-dhashcache-upgrade-icode2word))
-
-(defun pyim-dhashcache-upgrade-icode2word ()
- "升级 icode2word 缓存。"
- (let ((delete-old-key-p (yes-or-no-p "Delete old key after upgrade? "))
- (ruler-list (delete-dups
- (remove nil
- (mapcar
- (lambda (scheme)
- (let ((code-prefix (plist-get (cdr scheme) :code-prefix))
- (code-prefix-history (plist-get (cdr scheme) :code-prefix-history)))
- (when code-prefix-history
- (cons code-prefix-history code-prefix))))
- pyim-schemes)))))
- (dolist (ruler ruler-list)
- (let ((old-prefix-list (car ruler))
- (new-prefix (cdr ruler)))
- (dolist (old-prefix old-prefix-list)
- (maphash
- (lambda (key _value)
- (when (string-prefix-p old-prefix key)
- (let* ((key-words (gethash key pyim-dhashcache-icode2word))
- (new-key (concat new-prefix (string-remove-prefix old-prefix key)))
- (new-key-words (gethash new-key pyim-dhashcache-icode2word))
- (merged-value (delete-dups `(,@new-key-words ,@key-words))))
- (puthash new-key merged-value pyim-dhashcache-icode2word)
- (message "PYIM icode2word upgrade: %S %S -> %S %S" key key-words new-key merged-value)
- (when delete-old-key-p
- (remhash key pyim-dhashcache-icode2word)
- (message "PYIM icode2word upgrade: %S has been deleted." key)))))
- pyim-dhashcache-icode2word))))))
-
-(defun pyim-dhashcache-update-personal-words (&optional force)
- (pyim-dhashcache-update-icode2word force))
-
-(cl-defmethod pyim-dcache-init-variables
- (&context (pyim-dcache-backend (eql pyim-dhashcache)))
- "初始化 dcache 缓存相关变量."
- (when (and (not pyim-dhashcache-icode2word)
- pyim-dcache-directory
- (file-directory-p pyim-dcache-directory)
- (directory-files pyim-dcache-directory nil "-backup-"))
- (message "PYIM: 在 %S 目录中发现备份文件的存在,可能是词库缓存文件损坏导致,请抓紧检查处理!!!"
- pyim-dcache-directory))
- (pyim-dhashcache-init-count-and-priority-variables)
- (pyim-dcache-init-variable pyim-dhashcache-code2word)
- (pyim-dcache-init-variable pyim-dhashcache-word2code)
- (pyim-dcache-init-variable pyim-dhashcache-shortcode2word)
- (pyim-dcache-init-variable pyim-dhashcache-icode2word)
- (pyim-dcache-init-variable pyim-dhashcache-ishortcode2word))
-
-(defun pyim-dhashcache-init-count-and-priority-variables ()
- "初始化 count 相关的变量。"
- (pyim-dcache-init-variable pyim-dhashcache-iword2count)
- (pyim-dcache-init-variable pyim-dhashcache-iword2count-log)
- (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-10-words)
- (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-50-words)
- (pyim-dcache-init-variable pyim-dhashcache-iword2priority))
-
-(cl-defmethod pyim-dcache-save-caches
- (&context (pyim-dcache-backend (eql pyim-dhashcache)))
- (pyim-dhashcache-save-personal-dcache-to-file))
+ (pyim-dcache-reload-variable pyim-dhashcache-shortcode2word)))))
-(defun pyim-dhashcache-save-personal-dcache-to-file ()
- ;; 用户选择过的词
- (pyim-dcache-save-variable
- 'pyim-dhashcache-icode2word
- pyim-dhashcache-icode2word 0.8)
- ;; 词条总 count
- (pyim-dcache-save-variable
- 'pyim-dhashcache-iword2count
- pyim-dhashcache-iword2count 0.8)
- ;; 词条 count 日志
- (pyim-dcache-save-variable
- 'pyim-dhashcache-iword2count-log
- pyim-dhashcache-iword2count-log 0.8)
- ;; 词条优先级
- (pyim-dcache-save-variable
- 'pyim-dhashcache-iword2priority
- pyim-dhashcache-iword2priority 0.8))
+(defun pyim-dhashcache-update-shortcode2word-1 (code2word)
+ "`pyim-dhashcache-update-shortcode2word' 的内部函数"
+ (let ((shortcode2word (make-hash-table :test #'equal)))
+ (maphash
+ (lambda (key value)
+ (dolist (x (pyim-dhashcache-get-shortcodes key))
+ (puthash x
+ (mapcar
+ (lambda (word)
+ ;; 这个地方的代码用于实现五笔 code 自动提示功能,
+ ;; 比如输入 'aa' 后得到选词框:
+ ;; ----------------------
+ ;; | 1. 莁aa 2.匶wv ... |
+ ;; ----------------------
+ (if (get-text-property 0 :comment word)
+ word
+ (propertize word :comment (substring key (length x)))))
+ (delete-dups `(,@(gethash x shortcode2word) ,@value)))
+ shortcode2word)))
+ code2word)
+ (maphash
+ (lambda (key value)
+ (puthash key (pyim-dcache-sort-words value)
+ shortcode2word))
+ shortcode2word)
+ shortcode2word))
-(defmacro pyim-dhashcache-put (cache code &rest body)
- "将 BODY 的返回值保存到 CACHE 对应的 CODE 中。
+(defun pyim-dhashcache-get-shortcodes (code)
+ "获取 CODE 所有的 shortcodes.
-注意事项:这个宏是一个指代宏,其中 orig-value 在这个宏中有特殊含
-义,代表原来 code 对应的取值。"
- (declare (indent 0))
- (let ((key (make-symbol "key"))
- (table (make-symbol "table"))
- (new-value (make-symbol "new-value")))
- `(let* ((,key ,code)
- (,table ,cache)
- (orig-value (gethash ,key ,table))
- ,new-value)
- (setq ,new-value (progn ,@body))
- (puthash ,key ,new-value ,table))))
+比如:wubi/aaaa -> (wubi/aaa wubi/aa)
-(defun pyim-dhashcache-update-iword2count-recent (word n hash-table)
- (let (words-need-remove)
- (pyim-dhashcache-put
- hash-table :all-words
- (setq orig-value (remove word orig-value))
- (push word orig-value)
- (if (<= (length orig-value) n)
- orig-value
- (setq words-need-remove (nthcdr n orig-value))
- (cl-subseq orig-value 0 n)))
- (dolist (w words-need-remove)
- (remhash w hash-table))
- (pyim-dhashcache-put
- hash-table word
- (+ (or orig-value 0) 1))
- hash-table))
+注意事项:这个函数目前只用于五笔等型码输入法,不用于拼音输入法,
+因为拼音输入法词库太大,这样处理之后,会生成一个特别大的哈希表,
+占用太多内存资源,拼音输入法使用 ishortcode 机制。"
+ (when (and (pyim-string-match-p "/" code)
+ (not (pyim-string-match-p "-" code)))
+ (let* ((x (split-string code "/"))
+ (prefix (concat (nth 0 x) "/"))
+ (code1 (nth 1 x))
+ (n (length code1))
+ results)
+ (dotimes (i n)
+ (when (> i 1)
+ (push (concat prefix (substring code1 0 i)) results)))
+ results)))
+;; ** 更新 dhashcache 词条计数
(cl-defmethod pyim-dcache-update-wordcount
(word &context (pyim-dcache-backend (eql pyim-dhashcache))
&optional wordcount-handler)
@@ -625,99 +608,105 @@ code 对应的中文词条了。
(pyim-dhashcache-get-counts-from-log
(gethash word pyim-dhashcache-iword2count-log)))))
-(defun pyim-dhashcache-update-iword2priority (&optional force)
- "更新词条优先级表,如果 FORCE 为真,强制更新。"
- (interactive)
- (when (or force (not pyim-dhashcache-update-iword2priority-p))
- ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死,
- ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。
- (setq pyim-dhashcache-update-iword2priority-p t)
- (async-start
- `(lambda ()
- ,@(pyim-dhashcache-async-inject-variables)
- (require 'pyim-dhashcache)
- (pyim-dhashcache-init-count-and-priority-variables)
- (maphash
- (lambda (key value)
- (puthash key
- (pyim-dhashcache-calculate-priority
- (pyim-dhashcache-get-counts-from-log
- value))
- pyim-dhashcache-iword2priority))
- pyim-dhashcache-iword2count-log)
- (pyim-dcache-save-variable
- 'pyim-dhashcache-iword2priority
- pyim-dhashcache-iword2priority)
- nil)
- (lambda (_)
- (pyim-dcache-reload-variable pyim-dhashcache-iword2priority)))))
-
-(cl-defmethod pyim-dcache-delete-word
- (word &context (pyim-dcache-backend (eql pyim-dhashcache)))
- "将中文词条 WORD 从个人词库中删除"
- (maphash
- (lambda (key value)
- (when (member word value)
- (let ((new-value (remove word value)))
- (if new-value
- (puthash key new-value pyim-dhashcache-icode2word)
- (remhash key pyim-dhashcache-icode2word)))))
- pyim-dhashcache-icode2word)
- (maphash
- (lambda (key value)
- (when (member word value)
- (print value)
- (let ((new-value (remove word value)))
- (if new-value
- (puthash key new-value pyim-dhashcache-ishortcode2word)
- (remhash key pyim-dhashcache-ishortcode2word)))))
- pyim-dhashcache-ishortcode2word)
- (remhash word pyim-dhashcache-iword2count)
- (remhash word pyim-dhashcache-iword2count-log)
- (remhash word pyim-dhashcache-iword2priority))
-
-(cl-defmethod pyim-dcache-insert-word
- (word code prepend
- &context (pyim-dcache-backend (eql pyim-dhashcache)))
- "将词条 WORD 插入到下面两个词库缓存中。
+;; Record one more use of WORD in HASH-TABLE, which tracks the N most
+;; recently used words.  The key :all-words holds the list of tracked
+;; words, newest first; every tracked word is also a key whose value is
+;; its use count.  Returns HASH-TABLE.
+(defun pyim-dhashcache-update-iword2count-recent (word n hash-table)
+ (let (words-need-remove)
+ ;; Inside `pyim-dhashcache-put' the symbol `orig-value' is bound to the
+ ;; value currently stored under the given key; the value of the last
+ ;; body form is stored back.
+ (pyim-dhashcache-put
+ hash-table :all-words
+ ;; Move WORD to the front of the list.
+ (setq orig-value (remove word orig-value))
+ (push word orig-value)
+ (if (<= (length orig-value) n)
+ orig-value
+ ;; More than N words: remember the overflow and keep the first N.
+ (setq words-need-remove (nthcdr n orig-value))
+ (cl-subseq orig-value 0 n)))
+ ;; Drop the counters of the words that fell off the list.
+ (dolist (w words-need-remove)
+ (remhash w hash-table))
+ ;; Increment WORD's counter; a missing counter counts as 0.
+ (pyim-dhashcache-put
+ hash-table word
+ (+ (or orig-value 0) 1))
+ hash-table))
-1. `pyim-dhashcache-icode2word'
-2. `pyim-dhashcache-insert-word-into-ishortcode2word'."
- (pyim-dhashcache-insert-word-into-icode2word word code prepend)
- ;; NOTE: 保存词条到 icode2word 词库缓存的同时,也在 ishortcode2word 词库缓存中
- ;; 临时写入一份,供当前 Emacs session 使用,但退出时 pyim 不会保存
- ;; ishortcode2word 词库缓存到文件,因为下次启动 Emacs 的时候,ishortcode2word
- ;; 词库缓存会从 icode2word 再次重建。
- (pyim-dhashcache-insert-word-into-ishortcode2word word code prepend))
+;; ** 根据 dhashcache 信息对词条进行排序
+(cl-defmethod pyim-dcache-sort-words
+  (words-list &context (pyim-dcache-backend (eql pyim-dhashcache)))
+  "Sort WORDS-LIST destructively, most preferred word first.
+
+Two words are compared by their entries in
+`pyim-dhashcache-iword2priority' (using `pyim-numbers>') when both
+entries are lists and differ; otherwise they are compared by their
+counts in `pyim-dhashcache-iword2count', a missing count being 0."
+  (let ((counts pyim-dhashcache-iword2count)
+        (priorities pyim-dhashcache-iword2priority))
+    (sort words-list
+          (lambda (a b)
+            (let ((prio-a (gethash a priorities))
+                  (prio-b (gethash b priorities)))
+              (if (and (listp prio-a)
+                       (listp prio-b)
+                       (not (equal prio-a prio-b)))
+                  (pyim-numbers> prio-a prio-b)
+                ;; Priorities unusable or equal: fall back to raw counts.
+                (> (or (gethash a counts) 0)
+                   (or (gethash b counts) 0))))))))
-(defun pyim-dhashcache-insert-word-into-icode2word (word code prepend)
- "将词条 WORD 插入到 icode2word 词库缓存 CODE 键对应的位置.
+;; ** 升级 dhashcache 相关函数
+(cl-defmethod pyim-dcache-upgrade
+ (&context (pyim-dcache-backend (eql pyim-dhashcache)))
+ "Upgrade the dictionary caches of the dhashcache backend.
-默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放
-到已有词条的最前面。"
- (pyim-dhashcache-put
- pyim-dhashcache-icode2word code
- (if prepend
- `(,word ,@(remove word orig-value))
- `(,@(remove word orig-value) ,word))))
+Currently implemented:
+1. Rewrite keys to the new code-prefix, based on :code-prefix-history."
+ (pyim-dhashcache-upgrade-icode2word))
-(defun pyim-dhashcache-insert-word-into-ishortcode2word (word code prepend)
- "将词条 WORD 插入到 ishortcode2word 词库缓存 CODE 首字母字符串对应的位置.
+(defun pyim-dhashcache-upgrade-icode2word ()
+  "Rewrite the keys of `pyim-dhashcache-icode2word' to the current code prefixes.
+
+Every scheme in `pyim-schemes' that has a :code-prefix-history yields
+a rule (OLD-PREFIXES . NEW-PREFIX).  For each key that starts with one
+of the old prefixes, its words are merged (words already stored under
+the new key first, duplicates removed) into the entry whose key has
+the old prefix replaced by NEW-PREFIX.
+
+The user is asked once whether the old keys should be deleted after
+the upgrade; the question is skipped when no scheme needs upgrading."
+  (let* ((ruler-list
+          (delete-dups
+           (delq nil
+                 (mapcar
+                  (lambda (scheme)
+                    (let ((code-prefix (plist-get (cdr scheme) :code-prefix))
+                          (code-prefix-history
+                           (plist-get (cdr scheme) :code-prefix-history)))
+                      (when code-prefix-history
+                        (cons code-prefix-history code-prefix))))
+                  pyim-schemes))))
+         (delete-old-key-p
+          (and ruler-list
+               (yes-or-no-p "Delete old key after upgrade? "))))
+    (dolist (ruler ruler-list)
+      (let ((old-prefix-list (car ruler))
+            (new-prefix (cdr ruler)))
+        (dolist (old-prefix old-prefix-list)
+          (let (old-keys)
+            ;; Snapshot the affected keys first: adding entries to a hash
+            ;; table while `maphash' walks it is undefined, and freshly
+            ;; added keys could otherwise be upgraded a second time.
+            (maphash
+             (lambda (key _value)
+               (when (string-prefix-p old-prefix key)
+                 (push key old-keys)))
+             pyim-dhashcache-icode2word)
+            (dolist (key (nreverse old-keys))
+              (let* ((key-words (gethash key pyim-dhashcache-icode2word))
+                     (new-key (concat new-prefix
+                                      (string-remove-prefix old-prefix key)))
+                     (new-key-words (gethash new-key pyim-dhashcache-icode2word))
+                     (merged-value (delete-dups `(,@new-key-words ,@key-words))))
+                ;; A key that maps onto itself needs no work, and deleting
+                ;; it afterwards would lose its words.
+                (unless (equal new-key key)
+                  (puthash new-key merged-value pyim-dhashcache-icode2word)
+                  (message "PYIM icode2word upgrade: %S %S -> %S %S"
+                           key key-words new-key merged-value)
+                  (when delete-old-key-p
+                    (remhash key pyim-dhashcache-icode2word)
+                    (message "PYIM icode2word upgrade: %S has been deleted."
+                             key)))))))))))
-默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放
-到已有词条的最前面。"
- (dolist (newcode (pyim-dhashcache-get-ishortcodes code))
- (pyim-dhashcache-put
- pyim-dhashcache-ishortcode2word
- newcode
- (if prepend
- `(,word ,@(remove word orig-value))
- `(,@(remove word orig-value) ,word)))))
+;; ** 保存 dhashcache 相关函数
+;; dhashcache implementation of `pyim-dcache-save-caches': delegates to
+;; `pyim-dhashcache-save-personal-dcache-to-file'.
+(cl-defmethod pyim-dcache-save-caches
+ (&context (pyim-dcache-backend (eql pyim-dhashcache)))
+ (pyim-dhashcache-save-personal-dcache-to-file))
-(cl-defmethod pyim-dcache-search-word-code
- (string &context (pyim-dcache-backend (eql pyim-dhashcache)))
- (gethash string pyim-dhashcache-word2code))
+(defun pyim-dhashcache-save-personal-dcache-to-file ()
+  "Save the personal dhashcache tables with `pyim-dcache-save-variable'.
+
+The tables are saved in this order, each with 0.8 as third argument:
+the words chosen by the user, the total count per word, the count log
+per word, and the priority per word.  The value of the last save call
+is returned."
+  (let (result)
+    (dolist (variable '(pyim-dhashcache-icode2word
+                        pyim-dhashcache-iword2count
+                        pyim-dhashcache-iword2count-log
+                        pyim-dhashcache-iword2priority)
+                      result)
+      (setq result
+            (pyim-dcache-save-variable variable (symbol-value variable) 0.8)))))
+;; ** 导出相关函数
(cl-defmethod pyim-dcache-export-personal-words
(file &context (pyim-dcache-backend (eql pyim-dhashcache))
&optional confirm)
@@ -725,6 +714,29 @@ code 对应的中文词条了。
(pyim-dcache-init-variables)
(pyim-dhashcache-export pyim-dhashcache-icode2word file confirm))
+(defun pyim-dhashcache-export (dcache file &optional confirm)
+  "Export the pyim hash table DCACHE to FILE.
+
+Each entry becomes one line \"KEY WORD1 WORD2 ...\" below a coding
+cookie line.  Strings whose text property :noexport is non-nil at
+position 0 are left out, and entries without any remaining word are
+not written.  CONFIRM is handed to `pyim-dcache-write-file', which
+writes the buffer to FILE."
+  (with-temp-buffer
+    (insert ";;; -*- coding: utf-8-unix -*-\n")
+    (maphash
+     (lambda (code entry)
+       (let* ((candidates (if (listp entry) entry (list entry)))
+              (exportable
+               (cl-remove-if
+                (lambda (item)
+                  (and (stringp item)
+                       (get-text-property 0 :noexport item)))
+                candidates)))
+         (when exportable
+           (insert (format "%s %s\n"
+                           code
+                           (mapconcat #'identity exportable " "))))))
+     dcache)
+    (pyim-dcache-write-file file confirm)))
+
(cl-defmethod pyim-dcache-export-words-and-counts
(file &context (pyim-dcache-backend (eql pyim-dhashcache))
&optional confirm ignore-counts)
@@ -754,6 +766,6 @@ code 对应的中文词条了。
(pyim-dcache-write-file file confirm)))
;; * Footer
-
(provide 'pyim-dhashcache)
+
;;; pyim-dhashcache.el ends here
diff --git a/pyim-dregcache.el b/pyim-dregcache.el
index ef06687b8a..8cacf5a9e7 100644
--- a/pyim-dregcache.el
+++ b/pyim-dregcache.el
@@ -44,133 +44,77 @@
(defvar pyim-dregcache-iword2count nil)
(defvar pyim-dregcache-dicts-md5 nil)
-(cl-defmethod pyim-dcache-update
- (&context (pyim-dcache-backend (eql pyim-dregcache)) &optional force)
- "读取并加载所有相关词库 dcache.
-
-如果 FORCE 为真,强制加载。"
- (pyim-dcache-init-variables)
- (when pyim-dcache-auto-update
- (pyim-dregcache-update-personal-words force)
- (let* ((dict-files (pyim-dict-get-enabled-dict-files))
- (dicts-md5 (pyim-dcache-create-files-md5 dict-files)))
- (when pyim-debug
- (message "pyim-dregcache-update: pyim-dicts=%s pyim-extra-dicts=%s
dict-files=%s"
- pyim-dicts
- pyim-extra-dicts
- dict-files))
- (pyim-dregcache-update-code2word dict-files dicts-md5 force))))
-
-(defun pyim-dregcache-variable-file (variable)
- "Get VARIABLE dcache file path."
- (concat (file-name-as-directory pyim-dcache-directory)
- (symbol-name variable)))
-
-(defun pyim-dregcache-save-variable (variable value)
- "Save VARIABLE with its VALUE."
- (let* ((file (pyim-dregcache-variable-file variable))
- (save-silently t))
- (make-directory (file-name-directory file) t)
- (with-temp-buffer
- (insert value)
- (pyim-dcache-write-file file))))
-
-(defun pyim-dregcache-load-variable (variable)
- "载入 VARIABLE 对应的文件内容."
- (let* ((file (pyim-dregcache-variable-file variable)))
- (when (and file (file-exists-p file))
- (with-temp-buffer
- (insert-file-contents file)
- (buffer-string)))))
-
-(defun pyim-dregcache-sort-words (words-list)
- "对 WORDS-LIST 排序,词频大的排在前面."
- (let ((iword2count pyim-dregcache-iword2count))
- (sort words-list
- (lambda (a b)
- (let ((a (car (split-string a ":")))
- (b (car (split-string b ":"))))
- (> (or (gethash a iword2count) 0)
- (or (gethash b iword2count) 0)))))))
-
-(defun pyim-dregcache-sort-icode2word ()
- "对个人词库排序."
- ;; https://github.com/redguardtoo/zhfreq
- (with-temp-buffer
- (dolist (l (split-string pyim-dregcache-icode2word "\n"))
- (cond
- ((string-match "^\\([a-z-]+ \\)\\(.*\\)" l)
- ;; 3字以上词很少,如果只处理单字,2字词,3字词
- ;; ((string-match "^\\([a-z]+ \\|[a-z]+-[a-z]+ \\|[a-z]+-[a-z]+-[a-z]+
\\)\\(.*\\)" l)
- (let* ((pinyin (match-string 1 l))
- (words (pyim-dregcache-sort-words (split-string (match-string 2
l) " "))))
- (insert (format "%s\n" (concat pinyin (string-join words " "))))))
- ;; 其他词
- ((string= l "")
- ;; skip empty line
- )
- (t
- (insert (format "%s\n" l)))))
- (setq pyim-dregcache-icode2word (buffer-string))))
-
-(defun pyim-dregcache-create-cache-content (raw-content)
- "将 RAW-CONTENT 划分成可以更高效搜索的缓冲区."
- (let ((chars "bcdefghjklmnopqrstwxyz")
- (i 0)
- content-segments
- (start (string-match "^a" raw-content))
- chunk
- end)
- ;; 将字典缓存划分成多个"子搜索区域"
- (while (< i (length chars))
- (when (setq end (string-match (string ?^ (elt chars i))
- raw-content
- start))
- (setq chunk (substring-no-properties raw-content start end))
- (push chunk content-segments)
- (setq start end))
- (setq i (1+ i)))
-
- ;; last chunk
- (setq chunk (substring-no-properties raw-content end (length raw-content)))
- (push chunk content-segments)
- (list :content (nreverse content-segments))))
-
-(defun pyim-dregcache-load-dictionary-file (dict-file)
- "READ from DICT-FILE."
- (let* ((raw-content (with-temp-buffer
- (insert-file-contents dict-file)
- (buffer-string))))
- (setq pyim-dregcache-cache
- ;; use string type as key, so have to use `lax-plist-put'
- ;; @see
https://www.gnu.org/software/emacs/manual/html_node/elisp/Plist-Access.html#Plist-Access
- (lax-plist-put pyim-dregcache-cache
- (file-truename dict-file)
- (pyim-dregcache-create-cache-content raw-content)))))
-
-(defun pyim-dregcache-update-code2word (dict-files dicts-md5 &optional force)
- "读取并加载词库.
+;; ** 初始化 dregcache 相关函数
+(cl-defmethod pyim-dcache-init-variables
+ (&context (pyim-dcache-backend (eql pyim-dregcache)))
+ "Initialize the cache variables of the dregcache backend."
+ (pyim-dcache-init-variable
+ pyim-dregcache-iword2count
+ ;; The dregcache backend needs word counts too.  The value saved for
+ ;; the dhashcache backend is passed as the fallback value, so the
+ ;; first use of dregcache imports those counts; afterwards the two
+ ;; backends keep separate counts.
+ (pyim-dcache-get-value 'pyim-dhashcache-iword2count))
+ ;; Load the personal dictionary only if it has not been loaded yet.
+ (unless pyim-dregcache-icode2word
+ (pyim-dregcache-update-personal-words t)))
-读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。
+;; ** 从 dregcache 搜索词条相关函数
+(cl-defmethod pyim-dcache-get
+ (code &context (pyim-dcache-backend (eql pyim-dregcache))
+ &optional from)
+ "Search `pyim-dregcache-cache' for CODE and return the matching words.
+
+FROM is a list of cache names.  If it contains icode2word or
+ishortcode2word the personal dictionary is searched; if it contains
+iword2count-recent-10-words or iword2count-recent-50-words the result
+is nil; otherwise every dictionary file is searched."
+ (when code
+ (cond ((or (memq 'icode2word from)
+ (memq 'ishortcode2word from))
+ (pyim-dregcache-get-icode2word-ishortcode2word code))
+ ;; FIXME: pyim-dregcache does not support iword2count-recent-10-words
+ ;; and iword2count-recent-50-words yet.
+ ((or (memq 'iword2count-recent-10-words from)
+ (memq 'iword2count-recent-50-words from))
+ nil)
+ (t (let ((dict-files (pyim-dregcache-all-dict-files))
+ result)
-DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
+ (when pyim-debug (message "pyim-dregcache-get is called.
code=%s" code))
+ (when dict-files
+ (dolist (file dict-files)
+ (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
+ (content (pyim-dregcache-get-content code
file-info)))
+ (setq result (append (pyim-dregcache-get-1 content code)
result)))))
+ ;; `push' plus `nreverse' is more efficient than `add-to-list'
+ ;; Many examples exist in Emacs' own code
+ (nreverse result))))))
-如果 FORCE 为真,强制加载。"
- (interactive)
- (when (or force (not (equal dicts-md5 pyim-dregcache-dicts-md5)))
- ;; no hashtable i file mapping algorithm
- (dolist (file dict-files)
- (pyim-dregcache-load-dictionary-file file))
- (setq pyim-dregcache-dicts-md5 dicts-md5)))
+(defun pyim-dregcache-get-icode2word-ishortcode2word (code)
+ "Search the personal words and personal shortcode words for CODE.
+The dictionary is searched with a regexp, so shortcode words need no
+separate cache."
+ (when pyim-debug (message "pyim-dregcache-get-icode2word-ishortcode2word
called => %s" code))
+ ;; `pyim-dregcache-get-1' accumulates its result back to front, hence
+ ;; the `nreverse'.
+ (when pyim-dregcache-icode2word
+ (nreverse (pyim-dregcache-get-1 pyim-dregcache-icode2word code))))
-(defmacro pyim-dregcache-shenmu2regexp (char)
- "将声母 CHAR 转换为通用正则表达式匹配所有以该声母开头的汉字."
- `(concat ,char "[a-z]*"))
+;; Collect the words of every line in the string CONTENT that matches
+;; CODE (see `pyim-dregcache-match-line').  The result is accumulated
+;; with `push', so it comes out in reverse order of appearance.
+(defun pyim-dregcache-get-1 (content code)
+ (let ((case-fold-search t)
+ (start 0)
+ (pattern (pyim-dregcache-match-line code))
+ (content-length (length content))
+ word
+ output)
+ (while (and (< start content-length)
+ (setq start (string-match pattern content start)))
+ ;; Extract the words: group 1 is everything after the code and the space.
+ (setq word (match-string-no-properties 1 content))
+ (when word
+ (cond
+ ((string-match "^[^ ]+$" word)
+ ;; A single word.
+ (push word output))
+ (t
+ ;; Several words separated by spaces.
+ (setq output (append (nreverse (split-string word " +")) output)))))
+ ;; Continue searching after this match: skip CODE, the words, and two
+ ;; more characters.
+ (setq start (+ start 2 (length code) (length word))))
+ output))
-(defmacro pyim-dregcache-is-shenmu (code)
- "判断CODE 是否是一个声母."
- `(and (eq (length ,code) 1)
- (not (string-match ,code "aeo"))))
+;; Expand to a form building the regexp for one dictionary line of CODE:
+;; line start, the regexp from `pyim-dregcache-code2regexp', one space,
+;; and the rest of the line captured as group 1.
+(defmacro pyim-dregcache-match-line (code)
+ `(concat "^" (pyim-dregcache-code2regexp ,code) " \\(.+\\)"))
(defun pyim-dregcache-code2regexp (code)
"将 CODE 转换成正则表达式用来搜索辞典缓存中的匹配项目.
@@ -210,8 +154,14 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
;; tian-an-men => tian-an-men[a-z-]*
(concat s "[a-z-]*"))))))))
-(defmacro pyim-dregcache-match-line (code)
- `(concat "^" (pyim-dregcache-code2regexp ,code) " \\(.+\\)"))
+(defmacro pyim-dregcache-is-shenmu (code)
+  "Return non-nil if CODE is a shenmu (a one-character pinyin initial).
+
+CODE counts as a shenmu when it is exactly one character long and,
+used as a regexp, does not match anywhere in \"aeo\".  The CODE form
+is evaluated exactly once and the match data is left untouched."
+  (let ((value (make-symbol "code")))
+    `(let ((,value ,code))
+       (and (eq (length ,value) 1)
+            (not (string-match-p ,value "aeo"))))))
+
+(defmacro pyim-dregcache-shenmu2regexp (char)
+  "Expand to a form building a regexp for syllables starting with shenmu CHAR.
+The regexp is CHAR followed by any number of lowercase letters."
+  (list 'concat char "[a-z]*"))
(defun pyim-dregcache-all-dict-files ()
"所有词典文件."
@@ -239,60 +189,109 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
;; fetch segment using the first character of pinyin code
(nth idx rlt)))
-(defun pyim-dregcache-get-1 (content code)
- (let ((case-fold-search t)
- (start 0)
- (pattern (pyim-dregcache-match-line code))
- (content-length (length content))
- word
- output)
- (while (and (< start content-length)
- (setq start (string-match pattern content start)))
- ;; 提取词
- (setq word (match-string-no-properties 1 content))
- (when word
- (cond
- ((string-match "^[^ ]+$" word)
- ;; 单个词
- (push word output))
- (t
- ;; 多个字
- (setq output (append (nreverse (split-string word " +")) output)))))
- ;; 继续搜索
- (setq start (+ start 2 (length code) (length word))))
- output))
+;; ** 从 dregcache 搜索代码相关函数
+(cl-defmethod pyim-dcache-search-word-code
+ (word &context (pyim-dcache-backend (eql pyim-dregcache)))
+ "Search `pyim-dregcache-icode2word' and `pyim-dregcache-cache' for WORD.
+Return a one-element list holding the first code found, or nil."
+ (when pyim-debug (message "pyim-dregcache-search-word-code word=%s" word))
+ (when pyim-dregcache-cache
+ ;; The first hit is thrown out of the nested loops.
+ (catch 'result
+ (let ((dict-files (pyim-dregcache-all-dict-files))
+ code)
+ ;; The personal dictionary is consulted first.
+ (when pyim-dregcache-icode2word
+ (setq code (pyim-dregcache-search-word-code-1 word
pyim-dregcache-icode2word))
+ (when code (throw 'result (list code))))
+ ;; Then every content segment of every dictionary file.
+ (dolist (file dict-files)
+ (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
+ (contents (lax-plist-get file-info :content)))
+ (dolist (content contents)
+ (setq code (pyim-dregcache-search-word-code-1 word content))
+ (when code (throw 'result (list code))))))))))
-(cl-defmethod pyim-dcache-get
- (code &context (pyim-dcache-backend (eql pyim-dregcache))
- &optional from)
- "从 `pyim-dregcache-cache' 搜索 CODE, 得到对应的词条."
- (when code
- (cond ((or (memq 'icode2word from)
- (memq 'ishortcode2word from))
- (pyim-dregcache-get-icode2word-ishortcode2word code))
- ;; FIXME: pyim-dregcache 暂时不支持 iword2count-recent-10-words 和
- ;; iword2count-recent-50-words.
- ((or (memq 'iword2count-recent-10-words from)
- (memq 'iword2count-recent-50-words from))
- nil)
- (t (let ((dict-files (pyim-dregcache-all-dict-files))
- result)
+(defun pyim-dregcache-search-word-code-1 (word content)
+  "Return the code of the first line of CONTENT that lists WORD, or nil.
+
+CONTENT is dictionary text with lines of the form \"CODE WORD1 WORD2 ...\".
+WORD must appear as a whole, space-delimited entry.  It is matched
+literally: characters that are special in regexps have no special
+meaning."
+  (let* ((case-fold-search t)
+         (quoted (regexp-quote word))
+         (regexp (concat "^\\([a-z-]+\\)\\(.*\\) "
+                         "\\(" quoted " \\|" quoted "$\\)")))
+    (when (string-match regexp content)
+      (match-string-no-properties 1 content))))
+
+;; ** 给 dregcache 添加词条相关函数
+(cl-defmethod pyim-dcache-insert-word
+ (word code prepend
+ &context (pyim-dcache-backend (eql pyim-dregcache)))
+ "Insert WORD under CODE into `pyim-dregcache-icode2word'.
+WORD, CODE and PREPEND are passed on unchanged to
+`pyim-dregcache-insert-word-into-icode2word'."
+ (pyim-dregcache-insert-word-into-icode2word word code prepend))
+
+(defun pyim-dregcache-insert-word-into-icode2word (word code prepend)
+  "Add WORD under CODE to the personal dictionary text.
+
+`pyim-dregcache-icode2word' is a string in the same line format as
+the other dictionaries (\"CODE WORD1 WORD2 ...\"), so that it can be
+searched with the same regexp algorithm.
+
+If a line for CODE exists, its words are sorted with
+`pyim-dregcache-sort-words'; WORD is put in front of them when
+PREPEND is non-nil and behind them otherwise, and duplicates are
+removed.  If no such line exists, a new line \"CODE WORD\" is appended.
+Return the new value of `pyim-dregcache-icode2word'."
+  (when pyim-debug
+    (message "pyim-dregcache-insert-word-into-icode2word called => %s %s %s"
+             word
+             code
+             prepend))
+  (with-temp-buffer
+    (when pyim-dregcache-icode2word
+      (insert pyim-dregcache-icode2word))
+    (goto-char (point-min))
+    (let ((case-fold-search t))
+      (if (re-search-forward (concat "^" (regexp-quote code) " \\(.*\\)") nil t)
+          ;; Read the match data before calling anything else: sorting
+          ;; and splitting run their own regexp matches.
+          (let* ((beg (match-beginning 0))
+                 (end (match-end 0))
+                 (old-words (pyim-dregcache-sort-words
+                             (split-string (match-string-no-properties 1) " " t)))
+                 (new-words (delete-dups
+                             (if prepend
+                                 (cons word old-words)
+                               (append old-words (list word))))))
+            ;; Only the line's text is replaced; its newline stays.
+            (delete-region beg end)
+            (goto-char beg)
+            (insert code " " (string-join new-words " ")))
+        (goto-char (point-max))
+        ;; Do not fuse the new entry with an unterminated last line.
+        (unless (bolp)
+          (insert "\n"))
+        (insert code " " word "\n")))
+    (setq pyim-dregcache-icode2word
+          (buffer-string))))
+
+;; ** 从 dregcache 删除词条相关函数
+(cl-defmethod pyim-dcache-delete-word
+  (word &context (pyim-dcache-backend (eql pyim-dregcache)))
+  "Delete the Chinese word WORD from the personal dictionary.
+
+WORD is removed from every line \"CODE WORD1 WORD2 ...\" of
+`pyim-dregcache-icode2word' that lists it as a whole, space-delimited
+entry; words that merely contain WORD are left alone.  A line that
+has no word left is removed completely.  WORD's count is removed from
+`pyim-dregcache-iword2count' as well."
+  ;; Nothing to edit when the personal dictionary has not been loaded.
+  (when pyim-dregcache-icode2word
+    (with-temp-buffer
+      (insert pyim-dregcache-icode2word)
+      (goto-char (point-min))
+      (let ((case-fold-search t))
+        (while (re-search-forward "^\\([a-z-]+\\) \\(.*\\)$" nil t)
+          ;; Read the match data first: `split-string' overwrites it.
+          (let* ((beg (match-beginning 0))
+                 (end (match-end 0))
+                 (code (match-string-no-properties 1))
+                 (words (split-string (match-string-no-properties 2) " " t))
+                 (remaining (remove word words)))
+            (unless (equal words remaining)
+              ;; Remove the line and its newline; the last line may not
+              ;; have one.
+              (delete-region beg (min (point-max) (1+ end)))
+              (goto-char beg)
+              (when remaining
+                (insert code " " (string-join remaining " ") "\n"))))))
+      (setq pyim-dregcache-icode2word
+            (buffer-string))))
+  ;; Remove the count of the deleted word.
+  (remhash word pyim-dregcache-iword2count))
- (when pyim-debug (message "pyim-dregcache-get is called.
code=%s" code))
- (when dict-files
- (dolist (file dict-files)
- (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
- (content (pyim-dregcache-get-content code
file-info)))
- (setq result (append (pyim-dregcache-get-1 content code)
result)))))
- ;; `push' plus `nreverse' is more efficient than `add-to-list'
- ;; Many examples exist in Emacs' own code
- (nreverse result))))))
+;; ** 更新 dregcache 相关函数
+(cl-defmethod pyim-dcache-update
+ (&context (pyim-dcache-backend (eql pyim-dregcache)) &optional force)
+ "Read and load all dictionary caches of the dregcache backend.
-(defun pyim-dregcache-get-icode2word-ishortcode2word (code)
- "以 CODE 搜索个人词和个人联想词. 正则表达式搜索词库,不需要为联想词开单独缓存."
- (when pyim-debug (message "pyim-dregcache-get-icode2word-ishortcode2word
called => %s" code))
- (when pyim-dregcache-icode2word
- (nreverse (pyim-dregcache-get-1 pyim-dregcache-icode2word code))))
+If FORCE is non-nil, loading is forced."
+ (pyim-dcache-init-variables)
+ ;; Everything below is skipped unless `pyim-dcache-auto-update' is set.
+ (when pyim-dcache-auto-update
+ (pyim-dregcache-update-personal-words force)
+ (let* ((dict-files (pyim-dict-get-enabled-dict-files))
+ (dicts-md5 (pyim-dcache-create-files-md5 dict-files)))
+ (when pyim-debug
+ (message "pyim-dregcache-update: pyim-dicts=%s pyim-extra-dicts=%s
dict-files=%s"
+ pyim-dicts
+ pyim-extra-dicts
+ dict-files))
+ (pyim-dregcache-update-code2word dict-files dicts-md5 force))))
(defun pyim-dregcache-update-personal-words (&optional force)
"合并 `pyim-dregcache-icode2word' 磁盘文件. 加载排序后的结果.
@@ -331,39 +330,70 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
(when (and force pyim-dregcache-icode2word)
(pyim-dregcache-sort-icode2word)))
-(cl-defmethod pyim-dcache-init-variables
- (&context (pyim-dcache-backend (eql pyim-dregcache)))
- "初始化 cache 缓存相关变量."
- (pyim-dcache-init-variable
- pyim-dregcache-iword2count
- ;; dregcache 引擎也需要词频信息,第一次使用 dregcache 引擎的时候,
- ;; 自动导入 dhashcache 引擎的词频信息,以后两个引擎的词频信息就
- ;; 完全分开了。
- (pyim-dcache-get-value 'pyim-dhashcache-iword2count))
- (unless pyim-dregcache-icode2word
- (pyim-dregcache-update-personal-words t)))
+(defun pyim-dregcache-load-variable (variable)
+  "Return the contents of the cache file of VARIABLE as a string.
+The file name comes from `pyim-dregcache-variable-file'.  Return nil
+if that file does not exist."
+  (let ((path (pyim-dregcache-variable-file variable)))
+    (if (and path (file-exists-p path))
+        (with-temp-buffer
+          (insert-file-contents path)
+          (buffer-string))
+      nil)))
-(cl-defmethod pyim-dcache-save-caches
- (&context (pyim-dcache-backend (eql pyim-dregcache)))
- (pyim-dregcache-save-personal-dcache-to-file))
+(defun pyim-dregcache-variable-file (variable)
+  "Return the path of the dcache file that belongs to VARIABLE.
+The file is named after the symbol VARIABLE and lives directly in
+`pyim-dcache-directory'."
+  (let ((directory (file-name-as-directory pyim-dcache-directory))
+        (basename (symbol-name variable)))
+    (concat directory basename)))
-(defun pyim-dregcache-save-personal-dcache-to-file ()
- "保存缓存内容到默认目录."
- (when pyim-debug (message "pyim-dregcache-save-personal-dcache-to-file
called"))
- ;; 用户选择过的词存为标准辞典格式保存
- (when pyim-dregcache-icode2word
- (pyim-dregcache-save-variable
- 'pyim-dregcache-icode2word
- pyim-dregcache-icode2word))
- ;; 词频
- (pyim-dcache-save-variable
- 'pyim-dregcache-iword2count
- pyim-dregcache-iword2count))
+(defun pyim-dregcache-update-code2word (dict-files dicts-md5 &optional force)
+ "Read and load the dictionaries.
-(defun pyim-dregcache-export-words-and-counts ()
- "TODO"
- )
+Each file in DICT-FILES is loaded with
+`pyim-dregcache-load-dictionary-file'.
+
+DICT-FILES is the list of dictionary files.  DICTS-MD5 is the MD5
+checksum of the dictionaries.
+
+If FORCE is non-nil, loading is forced; otherwise the files are only
+loaded when DICTS-MD5 differs from `pyim-dregcache-dicts-md5'."
+ ;; NOTE(review): `interactive' has no argument spec although two
+ ;; arguments are required -- confirm whether this is meant to be a command.
+ (interactive)
+ (when (or force (not (equal dicts-md5 pyim-dregcache-dicts-md5)))
+ ;; no hashtable i file mapping algorithm
+ (dolist (file dict-files)
+ (pyim-dregcache-load-dictionary-file file))
+ ;; Remember the checksum so unchanged dictionaries are not reloaded.
+ (setq pyim-dregcache-dicts-md5 dicts-md5)))
+
+(defun pyim-dregcache-load-dictionary-file (dict-file)
+  "Read DICT-FILE and store its content in `pyim-dregcache-cache'.
+The content is segmented by `pyim-dregcache-create-cache-content' and
+stored under the true name of DICT-FILE.  Return the updated cache."
+  (let ((text (with-temp-buffer
+                (insert-file-contents dict-file)
+                (buffer-string)))
+        (cache-key (file-truename dict-file)))
+    ;; The keys are strings, so `lax-plist-put' has to be used.
+    ;; @see https://www.gnu.org/software/emacs/manual/html_node/elisp/Plist-Access.html#Plist-Access
+    (setq pyim-dregcache-cache
+          (lax-plist-put pyim-dregcache-cache
+                         cache-key
+                         (pyim-dregcache-create-cache-content text)))))
+
+(defun pyim-dregcache-create-cache-content (raw-content)
+  "Split RAW-CONTENT into segments that can be searched more efficiently.
+
+RAW-CONTENT is dictionary text.  It is cut at the first line starting
+with each of the letters b, c, d, ... z (i, u and v are not used),
+searching forward from the previous cut; the first cut position is the
+first line starting with \"a\".  Letters without any line produce no
+segment.  The final segment runs from the last cut to the end of
+RAW-CONTENT.
+
+Return a plist (:content SEGMENTS) with the segments in file order."
+  (let ((chars "bcdefghjklmnopqrstwxyz")
+        (start (string-match "^a" raw-content))
+        content-segments)
+    ;; Split the dictionary cache into several \"sub search areas\".
+    (dotimes (i (length chars))
+      (let ((end (string-match (string ?^ (aref chars i))
+                               raw-content
+                               start)))
+        (when end
+          (push (substring-no-properties raw-content start end)
+                content-segments)
+          (setq start end))))
+    ;; Last chunk.  It must start at START, the last cut that was found:
+    ;; the search position of the final letter is nil whenever that
+    ;; letter has no entries.
+    (push (substring-no-properties raw-content start) content-segments)
+    (list :content (nreverse content-segments))))
+
+;; ** 更新 dregcache 词条计数。
(cl-defmethod pyim-dcache-update-wordcount
(word &context (pyim-dcache-backend (eql pyim-dregcache))
&optional wordcount-handler)
@@ -383,65 +413,7 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
(unless (equal orig-value new-value)
(puthash word new-value pyim-dregcache-iword2count))))
-(cl-defmethod pyim-dcache-delete-word
- (word &context (pyim-dcache-backend (eql pyim-dregcache)))
- "将中文词条 WORD 从个人词库中删除."
- (with-temp-buffer
- (insert pyim-dregcache-icode2word)
- (goto-char (point-min))
- (let* ((case-fold-search t)
- substring beg end)
- (while (re-search-forward (concat "^\\([a-z-]+\\) \\(.*\\)" word
"\\(.*\\)$") nil t)
- (setq beg (match-beginning 0))
- (setq end (match-end 0))
- (setq substring (concat (match-string-no-properties 1)
- (match-string-no-properties 2)
- (match-string-no-properties 3)))
-
- ;; delete string and the newline char
- (delete-region beg (+ 1 end))
- (when (> (length (split-string substring " ")) 1)
- (goto-char beg)
- (insert substring)))
- (setq pyim-dregcache-icode2word
- (buffer-string))))
- ;; 删除对应词条的词频
- (remhash word pyim-dregcache-iword2count))
-
-(cl-defmethod pyim-dcache-insert-word
- (word code prepend
- &context (pyim-dcache-backend (eql pyim-dregcache)))
- "将词条 WORD 插入到 `pyim-dregcache-icode2word'."
- (pyim-dregcache-insert-word-into-icode2word word code prepend))
-
-(defun pyim-dregcache-insert-word-into-icode2word (word code prepend)
- "保存个人词到缓存,和其他词库格式一样以共享正则搜索算法."
- (when pyim-debug
- (message "pyim-dregcache-insert-word-into-icode2word called => %s %s %s"
- word
- code
- prepend))
- (with-temp-buffer
- (when pyim-dregcache-icode2word
- (insert pyim-dregcache-icode2word))
- (goto-char (point-min))
- (let* ((case-fold-search t)
- substring replace-string beg end old-word-list)
- (if (re-search-forward (concat "^" code " \\(.*\\)") nil t)
- (progn
- (setq beg (match-beginning 0))
- (setq end (match-end 0))
- (setq substring (match-string-no-properties 1))
- (delete-region beg end)
- ;; 这里不进行排序,在pyim-dregcache-update-personal-words排序
- (setq old-word-list (pyim-dregcache-sort-words (split-string
substring " ")))
- (setq replace-string (concat code " " (string-join (delete-dups
`(,@old-word-list ,word)) " "))))
- (setq replace-string (concat code " " (or replace-string word) "\n")))
- (goto-char (or beg (point-max)))
- (insert replace-string))
- (setq pyim-dregcache-icode2word
- (buffer-string))))
-
+;; ** 升级 dregcache 相关函数
(cl-defmethod pyim-dcache-upgrade (&context (pyim-dcache-backend (eql
pyim-dregcache)))
"升级词库缓存.
@@ -456,30 +428,45 @@ dregcache 只支持全拼和双拼,不能用于五笔之类的型码输入法
update-icode2word 目前只要是用于更新型码输入法的 code-prefix, 所
以不需要具体实现细节。")
-(defun pyim-dregcache-search-word-code-1 (word content)
- (let* ((case-fold-search t)
- (regexp (concat "^\\([a-z-]+\\)\\(.*\\) " "\\(" word " \\|" word
"$\\)")))
- (when (string-match regexp content)
- (match-string-no-properties 1 content))))
+;; ** 根据 dregcache 信息对词条进行排序
+(defun pyim-dregcache-sort-words (words-list)
+  "Sort WORDS-LIST destructively, words with a higher count first.
+The count of an entry is looked up in `pyim-dregcache-iword2count'
+under the part of the entry before the first \":\"; a missing count
+is treated as 0."
+  (let* ((counts pyim-dregcache-iword2count)
+         (count-of (lambda (entry)
+                     (or (gethash (car (split-string entry ":")) counts)
+                         0))))
+    (sort words-list
+          (lambda (a b)
+            (> (funcall count-of a) (funcall count-of b))))))
-(cl-defmethod pyim-dcache-search-word-code
- (word &context (pyim-dcache-backend (eql pyim-dregcache)))
- "从 `pyim-dregcache-cache' 和 `pyim-dregcache-icode2word' 搜索 word, 得到对应的code."
- (when pyim-debug (message "pyim-dregcache-search-word-code word=%s" word))
- (when pyim-dregcache-cache
- (catch 'result
- (let ((dict-files (pyim-dregcache-all-dict-files))
- code)
- (when pyim-dregcache-icode2word
- (setq code (pyim-dregcache-search-word-code-1 word
pyim-dregcache-icode2word))
- (when code (throw 'result (list code))))
- (dolist (file dict-files)
- (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
- (contents (lax-plist-get file-info :content)))
- (dolist (content contents)
- (setq code (pyim-dregcache-search-word-code-1 word content))
- (when code (throw 'result (list code))))))))))
+;; ** 保存 dregcache 相关函数
+;; dregcache implementation of `pyim-dcache-save-caches': delegates to
+;; `pyim-dregcache-save-personal-dcache-to-file'.
+(cl-defmethod pyim-dcache-save-caches
+ (&context (pyim-dcache-backend (eql pyim-dregcache)))
+ (pyim-dregcache-save-personal-dcache-to-file))
+
+(defun pyim-dregcache-save-personal-dcache-to-file ()
+ "Save the cache contents to the default directory."
+ (when pyim-debug (message "pyim-dregcache-save-personal-dcache-to-file
called"))
+ ;; The words chosen by the user are saved in the standard dictionary
+ ;; format; this is skipped while the personal dictionary is nil.
+ (when pyim-dregcache-icode2word
+ (pyim-dregcache-save-variable
+ 'pyim-dregcache-icode2word
+ pyim-dregcache-icode2word))
+ ;; Word counts.
+ (pyim-dcache-save-variable
+ 'pyim-dregcache-iword2count
+ pyim-dregcache-iword2count))
+
+(defun pyim-dregcache-save-variable (variable value)
+  "Write the string VALUE to the dcache file that belongs to VARIABLE.
+The file name comes from `pyim-dregcache-variable-file'; its directory
+is created when missing.  `save-silently' is bound to t while
+`pyim-dcache-write-file' writes the file."
+  (let ((file (pyim-dregcache-variable-file variable))
+        (save-silently t))
+    (make-directory (file-name-directory file) t)
+    (with-temp-buffer
+      (insert value)
+      (pyim-dcache-write-file file))))
+;; ** 导出 dregcache 相关函数
(cl-defmethod pyim-dcache-export-personal-words
(file &context (pyim-dcache-backend (eql pyim-dregcache))
&optional confirm)
@@ -498,6 +485,30 @@ update-icode2word 目前只要是用于更新型码输入法的 code-prefix, 所
(sort-lines nil (point-min) (point-max))
(pyim-dcache-write-file file confirm))))
+(defun pyim-dregcache-sort-icode2word ()
+  "Sort the words on every line of the personal dictionary.
+
+`pyim-dregcache-icode2word' is rebuilt line by line: on lines of the
+form \"CODE WORD1 WORD2 ...\" the words are reordered with
+`pyim-dregcache-sort-words', empty lines are dropped and every other
+line is kept as it is.  Return the new dictionary text."
+  ;; https://github.com/redguardtoo/zhfreq
+  (with-temp-buffer
+    (dolist (line (split-string pyim-dregcache-icode2word "\n"))
+      (if (string-match "^\\([a-z-]+ \\)\\(.*\\)" line)
+          ;; Words of more than three characters are rare; restricting the
+          ;; sort to one-, two- and three-character words would be an
+          ;; alternative.
+          (let* ((prefix (match-string 1 line))
+                 (sorted (pyim-dregcache-sort-words
+                          (split-string (match-string 2 line) " "))))
+            (insert (format "%s\n" (concat prefix (string-join sorted " ")))))
+        ;; Other lines are copied, except empty ones.
+        (unless (string= line "")
+          (insert (format "%s\n" line)))))
+    (setq pyim-dregcache-icode2word (buffer-string))))
+
+;; Placeholder: exporting words together with their counts is not
+;; implemented for the dregcache backend yet.
+(defun pyim-dregcache-export-words-and-counts ()
+ "TODO"
+ )
+
;; * Footer
(provide 'pyim-dregcache)
- [elpa] externals/pyim updated (1e0834c456 -> 6c05a5fc03), ELPA Syncer, 2022/06/09
- [elpa] externals/pyim b33d2f2a75 04/12: cl-defgeneric pyim-dcache-update, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 156d282073 06/12: cl-defgeneric pyim-dcache-save-caches, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 7288242a6d 10/12: Sort dcache.el, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 371d2ee095 05/12: cl-defgeneric pyim-dcache-export-*, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim fb74c9fc93 01/12: cl-defgeneric pyim-dcache-upgrade, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 12435095d3 08/12: Sort dcache, dhashcache, dregcache,
ELPA Syncer <=
- [elpa] externals/pyim 504f20fd49 02/12: cl-defgeneric pyim-dcache-insert-word, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim e0f80e5d36 11/12: Merge branch 'dcache', ELPA Syncer, 2022/06/09
- [elpa] externals/pyim f6a45f6506 03/12: cl-defgeneric pyim-dcache-update-wordcount, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 41564f3d74 09/12: Sort dregcache, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 6c05a5fc03 12/12: Fix pyim-tests.el, ELPA Syncer, 2022/06/09
- [elpa] externals/pyim 05d95422b9 07/12: cl-defgeneric rest dcache interfaces., ELPA Syncer, 2022/06/09