From 22dfbf0ff6fa9449789c43fed35c5d74e0cc195f Mon Sep 17 00:00:00 2001 From: ShikiSuen Date: Fri, 20 May 2022 00:15:18 +0800 Subject: [PATCH] Add DumpDOT support and fix unit tests. (#25) * Refactoring & Add documentation. * DumpDOT support. * Fix data conversion loss in UnitTests and add DumpDOT tests. --- README.md | 118 +++++++++++++- Sources/Megrez/1_BlockReadingBuilder.swift | 173 +++++++++++++++++---- Sources/Megrez/1_Walker.swift | 123 --------------- Sources/Megrez/2_Grid.swift | 88 +++++++++-- Sources/Megrez/3_NodeAnchor.swift | 37 ++++- Sources/Megrez/3_Span.swift | 25 ++- Sources/Megrez/4_Node.swift | 123 ++++++++------- Sources/Megrez/5_LanguageModel.swift | 8 +- Sources/Megrez/6_Bigram.swift | 38 +++-- Sources/Megrez/6_Unigram.swift | 37 +++-- Sources/Megrez/7_KeyValuePair.swift | 14 +- Tests/MegrezTests/MegrezTests.swift | 86 +++++----- 12 files changed, 571 insertions(+), 299 deletions(-) delete mode 100644 Sources/Megrez/1_Walker.swift diff --git a/README.md b/README.md index bde4520..61a3b9f 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,117 @@ Megrez Engine is a module made for processing lingual data of an input method. This repository is part of Operation Longinus of The vChewing Project. -欲知使用方法,請洽該倉庫內的 MegrezTests.swift 檔案當中的示例、也可研讀上文提到的威注音輸入法的倉庫內的源碼。 +## 使用說明 -- Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). - - Swift programmer: Shiki Suen - - C++ migration review: Hiraku Wong -- Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). +### §1. 初期化 + +在你的 ctlInputMethod (InputMethodController) 或者 KeyHandler 內初期化一份 Megrez.BlockReadingBuilder 分節讀音槽副本(這裡將該副本命名為「`_builder`」)。由於 Megrez.BlockReadingBuilder 的型別是 Class 型別,所以其副本可以用 let 來宣告。 + +以 KeyHandler 為例: +```swift +class KeyHandler: NSObject { + // 先設定好變數 + let _builder: Megrez.BlockReadingBuilder = .init() + ... 
+} +``` + +以 ctlInputMethod 為例: +```swift +@objc(ctlInputMethod) // 根據 info.plist 內的情況來確定型別的命名 +class ctlInputMethod: IMKInputController { + // 先設定好變數 + let _builder: Megrez.BlockReadingBuilder = .init() + ... +} +``` + +由於 Swift 會在某個大副本(KeyHandler 或者 ctlInputMethod 副本)被銷毀的時候自動銷毀其中的全部副本,所以 Megrez.BlockReadingBuilder 的副本初期化沒必要寫在 init() 當中。但你很可能會想在 init() 時指定 Megrez.BlockReadingBuilder 所對接的語言模組型別、以及其可以允許的最大詞長。 + +這裡就需要在 init() 時使用參數: +```swift + /// 分節讀音槽。 + /// - Parameters: + /// - lm: 語言模型。可以是任何基於 Megrez.LanguageModel 的衍生型別。 + /// - length: 指定該分節讀音槽內可以允許的最大詞長,預設為 10 字。 + /// - separator: 多字讀音鍵當中用以分割漢字讀音的記號,預設為空。 + let _builder: Megrez.BlockReadingBuilder = .init(lm: lmTest, length: 13, separator: "-") +``` + +### §2. 使用範例 + +請結合 MegrezTests.swift 檔案來學習。這裡只是給個概述。 + +#### // 1. 準備用作語言模型的專用型別 + +首先,Megrez 內建的 LanguageModel 型別是遠遠不夠用的,只能說是個類似於 protocol 一樣的存在。你需要自己單獨寫一個新的衍生型別: + +```swift +class ExampleLM: Megrez.LanguageModel { +... + override func unigramsFor(key: String) -> [Megrez.Unigram] { + ... + } +... +} +``` + +這個型別需要下述兩個函數能夠針對給定的鍵回饋對應的資料值、或其存無狀態: +- unigramsFor(key: String) -> [Megrez.Unigram] +- hasUnigramsFor(key: String) -> Bool + +MegrezTests.swift 檔案內的 SimpleLM 可以作為範例。 + +如果需要更實戰的範例的話,可以洽威注音專案的倉庫內的 LMInstantiator.swift。 + +#### // 2. 
怎樣與 builder 互動: + +這裡只講幾個常用函數: + +- 游標位置 `builder.cursorIndex` 是可以賦值與取值的動態變數,且會在賦值內容為超出位置範圍的數值時自動修正。初期值為 0。 +- `builder.insertReadingAtCursor(reading: "gao1")` 可以在當前的游標位置插入讀音「gao1」。 +- `builder.deleteReadingToTheFrontOfCursor()` 的作用是:朝著往文字輸入方向、砍掉一個與游標相鄰的讀音。反之,`deleteReadingAtTheRearOfCursor` 則朝著與文字輸入方向相反的方向、砍掉一個與游標相鄰的讀音。 + - 在威注音的術語體系當中,「文字輸入方向」為向前(Front)、與此相反的方向為向後(Rear)。 +- `builder.grid.fixNodeSelectedCandidate(location: ?, value: "??")` 用來根據輸入法選中的候選字詞、據此更新當前游標位置選中的候選字詞節點當中的候選字詞。 + +輸入完內容之後,可以聲明一個用來接收結果的變數: + +```swift + /// 對已給定的軌格按照給定的位置與條件進行正向爬軌。 + /// + /// 其實就是將反向爬軌的結果顛倒順序再給出來而已,省得使用者自己再顛倒一遍。 + /// - Parameters: + /// - at: 開始爬軌的位置。 + /// - score: 給定累計權重,非必填參數。預設值為 0。 + /// - nodesLimit: 限定最多只爬多少個節點。 + /// - balanced: 啟用平衡權重,在節點權重的基礎上根據節點幅位長度來加權。 + var walked = _builder.walk(at: _builder.grid.width, score: 0.0, nodesLimit: 3, balanced: true) +``` + +MegrezTests.swift 是輸入了很多內容之後再 walk 的。實際上一款輸入法會在你每次插入讀音或刪除讀音的時候都重新 walk。那些處於候選字詞鎖定狀態的節點不會再受到之後的 walk 的行為的影響,但除此之外的節點會因為每次 walk 而可能各自的候選字詞會出現自動變化。如果給了 nodesLimit 一個非零的數值的話,則 walk 的範圍外的節點不會受到影響。 + +walk 之後的取值的方法及利用方法可以有很多種。這裡有其中的一個: + +```swift + var composed: [String] = [] + for phrase in walked { + if let node = phrase.node { + composed.append(node.currentKeyValue.value) + } + } + print(composed) +``` + +上述 print 結果就是 _builder 目前的組句,是這種陣列格式(以吳宗憲的詩句為例): +```swift + ["八月", "中秋", "山林", "涼", "風吹", "大地", "草枝", "擺"] +``` + +自己看 MegrezTests.swift 慢慢研究吧。 + +## 著作權 (Credits) + +- Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT-NTL License). + - Swift programmer: Shiki Suen + - C++ migration review: Hiraku Wong +- Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). 
diff --git a/Sources/Megrez/1_BlockReadingBuilder.swift b/Sources/Megrez/1_BlockReadingBuilder.swift index 78b659f..5d4628c 100644 --- a/Sources/Megrez/1_BlockReadingBuilder.swift +++ b/Sources/Megrez/1_BlockReadingBuilder.swift @@ -24,33 +24,55 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ extension Megrez { + /// 分節讀音槽。 public class BlockReadingBuilder { - var mutMaximumBuildSpanLength = 10 - var mutCursorIndex: Int = 0 - var mutReadings: [String] = [] - var mutGrid: Grid = .init() - var mutLM: LanguageModel - var mutJoinSeparator: String = "" - - public init(lm: LanguageModel, length: Int = 10) { + /// 該分節讀音槽內可以允許的最大詞長。 + private var mutMaximumBuildSpanLength = 10 + /// 該分節讀音槽的游標位置。 + private var mutCursorIndex: Int = 0 + /// 該分節讀音槽的讀音陣列。 + private var mutReadings: [String] = [] + /// 該分節讀音槽的軌格。 + private var mutGrid: Grid = .init() + /// 該分節讀音槽所使用的語言模型。 + private var mutLM: LanguageModel + + /// 公開:多字讀音鍵當中用以分割漢字讀音的記號,預設為空。 + public var joinSeparator: String = "" + /// 公開:該分節讀音槽的游標位置。 + public var cursorIndex: Int { + get { mutCursorIndex } + set { mutCursorIndex = min(newValue, mutReadings.count) } + } + + /// 公開:該分節讀音槽的軌格(唯讀)。 + public var grid: Grid { mutGrid } + /// 公開:該分節讀音槽的長度,也就是內建漢字讀音的數量(唯讀)。 + public var length: Int { mutReadings.count } + /// 公開:該分節讀音槽的讀音陣列(唯讀)。 + public var readings: [String] { mutReadings } + + /// 分節讀音槽。 + /// - Parameters: + /// - lm: 語言模型。可以是任何基於 Megrez.LanguageModel 的衍生型別。 + /// - length: 指定該分節讀音槽內可以允許的最大詞長,預設為 10 字。 + /// - separator: 多字讀音鍵當中用以分割漢字讀音的記號,預設為空。 + public init(lm: LanguageModel, length: Int = 10, separator: String = "") { mutLM = lm mutMaximumBuildSpanLength = length + joinSeparator = separator } + /// 分節讀音槽自我清空專用函數。 public func clear() { mutCursorIndex = 0 mutReadings.removeAll() mutGrid.clear() } - public func length() -> Int { mutReadings.count } - - public func cursorIndex() -> Int { mutCursorIndex } - - public func setCursorIndex(newIndex: Int) { - mutCursorIndex = 
min(newIndex, mutReadings.count) - } - + /// 在游標位置插入給定的讀音。 + /// - Parameters: + /// - reading: 要插入的讀音。 public func insertReadingAtCursor(reading: String) { mutReadings.insert(reading, at: mutCursorIndex) mutGrid.expandGridByOneAt(location: mutCursorIndex) @@ -58,8 +80,8 @@ extension Megrez { mutCursorIndex += 1 } - public func readings() -> [String] { mutReadings } - + /// 朝著與文字輸入方向相反的方向、砍掉一個與游標相鄰的讀音。 + /// 在威注音的術語體系當中,「與文字輸入方向相反的方向」為向後(Rear)。 @discardableResult public func deleteReadingAtTheRearOfCursor() -> Bool { if mutCursorIndex == 0 { return false @@ -72,6 +94,8 @@ extension Megrez { return true } + /// 朝著往文字輸入方向、砍掉一個與游標相鄰的讀音。 + /// 在威注音的術語體系當中,「文字輸入方向」為向前(Front)。 @discardableResult public func deleteReadingToTheFrontOfCursor() -> Bool { if mutCursorIndex == mutReadings.count { return false @@ -83,8 +107,12 @@ extension Megrez { return true } + /// 移除該分節讀音槽的第一個讀音單元。 + /// + /// 用於輸入法組字區長度上限處理: + /// 將該位置要溢出的敲字內容遞交之後、再執行這個函數。 @discardableResult public func removeHeadReadings(count: Int) -> Bool { - if count > length() { + if count > length { return false } @@ -100,17 +128,108 @@ extension Megrez { return true } - public func setJoinSeparator(separator: String) { - mutJoinSeparator = separator + // MARK: - Walker + + /// 對已給定的軌格按照給定的位置與條件進行正向爬軌。 + /// + /// 其實就是將反向爬軌的結果顛倒順序再給出來而已,省得使用者自己再顛倒一遍。 + /// - Parameters: + /// - at: 開始爬軌的位置。 + /// - score: 給定累計權重,非必填參數。預設值為 0。 + /// - nodesLimit: 限定最多只爬多少個節點。 + /// - balanced: 啟用平衡權重,在節點權重的基礎上根據節點幅位長度來加權。 + public func walk( + at location: Int, + score accumulatedScore: Double = 0.0, + nodesLimit: Int = 0, + balanced: Bool = false + ) -> [NodeAnchor] { + Array( + reverseWalk( + at: location, score: accumulatedScore, + nodesLimit: nodesLimit, balanced: balanced + ).reversed()) } - public func joinSeparator() -> String { mutJoinSeparator } + /// 對已給定的軌格按照給定的位置與條件進行反向爬軌。 + /// - Parameters: + /// - at: 開始爬軌的位置。 + /// - score: 給定累計權重,非必填參數。預設值為 0。 + /// - nodesLimit: 限定最多只爬多少個節點。 + /// - balanced: 
啟用平衡權重,在節點權重的基礎上根據節點幅位長度來加權。 + public func reverseWalk( + at location: Int, + score accumulatedScore: Double = 0.0, + nodesLimit: Int = 0, + balanced: Bool = false + ) -> [NodeAnchor] { + if location == 0 || location > mutGrid.width { + return [] as [NodeAnchor] + } + + var paths: [[NodeAnchor]] = [] + var nodes: [NodeAnchor] = mutGrid.nodesEndingAt(location: location) + + if balanced { + nodes.sort { + $0.balancedScore > $1.balancedScore + } + } + + for (i, n) in nodes.enumerated() { + // 只檢查前 X 個 NodeAnchor 是否有 node。 + // 這裡有 abs 是為了防止有白癡填負數。 + if abs(nodesLimit) > 0, i == abs(nodesLimit) - 1 { + break + } - public func grid() -> Grid { mutGrid } + var n = n + guard let nNode = n.node else { + continue + } + + n.accumulatedScore = accumulatedScore + nNode.score + + // 利用幅位長度來決定權重。 + // 這樣一來,例:「再見」比「在」與「見」的權重更高。 + if balanced { + let weightedScore: Double = (Double(n.spanningLength) - 1) * 2 + n.accumulatedScore += weightedScore + } + + var path: [NodeAnchor] = reverseWalk( + at: location - n.spanningLength, + score: n.accumulatedScore + ) + + path.insert(n, at: 0) + + paths.append(path) + + // 始終使用固定的候選字詞 + if balanced, nNode.score >= 0 { + break + } + } + + if !paths.isEmpty { + if var result = paths.first { + for value in paths { + if let vLast = value.last, let rLast = result.last { + if vLast.accumulatedScore > rLast.accumulatedScore { + result = value + } + } + } + return result + } + } + return [] as [NodeAnchor] + } - public func build() { - // if (mutLM == nil) { return } // 這個出不了 nil,所以註釋掉。 + // MARK: - Private functions + private func build() { let itrBegin: Int = (mutCursorIndex < mutMaximumBuildSpanLength) ? 
0 : mutCursorIndex - mutMaximumBuildSpanLength let itrEnd: Int = min(mutCursorIndex + mutMaximumBuildSpanLength, mutReadings.count) @@ -121,7 +240,7 @@ extension Megrez { break } let strSlice = mutReadings[p..<(p + q)] - let combinedReading: String = join(slice: strSlice, separator: mutJoinSeparator) + let combinedReading: String = join(slice: strSlice, separator: joinSeparator) if !mutGrid.hasMatchedNode(location: p, spanningLength: q, key: combinedReading) { let unigrams: [Unigram] = mutLM.unigramsFor(key: combinedReading) @@ -134,7 +253,7 @@ extension Megrez { } } - public func join(slice strSlice: ArraySlice, separator: String) -> String { + private func join(slice strSlice: ArraySlice, separator: String) -> String { var arrResult: [String] = [] for value in strSlice { arrResult.append(value) diff --git a/Sources/Megrez/1_Walker.swift b/Sources/Megrez/1_Walker.swift deleted file mode 100644 index 5bd934b..0000000 --- a/Sources/Megrez/1_Walker.swift +++ /dev/null @@ -1,123 +0,0 @@ -// Swiftified by (c) 2022 and onwards The vChewing Project (MIT-NTL License). -// Rebranded from (c) Lukhnos Liu's C++ library "Gramambular" (MIT License). -/* -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -1. The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -2. No trademark license is granted to use the trade names, trademarks, service -marks, or product names of Contributor, except as required to fulfill notice -requirements above. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -extension Megrez { - public class Walker { - var mutGrid: Grid - - public init(grid: Megrez.Grid = Megrez.Grid()) { - mutGrid = grid - } - - public func walk( - at location: Int, - score accumulatedScore: Double = 0.0, - nodesLimit: Int = 0, - balanced: Bool = false - ) -> [NodeAnchor] { - var arrReturn: [NodeAnchor] = [] - let arrReversedSource = reverseWalk( - at: location, score: accumulatedScore, - nodesLimit: nodesLimit, balanced: balanced - ).reversed() - - for neta in arrReversedSource { - arrReturn.append(neta) - } - - return arrReturn - } - - public func reverseWalk( - at location: Int, - score accumulatedScore: Double = 0.0, - nodesLimit: Int = 0, - balanced: Bool = false - ) -> [NodeAnchor] { - if location == 0 || location > mutGrid.width() { - return [] as [NodeAnchor] - } - - var paths: [[NodeAnchor]] = [] - var nodes: [NodeAnchor] = mutGrid.nodesEndingAt(location: location) - - if balanced { - nodes.sort { - $0.balancedScore > $1.balancedScore - } - } - - for (i, n) in nodes.enumerated() { - // 只檢查前 X 個 NodeAnchor 是否有 node。 - // 這裡有 abs 是為了防止有白癡填負數。 - if abs(nodesLimit) > 0, i == abs(nodesLimit) - 1 { - break - } - - var n = n - guard let nNode = n.node else { - continue - } - - n.accumulatedScore = accumulatedScore + nNode.score() - - // 利用 Spanning Length 來決定權重。 - // 這樣一來,例:「再見」比「在」與「見」的權重更高。 - if balanced { - let weightedScore: Double = (Double(n.spanningLength) - 1) * 2 - n.accumulatedScore += weightedScore - } - - var path: [NodeAnchor] = reverseWalk( - at: location - 
n.spanningLength, - score: n.accumulatedScore - ) - - path.insert(n, at: 0) - - paths.append(path) - - // 始終使用固定的候選字 - if balanced, nNode.score() >= 0 { - break - } - } - - if !paths.isEmpty { - if var result = paths.first { - for value in paths { - if let vLast = value.last, let rLast = result.last { - if vLast.accumulatedScore > rLast.accumulatedScore { - result = value - } - } - } - return result - } - } - return [] as [NodeAnchor] - } - } -} diff --git a/Sources/Megrez/2_Grid.swift b/Sources/Megrez/2_Grid.swift index 9f71f3b..da53256 100644 --- a/Sources/Megrez/2_Grid.swift +++ b/Sources/Megrez/2_Grid.swift @@ -24,17 +24,28 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ extension Megrez { + /// 軌格。 public class Grid { - var mutSpans: [Megrez.Span] + /// 幅位陣列。 + private var mutSpans: [Megrez.Span] + + /// 軌格的寬度,也就是其內的幅位陣列當中的幅位數量。 + var width: Int { mutSpans.count } public init() { mutSpans = [Megrez.Span]() } + /// 自我清空該軌格的內容。 public func clear() { mutSpans = [Megrez.Span]() } + /// 往該軌格的指定位置插入指定幅位長度的指定節點。 + /// - Parameters: + /// - node: 節點。 + /// - location: 位置。 + /// - spanningLength: 給定的幅位長度。 public func insertNode(node: Node, location: Int, spanningLength: Int) { if location >= mutSpans.count { let diff = location - mutSpans.count + 1 @@ -45,15 +56,23 @@ extension Megrez { mutSpans[location].insert(node: node, length: spanningLength) } + /// 給定索引鍵、位置、幅位長度,在該軌格內確認是否有對應的節點存在。 + /// - Parameters: + /// - location: 位置。 + /// - spanningLength: 給定的幅位長度。 + /// - key: 索引鍵。 public func hasMatchedNode(location: Int, spanningLength: Int, key: String) -> Bool { if location > mutSpans.count { return false } let n = mutSpans[location].node(length: spanningLength) - return n == nil ? false : key == n?.key() + return n == nil ? 
false : key == n?.key } + /// 在該軌格的指定位置擴增一個幅位。 + /// - Parameters: + /// - location: 位置。 public func expandGridByOneAt(location: Int) { // 這裡加入 abs 完全是一個防呆設計 mutSpans.insert(Span(), at: abs(location)) @@ -65,6 +84,9 @@ extension Megrez { } } + /// 在該軌格的指定位置減少一個幅位。 + /// - Parameters: + /// - location: 位置。 public func shrinkGridByOneAt(location: Int) { if location >= mutSpans.count { return @@ -77,8 +99,9 @@ extension Megrez { } } - public func width() -> Int { mutSpans.count } - + /// 給定位置,枚舉出所有在這個位置結尾的節點。 + /// - Parameters: + /// - location: 位置。 public func nodesEndingAt(location: Int) -> [NodeAnchor] { var results: [NodeAnchor] = [] if !mutSpans.isEmpty, location <= mutSpans.count { @@ -100,6 +123,9 @@ extension Megrez { return results } + /// 給定位置,枚舉出所有在這個位置結尾、或者橫跨該位置的節點。 + /// - Parameters: + /// - location: 位置。 public func nodesCrossingOrEndingAt(location: Int) -> [NodeAnchor] { var results: [NodeAnchor] = [] if !mutSpans.isEmpty, location <= mutSpans.count { @@ -126,14 +152,18 @@ extension Megrez { return results } - public func fixNodeSelectedCandidate(location: Int, value: String) -> NodeAnchor { + /// 將給定位置的節點的候選字詞改為與給定的字串一致的候選字詞。 + /// - Parameters: + /// - location: 位置。 + /// - value: 給定字串。 + @discardableResult public func fixNodeSelectedCandidate(location: Int, value: String) -> NodeAnchor { var node = NodeAnchor() for nodeAnchor in nodesCrossingOrEndingAt(location: location) { guard let theNode = nodeAnchor.node else { continue } - let candidates = theNode.candidates() - // Reset the candidate-fixed state of every node at the location. 
+ let candidates = theNode.candidates + // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 theNode.resetCandidate() for (i, candidate) in candidates.enumerated() { if candidate.value == value { @@ -146,13 +176,18 @@ extension Megrez { return node } + /// 將給定位置的節點的與給定的字串一致的候選字詞的權重複寫為給定權重數值。 + /// - Parameters: + /// - location: 位置。 + /// - value: 給定字串。 + /// - overridingScore: 給定權重數值。 public func overrideNodeScoreForSelectedCandidate(location: Int, value: String, overridingScore: Double) { for nodeAnchor in nodesCrossingOrEndingAt(location: location) { guard let theNode = nodeAnchor.node else { continue } - let candidates = theNode.candidates() - // Reset the candidate-fixed state of every node at the location. + let candidates = theNode.candidates + // 將該位置的所有節點的候選字詞鎖定狀態全部重設。 theNode.resetCandidate() for (i, candidate) in candidates.enumerated() { if candidate.value == value { @@ -164,3 +199,38 @@ extension Megrez { } } } + +// MARK: - DumpDOT-related functions. + +extension Megrez.Grid { + public var dumpDOT: String { + var sst = "digraph {\ngraph [ rankdir=LR ];\nBOS;\n" + for (p, span) in mutSpans.enumerated() { + for ni in 0...(span.maximumLength) { + guard let np: Megrez.Node = span.node(length: ni) else { + continue + } + if p == 0 { + sst += "BOS -> \(np.currentKeyValue.value);\n" + } + + sst += "\(np.currentKeyValue.value);\n" + + if (p + ni) < mutSpans.count { + let dstSpan = mutSpans[p + ni] + for q in 0...(dstSpan.maximumLength) { + if let dn = dstSpan.node(length: q) { + sst += np.currentKeyValue.value + " -> " + dn.currentKeyValue.value + ";\n" + } + } + } + + if (p + ni) == mutSpans.count { + sst += np.currentKeyValue.value + " -> EOS;\n" + } + } + } + sst += "EOS;\n}\n" + return sst + } +} diff --git a/Sources/Megrez/3_NodeAnchor.swift b/Sources/Megrez/3_NodeAnchor.swift index 938262f..0c7ffa9 100644 --- a/Sources/Megrez/3_NodeAnchor.swift +++ b/Sources/Megrez/3_NodeAnchor.swift @@ -24,19 +24,52 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ extension Megrez { + /// 節锚。 @frozen public struct NodeAnchor { + /// 節點。一個節锚內不一定有節點。 public var node: Node? + /// 節锚所在的位置。 public var location: Int = 0 + /// 幅位長度。 public var spanningLength: Int = 0 + /// 累計權重。 public var accumulatedScore: Double = 0.0 + /// 索引鍵的長度。 public var keyLength: Int { - node?.key().count ?? 0 + node?.key.count ?? 0 } + /// 將當前節點節锚成一個字串。 + public var printed: String { + var stream = "" + stream += "{@(" + String(location) + "," + String(spanningLength) + ")," + if let node = node { + stream += node.printed + } else { + stream += "null" + } + stream += "}" + return stream + } + + /// 獲取平衡權重。 public var balancedScore: Double { let weightedScore: Double = (Double(spanningLength) - 1) * 2 - let nodeScore: Double = node?.score() ?? 0 + let nodeScore: Double = node?.score ?? 0 return weightedScore + nodeScore } } } + +// MARK: - DumpDOT-related functions. + +extension Array where Element == Megrez.NodeAnchor { + /// 將節锚陣列列印成一個字串。 + public var printed: String { + var arrOutputContent = [""] + for anchor in self { + arrOutputContent.append(anchor.printed) + } + return arrOutputContent.joined(separator: "<-") + } +} diff --git a/Sources/Megrez/3_Span.swift b/Sources/Megrez/3_Span.swift index 5b5eee7..d99238a 100644 --- a/Sources/Megrez/3_Span.swift +++ b/Sources/Megrez/3_Span.swift @@ -24,23 +24,28 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ extension Megrez { + /// 幅位。 @frozen public struct Span { - private var mutLengthNodeMap: [Int: Megrez.Node] - private var mutMaximumLength: Int + /// 辭典:以節點長度為索引,以節點為資料值。 + private var mutLengthNodeMap: [Int: Megrez.Node] = [:] + /// 最大節點長度。 + private var mutMaximumLength: Int = 0 + + /// 公開:最長幅距(唯讀)。 var maximumLength: Int { mutMaximumLength } - public init() { - mutLengthNodeMap = [:] - mutMaximumLength = 0 - } - + /// 自我清空,各項參數歸零。 mutating func clear() { mutLengthNodeMap.removeAll() mutMaximumLength = 0 } + /// 往自身插入一個節點、及給定的節點長度。 + /// - Parameters: + /// - node: 節點。 + /// - length: 給定的節點長度。 mutating func insert(node: Node, length: Int) { mutLengthNodeMap[length] = node if length > mutMaximumLength { @@ -48,6 +53,9 @@ extension Megrez { } } + /// 移除任何比給定的長度更長的節點。 + /// - Parameters: + /// - length: 給定的節點長度。 mutating func removeNodeOfLengthGreaterThan(_ length: Int) { if length > mutMaximumLength { return } var max = 0 @@ -67,6 +75,9 @@ extension Megrez { mutMaximumLength = max } + /// 給定節點長度,獲取節點。 + /// - Parameters: + /// - length: 給定的節點長度。 public func node(length: Int) -> Node? { mutLengthNodeMap[length] } diff --git a/Sources/Megrez/4_Node.swift b/Sources/Megrez/4_Node.swift index be518b3..acb1697 100644 --- a/Sources/Megrez/4_Node.swift +++ b/Sources/Megrez/4_Node.swift @@ -24,55 +24,69 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ extension Megrez { + /// 節點。 public class Node { - let mutLM: LanguageModel - var mutKey: String - var mutScore: Double = 0 - var mutUnigrams: [Unigram] - var mutCandidates: [KeyValuePair] - var mutValueUnigramIndexMap: [String: Int] - var mutPrecedingBigramMap: [KeyValuePair: [Megrez.Bigram]] + /// 當前節點對應的語言模型。 + private let mutLM: LanguageModel = .init() + /// 鍵。 + private var mutKey: String = "" + /// 當前節點的當前被選中的候選字詞「在該節點內的」目前的權重。 + private var mutScore: Double = 0 + /// 單元圖陣列。 + private var mutUnigrams: [Unigram] + /// 雙元圖陣列。 + private var mutBigrams: [Bigram] + /// 候選字詞陣列,以鍵值陣列的形式存在。 + private var mutCandidates: [KeyValuePair] = [] + /// 專門「用單元圖資料值來調查索引值」的辭典。 + private var mutValueUnigramIndexMap: [String: Int] = [:] + /// 專門「用給定鍵值來取對應的雙元圖陣列」的辭典。 + private var mutPrecedingBigramMap: [KeyValuePair: [Megrez.Bigram]] = [:] + /// 狀態標記變數,用來記載當前節點是否處於候選字詞鎖定狀態。 + private var mutCandidateFixed: Bool = false + /// 用來登記「當前選中的單元圖」的索引值的變數。 + private var mutSelectedUnigramIndex: Int = 0 + /// 用來登記要施加給「『被標記為選中狀態』的候選字詞」的複寫權重的數值。 + private let kSelectedCandidateScore: Double = 99 + /// 將當前節點列印成一個字串。 + public var printed: String { + "(node,key:\(mutKey),fixed:\(mutCandidateFixed ? "true" : "false"),selected:\(mutSelectedUnigramIndex),\(mutUnigrams.printed))" + } - var mutCandidateFixed: Bool = false - var mutSelectedUnigramIndex: Int = 0 + /// 公開:候選字詞陣列(唯讀),以鍵值陣列的形式存在。 + var candidates: [KeyValuePair] { mutCandidates } + /// 公開:用來登記「當前選中的單元圖」的索引值的變數(唯讀)。 + var isCandidateFixed: Bool { mutCandidateFixed } + + /// 公開:鍵(唯讀)。 + var key: String { mutKey } + /// 公開:當前節點的當前被選中的候選字詞「在該節點內的」目前的權重(唯讀)。 + var score: Double { mutScore } + /// 公開:當前被選中的候選字詞的鍵值配對。 + var currentKeyValue: KeyValuePair { + mutSelectedUnigramIndex >= mutUnigrams.count ? KeyValuePair() : mutCandidates[mutSelectedUnigramIndex] + } - let kSelectedCandidateScore: Double = 99 + /// 公開:給出當前單元圖陣列內最高的權重數值。 + var highestUnigramScore: Double { mutUnigrams.isEmpty ? 
0.0 : mutUnigrams[0].score } + /// 初期化一個節點。 + /// - Parameters: + /// - key: 索引鍵。 + /// - unigrams: 單元圖陣列。 + /// - bigrams: 雙元圖陣列(非必填)。 public init(key: String, unigrams: [Megrez.Unigram], bigrams: [Megrez.Bigram] = []) { - mutLM = LanguageModel() - mutKey = key - mutScore = 0 - mutUnigrams = unigrams - mutCandidates = [] - mutValueUnigramIndexMap = [:] - mutPrecedingBigramMap = [:] + mutBigrams = bigrams - mutCandidateFixed = false - mutSelectedUnigramIndex = 0 - - if bigrams == [] { - node(key: key, unigrams: unigrams, bigrams: bigrams) - } else { - node(key: key, unigrams: unigrams) - } - } - - public func node(key: String, unigrams: [Megrez.Unigram], bigrams: [Megrez.Bigram] = []) { - var unigrams = unigrams - mutKey = key - unigrams.sort { + mutUnigrams.sort { $0.score > $1.score } - if !mutUnigrams.isEmpty { - mutScore = mutUnigrams[0].score - } - - for (i, theGram) in unigrams.enumerated() { - mutValueUnigramIndexMap[theGram.keyValue.value] = i - mutCandidates.append(theGram.keyValue) + for (i, gram) in mutUnigrams.enumerated() { + mutValueUnigramIndexMap[gram.keyValue.value] = i + mutCandidates.append(gram.keyValue) } for gram in bigrams { @@ -80,11 +94,14 @@ extension Megrez { } } + /// 對擁有「給定的前述鍵值陣列」的節點提權。 + /// - Parameters: + /// - precedingKeyValues: 前述鍵值陣列。 public func primeNodeWith(precedingKeyValues: [KeyValuePair]) { var newIndex = mutSelectedUnigramIndex var max = mutScore - if !isCandidateFixed() { + if !isCandidateFixed { for neta in precedingKeyValues { let bigrams = mutPrecedingBigramMap[neta] ?? [] for bigram in bigrams { @@ -107,16 +124,17 @@ extension Megrez { } } - public func isCandidateFixed() -> Bool { mutCandidateFixed } - - public func candidates() -> [KeyValuePair] { mutCandidates } - + /// 選中位於給定索引位置的候選字詞。 + /// - Parameters: + /// - index: 索引位置。 + /// - fix: 是否將當前解點標記為「候選詞已鎖定」的狀態。 public func selectCandidateAt(index: Int = 0, fix: Bool = false) { mutSelectedUnigramIndex = index >= mutUnigrams.count ? 
0 : index mutCandidateFixed = fix mutScore = kSelectedCandidateScore } + /// 重設該節點的候選字詞狀態。 public func resetCandidate() { mutSelectedUnigramIndex = 0 mutCandidateFixed = false @@ -125,16 +143,19 @@ extension Megrez { } } + /// 選中位於給定索引位置的候選字詞、且施加給定的權重。 + /// - Parameters: + /// - index: 索引位置。 + /// - score: 給定權重條件。 public func selectFloatingCandidateAt(index: Int, score: Double) { mutSelectedUnigramIndex = index >= mutUnigrams.count ? 0 : index mutCandidateFixed = false mutScore = score } - public func key() -> String { mutKey } - - public func score() -> Double { mutScore } - + /// 藉由給定的候選字詞字串,找出在庫的單元圖權重數值。沒有的話就找零。 + /// - Parameters: + /// - candidate: 給定的候選字詞字串。 public func scoreFor(candidate: String) -> Double { for unigram in mutUnigrams { if unigram.keyValue.value == candidate { @@ -144,14 +165,6 @@ extension Megrez { return 0.0 } - public func currentKeyValue() -> KeyValuePair { - mutSelectedUnigramIndex >= mutUnigrams.count ? KeyValuePair() : mutCandidates[mutSelectedUnigramIndex] - } - - public func highestUnigramScore() -> Double { - mutUnigrams.isEmpty ? 0.0 : mutUnigrams[0].score - } - public static func == (lhs: Node, rhs: Node) -> Bool { lhs.mutUnigrams == rhs.mutUnigrams && lhs.mutCandidates == rhs.mutCandidates && lhs.mutValueUnigramIndexMap == rhs.mutValueUnigramIndexMap diff --git a/Sources/Megrez/5_LanguageModel.swift b/Sources/Megrez/5_LanguageModel.swift index 383cdbc..776f644 100644 --- a/Sources/Megrez/5_LanguageModel.swift +++ b/Sources/Megrez/5_LanguageModel.swift @@ -24,19 +24,23 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ extension Megrez { - // 這裡充其量只是框架,回頭實際使用時需要派生一個型別、且重寫相關函數。 - // 這裡寫了一點假內容,不然有些 Swift 格式化工具會破壞掉函數的參數設計。 + /// 語言模型框架,回頭實際使用時需要派生一個型別、且重寫相關函數。 open class LanguageModel { public init() {} + // 這裡寫了一點假內容,不然有些 Swift 格式化工具會破壞掉函數的參數設計。 + + /// 給定鍵,讓語言模型找給一筆單元圖。 open func unigramsFor(key: String) -> [Megrez.Unigram] { key.isEmpty ? 
[Megrez.Unigram]() : [Megrez.Unigram]() } + /// 給定當前鍵與前述鍵,讓語言模型找給一筆雙元圖。 open func bigramsForKeys(precedingKey: String, key: String) -> [Megrez.Bigram] { precedingKey == key ? [Megrez.Bigram]() : [Megrez.Bigram]() } + /// 給定鍵,確認語言模型內是否有對應的單元圖。 open func hasUnigramsFor(key: String) -> Bool { key.count != 0 } diff --git a/Sources/Megrez/6_Bigram.swift b/Sources/Megrez/6_Bigram.swift index f934f1a..7413584 100644 --- a/Sources/Megrez/6_Bigram.swift +++ b/Sources/Megrez/6_Bigram.swift @@ -24,17 +24,28 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ extension Megrez { + /// 雙元圖。 @frozen public struct Bigram: Equatable { + /// 當前鍵值。 public var keyValue: KeyValuePair + /// 前述鍵值。 public var precedingKeyValue: KeyValuePair + /// 權重。 public var score: Double - // var paired: String + /// 將當前雙元圖列印成一個字串。 + public var printed: String { + "(" + keyValue.printed + "|" + precedingKeyValue.printed + "," + String(score) + ")" + } + /// 初期化一筆「雙元圖」。一筆雙元圖由一組前述鍵值配對、一組當前鍵值配對、與一筆權重數值組成。 + /// - Parameters: + /// - precedingKeyValue: 前述鍵值。 + /// - keyValue: 當前鍵值。 + /// - score: 權重(雙精度小數)。 public init(precedingKeyValue: KeyValuePair, keyValue: KeyValuePair, score: Double) { self.keyValue = keyValue self.precedingKeyValue = precedingKeyValue self.score = score - // paired = "(" + keyValue.paired + "|" + precedingKeyValue.paired + "," + String(score) + ")" } public func hash(into hasher: inout Hasher) { @@ -44,16 +55,6 @@ extension Megrez { // hasher.combine(paired) } - // static func getPairedBigrams(grams: [Bigram]) -> String { - // var arrOutputContent = [""] - // var index = 0 - // for gram in grams { - // arrOutputContent.append(contentsOf: [String(index) + "=>" + gram.paired]) - // index += 1 - // } - // return "[" + String(grams.count) + "]=>{" + arrOutputContent.joined(separator: ",") + "}" - // } - public static func == (lhs: Bigram, rhs: Bigram) -> Bool { lhs.precedingKeyValue == rhs.precedingKeyValue && lhs.keyValue == rhs.keyValue && lhs.score == rhs.score } @@ 
-72,3 +73,16 @@ extension Megrez { } } } + +// MARK: - DumpDOT-related functions. + +extension Array where Element == Megrez.Bigram { + /// 將雙元圖陣列列印成一個字串。 + public var printed: String { + var arrOutputContent = [""] + for (index, gram) in enumerated() { + arrOutputContent.append(contentsOf: [String(index) + "=>" + gram.printed]) + } + return "[" + String(count) + "]=>{" + arrOutputContent.joined(separator: ",") + "}" + } +} diff --git a/Sources/Megrez/6_Unigram.swift b/Sources/Megrez/6_Unigram.swift index 793b5db..53fe75b 100644 --- a/Sources/Megrez/6_Unigram.swift +++ b/Sources/Megrez/6_Unigram.swift @@ -24,21 +24,29 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ extension Megrez { + /// 單元圖。 @frozen public struct Unigram: Equatable { + /// 鍵值。 public var keyValue: KeyValuePair + /// 權重。 public var score: Double - // var paired: String + /// 將當前單元圖列印成一個字串。 + public var printed: String { + "(" + keyValue.printed + "," + String(score) + ")" + } + /// 初期化一筆「單元圖」。一筆單元圖由一組鍵值配對與一筆權重數值組成。 + /// - Parameters: + /// - keyValue: 鍵值。 + /// - score: 權重(雙精度小數)。 public init(keyValue: KeyValuePair, score: Double) { self.keyValue = keyValue self.score = score - // paired = "(" + keyValue.paired + "," + String(score) + ")" } public func hash(into hasher: inout Hasher) { hasher.combine(keyValue) hasher.combine(score) - // hasher.combine(paired) } // 這個函數不再需要了。 @@ -46,16 +54,6 @@ extension Megrez { a.score > b.score } - // static func getPairedUnigrams(grams: [Unigram]) -> String { - // var arrOutputContent = [""] - // var index = 0 - // for gram in grams { - // arrOutputContent.append(contentsOf: [String(index) + "=>" + gram.paired]) - // index += 1 - // } - // return "[" + String(grams.count) + "]=>{" + arrOutputContent.joined(separator: ",") + "}" - // } - public static func == (lhs: Unigram, rhs: Unigram) -> Bool { lhs.keyValue == rhs.keyValue && lhs.score == rhs.score } @@ -73,3 +71,16 @@ extension Megrez { } } } + +// MARK: - DumpDOT-related 
functions. + +extension Array where Element == Megrez.Unigram { + /// 將單元圖陣列列印成一個字串。 + public var printed: String { + var arrOutputContent = [""] + for (index, gram) in enumerated() { + arrOutputContent.append(contentsOf: [String(index) + "=>" + gram.printed]) + } + return "[" + String(count) + "]=>{" + arrOutputContent.joined(separator: ",") + "}" + } +} diff --git a/Sources/Megrez/7_KeyValuePair.swift b/Sources/Megrez/7_KeyValuePair.swift index b10a9e8..1756c24 100644 --- a/Sources/Megrez/7_KeyValuePair.swift +++ b/Sources/Megrez/7_KeyValuePair.swift @@ -24,21 +24,29 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ extension Megrez { + /// 鍵值配對。 @frozen public struct KeyValuePair: Equatable, Hashable, Comparable { + /// 鍵。一般情況下用來放置讀音等可以用來作為索引的內容。 public var key: String + /// 資料值。 public var value: String - // public var paired: String + /// 將當前鍵值列印成一個字串。 + public var printed: String { + "(" + key + "," + value + ")" + } + /// 初期化一組鍵值配對 + /// - Parameters: + /// - key: 鍵。一般情況下用來放置讀音等可以用來作為索引的內容。 + /// - value: 資料值。 public init(key: String = "", value: String = "") { self.key = key self.value = value - // paired = "(" + key + "," + value + ")" } public func hash(into hasher: inout Hasher) { hasher.combine(key) hasher.combine(value) - // hasher.combine(paired) } public static func == (lhs: KeyValuePair, rhs: KeyValuePair) -> Bool { diff --git a/Tests/MegrezTests/MegrezTests.swift b/Tests/MegrezTests/MegrezTests.swift index d085b06..7b8f795 100644 --- a/Tests/MegrezTests/MegrezTests.swift +++ b/Tests/MegrezTests/MegrezTests.swift @@ -37,60 +37,66 @@ final class MegrezTests: XCTestCase { builder.insertReadingAtCursor(reading: "gao1") builder.insertReadingAtCursor(reading: "ji4") - builder.setCursorIndex(newIndex: 1) + builder.cursorIndex = 1 builder.insertReadingAtCursor(reading: "ke1") - builder.setCursorIndex(newIndex: 1) + builder.cursorIndex = 1 builder.deleteReadingToTheFrontOfCursor() builder.insertReadingAtCursor(reading: "ke1") 
- builder.setCursorIndex(newIndex: 0) + builder.cursorIndex = 0 builder.deleteReadingToTheFrontOfCursor() builder.insertReadingAtCursor(reading: "gao1") - builder.setCursorIndex(newIndex: builder.length()) + builder.cursorIndex = builder.length builder.insertReadingAtCursor(reading: "gong1") builder.insertReadingAtCursor(reading: "si1") builder.insertReadingAtCursor(reading: "de5") builder.insertReadingAtCursor(reading: "nian2") builder.insertReadingAtCursor(reading: "zhong1") - _ = builder.grid().fixNodeSelectedCandidate(location: 7, value: "年終") + builder.grid.fixNodeSelectedCandidate(location: 7, value: "年終") builder.insertReadingAtCursor(reading: "jiang3") builder.insertReadingAtCursor(reading: "jin1") builder.insertReadingAtCursor(reading: "ni3") builder.insertReadingAtCursor(reading: "zhe4") builder.insertReadingAtCursor(reading: "yang4") - let walker = Megrez.Walker(grid: builder.grid()) - - var walked = walker.walk(at: builder.grid().width(), score: 0.0, nodesLimit: 3, balanced: true) + var walked = builder.walk(at: builder.grid.width, score: 0.0, nodesLimit: 3, balanced: true) // 這裡模擬一個輸入法的常見情況:每次敲一個字詞都會 walk,然後你回頭編輯完一些內容之後又會立刻重新 walk。 // 如果只在這裡測試第一遍 walk 的話,測試通過了也無法測試之後再次 walk 是否會正常。 - builder.setCursorIndex(newIndex: 1) + builder.cursorIndex = 1 builder.deleteReadingToTheFrontOfCursor() // 於是咱們 walk 第二遍 - walked = walker.walk(at: builder.grid().width(), score: 0.0, nodesLimit: 3, balanced: true) + walked = builder.walk(at: builder.grid.width, score: 0.0, nodesLimit: 3, balanced: true) XCTAssert(!walked.isEmpty) // 做好第三遍的準備,這次咱們來一次插入性編輯 builder.insertReadingAtCursor(reading: "ke1") // 重點測試這句是否正常,畢竟是在 walked 過的節點內進行插入編輯 // 於是咱們 walk 第三遍,這一遍會直接曝露「上述修改是否有對 builder 造成了破壞性的損失」所以很重要 - walked = walker.walk(at: builder.grid().width(), score: 0.0, nodesLimit: 3, balanced: true) + walked = builder.walk(at: builder.grid.width, score: 0.0, nodesLimit: 3, balanced: true) XCTAssert(!walked.isEmpty) var composed: [String] = [] for phrase in walked { if let node = 
phrase.node { - composed.append(node.currentKeyValue().value) + composed.append(node.currentKeyValue.value) } } print(composed) let correctResult = ["高科技", "公司", "的", "年終", "獎金", "你", "這樣"] print(" - 上述列印結果理應於下面這行一致:") print(correctResult) - XCTAssertEqual(composed, correctResult) + + // 測試 DumpDOT + builder.cursorIndex = builder.length + builder.deleteReadingAtTheRearOfCursor() + builder.deleteReadingAtTheRearOfCursor() + builder.deleteReadingAtTheRearOfCursor() + let expectedDumpDOT = + "digraph {\ngraph [ rankdir=LR ];\nBOS;\nBOS -> 高;\n高;\n高 -> 科;\n高 -> 科技;\nBOS -> 高科技;\n高科技;\n高科技 -> 工;\n高科技 -> 公司;\n科;\n科 -> 際;\n科 -> 濟公;\n科技;\n科技 -> 工;\n科技 -> 公司;\n際;\n際 -> 工;\n際 -> 公司;\n濟公;\n濟公 -> 斯;\n工;\n工 -> 斯;\n公司;\n公司 -> 的;\n斯;\n斯 -> 的;\n的;\n的 -> 年;\n的 -> 年終;\n年;\n年 -> 中;\n年終;\n年終 -> 獎;\n年終 -> 獎金;\n中;\n中 -> 獎;\n中 -> 獎金;\n獎;\n獎 -> 金;\n獎金;\n獎金 -> EOS;\n金;\n金 -> EOS;\nEOS;\n}\n" + XCTAssertEqual(builder.grid.dumpDOT, expectedDumpDOT) } // MARK: - Test Word Segmentation @@ -98,7 +104,7 @@ final class MegrezTests: XCTestCase { func testWordSegmentation() throws { print("// 開始測試語句分節處理") let lmTestSegmentation = SimpleLM(input: strSampleData, swapKeyValue: true) - let builder = Megrez.BlockReadingBuilder(lm: lmTestSegmentation) + let builder = Megrez.BlockReadingBuilder(lm: lmTestSegmentation, separator: "") builder.insertReadingAtCursor(reading: "高") builder.insertReadingAtCursor(reading: "科") @@ -111,13 +117,11 @@ final class MegrezTests: XCTestCase { builder.insertReadingAtCursor(reading: "獎") builder.insertReadingAtCursor(reading: "金") - let walker = Megrez.Walker(grid: builder.grid()) - var walked: [Megrez.NodeAnchor] = walker.reverseWalk(at: builder.grid().width(), score: 0.0) - walked = walked.reversed() + let walked = Array(builder.reverseWalk(at: builder.grid.width, score: 0.0).reversed()) var segmented: [String] = [] for phrase in walked { - if let key = phrase.node?.currentKeyValue().key { + if let key = phrase.node?.currentKeyValue.key { segmented.append(key) } } @@ 
-143,10 +147,10 @@ class SimpleLM: Megrez.LanguageModel { continue } - let linestream = line.components(separatedBy: " ") - let col0 = linestream[0] - let col1 = linestream[1] - let col2 = linestream[2] + let linestream = line.split(separator: " ") + let col0 = String(linestream[0]) + let col1 = String(linestream[1]) + let col2 = Double(linestream[2]) ?? 0.0 var u = Megrez.Unigram(keyValue: Megrez.KeyValuePair(), score: 0) @@ -158,7 +162,7 @@ class SimpleLM: Megrez.LanguageModel { u.keyValue.value = col1 } - u.score = Double(col2)! + u.score = col2 mutDatabase[u.keyValue.key, default: []].append(u) } } @@ -167,7 +171,7 @@ class SimpleLM: Megrez.LanguageModel { if let f = mutDatabase[key] { return f } else { - return [Megrez.Unigram]() + return [Megrez.Unigram]().sorted { $0.score > $1.score } } } @@ -178,7 +182,7 @@ class SimpleLM: Megrez.LanguageModel { // MARK: - 用以測試的詞頻數據 -let strSampleData = #""" +private let strSampleData = #""" # # 下述詞頻資料取自 libTaBE 資料庫 (http://sourceforge.net/projects/libtabe/) # (2002 最終版). 
該專案於 1999 年由 Pai-Hsiang Hsiao 發起、以 BSD 授權發行。 @@ -187,28 +191,28 @@ ni3 你 -6.000000 // Non-LibTaBE zhe4 這 -6.000000 // Non-LibTaBE yang4 樣 -6.000000 // Non-LibTaBE si1 絲 -9.495858 -si1 思 -9.00644 +si1 思 -9.006414 si1 私 -99.000000 si1 斯 -8.091803 si1 司 -99.000000 -si1 嘶 -3.53987 -si1 撕 -2.259095 -gao1 高 -7.17551 +si1 嘶 -13.513987 +si1 撕 -12.259095 +gao1 高 -7.171551 ke1 顆 -10.574273 ke1 棵 -11.504072 ke1 刻 -10.450457 ke1 科 -7.171052 ke1 柯 -99.000000 gao1 膏 -11.928720 -gao1 篙 -3.624335 -gao1 糕 -2.390804 +gao1 篙 -13.624335 +gao1 糕 -12.390804 de5 的 -3.516024 di2 的 -3.516024 di4 的 -3.516024 zhong1 中 -5.809297 de5 得 -7.427179 gong1 共 -8.381971 -gong1 供 -8.50463 +gong1 供 -8.501463 ji4 既 -99.000000 jin1 今 -8.034095 gong1 紅 -8.858181 @@ -220,7 +224,7 @@ zhong1 終 -99.000000 ji4 記 -99.000000 ji4 寄 -99.000000 jin1 斤 -99.000000 -ji4 繼 -9.75317 +ji4 繼 -9.715317 ji4 計 -7.926683 ji4 暨 -8.373022 zhong1 鐘 -9.877580 @@ -242,23 +246,23 @@ ji4 忌 -99.000000 ji4 技 -8.450826 jin1 筋 -11.074890 gong1 躬 -99.000000 -ji4 冀 -2.045357 +ji4 冀 -12.045357 zhong1 忠 -99.000000 ji4 妓 -99.000000 ji4 濟 -9.517568 -ji4 薊 -2.02587 +ji4 薊 -12.021587 jin1 巾 -99.000000 -jin1 襟 -2.784206 -nian2 年 -6.08655 +jin1 襟 -12.784206 +nian2 年 -6.086515 jiang3 講 -9.164384 jiang3 獎 -8.690941 -jiang3 蔣 -10.27828 +jiang3 蔣 -10.127828 nian2 黏 -11.336864 nian2 粘 -11.285740 -jiang3 槳 -2.492933 +jiang3 槳 -12.492933 gong1si1 公司 -6.299461 -ke1ji4 科技 -6.73663 -ji4gong1 濟公 -3.336653 +ke1ji4 科技 -6.736613 +ji4gong1 濟公 -13.336653 jiang3jin1 獎金 -10.344678 nian2zhong1 年終 -11.668947 nian2zhong1 年中 -11.373044