Skip to content

Commit

Permalink
Refactoring walker, etc. (#102)
Browse files Browse the repository at this point in the history
* Refactor Vertex and VertexSpans.

* Deprecating NodeAnchor, removing unnecessary stable-sorts.

* Add unit tests for fetchCandidates().
  • Loading branch information
ShikiSuen authored Mar 21, 2023
1 parent aa1c8ef commit f6a2c2b
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 110 deletions.
42 changes: 20 additions & 22 deletions Sources/Megrez/2_Walker.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,39 +18,27 @@ public extension Megrez.Compositor {
defer { walkedNodes = result }
guard !spans.isEmpty else { return (result, true) }

var vertexSpans = [[Vertex]]()
spans.forEach { _ in
vertexSpans.append(.init())
}

spans.enumerated().forEach { i, span in
(1 ... max(span.maxLength, 1)).forEach { j in
guard let theNode = span[j] else { return }
vertexSpans[i].append(.init(node: theNode))
}
}
var vertexSpans: [[Int: Vertex]] = spans.map(\.asVertexSpan)

let terminal = Vertex(node: .init(keyArray: ["_TERMINAL_"]))
var root = Vertex(node: .init(keyArray: ["_ROOT_"]))
root.distance = 0

vertexSpans.enumerated().forEach { i, vertexSpan in
vertexSpan.forEach { vertex in
let nextVertexPosition = i + vertex.node.spanLength
vertexSpans.enumerated().forEach { location, vertexSpan in
vertexSpan.values.forEach { vertex in
let nextVertexPosition = location + vertex.node.spanLength
if nextVertexPosition == vertexSpans.count {
vertex.edges.append(terminal)
return
}
vertexSpans[nextVertexPosition].forEach { vertex.edges.append($0) }
vertexSpans[nextVertexPosition].values.forEach { vertex.edges.append($0) }
}
}

root.distance = 0
root.edges.append(contentsOf: vertexSpans[0])
root.edges.append(contentsOf: vertexSpans[0].values)

var ordered = topologicalSort(root: &root)
ordered.reversed().enumerated().forEach { j, neta in
neta.edges.indices.forEach { relax(u: neta, v: &neta.edges[$0]) }
ordered[j] = neta
topologicalSort(root: &root).reversed().forEach { neta in
neta.edges.indices.forEach { neta.relax(target: &neta.edges[$0]) }
}

var iterated = terminal
Expand All @@ -64,7 +52,6 @@ public extension Megrez.Compositor {
}

// 清理內容,否則會有記憶體洩漏。
ordered.removeAll()
vertexSpans.removeAll()
iterated.destroy()
root.destroy()
Expand All @@ -84,3 +71,14 @@ public extension Megrez.Compositor {
return (result, true)
}
}

extension Megrez.SpanUnit {
  /// Builds a vertex dictionary out of this span unit's node dictionary.
  ///
  /// Every key/node pair visited by `forEach` produces one entry stored under
  /// the same key, with the node wrapped in a newly created `Vertex`.
  var asVertexSpan: [Int: Megrez.Compositor.Vertex] {
    var vertexTable: [Int: Megrez.Compositor.Vertex] = [:]
    forEach { key, node in
      vertexTable[key] = Megrez.Compositor.Vertex(node: node)
    }
    return vertexTable
  }
}
44 changes: 6 additions & 38 deletions Sources/Megrez/3_KeyValuePaired.swift
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,7 @@ public extension Megrez.Compositor {
location -= 1
}
location = max(min(location, keys.count - 1), 0)
let anchors: [NodeAnchor] = fetchOverlappingNodes(at: location).stableSorted {
// 按照讀音的長度(幅位長度)來給節點排序。
$0.spanLength > $1.spanLength
}
let anchors: [(location: Int, node: Megrez.Node)] = fetchOverlappingNodes(at: location)
let keyAtCursor = keys[location]
anchors.forEach { theAnchor in
let theNode = theAnchor.node
Expand All @@ -120,11 +117,11 @@ public extension Megrez.Compositor {
// 得加上這道篩選,不然會出現很多無效結果。
if !theNode.keyArray.contains(keyAtCursor) { return }
case .beginAt:
guard theAnchor.spanIndex == location else { return }
guard theAnchor.location == location else { return }
case .endAt:
guard theNode.keyArray.last == keyAtCursor else { return }
switch theNode.spanLength {
case 2... where theAnchor.spanIndex + theAnchor.spanLength - 1 != location: return
case 2... where theAnchor.location + theAnchor.node.spanLength - 1 != location: return
default: break
}
}
Expand Down Expand Up @@ -178,8 +175,8 @@ public extension Megrez.Compositor {
-> Bool
{
let location = max(min(location, keys.count), 0) // 防呆
var arrOverlappedNodes: [NodeAnchor] = fetchOverlappingNodes(at: min(keys.count - 1, location))
var overridden: NodeAnchor?
var arrOverlappedNodes: [(location: Int, node: Megrez.Node)] = fetchOverlappingNodes(at: min(keys.count - 1, location))
var overridden: (location: Int, node: Megrez.Node)?
for anchor in arrOverlappedNodes {
if keyArray != nil, anchor.node.keyArray != keyArray { continue }
if !anchor.node.selectOverrideUnigram(value: value, type: type) { continue }
Expand All @@ -189,7 +186,7 @@ public extension Megrez.Compositor {

guard let overridden = overridden else { return false } // 啥也不覆寫。

(overridden.spanIndex ..< min(spans.count, overridden.spanIndex + overridden.node.spanLength)).forEach { i in
(overridden.location ..< min(spans.count, overridden.location + overridden.node.spanLength)).forEach { i in
/// 咱們還得弱化所有在相同的幅位座標的節點的複寫權重。舉例說之前爬軌的結果是「A BC」
/// 且 A 與 BC 都是被覆寫的結果,然後使用者現在在與 A 相同的幅位座標位置
/// 選了「DEF」,那麼 BC 的覆寫狀態就有必要重設(但 A 不用重設)。
Expand All @@ -208,32 +205,3 @@ public extension Megrez.Compositor {
return true
}
}

// MARK: - Stable Sort Extension

// Reference: https://stackoverflow.com/a/50545761/4162914

private extension Sequence {
  /// Returns the elements sorted by the given predicate, with the original
  /// offset used as a tie-breaker so that elements which compare as equal keep
  /// their original relative order.
  ///
  /// - Parameter areInIncreasingOrder: Returns `true` when its first argument
  ///   has to be ordered before its second argument.
  /// - Returns: An array holding the sorted elements.
  func stableSorted(
    by areInIncreasingOrder: (Element, Element) throws -> Bool
  ) rethrows -> [Element] {
    let indexed = Array(enumerated())
    let ordered = try indexed.sorted { lhs, rhs in
      if try areInIncreasingOrder(lhs.element, rhs.element) { return true }
      // Neither strictly precedes the other unless the reverse comparison
      // says so; fall back to the original offsets to break the tie.
      guard lhs.offset < rhs.offset else { return false }
      return try !areInIncreasingOrder(rhs.element, lhs.element)
    }
    return ordered.map { $0.element }
  }
}

// MARK: - Bool Extension (Private)

extension Bool {
  /// The logical inverse of this value: `true` for `false`, and `false` for `true`.
  var negative: Bool {
    self == false
  }
}
27 changes: 21 additions & 6 deletions Sources/Megrez/4_SpanUnit.swift
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,15 @@ extension Megrez.Compositor {
/// 找出所有與該位置重疊的節點。其返回值為一個節錨陣列(包含節點、以及其起始位置)。
/// - Parameter location: 游標位置。
/// - Returns: 一個包含所有與該位置重疊的節點的陣列。
func fetchOverlappingNodes(at givenLocation: Int) -> [NodeAnchor] {
var results = [NodeAnchor]()
public func fetchOverlappingNodes(at givenLocation: Int) -> [(location: Int, node: Megrez.Node)] {
var results = [(location: Int, node: Megrez.Node)]()
let givenLocation = max(0, min(givenLocation, keys.count - 1))
guard !spans.isEmpty else { return results }

// 先獲取該位置的所有單字節點。
(1 ... max(spans[givenLocation].maxLength, 1)).forEach { theSpanLength in
guard let node = spans[givenLocation][theSpanLength] else { return }
guard !node.keyArray.joined().isEmpty else { return }
results.append(.init(node: node, spanIndex: givenLocation))
Self.insertAnchor(spanIndex: givenLocation, node: node, to: &results)
}

// 再獲取以當前位置結尾或開頭的節點。
Expand All @@ -83,11 +82,27 @@ extension Megrez.Compositor {
guard A <= B else { return }
(A ... B).forEach { theLength in
guard let node = spans[theLocation][theLength] else { return }
guard !node.keyArray.joined().isEmpty else { return }
results.append(.init(node: node, spanIndex: theLocation))
Self.insertAnchor(spanIndex: theLocation, node: node, to: &results)
}
}

return results
}

/// Helper for `fetchOverlappingNodes()`: inserts a node anchor into the result
/// container at the position that keeps longer spans in front of shorter ones
/// (assuming the container is already ordered that way).
///
/// Nodes whose joined key array is empty are ignored. The new anchor is placed
/// in front of the first existing anchor whose span length is not greater than
/// its own. When no such anchor exists (which includes the empty-container
/// case) the anchor is appended at the end.
///
/// - Remark: The previous implementation iterated over `0 ... count` and
///   subscripted the container at `count`, which traps whenever every existing
///   anchor is longer than the one being inserted.
/// - Parameters:
///   - location: The span index stored as the anchor's `location`.
///   - node: The node to wrap into the anchor.
///   - targetContainer: The anchor array that receives the new anchor.
private static func insertAnchor(
  spanIndex location: Int, node: Megrez.Node,
  to targetContainer: inout [(location: Int, node: Megrez.Node)]
) {
  guard !node.keyArray.joined().isEmpty else { return }
  let anchor = (location: location, node: node)
  // First slot whose occupant is not longer than the new anchor; if every
  // occupant is longer (or there is none), the anchor goes to the very end.
  let insertionIndex = targetContainer.firstIndex(where: {
    $0.node.spanLength <= anchor.node.spanLength
  }) ?? targetContainer.endIndex
  targetContainer.insert(anchor, at: insertionIndex)
}
}
34 changes: 17 additions & 17 deletions Sources/Megrez/5_Vertex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,23 @@ extension Megrez.Compositor {
edges.removeAll()
node = .init()
}
}

/// 卸勁函式。
///
/// 「卸勁 (relax)」一詞出自 Cormen 在 2001 年的著作「Introduction to Algorithms」的 585 頁。
/// - Parameters:
/// - u: 參照頂點,會在必要時成為 v 的前述頂點。
/// - v: 要影響的頂點。
func relax(u: Vertex, v: inout Vertex) {
// 從 u 到 w 的距離,也就是 v 的權重。
let w: Double = v.node.score
// 這裡計算最大權重:
// 如果 v 目前的距離值小於「u 的距離值+w(w 是 u 到 w 的距離,也就是 v 的權重)」,
// 我們就更新 v 的距離及其前述頂點。
if v.distance >= u.distance + w { return }
v.distance = u.distance + w
v.prev = u
/// Relaxation function.
///
/// The term "relax" comes from page 585 of Cormen's 2001 book "Introduction to Algorithms".
/// - Remark: The receiver itself is the reference vertex (u); it becomes the
///   predecessor of `target` (v) when necessary.
/// - Parameters:
///   - target: The vertex to be affected.
public func relax(target: inout Vertex) {
  // The edge weight from this vertex to the target is the score of the target's node.
  let weight: Double = target.node.score
  // Distance the target would get when reached through this vertex.
  let distanceViaSelf = distance + weight
  // Maximum-weight relaxation: leave the target alone when its current distance
  // is already at least as large; otherwise update its distance and predecessor.
  if target.distance >= distanceViaSelf { return }
  target.distance = distanceViaSelf
  target.prev = self
}
}

/// 對持有單個根頂點的有向無環圖進行位相幾何排序(topological
Expand All @@ -65,7 +65,7 @@ extension Megrez.Compositor {
/// 這樣我們就不會受到當前線程的堆棧大小的限制。以下是等價的原始算法。
/// ```
/// func topologicalSort(vertex: Vertex) {
/// vertex.edges.forEach {vertexNode in
/// vertex.edges.forEach { vertexNode in
/// if !vertexNode.topologicallySorted {
/// dfs(vertexNode, result)
/// vertexNode.topologicallySorted = true
Expand Down
25 changes: 0 additions & 25 deletions Sources/Megrez/6_Node.swift
Original file line number Diff line number Diff line change
Expand Up @@ -176,31 +176,6 @@ public extension Megrez {
}
}

public extension Megrez.Compositor {
/// Node anchor: a node paired with the span index it is anchored at.
/// Also known as "NodeInSpan" in Gramambular 2.
struct NodeAnchor: Hashable {
/// The node.
let node: Megrez.Node
/// The span index (span coordinate).
let spanIndex: Int
/// The span length, forwarded from the node.
var spanLength: Int { node.spanLength }
/// The unigram array, forwarded from the node.
var unigrams: [Megrez.Unigram] { node.unigrams }
/// The key array, forwarded from the node.
var keyArray: [String] { node.keyArray }
/// The data value of the unigram that the node's internal unigram index currently points at.
var value: String { node.value }

/// Serves as the default hash function: feeds the node and then the span index into the hasher.
/// - Parameter hasher: The hasher to use when combining the components of this instance.
public func hash(into hasher: inout Hasher) {
hasher.combine(node)
hasher.combine(spanIndex)
}
}
}

// MARK: - Array Extensions.

public extension Array where Element == Megrez.Node {
Expand Down
40 changes: 40 additions & 0 deletions Tests/MegrezTests/MegrezImplForTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Swiftified and further development by (c) 2022 and onwards The vChewing Project (MIT License).
// Was initially rebranded from (c) Lukhnos Liu's C++ library "Gramambular 2" (MIT License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)

import Megrez

// MARK: - Megrez Extensions for Test Purposes Only.

public extension Megrez.Compositor {
  /// Returns every candidate (as key-value pairs) available at the given cursor location.
  ///
  /// The location is clamped into `0 ... keys.count - 1`, so a cursor sitting at the
  /// very end of the compositor (the largest legal cursor value) is treated as
  /// `location - 1`, sparing the caller from handling that case afterwards.
  /// - Remark: This function is deprecated because it cannot thoroughly filter out
  ///   node-crossing results. It is kept for unit tests only, to confirm that its
  ///   successor still yields all the regular results it is supposed to yield.
  /// - Parameters:
  ///   - location: The cursor location.
  ///   - filter: Which nodes are accepted relative to the key at the cursor.
  /// - Returns: An array of key-value pairs.
  func fetchCandidatesDeprecated(at location: Int, filter: CandidateFetchFilter = .all) -> [Megrez.KeyValuePaired] {
    var candidates: [Megrez.KeyValuePaired] = []
    if keys.isEmpty { return candidates }
    // Clamp the cursor into the valid key range.
    let clampedLocation = max(min(location, keys.count - 1), 0)
    let anchors: [(location: Int, node: Megrez.Node)] = fetchOverlappingNodes(at: clampedLocation)
    let keyAtCursor = keys[clampedLocation]
    for anchor in anchors {
      let theNode = anchor.node
      for gram in theNode.unigrams {
        let passesFilter: Bool
        switch filter {
        case .all:
          // This check is required; otherwise lots of invalid results show up.
          passesFilter = theNode.keyArray.contains(keyAtCursor)
        case .beginAt:
          passesFilter = theNode.keyArray[0] == keyAtCursor
        case .endAt:
          passesFilter = theNode.keyArray.reversed()[0] == keyAtCursor
        }
        guard passesFilter else { continue }
        candidates.append(Megrez.KeyValuePaired(keyArray: theNode.keyArray, value: gram.value))
      }
    }
    return candidates
  }
}
27 changes: 25 additions & 2 deletions Tests/MegrezTests/MegrezTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ final class MegrezTests: XCTestCase {
XCTAssertEqual(result.values, ["高熱", "🔥", "危險"])
}

func test20_Compositor_updateUnigramData() throws {
func test20_Compositor_UpdateUnigramData() throws {
let theLM = SimpleLM(input: strSampleData)
var compositor = Megrez.Compositor(with: theLM)
compositor.separator = ""
Expand All @@ -547,7 +547,7 @@ final class MegrezTests: XCTestCase {
XCTAssertEqual(newResult2, ["", ""])
}

func test21_Compositor_hardCopy() throws {
func test21_Compositor_HardCopy() throws {
let theLM = SimpleLM(input: strSampleData)
let rawReadings = "gao1 ke1 ji4 gong1 si1 de5 nian2 zhong1 jiang3 jin1"
var compositorA = Megrez.Compositor(with: theLM)
Expand Down Expand Up @@ -580,4 +580,27 @@ final class MegrezTests: XCTestCase {
d = compositor.fetchCandidates(at: 2, filter: .endAt).map(\.keyArray.count).max() ?? 0
XCTAssertEqual("\(a) \(b) \(c) \(d)", "1 1 2 2")
}

/// Cross-checks `fetchCandidates()` against `fetchCandidatesDeprecated()` for the
/// `.beginAt` and `.endAt` filters at every cursor position of a sample sentence.
func test23_Compositor_CheckGetCandidates() throws {
  let theLM = SimpleLM(input: strSampleData)
  let rawReadings = "gao1 ke1 ji4 gong1 si1 de5 nian2 zhong1 jiang3 jin1"
  var compositor = Megrez.Compositor(with: theLM)
  for reading in rawReadings.split(separator: " ") {
    compositor.insertKey(reading.description)
  }
  var currentBeginAt: [String] = []
  var currentEndAt: [String] = []
  var legacyBeginAt: [String] = []
  var legacyEndAt: [String] = []
  for cursor in 0 ... compositor.keys.count {
    let newBegin = compositor.fetchCandidates(at: cursor, filter: .beginAt).map(\.value)
    currentBeginAt.append(newBegin.joined(separator: "-"))
    let newEnd = compositor.fetchCandidates(at: cursor, filter: .endAt).map(\.value)
    currentEndAt.append(newEnd.joined(separator: "-"))
    let oldBegin = compositor.fetchCandidatesDeprecated(at: cursor, filter: .beginAt).map(\.value)
    legacyBeginAt.append(oldBegin.joined(separator: "-"))
    let oldEnd = compositor.fetchCandidatesDeprecated(at: cursor, filter: .endAt).map(\.value)
    legacyEndAt.append(oldEnd.joined(separator: "-"))
  }
  // The `.endAt` results of the current API are compared one cursor position
  // later than those of the deprecated API: drop the first entry of the former
  // and the last entry of the latter so both lists line up.
  currentEndAt.removeFirst()
  legacyEndAt.removeLast()
  XCTAssertEqual(currentBeginAt, legacyBeginAt)
  XCTAssertEqual(currentEndAt, legacyEndAt)
}
}

0 comments on commit f6a2c2b

Please sign in to comment.