Skip to content

Commit

Permalink
fix: anchor bullet
Browse files (browse the repository at this point in the history)
  • Loading branch information
Byaidu committed Dec 5, 2024
1 parent c55cd9c commit 2604748
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion pdf2zh/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def receive_layout(self, ltpage: LTPage):
# 全局
lstk: list[LTLine] = [] # 全局线条栈
xt: LTChar = None # 上一个字符
- xt_cls: int = -1 # 上一个字符所属段落
+ xt_cls: int = -1 # 上一个字符所属段落,保证无论第一个字符属于哪个类别都可以触发新段落
vmax: float = ltpage.width / 4 # 行内公式最大宽度
ops: str = "" # 渲染结果

Expand Down Expand Up @@ -216,6 +216,10 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
# 读取当前字符在 layout 中的类别
cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
cls = layout[cy, cx]
+ # 锚定文档中 bullet 的位置
+ if child.get_text() == "•":
+ cls = 0
+ # 判定当前字符是否属于公式
if ( # 判定当前字符是否属于公式
cls == 0 # 1. 类别为保留区域
or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79) # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
Expand Down

0 comments on commit 2604748

Please sign in to comment.