Skip to content

Commit

Permalink
chore: rm original font
Browse files: browse the repository at this point in the history
  • Loading branch information
Byaidu committed Dec 7, 2024
1 parent 8bc0137 commit 4caf3df
Showing 1 changed file with 7 additions and 23 deletions.
30 changes: 7 additions & 23 deletions pdf2zh/converter.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
from pdfminer.pdffont import PDFFont, PDFCIDFont
from pdfminer.pdffont import PDFCIDFont
from pdfminer.converter import PDFConverter
from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.utils import apply_matrix_pt, mult_matrix
Expand Down Expand Up @@ -105,13 +105,12 @@ def render_char(


class Paragraph:
def __init__(self, y, x, x0, x1, size, font, brk):
def __init__(self, y, x, x0, x1, size, brk):
self.y: float = y # 初始纵坐标
self.x: float = x # 初始横坐标
self.x0: float = x0 # 左边界
self.x1: float = x1 # 右边界
self.size: float = size # 字体大小
self.font: PDFFont = font # 字体
self.brk: bool = brk # 换行标记


Expand Down Expand Up @@ -258,21 +257,14 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
pstk[-1].brk = True
else: # 根据当前字符构建一个新的段落
sstk.append("")
pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, child.font, False))
pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
if not cur_v: # 文字入栈
if ( # 根据当前字符修正段落属性
child.size > pstk[-1].size / 0.79 # 1. 当前字符显著比段落字体大
or len(sstk[-1].strip()) == 1 # 2. 当前字符为段落第二个文字(考虑首字母放大的情况)
or vflag(pstk[-1].font.fontname, "") # 3. 段落字体为公式字体
or re.match( # 4. 段落字体为粗体
r"(.*Medi|.*Bold)",
pstk[-1].font.fontname,
re.IGNORECASE,
)
):
pstk[-1].y -= child.size - pstk[-1].size # hack 这个段落纵向位置的修正有问题,不过先凑合用吧
pstk[-1].size = child.size
pstk[-1].font = child.font
sstk[-1] += child.get_text()
else: # 公式入栈
if ( # 根据公式左侧的文字修正公式的纵向偏移
Expand Down Expand Up @@ -358,18 +350,17 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
_x, _y = 0, 0
for id, new in enumerate(news):
x: float = pstk[id].x # 段落初始横坐标
y: float = pstk[id].y # 段落上边界
y: float = pstk[id].y # 段落初始纵坐标
x0: float = pstk[id].x0 # 段落左边界
x1: float = pstk[id].x1 # 段落右边界
size: float = pstk[id].size # 段落字体大小
font: PDFFont = pstk[id].font # 段落字体
brk: bool = pstk[id].brk # 段落属性
brk: bool = pstk[id].brk # 段落换行标记
cstk: str = "" # 当前文字栈
fcur: str = None # 当前字体ID
fcur: str = None # 当前字体 ID
tx = x
fcur_ = fcur
ptr = 0
log.debug(f"< {y} {x} {x0} {x1} {size} {font.fontname} {brk} > {sstk[id]} | {new}")
log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
while ptr < len(new):
vy_regex = re.match(
r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
Expand All @@ -387,20 +378,13 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
else: # 加载文字
ch = new[ptr]
fcur_ = None
# 原字体编码容易出问题,这里直接放弃掉
# try:
# if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
# fcur_=self.fontid[font] # 原字体
# except:
# pass
try:
if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
fcur_ = "tiro" # 默认拉丁字体
except Exception:
pass
if fcur_ is None:
fcur_ = self.resfont # 默认非拉丁字体
# print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
if fcur_ == 'noto':
adv = self.noto.char_lengths(ch, size)[0]
else:
Expand Down

0 comments on commit 4caf3df

Please sign in to comment.