-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkindle_pdf.py
113 lines (89 loc) · 3.75 KB
/
kindle_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import re
import sys
from datetime import datetime

import fitz
from tqdm import tqdm
def parse_clippings(content):
    """Parse the text of a Kindle clippings file into highlight/note entries.

    The text is split on the ``==========`` separator. Within each entry the
    second line is treated as the metadata line; it must contain both
    ``page <number>`` and ``Added on <timestamp>`` (matched case-insensitively),
    and the timestamp must look like ``Sunday, March 3, 2024 10:15:30 AM``.
    Entries that lack any of these, whose timestamp cannot be parsed, or that
    have no body text (for example bookmarks) are skipped.

    Args:
        content: Full text of the clippings file.

    Returns:
        A list of dicts, in file order, each with the keys ``'type'``
        (``'Highlight'`` or ``'Note'``), ``'content'`` (body text with runs of
        whitespace collapsed to single spaces), ``'page'`` (int) and
        ``'timestamp'`` (``datetime``).
    """
    highlights_and_notes = []

    for entry in content.split('=========='):
        if not entry.strip():
            continue

        lines = entry.strip().split('\n')
        if len(lines) < 2:
            continue

        # lines[0] is the first line of the entry; the metadata is on lines[1].
        metadata = lines[1].strip()

        # Extract page number and timestamp
        page_match = re.search(r'page (\d+)', metadata, re.IGNORECASE)
        time_match = re.search(r'Added on (.+)', metadata, re.IGNORECASE)
        if not page_match or not time_match:
            continue

        # Parse timestamp; one malformed entry must not abort the whole file.
        try:
            timestamp = datetime.strptime(
                time_match.group(1).strip(), '%A, %B %d, %Y %I:%M:%S %p'
            )
        except ValueError:
            continue

        entry_type = 'Highlight' if 'Highlight' in metadata else 'Note'

        # \s matches every Unicode whitespace character (including U+00A0),
        # so one substitution normalises line breaks and non-breaking spaces.
        text = re.sub(r'\s+', ' ', '\n'.join(lines[2:])).strip()
        if not text:
            # Nothing to search for or to attach as a note (e.g. a bookmark).
            continue

        highlights_and_notes.append({
            'type': entry_type,
            'content': text,
            'page': int(page_match.group(1)),
            'timestamp': timestamp,
        })

    return highlights_and_notes
# Command-line script: read a PDF and a Kindle clippings file, add the
# clippings to the PDF as highlight and text annotations, and save the result
# next to the input as "<name>_annotated.pdf".
if len(sys.argv) < 3:
    sys.exit(f"Usage: {sys.argv[0]} <pdf_file> <clippings_file>")

filename = sys.argv[1]
clippings = sys.argv[2]

# utf-8-sig strips a leading byte-order mark if the file starts with one.
with open(clippings, "r", encoding='utf-8-sig') as f:
    content = f.read()

entries = parse_clippings(content)
print(f"{len(entries)} highlights/notes found in clippings")

processed = []

# Separate highlights and notes
highlights = [e for e in entries if e['type'] == 'Highlight']
notes = [e for e in entries if e['type'] == 'Note']

# For each note, find the closest highlight by timestamp. min() returns the
# first of several equally close highlights.
if highlights:
    for note in notes:
        note['closest_highlight'] = min(
            highlights,
            key=lambda h, note=note: abs(
                (h['timestamp'] - note['timestamp']).total_seconds()
            ),
        )

doc = fitz.open(filename)
try:
    # Process pages and add annotations
    for page_num, page in enumerate(tqdm(doc, total=len(doc), desc="Processing pages")):
        # Collapse whitespace so line breaks in the PDF text do not prevent
        # a match against the single-spaced clipping text.
        page_text = ' '.join(page.get_text().split())

        # Add highlights (clipping pages are 1-based, page_num is 0-based)
        for highlight in highlights:
            if highlight in processed or highlight['page'] != page_num + 1:
                continue

            if highlight['content'] in page_text:
                quads = page.search_for(highlight['content'], quads=True)
                if quads:
                    annot = page.add_highlight_annot(quads)
                    annot.set_colors({"stroke": (1, 1, 0)})
                    annot.update()
                    processed.append(highlight)

        # Add notes next to their closest highlights
        for note in notes:
            if note in processed or note['page'] != page_num + 1:
                continue

            closest_highlight = note.get('closest_highlight')
            if closest_highlight and closest_highlight in processed:
                highlight_quads = page.search_for(closest_highlight['content'], quads=True)
                if highlight_quads:
                    rect = highlight_quads[0].rect
                    # Place note to the right of the highlight
                    point = fitz.Point(rect.x1 + 10, rect.y0)
                    annot = page.add_text_annot(point, note['content'])
                    annot.update()
                    processed.append(note)

    # splitext only strips an extension from the final path component, so a
    # dot in a directory name (e.g. "./book") does not truncate the path.
    output_path = os.path.splitext(filename)[0] + "_annotated.pdf"
    doc.save(output_path)
finally:
    # Release the document even if annotating or saving fails.
    doc.close()

print(f"Saved annotated PDF to {output_path}")

remaining = [e for e in entries if e not in processed]
if remaining:
    print("\nCouldn't process these entries:")
    for entry in remaining:
        print(f"- {entry['content'][:100]}...")