-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_bib_pages.py
275 lines (234 loc) · 10.9 KB
/
get_bib_pages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import google.generativeai as genai
import json
import pikepdf
import os
from collections import defaultdict
def clean_json_response(text):
    """Strip Markdown code-fence wrapping from a model response.

    Removes a leading ``` fence (with an optional ``json`` language tag in
    any letter case), a trailing ``` fence, and surrounding whitespace, so
    the result can be handed to ``json.loads``.

    Args:
        text: Raw response text. ``None`` is treated as an empty string.

    Returns:
        The text without fence markers, stripped of surrounding whitespace.
    """
    cleaned = (text or "").strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Match the language tag case-insensitively so that "```JSON" does
        # not leave a stray "JSON" in front of the payload.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()
def find_reference_section_pages(pdf_path):
    """First pass: ask the model for the page ranges of all reference sections.

    Uploads the PDF, prompts the model for a JSON array of
    ``{"start_page": ..., "end_page": ...}`` objects and validates the answer.
    On success the model and the uploaded file handle are stored as
    attributes of this function (``find_reference_section_pages.model`` and
    ``find_reference_section_pages.uploaded_pdf``) so later steps can reuse
    them.

    Args:
        pdf_path: Path of the PDF to analyse.

    Returns:
        A list of section dicts whose ``start_page`` and ``end_page`` values
        are integers, or ``None`` when the request fails, the response is
        empty or not valid JSON, or no usable section is reported.
    """
    try:
        model = genai.GenerativeModel("gemini-2.0-flash-exp")
        uploaded_pdf = genai.upload_file(pdf_path)
        # The prompt text is kept flush-left so the string sent to the model
        # carries no source-code indentation.
        prompt = """
Your task is to find ALL bibliography/reference sections in this PDF.
INSTRUCTIONS:
1. Look for sections titled: References, Bibliography, Works Cited, Citations, Notes, Endnotes
2. For each section found, note its exact start and end page numbers
3. Return ONLY a JSON array with the format shown below
4. Do not include any explanatory text
REQUIRED FORMAT:
[
{
"start_page": 211,
"end_page": 278
}
]
IMPORTANT RULES:
- ALWAYS include both start_page AND end_page for each section
- Use INTEGER numbers only
- Use the actual printed page numbers visible on the pages
- Return ONLY valid JSON, no markdown, no explanations
- Include ALL reference sections if multiple exist.
- And if there is only one at the end, only return only this one.
- Be sure to include every whole section!
"""
        response = model.generate_content(
            [prompt, uploaded_pdf],
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json"
            )
        )
        if not response.text:
            print("Empty response from model")
            return None

        try:
            sections = json.loads(clean_json_response(response.text))
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON response: {e}")
            print(f"Raw response: {response.text}")
            return None

        # The model may answer with a single object instead of a one-element
        # array; iterating a dict would walk its keys and reject everything.
        if isinstance(sections, dict):
            sections = [sections]
        elif not isinstance(sections, list):
            print(f"Unexpected JSON payload (expected a list): {sections!r}")
            return None

        valid_sections = []
        for section in sections:
            try:
                # Coerce to int so downstream arithmetic works even when the
                # model returns numeric strings such as "211".
                normalized = {
                    **section,
                    "start_page": int(section["start_page"]),
                    "end_page": int(section["end_page"]),
                }
            except (KeyError, TypeError, ValueError):
                # Missing key, non-dict entry, or non-numeric page value.
                print(f"Warning: Skipping incomplete section: {section}")
                continue
            valid_sections.append(normalized)

        if not valid_sections:
            print("No valid sections found (missing start_page or end_page)")
            return None

        print(f"Found reference sections: {json.dumps(valid_sections, indent=2)}")
        # Store the model and uploaded PDF in function attributes for reuse
        find_reference_section_pages.model = model
        find_reference_section_pages.uploaded_pdf = uploaded_pdf
        return valid_sections
    except Exception as e:
        # Boundary handler: upload and generation errors are reported, not raised.
        print(f"Error finding reference sections: {str(e)}")
        return None
def detect_page_number_offset(pdf_path: str, model: genai.GenerativeModel, uploaded_pdf) -> int | None:
    """Estimate the difference between printed and physical page numbers.

    Four pages are sampled at 20%, 40%, 60% and 80% of the document, written
    to a temporary PDF, uploaded, and the model is asked which page number is
    printed on each. The offset of a sample is
    ``printed_number - physical_page`` with the physical page counted from 1.

    Args:
        pdf_path: Path of the PDF to sample.
        model: Model used to read the printed page numbers.
        uploaded_pdf: Accepted for interface compatibility; not used here.

    Returns:
        The offset that at least two samples agree on, or ``None`` when the
        document has fewer than five pages, the response is unusable, no two
        samples agree, or any error occurs.
    """
    import tempfile  # local import keeps the file's import block untouched

    temp_pdf_path = None
    try:
        # Create sampled PDF
        with pikepdf.Pdf.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            fifth = total_pages // 5
            if fifth < 1:
                # With fifth == 0 every sample index would be -1, i.e. the
                # last page four times, which yields a meaningless offset.
                print("Document has fewer than 5 pages; cannot sample pages for offset detection.")
                return None

            # A unique temp file avoids clobbering (and later deleting) an
            # unrelated file of the same name in the working directory.
            fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
            os.close(fd)

            # Save while the source is still open: the copied pages refer
            # back to it.
            with pikepdf.Pdf.new() as new_pdf:
                for i in range(1, 5):  # pages at 20%, 40%, 60%, 80%
                    new_pdf.pages.append(pdf.pages[i * fifth - 1])  # -1 for 0-based indexing
                new_pdf.save(temp_pdf_path)

        # Upload the sampled pages
        sampled_pdf = genai.upload_file(temp_pdf_path)
        # The prompt text is kept flush-left so the string sent to the model
        # carries no source-code indentation.
        prompt = """
TASK: Extract the printed page numbers from this PDF sample.
INSTRUCTIONS:
1. For each page in this sample (numbered 1-4):
- Find the actual printed page number visible on the page
- Return both the sample page number (1-4) and the printed number
REQUIRED FORMAT:
[
{
"physical_page": 1,
"printed_number": 20
},
{
"physical_page": 2,
"printed_number": 40
}
]
IMPORTANT:
- Use only integers
- physical_page must be 1-4
- printed_number should be the actual number printed on the page
- Return ONLY valid JSON, no explanations
"""
        response = model.generate_content(
            [prompt, sampled_pdf],
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json"
            )
        )
        if not response.text:
            return None

        try:
            page_numbers = json.loads(clean_json_response(response.text))
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON response for page numbers: {e}")
            print(f"Raw response: {response.text}")
            return None

        print(f"Detected page numbers: {json.dumps(page_numbers, indent=2)}")
        if isinstance(page_numbers, dict):
            page_numbers = [page_numbers]
        elif not isinstance(page_numbers, list):
            print(f"Unexpected JSON payload (expected a list): {page_numbers!r}")
            return None

        offsets = defaultdict(int)
        for entry in page_numbers:
            try:
                phys_page = int(entry.get('physical_page', 0))
                # Coerce explicitly: a numeric string such as "20" would
                # otherwise fail in the subtraction and be dropped silently.
                printed_num = int(entry.get('printed_number'))
            except (AttributeError, TypeError, ValueError):
                # Non-dict entry, missing/null number, or non-numeric value
                # (e.g. an unnumbered page).
                print(f"Skipping unusable page number entry: {entry}")
                continue
            if not 1 <= phys_page <= 4:
                # Only four pages were sampled; other indices map to nothing.
                continue
            actual_page = phys_page * fifth
            offset = printed_num - actual_page
            print(f"Sample #{phys_page}: Physical page {actual_page} shows number {printed_num} -> offset {offset}")
            offsets[offset] += 1

        if offsets:
            print(f"Detected offsets and their frequencies: {dict(offsets)}")
            best_offset, votes = max(offsets.items(), key=lambda x: x[1])
            if votes >= 2:  # At least 2 pages agree on offset
                return best_offset
    except Exception as e:
        # Boundary handler: any PDF or API failure means "offset unknown".
        print(f"Error in pagination detection: {e}")
    finally:
        # Clean up temporary file
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            try:
                os.remove(temp_pdf_path)
            except OSError as e:
                print(f"Error removing temporary file: {e}")
    return None
def extract_reference_sections(pdf_path: str, output_dir: str = "~/Nextcloud/DT/papers"):
    """Find the reference sections of a PDF and export them page by page.

    Steps:
      1. Ask the model for the printed page ranges of the reference sections.
      2. Detect the offset between printed and physical page numbers
         (falls back to 0 when it cannot be detected).
      3. Convert the ranges to 0-based physical page indices and discard
         ranges that fall outside the document.
      4. After interactive confirmation, write every page of every section
         to its own file
         ``<output_dir>/<pdf name>_section<i>_page<n>.pdf``.

    Args:
        pdf_path: Path of the source PDF.
        output_dir: Target directory; ``~`` is expanded and the directory is
            created if missing.

    Returns:
        None. Progress and problems are reported on stdout.
    """
    # First, find reference sections
    sections = find_reference_section_pages(pdf_path)
    if not sections:
        print("No reference sections found.")
        return

    # Then detect page number offset using the same model and uploaded PDF
    offset = detect_page_number_offset(pdf_path, find_reference_section_pages.model, find_reference_section_pages.uploaded_pdf)
    if offset is None:
        print("Warning: Could not detect page number offset. Using physical page numbers.")
        offset = 0
    else:
        print(f"Detected page number offset: {offset}")

    with pikepdf.Pdf.open(pdf_path) as doc:
        total_pages = len(doc.pages)

    # Adjust section page numbers
    adjusted_sections = []
    for section in sections:
        try:
            start_page = int(section.get('start_page', 0))
            end_page = int(section.get('end_page', 0))
        except (AttributeError, ValueError, TypeError) as e:
            print(f"Skipping invalid section {section}: {e}")
            continue
        if start_page <= 0 or end_page <= 0:
            print(f"Skipping invalid section {section}: page numbers must be positive")
            continue

        # printed = physical + offset, then -1 for pikepdf's 0-based indexing
        physical_start = start_page - offset - 1
        physical_end = end_page - offset - 1
        if 0 <= physical_start <= physical_end < total_pages:
            adjusted_sections.append({
                "start_page": physical_start,  # Keep 0-based for pikepdf
                "end_page": physical_end  # Keep 0-based for pikepdf
            })
        else:
            # Report the drop instead of discarding the section silently.
            print(
                f"Skipping section {section}: adjusted pages "
                f"{physical_start + 1}-{physical_end + 1} are outside the document (1-{total_pages})"
            )

    if not adjusted_sections:
        print("No valid sections after adjustment.")
        return
    print("\nAdjusted sections to extract:", json.dumps(adjusted_sections, indent=2))

    # Ask for user confirmation before extraction
    try:
        confirmation = input("\nWould you like to extract these sections? (yes/no): ").lower().strip()
    except EOFError:
        # No interactive stdin available: treat as "no".
        confirmation = ""
    if confirmation not in ("yes", "y"):
        print("Extraction cancelled.")
        return

    # Extract the sections after confirmation
    output_dir = os.path.expanduser(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    print(f"\nExtracting to directory: {output_dir}")
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    with pikepdf.Pdf.open(pdf_path) as doc:
        for i, section in enumerate(adjusted_sections, 1):
            # Bounds are already 0-based; +1 makes the range inclusive.
            for page_num in range(section["start_page"], section["end_page"] + 1):
                output_path = os.path.join(
                    output_dir,
                    f"{base_name}_section{i}_page{page_num+1}.pdf"  # +1 for human-readable naming
                )
                new_pdf = pikepdf.Pdf.new()
                try:
                    # Copying the page is inside the try so a failure is
                    # reported per page and the new PDF is always closed.
                    new_pdf.pages.append(doc.pages[page_num])
                    new_pdf.save(output_path)
                    print(f"Extracted page {page_num+1} of section {i} to {output_path}")
                except Exception as err:
                    print(f"Error extracting page {page_num+1} of section {i}: {err}")
                finally:
                    new_pdf.close()
if __name__ == "__main__":
    import sys

    # Path used when no PDF is given on the command line.
    default_pdf_path = "/home/jay/Nextcloud/DT/misc/Katharina Pistor - The Code of Capital_ How the Law Creates Wealth and Inequality (2019, Princeton University Press) [10.1515_9780691189437] - libgen.li.pdf"
    # An optional first argument overrides the default input file.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
    extract_reference_sections(pdf_path)