Skip to content

Commit

Permalink
enhance: create and update google docs by importing markdown directly
Browse files Browse the repository at this point in the history
Instead of converting Google Docs to/from Markdown by hand:
- import Markdown directly via the Drive API
- export docs to Markdown via the Drive API

This makes formatting more consistent and reduces the complexity of both
operations considerably.

Signed-off-by: Nick Hale <[email protected]>
  • Loading branch information
njhale committed Jan 29, 2025
1 parent 7ca907b commit 4fe7933
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 156 deletions.
47 changes: 15 additions & 32 deletions google/docs/read_doc.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
import sys
import os

Expand All @@ -13,44 +14,26 @@ def main():
raise ValueError('DOC_REF environment variable is missing or empty')

file_id = extract_file_id(doc_ref)
service = client('docs', 'v1')
document = service.documents().get(documentId=file_id).execute()

print(convert_to_markdown(document))
service = client('drive', 'v3')

request = service.files().export_media(
fileId=file_id,
mimeType='text/markdown'
)
file = io.BytesIO()
downloader = MediaIoBaseDownload(file, request)
done = False

while not done:
_, done = downloader.next_chunk()

print(file.getvalue().decode('utf-8'))

except Exception as err:
sys.stderr.write(err)
sys.exit(1)

def convert_to_markdown(document):
    """Render a Google Docs document resource as Markdown-flavoured text.

    Walks the structural elements under ``document['body']['content']``.
    A paragraph contributes the concatenated ``content`` of its text runs,
    a table is delegated to ``parse_table``, and every rendered element is
    followed by a blank line. Any other element kind is ignored.

    Args:
        document: Mapping holding the document structure. Missing ``body``,
            ``content``, ``elements`` or text-run ``content`` keys are
            treated as empty instead of raising ``KeyError``.

    Returns:
        The rendered text, or ``""`` when there is nothing to render.
    """
    chunks = []
    for element in document.get('body', {}).get('content', []):
        if 'paragraph' in element:
            # Use .get() so a paragraph without an "elements" list renders
            # as an empty paragraph rather than aborting the whole export.
            for part in element['paragraph'].get('elements', []):
                text_run = part.get('textRun')
                if text_run:
                    chunks.append(text_run.get('content', ''))
            chunks.append("\n\n")  # Separate paragraphs with extra newlines
        elif 'table' in element:
            chunks.append(parse_table(element['table']))
            chunks.append("\n\n")  # Extra newline after a table
    # Join once at the end instead of growing a string with repeated +=.
    return "".join(chunks)

def parse_table(table):
    """Render a Google Docs table element as a Markdown pipe table.

    The first table row is emitted as the header and is followed by a
    ``| --- |`` delimiter row, which GitHub-flavoured Markdown requires
    before it recognises the block as a table. Each cell is the
    concatenation of the text runs of its paragraphs, flattened to a single
    line.

    Args:
        table: Mapping with an optional ``tableRows`` list; each row has an
            optional ``tableCells`` list whose cells carry ``content``
            entries. Missing keys are treated as empty.

    Returns:
        One line per row, each terminated by ``"\\n"``; ``""`` for a table
        without rows.
    """
    md_rows = []
    for row in table.get('tableRows', []):
        cells = []
        for cell in row.get('tableCells', []):
            runs = []
            for content in cell.get('content', []):
                if 'paragraph' in content:
                    for element in content['paragraph'].get('elements', []):
                        text_run = element.get('textRun')
                        if text_run:
                            runs.append(text_run.get('content', ''))
            # A cell has to stay on one physical line: collapse every
            # whitespace run (including paragraph-ending newlines) to a
            # single space, then escape pipes so they are not read as
            # column separators.
            flattened = " ".join("".join(runs).split())
            cells.append(flattened.replace("|", "\\|"))

        md_rows.append("|" + "".join(f" {text} |" for text in cells))

        # Emit the header delimiter right after the first row, sized to it.
        if len(md_rows) == 1 and cells:
            md_rows.append("|" + " --- |" * len(cells))

    return "".join(line + "\n" for line in md_rows)

if __name__ == "__main__":
main()
4 changes: 1 addition & 3 deletions google/docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
google-api-python-client
google-auth-httplib2
google-auth-oauthlib
beautifulsoup4
markdown
google-auth-oauthlib
147 changes: 26 additions & 121 deletions google/docs/update_doc.py
Original file line number Diff line number Diff line change
@@ -1,146 +1,51 @@
import sys
import os
import io

import markdown
from bs4 import BeautifulSoup
from googleapiclient.http import MediaIoBaseUpload

from auth import client
from id import extract_file_id
from move_doc import move_doc


# Build the list of request dicts ("insertText" / "updateTextStyle") that
# reproduce `markdown_content` as plain styled text. The markdown is first
# rendered to HTML, the HTML is parsed with BeautifulSoup, and each top-level
# node is turned into one or more requests. Returns the list of requests.
# NOTE(review): the indentation of this listing is not visible, so the
# nesting described in the comments below is inferred from syntax - confirm
# against the real file before relying on it.
def markdown_to_google_doc_requests(markdown_content):
# Convert markdown content to HTML
html_content = markdown.markdown(markdown_content)
soup = BeautifulSoup(html_content, 'html.parser')

# Accumulated requests, in the order they are generated.
requests = []
# Index at which the next piece of text is inserted; starts at 1 and is
# advanced by add_text_request after every insertion.
current_index = 1

# Append an insertText request for `text` at current_index, followed by an
# updateTextStyle request covering exactly the inserted range, then advance
# current_index by len(text).
def add_text_request(text, bold=False, italic=False, underline=False, link=None):
nonlocal current_index
# Skip completely empty or whitespace-only values, except for single newlines
if not text.strip() and text != "\n":
return

# bold/italic/underline are always present, so styles are set explicitly
# (True or False) on every inserted range.
text_style = {
"bold": bold,
"italic": italic,
"underline": underline,
}
if link:
text_style["link"] = {"url": link}

# NOTE(review): len() counts Unicode code points; confirm this matches the
# index units the target API expects (non-BMP characters such as emoji may
# be counted differently).
text_length = len(text)
requests.append({
"insertText": {
"location": {"index": current_index},
"text": text
}
})

# text_style always holds at least three keys, so this condition is always
# true and a style request is emitted for every insertion.
if text_style or link:
requests.append({
"updateTextStyle": {
"range": {
"startIndex": current_index,
"endIndex": current_index + text_length
},
"textStyle": text_style,
# Field mask lists exactly the keys set above.
"fields": ",".join(text_style.keys())
}
})

current_index += text_length

# Handle unstyled newlines
# current_index has already been advanced, so the range below covers the
# final character of the text that was just inserted.
if text.endswith("\n"):
newline_length = 1
requests.append({
"updateTextStyle": {
"range": {
"startIndex": current_index - newline_length,
"endIndex": current_index
},
"textStyle": {}, # Explicitly remove styles
"fields": "bold,italic,underline,link"
}
})

# Dispatch on the tag name of each top-level node of the parsed HTML.
for element in soup.contents:
# Paragraph: its text followed by a separate newline insertion.
if element.name in ['p']:
add_text_request(element.get_text())
add_text_request("\n")
# Headings are inserted as bold text only.
elif element.name in ['h1', 'h2', 'h3']:
add_text_request(element.get_text(), bold=True)
# Unordered list: each item is prefixed with a bullet character.
# NOTE(review): no newline is inserted between items in this branch -
# confirm whether consecutive items end up on the same line.
elif element.name in ['ul']:
for li in element.find_all('li'):
add_text_request("\u2022 " + li.get_text())
# Ordered list: each item is prefixed with its 1-based position.
elif element.name in ['ol']:
for i, li in enumerate(element.find_all('li'), start=1):
add_text_request(f"{i}. " + li.get_text())
# Top-level anchor: text carrying a link style that points at its href.
elif element.name == 'a':
add_text_request(element.get_text(), link=element['href'])
# Table: one tab-separated, newline-terminated line per row.
elif element.name == 'table':
for row in element.find_all('tr'):
row_text = "\t".join([cell.get_text() for cell in row.find_all(['td', 'th'])]) + "\n"
add_text_request(row_text)
# Anything else: insert its text content unchanged.
else:
add_text_request(element.get_text())
# NOTE(review): with the indentation lost it is unclear whether this newline
# belongs to the else branch only or runs after every element - confirm.
add_text_request("\n")

return requests
def replace_google_doc_using_import(doc_ref, markdown_content, new_drive_dir):
    """Overwrite an existing Google Doc with Markdown via a Drive upload.

    The Markdown is uploaded as ``text/markdown`` media to the existing file
    while the request body names the Google Docs MIME type as the target, so
    the document content is replaced by the imported Markdown. A success
    message containing the document URL is printed afterwards.

    Args:
        doc_ref: Reference to the target document; resolved to a file id
            with ``extract_file_id``.
        markdown_content: Markdown text that becomes the new document body.
        new_drive_dir: Optional destination folder. When truthy, the
            document is moved there with ``move_doc`` after the upload.
    """
    file_id = extract_file_id(doc_ref)
    drive_service = client('drive', 'v3')

    # Wrap the Markdown in an in-memory file so it can be sent as media.
    markdown_file = io.BytesIO(markdown_content.encode("utf-8"))
    media = MediaIoBaseUpload(markdown_file, mimetype="text/markdown", resumable=True)

    # The response is not needed: the file id is already known.
    drive_service.files().update(
        fileId=file_id,
        media_body=media,
        body={'mimeType': 'application/vnd.google-apps.document'}
    ).execute()

    print(f"Document replaced successfully using import: https://docs.google.com/document/d/{file_id}")

    # Move the document if a new directory is specified
    if new_drive_dir:
        move_doc(drive_service, file_id, new_drive_dir)


def main():
try:
doc_ref = os.getenv('DOC_REF')
new_doc_content = os.getenv('NEW_DOC_CONTENT')
new_drive_dir = os.getenv('NEW_DRIVE_DIR', '').strip() # Get the optional NEW_DRIVE_DIR
new_drive_dir = os.getenv('NEW_DRIVE_DIR', '').strip()

if not doc_ref:
raise ValueError('DOC_REF environment variable is missing or empty')

if not new_doc_content:
raise ValueError('NEW_DOC_CONTENT environment variable is missing or empty')

try:
requests = markdown_to_google_doc_requests(new_doc_content)
except Exception as e:
raise ValueError(f"Failed to parse NEW_DOC_CONTENT: {e}")

file_id = extract_file_id(doc_ref)
docs_service = client('docs', 'v1')
drive_service = client('drive', 'v3')

# Retrieve the document to determine its length
document = docs_service.documents().get(documentId=file_id).execute()
content = document.get('body').get('content')
document_length = content[-1].get('endIndex') if content and 'endIndex' in content[-1] else 1

if document_length > 2:
# Prepare requests to clear existing document content
requests = [
{
"deleteContentRange": {
"range": {
"startIndex": 1,
"endIndex": document_length - 1
}
}
}
] + requests

# Issue a batch update request to clear and apply new content
response = docs_service.documents().batchUpdate(
documentId=file_id,
body={"requests": requests}
).execute()

print(f"Document updated successfully: {file_id}")

# Move the document to the specified folder, if NEW_DRIVE_DIR is set
move_doc(drive_service, file_id, new_drive_dir)
replace_google_doc_using_import(doc_ref, new_doc_content, new_drive_dir)

except Exception as err:
sys.stderr.write(f"Error: {err}\n")
Expand Down

0 comments on commit 4fe7933

Please sign in to comment.