enhance: create and update google docs by importing markdown directly

Instead of parsing google docs to/from markdown:

- import markdown directly via the APIs
- export docs to markdown via the APIs

This makes formatting more consistent and reduces the complexity of both operations considerably.

Signed-off-by: Nick Hale <[email protected]>
obot-platform · Jan 29, 2025 · 4fe7933 · 4fe7933
1 parent 7ca907b
commit 4fe7933
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 156 deletions.
diff --git a/google/docs/read_doc.py b/google/docs/read_doc.py
@@ -1,3 +1,4 @@
+import io
 import sys
 import os
 
@@ -13,44 +14,26 @@ def main():
             raise ValueError('DOC_REF environment variable is missing or empty')
 
         file_id = extract_file_id(doc_ref)
-        service = client('docs', 'v1')
-        document = service.documents().get(documentId=file_id).execute()
 
-        print(convert_to_markdown(document))
+        service = client('drive', 'v3')
+
+        request = service.files().export_media(
+            fileId=file_id,
+            mimeType='text/markdown'
+        )
+        file = io.BytesIO()
+        downloader = MediaIoBaseDownload(file, request)
+        done = False
+
+        while not done:
+            _, done = downloader.next_chunk()
+
+        print(file.getvalue().decode('utf-8'))
 
     except Exception as err:
         sys.stderr.write(err)
         sys.exit(1)
 
-def convert_to_markdown(document):
-    md_text = ""
-    for element in document.get('body', {}).get('content', []):
-        if 'paragraph' in element:
-            for part in element['paragraph']['elements']:
-                text_run = part.get('textRun')
-                if text_run:
-                    md_text += text_run['content']
-            md_text += "\n\n"  # Separate paragraphs with extra newlines
-        elif 'table' in element:
-            md_text += parse_table(element['table'])
-            md_text += "\n\n"  # Extra newline after a table
-    return md_text
-
-def parse_table(table):
-    md_table = ""
-    for row in table.get('tableRows', []):
-        row_text = "|"
-        for cell in row.get('tableCells', []):
-            cell_text = ""
-            for content in cell.get('content', []):
-                if 'paragraph' in content:
-                    for element in content['paragraph']['elements']:
-                        text_run = element.get('textRun')
-                        if text_run:
-                            cell_text += text_run['content']
-            row_text += f" {cell_text.strip()} |"
-        md_table += row_text + "\n"
-    return md_table
 
 if __name__ == "__main__":
     main()
diff --git a/google/docs/requirements.txt b/google/docs/requirements.txt
@@ -1,5 +1,3 @@
 google-api-python-client
 google-auth-httplib2
-google-auth-oauthlib
-beautifulsoup4
-markdown
+google-auth-oauthlib
diff --git a/google/docs/update_doc.py b/google/docs/update_doc.py
@@ -1,146 +1,51 @@
 import sys
 import os
+import io
 
-import markdown
-from bs4 import BeautifulSoup
+from googleapiclient.http import MediaIoBaseUpload
 
 from auth import client
 from id import extract_file_id
 from move_doc import move_doc
 
 
-def markdown_to_google_doc_requests(markdown_content):
-    # Convert markdown content to HTML
-    html_content = markdown.markdown(markdown_content)
-    soup = BeautifulSoup(html_content, 'html.parser')
-
-    requests = []
-    current_index = 1
-
-    def add_text_request(text, bold=False, italic=False, underline=False, link=None):
-        nonlocal current_index
-        # Skip completely empty or whitespace-only values, except for single newlines
-        if not text.strip() and text != "\n":
-            return
-
-        text_style = {
-            "bold": bold,
-            "italic": italic,
-            "underline": underline,
-        }
-        if link:
-            text_style["link"] = {"url": link}
-
-        text_length = len(text)
-        requests.append({
-            "insertText": {
-                "location": {"index": current_index},
-                "text": text
-            }
-        })
-
-        if text_style or link:
-            requests.append({
-                "updateTextStyle": {
-                    "range": {
-                        "startIndex": current_index,
-                        "endIndex": current_index + text_length
-                    },
-                    "textStyle": text_style,
-                    "fields": ",".join(text_style.keys())
-                }
-            })
-
-        current_index += text_length
-
-        # Handle unstyled newlines
-        if text.endswith("\n"):
-            newline_length = 1
-            requests.append({
-                "updateTextStyle": {
-                    "range": {
-                        "startIndex": current_index - newline_length,
-                        "endIndex": current_index
-                    },
-                    "textStyle": {},  # Explicitly remove styles
-                    "fields": "bold,italic,underline,link"
-                }
-            })
-
-    for element in soup.contents:
-        if element.name in ['p']:
-            add_text_request(element.get_text())
-            add_text_request("\n")
-        elif element.name in ['h1', 'h2', 'h3']:
-            add_text_request(element.get_text(), bold=True)
-        elif element.name in ['ul']:
-            for li in element.find_all('li'):
-                add_text_request("\u2022 " + li.get_text())
-        elif element.name in ['ol']:
-            for i, li in enumerate(element.find_all('li'), start=1):
-                add_text_request(f"{i}. " + li.get_text())
-        elif element.name == 'a':
-            add_text_request(element.get_text(), link=element['href'])
-        elif element.name == 'table':
-            for row in element.find_all('tr'):
-                row_text = "\t".join([cell.get_text() for cell in row.find_all(['td', 'th'])]) + "\n"
-                add_text_request(row_text)
-        else:
-            add_text_request(element.get_text())
-            add_text_request("\n")
-
-    return requests
+def replace_google_doc_using_import(doc_ref, markdown_content, new_drive_dir):
+    file_id = extract_file_id(doc_ref)
+    drive_service = client('drive', 'v3')
+
+    # Convert Markdown content into an in-memory file
+    markdown_file = io.BytesIO(markdown_content.encode("utf-8"))
+
+    # Use media upload for Drive import
+    media = MediaIoBaseUpload(markdown_file, mimetype="text/markdown", resumable=True)
+
+    # Overwrite the existing Google Doc with imported content
+    updated_file = drive_service.files().update(
+        fileId=file_id,
+        media_body=media,
+        body={'mimeType': 'application/vnd.google-apps.document'}
+    ).execute()
+
+    print(f"Document replaced successfully using import: https://docs.google.com/document/d/{file_id}")
+
+    # Move the document if a new directory is specified
+    if new_drive_dir:
+        move_doc(drive_service, file_id, new_drive_dir)
 
 
 def main():
     try:
         doc_ref = os.getenv('DOC_REF')
         new_doc_content = os.getenv('NEW_DOC_CONTENT')
-        new_drive_dir = os.getenv('NEW_DRIVE_DIR', '').strip()  # Get the optional NEW_DRIVE_DIR
+        new_drive_dir = os.getenv('NEW_DRIVE_DIR', '').strip()
 
         if not doc_ref:
             raise ValueError('DOC_REF environment variable is missing or empty')
 
         if not new_doc_content:
             raise ValueError('NEW_DOC_CONTENT environment variable is missing or empty')
 
-        try:
-            requests = markdown_to_google_doc_requests(new_doc_content)
-        except Exception as e:
-            raise ValueError(f"Failed to parse NEW_DOC_CONTENT: {e}")
-
-        file_id = extract_file_id(doc_ref)
-        docs_service = client('docs', 'v1')
-        drive_service = client('drive', 'v3')
-
-        # Retrieve the document to determine its length
-        document = docs_service.documents().get(documentId=file_id).execute()
-        content = document.get('body').get('content')
-        document_length = content[-1].get('endIndex') if content and 'endIndex' in content[-1] else 1
-
-        if document_length > 2:
-            # Prepare requests to clear existing document content
-            requests = [
-                {
-                    "deleteContentRange": {
-                        "range": {
-                            "startIndex": 1,
-                            "endIndex": document_length - 1
-                        }
-                    }
-                }
-            ] + requests
-
-        # Issue a batch update request to clear and apply new content
-        response = docs_service.documents().batchUpdate(
-            documentId=file_id,
-            body={"requests": requests}
-        ).execute()
-
-        print(f"Document updated successfully: {file_id}")
-
-        # Move the document to the specified folder, if NEW_DRIVE_DIR is set
-        move_doc(drive_service, file_id, new_drive_dir)
+        replace_google_doc_using_import(doc_ref, new_doc_content, new_drive_dir)
 
     except Exception as err:
         sys.stderr.write(f"Error: {err}\n")