question: how do we use this with private repos? #5

dep · 2024-03-12T13:24:50Z

This seems like a pretty common use case. Documentation around this would be helpful.

rnehrboss · 2024-03-12T14:04:51Z

Bump... I'm curious too

dep · 2024-03-12T15:18:37Z

To adapt the provided code to use a personal access token for authentication, you can add the token to the request headers. Here's how you can modify the download_repo function:

cc @rnehrboss

import os
import sys
import requests
import zipfile
import io
import ast

def download_repo(repo_url, output_file, token):
    """Download a GitHub repository archive and combine its Python sources.

    Fetches ``<repo_url>/archive/master.zip`` with ``token`` in the
    ``Authorization`` header, then writes each accepted file to
    ``output_file`` preceded by a ``# File: <path>`` header line.

    A file is skipped when it is a directory entry, is rejected by
    ``is_python_file`` / ``is_likely_useful_file``, is flagged by
    ``is_test_file``, fails ``has_sufficient_content``, cannot be decoded as
    UTF-8, or cannot be parsed by ``remove_comments_and_docstrings``.

    Args:
        repo_url: Base URL of the repository, without a trailing slash.
        output_file: Path of the text file to create or overwrite.
        token: GitHub personal access token used to authenticate the request.

    Raises:
        requests.HTTPError: If the archive request returns an error status.
    """
    headers = {'Authorization': f'token {token}'}
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        repo_url + "/archive/master.zip", headers=headers, timeout=60
    )
    response.raise_for_status()  # Raise an exception for error status codes

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            for file_path in zip_file.namelist():
                # Skip directories, non-Python files and less likely useful files
                if (
                    file_path.endswith("/")
                    or not is_python_file(file_path)
                    or not is_likely_useful_file(file_path)
                ):
                    continue

                try:
                    file_content = zip_file.read(file_path).decode("utf-8")
                except UnicodeDecodeError:
                    # One badly encoded file should not abort the whole export.
                    continue

                # Skip test files based on content and files with insufficient
                # substantive content
                if is_test_file(file_content) or not has_sufficient_content(file_content):
                    continue

                try:
                    file_content = remove_comments_and_docstrings(file_content)
                except SyntaxError:
                    # Skip files with syntax errors
                    continue

                outfile.write(f"# File: {file_path}\n")
                outfile.write(file_content)
                outfile.write("\n\n")

# ... (the rest of the code remains the same)

if __name__ == "__main__":
    # Expect exactly two arguments: the repository URL and the access token.
    if len(sys.argv) != 3:
        print("Usage: python script.py <github_repo_url> <personal_access_token>")
        sys.exit(1)

    # Drop any trailing slash so the repository name is not empty and the
    # archive URL built from repo_url does not contain a double slash.
    repo_url = sys.argv[1].rstrip("/")
    token = sys.argv[2]
    repo_name = repo_url.split("/")[-1]
    output_file = f"{repo_name}_python.txt"

    download_repo(repo_url, output_file, token)
    print(f"Combined Python source code saved to {output_file}")

dep · 2024-03-12T15:20:03Z

I also made a simplified / modified version that you can use for JavaScript / TypeScript codebases. It gives you an idea of how to extend it for your own repo as well:

import os
import sys
import requests
import zipfile
import io
import ast
import re

def is_valid_file(file_path):
    """Check if the file is a JavaScript or TypeScript file.

    Args:
        file_path: Path of the file inside the archive.

    Returns:
        True if the path ends with ``.js``, ``.jsx``, ``.ts`` or ``.tsx``.
    """
    # ".ts" is included so plain TypeScript modules are not silently dropped.
    allowed_extensions = (".js", ".jsx", ".ts", ".tsx")
    # str.endswith accepts a tuple, so a single call covers every extension.
    return file_path.endswith(allowed_extensions)

def is_likely_useful_file(file_path):
    """Return True if the archive path should be included in the output.

    A path is rejected when its name contains ``.test.`` or ``.stories.``, or
    when any of its directories is named ``node_modules``, ``vendor`` or
    ``dist``. Of the remaining paths, only those containing ``/components/``
    or ``/applications/`` are accepted.

    Args:
        file_path: Slash-separated path of the file inside the archive.

    Returns:
        True if the file should be kept, False otherwise.
    """
    # Exclude test and story files
    if ".test." in file_path or ".stories." in file_path:
        return False

    # Exclude dependency and build-output directories. Directory names are
    # compared as whole path segments; a regex such as r'\node_modules\.'
    # would not work because "\n" is parsed as a newline escape.
    excluded_directories = ("node_modules", "vendor", "dist")
    directories = file_path.split("/")[:-1]
    if any(name in excluded_directories for name in directories):
        return False

    # Include files in <root>/components/** and <root>/applications/**
    if '/components/' in file_path or '/applications/' in file_path:
        return True

    return False

def download_repo(repo_url, output_file, token):
    """Download a GitHub repository archive and combine its JS/TS sources.

    Fetches ``<repo_url>/archive/master.zip`` with ``token`` in the
    ``Authorization`` header, then writes each file accepted by
    ``is_valid_file`` and ``is_likely_useful_file`` to ``output_file``,
    preceded by a ``# File: <path>`` header line. Directory entries and files
    that cannot be decoded as UTF-8 are skipped.

    Args:
        repo_url: Base URL of the repository, without a trailing slash.
        output_file: Path of the text file to create or overwrite.
        token: GitHub personal access token used to authenticate the request.

    Raises:
        requests.HTTPError: If the archive request returns an error status.
    """
    headers = {'Authorization': f'token {token}'}
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        repo_url + "/archive/master.zip", headers=headers, timeout=60
    )
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            for file_path in zip_file.namelist():
                if (
                    file_path.endswith("/")
                    or not is_valid_file(file_path)
                    or not is_likely_useful_file(file_path)
                ):
                    continue

                try:
                    file_content = zip_file.read(file_path).decode("utf-8")
                except UnicodeDecodeError:
                    # One badly encoded file should not abort the whole export.
                    continue

                outfile.write(f"# File: {file_path}\n")
                outfile.write(file_content)
                outfile.write("\n\n")

if __name__ == "__main__":
    # Expect exactly two arguments: the repository URL and the access token.
    if len(sys.argv) != 3:
        print("Usage: python script.py <github_repo_url> <personal_access_token>")
        sys.exit(1)

    # Drop any trailing slash so the repository name is not empty and the
    # archive URL built from repo_url does not contain a double slash.
    repo_url = sys.argv[1].rstrip("/")
    token = sys.argv[2]
    repo_name = repo_url.split("/")[-1]
    # NOTE(review): the "_python" suffix is kept so existing output paths do
    # not change, even though this variant collects JavaScript/TypeScript.
    output_file = f"{repo_name}_python.txt"

    download_repo(repo_url, output_file, token)
    print(f"Combined JavaScript/TypeScript source code saved to {output_file}")

oslook · 2024-03-13T09:37:51Z

Here's the updated Python code that reads the personal access token from an environment variable:

import os
import sys
import requests
import zipfile
import io
import ast

def download_repo(repo_url, output_file, token=None):
    """Download and process files from a GitHub repository.

    Fetches ``<repo_url>/archive/master.zip`` and writes each accepted file
    to ``output_file`` preceded by a ``# File: <path>`` header line. The
    access token is read from the ``GITHUB_ACCESS_TOKEN`` environment
    variable; when no token is available the request is sent without an
    ``Authorization`` header.

    A file is skipped when it is a directory entry, is rejected by
    ``is_python_file`` / ``is_likely_useful_file``, is flagged by
    ``is_test_file``, fails ``has_sufficient_content``, cannot be decoded as
    UTF-8, or cannot be parsed by ``remove_comments_and_docstrings``.

    Args:
        repo_url: Base URL of the repository, without a trailing slash.
        output_file: Path of the text file to create or overwrite.
        token: Optional fallback token, used only when the
            ``GITHUB_ACCESS_TOKEN`` environment variable is unset or empty.

    Raises:
        requests.HTTPError: If the archive request returns an error status.
    """
    # The environment variable takes precedence; ``token`` is only a fallback.
    token = os.environ.get("GITHUB_ACCESS_TOKEN") or token
    # An unset variable yields None, which must not be formatted into the
    # header, so the header is only sent when a non-empty token exists.
    headers = {'Authorization': f'token {token}'} if token else {}
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        repo_url + "/archive/master.zip", headers=headers, timeout=60
    )
    response.raise_for_status()  # Raise an exception for error status codes

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            for file_path in zip_file.namelist():
                # Skip directories, non-Python files and less likely useful files
                if (
                    file_path.endswith("/")
                    or not is_python_file(file_path)
                    or not is_likely_useful_file(file_path)
                ):
                    continue

                try:
                    file_content = zip_file.read(file_path).decode("utf-8")
                except UnicodeDecodeError:
                    # One badly encoded file should not abort the whole export.
                    continue

                # Skip test files based on content and files with insufficient
                # substantive content
                if is_test_file(file_content) or not has_sufficient_content(file_content):
                    continue

                try:
                    file_content = remove_comments_and_docstrings(file_content)
                except SyntaxError:
                    # Skip files with syntax errors
                    continue

                outfile.write(f"# File: {file_path}\n")
                outfile.write(file_content)
                outfile.write("\n\n")

# ... (the rest of the code remains the same)

if __name__ == "__main__":
    # Expect exactly one argument: the repository URL. The access token is
    # taken from the GITHUB_ACCESS_TOKEN environment variable instead.
    if len(sys.argv) != 2:
        print("Usage: python script.py <github_repo_url>")
        sys.exit(1)

    # Drop any trailing slash so the repository name is not empty and the
    # archive URL built from repo_url does not contain a double slash.
    repo_url = sys.argv[1].rstrip("/")
    repo_name = repo_url.split("/")[-1]
    output_file = f"{repo_name}_python.txt"

    # download_repo takes a token argument; omitting it raises TypeError.
    download_repo(repo_url, output_file, os.environ.get("GITHUB_ACCESS_TOKEN"))
    print(f"Combined Python source code saved to {output_file}")

How to use:

Set the environment variable:
In your terminal, run the following command, replacing <your_token> with your actual personal access token:

export GITHUB_ACCESS_TOKEN=<your_token>

Run the Python script.

ehartford · 2024-03-13T19:17:39Z

Wanna make a PR?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

question: how do we use this with private repos? #5

question: how do we use this with private repos? #5

dep commented Mar 12, 2024

rnehrboss commented Mar 12, 2024

dep commented Mar 12, 2024 •

edited

Loading

dep commented Mar 12, 2024 •

edited

Loading

oslook commented Mar 13, 2024

ehartford commented Mar 13, 2024

question: how do we use this with private repos? #5

question: how do we use this with private repos? #5

Comments

dep commented Mar 12, 2024

rnehrboss commented Mar 12, 2024

dep commented Mar 12, 2024 • edited Loading

dep commented Mar 12, 2024 • edited Loading

oslook commented Mar 13, 2024

ehartford commented Mar 13, 2024

dep commented Mar 12, 2024 •

edited

Loading

dep commented Mar 12, 2024 •

edited

Loading