Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

question: how do we use this with private repos? #5

Open
dep opened this issue Mar 12, 2024 · 5 comments
Open

question: how do we use this with private repos? #5

dep opened this issue Mar 12, 2024 · 5 comments

Comments

@dep
Copy link

dep commented Mar 12, 2024

This seems like a pretty common use case. Documentation around this would be helpful.

@rnehrboss
Copy link

Bump... I'm curious too

@dep
Copy link
Author

dep commented Mar 12, 2024

To adapt the provided code to use a personal access token for authentication, you can add the token to the request headers. Here's how you can modify the download_repo function:

cc @rnehrboss

import os
import sys
import requests
import zipfile
import io
import ast

def download_repo(repo_url, output_file, token, branch="master", timeout=60):
    """Download a GitHub repository archive and merge its Python files into one text file.

    Fetches ``<repo_url>/archive/<branch>.zip`` and writes every archive
    member that passes the filters below to ``output_file``. Each file is
    preceded by a ``# File: <path>`` header line and followed by a blank line.

    Args:
        repo_url: Base URL of the repository, without a trailing slash.
        output_file: Path of the UTF-8 text file to create or overwrite.
        token: Personal access token sent as ``Authorization: token <token>``.
            A falsy value (``None`` or ``""``) sends no Authorization header
            instead of a meaningless ``token None`` one.
        branch: Name of the branch whose archive is requested.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            connection cannot hang the script forever.

    Raises:
        requests.RequestException: On connection failure, timeout, or (as
            ``requests.HTTPError``) an error status from the server.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    headers = {'Authorization': f'token {token}'} if token else {}
    response = requests.get(
        f"{repo_url}/archive/{branch}.zip", headers=headers, timeout=timeout
    )
    response.raise_for_status()

    # The archive is opened before the output file, so an invalid download
    # does not leave an empty output file behind.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            for file_path in zip_file.namelist():
                # Skip directory entries and anything the path-based filters
                # reject (is_python_file / is_likely_useful_file live in the
                # part of the script that is not shown here).
                if file_path.endswith("/"):
                    continue
                if not is_python_file(file_path) or not is_likely_useful_file(file_path):
                    continue

                try:
                    file_content = zip_file.read(file_path).decode("utf-8")
                except UnicodeDecodeError:
                    # One non-UTF-8 file should not abort the whole export.
                    continue

                # Skip test files based on content and files with
                # insufficient substantive content.
                if is_test_file(file_content) or not has_sufficient_content(file_content):
                    continue

                try:
                    file_content = remove_comments_and_docstrings(file_content)
                except SyntaxError:
                    # Skip files with syntax errors.
                    continue

                outfile.write(f"# File: {file_path}\n")
                outfile.write(file_content)
                outfile.write("\n\n")

# ... (the rest of the code remains the same)

# Command-line entry point: python script.py <github_repo_url> <personal_access_token>
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <github_repo_url> <personal_access_token>")
        sys.exit(1)

    # Drop trailing slashes: otherwise the last path segment is empty (giving
    # an output file named "_python.txt") and the archive URL built by
    # download_repo contains a double slash.
    repo_url = sys.argv[1].rstrip("/")
    token = sys.argv[2]
    repo_name = repo_url.split("/")[-1]
    output_file = f"{repo_name}_python.txt"

    download_repo(repo_url, output_file, token)
    print(f"Combined Python source code saved to {output_file}")

@dep
Copy link
Author

dep commented Mar 12, 2024

I also made a simplified, modified version that you can use for JavaScript / TypeScript codebases. It should give you an idea of how to extend it for your own repo as well:

import os
import sys
import requests
import zipfile
import io
import ast
import re

def is_valid_file(file_path):
    """Check if the file is a JavaScript or TypeScript file.

    Args:
        file_path: Path (or bare file name) to test.

    Returns:
        True when the path ends in ``.js``, ``.jsx``, ``.ts`` or ``.tsx``,
        otherwise False. The comparison is case-sensitive.
    """
    # ".ts" is included so that plain TypeScript modules are accepted, not
    # only TSX components. str.endswith takes a tuple of suffixes directly.
    allowed_extensions = (".js", ".jsx", ".ts", ".tsx")
    return file_path.endswith(allowed_extensions)

def is_likely_useful_file(file_path):
    """Decide from its path alone whether a file is worth exporting.

    Args:
        file_path: Slash-separated path of an archive member.

    Returns:
        False for test and story files (path contains ``.test.`` or
        ``.stories.``) and for anything located under a ``node_modules``,
        ``vendor`` or ``dist`` directory. Otherwise True only when the path
        contains a ``/components/`` or ``/applications/`` segment.
    """
    # Exclude test and Storybook files.
    if '.test.' in file_path or '.stories.' in file_path:
        return False

    # Exclude dependency and build-output directories. Whole path segments
    # are compared: the regexes used before (r'\node_modules\.' etc.) read
    # "\n", "\v" and "\d" as escape classes and never matched these names.
    excluded_dirs = {"node_modules", "vendor", "dist"}
    parent_dirs = file_path.split("/")[:-1]
    if excluded_dirs.intersection(parent_dirs):
        return False

    # Include only files that sit somewhere below a components/ or
    # applications/ directory.
    return '/components/' in file_path or '/applications/' in file_path

def download_repo(repo_url, output_file, token, branch="master", timeout=60):
    """Download a GitHub repository archive and merge selected source files into one text file.

    Fetches ``<repo_url>/archive/<branch>.zip`` and writes every archive
    member accepted by ``is_valid_file`` and ``is_likely_useful_file`` to
    ``output_file``. Each file is preceded by a ``# File: <path>`` header
    line and followed by a blank line.

    Args:
        repo_url: Base URL of the repository, without a trailing slash.
        output_file: Path of the UTF-8 text file to create or overwrite.
        token: Personal access token sent as ``Authorization: token <token>``.
            A falsy value (``None`` or ``""``) sends no Authorization header.
        branch: Name of the branch whose archive is requested.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            connection cannot hang the script forever.

    Raises:
        requests.RequestException: On connection failure, timeout, or (as
            ``requests.HTTPError``) an error status from the server.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    headers = {'Authorization': f'token {token}'} if token else {}
    response = requests.get(
        f"{repo_url}/archive/{branch}.zip", headers=headers, timeout=timeout
    )
    response.raise_for_status()

    # The archive is opened before the output file, so an invalid download
    # does not leave an empty output file behind.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            for file_path in zip_file.namelist():
                # Skip directory entries and files rejected by the path filters.
                if file_path.endswith("/"):
                    continue
                if not is_valid_file(file_path) or not is_likely_useful_file(file_path):
                    continue

                try:
                    file_content = zip_file.read(file_path).decode("utf-8")
                except UnicodeDecodeError:
                    # Decoding is the only per-file step that can fail here;
                    # skip the file rather than abort the whole export. (The
                    # earlier `except SyntaxError` guarded a bare expression
                    # and could never trigger.)
                    continue

                outfile.write(f"# File: {file_path}\n")
                outfile.write(file_content)
                outfile.write("\n\n")

# Command-line entry point: python script.py <github_repo_url> <personal_access_token>
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <github_repo_url> <personal_access_token>")
        sys.exit(1)

    # Drop trailing slashes: otherwise the last path segment is empty and the
    # archive URL built by download_repo contains a double slash.
    repo_url = sys.argv[1].rstrip("/")
    token = sys.argv[2]
    repo_name = repo_url.split("/")[-1]
    # The "_python.txt" suffix is kept as-is so the output location does not
    # change, even though this variant collects JavaScript/TypeScript files.
    output_file = f"{repo_name}_python.txt"

    download_repo(repo_url, output_file, token)
    # This variant exports JS/TS sources, so the message no longer says "Python".
    print(f"Combined JavaScript/TypeScript source code saved to {output_file}")

@oslook
Copy link
Contributor

oslook commented Mar 13, 2024

Here's the updated Python code that reads the personal access token from an environment variable:

import os
import sys
import requests
import zipfile
import io
import ast

def download_repo(repo_url, output_file, token=None, branch="master", timeout=60):
    """Download a GitHub repository archive and merge its Python files into one text file.

    Fetches ``<repo_url>/archive/<branch>.zip`` and writes every archive
    member that passes the filters below to ``output_file``. Each file is
    preceded by a ``# File: <path>`` header line and followed by a blank line.

    Args:
        repo_url: Base URL of the repository, without a trailing slash.
        output_file: Path of the UTF-8 text file to create or overwrite.
        token: Optional personal access token. When omitted or empty, the
            ``GITHUB_ACCESS_TOKEN`` environment variable is used instead; if
            that is also unset or empty, the request is sent unauthenticated.
        branch: Name of the branch whose archive is requested.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            connection cannot hang the script forever.

    Raises:
        requests.RequestException: On connection failure, timeout, or (as
            ``requests.HTTPError``) an error status from the server.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    # os.environ.get returns None (not "") for an unset variable, so test
    # truthiness; comparing against "" would send the header "token None".
    token = token or os.environ.get("GITHUB_ACCESS_TOKEN")
    headers = {'Authorization': f'token {token}'} if token else {}
    response = requests.get(
        f"{repo_url}/archive/{branch}.zip", headers=headers, timeout=timeout
    )
    response.raise_for_status()

    # The archive is opened before the output file, so an invalid download
    # does not leave an empty output file behind.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            for file_path in zip_file.namelist():
                # Skip directory entries and anything the path-based filters
                # reject (is_python_file / is_likely_useful_file live in the
                # part of the script that is not shown here).
                if file_path.endswith("/"):
                    continue
                if not is_python_file(file_path) or not is_likely_useful_file(file_path):
                    continue

                try:
                    file_content = zip_file.read(file_path).decode("utf-8")
                except UnicodeDecodeError:
                    # One non-UTF-8 file should not abort the whole export.
                    continue

                # Skip test files based on content and files with
                # insufficient substantive content.
                if is_test_file(file_content) or not has_sufficient_content(file_content):
                    continue

                try:
                    file_content = remove_comments_and_docstrings(file_content)
                except SyntaxError:
                    # Skip files with syntax errors.
                    continue

                outfile.write(f"# File: {file_path}\n")
                outfile.write(file_content)
                outfile.write("\n\n")

# ... (the rest of the code remains the same)

# Command-line entry point: python script.py <github_repo_url>
# The access token is taken from the GITHUB_ACCESS_TOKEN environment variable.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script.py <github_repo_url>")
        sys.exit(1)

    # Drop trailing slashes: otherwise the last path segment is empty and the
    # archive URL built by download_repo contains a double slash.
    repo_url = sys.argv[1].rstrip("/")
    repo_name = repo_url.split("/")[-1]
    output_file = f"{repo_name}_python.txt"

    # download_repo takes the token as its third argument; calling it with
    # only two arguments raises TypeError when that parameter has no default.
    token = os.environ.get("GITHUB_ACCESS_TOKEN")
    download_repo(repo_url, output_file, token)
    print(f"Combined Python source code saved to {output_file}")

How to use:

  1. Set the environment variable:
    In your terminal, run the following command, replacing <your_token> with your actual personal access token:
export GITHUB_ACCESS_TOKEN=<your_token>
  2. Run the Python script.

@ehartford
Copy link
Contributor

Wanna make a PR?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

4 participants