-
Notifications
You must be signed in to change notification settings - Fork 103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
question: how do we use this with private repos? #5
Comments
Bump... I'm curious too |
To adapt the provided code to use a personal access token for authentication, you can add the token to the request headers. Here's how you can modify the download_repo function: cc @rnehrboss
import os
import sys
import requests
import zipfile
import io
import ast
def download_repo(repo_url, output_file, token, branch="master", timeout=30):
    """Download a GitHub repository archive and merge its Python files into one text file.

    The repository is fetched as ``<repo_url>/archive/<branch>.zip`` with the
    personal access token sent in the ``Authorization`` header. Each archive
    member that passes the filters is written to ``output_file`` preceded by a
    ``# File: <path>`` header line.

    Args:
        repo_url: Base URL of the repository; trailing slashes are ignored.
        output_file: Path of the text file to create (overwritten if present).
        token: GitHub personal access token.
        branch: Branch whose archive is downloaded. Defaults to ``"master"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    headers = {'Authorization': f'token {token}'}
    base_url = repo_url.rstrip("/")
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(f"{base_url}/archive/{branch}.zip", headers=headers, timeout=timeout)
    response.raise_for_status()  # Raise an exception for non-2xx status codes

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for file_path in zip_file.namelist():
            # Skip directory entries and anything the path-based filters
            # (helpers defined elsewhere in the script) reject.
            if file_path.endswith("/") or not is_python_file(file_path) or not is_likely_useful_file(file_path):
                continue

            try:
                file_content = zip_file.read(file_path).decode("utf-8")
            except UnicodeDecodeError:
                # One file in another encoding should not abort the whole run.
                continue

            # Content-based filters (helpers defined elsewhere in the script).
            if is_test_file(file_content) or not has_sufficient_content(file_content):
                continue

            try:
                file_content = remove_comments_and_docstrings(file_content)
            except SyntaxError:
                # Skip files with syntax errors
                continue

            outfile.write(f"# File: {file_path}\n")
            outfile.write(file_content)
            outfile.write("\n\n")
# ... (the rest of the code remains the same)
if __name__ == "__main__":
    # Script entry point: expects the repository URL and a personal access
    # token as the two command-line arguments.
    # NOTE(review): a token passed on the command line may end up in shell
    # history or process listings; consider an environment variable instead.
    if len(sys.argv) != 3:
        print("Usage: python script.py <github_repo_url> <personal_access_token>")
        sys.exit(1)

    # Drop trailing slashes so the last path segment is the repository name
    # (otherwise ".../repo/" would yield an empty name and "_python.txt").
    repo_url = sys.argv[1].rstrip("/")
    token = sys.argv[2]
    repo_name = repo_url.split("/")[-1]
    output_file = f"{repo_name}_python.txt"

    download_repo(repo_url, output_file, token)
    print(f"Combined Python source code saved to {output_file}")
I also made a simplified, modified version that you can use for JavaScript / TypeScript codebases. It should give you an idea of how to extend it for your own repo as well:
import os
import sys
import requests
import zipfile
import io
import ast
import re
def is_valid_file(file_path):
    """Check if the file is a JavaScript or TypeScript file.

    Args:
        file_path: Path of the file inside the archive.

    Returns:
        True when the path ends with ``.js``, ``.jsx``, ``.ts`` or ``.tsx``.
    """
    # ".ts" was missing from the original list, so plain TypeScript modules
    # were silently dropped even though ".tsx" files were accepted.
    allowed_extensions = (".js", ".jsx", ".ts", ".tsx")
    # str.endswith accepts a tuple, so no explicit loop is needed.
    return file_path.endswith(allowed_extensions)
def is_likely_useful_file(file_path):
    """Decide whether a file from the archive is worth including in the output.

    Args:
        file_path: Slash-separated path of the file inside the archive.

    Returns:
        False for test files (``.test.``), Storybook files (``.stories.``) and
        anything located under a ``node_modules``, ``vendor`` or ``dist``
        directory. Otherwise True only when the path contains a
        ``/components/`` or ``/applications/`` directory.
    """
    # Exclude test and story files
    if ".test." in file_path or ".stories." in file_path:
        return False

    # Exclude dependency and build-output directories. The original patterns
    # (r'\node_modules\.', r'\vendor\.', r'\dist\.') began with the regex
    # escapes \n, \v and \d, so they never matched a real path; compare whole
    # directory names instead.
    excluded_dirs = ("node_modules", "vendor", "dist")
    directories = file_path.split("/")[:-1]
    if any(directory in excluded_dirs for directory in directories):
        return False

    # Include files in <root>/components/** and <root>/applications/**
    return "/components/" in file_path or "/applications/" in file_path
def download_repo(repo_url, output_file, token, branch="master", timeout=30):
    """Download a GitHub repository archive and merge its JS/TS files into one text file.

    The repository is fetched as ``<repo_url>/archive/<branch>.zip`` with the
    personal access token sent in the ``Authorization`` header. Each archive
    member accepted by ``is_valid_file`` and ``is_likely_useful_file`` is
    written to ``output_file`` preceded by a ``# File: <path>`` header line.

    Args:
        repo_url: Base URL of the repository; trailing slashes are ignored.
        output_file: Path of the text file to create (overwritten if present).
        token: GitHub personal access token.
        branch: Branch whose archive is downloaded. Defaults to ``"master"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    headers = {'Authorization': f'token {token}'}
    base_url = repo_url.rstrip("/")
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(f"{base_url}/archive/{branch}.zip", headers=headers, timeout=timeout)
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for file_path in zip_file.namelist():
            # Skip directory entries and files rejected by the path filters.
            if file_path.endswith("/") or not is_valid_file(file_path) or not is_likely_useful_file(file_path):
                continue

            try:
                file_content = zip_file.read(file_path).decode("utf-8")
            except UnicodeDecodeError:
                # One file in another encoding should not abort the whole run.
                continue

            # The original wrapped a bare `file_content` expression in
            # try/except SyntaxError; that could never raise, so it is gone.
            outfile.write(f"# File: {file_path}\n")
            outfile.write(file_content)
            outfile.write("\n\n")
if __name__ == "__main__":
    # Script entry point: expects the repository URL and a personal access
    # token as the two command-line arguments.
    if len(sys.argv) != 3:
        print("Usage: python script.py <github_repo_url> <personal_access_token>")
        sys.exit(1)

    # Drop trailing slashes so the last path segment is the repository name
    # (otherwise ".../repo/" would yield an empty name).
    repo_url = sys.argv[1].rstrip("/")
    token = sys.argv[2]
    repo_name = repo_url.split("/")[-1]
    # NOTE(review): the "_python" suffix is inherited from the Python variant
    # of this script; it is kept so the output path does not change.
    output_file = f"{repo_name}_python.txt"

    download_repo(repo_url, output_file, token)
    # This variant collects JavaScript/TypeScript files, not Python.
    print(f"Combined JavaScript/TypeScript source code saved to {output_file}")
Here's the updated Python code that reads the personal access token from an environment variable:
import os
import sys
import requests
import zipfile
import io
import ast
def download_repo(repo_url, output_file, token=None, branch="master", timeout=30):
    """Download a GitHub repository archive and merge its Python files into one text file.

    The repository is fetched as ``<repo_url>/archive/<branch>.zip``. When a
    token is available it is sent in the ``Authorization`` header; otherwise
    the request is made anonymously. Each archive member that passes the
    filters is written to ``output_file`` preceded by a ``# File: <path>``
    header line.

    Args:
        repo_url: Base URL of the repository; trailing slashes are ignored.
        output_file: Path of the text file to create (overwritten if present).
        token: GitHub personal access token. When omitted or empty, the
            ``GITHUB_ACCESS_TOKEN`` environment variable is used instead.
        branch: Branch whose archive is downloaded. Defaults to ``"master"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    # Fall back to the environment variable when no token was passed in.
    if not token:
        token = os.environ.get("GITHUB_ACCESS_TOKEN")

    # os.environ.get returns None (not "") when the variable is unset, so test
    # truthiness; comparing against "" would send the header "token None".
    headers = {'Authorization': f'token {token}'} if token else {}

    base_url = repo_url.rstrip("/")
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(f"{base_url}/archive/{branch}.zip", headers=headers, timeout=timeout)
    response.raise_for_status()  # Raise an exception for non-2xx status codes

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for file_path in zip_file.namelist():
            # Skip directory entries and anything the path-based filters
            # (helpers defined elsewhere in the script) reject.
            if file_path.endswith("/") or not is_python_file(file_path) or not is_likely_useful_file(file_path):
                continue

            try:
                file_content = zip_file.read(file_path).decode("utf-8")
            except UnicodeDecodeError:
                # One file in another encoding should not abort the whole run.
                continue

            # Content-based filters (helpers defined elsewhere in the script).
            if is_test_file(file_content) or not has_sufficient_content(file_content):
                continue

            try:
                file_content = remove_comments_and_docstrings(file_content)
            except SyntaxError:
                # Skip files with syntax errors
                continue

            outfile.write(f"# File: {file_path}\n")
            outfile.write(file_content)
            outfile.write("\n\n")
# ... (the rest of the code remains the same)
if __name__ == "__main__":
    # Script entry point: expects only the repository URL on the command line;
    # the access token comes from the GITHUB_ACCESS_TOKEN environment variable.
    if len(sys.argv) != 2:
        print("Usage: python script.py <github_repo_url>")
        sys.exit(1)

    # Drop trailing slashes so the last path segment is the repository name
    # (otherwise ".../repo/" would yield an empty name and "_python.txt").
    repo_url = sys.argv[1].rstrip("/")
    repo_name = repo_url.split("/")[-1]
    output_file = f"{repo_name}_python.txt"

    # download_repo takes the token as its third argument; the original call
    # passed only two arguments and failed with a TypeError.
    download_repo(repo_url, output_file, os.environ.get("GITHUB_ACCESS_TOKEN"))
    print(f"Combined Python source code saved to {output_file}")

# How to use:
export GITHUB_ACCESS_TOKEN=<your_token>
|
Wanna make a PR? |
Seems a pretty common use case. Documentation around this would be helpful
The text was updated successfully, but these errors were encountered: