repo-to-text/repo_to_text/core/core.py
2024-12-17 15:06:59 +01:00

335 lines
12 KiB
Python

"""
Core functionality for repo-to-text
"""
import os
import subprocess
from typing import Tuple, Optional, List, Dict, Any, Set
from datetime import datetime, timezone
import logging
import yaml
import pathspec
from pathspec import PathSpec
from ..utils.utils import check_tree_command, is_ignored_path
def get_tree_structure(
path: str = '.',
gitignore_spec: Optional[PathSpec] = None,
tree_and_content_ignore_spec: Optional[PathSpec] = None
) -> str:
"""Generate tree structure of the directory.
Args:
path: Directory path to generate tree for
gitignore_spec: PathSpec object for gitignore patterns
tree_and_content_ignore_spec: PathSpec object for tree and content ignore patterns
Returns:
str: Generated tree structure with empty directories and ignored files removed
"""
if not check_tree_command():
return ""
logging.debug('Generating tree structure for path: %s', path)
result = subprocess.run(
['tree', '-a', '-f', '--noreport', path],
stdout=subprocess.PIPE,
check=True
)
tree_output = result.stdout.decode('utf-8')
logging.debug('Tree output generated:\n%s', tree_output)
if not gitignore_spec and not tree_and_content_ignore_spec:
logging.debug('No .gitignore or ignore-tree-and-content specification found')
return tree_output
logging.debug(
'Filtering tree output based on .gitignore and ignore-tree-and-content specification'
)
lines: List[str] = tree_output.splitlines()
non_empty_dirs: Set[str] = set()
current_path: List[str] = []
for line in lines:
indent_level = len(line) - len(line.lstrip('│ ├└'))
current_path = current_path[:indent_level]
idx = line.find('./')
if idx == -1:
idx = line.find(path)
if idx != -1:
full_path = line[idx:].strip()
else:
continue
if full_path == '.':
continue
relative_path = os.path.relpath(full_path, path)
relative_path = relative_path.replace(os.sep, '/')
# Skip if file should be ignored
if should_ignore_file(
full_path,
relative_path,
gitignore_spec,
None,
tree_and_content_ignore_spec
):
logging.debug('Ignored: %s', relative_path)
continue
# If this is a file, mark all parent directories as non-empty
if not os.path.isdir(full_path):
dir_path = os.path.dirname(relative_path)
while dir_path:
non_empty_dirs.add(dir_path)
dir_path = os.path.dirname(dir_path)
# Second pass: build filtered output
filtered_lines: List[str] = []
current_path = []
for line in lines:
indent_level = len(line) - len(line.lstrip('│ ├└'))
current_path = current_path[:indent_level]
# Always include root path
if indent_level == 0:
filtered_lines.append(line)
continue
idx = line.find('./')
if idx == -1:
idx = line.find(path)
if idx != -1:
full_path = line[idx:].strip()
else:
continue
relative_path = os.path.relpath(full_path, path)
relative_path = relative_path.replace(os.sep, '/')
# Skip if file should be ignored
if should_ignore_file(
full_path,
relative_path,
gitignore_spec,
None,
tree_and_content_ignore_spec
):
continue
# Include line if it's a file or a non-empty directory
if not os.path.isdir(full_path) or os.path.dirname(relative_path) in non_empty_dirs:
display_line = line.replace('./', '', 1)
filtered_lines.append(display_line)
filtered_tree_output = '\n'.join(filtered_lines)
logging.debug('Filtered tree structure:\n%s', filtered_tree_output)
return filtered_tree_output
def load_ignore_specs(
path: str = '.',
cli_ignore_patterns: Optional[List[str]] = None
) -> Tuple[Optional[PathSpec], Optional[PathSpec], PathSpec]:
"""Load ignore specifications from various sources.
Args:
path: Base directory path
cli_ignore_patterns: List of patterns from command line
Returns:
Tuple[Optional[PathSpec], Optional[PathSpec], PathSpec]: Tuple of gitignore_spec,
content_ignore_spec, and tree_and_content_ignore_spec
"""
gitignore_spec = None
content_ignore_spec = None
tree_and_content_ignore_list: List[str] = []
use_gitignore = True
repo_settings_path = os.path.join(path, '.repo-to-text-settings.yaml')
if os.path.exists(repo_settings_path):
logging.debug('Loading .repo-to-text-settings.yaml from path: %s', repo_settings_path)
with open(repo_settings_path, 'r', encoding='utf-8') as f:
settings: Dict[str, Any] = yaml.safe_load(f)
use_gitignore = settings.get('gitignore-import-and-ignore', True)
if 'ignore-content' in settings:
content_ignore_spec: Optional[PathSpec] = pathspec.PathSpec.from_lines(
'gitwildmatch', settings['ignore-content']
)
if 'ignore-tree-and-content' in settings:
tree_and_content_ignore_list.extend(settings.get('ignore-tree-and-content', []))
if cli_ignore_patterns:
tree_and_content_ignore_list.extend(cli_ignore_patterns)
if use_gitignore:
gitignore_path = os.path.join(path, '.gitignore')
if os.path.exists(gitignore_path):
logging.debug('Loading .gitignore from path: %s', gitignore_path)
with open(gitignore_path, 'r', encoding='utf-8') as f:
gitignore_spec = pathspec.PathSpec.from_lines('gitwildmatch', f)
tree_and_content_ignore_spec = pathspec.PathSpec.from_lines(
'gitwildmatch', tree_and_content_ignore_list
)
return gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec
def should_ignore_file(
file_path: str,
relative_path: str,
gitignore_spec: Optional[PathSpec],
content_ignore_spec: Optional[PathSpec],
tree_and_content_ignore_spec: Optional[PathSpec]
) -> bool:
"""Check if a file should be ignored based on various ignore specifications.
Args:
file_path: Full path to the file
relative_path: Path relative to the repository root
gitignore_spec: PathSpec object for gitignore patterns
content_ignore_spec: PathSpec object for content ignore patterns
tree_and_content_ignore_spec: PathSpec object for tree and content ignore patterns
Returns:
bool: True if file should be ignored, False otherwise
"""
relative_path = relative_path.replace(os.sep, '/')
if relative_path.startswith('./'):
relative_path = relative_path[2:]
if os.path.isdir(file_path):
relative_path += '/'
result = (
is_ignored_path(file_path) or
bool(
gitignore_spec and
gitignore_spec.match_file(relative_path)
) or
bool(
content_ignore_spec and
content_ignore_spec.match_file(relative_path)
) or
bool(
tree_and_content_ignore_spec and
tree_and_content_ignore_spec.match_file(relative_path)
) or
os.path.basename(file_path).startswith('repo-to-text_')
)
logging.debug('Checking if file should be ignored:')
logging.debug(' file_path: %s', file_path)
logging.debug(' relative_path: %s', relative_path)
logging.debug(' Result: %s', result)
return result
def save_repo_to_text(
path: str = '.',
output_dir: Optional[str] = None,
to_stdout: bool = False,
cli_ignore_patterns: Optional[List[str]] = None
) -> str:
"""Save repository structure and contents to a text file.
Args:
path: Repository path
output_dir: Directory to save output file
to_stdout: Whether to output to stdout instead of file
cli_ignore_patterns: List of patterns from command line
Returns:
str: Path to the output file or the output text if to_stdout is True
"""
logging.debug('Starting to save repo structure to text for path: %s', path)
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(
path, cli_ignore_patterns
)
tree_structure: str = get_tree_structure(
path, gitignore_spec, tree_and_content_ignore_spec
)
logging.debug('Final tree structure to be written: %s', tree_structure)
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-%M-%S-UTC')
output_file = f'repo-to-text_{timestamp}.txt'
if output_dir:
if not os.path.exists(output_dir):
os.makedirs(output_dir)
output_file = os.path.join(output_dir, output_file)
output_content: List[str] = []
project_name = os.path.basename(os.path.abspath(path))
output_content.append(f'Directory: {project_name}\n\n')
output_content.append('Directory Structure:\n')
output_content.append('```\n.\n')
if os.path.exists(os.path.join(path, '.gitignore')):
output_content.append('├── .gitignore\n')
output_content.append(tree_structure + '\n' + '```\n')
logging.debug('Tree structure written to output content')
for root, _, files in os.walk(path):
for filename in files:
file_path = os.path.join(root, filename)
relative_path = os.path.relpath(file_path, path)
if should_ignore_file(
file_path,
relative_path,
gitignore_spec,
content_ignore_spec,
tree_and_content_ignore_spec
):
continue
relative_path = relative_path.replace('./', '', 1)
output_content.append(f'\nContents of {relative_path}:\n')
output_content.append('```\n')
try:
with open(file_path, 'r', encoding='utf-8') as f:
output_content.append(f.read())
except UnicodeDecodeError:
logging.debug('Could not decode file contents: %s', file_path)
output_content.append('[Could not decode file contents]\n')
output_content.append('\n```\n')
output_content.append('\n')
logging.debug('Repository contents written to output content')
output_text = ''.join(output_content)
if to_stdout:
print(output_text)
return output_text
with open(output_file, 'w', encoding='utf-8') as file:
file.write(output_text)
try:
import importlib.util # pylint: disable=import-outside-toplevel
if importlib.util.find_spec("pyperclip"):
import pyperclip # pylint: disable=import-outside-toplevel # type: ignore
pyperclip.copy(output_text) # type: ignore
logging.debug('Repository structure and contents copied to clipboard')
else:
print("Tip: Install 'pyperclip' package to enable automatic clipboard copying:")
print(" pip install pyperclip")
except (ImportError) as e:
logging.warning(
'Could not copy to clipboard. You might be running this '
'script over SSH or without clipboard support.'
)
logging.debug('Clipboard copy error: %s', e)
print(
"[SUCCESS] Repository structure and contents successfully saved to "
f"file: \"./{output_file}\""
)
return output_file