Add support for splitting output text by maximum word count

This commit is contained in:
Zhan Li 2025-05-25 00:11:54 -07:00
parent 9431ff9d07
commit e066b481af
6 changed files with 516 additions and 60 deletions

View file

@ -39,6 +39,11 @@ def create_default_settings_file() -> None:
- "README.md"
- "LICENSE"
- "package-lock.json"
# Optional: Maximum number of words per output file before splitting.
# If not specified or null, no splitting based on word count will occur.
# Must be a positive integer if set.
# maximum_word_count_per_file: 10000
""")
with open('.repo-to-text-settings.yaml', 'w', encoding='utf-8') as f:
f.write(default_settings)

View file

@ -4,11 +4,11 @@ Core functionality for repo-to-text
import os
import subprocess
from typing import Tuple, Optional, List, Dict, Any, Set
from typing import Tuple, Optional, List, Dict, Any, Set, IO
from datetime import datetime, timezone
from importlib.machinery import ModuleSpec
import logging
import yaml
import yaml # type: ignore
import pathspec
from pathspec import PathSpec
@ -118,7 +118,7 @@ def load_ignore_specs(
cli_ignore_patterns: List of patterns from command line
Returns:
Tuple[Optional[PathSpec], Optional[PathSpec], PathSpec]: Tuple of gitignore_spec,
Tuple[Optional[PathSpec], Optional[PathSpec], PathSpec]: Tuple of gitignore_spec,
content_ignore_spec, and tree_and_content_ignore_spec
"""
gitignore_spec = None
@ -128,12 +128,12 @@ def load_ignore_specs(
repo_settings_path = os.path.join(path, '.repo-to-text-settings.yaml')
if os.path.exists(repo_settings_path):
logging.debug('Loading .repo-to-text-settings.yaml from path: %s', repo_settings_path)
logging.debug('Loading .repo-to-text-settings.yaml for ignore specs from path: %s', repo_settings_path)
with open(repo_settings_path, 'r', encoding='utf-8') as f:
settings: Dict[str, Any] = yaml.safe_load(f)
use_gitignore = settings.get('gitignore-import-and-ignore', True)
if 'ignore-content' in settings:
content_ignore_spec: Optional[PathSpec] = pathspec.PathSpec.from_lines(
content_ignore_spec = pathspec.PathSpec.from_lines(
'gitwildmatch', settings['ignore-content']
)
if 'ignore-tree-and-content' in settings:
@ -154,6 +154,27 @@ def load_ignore_specs(
)
return gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec
def load_additional_specs(path: str = '.') -> Dict[str, Any]:
    """Load additional specifications from the settings file.

    Reads ``.repo-to-text-settings.yaml`` inside ``path`` (if it exists) and
    extracts the optional ``maximum_word_count_per_file`` setting.

    Args:
        path: Directory that may contain ``.repo-to-text-settings.yaml``.

    Returns:
        A new dict with the key ``'maximum_word_count_per_file'``. Its value is
        the configured positive integer, or ``None`` when the settings file is
        missing, empty, not a mapping, or the value is absent, null or invalid.
    """
    additional_specs: Dict[str, Any] = {
        'maximum_word_count_per_file': None
    }
    repo_settings_path = os.path.join(path, '.repo-to-text-settings.yaml')
    if not os.path.exists(repo_settings_path):
        return additional_specs

    logging.debug(
        'Loading .repo-to-text-settings.yaml for additional specs from path: %s',
        repo_settings_path
    )
    with open(repo_settings_path, 'r', encoding='utf-8') as f:
        settings = yaml.safe_load(f)

    # safe_load yields None for an empty (or comment-only) document and can
    # yield a list/scalar for other top-level shapes; only a mapping can carry
    # the setting, so anything else is treated as "nothing configured" rather
    # than crashing on the membership test below.
    if not isinstance(settings, dict):
        if settings is not None:
            logging.warning(
                "Expected a mapping at the top level of %s but found %s. "
                "Ignoring additional specs.",
                repo_settings_path, type(settings).__name__
            )
        return additional_specs

    max_words = settings.get('maximum_word_count_per_file')
    if max_words is None:  # Absent or explicit null both mean "not set"
        return additional_specs

    # bool is a subclass of int, so `true` would otherwise pass as the limit 1.
    is_positive_int = (
        isinstance(max_words, int)
        and not isinstance(max_words, bool)
        and max_words > 0
    )
    if is_positive_int:
        additional_specs['maximum_word_count_per_file'] = max_words
    else:
        logging.warning(
            "Invalid value for 'maximum_word_count_per_file': %s. "
            "It must be a positive integer or null. Ignoring.", max_words
        )
    return additional_specs
def should_ignore_file(
file_path: str,
relative_path: str,
@ -210,61 +231,133 @@ def save_repo_to_text(
to_stdout: bool = False,
cli_ignore_patterns: Optional[List[str]] = None
) -> str:
"""Save repository structure and contents to a text file."""
"""Save repository structure and contents to a text file or multiple files."""
logging.debug('Starting to save repo structure to text for path: %s', path)
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(
path, cli_ignore_patterns
)
additional_specs = load_additional_specs(path)
maximum_word_count_per_file = additional_specs.get('maximum_word_count_per_file')
tree_structure: str = get_tree_structure(
path, gitignore_spec, tree_and_content_ignore_spec
)
logging.debug('Final tree structure to be written: %s', tree_structure)
output_content = generate_output_content(
output_content_segments = generate_output_content(
path,
tree_structure,
gitignore_spec,
content_ignore_spec,
tree_and_content_ignore_spec
tree_and_content_ignore_spec,
maximum_word_count_per_file
)
if to_stdout:
print(output_content)
return output_content
for segment in output_content_segments:
print(segment, end='') # Avoid double newlines if segments naturally end with one
# Return joined content for consistency, though primarily printed
return "".join(output_content_segments)
output_file = write_output_to_file(output_content, output_dir)
copy_to_clipboard(output_content)
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-%M-%S-UTC')
base_output_name_stem = f'repo-to-text_{timestamp}'
output_filepaths: List[str] = []
print(
"[SUCCESS] Repository structure and contents successfully saved to "
f"file: \"./{output_file}\""
)
if not output_content_segments:
logging.warning("generate_output_content returned no segments. No output file will be created.")
return "" # Or handle by creating an empty placeholder file
if len(output_content_segments) == 1:
single_filename = f"{base_output_name_stem}.txt"
full_path_single_file = os.path.join(output_dir, single_filename) if output_dir else single_filename
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir)
with open(full_path_single_file, 'w', encoding='utf-8') as f:
f.write(output_content_segments[0])
output_filepaths.append(full_path_single_file)
copy_to_clipboard(output_content_segments[0])
print(
"[SUCCESS] Repository structure and contents successfully saved to "
f"file: \"{os.path.relpath(full_path_single_file)}\"" # Use relpath for cleaner output
)
else: # Multiple segments
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir) # Create output_dir once if needed
for i, segment_content in enumerate(output_content_segments):
part_filename = f"{base_output_name_stem}_part_{i+1}.txt"
full_path_part_file = os.path.join(output_dir, part_filename) if output_dir else part_filename
with open(full_path_part_file, 'w', encoding='utf-8') as f:
f.write(segment_content)
output_filepaths.append(full_path_part_file)
print(
f"[SUCCESS] Repository structure and contents successfully saved to {len(output_filepaths)} files:"
)
for fp in output_filepaths:
print(f" - \"{os.path.relpath(fp)}\"") # Use relpath for cleaner output
return os.path.relpath(output_filepaths[0]) if output_filepaths else ""
return output_file
def generate_output_content(
path: str,
tree_structure: str,
gitignore_spec: Optional[PathSpec],
content_ignore_spec: Optional[PathSpec],
tree_and_content_ignore_spec: Optional[PathSpec]
) -> str:
"""Generate the output content for the repository."""
output_content: List[str] = []
tree_and_content_ignore_spec: Optional[PathSpec],
maximum_word_count_per_file: Optional[int] = None
) -> List[str]:
"""Generate the output content for the repository, potentially split into segments."""
# pylint: disable=too-many-arguments
# pylint: disable=too-many-locals
output_segments: List[str] = []
current_segment_builder: List[str] = []
current_segment_word_count: int = 0
project_name = os.path.basename(os.path.abspath(path))
def count_words(text: str) -> int:
return len(text.split())
def _finalize_current_segment():
nonlocal current_segment_word_count # Allow modification
if current_segment_builder:
output_segments.append("".join(current_segment_builder))
current_segment_builder.clear()
current_segment_word_count = 0
# Add XML opening tag
output_content.append('<repo-to-text>\n')
output_content.append(f'Directory: {project_name}\n\n')
output_content.append('Directory Structure:\n')
output_content.append('<directory_structure>\n.\n')
def _add_chunk_to_output(chunk: str):
nonlocal current_segment_word_count
chunk_wc = count_words(chunk)
if maximum_word_count_per_file is not None:
# If current segment is not empty, and adding this chunk would exceed limit,
# finalize the current segment before adding this new chunk.
if current_segment_builder and \
(current_segment_word_count + chunk_wc > maximum_word_count_per_file):
_finalize_current_segment()
current_segment_builder.append(chunk)
current_segment_word_count += chunk_wc
# This logic ensures that if a single chunk itself is larger than the limit,
# it forms its own segment. The next call to _add_chunk_to_output
# or the final _finalize_current_segment will commit it.
_add_chunk_to_output('<repo-to-text>\n')
_add_chunk_to_output(f'Directory: {project_name}\n\n')
_add_chunk_to_output('Directory Structure:\n')
_add_chunk_to_output('<directory_structure>\n.\n')
if os.path.exists(os.path.join(path, '.gitignore')):
output_content.append('├── .gitignore\n')
_add_chunk_to_output('├── .gitignore\n')
output_content.append(tree_structure + '\n' + '</directory_structure>\n')
logging.debug('Tree structure written to output content')
_add_chunk_to_output(tree_structure + '\n' + '</directory_structure>\n')
logging.debug('Tree structure added to output content segment builder')
for root, _, files in os.walk(path):
for filename in files:
@ -280,45 +373,47 @@ def generate_output_content(
):
continue
relative_path = relative_path.replace('./', '', 1)
cleaned_relative_path = relative_path.replace('./', '', 1)
_add_chunk_to_output(f'\n<content full_path="{cleaned_relative_path}">\n')
try:
# Try to open as text first
with open(file_path, 'r', encoding='utf-8') as f:
file_content = f.read()
output_content.append(f'\n<content full_path="{relative_path}">\n')
output_content.append(file_content)
output_content.append('\n</content>\n')
_add_chunk_to_output(file_content)
except UnicodeDecodeError:
# Handle binary files with the same content tag format
logging.debug('Handling binary file contents: %s', file_path)
with open(file_path, 'rb') as f:
binary_content = f.read()
output_content.append(f'\n<content full_path="{relative_path}">\n')
output_content.append(binary_content.decode('latin1'))
output_content.append('\n</content>\n')
with open(file_path, 'rb') as f_bin:
binary_content: bytes = f_bin.read()
_add_chunk_to_output(binary_content.decode('latin1')) # Add decoded binary
_add_chunk_to_output('\n</content>\n')
# Add XML closing tag
output_content.append('\n</repo-to-text>\n')
_add_chunk_to_output('\n</repo-to-text>\n')
logging.debug('Repository contents written to output content')
_finalize_current_segment() # Finalize any remaining content in the builder
return ''.join(output_content)
logging.debug(f'Repository contents generated into {len(output_segments)} segment(s)')
# Ensure at least one segment is returned, even if it's just the empty repo structure
if not output_segments and not current_segment_builder : # Should not happen if header/footer always added
# This case implies an empty repo and an extremely small word limit that split even the minimal tags.
# Or, if all content was filtered out.
# Return a minimal valid structure if everything else resulted in empty.
# However, the _add_chunk_to_output for repo tags should ensure current_segment_builder is not empty.
# And _finalize_current_segment ensures output_segments gets it.
# If output_segments is truly empty, it means an error or unexpected state.
# For safety, if it's empty, return a list with one empty string or minimal tags.
# Given the logic, this path is unlikely.
logging.warning("No output segments were generated. Returning a single empty segment.")
return ["<repo-to-text>\n</repo-to-text>\n"]
def write_output_to_file(output_content: str, output_dir: Optional[str]) -> str:
"""Write the output content to a file."""
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-%M-%S-UTC')
output_file = f'repo-to-text_{timestamp}.txt'
if output_dir:
if not os.path.exists(output_dir):
os.makedirs(output_dir)
output_file = os.path.join(output_dir, output_file)
return output_segments
with open(output_file, 'w', encoding='utf-8') as file:
file.write(output_content)
return output_file
# The original write_output_to_file function is no longer needed as its logic
# is incorporated into save_repo_to_text for handling single/multiple files.
def copy_to_clipboard(output_content: str) -> None:
"""Copy the output content to the clipboard if possible."""