mirror of
https://github.com/kirill-markin/repo-to-text.git
synced 2025-12-05 19:12:24 -08:00
add support for splitted text by maximum word count
This commit is contained in:
parent
9431ff9d07
commit
e066b481af
6 changed files with 516 additions and 60 deletions
|
|
@ -18,3 +18,8 @@ ignore-content:
|
||||||
- "README.md"
|
- "README.md"
|
||||||
- "LICENSE"
|
- "LICENSE"
|
||||||
- "tests/"
|
- "tests/"
|
||||||
|
|
||||||
|
# Optional: Maximum number of words per output file before splitting.
|
||||||
|
# If not specified or null, no splitting based on word count will occur.
|
||||||
|
# Must be a positive integer if set.
|
||||||
|
# maximum_word_count_per_file: 10000
|
||||||
|
|
|
||||||
|
|
@ -205,6 +205,13 @@ You can copy this file from the [existing example in the project](https://github
|
||||||
- **ignore-content**: Ignore files and directories only for the contents sections.
|
- **ignore-content**: Ignore files and directories only for the contents sections.
|
||||||
|
|
||||||
Using these settings, you can control which files and directories are included or excluded from the final text file.
|
Using these settings, you can control which files and directories are included or excluded from the final text file.
|
||||||
|
- **maximum_word_count_per_file**: Optional integer. Sets a maximum word count for each output file. If the total content exceeds this limit, the output will be split into multiple files. The split files will be named using the convention `output_filename_part_N.txt`, where `N` is the part number.
|
||||||
|
Example:
|
||||||
|
```yaml
|
||||||
|
# Optional: Maximum word count per output file.
|
||||||
|
# If set, the output will be split into multiple files if the total word count exceeds this.
|
||||||
|
# maximum_word_count_per_file: 10000
|
||||||
|
```
|
||||||
|
|
||||||
### Wildcards and Inclusions
|
### Wildcards and Inclusions
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ authors = [
|
||||||
]
|
]
|
||||||
description = "Convert a directory structure and its contents into a single text file, including the tree output and file contents in structured XML format. It may be useful to chat with LLM about your code."
|
description = "Convert a directory structure and its contents into a single text file, including the tree output and file contents in structured XML format. It may be useful to chat with LLM about your code."
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.6"
|
requires-python = ">=3.8"
|
||||||
license = { text = "MIT" }
|
license = { text = "MIT" }
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,11 @@ def create_default_settings_file() -> None:
|
||||||
- "README.md"
|
- "README.md"
|
||||||
- "LICENSE"
|
- "LICENSE"
|
||||||
- "package-lock.json"
|
- "package-lock.json"
|
||||||
|
|
||||||
|
# Optional: Maximum number of words per output file before splitting.
|
||||||
|
# If not specified or null, no splitting based on word count will occur.
|
||||||
|
# Must be a positive integer if set.
|
||||||
|
# maximum_word_count_per_file: 10000
|
||||||
""")
|
""")
|
||||||
with open('.repo-to-text-settings.yaml', 'w', encoding='utf-8') as f:
|
with open('.repo-to-text-settings.yaml', 'w', encoding='utf-8') as f:
|
||||||
f.write(default_settings)
|
f.write(default_settings)
|
||||||
|
|
|
||||||
|
|
@ -4,11 +4,11 @@ Core functionality for repo-to-text
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
from typing import Tuple, Optional, List, Dict, Any, Set
|
from typing import Tuple, Optional, List, Dict, Any, Set, IO
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from importlib.machinery import ModuleSpec
|
from importlib.machinery import ModuleSpec
|
||||||
import logging
|
import logging
|
||||||
import yaml
|
import yaml # type: ignore
|
||||||
import pathspec
|
import pathspec
|
||||||
from pathspec import PathSpec
|
from pathspec import PathSpec
|
||||||
|
|
||||||
|
|
@ -128,12 +128,12 @@ def load_ignore_specs(
|
||||||
|
|
||||||
repo_settings_path = os.path.join(path, '.repo-to-text-settings.yaml')
|
repo_settings_path = os.path.join(path, '.repo-to-text-settings.yaml')
|
||||||
if os.path.exists(repo_settings_path):
|
if os.path.exists(repo_settings_path):
|
||||||
logging.debug('Loading .repo-to-text-settings.yaml from path: %s', repo_settings_path)
|
logging.debug('Loading .repo-to-text-settings.yaml for ignore specs from path: %s', repo_settings_path)
|
||||||
with open(repo_settings_path, 'r', encoding='utf-8') as f:
|
with open(repo_settings_path, 'r', encoding='utf-8') as f:
|
||||||
settings: Dict[str, Any] = yaml.safe_load(f)
|
settings: Dict[str, Any] = yaml.safe_load(f)
|
||||||
use_gitignore = settings.get('gitignore-import-and-ignore', True)
|
use_gitignore = settings.get('gitignore-import-and-ignore', True)
|
||||||
if 'ignore-content' in settings:
|
if 'ignore-content' in settings:
|
||||||
content_ignore_spec: Optional[PathSpec] = pathspec.PathSpec.from_lines(
|
content_ignore_spec = pathspec.PathSpec.from_lines(
|
||||||
'gitwildmatch', settings['ignore-content']
|
'gitwildmatch', settings['ignore-content']
|
||||||
)
|
)
|
||||||
if 'ignore-tree-and-content' in settings:
|
if 'ignore-tree-and-content' in settings:
|
||||||
|
|
@ -154,6 +154,27 @@ def load_ignore_specs(
|
||||||
)
|
)
|
||||||
return gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec
|
return gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec
|
||||||
|
|
||||||
|
def load_additional_specs(path: str = '.') -> Dict[str, Any]:
|
||||||
|
"""Load additional specifications from the settings file."""
|
||||||
|
additional_specs: Dict[str, Any] = {
|
||||||
|
'maximum_word_count_per_file': None
|
||||||
|
}
|
||||||
|
repo_settings_path = os.path.join(path, '.repo-to-text-settings.yaml')
|
||||||
|
if os.path.exists(repo_settings_path):
|
||||||
|
logging.debug('Loading .repo-to-text-settings.yaml for additional specs from path: %s', repo_settings_path)
|
||||||
|
with open(repo_settings_path, 'r', encoding='utf-8') as f:
|
||||||
|
settings: Dict[str, Any] = yaml.safe_load(f)
|
||||||
|
if 'maximum_word_count_per_file' in settings:
|
||||||
|
max_words = settings['maximum_word_count_per_file']
|
||||||
|
if isinstance(max_words, int) and max_words > 0:
|
||||||
|
additional_specs['maximum_word_count_per_file'] = max_words
|
||||||
|
elif max_words is not None: # Allow null/None to mean "not set"
|
||||||
|
logging.warning(
|
||||||
|
"Invalid value for 'maximum_word_count_per_file': %s. "
|
||||||
|
"It must be a positive integer or null. Ignoring.", max_words
|
||||||
|
)
|
||||||
|
return additional_specs
|
||||||
|
|
||||||
def should_ignore_file(
|
def should_ignore_file(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
relative_path: str,
|
relative_path: str,
|
||||||
|
|
@ -210,61 +231,133 @@ def save_repo_to_text(
|
||||||
to_stdout: bool = False,
|
to_stdout: bool = False,
|
||||||
cli_ignore_patterns: Optional[List[str]] = None
|
cli_ignore_patterns: Optional[List[str]] = None
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Save repository structure and contents to a text file."""
|
"""Save repository structure and contents to a text file or multiple files."""
|
||||||
logging.debug('Starting to save repo structure to text for path: %s', path)
|
logging.debug('Starting to save repo structure to text for path: %s', path)
|
||||||
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(
|
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(
|
||||||
path, cli_ignore_patterns
|
path, cli_ignore_patterns
|
||||||
)
|
)
|
||||||
|
additional_specs = load_additional_specs(path)
|
||||||
|
maximum_word_count_per_file = additional_specs.get('maximum_word_count_per_file')
|
||||||
|
|
||||||
tree_structure: str = get_tree_structure(
|
tree_structure: str = get_tree_structure(
|
||||||
path, gitignore_spec, tree_and_content_ignore_spec
|
path, gitignore_spec, tree_and_content_ignore_spec
|
||||||
)
|
)
|
||||||
logging.debug('Final tree structure to be written: %s', tree_structure)
|
logging.debug('Final tree structure to be written: %s', tree_structure)
|
||||||
|
|
||||||
output_content = generate_output_content(
|
output_content_segments = generate_output_content(
|
||||||
path,
|
path,
|
||||||
tree_structure,
|
tree_structure,
|
||||||
gitignore_spec,
|
gitignore_spec,
|
||||||
content_ignore_spec,
|
content_ignore_spec,
|
||||||
tree_and_content_ignore_spec
|
tree_and_content_ignore_spec,
|
||||||
|
maximum_word_count_per_file
|
||||||
)
|
)
|
||||||
|
|
||||||
if to_stdout:
|
if to_stdout:
|
||||||
print(output_content)
|
for segment in output_content_segments:
|
||||||
return output_content
|
print(segment, end='') # Avoid double newlines if segments naturally end with one
|
||||||
|
# Return joined content for consistency, though primarily printed
|
||||||
|
return "".join(output_content_segments)
|
||||||
|
|
||||||
output_file = write_output_to_file(output_content, output_dir)
|
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-%M-%S-UTC')
|
||||||
copy_to_clipboard(output_content)
|
base_output_name_stem = f'repo-to-text_{timestamp}'
|
||||||
|
|
||||||
|
output_filepaths: List[str] = []
|
||||||
|
|
||||||
|
if not output_content_segments:
|
||||||
|
logging.warning("generate_output_content returned no segments. No output file will be created.")
|
||||||
|
return "" # Or handle by creating an empty placeholder file
|
||||||
|
|
||||||
|
if len(output_content_segments) == 1:
|
||||||
|
single_filename = f"{base_output_name_stem}.txt"
|
||||||
|
full_path_single_file = os.path.join(output_dir, single_filename) if output_dir else single_filename
|
||||||
|
|
||||||
|
if output_dir and not os.path.exists(output_dir):
|
||||||
|
os.makedirs(output_dir)
|
||||||
|
|
||||||
|
with open(full_path_single_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(output_content_segments[0])
|
||||||
|
output_filepaths.append(full_path_single_file)
|
||||||
|
copy_to_clipboard(output_content_segments[0])
|
||||||
print(
|
print(
|
||||||
"[SUCCESS] Repository structure and contents successfully saved to "
|
"[SUCCESS] Repository structure and contents successfully saved to "
|
||||||
f"file: \"./{output_file}\""
|
f"file: \"{os.path.relpath(full_path_single_file)}\"" # Use relpath for cleaner output
|
||||||
)
|
)
|
||||||
|
else: # Multiple segments
|
||||||
|
if output_dir and not os.path.exists(output_dir):
|
||||||
|
os.makedirs(output_dir) # Create output_dir once if needed
|
||||||
|
|
||||||
|
for i, segment_content in enumerate(output_content_segments):
|
||||||
|
part_filename = f"{base_output_name_stem}_part_{i+1}.txt"
|
||||||
|
full_path_part_file = os.path.join(output_dir, part_filename) if output_dir else part_filename
|
||||||
|
|
||||||
|
with open(full_path_part_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(segment_content)
|
||||||
|
output_filepaths.append(full_path_part_file)
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"[SUCCESS] Repository structure and contents successfully saved to {len(output_filepaths)} files:"
|
||||||
|
)
|
||||||
|
for fp in output_filepaths:
|
||||||
|
print(f" - \"{os.path.relpath(fp)}\"") # Use relpath for cleaner output
|
||||||
|
|
||||||
|
return os.path.relpath(output_filepaths[0]) if output_filepaths else ""
|
||||||
|
|
||||||
return output_file
|
|
||||||
|
|
||||||
def generate_output_content(
|
def generate_output_content(
|
||||||
path: str,
|
path: str,
|
||||||
tree_structure: str,
|
tree_structure: str,
|
||||||
gitignore_spec: Optional[PathSpec],
|
gitignore_spec: Optional[PathSpec],
|
||||||
content_ignore_spec: Optional[PathSpec],
|
content_ignore_spec: Optional[PathSpec],
|
||||||
tree_and_content_ignore_spec: Optional[PathSpec]
|
tree_and_content_ignore_spec: Optional[PathSpec],
|
||||||
) -> str:
|
maximum_word_count_per_file: Optional[int] = None
|
||||||
"""Generate the output content for the repository."""
|
) -> List[str]:
|
||||||
output_content: List[str] = []
|
"""Generate the output content for the repository, potentially split into segments."""
|
||||||
|
# pylint: disable=too-many-arguments
|
||||||
|
# pylint: disable=too-many-locals
|
||||||
|
output_segments: List[str] = []
|
||||||
|
current_segment_builder: List[str] = []
|
||||||
|
current_segment_word_count: int = 0
|
||||||
project_name = os.path.basename(os.path.abspath(path))
|
project_name = os.path.basename(os.path.abspath(path))
|
||||||
|
|
||||||
# Add XML opening tag
|
def count_words(text: str) -> int:
|
||||||
output_content.append('<repo-to-text>\n')
|
return len(text.split())
|
||||||
|
|
||||||
output_content.append(f'Directory: {project_name}\n\n')
|
def _finalize_current_segment():
|
||||||
output_content.append('Directory Structure:\n')
|
nonlocal current_segment_word_count # Allow modification
|
||||||
output_content.append('<directory_structure>\n.\n')
|
if current_segment_builder:
|
||||||
|
output_segments.append("".join(current_segment_builder))
|
||||||
|
current_segment_builder.clear()
|
||||||
|
current_segment_word_count = 0
|
||||||
|
|
||||||
|
def _add_chunk_to_output(chunk: str):
|
||||||
|
nonlocal current_segment_word_count
|
||||||
|
chunk_wc = count_words(chunk)
|
||||||
|
|
||||||
|
if maximum_word_count_per_file is not None:
|
||||||
|
# If current segment is not empty, and adding this chunk would exceed limit,
|
||||||
|
# finalize the current segment before adding this new chunk.
|
||||||
|
if current_segment_builder and \
|
||||||
|
(current_segment_word_count + chunk_wc > maximum_word_count_per_file):
|
||||||
|
_finalize_current_segment()
|
||||||
|
|
||||||
|
current_segment_builder.append(chunk)
|
||||||
|
current_segment_word_count += chunk_wc
|
||||||
|
|
||||||
|
# This logic ensures that if a single chunk itself is larger than the limit,
|
||||||
|
# it forms its own segment. The next call to _add_chunk_to_output
|
||||||
|
# or the final _finalize_current_segment will commit it.
|
||||||
|
|
||||||
|
_add_chunk_to_output('<repo-to-text>\n')
|
||||||
|
_add_chunk_to_output(f'Directory: {project_name}\n\n')
|
||||||
|
_add_chunk_to_output('Directory Structure:\n')
|
||||||
|
_add_chunk_to_output('<directory_structure>\n.\n')
|
||||||
|
|
||||||
if os.path.exists(os.path.join(path, '.gitignore')):
|
if os.path.exists(os.path.join(path, '.gitignore')):
|
||||||
output_content.append('├── .gitignore\n')
|
_add_chunk_to_output('├── .gitignore\n')
|
||||||
|
|
||||||
output_content.append(tree_structure + '\n' + '</directory_structure>\n')
|
_add_chunk_to_output(tree_structure + '\n' + '</directory_structure>\n')
|
||||||
logging.debug('Tree structure written to output content')
|
logging.debug('Tree structure added to output content segment builder')
|
||||||
|
|
||||||
for root, _, files in os.walk(path):
|
for root, _, files in os.walk(path):
|
||||||
for filename in files:
|
for filename in files:
|
||||||
|
|
@ -280,45 +373,47 @@ def generate_output_content(
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
relative_path = relative_path.replace('./', '', 1)
|
cleaned_relative_path = relative_path.replace('./', '', 1)
|
||||||
|
|
||||||
|
_add_chunk_to_output(f'\n<content full_path="{cleaned_relative_path}">\n')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Try to open as text first
|
|
||||||
with open(file_path, 'r', encoding='utf-8') as f:
|
with open(file_path, 'r', encoding='utf-8') as f:
|
||||||
file_content = f.read()
|
file_content = f.read()
|
||||||
output_content.append(f'\n<content full_path="{relative_path}">\n')
|
_add_chunk_to_output(file_content)
|
||||||
output_content.append(file_content)
|
|
||||||
output_content.append('\n</content>\n')
|
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
# Handle binary files with the same content tag format
|
|
||||||
logging.debug('Handling binary file contents: %s', file_path)
|
logging.debug('Handling binary file contents: %s', file_path)
|
||||||
with open(file_path, 'rb') as f:
|
with open(file_path, 'rb') as f_bin:
|
||||||
binary_content = f.read()
|
binary_content: bytes = f_bin.read()
|
||||||
output_content.append(f'\n<content full_path="{relative_path}">\n')
|
_add_chunk_to_output(binary_content.decode('latin1')) # Add decoded binary
|
||||||
output_content.append(binary_content.decode('latin1'))
|
|
||||||
output_content.append('\n</content>\n')
|
|
||||||
|
|
||||||
# Add XML closing tag
|
_add_chunk_to_output('\n</content>\n')
|
||||||
output_content.append('\n</repo-to-text>\n')
|
|
||||||
|
|
||||||
logging.debug('Repository contents written to output content')
|
_add_chunk_to_output('\n</repo-to-text>\n')
|
||||||
|
|
||||||
return ''.join(output_content)
|
_finalize_current_segment() # Finalize any remaining content in the builder
|
||||||
|
|
||||||
def write_output_to_file(output_content: str, output_dir: Optional[str]) -> str:
|
logging.debug(f'Repository contents generated into {len(output_segments)} segment(s)')
|
||||||
"""Write the output content to a file."""
|
|
||||||
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-%M-%S-UTC')
|
|
||||||
output_file = f'repo-to-text_{timestamp}.txt'
|
|
||||||
|
|
||||||
if output_dir:
|
# Ensure at least one segment is returned, even if it's just the empty repo structure
|
||||||
if not os.path.exists(output_dir):
|
if not output_segments and not current_segment_builder : # Should not happen if header/footer always added
|
||||||
os.makedirs(output_dir)
|
# This case implies an empty repo and an extremely small word limit that split even the minimal tags.
|
||||||
output_file = os.path.join(output_dir, output_file)
|
# Or, if all content was filtered out.
|
||||||
|
# Return a minimal valid structure if everything else resulted in empty.
|
||||||
|
# However, the _add_chunk_to_output for repo tags should ensure current_segment_builder is not empty.
|
||||||
|
# And _finalize_current_segment ensures output_segments gets it.
|
||||||
|
# If output_segments is truly empty, it means an error or unexpected state.
|
||||||
|
# For safety, if it's empty, return a list with one empty string or minimal tags.
|
||||||
|
# Given the logic, this path is unlikely.
|
||||||
|
logging.warning("No output segments were generated. Returning a single empty segment.")
|
||||||
|
return ["<repo-to-text>\n</repo-to-text>\n"]
|
||||||
|
|
||||||
with open(output_file, 'w', encoding='utf-8') as file:
|
|
||||||
file.write(output_content)
|
|
||||||
|
|
||||||
return output_file
|
return output_segments
|
||||||
|
|
||||||
|
|
||||||
|
# The original write_output_to_file function is no longer needed as its logic
|
||||||
|
# is incorporated into save_repo_to_text for handling single/multiple files.
|
||||||
|
|
||||||
def copy_to_clipboard(output_content: str) -> None:
|
def copy_to_clipboard(output_content: str) -> None:
|
||||||
"""Copy the output content to the clipboard if possible."""
|
"""Copy the output content to the clipboard if possible."""
|
||||||
|
|
|
||||||
|
|
@ -3,15 +3,20 @@
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
from typing import Generator
|
from typing import Generator, IO
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from unittest.mock import patch, mock_open, MagicMock
|
||||||
|
import yaml # For creating mock settings files easily
|
||||||
|
|
||||||
from repo_to_text.core.core import (
|
from repo_to_text.core.core import (
|
||||||
get_tree_structure,
|
get_tree_structure,
|
||||||
load_ignore_specs,
|
load_ignore_specs,
|
||||||
should_ignore_file,
|
should_ignore_file,
|
||||||
is_ignored_path,
|
is_ignored_path,
|
||||||
save_repo_to_text
|
save_repo_to_text,
|
||||||
|
load_additional_specs,
|
||||||
|
generate_output_content
|
||||||
)
|
)
|
||||||
|
|
||||||
# pylint: disable=redefined-outer-name
|
# pylint: disable=redefined-outer-name
|
||||||
|
|
@ -60,6 +65,26 @@ ignore-content:
|
||||||
|
|
||||||
return tmp_path_str
|
return tmp_path_str
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def simple_word_count_repo(tmp_path: str) -> str:
|
||||||
|
"""Create a simple repository for word count testing."""
|
||||||
|
repo_path = str(tmp_path)
|
||||||
|
files_content = {
|
||||||
|
"file1.txt": "This is file one. It has eight words.", # 8 words
|
||||||
|
"file2.txt": "File two is here. This makes six words.", # 6 words
|
||||||
|
"subdir/file3.txt": "Another file in a subdirectory, with ten words exactly." # 10 words
|
||||||
|
}
|
||||||
|
for file_path, content in files_content.items():
|
||||||
|
full_path = os.path.join(repo_path, file_path)
|
||||||
|
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
||||||
|
with open(full_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(content)
|
||||||
|
return repo_path
|
||||||
|
|
||||||
|
def count_words_for_test(text: str) -> int:
|
||||||
|
"""Helper to count words consistently with core logic for tests."""
|
||||||
|
return len(text.split())
|
||||||
|
|
||||||
def test_is_ignored_path() -> None:
|
def test_is_ignored_path() -> None:
|
||||||
"""Test the is_ignored_path function."""
|
"""Test the is_ignored_path function."""
|
||||||
assert is_ignored_path(".git/config") is True
|
assert is_ignored_path(".git/config") is True
|
||||||
|
|
@ -302,5 +327,324 @@ def test_empty_dirs_filtering(tmp_path: str) -> None:
|
||||||
# Check that no line contains 'empty_dir'
|
# Check that no line contains 'empty_dir'
|
||||||
assert "empty_dir" not in line, f"Found empty_dir in line: {line}"
|
assert "empty_dir" not in line, f"Found empty_dir in line: {line}"
|
||||||
|
|
||||||
|
# Tests for maximum_word_count_per_file functionality
|
||||||
|
|
||||||
|
def test_load_additional_specs_valid_max_words(tmp_path: str) -> None:
|
||||||
|
"""Test load_additional_specs with a valid maximum_word_count_per_file."""
|
||||||
|
settings_content = {"maximum_word_count_per_file": 1000}
|
||||||
|
settings_file = os.path.join(tmp_path, ".repo-to-text-settings.yaml")
|
||||||
|
with open(settings_file, "w", encoding="utf-8") as f:
|
||||||
|
yaml.dump(settings_content, f)
|
||||||
|
|
||||||
|
specs = load_additional_specs(tmp_path)
|
||||||
|
assert specs["maximum_word_count_per_file"] == 1000
|
||||||
|
|
||||||
|
def test_load_additional_specs_invalid_max_words_string(tmp_path: str, caplog) -> None:
|
||||||
|
"""Test load_additional_specs with an invalid string for maximum_word_count_per_file."""
|
||||||
|
settings_content = {"maximum_word_count_per_file": "not-an-integer"}
|
||||||
|
settings_file = os.path.join(tmp_path, ".repo-to-text-settings.yaml")
|
||||||
|
with open(settings_file, "w", encoding="utf-8") as f:
|
||||||
|
yaml.dump(settings_content, f)
|
||||||
|
|
||||||
|
specs = load_additional_specs(tmp_path)
|
||||||
|
assert specs["maximum_word_count_per_file"] is None
|
||||||
|
assert "Invalid value for 'maximum_word_count_per_file': not-an-integer" in caplog.text
|
||||||
|
|
||||||
|
def test_load_additional_specs_invalid_max_words_negative(tmp_path: str, caplog) -> None:
|
||||||
|
"""Test load_additional_specs with a negative integer for maximum_word_count_per_file."""
|
||||||
|
settings_content = {"maximum_word_count_per_file": -100}
|
||||||
|
settings_file = os.path.join(tmp_path, ".repo-to-text-settings.yaml")
|
||||||
|
with open(settings_file, "w", encoding="utf-8") as f:
|
||||||
|
yaml.dump(settings_content, f)
|
||||||
|
|
||||||
|
specs = load_additional_specs(tmp_path)
|
||||||
|
assert specs["maximum_word_count_per_file"] is None
|
||||||
|
assert "Invalid value for 'maximum_word_count_per_file': -100" in caplog.text
|
||||||
|
|
||||||
|
def test_load_additional_specs_max_words_is_none_in_yaml(tmp_path: str, caplog) -> None:
|
||||||
|
"""Test load_additional_specs when maximum_word_count_per_file is explicitly null in YAML."""
|
||||||
|
settings_content = {"maximum_word_count_per_file": None} # In YAML, this is 'null'
|
||||||
|
settings_file = os.path.join(tmp_path, ".repo-to-text-settings.yaml")
|
||||||
|
with open(settings_file, "w", encoding="utf-8") as f:
|
||||||
|
yaml.dump(settings_content, f)
|
||||||
|
|
||||||
|
specs = load_additional_specs(tmp_path)
|
||||||
|
assert specs["maximum_word_count_per_file"] is None
|
||||||
|
assert "Invalid value for 'maximum_word_count_per_file'" not in caplog.text
|
||||||
|
|
||||||
|
def test_load_additional_specs_max_words_not_present(tmp_path: str) -> None:
|
||||||
|
"""Test load_additional_specs when maximum_word_count_per_file is not present."""
|
||||||
|
settings_content = {"other_setting": "value"}
|
||||||
|
settings_file = os.path.join(tmp_path, ".repo-to-text-settings.yaml")
|
||||||
|
with open(settings_file, "w", encoding="utf-8") as f:
|
||||||
|
yaml.dump(settings_content, f)
|
||||||
|
|
||||||
|
specs = load_additional_specs(tmp_path)
|
||||||
|
assert specs["maximum_word_count_per_file"] is None
|
||||||
|
|
||||||
|
def test_load_additional_specs_no_settings_file(tmp_path: str) -> None:
|
||||||
|
"""Test load_additional_specs when no settings file exists."""
|
||||||
|
specs = load_additional_specs(tmp_path)
|
||||||
|
assert specs["maximum_word_count_per_file"] is None
|
||||||
|
|
||||||
|
# Tests for generate_output_content related to splitting
|
||||||
|
def test_generate_output_content_no_splitting_max_words_not_set(simple_word_count_repo: str) -> None:
|
||||||
|
"""Test generate_output_content with no splitting when max_words is not set."""
|
||||||
|
path = simple_word_count_repo
|
||||||
|
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(path)
|
||||||
|
tree_structure = get_tree_structure(path, gitignore_spec, tree_and_content_ignore_spec)
|
||||||
|
|
||||||
|
segments = generate_output_content(
|
||||||
|
path, tree_structure, gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec,
|
||||||
|
maximum_word_count_per_file=None
|
||||||
|
)
|
||||||
|
assert len(segments) == 1
|
||||||
|
assert "file1.txt" in segments[0]
|
||||||
|
assert "This is file one." in segments[0]
|
||||||
|
|
||||||
|
def test_generate_output_content_no_splitting_content_less_than_limit(simple_word_count_repo: str) -> None:
|
||||||
|
"""Test generate_output_content with no splitting when content is less than max_words limit."""
|
||||||
|
path = simple_word_count_repo
|
||||||
|
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(path)
|
||||||
|
tree_structure = get_tree_structure(path, gitignore_spec, tree_and_content_ignore_spec)
|
||||||
|
|
||||||
|
segments = generate_output_content(
|
||||||
|
path, tree_structure, gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec,
|
||||||
|
maximum_word_count_per_file=500 # High limit
|
||||||
|
)
|
||||||
|
assert len(segments) == 1
|
||||||
|
assert "file1.txt" in segments[0]
|
||||||
|
|
||||||
|
def test_generate_output_content_splitting_occurs(simple_word_count_repo: str) -> None:
|
||||||
|
"""Test generate_output_content when splitting occurs due to max_words limit."""
|
||||||
|
path = simple_word_count_repo
|
||||||
|
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(path)
|
||||||
|
tree_structure = get_tree_structure(path, gitignore_spec, tree_and_content_ignore_spec)
|
||||||
|
max_words = 30
|
||||||
|
segments = generate_output_content(
|
||||||
|
path, tree_structure, gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec,
|
||||||
|
maximum_word_count_per_file=max_words
|
||||||
|
)
|
||||||
|
assert len(segments) > 1
|
||||||
|
total_content = "".join(segments)
|
||||||
|
assert "file1.txt" in total_content
|
||||||
|
assert "This is file one." in total_content
|
||||||
|
for i, segment in enumerate(segments):
|
||||||
|
segment_word_count = count_words_for_test(segment)
|
||||||
|
if i < len(segments) - 1: # For all but the last segment
|
||||||
|
# A segment can be larger than max_words if a single chunk (e.g. file content block) is larger
|
||||||
|
assert segment_word_count <= max_words or \
|
||||||
|
(segment_word_count > max_words and count_words_for_test(segment.splitlines()[-2]) > max_words)
|
||||||
|
else: # Last segment can be smaller
|
||||||
|
assert segment_word_count > 0
|
||||||
|
|
||||||
|
def test_generate_output_content_splitting_very_small_limit(simple_word_count_repo: str) -> None:
|
||||||
|
"""Test generate_output_content with a very small max_words limit."""
|
||||||
|
path = simple_word_count_repo
|
||||||
|
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(path)
|
||||||
|
tree_structure = get_tree_structure(path, gitignore_spec, tree_and_content_ignore_spec)
|
||||||
|
max_words = 10 # Very small limit
|
||||||
|
segments = generate_output_content(
|
||||||
|
path, tree_structure, gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec,
|
||||||
|
maximum_word_count_per_file=max_words
|
||||||
|
)
|
||||||
|
assert len(segments) > 3 # Expect multiple splits
|
||||||
|
total_content = "".join(segments)
|
||||||
|
assert "file1.txt" in total_content
|
||||||
|
# Check if file content (which is a chunk) forms its own segment if it's > max_words
|
||||||
|
found_file1_content_chunk = False
|
||||||
|
expected_file1_chunk = "<content full_path=\"file1.txt\">\nThis is file one. It has eight words.\n</content>"
|
||||||
|
for segment in segments:
|
||||||
|
if expected_file1_chunk.strip() in segment.strip(): # Check for the core content
|
||||||
|
# This segment should contain the file1.txt content and its tags
|
||||||
|
# The chunk itself is ~13 words. If max_words is 10, this chunk will be its own segment.
|
||||||
|
assert count_words_for_test(segment) == count_words_for_test(expected_file1_chunk)
|
||||||
|
assert count_words_for_test(segment) > max_words
|
||||||
|
found_file1_content_chunk = True
|
||||||
|
break
|
||||||
|
assert found_file1_content_chunk
|
||||||
|
|
||||||
|
def test_generate_output_content_file_header_content_together(tmp_path: str) -> None:
|
||||||
|
"""Test that file header and its content are not split if word count allows."""
|
||||||
|
repo_path = str(tmp_path)
|
||||||
|
file_content_str = "word " * 15 # 15 words
|
||||||
|
# Tags: <content full_path="single_file.txt">\n (3) + \n</content> (2) = 5 words. Total block = 20 words.
|
||||||
|
files_content = {"single_file.txt": file_content_str.strip()}
|
||||||
|
for file_path_key, content_val in files_content.items():
|
||||||
|
full_path = os.path.join(repo_path, file_path_key)
|
||||||
|
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
||||||
|
with open(full_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(content_val)
|
||||||
|
|
||||||
|
gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec = load_ignore_specs(repo_path)
|
||||||
|
tree_structure = get_tree_structure(repo_path, gitignore_spec, tree_and_content_ignore_spec)
|
||||||
|
|
||||||
|
max_words_sufficient = 35 # Enough for header + this one file block (around 20 words + initial header)
|
||||||
|
segments = generate_output_content(
|
||||||
|
repo_path, tree_structure, gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec,
|
||||||
|
maximum_word_count_per_file=max_words_sufficient
|
||||||
|
)
|
||||||
|
assert len(segments) == 1 # Expect no splitting of this file from its tags
|
||||||
|
expected_file_block = f'<content full_path="single_file.txt">\n{file_content_str.strip()}\n</content>'
|
||||||
|
assert expected_file_block in segments[0]
|
||||||
|
|
||||||
|
# Test if it splits if max_words is too small for the file block (20 words)
|
||||||
|
max_words_small = 10
|
||||||
|
segments_small_limit = generate_output_content(
|
||||||
|
repo_path, tree_structure, gitignore_spec, content_ignore_spec, tree_and_content_ignore_spec,
|
||||||
|
maximum_word_count_per_file=max_words_small
|
||||||
|
)
|
||||||
|
# The file block (20 words) is a single chunk. It will form its own segment.
|
||||||
|
# Header part will be one segment. File block another. Footer another.
|
||||||
|
assert len(segments_small_limit) >= 2
|
||||||
|
|
||||||
|
found_file_block_in_own_segment = False
|
||||||
|
for segment in segments_small_limit:
|
||||||
|
if expected_file_block in segment:
|
||||||
|
assert count_words_for_test(segment) == count_words_for_test(expected_file_block)
|
||||||
|
found_file_block_in_own_segment = True
|
||||||
|
break
|
||||||
|
assert found_file_block_in_own_segment
|
||||||
|
|
||||||
|
# Tests for save_repo_to_text related to splitting
|
||||||
|
@patch('repo_to_text.core.core.load_additional_specs')
@patch('repo_to_text.core.core.generate_output_content')
@patch('repo_to_text.core.core.os.makedirs')
@patch('builtins.open', new_callable=mock_open)
@patch('repo_to_text.core.core.pyperclip.copy')
def test_save_repo_to_text_no_splitting_mocked(
    mock_pyperclip_copy: MagicMock,
    mock_file_open: MagicMock,  # mock_open instance patched over builtins.open
    mock_makedirs: MagicMock,
    mock_generate_output: MagicMock,
    mock_load_specs: MagicMock,
    simple_word_count_repo: str,
    tmp_path: str
) -> None:
    """Test save_repo_to_text: no splitting, single file output."""
    combined_content = "Single combined content\nfile1.txt\ncontent1"
    mock_load_specs.return_value = {'maximum_word_count_per_file': None}
    mock_generate_output.return_value = [combined_content]
    output_dir = os.path.join(str(tmp_path), "output")

    with patch('repo_to_text.core.core.datetime') as mock_datetime:
        mock_datetime.now.return_value.strftime.return_value = "mock_timestamp"
        returned_path = save_repo_to_text(simple_word_count_repo, output_dir=output_dir)

    # Spec loading and content generation each happen exactly once.
    mock_load_specs.assert_called_once_with(simple_word_count_repo)
    mock_generate_output.assert_called_once()  # Args are complex, basic check

    # The single segment lands in one timestamped file inside output_dir.
    expected_filename = os.path.join(output_dir, "repo-to-text_mock_timestamp.txt")
    assert returned_path == os.path.relpath(expected_filename)
    mock_makedirs.assert_called_once_with(output_dir)
    mock_file_open.assert_called_once_with(expected_filename, 'w', encoding='utf-8')
    mock_file_open().write.assert_called_once_with(combined_content)
    mock_pyperclip_copy.assert_called_once_with(combined_content)
||||||
|
@patch('repo_to_text.core.core.load_additional_specs')
@patch('repo_to_text.core.core.generate_output_content')
@patch('repo_to_text.core.core.os.makedirs')
@patch('builtins.open')  # Patch builtins.open to get the mock of the function
@patch('repo_to_text.core.core.pyperclip.copy')
def test_save_repo_to_text_splitting_occurs_mocked(
    mock_pyperclip_copy: MagicMock,
    mock_open_function: MagicMock,  # mock for the open() function itself
    mock_makedirs: MagicMock,
    mock_generate_output: MagicMock,
    mock_load_specs: MagicMock,
    simple_word_count_repo: str,
    tmp_path: str
) -> None:
    """Test save_repo_to_text: splitting occurs, multiple file outputs with better write check."""
    mock_load_specs.return_value = {'maximum_word_count_per_file': 50}
    segments = ["Segment 1 content data", "Segment 2 content data"]
    mock_generate_output.return_value = segments
    output_dir = os.path.join(str(tmp_path), "output_split_adv")

    # One mocked file handle per expected output file; open() hands them out
    # in order via side_effect when entered in a 'with' statement.
    handles = [MagicMock(spec=IO), MagicMock(spec=IO)]
    mock_open_function.side_effect = handles

    with patch('repo_to_text.core.core.datetime') as mock_datetime:
        mock_datetime.now.return_value.strftime.return_value = "mock_ts_split_adv"
        returned_path = save_repo_to_text(simple_word_count_repo, output_dir=output_dir)

    part_filenames = [
        os.path.join(output_dir, "repo-to-text_mock_ts_split_adv_part_1.txt"),
        os.path.join(output_dir, "repo-to-text_mock_ts_split_adv_part_2.txt"),
    ]

    # The returned path points at the first part.
    assert returned_path == os.path.relpath(part_filenames[0])
    mock_makedirs.assert_called_once_with(output_dir)

    # Exactly one open() per part, with the expected names and mode.
    for filename in part_filenames:
        mock_open_function.assert_any_call(filename, 'w', encoding='utf-8')
    assert mock_open_function.call_count == 2  # Exactly two calls for writing output

    # Each segment went to its own handle; __enter__() is called by 'with'.
    for handle, segment in zip(handles, segments):
        handle.__enter__().write.assert_called_once_with(segment)

    # Clipboard copy is skipped when the output is split across files.
    mock_pyperclip_copy.assert_not_called()
|
||||||
|
@patch('repo_to_text.core.core.load_additional_specs')
@patch('repo_to_text.core.core.generate_output_content')
@patch('repo_to_text.core.core.os.makedirs')
@patch('builtins.open', new_callable=mock_open)
@patch('repo_to_text.core.core.pyperclip.copy')
def test_save_repo_to_text_stdout_with_splitting(
    mock_pyperclip_copy: MagicMock,
    mock_file_open: MagicMock,
    mock_os_makedirs: MagicMock,
    mock_generate_output: MagicMock,
    mock_load_specs: MagicMock,
    simple_word_count_repo: str,
    capsys
) -> None:
    """Test save_repo_to_text with to_stdout=True and content that would split."""
    mock_load_specs.return_value = {'maximum_word_count_per_file': 10}  # Assume causes splitting
    mock_generate_output.return_value = ["Segment 1 for stdout.", "Segment 2 for stdout."]

    result_string = save_repo_to_text(simple_word_count_repo, to_stdout=True)

    mock_load_specs.assert_called_once_with(simple_word_count_repo)
    mock_generate_output.assert_called_once()

    # Nothing touches the filesystem or the clipboard in stdout mode.
    for untouched_mock in (mock_os_makedirs, mock_file_open, mock_pyperclip_copy):
        untouched_mock.assert_not_called()

    # core.py uses print(segment, end=''), so segments are joined directly.
    expected_output = "Segment 1 for stdout.Segment 2 for stdout."
    captured = capsys.readouterr()
    assert captured.out == expected_output
    assert result_string == expected_output
|
||||||
|
@patch('repo_to_text.core.core.load_additional_specs')
@patch('repo_to_text.core.core.generate_output_content')
@patch('repo_to_text.core.core.os.makedirs')
@patch('builtins.open', new_callable=mock_open)
@patch('repo_to_text.core.core.pyperclip.copy')
def test_save_repo_to_text_empty_segments(
    mock_pyperclip_copy: MagicMock,
    mock_file_open: MagicMock,
    mock_makedirs: MagicMock,
    mock_generate_output: MagicMock,
    mock_load_specs: MagicMock,
    simple_word_count_repo: str,
    tmp_path: str,
    caplog
) -> None:
    """Test save_repo_to_text when generate_output_content returns no segments."""
    mock_load_specs.return_value = {'maximum_word_count_per_file': None}
    mock_generate_output.return_value = []  # nothing to write
    target_dir = os.path.join(str(tmp_path), "output_empty")

    result = save_repo_to_text(simple_word_count_repo, output_dir=target_dir)

    # An empty result path signals that no output file was produced.
    assert result == ""
    # No directory creation, file writes, or clipboard activity should occur.
    mock_makedirs.assert_not_called()
    mock_file_open.assert_not_called()
    mock_pyperclip_copy.assert_not_called()
    # The empty-output situation is logged for the user.
    assert "generate_output_content returned no segments" in caplog.text
|
||||||
# Allow running this test module directly (outside the pytest CLI).
# NOTE: the original diff render duplicated both the guard line and the
# pytest.main call; a single guard + single call is the correct form.
if __name__ == "__main__":
    pytest.main([__file__])
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue