Files
dumpy/dumpy.py
2026-01-26 16:26:56 -06:00

161 lines
4.7 KiB
Python

from pathlib import Path
from rich import print as rprint
from rich.console import Console
from rich.text import Text
import pathspec
import argparse
import mimetypes
import pyperclip
def get_file_description(path: Path):
    """Return the MIME type that ``mimetypes`` guesses for ``path``.

    Args:
        path: File whose type should be guessed.

    Returns:
        The guessed MIME type string, or the literal text
        ``"Unknown mimetype"`` when no type could be guessed.
    """
    guessed = mimetypes.guess_type(path)[0]
    if guessed:
        return guessed
    return "Unknown mimetype"
def generate_metadata_string(path: Path, root: Path) -> list[str]:
    """Build the banner lines that introduce a single file in the dump.

    Args:
        path: File being described.
        root: Directory that ``path`` is reported relative to.

    Returns:
        Four lines: a ``#`` rule, the filename relative to ``root``, the
        file's MIME description, and a closing ``#`` rule.
    """
    mime_description = get_file_description(path)
    shown_name = path.relative_to(root)
    rule = "#################"
    return [
        rule,
        f"## Filename: {shown_name}",
        f"## Mimetype: {mime_description}",
        rule,
    ]
def get_ignore_spec(root: Path) -> "pathspec.PathSpec | None":
    """Load the ``.gitignore`` found directly inside ``root``.

    Args:
        root: Directory expected to contain the ``.gitignore`` file.

    Returns:
        A ``PathSpec`` compiled from the file's lines with the
        ``gitwildmatch`` pattern style, or ``None`` when ``root`` holds no
        regular ``.gitignore`` file.
    """
    gitignore_path = root / ".gitignore"
    # is_file() instead of exists(): a directory that happens to be named
    # ".gitignore" cannot be opened as text and must be treated as absent.
    if not gitignore_path.is_file():
        return None
    # Read as UTF-8 explicitly so non-ASCII patterns are decoded the same way
    # on every platform instead of depending on the locale's default encoding.
    with gitignore_path.open("r", encoding="utf-8") as f:
        return pathspec.PathSpec.from_lines("gitwildmatch", f)
def should_include(path: Path, spec: pathspec.PathSpec | None, root: Path, include_git_dir: bool = False, manual_ignores = []):
relative_path = path.relative_to(root)
if any(part in manual_ignores for part in path.parts):
return False
if ".git" in path.parts and not include_git_dir:
return False
if spec is None:
return True
return not spec.match_file(str(relative_path))
def walk_filesystem(
    ignore_gitignore: bool,
    include_git_dir: bool = False,
    manual_ignores: list[str] | None = None,
) -> tuple[int, list[str]]:
    """Collect a text dump of every included file under the current directory.

    Args:
        ignore_gitignore: When true, the ``.gitignore`` file is not consulted.
        include_git_dir: Forwarded to ``should_include``; when false, files
            under ``.git`` are skipped.
        manual_ignores: Extra names forwarded to ``should_include``.

    Returns:
        A ``(file_count, lines)`` tuple. ``file_count`` is the number of
        files that passed the filters. ``lines`` holds each file's metadata
        banner, followed by its fenced contents when they could be read.
    """
    root = Path(".")
    spec = None if ignore_gitignore else get_ignore_spec(root)
    ignores = [] if manual_ignores is None else manual_ignores

    final_content: list[str] = []
    file_count = 0
    # Sorted so the dump is reproducible; rglob alone yields entries in
    # whatever order the filesystem reports them.
    for path in sorted(root.rglob("*")):
        if not path.is_file():
            continue
        if not should_include(path, spec, root, include_git_dir, ignores):
            continue

        final_content.extend(generate_metadata_string(path, root))
        file_count += 1

        # Unreadable files keep their banner and still count, but get no
        # contents section.
        success, content = get_file_contents(path)
        if success:
            final_content.append("## File Contents:\n```")
            final_content.append(content)
            final_content.append("```\n")
    return (file_count, final_content)
def get_file_contents(path: Path) -> tuple[bool, str]:
    """Read ``path`` as UTF-8 text on a best-effort basis.

    Args:
        path: File to read.

    Returns:
        ``(True, text)`` when the file was read and decoded, otherwise
        ``(False, "")``.
    """
    try:
        with path.open("r", encoding="utf-8") as f:
            return (True, f.read())
    except (UnicodeDecodeError, OSError):
        # OSError covers PermissionError as well as files that vanished or
        # became unreadable after being listed; one bad file must not abort
        # the whole dump.
        return (False, "")
def main() -> None:
    """Command-line entry point: dump the current directory for an LLM.

    Parses the command-line flags, walks the current directory, and either
    copies the resulting text to the clipboard or prints it. Unless
    ``--no-stats`` is given, file, character and estimated token counts are
    printed afterwards.
    """
    parser = argparse.ArgumentParser(description="Dumpy: A tool for providing a text representation of a project formatted in a way that LLMs will understand.")
    parser.add_argument(
        "--no-gitignore",
        action="store_true",
        help="Ignore the .gitignore file and include all files"
    )
    parser.add_argument(
        "--include-git-dir",
        action="store_true",
        help="Include the .git directory in the output"
    )
    parser.add_argument(
        "--no-clipboard",
        action="store_true",
        help="Skip putting content into the clipboard and output directly to the console"
    )
    parser.add_argument(
        "--no-stats",
        action="store_true",
        help="Skip printing stats at the end of the output"
    )
    parser.add_argument(
        "--ignore",
        type=str,
        help="Comma separated list of patterns to ignore."
    )
    args = parser.parse_args()

    manual_ignores = []
    if args.ignore:
        # Drop empty entries produced by stray commas such as "a,,b".
        manual_ignores = [
            item.strip() for item in args.ignore.split(",") if item.strip()
        ]

    # --include-git-dir has to be forwarded explicitly, otherwise the flag is
    # accepted but has no effect.
    count, file_sections = walk_filesystem(
        ignore_gitignore=args.no_gitignore,
        include_git_dir=args.include_git_dir,
        manual_ignores=manual_ignores,
    )

    # Put the header first and append the walk result to it, so the header is
    # not replaced by the walk output.
    root_dir = Path(".").absolute()
    content: list[str] = [f"Root Directory: {root_dir}\n"]
    content.extend(file_sections)
    string_content = "\n".join(content)

    if args.no_clipboard:
        print(string_content)
    else:
        pyperclip.copy(string_content)
        if not args.no_stats:
            rprint("\n[bold green]Directory contents copied to clipboard...[/]")
        print()  # Fix styling from the "contents copied" message

    if not args.no_stats:
        # Rough estimate: about four characters per token.
        token_estimate = round(len(string_content) / 4)
        # Test the larger threshold first so the red branch is reachable.
        if token_estimate > 4000:
            tokens_color = "red"
        elif token_estimate > 3000:
            tokens_color = "orange3"
        else:
            tokens_color = "green"

        console = Console()
        # Output stats to the command line
        rprint(f"[bold blue]Total Files:[/] [white]{count}[/white]")
        rprint(f"[bold blue]Total Characters:[/] [white]{len(string_content)}[/white]")
        label = Text("Estimated Tokens: ", style="bold blue")
        label.append(Text(f"~{token_estimate}", style=tokens_color))
        console.print(label + "\n")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()