From 316610e932da26edda0140bbf8b8359a06927790 Mon Sep 17 00:00:00 2001 From: Stephen Simpson Date: Wed, 10 Dec 2025 11:16:55 -0600 Subject: [PATCH] updates --- Dockerfile | 10 +- LICENSE | 2 +- README.md | 278 +++++++++------------------ pyproject.toml | 10 +- src/rocky_man/main.py | 47 +---- src/rocky_man/models/manfile.py | 31 +-- src/rocky_man/processor/converter.py | 90 ++------- src/rocky_man/processor/extractor.py | 75 +++----- src/rocky_man/repo/contents.py | 77 ++------ src/rocky_man/repo/manager.py | 37 +--- src/rocky_man/utils/config.py | 14 +- src/rocky_man/web/generator.py | 60 +++--- templates/404.html | 137 +++++++++++++ templates/root.html | 2 + 14 files changed, 350 insertions(+), 520 deletions(-) create mode 100644 templates/404.html diff --git a/Dockerfile b/Dockerfile index 570ef2f..8dc7ed2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # Multi-stage Dockerfile for Rocky Man # This creates an architecture-independent image that can run on x86_64, aarch64, etc. -FROM rockylinux/rockylinux:9 AS builder +FROM rockylinux/rockylinux:10 AS builder # Install system dependencies RUN dnf install -y epel-release \ @@ -18,7 +18,7 @@ RUN dnf install -y epel-release \ WORKDIR /app # Copy project files -COPY pyproject.toml README.md LICENSE THIRD-PARTY-LICENSES.md ./ +COPY pyproject.toml README.md LICENSE ./ COPY src ./src COPY templates ./templates @@ -26,7 +26,7 @@ COPY templates ./templates RUN python3 -m pip install --no-cache-dir -e . 
# Runtime stage -FROM rockylinux/rockylinux:9 +FROM rockylinux/rockylinux:10 # Install runtime dependencies RUN dnf install -y epel-release \ @@ -39,8 +39,8 @@ RUN dnf install -y epel-release \ && dnf clean all # Copy Python packages and app from builder -COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages -COPY --from=builder /usr/local/lib64/python3.9/site-packages /usr/local/lib64/python3.9/site-packages +COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages +COPY --from=builder /usr/local/lib64/python3.12/site-packages /usr/local/lib64/python3.12/site-packages COPY --from=builder /app /app WORKDIR /app diff --git a/LICENSE b/LICENSE index ed08977..1b2ab88 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Stephen Simpson +Copyright (c) 2025 Ctrl IQ, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index accf1dc..728c82c 100644 --- a/README.md +++ b/README.md @@ -1,133 +1,108 @@ -# Rocky Man 📚 +# 🚀 Rocky Man 🚀 **Rocky Man** is a tool for generating searchable HTML documentation from Rocky Linux man pages across BaseOS and AppStream repositories for Rocky Linux 8, 9, and 10. ## Features -- **Fast & Efficient**: Uses filelists.xml to pre-filter packages with man pages -- **Complete Coverage**: All packages from BaseOS and AppStream repositories -- **Container Ready**: Works on x86_64, aarch64, arm64, etc.
-- **Smart Cleanup**: Automatic cleanup of temporary files (configurable) -- **Parallel Processing**: Concurrent downloads and conversions for maximum speed -- **Multi-version**: Support for Rocky Linux 8, 9, and 10 simultaneously +- Uses filelists.xml to pre-filter packages with man pages +- Processes packages from BaseOS and AppStream repositories +- Runs in containers on x86_64, aarch64, and arm64 architectures +- Configurable cleanup of temporary files +- Concurrent downloads and conversions +- Supports Rocky Linux 8, 9, and 10 ## Quick Start -### Podman (Recommended) - -```bash -# Build the image -podman build -t rocky-man . - -# Generate man pages for Rocky Linux 9.6 (using defaults, no custom args) -podman run --rm -v $(pwd)/html:/data/html:Z rocky-man - -# Generate for specific versions (requires explicit paths) -podman run --rm -v $(pwd)/html:/app/html:Z rocky-man \ - --versions 8.10 9.6 10.0 --output-dir /app/html - -# With verbose logging -podman run --rm -v $(pwd)/html:/app/html:Z rocky-man \ - --versions 9.6 --output-dir /app/html --verbose - -# Keep downloaded RPMs (mount the download directory) -podman run --rm -it \ - -v $(pwd)/html:/app/html:Z \ - -v $(pwd)/downloads:/app/tmp/downloads:Z \ - rocky-man --versions 9.6 --keep-rpms \ - --output-dir /app/html --download-dir /app/tmp/downloads --verbose -``` - -### Docker +### Podman ```bash # Build the image docker build -t rocky-man . 
-# Generate man pages (using defaults, no custom args) -docker run --rm -v $(pwd)/html:/data/html rocky-man +# Generate for specific versions +podman run --rm -v $(pwd)/html:/data/html:Z rocky-man \ + --versions 8.10 9.6 10.0 -# Generate for specific versions (requires explicit paths) -docker run --rm -v $(pwd)/html:/app/html rocky-man \ - --versions 9.6 --output-dir /app/html +# Keep downloaded RPMs for multiple builds +podman run --rm -it \ + -v $(pwd)/html:/data/html:Z \ + -v $(pwd)/downloads:/data/tmp/downloads:Z \ + rocky-man --versions 9.6 --keep-rpms --verbose +``` -# Interactive mode for debugging -docker run --rm -it -v $(pwd)/html:/app/html rocky-man \ - --versions 9.6 --output-dir /app/html --verbose +### View the HTML Locally -# Keep downloaded RPMs (mount the download directory) -docker run --rm -it \ - -v $(pwd)/html:/app/html \ - -v $(pwd)/downloads:/app/tmp/downloads \ - rocky-man --versions 9.6 --keep-rpms \ - --output-dir /app/html --download-dir /app/tmp/downloads --verbose +Start a local web server to browse the generated documentation: + +```bash +python3 -m http.server -d ./html +``` + +Then open [http://127.0.0.1:8000](http://127.0.0.1:8000) in your browser. 
+ +To use a different port: + +```bash +python3 -m http.server 8080 -d ./html ``` ### Directory Structure in Container -The container uses different paths depending on whether you pass custom arguments: +The container uses the following paths: -**Without custom arguments** (using Dockerfile CMD defaults): - `/data/html` - Generated HTML output - `/data/tmp/downloads` - Downloaded RPM files - `/data/tmp/extracts` - Extracted man page files -**With custom arguments** (argparse defaults from working directory `/app`): -- `/app/html` - Generated HTML output -- `/app/tmp/downloads` - Downloaded RPM files -- `/app/tmp/extracts` - Extracted man page files - -**Important**: When passing custom arguments, the container's CMD is overridden and the code falls back to relative paths (`./html` = `/app/html`). You must explicitly specify `--output-dir /app/html --download-dir /app/tmp/downloads` to match your volume mounts. Without this, files are written inside the container and lost when it stops (especially with `--rm`). +These paths are used by default and can be overridden with command-line arguments if needed. ### Local Development -#### Prerequisites +**Important**: Rocky Man requires Rocky Linux because it uses the system's native `python3-dnf` module to interact with DNF repositories. This module cannot be installed via pip and must come from the Rocky Linux system packages. 
-- Python 3.9+ -- pip (Python package manager) -- mandoc (man page converter) -- Rocky Linux system or container (for DNF) - -#### Installation +#### Option 1: Run in a Rocky Linux Container (Recommended) ```bash -# On Rocky Linux, install system dependencies +# Start a Rocky Linux container with your project mounted +podman run --rm -it -v $(pwd):/workspace:Z rockylinux/rockylinux:9 /bin/bash + +# Inside the container, navigate to the project +cd /workspace + +# Install epel-release for mandoc +dnf install -y epel-release + +# Install system dependencies dnf install -y python3 python3-pip python3-dnf mandoc rpm-build dnf-plugins-core # Install Python dependencies pip3 install -e . + +# Run the tool +python3 -m rocky_man.main --versions 9.6 --output-dir ./html/ ``` -#### Usage +#### Option 2: On a Native Rocky Linux System ```bash -# Generate man pages for Rocky 9.6 -python -m rocky_man.main --versions 9.6 +# Install epel-release for mandoc +dnf install -y epel-release -# Generate for multiple versions (default) -python -m rocky_man.main --versions 8.10 9.6 10.0 +# Install system dependencies +dnf install -y python3 python3-pip python3-dnf mandoc rpm-build dnf-plugins-core -# Custom output directory -python -m rocky_man.main --output-dir /var/www/html/man --versions 9.6 +# Install Python dependencies +pip3 install -e . 
-# Keep downloaded RPMs for debugging -python -m rocky_man.main --keep-rpms --verbose - -# Adjust parallelism for faster processing -python -m rocky_man.main --parallel-downloads 10 --parallel-conversions 20 - -# Use a different mirror -python -m rocky_man.main --mirror https://mirrors.example.com/ - -# Only BaseOS (faster) -python -m rocky_man.main --repo-types BaseOS --versions 9.6 +# Run the tool +python3 -m rocky_man.main --versions 9.6 --output-dir ./html/ ``` ## Architecture -Rocky Man is organized into clean, modular components: +Rocky Man is organized into components: -``` +```text rocky-man/ ├── src/rocky_man/ │ ├── models/ # Data models (Package, ManFile) @@ -143,22 +118,28 @@ rocky-man/ ### How It Works -1. **Package Discovery** - Parse repository `filelists.xml` to identify packages with man pages -2. **Smart Download** - Download only packages containing man pages with parallel downloads -3. **Extraction** - Extract man page files from RPM packages -4. **Conversion** - Convert troff format to HTML using mandoc -5. **Web Generation** - Wrap HTML in templates and generate search index -6. **Cleanup** - Automatically remove temporary files (configurable) +1. **Package Discovery** - Parses repository metadata (`repodata/repomd.xml` and `filelists.xml.gz`) to identify packages containing files in `/usr/share/man/` directories +2. **Package Download** - Downloads identified RPM packages using DNF, with configurable parallel downloads (default: 5) +3. **Man Page Extraction** - Extracts man page files from RPMs using `rpm2cpio`, filtering by section and language based on configuration +4. **HTML Conversion** - Converts troff-formatted man pages to HTML using mandoc, with parallel processing (default: 10 workers) +5. **Cross-Reference Linking** - Parses converted HTML to add hyperlinks between man page references (e.g., `bash(1)` becomes clickable) +6.
**Index Generation** - Creates search indexes (JSON/gzipped) and navigation pages using Jinja2 templates +7. **Cleanup** - Removes temporary files (RPMs and extracted content) unless `--keep-rpms` or `--keep-extracts` is specified ## Command Line Options -``` -usage: rocky-man [-h] [--versions VERSIONS [VERSIONS ...]] - [--repo-types REPO_TYPES [REPO_TYPES ...]] - [--output-dir OUTPUT_DIR] [--download-dir DOWNLOAD_DIR] - [--extract-dir EXTRACT_DIR] [--keep-rpms] [--keep-extracts] - [--parallel-downloads N] [--parallel-conversions N] - [--mirror URL] [--template-dir DIR] [-v] +```bash +usage: main.py [-h] [--versions VERSIONS [VERSIONS ...]] + [--repo-types REPO_TYPES [REPO_TYPES ...]] + [--output-dir OUTPUT_DIR] [--download-dir DOWNLOAD_DIR] + [--extract-dir EXTRACT_DIR] [--keep-rpms] [--keep-extracts] + [--parallel-downloads PARALLEL_DOWNLOADS] + [--parallel-conversions PARALLEL_CONVERSIONS] [--mirror MIRROR] + [--vault] [--existing-versions [VERSION ...]] + [--template-dir TEMPLATE_DIR] [-v] + [--skip-sections [SKIP_SECTIONS ...]] + [--skip-packages [SKIP_PACKAGES ...]] [--skip-languages] + [--keep-languages] [--allow-all-sections] Generate HTML documentation for Rocky Linux man pages @@ -169,11 +150,11 @@ optional arguments: --repo-types REPO_TYPES [REPO_TYPES ...] 
Repository types to process (default: BaseOS AppStream) --output-dir OUTPUT_DIR - Output directory for HTML files (default: ./html) + Output directory for HTML files (default: /data/html) --download-dir DOWNLOAD_DIR - Directory for downloading packages (default: ./tmp/downloads) + Directory for downloading packages (default: /data/tmp/downloads) --extract-dir EXTRACT_DIR - Directory for extracting man pages (default: ./tmp/extracts) + Directory for extracting man pages (default: /data/tmp/extracts) --keep-rpms Keep downloaded RPM files after processing --keep-extracts Keep extracted man files after processing --parallel-downloads PARALLEL_DOWNLOADS @@ -196,80 +177,11 @@ optional arguments: --allow-all-sections Include all man sections (overrides --skip-sections) ``` -## Troubleshooting +## Attribution -### DNF Errors +The man pages displayed in this documentation are sourced from Rocky Linux distribution packages. All man page content is copyrighted by their respective authors and distributed under the licenses specified within each man page. 
-**Problem**: `dnf` module not found or repository errors - -**Solution**: Ensure you're running on Rocky Linux or in a Rocky Linux container: - -```bash -# Run in Rocky Linux container -podman run --rm -it -v $(pwd):/app rockylinux:9 /bin/bash -cd /app - -# Install dependencies -dnf install -y python3 python3-dnf mandoc rpm-build dnf-plugins-core - -# Run the script -python3 -m rocky_man.main --versions 9.6 -``` - -### Mandoc Not Found - -**Problem**: `mandoc: command not found` - -**Solution**: Install mandoc: - -```bash -dnf install -y mandoc -``` - -### Permission Errors in Container - -**Problem**: Cannot write to mounted volume - -**Solution**: Use the `:Z` flag with podman for SELinux contexts: - -```bash -podman run --rm -v $(pwd)/html:/data/html:Z rocky-man -``` - -For Docker, ensure the volume path is absolute: - -```bash -docker run --rm -v "$(pwd)/html":/data/html rocky-man -``` - -### Out of Memory - -**Problem**: Process killed due to memory - -**Solution**: Reduce parallelism: - -```bash -python -m rocky_man.main --parallel-downloads 2 --parallel-conversions 5 -``` - -### Slow Downloads - -**Problem**: Downloads are very slow - -**Solution**: Use a closer mirror: - -```bash -# Find mirrors at: https://mirrors.rockylinux.org/mirrormanager/mirrors -python -m rocky_man.main --mirror https://mirror.example.com/rocky/ -``` - -## Performance Tips - -1. **Use closer mirrors** - Significant speed improvement for downloads -2. **Increase parallelism** - If you have bandwidth: `--parallel-downloads 15` -3. **Process one repo at a time** - Use `--repo-types BaseOS` first, then `--repo-types AppStream` -4. **Keep RPMs for re-runs** - Use `--keep-rpms` if testing -5. **Run in container** - More consistent performance +This tool generates HTML documentation from man pages contained in Rocky Linux packages but does not modify the content of the man pages themselves. 
## License @@ -277,20 +189,16 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ### Third-Party Software -This project uses several open source components. See [THIRD-PARTY-LICENSES.md](THIRD-PARTY-LICENSES.md) for complete license information and attributions. +This project uses several open source components. + +Key dependencies include: + +- **mandoc** - Man page converter (ISC License) +- **python3-dnf** - DNF package manager Python bindings (GPL-2.0-or-later) +- **Fuse.js** - Client-side search (Apache 2.0) +- **Python packages**: requests, rpmfile, Jinja2, lxml, zstandard +- **Fonts**: Red Hat Display, Red Hat Text, JetBrains Mono (SIL OFL) ### Trademark Notice -Rocky Linux™ is a trademark of the Rocky Enterprise Software Foundation (RESF). This project is not officially affiliated with or endorsed by RESF. All trademarks are the property of their respective owners. This project complies with RESF's trademark usage guidelines. - -## Contributing - -Contributions welcome! Please: - -1. Fork the repository -2. Create a feature branch (`git checkout -b feature/amazing-feature`) -3. Make your changes with proper documentation -4. Test thoroughly -5. Commit with clear messages (`git commit -m 'feat: add amazing feature'`) -6. Push to your branch (`git push origin feature/amazing-feature`) -7. Open a Pull Request +Rocky Linux is a trademark of the Rocky Enterprise Software Foundation (RESF). This project is not officially affiliated with or endorsed by RESF. All trademarks are the property of their respective owners. This project complies with RESF's trademark usage guidelines.
diff --git a/pyproject.toml b/pyproject.toml index 7b22547..3014eca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,13 +7,13 @@ license = {text = "MIT"} authors = [ { name = "Stephen Simpson", email = "ssimpson89@users.noreply.github.com" } ] -requires-python = ">=3.9" +requires-python = ">=3.12" dependencies = [ - "requests>=2.31.0", - "rpmfile>=2.0.0", + "requests>=2.32.0", + "rpmfile>=2.1.0", "jinja2>=3.1.0", - "lxml>=5.0.0", - "zstandard>=0.18.0", + "lxml>=6.0.0", + "zstandard>=0.25.0", ] [project.scripts] diff --git a/src/rocky_man/main.py b/src/rocky_man/main.py index 0a230a2..920fec1 100644 --- a/src/rocky_man/main.py +++ b/src/rocky_man/main.py @@ -43,18 +43,13 @@ def process_version(config: Config, version: str, template_dir: Path) -> bool: all_man_files = [] - # Process each repository type for repo_type in config.repo_types: logger.info(f"Processing {repo_type} repository") - # Use first available architecture (man pages are arch-independent) arch = config.architectures[0] - - # Create cache dir for this repo cache_dir = config.download_dir / f".cache/{version}/{repo_type}" try: - # Initialize repository manager repo_manager = RepoManager( config=config, version=version, @@ -64,7 +59,6 @@ def process_version(config: Config, version: str, template_dir: Path) -> bool: download_dir=version_download_dir, ) - # List packages (with man pages only) packages = repo_manager.list_packages(with_manpages_only=True) if not packages: @@ -73,7 +67,6 @@ def process_version(config: Config, version: str, template_dir: Path) -> bool: logger.info(f"Found {len(packages)} packages with man pages in {repo_type}") - # Filter out packages that should be skipped if config.skip_packages: original_count = len(packages) packages = [ @@ -86,13 +79,11 @@ def process_version(config: Config, version: str, template_dir: Path) -> bool: ) logger.info(f"Processing {len(packages)} packages") - # Download packages logger.info("Downloading packages...") downloaded = 
repo_manager.download_packages( packages, max_workers=config.parallel_downloads ) - # Extract man pages logger.info("Extracting man pages...") extractor = ManPageExtractor( version_extract_dir, @@ -105,7 +96,6 @@ def process_version(config: Config, version: str, template_dir: Path) -> bool: logger.info(f"Extracted {len(man_files)} man pages") - # Read content for each man file logger.info("Reading man page content...") man_files_with_content = [] for man_file in man_files: @@ -113,7 +103,6 @@ def process_version(config: Config, version: str, template_dir: Path) -> bool: if content: man_files_with_content.append((man_file, content)) - # Convert to HTML logger.info("Converting man pages to HTML...") converter = ManPageConverter(version_output_dir) converted = converter.convert_many( @@ -122,7 +111,6 @@ def process_version(config: Config, version: str, template_dir: Path) -> bool: all_man_files.extend(converted) - # Cleanup if requested if not config.keep_rpms: logger.info("Cleaning up downloaded packages...") for package in downloaded: @@ -141,30 +129,21 @@ def process_version(config: Config, version: str, template_dir: Path) -> bool: logger.error(f"No man pages were successfully processed for version {version}") return False - # Generate web pages logger.info("Generating web pages...") web_gen = WebGenerator(template_dir, config.output_dir) - # Generate search index search_index = web_gen.generate_search_index(all_man_files, version) web_gen.save_search_index(search_index, version) - - # Generate index page web_gen.generate_index(version, search_index) - - # Generate packages index page web_gen.generate_packages_index(version, search_index) - # Set HTML paths for all man files for man_file in all_man_files: if not man_file.html_path: man_file.html_path = web_gen._get_manpage_path(man_file, version) - # Link cross-references between man pages logger.info("Linking cross-references...") converter.link_cross_references(all_man_files, version) - # Wrap man pages in 
templates logger.info("Generating man page HTML...") for man_file in all_man_files: web_gen.generate_manpage_html(man_file, version) @@ -198,22 +177,22 @@ def main(): parser.add_argument( "--output-dir", type=Path, - default=Path("./html"), - help="Output directory for HTML files (default: ./html)", + default=Path("/data/html"), + help="Output directory for HTML files (default: /data/html)", ) parser.add_argument( "--download-dir", type=Path, - default=Path("./tmp/downloads"), - help="Directory for downloading packages (default: ./tmp/downloads)", + default=Path("/data/tmp/downloads"), + help="Directory for downloading packages (default: /data/tmp/downloads)", ) parser.add_argument( "--extract-dir", type=Path, - default=Path("./tmp/extracts"), - help="Directory for extracting man pages (default: ./tmp/extracts)", + default=Path("/data/tmp/extracts"), + help="Directory for extracting man pages (default: /data/tmp/extracts)", ) parser.add_argument( @@ -307,21 +286,17 @@ def main(): args = parser.parse_args() - # Setup logging setup_logging(args.verbose) logger = logging.getLogger(__name__) - # Handle filtering options - skip_languages = True # default + skip_languages = True if args.keep_languages: skip_languages = False elif args.skip_languages is not None: skip_languages = args.skip_languages - # Determine content directory content_dir = "vault/rocky" if args.vault else "pub/rocky" - # Create configuration config = Config( base_url=args.mirror, content_dir=content_dir, @@ -340,7 +315,6 @@ def main(): allow_all_sections=args.allow_all_sections, ) - # Get existing versions from scan and argument scanned_versions = [ d.name for d in config.output_dir.iterdir() @@ -348,7 +322,6 @@ def main(): ] arg_versions = args.existing_versions or [] - # Sort versions numerically by (major, minor) def version_key(v): try: major, minor = v.split(".") @@ -365,7 +338,6 @@ def main(): logger.info(f"Repositories: {', '.join(config.repo_types)}") logger.info(f"Output directory: 
{config.output_dir}") - # Log filtering configuration if config.skip_sections: logger.info(f"Skipping man sections: {', '.join(config.skip_sections)}") else: @@ -379,7 +351,6 @@ def main(): else: logger.info("Including all languages") - # Process each version processed_versions = [] for version in config.versions: try: @@ -392,11 +363,13 @@ def main(): logger.error("No versions were successfully processed") return 1 - # Generate root index logger.info("Generating root index page...") web_gen = WebGenerator(args.template_dir, config.output_dir) web_gen.generate_root_index(all_versions) + logger.info("Generating 404 page...") + web_gen.generate_404_page() + logger.info("=" * 60) logger.info("Processing complete!") logger.info(f"Generated documentation for: {', '.join(processed_versions)}") diff --git a/src/rocky_man/models/manfile.py b/src/rocky_man/models/manfile.py index 3cfa773..1e3d4d9 100644 --- a/src/rocky_man/models/manfile.py +++ b/src/rocky_man/models/manfile.py @@ -35,35 +35,22 @@ class ManFile: self._parse_path() def _parse_path(self): - """Extract section, name, and language from the file path. 
- - Example paths: - /usr/share/man/man1/bash.1.gz - /usr/share/man/es/man1/bash.1.gz - /usr/share/man/man3/printf.3.gz - """ + """Extract section, name, and language from the file path.""" parts = self.file_path.parts filename = self.file_path.name - # Remove .gz extension if present if filename.endswith('.gz'): filename = filename[:-3] - # Extract section from parent directory (e.g., 'man1', 'man3p', 'man3pm') for part in reversed(parts): if part.startswith('man') and len(part) > 3: - # Check if it starts with 'man' followed by a digit if part[3].isdigit(): self.section = part[3:] break - # Extract section from filename if not found yet (e.g., 'foo.3pm' -> section '3pm') - # and extract name name_parts = filename.split('.') if len(name_parts) >= 2: - # Try to identify section from last part potential_section = name_parts[-1] - # Section is typically digit optionally followed by letters (1, 3p, 3pm, etc.) if potential_section and potential_section[0].isdigit(): if not self.section: self.section = potential_section @@ -73,14 +60,10 @@ class ManFile: else: self.name = name_parts[0] - # Check for language subdirectory - # Pattern: /usr/share/man//man
/ for i, part in enumerate(parts): if part == 'man' and i + 1 < len(parts): next_part = parts[i + 1] - # If next part is not 'man', it's a language code if not (next_part.startswith('man') and next_part[3:].isdigit()): - # Common language codes are 2-5 chars (en, es, pt_BR, etc.) if len(next_part) <= 5: self.language = next_part break @@ -93,14 +76,12 @@ class ManFile: @property def html_filename(self) -> str: """Get the HTML filename for this man page.""" - # Clean name for filesystem safety safe_name = self._clean_filename(self.name) suffix = f".{self.language}" if self.language else "" return f"{safe_name}.{self.section}{suffix}.html" def _clean_filename(self, name: str) -> str: """Clean filename for filesystem safety.""" - # Replace problematic characters name = name.replace('/', '_') name = name.replace(':', '_') name = re.sub(r'\.\.', '__', name) @@ -108,19 +89,13 @@ class ManFile: @property def uri_path(self) -> str: - """Get the URI path for this man page (relative to version root). - - Returns path like: 'bash/man1/bash.1.html' - """ + """Get the URI path for this man page (relative to version root).""" if not self.html_path: return "" - # Get path relative to the version directory - # Assuming structure: html///
/.html parts = self.html_path.parts try: - # Find the version part (e.g., '9.5') and return everything after it for i, part in enumerate(parts): - if re.match(r'\d+\.\d+', part): # Version pattern + if re.match(r'\d+\.\d+', part): return '/'.join(parts[i+1:]) except (ValueError, IndexError): pass diff --git a/src/rocky_man/processor/converter.py b/src/rocky_man/processor/converter.py index 6dee0b2..6d82920 100644 --- a/src/rocky_man/processor/converter.py +++ b/src/rocky_man/processor/converter.py @@ -38,15 +38,11 @@ class ManPageConverter: def _check_mandoc() -> bool: """Check if mandoc is available.""" try: - # Run mandoc with no arguments - it will show usage and exit - # We just want to verify the command exists, not that it succeeds subprocess.run(["mandoc"], capture_output=True, timeout=5) return True except FileNotFoundError: - # mandoc command not found return False except Exception: - # Other errors (timeout, etc) - but mandoc exists return True def convert(self, man_file: ManFile, content: str) -> bool: @@ -60,26 +56,20 @@ class ManPageConverter: True if conversion successful """ try: - # Run mandoc to convert to HTML html = self._run_mandoc(content) if not html: logger.warning(f"mandoc produced no output for {man_file.display_name}") return False - # Clean up HTML html = self._clean_html(html) - # Check if mandoc output indicates this is a symlink/redirect - # Pattern:
/usr/share/man/man8/target.8.gz
- # or:
See the file /usr/share/man/man8/target.8.
- # or:
See the file man1/builtin.1.
+ # Check if output indicates this is a symlink/redirect symlink_match = re.search( r'
.*?(?:See the file )?((?:/usr/share/man/)?man\d+[a-z]*/([^/]+)\.(\d+[a-z]*)(?:\.gz)?)\..*?
', html, re.DOTALL, ) if not symlink_match: - # Try simpler pattern without "See the file" or period symlink_match = re.search( r'
.*?((?:/usr/share/man/)?man\d+[a-z]*/([^/<]+)\.(\d+[a-z]*)(?:\.gz)?).*?
', html, @@ -94,14 +84,9 @@ class ManPageConverter: ) html = self._generate_redirect_html({"name": name, "section": section}) - # Store in ManFile object man_file.html_content = html - - # Determine output path output_path = self._get_output_path(man_file) man_file.html_path = output_path - - # Save HTML file output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w", encoding="utf-8") as f: f.write(html) @@ -128,13 +113,11 @@ class ManPageConverter: converted = [] with ThreadPoolExecutor(max_workers=max_workers) as executor: - # Submit all conversion tasks future_to_manfile = { executor.submit(self.convert, man_file, content): man_file for man_file, content in man_files } - # Collect results for future in as_completed(future_to_manfile): man_file = future_to_manfile[future] try: @@ -166,7 +149,6 @@ class ManPageConverter: if result.returncode != 0: stderr = result.stderr.decode("utf-8", errors="replace") logger.warning(f"mandoc returned error: {stderr}") - # Sometimes mandoc returns non-zero but still produces output if result.stdout: return result.stdout.decode("utf-8", errors="replace") return None @@ -189,15 +171,11 @@ class ManPageConverter: Returns: Cleaned HTML """ - # Remove empty parentheses in header cells html = re.sub( - r'\(\)', '', html + r'\(\)', + r'', + html, ) - html = re.sub( - r'\(\)', '', html - ) - - # Strip leading/trailing whitespace html = html.strip() return html @@ -213,12 +191,8 @@ class ManPageConverter: """ name = target_info["name"] section = target_info["section"] - - # Generate the relative path to the target man page - # Symlinks are in the same package, just different file names target_filename = f"{name}.{section}.html" - # Generate simple redirect HTML with a working hyperlink html = f'''