CUSP-1256 (#1)

* Complete refactor

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>

* Complete refactor

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>

---------

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
Stephen Simpson committed 2025-11-20 12:16:33 -05:00 (committed by GitHub)
parent 5248edad62
commit ec32c72363
44 changed files with 4083 additions and 1540 deletions

src/__init__.py Normal file
@@ -0,0 +1,5 @@
from .utils.config import Config
__version__ = "0.1.0"
__all__ = ["Config"]

src/rocky_man/main.py Normal file
@@ -0,0 +1,377 @@
"""Main entry point for Rocky Man."""
import argparse
import logging
import sys
from pathlib import Path
from .utils.config import Config
from .repo import RepoManager
from .processor import ManPageExtractor, ManPageConverter
from .web import WebGenerator
def setup_logging(verbose: bool = False):
"""Configure logging."""
level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(
level=level,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
def process_version(
config: Config,
version: str,
template_dir: Path
) -> bool:
"""Process a single Rocky Linux version.
Args:
config: Configuration object
version: Rocky Linux version to process
template_dir: Path to templates directory
Returns:
True if successful
"""
logger = logging.getLogger(__name__)
logger.info(f"Processing Rocky Linux {version}")
# Setup directories for this version
version_download_dir = config.get_version_download_dir(version)
version_extract_dir = config.get_version_extract_dir(version)
version_output_dir = config.get_version_output_dir(version)
all_man_files = []
# Process each repository type
for repo_type in config.repo_types:
logger.info(f"Processing {repo_type} repository")
# Use first available architecture (man pages are arch-independent)
arch = config.architectures[0]
# Get repository URL
repo_url = config.get_repo_url(version, repo_type, arch)
# Create cache dir for this repo
cache_dir = config.download_dir / f".cache/{version}/{repo_type}"
try:
# Initialize repository manager
repo_manager = RepoManager(
repo_url=repo_url,
version=version,
repo_type=repo_type,
arch=arch,
cache_dir=cache_dir,
download_dir=version_download_dir
)
# List packages (with man pages only)
packages = repo_manager.list_packages(with_manpages_only=True)
if not packages:
logger.warning(f"No packages found in {repo_type}")
continue
logger.info(f"Found {len(packages)} packages with man pages in {repo_type}")
# Filter out packages that should be skipped
if config.skip_packages:
original_count = len(packages)
packages = [
pkg for pkg in packages
if pkg.name not in config.skip_packages
]
filtered_count = original_count - len(packages)
if filtered_count > 0:
logger.info(f"Filtered out {filtered_count} packages based on skip list")
logger.info(f"Processing {len(packages)} packages")
# Download packages
logger.info("Downloading packages...")
downloaded = repo_manager.download_packages(
packages,
max_workers=config.parallel_downloads
)
# Extract man pages
logger.info("Extracting man pages...")
extractor = ManPageExtractor(
version_extract_dir,
skip_sections=config.skip_sections,
skip_languages=config.skip_languages
)
man_files = extractor.extract_from_packages(
downloaded,
max_workers=config.parallel_downloads
)
logger.info(f"Extracted {len(man_files)} man pages")
# Read content for each man file
logger.info("Reading man page content...")
man_files_with_content = []
for man_file in man_files:
content = extractor.read_manpage_content(man_file)
if content:
man_files_with_content.append((man_file, content))
# Convert to HTML
logger.info("Converting man pages to HTML...")
converter = ManPageConverter(version_output_dir)
converted = converter.convert_many(
man_files_with_content,
max_workers=config.parallel_conversions
)
all_man_files.extend(converted)
# Cleanup if requested
if not config.keep_rpms:
logger.info("Cleaning up downloaded packages...")
for package in downloaded:
repo_manager.cleanup_package(package)
if not config.keep_extracts:
logger.info("Cleaning up extracted files...")
for package in downloaded:
extractor.cleanup_extracts(package)
except Exception as e:
logger.error(f"Error processing {repo_type}: {e}", exc_info=True)
continue
if not all_man_files:
logger.error(f"No man pages were successfully processed for version {version}")
return False
# Link cross-references between man pages
logger.info("Linking cross-references...")
converter = ManPageConverter(version_output_dir)
converter.link_cross_references(all_man_files)
# Generate web pages
logger.info("Generating web pages...")
web_gen = WebGenerator(template_dir, config.output_dir)
# Generate search index
search_index = web_gen.generate_search_index(all_man_files, version)
web_gen.save_search_index(search_index, version)
# Generate index page
web_gen.generate_index(version, search_index)
# Generate packages index page
web_gen.generate_packages_index(version, search_index)
# Wrap man pages in templates
logger.info("Generating man page HTML...")
for man_file in all_man_files:
web_gen.generate_manpage_html(man_file, version)
logger.info(f"Successfully processed {len(all_man_files)} man pages for Rocky Linux {version}")
return True
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Generate HTML documentation for Rocky Linux man pages'
)
parser.add_argument(
'--versions',
nargs='+',
default=['8.10', '9.6', '10.0'],
help='Rocky Linux versions to process (default: 8.10 9.6 10.0)'
)
parser.add_argument(
'--repo-types',
nargs='+',
default=['BaseOS', 'AppStream'],
help='Repository types to process (default: BaseOS AppStream)'
)
parser.add_argument(
'--output-dir',
type=Path,
default=Path('./html'),
help='Output directory for HTML files (default: ./html)'
)
parser.add_argument(
'--download-dir',
type=Path,
default=Path('./tmp/downloads'),
help='Directory for downloading packages (default: ./tmp/downloads)'
)
parser.add_argument(
'--extract-dir',
type=Path,
default=Path('./tmp/extracts'),
help='Directory for extracting man pages (default: ./tmp/extracts)'
)
parser.add_argument(
'--keep-rpms',
action='store_true',
help='Keep downloaded RPM files after processing'
)
parser.add_argument(
'--keep-extracts',
action='store_true',
help='Keep extracted man files after processing'
)
parser.add_argument(
'--parallel-downloads',
type=int,
default=5,
help='Number of parallel downloads (default: 5)'
)
parser.add_argument(
'--parallel-conversions',
type=int,
default=10,
help='Number of parallel HTML conversions (default: 10)'
)
parser.add_argument(
'--mirror',
default='http://dl.rockylinux.org/',
help='Rocky Linux mirror URL (default: http://dl.rockylinux.org/)'
)
parser.add_argument(
'--template-dir',
type=Path,
default=Path(__file__).parent.parent.parent / 'templates',
help='Template directory (default: <repo root>/templates)'
)
parser.add_argument(
'-v', '--verbose',
action='store_true',
help='Enable verbose logging'
)
parser.add_argument(
'--skip-sections',
nargs='*',
default=None,
help='Man sections to skip (default: 3 3p 3pm). Use empty list to skip none.'
)
parser.add_argument(
'--skip-packages',
nargs='*',
default=None,
help='Package names to skip (default: lapack dpdk-devel gl-manpages). Use empty list to skip none.'
)
parser.add_argument(
'--skip-languages',
action='store_true',
default=None,
help='Skip non-English man pages (default: enabled)'
)
parser.add_argument(
'--keep-languages',
action='store_true',
help='Keep all languages (disables --skip-languages)'
)
parser.add_argument(
'--allow-all-sections',
action='store_true',
help='Include all man sections (overrides --skip-sections)'
)
args = parser.parse_args()
# Setup logging
setup_logging(args.verbose)
logger = logging.getLogger(__name__)
# Handle filtering options
skip_languages = True # default
if args.keep_languages:
skip_languages = False
elif args.skip_languages is not None:
skip_languages = args.skip_languages
# Create configuration
config = Config(
base_url=args.mirror,
versions=args.versions,
repo_types=args.repo_types,
download_dir=args.download_dir,
extract_dir=args.extract_dir,
output_dir=args.output_dir,
keep_rpms=args.keep_rpms,
keep_extracts=args.keep_extracts,
parallel_downloads=args.parallel_downloads,
parallel_conversions=args.parallel_conversions,
skip_sections=args.skip_sections,
skip_packages=args.skip_packages,
skip_languages=skip_languages,
allow_all_sections=args.allow_all_sections
)
logger.info("Rocky Man - Rocky Linux Man Page Generator")
logger.info(f"Versions: {', '.join(config.versions)}")
logger.info(f"Repositories: {', '.join(config.repo_types)}")
logger.info(f"Output directory: {config.output_dir}")
# Log filtering configuration
if config.skip_sections:
logger.info(f"Skipping man sections: {', '.join(config.skip_sections)}")
else:
logger.info("Including all man sections")
if config.skip_packages:
logger.info(f"Skipping packages: {', '.join(config.skip_packages)}")
if config.skip_languages:
logger.info("Skipping non-English languages")
else:
logger.info("Including all languages")
# Process each version
processed_versions = []
for version in config.versions:
try:
if process_version(config, version, args.template_dir):
processed_versions.append(version)
except Exception as e:
logger.error(f"Failed to process version {version}: {e}", exc_info=True)
if not processed_versions:
logger.error("No versions were successfully processed")
return 1
# Generate root index
logger.info("Generating root index page...")
web_gen = WebGenerator(args.template_dir, config.output_dir)
web_gen.generate_root_index(processed_versions)
logger.info("=" * 60)
logger.info("Processing complete!")
logger.info(f"Generated documentation for: {', '.join(processed_versions)}")
logger.info(f"Output directory: {config.output_dir.absolute()}")
logger.info("=" * 60)
return 0
if __name__ == '__main__':
sys.exit(main())

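For reference, the refactored pipeline can also be driven programmatically rather than through the CLI. A minimal sketch, assuming the package is importable as rocky_man (it lives under src/rocky_man/), mandoc is on PATH, and python3-dnf is available; all paths are illustrative:

from pathlib import Path
from rocky_man.main import process_version, setup_logging
from rocky_man.utils.config import Config

setup_logging(verbose=True)
config = Config(
    versions=["9.6"],
    repo_types=["BaseOS"],
    download_dir=Path("./tmp/downloads"),
    extract_dir=Path("./tmp/extracts"),
    output_dir=Path("./html"),
)
# Process one version end to end: download, extract, convert, generate pages
ok = process_version(config, "9.6", template_dir=Path("./templates"))
print("done" if ok else "no man pages processed")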
src/rocky_man/models/__init__.py Normal file
@@ -0,0 +1,6 @@
"""Data models for Rocky Man."""
from .package import Package
from .manfile import ManFile
__all__ = ["Package", "ManFile"]

src/rocky_man/models/manfile.py Normal file
@@ -0,0 +1,130 @@
"""ManFile model representing a man page file."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import re
@dataclass
class ManFile:
"""Represents a man page file extracted from an RPM package.
Attributes:
file_path: Path to the extracted man page file
package_name: Name of the package this man page belongs to
section: Man page section (e.g., '1', '3p', '3pm')
name: Man page name without extension
language: Language code (e.g., 'en', 'es', None for default)
content: Raw man page content (gzipped or plain text)
html_content: Converted HTML content
html_path: Path where HTML file is saved
"""
file_path: Path
package_name: str
section: Optional[str] = None
name: Optional[str] = None
language: Optional[str] = None
content: Optional[bytes] = None
html_content: Optional[str] = None
html_path: Optional[Path] = None
def __post_init__(self):
"""Parse file information from the path."""
self._parse_path()
def _parse_path(self):
"""Extract section, name, and language from the file path.
Example paths:
/usr/share/man/man1/bash.1.gz
/usr/share/man/es/man1/bash.1.gz
/usr/share/man/man3/printf.3.gz
"""
parts = self.file_path.parts
filename = self.file_path.name
# Remove .gz extension if present
if filename.endswith('.gz'):
filename = filename[:-3]
# Extract section from parent directory (e.g., 'man1', 'man3p', 'man3pm')
for part in reversed(parts):
if part.startswith('man') and len(part) > 3:
# Check if it starts with 'man' followed by a digit
if part[3].isdigit():
self.section = part[3:]
break
# Extract section from filename if not found yet (e.g., 'foo.3pm' -> section '3pm')
# and extract name
name_parts = filename.split('.')
if len(name_parts) >= 2:
# Try to identify section from last part
potential_section = name_parts[-1]
# Section is typically digit optionally followed by letters (1, 3p, 3pm, etc.)
if potential_section and potential_section[0].isdigit():
if not self.section:
self.section = potential_section
self.name = '.'.join(name_parts[:-1])
else:
self.name = name_parts[0]
else:
self.name = name_parts[0]
# Check for language subdirectory
# Pattern: /usr/share/man/<lang>/man<section>/
for i, part in enumerate(parts):
if part == 'man' and i + 1 < len(parts):
next_part = parts[i + 1]
# If next part is not a section directory ('man1', 'man3p', ...), it's a language code
if not (next_part.startswith('man') and len(next_part) > 3 and next_part[3].isdigit()):
# Common language codes are 2-5 chars (en, es, pt_BR, etc.)
if len(next_part) <= 5:
self.language = next_part
break
@property
def display_name(self) -> str:
"""Get display name for the man page (e.g., 'bash(1)')."""
return f"{self.name}({self.section})" if self.section else self.name
@property
def html_filename(self) -> str:
"""Get the HTML filename for this man page."""
# Clean name for filesystem safety
safe_name = self._clean_filename(self.name)
suffix = f".{self.language}" if self.language else ""
return f"{safe_name}.{self.section}{suffix}.html"
def _clean_filename(self, name: str) -> str:
"""Clean filename for filesystem safety."""
# Replace problematic characters
name = name.replace('/', '_')
name = name.replace(':', '_')
name = re.sub(r'\.\.', '__', name)
return name
@property
def uri_path(self) -> str:
"""Get the URI path for this man page (relative to version root).
Returns path like: 'bash/man1/bash.1.html'
"""
if not self.html_path:
return ""
# Get path relative to the version directory
# Assuming structure: html/<version>/<package>/<section>/<file>.html
parts = self.html_path.parts
try:
# Find the version part (e.g., '9.5') and return everything after it
for i, part in enumerate(parts):
if re.match(r'\d+\.\d+', part): # Version pattern
return '/'.join(parts[i+1:])
except (ValueError, IndexError):
pass
return str(self.html_path)
def __str__(self):
return f"{self.package_name}: {self.display_name}"

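To illustrate the path-parsing rules above, here is what ManFile derives from a typical localized man page path; the import path rocky_man.models is an assumption about how the package is installed:

from pathlib import Path
from rocky_man.models import ManFile

mf = ManFile(
    file_path=Path("/usr/share/man/es/man1/bash.1.gz"),
    package_name="bash",
)
assert mf.section == "1"             # from the 'man1' directory
assert mf.name == "bash"             # filename minus '.gz' and section suffix
assert mf.language == "es"           # language subdirectory under /usr/share/man/
assert mf.display_name == "bash(1)"
assert mf.html_filename == "bash.1.es.html"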
src/rocky_man/models/package.py Normal file
@@ -0,0 +1,58 @@
"""Package model representing an RPM package."""
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@dataclass
class Package:
"""Represents an RPM package from a Rocky Linux repository.
Attributes:
name: Package name (e.g., 'bash')
version: Package version
release: Package release
arch: Architecture (e.g., 'x86_64', 'noarch')
repo_type: Repository type ('BaseOS' or 'AppStream')
location: Relative path in repo (e.g., 'Packages/b/bash-5.1.8-6.el9.x86_64.rpm')
baseurl: Base URL of the repository
checksum: Package checksum for verification
checksum_type: Type of checksum (e.g., 'sha256')
download_path: Local path where package is downloaded
has_manpages: Whether this package contains man pages
"""
name: str
version: str
release: str
arch: str
repo_type: str
location: str
baseurl: str
checksum: str
checksum_type: str
has_manpages: bool = False
download_path: Optional[Path] = None
@property
def filename(self) -> str:
"""Get the RPM filename from the location."""
return self.location.split("/")[-1]
@property
def download_url(self) -> str:
"""Get the full download URL for this package."""
return f"{self.baseurl.rstrip('/')}/{self.location.lstrip('/')}"
@property
def nvra(self) -> str:
"""Get the Name-Version-Release-Arch identifier."""
return f"{self.name}-{self.version}-{self.release}.{self.arch}"
def __lt__(self, other):
"""Enable sorting packages by name."""
return self.name < other.name
def __str__(self):
return f"{self.nvra} ({self.repo_type})"

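The derived properties compose the RPM filename, download URL, and NVRA identifier from the repository metadata fields; a small sketch using the illustrative values from the docstring (rocky_man.models is an assumed import path):

from rocky_man.models import Package

pkg = Package(
    name="bash",
    version="5.1.8",
    release="6.el9",
    arch="x86_64",
    repo_type="BaseOS",
    location="Packages/b/bash-5.1.8-6.el9.x86_64.rpm",
    baseurl="http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/",
    checksum="",
    checksum_type="sha256",
)
assert pkg.filename == "bash-5.1.8-6.el9.x86_64.rpm"
assert pkg.nvra == "bash-5.1.8-6.el9.x86_64"
assert pkg.download_url == (
    "http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/"
    "Packages/b/bash-5.1.8-6.el9.x86_64.rpm"
)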
src/rocky_man/processor/__init__.py Normal file
@@ -0,0 +1,4 @@
from .extractor import ManPageExtractor
from .converter import ManPageConverter
__all__ = ["ManPageExtractor", "ManPageConverter"]

src/rocky_man/processor/converter.py Normal file
@@ -0,0 +1,292 @@
"""Convert man pages to HTML using mandoc."""
import logging
import re
import subprocess
from pathlib import Path
from typing import List, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from ..models import ManFile
logger = logging.getLogger(__name__)
class ManPageConverter:
"""Converts man pages to HTML using mandoc.
Handles:
- Converting troff to HTML using mandoc
- Cleaning up HTML output
- Parallel conversion of multiple man pages
"""
def __init__(self, output_dir: Path):
"""Initialize converter.
Args:
output_dir: Base directory for HTML output
"""
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
# Check if mandoc is available
if not self._check_mandoc():
raise RuntimeError("mandoc is not installed or not in PATH")
@staticmethod
def _check_mandoc() -> bool:
"""Check if mandoc is available."""
try:
# Run mandoc with no arguments - it will show usage and exit
# We just want to verify the command exists, not that it succeeds
subprocess.run(
['mandoc'],
capture_output=True,
timeout=5
)
return True
except FileNotFoundError:
# mandoc command not found
return False
except Exception:
# Other errors (timeout, etc) - but mandoc exists
return True
def convert(self, man_file: ManFile, content: str) -> bool:
"""Convert a single man page to HTML.
Args:
man_file: ManFile object to convert
content: Raw man page content (troff format)
Returns:
True if conversion successful
"""
try:
# Run mandoc to convert to HTML
html = self._run_mandoc(content)
if not html:
logger.warning(f"mandoc produced no output for {man_file.display_name}")
return False
# Clean up HTML
html = self._clean_html(html)
# Store in ManFile object
man_file.html_content = html
# Determine output path
output_path = self._get_output_path(man_file)
man_file.html_path = output_path
# Save HTML file
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(html)
logger.debug(f"Converted {man_file.display_name} -> {output_path}")
return True
except Exception as e:
logger.error(f"Error converting {man_file.display_name}: {e}")
return False
def convert_many(
self,
man_files: List[Tuple[ManFile, str]],
max_workers: int = 10
) -> List[ManFile]:
"""Convert multiple man pages in parallel.
Args:
man_files: List of (ManFile, content) tuples
max_workers: Maximum number of parallel conversions
Returns:
List of successfully converted ManFile objects
"""
converted = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all conversion tasks
future_to_manfile = {
executor.submit(self.convert, man_file, content): man_file
for man_file, content in man_files
}
# Collect results
for future in as_completed(future_to_manfile):
man_file = future_to_manfile[future]
try:
if future.result():
converted.append(man_file)
except Exception as e:
logger.error(f"Error converting {man_file.display_name}: {e}")
logger.info(f"Converted {len(converted)}/{len(man_files)} man pages to HTML")
return converted
def _run_mandoc(self, content: str) -> Optional[str]:
"""Run mandoc to convert man page to HTML.
Args:
content: Raw man page content
Returns:
HTML output from mandoc, or None on error
"""
try:
result = subprocess.run(
['mandoc', '-T', 'html', '-O', 'fragment,toc'],
input=content.encode('utf-8'),
capture_output=True,
timeout=30
)
if result.returncode != 0:
stderr = result.stderr.decode('utf-8', errors='replace')
logger.warning(f"mandoc returned error: {stderr}")
# Sometimes mandoc returns non-zero but still produces output
if result.stdout:
return result.stdout.decode('utf-8', errors='replace')
return None
return result.stdout.decode('utf-8', errors='replace')
except subprocess.TimeoutExpired:
logger.error("mandoc conversion timed out")
return None
except Exception as e:
logger.error(f"Error running mandoc: {e}")
return None
def _clean_html(self, html: str) -> str:
"""Clean up mandoc HTML output.
Args:
html: Raw HTML from mandoc
Returns:
Cleaned HTML
"""
# Remove empty parentheses in header cells
html = re.sub(
r'<td class="head-ltitle">\(\)</td>',
'<td class="head-ltitle"></td>',
html
)
html = re.sub(
r'<td class="head-rtitle">\(\)</td>',
'<td class="head-rtitle"></td>',
html
)
# Strip leading/trailing whitespace
html = html.strip()
return html
def link_cross_references(self, man_files: List[ManFile]) -> None:
"""Add hyperlinks to cross-references in SEE ALSO sections.
Goes through all converted HTML files and converts man page references
like pty(4) into working hyperlinks.
Args:
man_files: List of all converted ManFile objects
"""
# Build lookup index: (name, section) -> relative_path
lookup = {}
for mf in man_files:
key = (mf.name.lower(), str(mf.section))
if key not in lookup:
# Store the relative path from the version root
lookup[key] = f"{mf.package_name}/man{mf.section}/{mf.html_filename}"
logger.info(f"Linking cross-references across {len(man_files)} man pages...")
# Process each man page HTML file
for man_file in man_files:
if not man_file.html_path or not man_file.html_path.exists():
continue
try:
# Read the HTML
with open(man_file.html_path, 'r', encoding='utf-8') as f:
html = f.read()
# Find and replace man page references
# Mandoc outputs references as: <b>name</b>(section)
# Pattern matches both <b>name</b>(section) and plain name(section)
pattern = r'<b>([\w\-_.]+)</b>\((\d+[a-z]*)\)|\b([\w\-_.]+)\((\d+[a-z]*)\)'
def replace_reference(match):
full_match = match.group(0)
# Check if this match is already inside an <a> tag
# Look back up to 500 chars for context
before_text = html[max(0, match.start()-500):match.start()]
# Find the last <a and last </a> before this match
last_open = before_text.rfind('<a ')
last_close = before_text.rfind('</a>')
# If the last <a> is after the last </a>, we're inside a link
if last_open > last_close:
return full_match
if match.group(1): # <b>name</b>(section) format
name = match.group(1).lower()
section = match.group(2)
else: # plain name(section) format
name = match.group(3).lower()
section = match.group(4)
# Look up the referenced man page
key = (name, section)
if key in lookup:
# Calculate relative path from current file to target
target_path = lookup[key]
# File structure: output_dir/version/package_name/manN/file.html
# Relative links resolve from the current file's directory
# (package_name/manN/), so go up two levels to reach the version root
# Current: package_name/manN/file.html
# Target: other_package/manM/file.html
rel_path = f"../../{target_path}"
return f'<a href="{rel_path}">{full_match}</a>'
return full_match
updated_html = re.sub(pattern, replace_reference, html)
# Only write if something changed
if updated_html != html:
with open(man_file.html_path, 'w', encoding='utf-8') as f:
f.write(updated_html)
except Exception as e:
logger.warning(f"Error linking references in {man_file.display_name}: {e}")
logger.info("Cross-reference linking complete")
def _get_output_path(self, man_file: ManFile) -> Path:
"""Determine output path for HTML file.
Structure: output_dir/<package>/<section>/<name>.<section>[.<lang>].html
Args:
man_file: ManFile object
Returns:
Path for HTML output
"""
# Package directory
pkg_dir = self.output_dir / man_file.package_name
# Section directory (man1, man2, etc.)
section_dir = pkg_dir / f"man{man_file.section}"
# HTML filename
filename = man_file.html_filename
return section_dir / filename

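A minimal conversion sketch, assuming mandoc is installed and the package is importable as rocky_man; the troff source is a toy page for illustration:

from pathlib import Path
from rocky_man.models import ManFile
from rocky_man.processor import ManPageConverter

troff = ".TH DEMO 1\n.SH NAME\ndemo \\- a toy man page\n"
mf = ManFile(file_path=Path("/usr/share/man/man1/demo.1.gz"), package_name="demo")
converter = ManPageConverter(Path("./html/9.6"))
converted = converter.convert_many([(mf, troff)], max_workers=1)
# On success, mf.html_path is ./html/9.6/demo/man1/demo.1.html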
src/rocky_man/processor/extractor.py Normal file
@@ -0,0 +1,222 @@
"""Extract man pages from RPM packages."""
import gzip
import logging
from pathlib import Path
from typing import List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import rpmfile
from ..models import Package, ManFile
logger = logging.getLogger(__name__)
class ManPageExtractor:
"""Extracts man pages from RPM packages.
Handles:
- Extracting man pages from RPMs
- Reading gzipped man page content
- Organizing extracted files by package
"""
def __init__(self, extract_dir: Path, skip_sections: Optional[List[str]] = None, skip_languages: bool = True):
"""Initialize extractor.
Args:
extract_dir: Base directory for extracting man pages
skip_sections: List of man sections to skip (e.g., ['3', '3p', '3pm'])
skip_languages: If True, skip non-English man pages
"""
self.extract_dir = Path(extract_dir)
self.extract_dir.mkdir(parents=True, exist_ok=True)
self.skip_sections = skip_sections or []
self.skip_languages = skip_languages
def extract_from_package(self, package: Package) -> List[ManFile]:
"""Extract all man pages from a package.
Args:
package: Package to extract from
Returns:
List of ManFile objects for extracted man pages
"""
if not package.download_path or not package.download_path.exists():
logger.warning(f"Package file not found: {package.name}")
return []
# Create extraction directory for this package
pkg_extract_dir = self.extract_dir / package.name
pkg_extract_dir.mkdir(parents=True, exist_ok=True)
man_files = []
try:
logger.info(f"Extracting man pages from {package.filename}")
with rpmfile.open(package.download_path) as rpm:
for member in rpm.getmembers():
# Check if this is a man page file
if not self._is_manpage(member.name):
continue
# Create ManFile object
extract_path = pkg_extract_dir / member.name.lstrip('/')
man_file = ManFile(
file_path=extract_path,
package_name=package.name
)
# Apply section filtering
if self.skip_sections and man_file.section in self.skip_sections:
logger.debug(f"Skipping {man_file.display_name} (section {man_file.section})")
continue
# Apply language filtering
if self.skip_languages and man_file.language and man_file.language != 'en':
logger.debug(f"Skipping {man_file.display_name} (language {man_file.language})")
continue
# Extract the file
extract_path.parent.mkdir(parents=True, exist_ok=True)
try:
content = rpm.extractfile(member).read()
with open(extract_path, 'wb') as f:
f.write(content)
man_file.content = content
man_files.append(man_file)
except Exception as e:
logger.warning(f"Failed to extract {member.name}: {e}")
logger.info(f"Extracted {len(man_files)} man pages from {package.name}")
except Exception as e:
logger.error(f"Error extracting from {package.filename}: {e}")
return man_files
def extract_from_packages(
self,
packages: List[Package],
max_workers: int = 5
) -> List[ManFile]:
"""Extract man pages from multiple packages in parallel.
Args:
packages: List of packages to process
max_workers: Maximum number of parallel extractions
Returns:
List of all extracted ManFile objects
"""
all_man_files = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all extraction tasks
future_to_pkg = {
executor.submit(self.extract_from_package, pkg): pkg
for pkg in packages
}
# Collect results
for future in as_completed(future_to_pkg):
pkg = future_to_pkg[future]
try:
man_files = future.result()
all_man_files.extend(man_files)
except Exception as e:
logger.error(f"Error processing {pkg.name}: {e}")
logger.info(f"Extracted total of {len(all_man_files)} man pages from {len(packages)} packages")
return all_man_files
def read_manpage_content(self, man_file: ManFile) -> str:
"""Read and decompress man page content.
Args:
man_file: ManFile to read
Returns:
Decompressed man page content as string
"""
if not man_file.file_path.exists():
logger.warning(f"Man page file not found: {man_file.file_path}")
return ""
try:
# Try reading as gzipped file first
if man_file.file_path.suffix == '.gz':
with gzip.open(man_file.file_path, 'rb') as f:
content = f.read()
else:
# Read as plain text
with open(man_file.file_path, 'rb') as f:
content = f.read()
# Decode with error handling
return content.decode('utf-8', errors='replace')
except gzip.BadGzipFile:
# Not a gzip file, try reading as plain text
try:
with open(man_file.file_path, 'rb') as f:
content = f.read()
return content.decode('utf-8', errors='replace')
except Exception as e:
logger.error(f"Error reading {man_file.file_path}: {e}")
return ""
except Exception as e:
logger.error(f"Error reading {man_file.file_path}: {e}")
return ""
@staticmethod
def _is_manpage(path: str) -> bool:
"""Check if a file path is a man page.
Args:
path: File path to check
Returns:
True if this looks like a man page file
"""
# Must contain /man/ in path
if '/man/' not in path:
return False
# Should be in /usr/share/man/ or /usr/man/
if not ('/share/man/' in path or path.startswith('/usr/man/')):
return False
# Common man page patterns
# - /usr/share/man/man1/foo.1.gz
# - /usr/share/man/es/man1/foo.1.gz
# - /usr/share/man/man3/printf.3.gz
parts = path.split('/')
# Check for man<digit> directory
has_man_section = any(
part.startswith('man') and len(part) > 3 and part[3].isdigit()
for part in parts
)
return has_man_section
def cleanup_extracts(self, package: Package):
"""Clean up extracted files for a package.
Args:
package: Package whose extracts to clean up
"""
pkg_extract_dir = self.extract_dir / package.name
if pkg_extract_dir.exists():
import shutil
shutil.rmtree(pkg_extract_dir)
logger.debug(f"Cleaned up extracts for {package.name}")

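Hypothetical usage, assuming pkg is a Package whose download_path points at a local RPM (see RepoManager below) and the rocky_man import path:

from pathlib import Path
from rocky_man.processor import ManPageExtractor

extractor = ManPageExtractor(
    Path("./tmp/extracts/9.6"),
    skip_sections=["3", "3p", "3pm"],
    skip_languages=True,
)
man_files = extractor.extract_from_package(pkg)  # pkg defined elsewhere
for mf in man_files:
    troff = extractor.read_manpage_content(mf)  # decompressed text, '' on error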
src/rocky_man/repo/__init__.py Normal file
@@ -0,0 +1,4 @@
from .manager import RepoManager
from .contents import ContentsParser
__all__ = ["RepoManager", "ContentsParser"]

src/rocky_man/repo/contents.py Normal file
@@ -0,0 +1,221 @@
"""Contents file parser for identifying packages with man pages."""
import gzip
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Set, Dict, Optional
from urllib.parse import urljoin
import requests
logger = logging.getLogger(__name__)
class ContentsParser:
"""Parse repository metadata to identify packages containing man pages.
This is a key optimization - instead of downloading all packages,
we parse the filelists.xml to find only packages with man pages.
"""
def __init__(self, repo_url: str, cache_dir: Path):
"""Initialize the contents parser.
Args:
repo_url: Base URL of the repository (e.g., .../BaseOS/x86_64/os/)
cache_dir: Directory to cache downloaded metadata
"""
self.repo_url = repo_url.rstrip('/') + '/'
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
def get_packages_with_manpages(self) -> Set[str]:
"""Get set of package names that contain man pages.
Returns:
Set of package names (e.g., {'bash', 'coreutils', ...})
"""
logger.info(f"Fetching filelists for {self.repo_url}")
# Download and parse repomd.xml to find filelists location
filelists_path = self._get_filelists_path()
if not filelists_path:
logger.warning("Could not find filelists in repository metadata")
return set()
# Download filelists.xml
filelists_file = self._download_filelists(filelists_path)
if not filelists_file:
logger.warning("Could not download filelists")
return set()
# Parse filelists to find packages with man pages
packages = self._parse_filelists(filelists_file)
logger.info(f"Found {len(packages)} packages with man pages")
return packages
def _get_filelists_path(self) -> Optional[str]:
"""Parse repomd.xml to get the filelists.xml location.
Returns:
Relative path to filelists.xml.gz, or None if not found
"""
repomd_url = urljoin(self.repo_url, 'repodata/repomd.xml')
try:
response = requests.get(repomd_url, timeout=30)
response.raise_for_status()
# Parse XML
root = ET.fromstring(response.content)
# Find filelists entry
# XML structure: <repomd><data type="filelists"><location href="..."/></data></repomd>
ns = {'repo': 'http://linux.duke.edu/metadata/repo'}
for data in root.findall('repo:data', ns):
if data.get('type') == 'filelists':
location = data.find('repo:location', ns)
if location is not None:
return location.get('href')
# Fallback: try without namespace
for data in root.findall('data'):
if data.get('type') == 'filelists':
location = data.find('location')
if location is not None:
return location.get('href')
except Exception as e:
logger.error(f"Error parsing repomd.xml: {e}")
return None
def _download_filelists(self, relative_path: str) -> Optional[Path]:
"""Download filelists.xml.gz file.
Args:
relative_path: Relative path from repo root (e.g., 'repodata/...-filelists.xml.gz')
Returns:
Path to downloaded file, or None on error
"""
url = urljoin(self.repo_url, relative_path)
cache_file = self.cache_dir / relative_path.split('/')[-1]
# Return cached file if it exists
if cache_file.exists():
logger.debug(f"Using cached filelists: {cache_file}")
return cache_file
try:
logger.info(f"Downloading {url}")
response = requests.get(url, timeout=60, stream=True)
response.raise_for_status()
cache_file.parent.mkdir(parents=True, exist_ok=True)
with open(cache_file, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
return cache_file
except Exception as e:
logger.error(f"Error downloading filelists: {e}")
return None
def _parse_filelists(self, filelists_path: Path) -> Set[str]:
"""Parse filelists.xml.gz to find packages with man pages.
Args:
filelists_path: Path to filelists.xml.gz file
Returns:
Set of package names containing man pages
"""
packages = set()
try:
# Open gzipped XML file
with gzip.open(filelists_path, 'rb') as f:
# Use iterparse for memory efficiency (files can be large)
context = ET.iterparse(f, events=('start', 'end'))
current_package = None
has_manpage = False
for event, elem in context:
if event == 'start':
if elem.tag.endswith('package'):
# Get package name from 'name' attribute
current_package = elem.get('name')
has_manpage = False
elif event == 'end':
if elem.tag.endswith('file'):
# Check if file path contains /man/
file_path = elem.text
if file_path and '/man/' in file_path:
# Could be /usr/share/man/ or /usr/man/
if '/share/man/' in file_path or file_path.startswith('/usr/man/'):
has_manpage = True
elif elem.tag.endswith('package'):
# End of package entry
if has_manpage and current_package:
packages.add(current_package)
# Clear element to free memory
elem.clear()
current_package = None
has_manpage = False
except Exception as e:
logger.error(f"Error parsing filelists: {e}")
return packages
def get_package_man_files(self, filelists_path: Path) -> Dict[str, list]:
"""Get detailed list of man files for each package.
Args:
filelists_path: Path to filelists.xml.gz file
Returns:
Dict mapping package name to list of man page paths
"""
packages = {}
try:
with gzip.open(filelists_path, 'rb') as f:
context = ET.iterparse(f, events=('start', 'end'))
current_package = None
current_files = []
for event, elem in context:
if event == 'start':
if elem.tag.endswith('package'):
current_package = elem.get('name')
current_files = []
elif event == 'end':
if elem.tag.endswith('file'):
file_path = elem.text
if file_path and '/share/man/' in file_path:
current_files.append(file_path)
elif elem.tag.endswith('package'):
if current_files and current_package:
packages[current_package] = current_files
elem.clear()
current_package = None
current_files = []
except Exception as e:
logger.error(f"Error parsing filelists: {e}")
return packages

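Standalone usage of the filelists optimization; the mirror URL matches the default config and the cache directory mirrors the layout main.py uses (rocky_man.repo is an assumed import path):

from pathlib import Path
from rocky_man.repo import ContentsParser

parser = ContentsParser(
    "http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/",
    Path("./tmp/downloads/.cache/9.6/BaseOS"),
)
names = parser.get_packages_with_manpages()
print(f"{len(names)} packages ship man pages")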
src/rocky_man/repo/manager.py Normal file
@@ -0,0 +1,237 @@
"""Repository manager for querying and downloading RPM packages."""
import logging
from pathlib import Path
from typing import List, Set, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import dnf
import requests
from ..models import Package
from .contents import ContentsParser
logger = logging.getLogger(__name__)
class RepoManager:
"""Manages Rocky Linux repository operations.
Handles:
- Repository configuration with DNF
- Package discovery and filtering
- Package downloads with progress tracking
"""
def __init__(
self,
repo_url: str,
version: str,
repo_type: str,
arch: str,
cache_dir: Path,
download_dir: Path,
):
"""Initialize repository manager.
Args:
repo_url: Full repository URL
version: Rocky Linux version (e.g., '9.5')
repo_type: Repository type ('BaseOS' or 'AppStream')
arch: Architecture (e.g., 'x86_64')
cache_dir: Directory for caching metadata
download_dir: Directory for downloading packages
"""
self.repo_url = repo_url
self.version = version
self.repo_type = repo_type
self.arch = arch
self.cache_dir = Path(cache_dir)
self.download_dir = Path(download_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
self.download_dir.mkdir(parents=True, exist_ok=True)
# Initialize DNF
self.base = dnf.Base()
self.base.conf.debuglevel = 0
self.base.conf.errorlevel = 0
self.base.conf.cachedir = str(self.cache_dir / "dnf")
self._configure_repo()
self.packages_with_manpages: Optional[Set[str]] = None
def _configure_repo(self):
"""Configure DNF repository."""
repo_id = f"rocky-{self.repo_type.lower()}-{self.version}-{self.arch}"
repo = dnf.repo.Repo(repo_id, self.base.conf)
repo.baseurl = [self.repo_url]
repo.enabled = True
repo.gpgcheck = False # We verify checksums separately
self.base.repos.add(repo)
logger.info(f"Configured repository: {repo_id} at {self.repo_url}")
# Fill the sack (package database)
self.base.fill_sack(load_system_repo=False, load_available_repos=True)
logger.info("Repository metadata loaded")
def discover_packages_with_manpages(self) -> Set[str]:
"""Discover which packages contain man pages using filelists.
This is the key optimization - we parse repository metadata
to identify packages with man pages before downloading anything.
Returns:
Set of package names that contain man pages
"""
if self.packages_with_manpages is not None:
return self.packages_with_manpages
parser = ContentsParser(self.repo_url, self.cache_dir)
self.packages_with_manpages = parser.get_packages_with_manpages()
return self.packages_with_manpages
def list_packages(self, with_manpages_only: bool = True) -> List[Package]:
"""List all packages in the repository.
Args:
with_manpages_only: If True, only return packages with man pages
Returns:
List of Package objects
"""
logger.info(f"Querying packages from {self.repo_type} ({self.version}/{self.arch})")
# Get packages with man pages if filtering
manpage_packages = None
if with_manpages_only:
manpage_packages = self.discover_packages_with_manpages()
logger.info(f"Filtering to {len(manpage_packages)} packages with man pages")
packages = []
# Query all available packages
query = self.base.sack.query().available()
# For each package name, get only one arch (prefer noarch, then our target arch)
seen_names = set()
for pkg in query:
pkg_name = pkg.name
# Skip if we've already added this package
if pkg_name in seen_names:
continue
# Skip if filtering and package doesn't have man pages
if manpage_packages and pkg_name not in manpage_packages:
continue
# Get repo information
repo = pkg.repo
baseurl = repo.baseurl[0] if repo and repo.baseurl else self.repo_url
# Create Package object
package = Package(
name=pkg_name,
version=pkg.version,
release=pkg.release,
arch=pkg.arch,
repo_type=self.repo_type,
location=pkg.location,
baseurl=baseurl,
checksum=pkg.chksum[1] if pkg.chksum else "", # chksum is (type, value)
checksum_type=pkg.chksum[0] if pkg.chksum else "sha256",
has_manpages=bool(manpage_packages),
)
packages.append(package)
seen_names.add(pkg_name)
logger.info(f"Found {len(packages)} packages to process")
return sorted(packages) # Sort by name for consistent ordering
def download_package(self, package: Package) -> bool:
"""Download a single package.
Args:
package: Package to download
Returns:
True if download successful, False otherwise
"""
download_path = self.download_dir / package.filename
package.download_path = download_path
# Skip if already downloaded
if download_path.exists():
logger.debug(f"Package already downloaded: {package.filename}")
return True
try:
logger.info(f"Downloading {package.filename}")
response = requests.get(package.download_url, timeout=300, stream=True)
response.raise_for_status()
# Download with progress (optional: could add progress bar here)
with open(download_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
logger.debug(f"Downloaded: {package.filename}")
return True
except Exception as e:
logger.error(f"Error downloading {package.filename}: {e}")
# Clean up partial download
if download_path.exists():
download_path.unlink()
return False
def download_packages(
self,
packages: List[Package],
max_workers: int = 5
) -> List[Package]:
"""Download multiple packages in parallel.
Args:
packages: List of packages to download
max_workers: Maximum number of parallel downloads
Returns:
List of successfully downloaded packages
"""
downloaded = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all download tasks
future_to_pkg = {
executor.submit(self.download_package, pkg): pkg
for pkg in packages
}
# Process completed downloads
for future in as_completed(future_to_pkg):
pkg = future_to_pkg[future]
try:
if future.result():
downloaded.append(pkg)
except Exception as e:
logger.error(f"Error processing {pkg.name}: {e}")
logger.info(f"Successfully downloaded {len(downloaded)}/{len(packages)} packages")
return downloaded
def cleanup_package(self, package: Package):
"""Delete a downloaded package file.
Args:
package: Package to clean up
"""
if package.download_path and package.download_path.exists():
package.download_path.unlink()
logger.debug(f"Deleted: {package.filename}")

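A usage sketch, assuming python3-dnf is installed (dnf is a system binding, not a pip package) and the rocky_man import path:

from pathlib import Path
from rocky_man.repo import RepoManager

manager = RepoManager(
    repo_url="http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/",
    version="9.6",
    repo_type="BaseOS",
    arch="x86_64",
    cache_dir=Path("./tmp/downloads/.cache/9.6/BaseOS"),
    download_dir=Path("./tmp/downloads/9.6"),
)
packages = manager.list_packages(with_manpages_only=True)
downloaded = manager.download_packages(packages[:10], max_workers=5)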
src/rocky_man/utils/__init__.py Normal file
@@ -0,0 +1,3 @@
from .config import Config
__all__ = ["Config"]

src/rocky_man/utils/config.py Normal file
@@ -0,0 +1,110 @@
"""Configuration management for Rocky Man."""
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
@dataclass
class Config:
"""Configuration for Rocky Man page generation.
Attributes:
base_url: Base URL for Rocky Linux mirror
content_dir: Content directory path (usually 'pub/rocky')
versions: List of Rocky Linux versions to process (e.g., ['8.10', '9.5'])
architectures: List of architectures to consider (we'll pick one)
repo_types: Repository types to process (e.g., ['BaseOS', 'AppStream'])
download_dir: Directory for downloading RPM packages
extract_dir: Directory for extracting man pages
output_dir: Directory for generated HTML files
keep_rpms: Whether to keep downloaded RPM files after processing
keep_extracts: Whether to keep extracted man files after processing
parallel_downloads: Number of parallel downloads
parallel_conversions: Number of parallel HTML conversions
skip_sections: Man page sections to skip (defaults to ['3', '3p', '3pm'])
skip_packages: Package names to skip (defaults to a few high-volume API doc packages)
skip_languages: Whether to skip non-English man pages
allow_all_sections: If True, include every section (overrides skip_sections)
"""
# Repository configuration
base_url: str = "http://dl.rockylinux.org/"
content_dir: str = "pub/rocky"
versions: Optional[List[str]] = None
architectures: Optional[List[str]] = None
repo_types: Optional[List[str]] = None
# Directory configuration
download_dir: Path = Path("/data/tmp/downloads")
extract_dir: Path = Path("/data/tmp/extracts")
output_dir: Path = Path("/data/html")
# Cleanup options
keep_rpms: bool = False
keep_extracts: bool = False
# Performance options
parallel_downloads: int = 5
parallel_conversions: int = 10
# Filtering options
skip_sections: Optional[List[str]] = None
skip_packages: Optional[List[str]] = None
skip_languages: bool = True # Skip non-English languages by default
allow_all_sections: bool = False # Override skip_sections if True
def __post_init__(self):
"""Set defaults and ensure directories exist."""
if self.versions is None:
self.versions = ["8.10", "9.6", "10.0"]
if self.architectures is None:
# Man pages are arch-independent, so we just need one
# We prefer x86_64 as it's most common, fallback to others
self.architectures = ["x86_64", "aarch64", "ppc64le", "s390x"]
if self.repo_types is None:
self.repo_types = ["BaseOS", "AppStream"]
# Set default skip sections (man3 library APIs)
if self.skip_sections is None and not self.allow_all_sections:
self.skip_sections = ["3", "3p", "3pm"]
elif self.allow_all_sections:
self.skip_sections = []
# Set default skip packages (high-volume API docs)
if self.skip_packages is None:
self.skip_packages = [
"lapack",
"dpdk-devel",
"gl-manpages",
]
# Ensure all paths are Path objects
self.download_dir = Path(self.download_dir)
self.extract_dir = Path(self.extract_dir)
self.output_dir = Path(self.output_dir)
def get_repo_url(self, version: str, repo_type: str, arch: str) -> str:
"""Construct repository URL for given parameters.
Args:
version: Rocky Linux version (e.g., '9.5')
repo_type: Repository type ('BaseOS' or 'AppStream')
arch: Architecture (e.g., 'x86_64')
Returns:
Full repository URL
"""
url = self.base_url.rstrip('/')
path = f"{self.content_dir}/{version}/{repo_type}/{arch}/os"
return f"{url}/{path}/"
def get_version_output_dir(self, version: str) -> Path:
"""Get output directory for a specific version."""
return self.output_dir / version
def get_version_download_dir(self, version: str) -> Path:
"""Get download directory for a specific version."""
return self.download_dir / version
def get_version_extract_dir(self, version: str) -> Path:
"""Get extract directory for a specific version."""
return self.extract_dir / version

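URL construction is deterministic from the defaults; a quick example (instantiating Config has no side effects beyond setting defaults and normalizing paths; rocky_man.utils is an assumed import path):

from rocky_man.utils import Config

config = Config()
url = config.get_repo_url("9.6", "BaseOS", "x86_64")
# url == 'http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/'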
src/rocky_man/web/__init__.py Normal file
@@ -0,0 +1,3 @@
from .generator import WebGenerator
__all__ = ["WebGenerator"]

src/rocky_man/web/generator.py Normal file
@@ -0,0 +1,297 @@
"""Web page generator for Rocky Man."""
import gzip
import json
import logging
from pathlib import Path
from typing import List, Dict, Any
from jinja2 import Environment, FileSystemLoader, select_autoescape
from ..models import ManFile
logger = logging.getLogger(__name__)
class WebGenerator:
"""Generates web pages and search index for Rocky Man.
Handles:
- Generating index/search page
- Wrapping man page HTML in templates
- Creating search index JSON
"""
def __init__(self, template_dir: Path, output_dir: Path):
"""Initialize web generator.
Args:
template_dir: Directory containing Jinja2 templates
output_dir: Directory for HTML output
"""
self.template_dir = Path(template_dir)
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
# Setup Jinja2 environment
self.env = Environment(
loader=FileSystemLoader(str(self.template_dir)),
autoescape=select_autoescape(['html', 'xml'])
)
def generate_manpage_html(self, man_file: ManFile, version: str) -> bool:
"""Generate complete HTML page for a man page.
Args:
man_file: ManFile with html_content already set
version: Rocky Linux version
Returns:
True if successful
"""
if not man_file.html_content:
logger.warning(f"No HTML content for {man_file.display_name}")
return False
try:
template = self.env.get_template('manpage.html')
html = template.render(
title=f"{man_file.display_name} - {man_file.package_name} - Rocky Linux {version}",
header_title=man_file.display_name,
package_name=man_file.package_name,
version=version,
section=man_file.section,
language=man_file.language or 'en',
content=man_file.html_content
)
# Ensure output path is set
if not man_file.html_path:
man_file.html_path = self._get_manpage_path(man_file, version)
man_file.html_path.parent.mkdir(parents=True, exist_ok=True)
with open(man_file.html_path, 'w', encoding='utf-8') as f:
f.write(html)
return True
except Exception as e:
logger.error(f"Error generating HTML for {man_file.display_name}: {e}")
return False
def generate_index(self, version: str, search_data: Dict[str, Any]) -> bool:
"""Generate search/index page for a version.
Args:
version: Rocky Linux version
search_data: Search index data
Returns:
True if successful
"""
try:
template = self.env.get_template('index.html')
html = template.render(
title=f"Rocky Linux {version} Man Pages",
version=version,
total_pages=sum(len(pages) for pages in search_data.values()),
packages=sorted(search_data.keys())
)
index_path = self.output_dir / version / 'index.html'
index_path.parent.mkdir(parents=True, exist_ok=True)
with open(index_path, 'w', encoding='utf-8') as f:
f.write(html)
logger.info(f"Generated index for version {version}")
return True
except Exception as e:
logger.error(f"Error generating index for {version}: {e}")
return False
def generate_packages_index(self, version: str, search_data: Dict[str, Any]) -> bool:
"""Generate full packages index page.
Args:
version: Rocky Linux version
search_data: Search index data
Returns:
True if successful
"""
try:
# Group packages by first letter
packages_by_letter = {}
for pkg_name, pages in search_data.items():
first_char = pkg_name[0].upper()
if not first_char.isalpha():
first_char = 'other'
if first_char not in packages_by_letter:
packages_by_letter[first_char] = []
packages_by_letter[first_char].append({
'name': pkg_name,
'count': len(pages)
})
# Sort packages within each letter
for letter in packages_by_letter:
packages_by_letter[letter].sort(key=lambda x: x['name'])
template = self.env.get_template('packages.html')
html = template.render(
title=f"All Packages - Rocky Linux {version}",
version=version,
total_packages=len(search_data),
packages_by_letter=packages_by_letter
)
output_path = self.output_dir / version / 'packages.html'
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(html)
logger.info(f"Generated packages index for version {version}")
return True
except Exception as e:
logger.error(f"Error generating packages index for {version}: {e}")
return False
def generate_search_index(
self,
man_files: List[ManFile],
version: str
) -> Dict[str, Any]:
"""Generate search index from man files.
Args:
man_files: List of ManFile objects
version: Rocky Linux version
Returns:
Search index dictionary
"""
index = {}
for man_file in man_files:
pkg_name = man_file.package_name
if pkg_name not in index:
index[pkg_name] = {}
# Create entry for this man page
entry = {
'name': man_file.name,
'section': man_file.section,
'display_name': man_file.display_name,
'language': man_file.language or 'en',
'url': man_file.uri_path,
'full_name': f"{man_file.package_name} - {man_file.display_name}"
}
# Use display name as key (handles duplicates with different sections)
key = man_file.display_name
if man_file.language:
key = f"{key}.{man_file.language}"
index[pkg_name][key] = entry
return index
def save_search_index(self, index: Dict[str, Any], version: str) -> bool:
"""Save search index as JSON (both plain and gzipped).
Args:
index: Search index dictionary
version: Rocky Linux version
Returns:
True if successful
"""
try:
version_dir = self.output_dir / version
version_dir.mkdir(parents=True, exist_ok=True)
json_path = version_dir / 'search.json'
gz_path = version_dir / 'search.json.gz'
# Sort for consistency
sorted_index = {k: index[k] for k in sorted(index)}
# Save plain JSON
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(sorted_index, f, indent=2)
# Save gzipped JSON
with gzip.open(gz_path, 'wt', encoding='utf-8') as f:
json.dump(sorted_index, f)
logger.info(f"Saved search index for {version} ({len(index)} packages)")
return True
except Exception as e:
logger.error(f"Error saving search index: {e}")
return False
def _get_manpage_path(self, man_file: ManFile, version: str) -> Path:
"""Get output path for a man page HTML file.
Args:
man_file: ManFile object
version: Rocky Linux version
Returns:
Path for HTML file
"""
version_dir = self.output_dir / version
pkg_dir = version_dir / man_file.package_name
section_dir = pkg_dir / f"man{man_file.section}"
return section_dir / man_file.html_filename
def generate_root_index(self, versions: List[str]) -> bool:
"""Generate root index page linking to all versions.
Args:
versions: List of Rocky Linux versions
Returns:
True if successful
"""
try:
template = self.env.get_template('root.html')
# Sort versions numerically (e.g., 8.10, 9.6, 10.0)
def version_key(v):
try:
parts = v.split('.')
return tuple(int(p) for p in parts)
except (ValueError, AttributeError):
return (0, 0)
html = template.render(
title="Rocky Linux Man Pages",
versions=sorted(versions, key=version_key)
)
index_path = self.output_dir / 'index.html'
with open(index_path, 'w', encoding='utf-8') as f:
f.write(html)
logger.info("Generated root index page")
return True
except Exception as e:
logger.error(f"Error generating root index: {e}")
return False
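For reference, generate_search_index produces a nested mapping keyed by package name and then by display name; an illustrative single-package slice of the resulting search.json, written as a Python literal:

index = {
    "bash": {
        "bash(1)": {
            "name": "bash",
            "section": "1",
            "display_name": "bash(1)",
            "language": "en",
            "url": "bash/man1/bash.1.html",
            "full_name": "bash - bash(1)",
        }
    }
}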