CUSP-1256 (#1)

* Complete refactor

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>

* Complete refactor

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>

---------

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
Stephen Simpson committed 2025-11-20 12:16:33 -05:00 (committed by GitHub)
parent 5248edad62
commit ec32c72363
44 changed files with 4083 additions and 1540 deletions

src/__init__.py Normal file
@@ -0,0 +1,5 @@
from .utils.config import Config
__version__ = "0.1.0"
__all__ = ["Config"]

src/rocky_man/main.py Normal file
@@ -0,0 +1,377 @@
"""Main entry point for Rocky Man."""
import argparse
import logging
import sys
from pathlib import Path
from .utils.config import Config
from .repo import RepoManager
from .processor import ManPageExtractor, ManPageConverter
from .web import WebGenerator
def setup_logging(verbose: bool = False):
"""Configure logging."""
level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(
level=level,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
def process_version(
config: Config,
version: str,
template_dir: Path
) -> bool:
"""Process a single Rocky Linux version.
Args:
config: Configuration object
version: Rocky Linux version to process
template_dir: Path to templates directory
Returns:
True if successful
"""
logger = logging.getLogger(__name__)
logger.info(f"Processing Rocky Linux {version}")
# Setup directories for this version
version_download_dir = config.get_version_download_dir(version)
version_extract_dir = config.get_version_extract_dir(version)
version_output_dir = config.get_version_output_dir(version)
all_man_files = []
# Process each repository type
for repo_type in config.repo_types:
logger.info(f"Processing {repo_type} repository")
# Use first available architecture (man pages are arch-independent)
arch = config.architectures[0]
# Get repository URL
repo_url = config.get_repo_url(version, repo_type, arch)
# Create cache dir for this repo
cache_dir = config.download_dir / f".cache/{version}/{repo_type}"
try:
# Initialize repository manager
repo_manager = RepoManager(
repo_url=repo_url,
version=version,
repo_type=repo_type,
arch=arch,
cache_dir=cache_dir,
download_dir=version_download_dir
)
# List packages (with man pages only)
packages = repo_manager.list_packages(with_manpages_only=True)
if not packages:
logger.warning(f"No packages found in {repo_type}")
continue
logger.info(f"Found {len(packages)} packages with man pages in {repo_type}")
# Filter out packages that should be skipped
if config.skip_packages:
original_count = len(packages)
packages = [
pkg for pkg in packages
if pkg.name not in config.skip_packages
]
filtered_count = original_count - len(packages)
if filtered_count > 0:
logger.info(f"Filtered out {filtered_count} packages based on skip list")
logger.info(f"Processing {len(packages)} packages")
# Download packages
logger.info("Downloading packages...")
downloaded = repo_manager.download_packages(
packages,
max_workers=config.parallel_downloads
)
# Extract man pages
logger.info("Extracting man pages...")
extractor = ManPageExtractor(
version_extract_dir,
skip_sections=config.skip_sections,
skip_languages=config.skip_languages
)
man_files = extractor.extract_from_packages(
downloaded,
max_workers=config.parallel_downloads
)
logger.info(f"Extracted {len(man_files)} man pages")
# Read content for each man file
logger.info("Reading man page content...")
man_files_with_content = []
for man_file in man_files:
content = extractor.read_manpage_content(man_file)
if content:
man_files_with_content.append((man_file, content))
# Convert to HTML
logger.info("Converting man pages to HTML...")
converter = ManPageConverter(version_output_dir)
converted = converter.convert_many(
man_files_with_content,
max_workers=config.parallel_conversions
)
all_man_files.extend(converted)
# Cleanup if requested
if not config.keep_rpms:
logger.info("Cleaning up downloaded packages...")
for package in downloaded:
repo_manager.cleanup_package(package)
if not config.keep_extracts:
logger.info("Cleaning up extracted files...")
for package in downloaded:
extractor.cleanup_extracts(package)
except Exception as e:
logger.error(f"Error processing {repo_type}: {e}", exc_info=True)
continue
if not all_man_files:
logger.error(f"No man pages were successfully processed for version {version}")
return False
# Link cross-references between man pages
logger.info("Linking cross-references...")
converter = ManPageConverter(version_output_dir)
converter.link_cross_references(all_man_files)
# Generate web pages
logger.info("Generating web pages...")
web_gen = WebGenerator(template_dir, config.output_dir)
# Generate search index
search_index = web_gen.generate_search_index(all_man_files, version)
web_gen.save_search_index(search_index, version)
# Generate index page
web_gen.generate_index(version, search_index)
# Generate packages index page
web_gen.generate_packages_index(version, search_index)
# Wrap man pages in templates
logger.info("Generating man page HTML...")
for man_file in all_man_files:
web_gen.generate_manpage_html(man_file, version)
logger.info(f"Successfully processed {len(all_man_files)} man pages for Rocky Linux {version}")
return True
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Generate HTML documentation for Rocky Linux man pages'
)
parser.add_argument(
'--versions',
nargs='+',
default=['8.10', '9.6', '10.0'],
help='Rocky Linux versions to process (default: 8.10 9.6 10.0)'
)
parser.add_argument(
'--repo-types',
nargs='+',
default=['BaseOS', 'AppStream'],
help='Repository types to process (default: BaseOS AppStream)'
)
parser.add_argument(
'--output-dir',
type=Path,
default=Path('./html'),
help='Output directory for HTML files (default: ./html)'
)
parser.add_argument(
'--download-dir',
type=Path,
default=Path('./tmp/downloads'),
help='Directory for downloading packages (default: ./tmp/downloads)'
)
parser.add_argument(
'--extract-dir',
type=Path,
default=Path('./tmp/extracts'),
help='Directory for extracting man pages (default: ./tmp/extracts)'
)
parser.add_argument(
'--keep-rpms',
action='store_true',
help='Keep downloaded RPM files after processing'
)
parser.add_argument(
'--keep-extracts',
action='store_true',
help='Keep extracted man files after processing'
)
parser.add_argument(
'--parallel-downloads',
type=int,
default=5,
help='Number of parallel downloads (default: 5)'
)
parser.add_argument(
'--parallel-conversions',
type=int,
default=10,
help='Number of parallel HTML conversions (default: 10)'
)
parser.add_argument(
'--mirror',
default='http://dl.rockylinux.org/',
help='Rocky Linux mirror URL (default: http://dl.rockylinux.org/)'
)
parser.add_argument(
'--template-dir',
type=Path,
default=Path(__file__).parent.parent.parent / 'templates',
help='Template directory (default: <repo root>/templates)'
)
parser.add_argument(
'-v', '--verbose',
action='store_true',
help='Enable verbose logging'
)
parser.add_argument(
'--skip-sections',
nargs='*',
default=None,
help='Man sections to skip (default: 3 3p 3pm). Use empty list to skip none.'
)
parser.add_argument(
'--skip-packages',
nargs='*',
default=None,
help='Package names to skip (default: lapack dpdk-devel gl-manpages). Use empty list to skip none.'
)
parser.add_argument(
'--skip-languages',
action='store_true',
default=None,
help='Skip non-English man pages (default: enabled)'
)
parser.add_argument(
'--keep-languages',
action='store_true',
help='Keep all languages (disables --skip-languages)'
)
parser.add_argument(
'--allow-all-sections',
action='store_true',
help='Include all man sections (overrides --skip-sections)'
)
args = parser.parse_args()
# Setup logging
setup_logging(args.verbose)
logger = logging.getLogger(__name__)
# Handle filtering options
skip_languages = True # default
if args.keep_languages:
skip_languages = False
elif args.skip_languages is not None:
skip_languages = args.skip_languages
# Create configuration
config = Config(
base_url=args.mirror,
versions=args.versions,
repo_types=args.repo_types,
download_dir=args.download_dir,
extract_dir=args.extract_dir,
output_dir=args.output_dir,
keep_rpms=args.keep_rpms,
keep_extracts=args.keep_extracts,
parallel_downloads=args.parallel_downloads,
parallel_conversions=args.parallel_conversions,
skip_sections=args.skip_sections,
skip_packages=args.skip_packages,
skip_languages=skip_languages,
allow_all_sections=args.allow_all_sections
)
logger.info("Rocky Man - Rocky Linux Man Page Generator")
logger.info(f"Versions: {', '.join(config.versions)}")
logger.info(f"Repositories: {', '.join(config.repo_types)}")
logger.info(f"Output directory: {config.output_dir}")
# Log filtering configuration
if config.skip_sections:
logger.info(f"Skipping man sections: {', '.join(config.skip_sections)}")
else:
logger.info("Including all man sections")
if config.skip_packages:
logger.info(f"Skipping packages: {', '.join(config.skip_packages)}")
if config.skip_languages:
logger.info("Skipping non-English languages")
else:
logger.info("Including all languages")
# Process each version
processed_versions = []
for version in config.versions:
try:
if process_version(config, version, args.template_dir):
processed_versions.append(version)
except Exception as e:
logger.error(f"Failed to process version {version}: {e}", exc_info=True)
if not processed_versions:
logger.error("No versions were successfully processed")
return 1
# Generate root index
logger.info("Generating root index page...")
web_gen = WebGenerator(args.template_dir, config.output_dir)
web_gen.generate_root_index(processed_versions)
logger.info("=" * 60)
logger.info("Processing complete!")
logger.info(f"Generated documentation for: {', '.join(processed_versions)}")
logger.info(f"Output directory: {config.output_dir.absolute()}")
logger.info("=" * 60)
return 0
if __name__ == '__main__':
sys.exit(main())

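For reference, the refactored pipeline can also be driven programmatically rather than through the CLI. A minimal sketch, assuming the package is importable as rocky_man (it lives under src/rocky_man/), mandoc is on PATH, and python3-dnf is available; all paths are illustrative:

from pathlib import Path
from rocky_man.main import process_version, setup_logging
from rocky_man.utils.config import Config

setup_logging(verbose=True)
config = Config(
    versions=["9.6"],
    repo_types=["BaseOS"],
    download_dir=Path("./tmp/downloads"),
    extract_dir=Path("./tmp/extracts"),
    output_dir=Path("./html"),
)
# Process one version end to end: download, extract, convert, generate pages
ok = process_version(config, "9.6", template_dir=Path("./templates"))
print("done" if ok else "no man pages processed")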
src/rocky_man/models/__init__.py Normal file
@@ -0,0 +1,6 @@
"""Data models for Rocky Man."""
from .package import Package
from .manfile import ManFile
__all__ = ["Package", "ManFile"]

src/rocky_man/models/manfile.py Normal file
@@ -0,0 +1,130 @@
"""ManFile model representing a man page file."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import re
@dataclass
class ManFile:
"""Represents a man page file extracted from an RPM package.
Attributes:
file_path: Path to the extracted man page file
package_name: Name of the package this man page belongs to
section: Man page section (e.g., '1', '3p', '3pm')
name: Man page name without extension
language: Language code (e.g., 'en', 'es', None for default)
content: Raw man page content (gzipped or plain text)
html_content: Converted HTML content
html_path: Path where HTML file is saved
"""
file_path: Path
package_name: str
section: Optional[str] = None
name: Optional[str] = None
language: Optional[str] = None
content: Optional[bytes] = None
html_content: Optional[str] = None
html_path: Optional[Path] = None
def __post_init__(self):
"""Parse file information from the path."""
self._parse_path()
def _parse_path(self):
"""Extract section, name, and language from the file path.
Example paths:
/usr/share/man/man1/bash.1.gz
/usr/share/man/es/man1/bash.1.gz
/usr/share/man/man3/printf.3.gz
"""
parts = self.file_path.parts
filename = self.file_path.name
# Remove .gz extension if present
if filename.endswith('.gz'):
filename = filename[:-3]
# Extract section from parent directory (e.g., 'man1', 'man3p', 'man3pm')
for part in reversed(parts):
if part.startswith('man') and len(part) > 3:
# Check if it starts with 'man' followed by a digit
if part[3].isdigit():
self.section = part[3:]
break
# Extract section from filename if not found yet (e.g., 'foo.3pm' -> section '3pm')
# and extract name
name_parts = filename.split('.')
if len(name_parts) >= 2:
# Try to identify section from last part
potential_section = name_parts[-1]
# Section is typically digit optionally followed by letters (1, 3p, 3pm, etc.)
if potential_section and potential_section[0].isdigit():
if not self.section:
self.section = potential_section
self.name = '.'.join(name_parts[:-1])
else:
self.name = name_parts[0]
else:
self.name = name_parts[0]
# Check for language subdirectory
# Pattern: /usr/share/man/<lang>/man<section>/
for i, part in enumerate(parts):
if part == 'man' and i + 1 < len(parts):
next_part = parts[i + 1]
# If next part is not a section directory ('man1', 'man3p', ...), it's a language code
if not (next_part.startswith('man') and len(next_part) > 3 and next_part[3].isdigit()):
# Common language codes are 2-5 chars (en, es, pt_BR, etc.)
if len(next_part) <= 5:
self.language = next_part
break
@property
def display_name(self) -> str:
"""Get display name for the man page (e.g., 'bash(1)')."""
return f"{self.name}({self.section})" if self.section else self.name
@property
def html_filename(self) -> str:
"""Get the HTML filename for this man page."""
# Clean name for filesystem safety
safe_name = self._clean_filename(self.name)
suffix = f".{self.language}" if self.language else ""
return f"{safe_name}.{self.section}{suffix}.html"
def _clean_filename(self, name: str) -> str:
"""Clean filename for filesystem safety."""
# Replace problematic characters
name = name.replace('/', '_')
name = name.replace(':', '_')
name = re.sub(r'\.\.', '__', name)
return name
@property
def uri_path(self) -> str:
"""Get the URI path for this man page (relative to version root).
Returns path like: 'bash/man1/bash.1.html'
"""
if not self.html_path:
return ""
# Get path relative to the version directory
# Assuming structure: html/<version>/<package>/<section>/<file>.html
parts = self.html_path.parts
try:
# Find the version part (e.g., '9.5') and return everything after it
for i, part in enumerate(parts):
if re.match(r'\d+\.\d+', part): # Version pattern
return '/'.join(parts[i+1:])
except (ValueError, IndexError):
pass
return str(self.html_path)
def __str__(self):
return f"{self.package_name}: {self.display_name}"

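To illustrate the path-parsing rules above, here is what ManFile derives from a typical localized man page path; the import path rocky_man.models is an assumption about how the package is installed:

from pathlib import Path
from rocky_man.models import ManFile

mf = ManFile(
    file_path=Path("/usr/share/man/es/man1/bash.1.gz"),
    package_name="bash",
)
assert mf.section == "1"             # from the 'man1' directory
assert mf.name == "bash"             # filename minus '.gz' and section suffix
assert mf.language == "es"           # language subdirectory under /usr/share/man/
assert mf.display_name == "bash(1)"
assert mf.html_filename == "bash.1.es.html"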
src/rocky_man/models/package.py Normal file
@@ -0,0 +1,58 @@
"""Package model representing an RPM package."""
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@dataclass
class Package:
"""Represents an RPM package from a Rocky Linux repository.
Attributes:
name: Package name (e.g., 'bash')
version: Package version
release: Package release
arch: Architecture (e.g., 'x86_64', 'noarch')
repo_type: Repository type ('BaseOS' or 'AppStream')
location: Relative path in repo (e.g., 'Packages/b/bash-5.1.8-6.el9.x86_64.rpm')
baseurl: Base URL of the repository
checksum: Package checksum for verification
checksum_type: Type of checksum (e.g., 'sha256')
download_path: Local path where package is downloaded
has_manpages: Whether this package contains man pages
"""
name: str
version: str
release: str
arch: str
repo_type: str
location: str
baseurl: str
checksum: str
checksum_type: str
has_manpages: bool = False
download_path: Optional[Path] = None
@property
def filename(self) -> str:
"""Get the RPM filename from the location."""
return self.location.split("/")[-1]
@property
def download_url(self) -> str:
"""Get the full download URL for this package."""
return f"{self.baseurl.rstrip('/')}/{self.location.lstrip('/')}"
@property
def nvra(self) -> str:
"""Get the Name-Version-Release-Arch identifier."""
return f"{self.name}-{self.version}-{self.release}.{self.arch}"
def __lt__(self, other):
"""Enable sorting packages by name."""
return self.name < other.name
def __str__(self):
return f"{self.nvra} ({self.repo_type})"

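The derived properties compose the RPM filename, download URL, and NVRA identifier from the repository metadata fields; a small sketch using the illustrative values from the docstring (rocky_man.models is an assumed import path):

from rocky_man.models import Package

pkg = Package(
    name="bash",
    version="5.1.8",
    release="6.el9",
    arch="x86_64",
    repo_type="BaseOS",
    location="Packages/b/bash-5.1.8-6.el9.x86_64.rpm",
    baseurl="http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/",
    checksum="",
    checksum_type="sha256",
)
assert pkg.filename == "bash-5.1.8-6.el9.x86_64.rpm"
assert pkg.nvra == "bash-5.1.8-6.el9.x86_64"
assert pkg.download_url == (
    "http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/"
    "Packages/b/bash-5.1.8-6.el9.x86_64.rpm"
)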
src/rocky_man/processor/__init__.py Normal file
@@ -0,0 +1,4 @@
from .extractor import ManPageExtractor
from .converter import ManPageConverter
__all__ = ["ManPageExtractor", "ManPageConverter"]

src/rocky_man/processor/converter.py Normal file
@@ -0,0 +1,292 @@
"""Convert man pages to HTML using mandoc."""
import logging
import re
import subprocess
from pathlib import Path
from typing import List, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from ..models import ManFile
logger = logging.getLogger(__name__)
class ManPageConverter:
"""Converts man pages to HTML using mandoc.
Handles:
- Converting troff to HTML using mandoc
- Cleaning up HTML output
- Parallel conversion of multiple man pages
"""
def __init__(self, output_dir: Path):
"""Initialize converter.
Args:
output_dir: Base directory for HTML output
"""
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
# Check if mandoc is available
if not self._check_mandoc():
raise RuntimeError("mandoc is not installed or not in PATH")
@staticmethod
def _check_mandoc() -> bool:
"""Check if mandoc is available."""
try:
# Run mandoc with no arguments - it will show usage and exit
# We just want to verify the command exists, not that it succeeds
subprocess.run(
['mandoc'],
capture_output=True,
timeout=5
)
return True
except FileNotFoundError:
# mandoc command not found
return False
except Exception:
# Other errors (timeout, etc) - but mandoc exists
return True
def convert(self, man_file: ManFile, content: str) -> bool:
"""Convert a single man page to HTML.
Args:
man_file: ManFile object to convert
content: Raw man page content (troff format)
Returns:
True if conversion successful
"""
try:
# Run mandoc to convert to HTML
html = self._run_mandoc(content)
if not html:
logger.warning(f"mandoc produced no output for {man_file.display_name}")
return False
# Clean up HTML
html = self._clean_html(html)
# Store in ManFile object
man_file.html_content = html
# Determine output path
output_path = self._get_output_path(man_file)
man_file.html_path = output_path
# Save HTML file
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(html)
logger.debug(f"Converted {man_file.display_name} -> {output_path}")
return True
except Exception as e:
logger.error(f"Error converting {man_file.display_name}: {e}")
return False
def convert_many(
self,
man_files: List[Tuple[ManFile, str]],
max_workers: int = 10
) -> List[ManFile]:
"""Convert multiple man pages in parallel.
Args:
man_files: List of (ManFile, content) tuples
max_workers: Maximum number of parallel conversions
Returns:
List of successfully converted ManFile objects
"""
converted = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all conversion tasks
future_to_manfile = {
executor.submit(self.convert, man_file, content): man_file
for man_file, content in man_files
}
# Collect results
for future in as_completed(future_to_manfile):
man_file = future_to_manfile[future]
try:
if future.result():
converted.append(man_file)
except Exception as e:
logger.error(f"Error converting {man_file.display_name}: {e}")
logger.info(f"Converted {len(converted)}/{len(man_files)} man pages to HTML")
return converted
def _run_mandoc(self, content: str) -> Optional[str]:
"""Run mandoc to convert man page to HTML.
Args:
content: Raw man page content
Returns:
HTML output from mandoc, or None on error
"""
try:
result = subprocess.run(
['mandoc', '-T', 'html', '-O', 'fragment,toc'],
input=content.encode('utf-8'),
capture_output=True,
timeout=30
)
if result.returncode != 0:
stderr = result.stderr.decode('utf-8', errors='replace')
logger.warning(f"mandoc returned error: {stderr}")
# Sometimes mandoc returns non-zero but still produces output
if result.stdout:
return result.stdout.decode('utf-8', errors='replace')
return None
return result.stdout.decode('utf-8', errors='replace')
except subprocess.TimeoutExpired:
logger.error("mandoc conversion timed out")
return None
except Exception as e:
logger.error(f"Error running mandoc: {e}")
return None
def _clean_html(self, html: str) -> str:
"""Clean up mandoc HTML output.
Args:
html: Raw HTML from mandoc
Returns:
Cleaned HTML
"""
# Remove empty parentheses in header cells
html = re.sub(
r'<td class="head-ltitle">\(\)</td>',
'<td class="head-ltitle"></td>',
html
)
html = re.sub(
r'<td class="head-rtitle">\(\)</td>',
'<td class="head-rtitle"></td>',
html
)
# Strip leading/trailing whitespace
html = html.strip()
return html
def link_cross_references(self, man_files: List[ManFile]) -> None:
"""Add hyperlinks to cross-references in SEE ALSO sections.
Goes through all converted HTML files and converts man page references
like pty(4) into working hyperlinks.
Args:
man_files: List of all converted ManFile objects
"""
# Build lookup index: (name, section) -> relative_path
lookup = {}
for mf in man_files:
key = (mf.name.lower(), str(mf.section))
if key not in lookup:
# Store the relative path from the version root
lookup[key] = f"{mf.package_name}/man{mf.section}/{mf.html_filename}"
logger.info(f"Linking cross-references across {len(man_files)} man pages...")
# Process each man page HTML file
for man_file in man_files:
if not man_file.html_path or not man_file.html_path.exists():
continue
try:
# Read the HTML
with open(man_file.html_path, 'r', encoding='utf-8') as f:
html = f.read()
# Find and replace man page references
# Mandoc outputs references as: <b>name</b>(section)
# Pattern matches both <b>name</b>(section) and plain name(section)
pattern = r'<b>([\w\-_.]+)</b>\((\d+[a-z]*)\)|\b([\w\-_.]+)\((\d+[a-z]*)\)'
def replace_reference(match):
full_match = match.group(0)
# Check if this match is already inside an <a> tag
# Look back up to 500 chars for context
before_text = html[max(0, match.start()-500):match.start()]
# Find the last <a and last </a> before this match
last_open = before_text.rfind('<a ')
last_close = before_text.rfind('</a>')
# If the last <a> is after the last </a>, we're inside a link
if last_open > last_close:
return full_match
if match.group(1): # <b>name</b>(section) format
name = match.group(1).lower()
section = match.group(2)
else: # plain name(section) format
name = match.group(3).lower()
section = match.group(4)
# Look up the referenced man page
key = (name, section)
if key in lookup:
# Calculate relative path from current file to target
target_path = lookup[key]
# File structure: output_dir/version/package_name/manN/file.html
# Relative links resolve from the current file's directory
# (package_name/manN/), so go up two levels to reach the version root
# Current: package_name/manN/file.html
# Target: other_package/manM/file.html
rel_path = f"../../{target_path}"
return f'<a href="{rel_path}">{full_match}</a>'
return full_match
updated_html = re.sub(pattern, replace_reference, html)
# Only write if something changed
if updated_html != html:
with open(man_file.html_path, 'w', encoding='utf-8') as f:
f.write(updated_html)
except Exception as e:
logger.warning(f"Error linking references in {man_file.display_name}: {e}")
logger.info("Cross-reference linking complete")
def _get_output_path(self, man_file: ManFile) -> Path:
"""Determine output path for HTML file.
Structure: output_dir/<package>/<section>/<name>.<section>[.<lang>].html
Args:
man_file: ManFile object
Returns:
Path for HTML output
"""
# Package directory
pkg_dir = self.output_dir / man_file.package_name
# Section directory (man1, man2, etc.)
section_dir = pkg_dir / f"man{man_file.section}"
# HTML filename
filename = man_file.html_filename
return section_dir / filename

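A minimal conversion sketch, assuming mandoc is installed and the package is importable as rocky_man; the troff source is a toy page for illustration:

from pathlib import Path
from rocky_man.models import ManFile
from rocky_man.processor import ManPageConverter

troff = ".TH DEMO 1\n.SH NAME\ndemo \\- a toy man page\n"
mf = ManFile(file_path=Path("/usr/share/man/man1/demo.1.gz"), package_name="demo")
converter = ManPageConverter(Path("./html/9.6"))
converted = converter.convert_many([(mf, troff)], max_workers=1)
# On success, mf.html_path is ./html/9.6/demo/man1/demo.1.html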
src/rocky_man/processor/extractor.py Normal file
@@ -0,0 +1,222 @@
"""Extract man pages from RPM packages."""
import gzip
import logging
from pathlib import Path
from typing import List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import rpmfile
from ..models import Package, ManFile
logger = logging.getLogger(__name__)
class ManPageExtractor:
"""Extracts man pages from RPM packages.
Handles:
- Extracting man pages from RPMs
- Reading gzipped man page content
- Organizing extracted files by package
"""
def __init__(self, extract_dir: Path, skip_sections: Optional[List[str]] = None, skip_languages: bool = True):
"""Initialize extractor.
Args:
extract_dir: Base directory for extracting man pages
skip_sections: List of man sections to skip (e.g., ['3', '3p', '3pm'])
skip_languages: If True, skip non-English man pages
"""
self.extract_dir = Path(extract_dir)
self.extract_dir.mkdir(parents=True, exist_ok=True)
self.skip_sections = skip_sections or []
self.skip_languages = skip_languages
def extract_from_package(self, package: Package) -> List[ManFile]:
"""Extract all man pages from a package.
Args:
package: Package to extract from
Returns:
List of ManFile objects for extracted man pages
"""
if not package.download_path or not package.download_path.exists():
logger.warning(f"Package file not found: {package.name}")
return []
# Create extraction directory for this package
pkg_extract_dir = self.extract_dir / package.name
pkg_extract_dir.mkdir(parents=True, exist_ok=True)
man_files = []
try:
logger.info(f"Extracting man pages from {package.filename}")
with rpmfile.open(package.download_path) as rpm:
for member in rpm.getmembers():
# Check if this is a man page file
if not self._is_manpage(member.name):
continue
# Create ManFile object
extract_path = pkg_extract_dir / member.name.lstrip('/')
man_file = ManFile(
file_path=extract_path,
package_name=package.name
)
# Apply section filtering
if self.skip_sections and man_file.section in self.skip_sections:
logger.debug(f"Skipping {man_file.display_name} (section {man_file.section})")
continue
# Apply language filtering
if self.skip_languages and man_file.language and man_file.language != 'en':
logger.debug(f"Skipping {man_file.display_name} (language {man_file.language})")
continue
# Extract the file
extract_path.parent.mkdir(parents=True, exist_ok=True)
try:
content = rpm.extractfile(member).read()
with open(extract_path, 'wb') as f:
f.write(content)
man_file.content = content
man_files.append(man_file)
except Exception as e:
logger.warning(f"Failed to extract {member.name}: {e}")
logger.info(f"Extracted {len(man_files)} man pages from {package.name}")
except Exception as e:
logger.error(f"Error extracting from {package.filename}: {e}")
return man_files
def extract_from_packages(
self,
packages: List[Package],
max_workers: int = 5
) -> List[ManFile]:
"""Extract man pages from multiple packages in parallel.
Args:
packages: List of packages to process
max_workers: Maximum number of parallel extractions
Returns:
List of all extracted ManFile objects
"""
all_man_files = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all extraction tasks
future_to_pkg = {
executor.submit(self.extract_from_package, pkg): pkg
for pkg in packages
}
# Collect results
for future in as_completed(future_to_pkg):
pkg = future_to_pkg[future]
try:
man_files = future.result()
all_man_files.extend(man_files)
except Exception as e:
logger.error(f"Error processing {pkg.name}: {e}")
logger.info(f"Extracted total of {len(all_man_files)} man pages from {len(packages)} packages")
return all_man_files
def read_manpage_content(self, man_file: ManFile) -> str:
"""Read and decompress man page content.
Args:
man_file: ManFile to read
Returns:
Decompressed man page content as string
"""
if not man_file.file_path.exists():
logger.warning(f"Man page file not found: {man_file.file_path}")
return ""
try:
# Try reading as gzipped file first
if man_file.file_path.suffix == '.gz':
with gzip.open(man_file.file_path, 'rb') as f:
content = f.read()
else:
# Read as plain text
with open(man_file.file_path, 'rb') as f:
content = f.read()
# Decode with error handling
return content.decode('utf-8', errors='replace')
except gzip.BadGzipFile:
# Not a gzip file, try reading as plain text
try:
with open(man_file.file_path, 'rb') as f:
content = f.read()
return content.decode('utf-8', errors='replace')
except Exception as e:
logger.error(f"Error reading {man_file.file_path}: {e}")
return ""
except Exception as e:
logger.error(f"Error reading {man_file.file_path}: {e}")
return ""
@staticmethod
def _is_manpage(path: str) -> bool:
"""Check if a file path is a man page.
Args:
path: File path to check
Returns:
True if this looks like a man page file
"""
# Must contain /man/ in path
if '/man/' not in path:
return False
# Should be in /usr/share/man/ or /usr/man/
if not ('/share/man/' in path or path.startswith('/usr/man/')):
return False
# Common man page patterns
# - /usr/share/man/man1/foo.1.gz
# - /usr/share/man/es/man1/foo.1.gz
# - /usr/share/man/man3/printf.3.gz
parts = path.split('/')
# Check for man<digit> directory
has_man_section = any(
part.startswith('man') and len(part) > 3 and part[3].isdigit()
for part in parts
)
return has_man_section
def cleanup_extracts(self, package: Package):
"""Clean up extracted files for a package.
Args:
package: Package whose extracts to clean up
"""
pkg_extract_dir = self.extract_dir / package.name
if pkg_extract_dir.exists():
import shutil
shutil.rmtree(pkg_extract_dir)
logger.debug(f"Cleaned up extracts for {package.name}")

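Hypothetical usage, assuming pkg is a Package whose download_path points at a local RPM (see RepoManager below) and the rocky_man import path:

from pathlib import Path
from rocky_man.processor import ManPageExtractor

extractor = ManPageExtractor(
    Path("./tmp/extracts/9.6"),
    skip_sections=["3", "3p", "3pm"],
    skip_languages=True,
)
man_files = extractor.extract_from_package(pkg)  # pkg defined elsewhere
for mf in man_files:
    troff = extractor.read_manpage_content(mf)  # decompressed text, '' on error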
src/rocky_man/repo/__init__.py Normal file
@@ -0,0 +1,4 @@
from .manager import RepoManager
from .contents import ContentsParser
__all__ = ["RepoManager", "ContentsParser"]

src/rocky_man/repo/contents.py Normal file
@@ -0,0 +1,221 @@
"""Contents file parser for identifying packages with man pages."""
import gzip
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Set, Dict, Optional
from urllib.parse import urljoin
import requests
logger = logging.getLogger(__name__)
class ContentsParser:
"""Parse repository metadata to identify packages containing man pages.
This is a key optimization - instead of downloading all packages,
we parse the filelists.xml to find only packages with man pages.
"""
def __init__(self, repo_url: str, cache_dir: Path):
"""Initialize the contents parser.
Args:
repo_url: Base URL of the repository (e.g., .../BaseOS/x86_64/os/)
cache_dir: Directory to cache downloaded metadata
"""
self.repo_url = repo_url.rstrip('/') + '/'
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
def get_packages_with_manpages(self) -> Set[str]:
"""Get set of package names that contain man pages.
Returns:
Set of package names (e.g., {'bash', 'coreutils', ...})
"""
logger.info(f"Fetching filelists for {self.repo_url}")
# Download and parse repomd.xml to find filelists location
filelists_path = self._get_filelists_path()
if not filelists_path:
logger.warning("Could not find filelists in repository metadata")
return set()
# Download filelists.xml
filelists_file = self._download_filelists(filelists_path)
if not filelists_file:
logger.warning("Could not download filelists")
return set()
# Parse filelists to find packages with man pages
packages = self._parse_filelists(filelists_file)
logger.info(f"Found {len(packages)} packages with man pages")
return packages
def _get_filelists_path(self) -> Optional[str]:
"""Parse repomd.xml to get the filelists.xml location.
Returns:
Relative path to filelists.xml.gz, or None if not found
"""
repomd_url = urljoin(self.repo_url, 'repodata/repomd.xml')
try:
response = requests.get(repomd_url, timeout=30)
response.raise_for_status()
# Parse XML
root = ET.fromstring(response.content)
# Find filelists entry
# XML structure: <repomd><data type="filelists"><location href="..."/></data></repomd>
ns = {'repo': 'http://linux.duke.edu/metadata/repo'}
for data in root.findall('repo:data', ns):
if data.get('type') == 'filelists':
location = data.find('repo:location', ns)
if location is not None:
return location.get('href')
# Fallback: try without namespace
for data in root.findall('data'):
if data.get('type') == 'filelists':
location = data.find('location')
if location is not None:
return location.get('href')
except Exception as e:
logger.error(f"Error parsing repomd.xml: {e}")
return None
def _download_filelists(self, relative_path: str) -> Optional[Path]:
"""Download filelists.xml.gz file.
Args:
relative_path: Relative path from repo root (e.g., 'repodata/...-filelists.xml.gz')
Returns:
Path to downloaded file, or None on error
"""
url = urljoin(self.repo_url, relative_path)
cache_file = self.cache_dir / relative_path.split('/')[-1]
# Return cached file if it exists
if cache_file.exists():
logger.debug(f"Using cached filelists: {cache_file}")
return cache_file
try:
logger.info(f"Downloading {url}")
response = requests.get(url, timeout=60, stream=True)
response.raise_for_status()
cache_file.parent.mkdir(parents=True, exist_ok=True)
with open(cache_file, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
return cache_file
except Exception as e:
logger.error(f"Error downloading filelists: {e}")
return None
def _parse_filelists(self, filelists_path: Path) -> Set[str]:
"""Parse filelists.xml.gz to find packages with man pages.
Args:
filelists_path: Path to filelists.xml.gz file
Returns:
Set of package names containing man pages
"""
packages = set()
try:
# Open gzipped XML file
with gzip.open(filelists_path, 'rb') as f:
# Use iterparse for memory efficiency (files can be large)
context = ET.iterparse(f, events=('start', 'end'))
current_package = None
has_manpage = False
for event, elem in context:
if event == 'start':
if elem.tag.endswith('package'):
# Get package name from 'name' attribute
current_package = elem.get('name')
has_manpage = False
elif event == 'end':
if elem.tag.endswith('file'):
# Check if file path contains /man/
file_path = elem.text
if file_path and '/man/' in file_path:
# Could be /usr/share/man/ or /usr/man/
if '/share/man/' in file_path or file_path.startswith('/usr/man/'):
has_manpage = True
elif elem.tag.endswith('package'):
# End of package entry
if has_manpage and current_package:
packages.add(current_package)
# Clear element to free memory
elem.clear()
current_package = None
has_manpage = False
except Exception as e:
logger.error(f"Error parsing filelists: {e}")
return packages
def get_package_man_files(self, filelists_path: Path) -> Dict[str, list]:
"""Get detailed list of man files for each package.
Args:
filelists_path: Path to filelists.xml.gz file
Returns:
Dict mapping package name to list of man page paths
"""
packages = {}
try:
with gzip.open(filelists_path, 'rb') as f:
context = ET.iterparse(f, events=('start', 'end'))
current_package = None
current_files = []
for event, elem in context:
if event == 'start':
if elem.tag.endswith('package'):
current_package = elem.get('name')
current_files = []
elif event == 'end':
if elem.tag.endswith('file'):
file_path = elem.text
if file_path and '/share/man/' in file_path:
current_files.append(file_path)
elif elem.tag.endswith('package'):
if current_files and current_package:
packages[current_package] = current_files
elem.clear()
current_package = None
current_files = []
except Exception as e:
logger.error(f"Error parsing filelists: {e}")
return packages

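Standalone usage of the filelists optimization; the mirror URL matches the default config and the cache directory mirrors the layout main.py uses (rocky_man.repo is an assumed import path):

from pathlib import Path
from rocky_man.repo import ContentsParser

parser = ContentsParser(
    "http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/",
    Path("./tmp/downloads/.cache/9.6/BaseOS"),
)
names = parser.get_packages_with_manpages()
print(f"{len(names)} packages ship man pages")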
src/rocky_man/repo/manager.py Normal file
@@ -0,0 +1,237 @@
"""Repository manager for querying and downloading RPM packages."""
import logging
from pathlib import Path
from typing import List, Set, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import dnf
import requests
from ..models import Package
from .contents import ContentsParser
logger = logging.getLogger(__name__)
class RepoManager:
"""Manages Rocky Linux repository operations.
Handles:
- Repository configuration with DNF
- Package discovery and filtering
- Package downloads with progress tracking
"""
def __init__(
self,
repo_url: str,
version: str,
repo_type: str,
arch: str,
cache_dir: Path,
download_dir: Path,
):
"""Initialize repository manager.
Args:
repo_url: Full repository URL
version: Rocky Linux version (e.g., '9.5')
repo_type: Repository type ('BaseOS' or 'AppStream')
arch: Architecture (e.g., 'x86_64')
cache_dir: Directory for caching metadata
download_dir: Directory for downloading packages
"""
self.repo_url = repo_url
self.version = version
self.repo_type = repo_type
self.arch = arch
self.cache_dir = Path(cache_dir)
self.download_dir = Path(download_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
self.download_dir.mkdir(parents=True, exist_ok=True)
# Initialize DNF
self.base = dnf.Base()
self.base.conf.debuglevel = 0
self.base.conf.errorlevel = 0
self.base.conf.cachedir = str(self.cache_dir / "dnf")
self._configure_repo()
self.packages_with_manpages: Optional[Set[str]] = None
def _configure_repo(self):
"""Configure DNF repository."""
repo_id = f"rocky-{self.repo_type.lower()}-{self.version}-{self.arch}"
repo = dnf.repo.Repo(repo_id, self.base.conf)
repo.baseurl = [self.repo_url]
repo.enabled = True
repo.gpgcheck = False # We verify checksums separately
self.base.repos.add(repo)
logger.info(f"Configured repository: {repo_id} at {self.repo_url}")
# Fill the sack (package database)
self.base.fill_sack(load_system_repo=False, load_available_repos=True)
logger.info("Repository metadata loaded")
def discover_packages_with_manpages(self) -> Set[str]:
"""Discover which packages contain man pages using filelists.
This is the key optimization - we parse repository metadata
to identify packages with man pages before downloading anything.
Returns:
Set of package names that contain man pages
"""
if self.packages_with_manpages is not None:
return self.packages_with_manpages
parser = ContentsParser(self.repo_url, self.cache_dir)
self.packages_with_manpages = parser.get_packages_with_manpages()
return self.packages_with_manpages
def list_packages(self, with_manpages_only: bool = True) -> List[Package]:
"""List all packages in the repository.
Args:
with_manpages_only: If True, only return packages with man pages
Returns:
List of Package objects
"""
logger.info(f"Querying packages from {self.repo_type} ({self.version}/{self.arch})")
# Get packages with man pages if filtering
manpage_packages = None
if with_manpages_only:
manpage_packages = self.discover_packages_with_manpages()
logger.info(f"Filtering to {len(manpage_packages)} packages with man pages")
packages = []
# Query all available packages
query = self.base.sack.query().available()
# For each package name, get only one arch (prefer noarch, then our target arch)
seen_names = set()
for pkg in query:
pkg_name = pkg.name
# Skip if we've already added this package
if pkg_name in seen_names:
continue
# Skip if filtering and package doesn't have man pages
if manpage_packages and pkg_name not in manpage_packages:
continue
# Get repo information
repo = pkg.repo
baseurl = repo.baseurl[0] if repo and repo.baseurl else self.repo_url
# Create Package object
package = Package(
name=pkg_name,
version=pkg.version,
release=pkg.release,
arch=pkg.arch,
repo_type=self.repo_type,
location=pkg.location,
baseurl=baseurl,
checksum=pkg.chksum[1] if pkg.chksum else "", # chksum is (type, value)
checksum_type=pkg.chksum[0] if pkg.chksum else "sha256",
has_manpages=bool(manpage_packages),
)
packages.append(package)
seen_names.add(pkg_name)
logger.info(f"Found {len(packages)} packages to process")
return sorted(packages) # Sort by name for consistent ordering
def download_package(self, package: Package) -> bool:
"""Download a single package.
Args:
package: Package to download
Returns:
True if download successful, False otherwise
"""
download_path = self.download_dir / package.filename
package.download_path = download_path
# Skip if already downloaded
if download_path.exists():
logger.debug(f"Package already downloaded: {package.filename}")
return True
try:
logger.info(f"Downloading {package.filename}")
response = requests.get(package.download_url, timeout=300, stream=True)
response.raise_for_status()
# Download with progress (optional: could add progress bar here)
with open(download_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
logger.debug(f"Downloaded: {package.filename}")
return True
except Exception as e:
logger.error(f"Error downloading {package.filename}: {e}")
# Clean up partial download
if download_path.exists():
download_path.unlink()
return False
def download_packages(
self,
packages: List[Package],
max_workers: int = 5
) -> List[Package]:
"""Download multiple packages in parallel.
Args:
packages: List of packages to download
max_workers: Maximum number of parallel downloads
Returns:
List of successfully downloaded packages
"""
downloaded = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all download tasks
future_to_pkg = {
executor.submit(self.download_package, pkg): pkg
for pkg in packages
}
# Process completed downloads
for future in as_completed(future_to_pkg):
pkg = future_to_pkg[future]
try:
if future.result():
downloaded.append(pkg)
except Exception as e:
logger.error(f"Error processing {pkg.name}: {e}")
logger.info(f"Successfully downloaded {len(downloaded)}/{len(packages)} packages")
return downloaded
def cleanup_package(self, package: Package):
"""Delete a downloaded package file.
Args:
package: Package to clean up
"""
if package.download_path and package.download_path.exists():
package.download_path.unlink()
logger.debug(f"Deleted: {package.filename}")

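A usage sketch, assuming python3-dnf is installed (dnf is a system binding, not a pip package) and the rocky_man import path:

from pathlib import Path
from rocky_man.repo import RepoManager

manager = RepoManager(
    repo_url="http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/",
    version="9.6",
    repo_type="BaseOS",
    arch="x86_64",
    cache_dir=Path("./tmp/downloads/.cache/9.6/BaseOS"),
    download_dir=Path("./tmp/downloads/9.6"),
)
packages = manager.list_packages(with_manpages_only=True)
downloaded = manager.download_packages(packages[:10], max_workers=5)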
src/rocky_man/utils/__init__.py Normal file
@@ -0,0 +1,3 @@
from .config import Config
__all__ = ["Config"]

src/rocky_man/utils/config.py Normal file
@@ -0,0 +1,110 @@
"""Configuration management for Rocky Man."""
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
@dataclass
class Config:
"""Configuration for Rocky Man page generation.
Attributes:
base_url: Base URL for Rocky Linux mirror
content_dir: Content directory path (usually 'pub/rocky')
versions: List of Rocky Linux versions to process (e.g., ['8.10', '9.5'])
architectures: List of architectures to consider (we'll pick one)
repo_types: Repository types to process (e.g., ['BaseOS', 'AppStream'])
download_dir: Directory for downloading RPM packages
extract_dir: Directory for extracting man pages
output_dir: Directory for generated HTML files
keep_rpms: Whether to keep downloaded RPM files after processing
keep_extracts: Whether to keep extracted man files after processing
parallel_downloads: Number of parallel downloads
parallel_conversions: Number of parallel HTML conversions
skip_sections: Man page sections to skip (defaults to ['3', '3p', '3pm'])
skip_packages: Package names to skip (defaults to a few high-volume API doc packages)
skip_languages: Whether to skip non-English man pages
allow_all_sections: If True, include every section (overrides skip_sections)
"""
# Repository configuration
base_url: str = "http://dl.rockylinux.org/"
content_dir: str = "pub/rocky"
versions: Optional[List[str]] = None
architectures: Optional[List[str]] = None
repo_types: Optional[List[str]] = None
# Directory configuration
download_dir: Path = Path("/data/tmp/downloads")
extract_dir: Path = Path("/data/tmp/extracts")
output_dir: Path = Path("/data/html")
# Cleanup options
keep_rpms: bool = False
keep_extracts: bool = False
# Performance options
parallel_downloads: int = 5
parallel_conversions: int = 10
# Filtering options
skip_sections: Optional[List[str]] = None
skip_packages: Optional[List[str]] = None
skip_languages: bool = True # Skip non-English languages by default
allow_all_sections: bool = False # Override skip_sections if True
def __post_init__(self):
"""Set defaults and ensure directories exist."""
if self.versions is None:
self.versions = ["8.10", "9.6", "10.0"]
if self.architectures is None:
# Man pages are arch-independent, so we just need one
# We prefer x86_64 as it's most common, fallback to others
self.architectures = ["x86_64", "aarch64", "ppc64le", "s390x"]
if self.repo_types is None:
self.repo_types = ["BaseOS", "AppStream"]
# Set default skip sections (man3 library APIs)
if self.skip_sections is None and not self.allow_all_sections:
self.skip_sections = ["3", "3p", "3pm"]
elif self.allow_all_sections:
self.skip_sections = []
# Set default skip packages (high-volume API docs)
if self.skip_packages is None:
self.skip_packages = [
"lapack",
"dpdk-devel",
"gl-manpages",
]
# Ensure all paths are Path objects
self.download_dir = Path(self.download_dir)
self.extract_dir = Path(self.extract_dir)
self.output_dir = Path(self.output_dir)
def get_repo_url(self, version: str, repo_type: str, arch: str) -> str:
"""Construct repository URL for given parameters.
Args:
version: Rocky Linux version (e.g., '9.5')
repo_type: Repository type ('BaseOS' or 'AppStream')
arch: Architecture (e.g., 'x86_64')
Returns:
Full repository URL
"""
url = self.base_url.rstrip('/')
path = f"{self.content_dir}/{version}/{repo_type}/{arch}/os"
return f"{url}/{path}/"
def get_version_output_dir(self, version: str) -> Path:
"""Get output directory for a specific version."""
return self.output_dir / version
def get_version_download_dir(self, version: str) -> Path:
"""Get download directory for a specific version."""
return self.download_dir / version
def get_version_extract_dir(self, version: str) -> Path:
"""Get extract directory for a specific version."""
return self.extract_dir / version

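URL construction is deterministic from the defaults; a quick example (instantiating Config has no side effects beyond setting defaults and normalizing paths; rocky_man.utils is an assumed import path):

from rocky_man.utils import Config

config = Config()
url = config.get_repo_url("9.6", "BaseOS", "x86_64")
# url == 'http://dl.rockylinux.org/pub/rocky/9.6/BaseOS/x86_64/os/'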
src/rocky_man/web/__init__.py Normal file
@@ -0,0 +1,3 @@
from .generator import WebGenerator
__all__ = ["WebGenerator"]

src/rocky_man/web/generator.py Normal file
@@ -0,0 +1,297 @@
"""Web page generator for Rocky Man."""
import gzip
import json
import logging
from pathlib import Path
from typing import List, Dict, Any
from jinja2 import Environment, FileSystemLoader, select_autoescape
from ..models import ManFile
logger = logging.getLogger(__name__)
class WebGenerator:
"""Generates web pages and search index for Rocky Man.
Handles:
- Generating index/search page
- Wrapping man page HTML in templates
- Creating search index JSON
"""
def __init__(self, template_dir: Path, output_dir: Path):
"""Initialize web generator.
Args:
template_dir: Directory containing Jinja2 templates
output_dir: Directory for HTML output
"""
self.template_dir = Path(template_dir)
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
# Setup Jinja2 environment
self.env = Environment(
loader=FileSystemLoader(str(self.template_dir)),
autoescape=select_autoescape(['html', 'xml'])
)
def generate_manpage_html(self, man_file: ManFile, version: str) -> bool:
"""Generate complete HTML page for a man page.
Args:
man_file: ManFile with html_content already set
version: Rocky Linux version
Returns:
True if successful
"""
if not man_file.html_content:
logger.warning(f"No HTML content for {man_file.display_name}")
return False
try:
template = self.env.get_template('manpage.html')
html = template.render(
title=f"{man_file.display_name} - {man_file.package_name} - Rocky Linux {version}",
header_title=man_file.display_name,
package_name=man_file.package_name,
version=version,
section=man_file.section,
language=man_file.language or 'en',
content=man_file.html_content
)
# Ensure output path is set
if not man_file.html_path:
man_file.html_path = self._get_manpage_path(man_file, version)
man_file.html_path.parent.mkdir(parents=True, exist_ok=True)
with open(man_file.html_path, 'w', encoding='utf-8') as f:
f.write(html)
return True
except Exception as e:
logger.error(f"Error generating HTML for {man_file.display_name}: {e}")
return False
def generate_index(self, version: str, search_data: Dict[str, Any]) -> bool:
"""Generate search/index page for a version.
Args:
version: Rocky Linux version
search_data: Search index data
Returns:
True if successful
"""
try:
template = self.env.get_template('index.html')
html = template.render(
title=f"Rocky Linux {version} Man Pages",
version=version,
total_pages=sum(len(pages) for pages in search_data.values()),
packages=sorted(search_data.keys())
)
index_path = self.output_dir / version / 'index.html'
index_path.parent.mkdir(parents=True, exist_ok=True)
with open(index_path, 'w', encoding='utf-8') as f:
f.write(html)
logger.info(f"Generated index for version {version}")
return True
except Exception as e:
logger.error(f"Error generating index for {version}: {e}")
return False
def generate_packages_index(self, version: str, search_data: Dict[str, Any]) -> bool:
"""Generate full packages index page.
Args:
version: Rocky Linux version
search_data: Search index data
Returns:
True if successful
"""
try:
# Group packages by first letter
packages_by_letter = {}
for pkg_name, pages in search_data.items():
first_char = pkg_name[0].upper()
if not first_char.isalpha():
first_char = 'other'
if first_char not in packages_by_letter:
packages_by_letter[first_char] = []
packages_by_letter[first_char].append({
'name': pkg_name,
'count': len(pages)
})
# Sort packages within each letter
for letter in packages_by_letter:
packages_by_letter[letter].sort(key=lambda x: x['name'])
template = self.env.get_template('packages.html')
html = template.render(
title=f"All Packages - Rocky Linux {version}",
version=version,
total_packages=len(search_data),
packages_by_letter=packages_by_letter
)
output_path = self.output_dir / version / 'packages.html'
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(html)
logger.info(f"Generated packages index for version {version}")
return True
except Exception as e:
logger.error(f"Error generating packages index for {version}: {e}")
return False
def generate_search_index(
self,
man_files: List[ManFile],
version: str
) -> Dict[str, Any]:
"""Generate search index from man files.
Args:
man_files: List of ManFile objects
version: Rocky Linux version
Returns:
Search index dictionary
"""
index = {}
for man_file in man_files:
pkg_name = man_file.package_name
if pkg_name not in index:
index[pkg_name] = {}
# Create entry for this man page
entry = {
'name': man_file.name,
'section': man_file.section,
'display_name': man_file.display_name,
'language': man_file.language or 'en',
'url': man_file.uri_path,
'full_name': f"{man_file.package_name} - {man_file.display_name}"
}
# Use display name as key (handles duplicates with different sections)
key = man_file.display_name
if man_file.language:
key = f"{key}.{man_file.language}"
index[pkg_name][key] = entry
return index
def save_search_index(self, index: Dict[str, Any], version: str) -> bool:
"""Save search index as JSON (both plain and gzipped).
Args:
index: Search index dictionary
version: Rocky Linux version
Returns:
True if successful
"""
try:
version_dir = self.output_dir / version
version_dir.mkdir(parents=True, exist_ok=True)
json_path = version_dir / 'search.json'
gz_path = version_dir / 'search.json.gz'
# Sort for consistency
sorted_index = {k: index[k] for k in sorted(index)}
# Save plain JSON
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(sorted_index, f, indent=2)
# Save gzipped JSON
with gzip.open(gz_path, 'wt', encoding='utf-8') as f:
json.dump(sorted_index, f)
logger.info(f"Saved search index for {version} ({len(index)} packages)")
return True
except Exception as e:
logger.error(f"Error saving search index: {e}")
return False
def _get_manpage_path(self, man_file: ManFile, version: str) -> Path:
"""Get output path for a man page HTML file.
Args:
man_file: ManFile object
version: Rocky Linux version
Returns:
Path for HTML file
"""
version_dir = self.output_dir / version
pkg_dir = version_dir / man_file.package_name
section_dir = pkg_dir / f"man{man_file.section}"
return section_dir / man_file.html_filename
def generate_root_index(self, versions: List[str]) -> bool:
"""Generate root index page linking to all versions.
Args:
versions: List of Rocky Linux versions
Returns:
True if successful
"""
try:
template = self.env.get_template('root.html')
# Sort versions numerically (e.g., 8.10, 9.6, 10.0)
def version_key(v):
try:
parts = v.split('.')
return tuple(int(p) for p in parts)
except (ValueError, AttributeError):
return (0, 0)
html = template.render(
title="Rocky Linux Man Pages",
versions=sorted(versions, key=version_key)
)
index_path = self.output_dir / 'index.html'
with open(index_path, 'w', encoding='utf-8') as f:
f.write(html)
logger.info("Generated root index page")
return True
except Exception as e:
logger.error(f"Error generating root index: {e}")
return False
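For reference, generate_search_index produces a nested mapping keyed by package name and then by display name; an illustrative single-package slice of the resulting search.json, written as a Python literal:

index = {
    "bash": {
        "bash(1)": {
            "name": "bash",
            "section": "1",
            "display_name": "bash(1)",
            "language": "en",
            "url": "bash/man1/bash.1.html",
            "full_name": "bash - bash(1)",
        }
    }
}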