"""ManFile model representing a man page file.""" from dataclasses import dataclass, field from pathlib import Path from typing import Optional import re @dataclass class ManFile: """Represents a man page file extracted from an RPM package. Attributes: file_path: Path to the extracted man page file package_name: Name of the package this man page belongs to section: Man page section (1-9) name: Man page name without extension language: Language code (e.g., 'en', 'es', None for default) content: Raw man page content (gzipped or plain text) html_content: Converted HTML content html_path: Path where HTML file is saved """ file_path: Path package_name: str section: Optional[str] = None name: Optional[str] = None language: Optional[str] = None content: Optional[bytes] = None html_content: Optional[str] = None html_path: Optional[Path] = None def __post_init__(self): """Parse file information from the path.""" self._parse_path() def _parse_path(self): """Extract section, name, and language from the file path. Example paths: /usr/share/man/man1/bash.1.gz /usr/share/man/es/man1/bash.1.gz /usr/share/man/man3/printf.3.gz """ parts = self.file_path.parts filename = self.file_path.name # Remove .gz extension if present if filename.endswith('.gz'): filename = filename[:-3] # Extract section from parent directory (e.g., 'man1', 'man3p', 'man3pm') for part in reversed(parts): if part.startswith('man') and len(part) > 3: # Check if it starts with 'man' followed by a digit if part[3].isdigit(): self.section = part[3:] break # Extract section from filename if not found yet (e.g., 'foo.3pm' -> section '3pm') # and extract name name_parts = filename.split('.') if len(name_parts) >= 2: # Try to identify section from last part potential_section = name_parts[-1] # Section is typically digit optionally followed by letters (1, 3p, 3pm, etc.) if potential_section and potential_section[0].isdigit(): if not self.section: self.section = potential_section self.name = '.'.join(name_parts[:-1]) else: self.name = name_parts[0] else: self.name = name_parts[0] # Check for language subdirectory # Pattern: /usr/share/man//man
/ for i, part in enumerate(parts): if part == 'man' and i + 1 < len(parts): next_part = parts[i + 1] # If next part is not 'man', it's a language code if not (next_part.startswith('man') and next_part[3:].isdigit()): # Common language codes are 2-5 chars (en, es, pt_BR, etc.) if len(next_part) <= 5: self.language = next_part break @property def display_name(self) -> str: """Get display name for the man page (e.g., 'bash(1)').""" return f"{self.name}({self.section})" if self.section else self.name @property def html_filename(self) -> str: """Get the HTML filename for this man page.""" # Clean name for filesystem safety safe_name = self._clean_filename(self.name) suffix = f".{self.language}" if self.language else "" return f"{safe_name}.{self.section}{suffix}.html" def _clean_filename(self, name: str) -> str: """Clean filename for filesystem safety.""" # Replace problematic characters name = name.replace('/', '_') name = name.replace(':', '_') name = re.sub(r'\.\.', '__', name) return name @property def uri_path(self) -> str: """Get the URI path for this man page (relative to version root). Returns path like: 'bash/man1/bash.1.html' """ if not self.html_path: return "" # Get path relative to the version directory # Assuming structure: html///
/.html parts = self.html_path.parts try: # Find the version part (e.g., '9.5') and return everything after it for i, part in enumerate(parts): if re.match(r'\d+\.\d+', part): # Version pattern return '/'.join(parts[i+1:]) except (ValueError, IndexError): pass return str(self.html_path) def __str__(self): return f"{self.package_name}: {self.display_name}"