CUSP-1256 (#1)

* Complete refactor

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>

* Complete refactor

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>

---------

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
This commit is contained in:
Stephen Simpson
2025-11-20 12:16:33 -05:00
committed by GitHub
parent 5248edad62
commit ec32c72363
44 changed files with 4083 additions and 1540 deletions

View File

@@ -0,0 +1,130 @@
"""ManFile model representing a man page file."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import re
@dataclass
class ManFile:
    """Represents a man page file extracted from an RPM package.

    ``section``, ``name`` and ``language`` are filled in from ``file_path``
    when the instance is created (see ``_parse_path``).

    Attributes:
        file_path: Path to the extracted man page file
        package_name: Name of the package this man page belongs to
        section: Man page section (e.g. '1', '3p', '3pm')
        name: Man page name without section or ``.gz`` extension
        language: Language code (e.g., 'en', 'es', None for default)
        content: Raw man page content (gzipped or plain text)
        html_content: Converted HTML content
        html_path: Path where HTML file is saved
    """

    file_path: Path
    package_name: str
    section: Optional[str] = None
    name: Optional[str] = None
    language: Optional[str] = None
    content: Optional[bytes] = None
    html_content: Optional[str] = None
    html_path: Optional[Path] = None

    def __post_init__(self) -> None:
        """Normalise ``file_path`` and parse file information from it."""
        # Accept plain strings as well; the parser relies on Path.parts / Path.name.
        if isinstance(self.file_path, str):
            self.file_path = Path(self.file_path)
        self._parse_path()

    @staticmethod
    def _section_from_dir(part: str) -> Optional[str]:
        """Return the section encoded in a ``man<section>`` directory name.

        ``'man1'`` -> ``'1'``, ``'man3pm'`` -> ``'3pm'``; anything that is not
        ``man`` followed by a digit yields ``None``.
        """
        if part.startswith('man') and len(part) > 3 and part[3].isdigit():
            return part[3:]
        return None

    def _parse_path(self) -> None:
        """Extract section, name, and language from the file path.

        Example paths:
            /usr/share/man/man1/bash.1.gz
            /usr/share/man/es/man1/bash.1.gz
            /usr/share/man/man3/printf.3.gz
        """
        # Only directory components may name a section or a language; the
        # file name is handled separately so that a page such as
        # 'man2html.1.gz' is not mistaken for a 'man<section>' directory.
        dirs = self.file_path.parts[:-1]
        filename = self.file_path.name

        # Remove .gz extension if present
        if filename.endswith('.gz'):
            filename = filename[:-3]

        # Section from the closest 'man<section>' parent directory
        # (e.g., 'man1', 'man3p', 'man3pm').
        section_idx = None
        for idx in range(len(dirs) - 1, -1, -1):
            dir_section = self._section_from_dir(dirs[idx])
            if dir_section is not None:
                self.section = dir_section
                section_idx = idx
                break

        # Name (and, if still unknown, section) from the file name:
        # 'foo.3pm' -> name 'foo', section '3pm'.
        name_parts = filename.split('.')
        potential_section = name_parts[-1]
        if (
            len(name_parts) >= 2
            and potential_section
            and potential_section[0].isdigit()
        ):
            if not self.section:
                self.section = potential_section
            self.name = '.'.join(name_parts[:-1])
        else:
            self.name = name_parts[0]

        # Language subdirectory.
        # Pattern: /usr/share/man/<lang>/man<section>/
        # The candidate is the directory sitting between a 'man' directory and
        # the section directory, so section directories with a letter suffix
        # ('man3p') are never reported as a language.
        if (
            section_idx is not None
            and section_idx >= 2
            and dirs[section_idx - 2] == 'man'
        ):
            candidate = dirs[section_idx - 1]
            # Common language codes are 2-5 chars (en, es, pt_BR, etc.)
            if len(candidate) <= 5:
                self.language = candidate

    @property
    def display_name(self) -> str:
        """Get display name for the man page (e.g., 'bash(1)')."""
        return f"{self.name}({self.section})" if self.section else self.name

    @property
    def html_filename(self) -> str:
        """Get the HTML filename for this man page.

        Format is ``<name>.<section>[.<language>].html``; the section part is
        left out when no section is known instead of rendering ``None``.
        """
        # Clean name for filesystem safety
        safe_name = self._clean_filename(self.name)
        section_part = f".{self.section}" if self.section else ""
        suffix = f".{self.language}" if self.language else ""
        return f"{safe_name}{section_part}{suffix}.html"

    def _clean_filename(self, name: str) -> str:
        """Clean filename for filesystem safety.

        Replaces '/' and ':' with '_' and every '..' with '__'.
        """
        name = name.replace('/', '_')
        name = name.replace(':', '_')
        name = re.sub(r'\.\.', '__', name)
        return name

    @property
    def uri_path(self) -> str:
        """Get the URI path for this man page (relative to version root).

        Returns path like: 'bash/man1/bash.1.html'. Returns an empty string
        when ``html_path`` is not set, and the full ``html_path`` as a string
        when no version component is found.
        """
        if not self.html_path:
            return ""

        # Assuming structure: html/<version>/<package>/<section>/<file>.html
        parts = self.html_path.parts
        for i, part in enumerate(parts):
            # First component that begins like a version (e.g., '9.5');
            # everything after it is the URI path.
            if re.match(r'\d+\.\d+', part):
                return '/'.join(parts[i + 1:])
        return str(self.html_path)

    def __str__(self) -> str:
        """Return '<package_name>: <display_name>'."""
        return f"{self.package_name}: {self.display_name}"