CUSP-1256 (#1)

* Complete refactor Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com> * Complete refactor Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com> --------- Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
2025-11-20 12:16:33 -05:00
parent 5248edad62
commit ec32c72363
44 changed files with 4083 additions and 1540 deletions
--- a/src/rocky_man/models/manfile.py
+++ b/src/rocky_man/models/manfile.py
@@ -0,0 +1,130 @@
+"""ManFile model representing a man page file."""
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import re
+
+
@dataclass
class ManFile:
    """Represents a man page file extracted from an RPM package.

    ``section``, ``name`` and ``language`` are derived from ``file_path`` in
    ``__post_init__``. ``name`` is always recomputed from the path; ``section``
    and ``language`` are overwritten whenever the path yields a value for them.

    Attributes:
        file_path: Path to the extracted man page file
        package_name: Name of the package this man page belongs to
        section: Man page section (e.g. '1', '3p', '3pm'), None if unknown
        name: Man page name without section/compression extension
        language: Language code (e.g., 'en', 'es', None for default)
        content: Raw man page content (gzipped or plain text)
        html_content: Converted HTML content
        html_path: Path where HTML file is saved
    """

    file_path: Path
    package_name: str
    section: Optional[str] = None
    name: Optional[str] = None
    language: Optional[str] = None
    content: Optional[bytes] = None
    html_content: Optional[str] = None
    html_path: Optional[Path] = None

    def __post_init__(self):
        """Normalise ``file_path`` to a ``Path`` and parse it."""
        # Accept plain strings as well; Path(Path(...)) is a no-op copy.
        self.file_path = Path(self.file_path)
        self._parse_path()

    @staticmethod
    def _is_section_dir(part: str) -> bool:
        """Return True for directory names such as 'man1', 'man3p', 'man3pm'."""
        return part.startswith('man') and len(part) > 3 and part[3].isdigit()

    def _parse_path(self):
        """Extract section, name, and language from the file path.

        Example paths:
            /usr/share/man/man1/bash.1.gz      -> section '1', name 'bash'
            /usr/share/man/es/man1/bash.1.gz   -> language 'es'
            /usr/share/man/man3p/printf.3p.gz  -> section '3p', no language
            /usr/share/man/man1/man2html.1.gz  -> section '1', name 'man2html'
        """
        # Only directory components may name a section or a language; the
        # file itself (e.g. 'man2html.1.gz') must never be mistaken for one.
        dir_parts = self.file_path.parts[:-1]
        filename = self.file_path.name

        # Remove .gz extension if present
        if filename.endswith('.gz'):
            filename = filename[:-3]

        # Section from the closest enclosing 'man<digit>...' directory.
        for part in reversed(dir_parts):
            if self._is_section_dir(part):
                self.section = part[3:]
                break

        # A trailing '.<digit...>' suffix is the section (1, 3p, 3pm, ...).
        # It is only used as the section when no directory provided one, but
        # it is always stripped from the name.
        stem, dot, suffix = filename.rpartition('.')
        if dot and suffix[:1].isdigit():
            if not self.section:
                self.section = suffix
            self.name = stem
        else:
            # No recognisable section suffix: keep the text before the first dot.
            self.name = filename.split('.')[0]

        # Language subdirectory, pattern: .../man/<lang>/man<section>/<file>
        # Only the first 'man' directory is inspected.
        for i, part in enumerate(dir_parts):
            if part != 'man':
                continue
            if i + 1 < len(dir_parts):
                candidate = dir_parts[i + 1]
                # Section directories ('man1', 'man3p', ...) are not languages.
                # Common language codes are 2-5 chars (en, es, pt_BR, etc.)
                if not self._is_section_dir(candidate) and len(candidate) <= 5:
                    self.language = candidate
            break

    @property
    def display_name(self) -> str:
        """Get display name for the man page (e.g., 'bash(1)')."""
        return f"{self.name}({self.section})" if self.section else self.name

    @property
    def html_filename(self) -> str:
        """Get the HTML filename for this man page.

        Format is '<name>.<section>[.<language>].html'; the section segment
        is left out when no section is known (instead of emitting 'None').
        """
        # Clean name for filesystem safety
        pieces = [self._clean_filename(self.name)]
        if self.section:
            pieces.append(self.section)
        if self.language:
            pieces.append(self.language)
        return '.'.join(pieces) + '.html'

    def _clean_filename(self, name: str) -> str:
        """Clean filename for filesystem safety.

        Replaces '/' and ':' with '_' and every '..' pair with '__'.
        """
        name = name.replace('/', '_')
        name = name.replace(':', '_')
        name = re.sub(r'\.\.', '__', name)
        return name

    @property
    def uri_path(self) -> str:
        """Get the URI path for this man page (relative to version root).

        Returns path like: 'bash/man1/bash.1.html'. Returns '' when no HTML
        path is set, and the full ``html_path`` as a string when no version
        directory is found in it.
        """
        if not self.html_path:
            return ""
        # Assuming structure: html/<version>/<package>/<section>/<file>.html
        parts = Path(self.html_path).parts
        for i, part in enumerate(parts):
            # First component starting with '<digits>.<digits>' (e.g. '9.5')
            # is taken as the version directory; return everything after it.
            if re.match(r'\d+\.\d+', part):
                return '/'.join(parts[i + 1:])
        return str(self.html_path)

    def __str__(self):
        """Return '<package_name>: <display_name>'."""
        return f"{self.package_name}: {self.display_name}"