rocky-man/src/rocky_man/repo/contents.py

"""Contents file parser for identifying packages with man pages."""

import gzip
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Optional, Set
from urllib.parse import urljoin

import requests

logger = logging.getLogger(__name__)


class ContentsParser:
    """Parse repository metadata to identify packages containing man pages.

    This is a key optimization - instead of downloading all packages,
    we parse the filelists.xml to find only packages with man pages.

    All network and parsing failures are logged and reported to the caller
    as "nothing found" (``None`` or an empty set) rather than raised.
    """

    # XML namespace declared by repomd.xml.
    _REPOMD_NS = {'repo': 'http://linux.duke.edu/metadata/repo'}

    def __init__(self, repo_url: str, cache_dir: Path):
        """Initialize the contents parser.

        Args:
            repo_url: Base URL of the repository (e.g., .../BaseOS/x86_64/os/)
            cache_dir: Directory to cache downloaded metadata; created if it
                does not exist yet.
        """
        # Normalize to exactly one trailing slash so urljoin() appends
        # relative paths instead of replacing the last path segment.
        self.repo_url = repo_url.rstrip('/') + '/'
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_packages_with_manpages(self) -> Set[str]:
        """Get set of package names that contain man pages.

        Returns:
            Set of package names (e.g., {'bash', 'coreutils', ...}). Empty if
            the filelists metadata could not be located or downloaded.
        """
        logger.info("Fetching filelists for %s", self.repo_url)

        filelists_path = self._get_filelists_path()
        if not filelists_path:
            logger.warning("Could not find filelists in repository metadata")
            return set()

        filelists_file = self._download_filelists(filelists_path)
        if not filelists_file:
            logger.warning("Could not download filelists")
            return set()

        packages = self._parse_filelists(filelists_file)
        logger.info("Found %d packages with man pages", len(packages))

        return packages

    def _get_filelists_path(self) -> Optional[str]:
        """Parse repomd.xml to get the filelists.xml location.

        Returns:
            Relative path to filelists.xml.gz, or None if repomd.xml could
            not be fetched/parsed or has no usable filelists entry.
        """
        repomd_url = urljoin(self.repo_url, 'repodata/repomd.xml')

        try:
            response = requests.get(repomd_url, timeout=30)
            response.raise_for_status()
            root = ET.fromstring(response.content)
        except Exception as e:
            # Best-effort boundary: callers treat None as "not found".
            logger.error("Error parsing repomd.xml: %s", e)
            return None

        # Try the namespaced tags first, then fall back to bare tag names.
        tag_variants = (
            ('repo:data', 'repo:location'),
            ('data', 'location'),
        )
        for data_tag, location_tag in tag_variants:
            for data in root.findall(data_tag, self._REPOMD_NS):
                if data.get('type') != 'filelists':
                    continue
                location = data.find(location_tag, self._REPOMD_NS)
                if location is None:
                    continue
                href = location.get('href')
                if href:
                    return href

        return None

    def _download_filelists(self, relative_path: str) -> Optional[Path]:
        """Download filelists.xml.gz file, reusing a cached copy if present.

        The download is streamed into a temporary ``.part`` file and renamed
        into place only once complete, so an interrupted transfer can never
        be mistaken for a valid cache entry on a later run.

        Args:
            relative_path: Relative path from repo root (e.g., 'repodata/...-filelists.xml.gz')

        Returns:
            Path to the cached file, or None if the download failed.
        """
        url = urljoin(self.repo_url, relative_path)
        cache_file = self.cache_dir / relative_path.rsplit('/', 1)[-1]

        if cache_file.exists():
            logger.debug("Using cached filelists: %s", cache_file)
            return cache_file

        partial_file = cache_file.with_name(cache_file.name + '.part')

        try:
            logger.info("Downloading %s", url)
            cache_file.parent.mkdir(parents=True, exist_ok=True)

            # The context manager releases the streamed connection even when
            # the transfer fails midway.
            with requests.get(url, timeout=60, stream=True) as response:
                response.raise_for_status()
                with open(partial_file, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            # Atomic rename: the cache path only ever holds a complete file.
            partial_file.replace(cache_file)
            return cache_file

        except Exception as e:
            # Best-effort boundary: callers treat None as "download failed".
            logger.error("Error downloading filelists: %s", e)
            try:
                partial_file.unlink()
            except OSError as cleanup_error:
                # Usually just "file was never created"; nothing to clean up.
                logger.debug(
                    "Could not remove partial download %s: %s",
                    partial_file,
                    cleanup_error,
                )
            return None

    def _parse_filelists(self, filelists_path: Path) -> Set[str]:
        """Parse filelists.xml.gz to find packages with man pages.

        The file is parsed incrementally and processed elements are discarded
        as soon as each package is finished, keeping memory use flat.

        Args:
            filelists_path: Path to filelists.xml.gz file

        Returns:
            Set of package names containing man pages. If the file is
            unreadable or malformed, the error is logged and whatever was
            collected before the failure is returned.
        """
        packages: Set[str] = set()

        try:
            with gzip.open(filelists_path, 'rb') as f:
                root = None
                current_package = None
                has_manpage = False

                for event, elem in ET.iterparse(f, events=('start', 'end')):
                    # Drop the '{namespace}' prefix to compare local names.
                    tag = elem.tag.rsplit('}', 1)[-1]

                    if event == 'start':
                        if root is None:
                            # The first start event is the document root.
                            root = elem
                        if tag == 'package':
                            # Attributes are already available on 'start'.
                            current_package = elem.get('name')
                            has_manpage = False

                    elif tag == 'file':
                        # Text is only complete on 'end'; skip the check once
                        # the package is already known to ship a man page.
                        if (
                            not has_manpage
                            and elem.text
                            and self._is_manpage_path(elem.text)
                        ):
                            has_manpage = True

                    elif tag == 'package':
                        if has_manpage and current_package:
                            packages.add(current_package)
                        # Clearing the root drops this package and its
                        # children, so finished packages do not accumulate.
                        root.clear()
                        current_package = None
                        has_manpage = False

        except Exception as e:
            # Best-effort boundary: keep whatever was parsed so far.
            logger.error("Error parsing filelists: %s", e)

        return packages

    @staticmethod
    def _is_manpage_path(file_path: str) -> bool:
        """Check if a file path is a man page location.

        Args:
            file_path: File path to check

        Returns:
            True if the path contains '/share/man/' or starts with
            '/usr/man/'.
        """
        return '/share/man/' in file_path or file_path.startswith('/usr/man/')