updates

2025-12-10 11:16:55 -06:00
parent b4ffdb6560
commit 316610e932
14 changed files with 350 additions and 520 deletions
@@ -4,7 +4,7 @@ import gzip
 import logging
 import xml.etree.ElementTree as ET
 from pathlib import Path
-from typing import Set, Dict
+from typing import Set
 from urllib.parse import urljoin

 import requests
@@ -38,19 +38,16 @@ class ContentsParser:
        """
        logger.info(f"Fetching filelists for {self.repo_url}")

-        # Download and parse repomd.xml to find filelists location
        filelists_path = self._get_filelists_path()
        if not filelists_path:
            logger.warning("Could not find filelists in repository metadata")
            return set()

-        # Download filelists.xml
        filelists_file = self._download_filelists(filelists_path)
        if not filelists_file:
            logger.warning("Could not download filelists")
            return set()

-        # Parse filelists to find packages with man pages
        packages = self._parse_filelists(filelists_file)
        logger.info(f"Found {len(packages)} packages with man pages")

@@ -68,11 +65,7 @@ class ContentsParser:
            response = requests.get(repomd_url, timeout=30)
            response.raise_for_status()

-            # Parse XML
            root = ET.fromstring(response.content)
-
-            # Find filelists entry
-            # XML structure: <repomd><data type="filelists"><location href="..."/></data></repomd>
            ns = {'repo': 'http://linux.duke.edu/metadata/repo'}

            for data in root.findall('repo:data', ns):
@@ -81,7 +74,7 @@ class ContentsParser:
                    if location is not None:
                        return location.get('href')

-            # Fallback: try without namespace
+            # Fallback without namespace
            for data in root.findall('data'):
                if data.get('type') == 'filelists':
                    location = data.find('location')
@@ -105,7 +98,6 @@ class ContentsParser:
        url = urljoin(self.repo_url, relative_path)
        cache_file = self.cache_dir / relative_path.split('/')[-1]

-        # Return cached file if it exists
        if cache_file.exists():
            logger.debug(f"Using cached filelists: {cache_file}")
            return cache_file
@@ -138,36 +130,26 @@ class ContentsParser:
        packages = set()

        try:
-            # Open gzipped XML file
            with gzip.open(filelists_path, 'rb') as f:
-                # Use iterparse for memory efficiency (files can be large)
                context = ET.iterparse(f, events=('start', 'end'))

                current_package = None
                has_manpage = False

                for event, elem in context:
-                    if event == 'start':
-                        if elem.tag.endswith('package'):
-                            # Get package name from 'name' attribute
-                            current_package = elem.get('name')
-                            has_manpage = False
+                    if event == 'start' and elem.tag.endswith('package'):
+                        current_package = elem.get('name')
+                        has_manpage = False

                    elif event == 'end':
                        if elem.tag.endswith('file'):
-                            # Check if file path contains /man/
                            file_path = elem.text
-                            if file_path and '/man/' in file_path:
-                                # Could be /usr/share/man/ or /usr/man/
-                                if '/share/man/' in file_path or file_path.startswith('/usr/man/'):
-                                    has_manpage = True
+                            if file_path and self._is_manpage_path(file_path):
+                                has_manpage = True

                        elif elem.tag.endswith('package'):
-                            # End of package entry
                            if has_manpage and current_package:
                                packages.add(current_package)
-
-                            # Clear element to free memory
                            elem.clear()
                            current_package = None
                            has_manpage = False
@@ -177,45 +159,16 @@ class ContentsParser:

        return packages

-    def get_package_man_files(self, filelists_path: Path) -> Dict[str, list]:
-        """Get detailed list of man files for each package.
+    @staticmethod
+    def _is_manpage_path(file_path: str) -> bool:
+        """Check if a file path is a man page location.

        Args:
-            filelists_path: Path to filelists.xml.gz file
+            file_path: File path to check

        Returns:
-            Dict mapping package name to list of man page paths
+            True if the path contains '/share/man/' or starts with '/usr/man/'
        """
-        packages = {}
-
-        try:
-            with gzip.open(filelists_path, 'rb') as f:
-                context = ET.iterparse(f, events=('start', 'end'))
-
-                current_package = None
-                current_files = []
-
-                for event, elem in context:
-                    if event == 'start':
-                        if elem.tag.endswith('package'):
-                            current_package = elem.get('name')
-                            current_files = []
-
-                    elif event == 'end':
-                        if elem.tag.endswith('file'):
-                            file_path = elem.text
-                            if file_path and '/share/man/' in file_path:
-                                current_files.append(file_path)
-
-                        elif elem.tag.endswith('package'):
-                            if current_files and current_package:
-                                packages[current_package] = current_files
-
-                            elem.clear()
-                            current_package = None
-                            current_files = []
-
-        except Exception as e:
-            logger.error(f"Error parsing filelists: {e}")
-
-        return packages
+        return '/man/' in file_path and (
+            '/share/man/' in file_path or file_path.startswith('/usr/man/')
+        )