This commit is contained in:
Stephen Simpson
2025-12-10 11:16:55 -06:00
parent b4ffdb6560
commit 316610e932
14 changed files with 350 additions and 520 deletions
+15 -62
View File
@@ -4,7 +4,7 @@ import gzip
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Set, Dict
from typing import Set
from urllib.parse import urljoin
import requests
@@ -38,19 +38,16 @@ class ContentsParser:
"""
logger.info(f"Fetching filelists for {self.repo_url}")
# Download and parse repomd.xml to find filelists location
filelists_path = self._get_filelists_path()
if not filelists_path:
logger.warning("Could not find filelists in repository metadata")
return set()
# Download filelists.xml
filelists_file = self._download_filelists(filelists_path)
if not filelists_file:
logger.warning("Could not download filelists")
return set()
# Parse filelists to find packages with man pages
packages = self._parse_filelists(filelists_file)
logger.info(f"Found {len(packages)} packages with man pages")
@@ -68,11 +65,7 @@ class ContentsParser:
response = requests.get(repomd_url, timeout=30)
response.raise_for_status()
# Parse XML
root = ET.fromstring(response.content)
# Find filelists entry
# XML structure: <repomd><data type="filelists"><location href="..."/></data></repomd>
ns = {'repo': 'http://linux.duke.edu/metadata/repo'}
for data in root.findall('repo:data', ns):
@@ -81,7 +74,7 @@ class ContentsParser:
if location is not None:
return location.get('href')
# Fallback: try without namespace
# Fallback without namespace
for data in root.findall('data'):
if data.get('type') == 'filelists':
location = data.find('location')
@@ -105,7 +98,6 @@ class ContentsParser:
url = urljoin(self.repo_url, relative_path)
cache_file = self.cache_dir / relative_path.split('/')[-1]
# Return cached file if it exists
if cache_file.exists():
logger.debug(f"Using cached filelists: {cache_file}")
return cache_file
@@ -138,36 +130,26 @@ class ContentsParser:
packages = set()
try:
# Open gzipped XML file
with gzip.open(filelists_path, 'rb') as f:
# Use iterparse for memory efficiency (files can be large)
context = ET.iterparse(f, events=('start', 'end'))
current_package = None
has_manpage = False
for event, elem in context:
if event == 'start':
if elem.tag.endswith('package'):
# Get package name from 'name' attribute
current_package = elem.get('name')
has_manpage = False
if event == 'start' and elem.tag.endswith('package'):
current_package = elem.get('name')
has_manpage = False
elif event == 'end':
if elem.tag.endswith('file'):
# Check if file path contains /man/
file_path = elem.text
if file_path and '/man/' in file_path:
# Could be /usr/share/man/ or /usr/man/
if '/share/man/' in file_path or file_path.startswith('/usr/man/'):
has_manpage = True
if file_path and self._is_manpage_path(file_path):
has_manpage = True
elif elem.tag.endswith('package'):
# End of package entry
if has_manpage and current_package:
packages.add(current_package)
# Clear element to free memory
elem.clear()
current_package = None
has_manpage = False
@@ -177,45 +159,16 @@ class ContentsParser:
return packages
def get_package_man_files(self, filelists_path: Path) -> Dict[str, list]:
"""Get detailed list of man files for each package.
@staticmethod
def _is_manpage_path(file_path: str) -> bool:
"""Check if a file path is a man page location.
Args:
filelists_path: Path to filelists.xml.gz file
file_path: File path to check
Returns:
Dict mapping package name to list of man page paths
True if path is in a standard man page directory
"""
packages = {}
try:
with gzip.open(filelists_path, 'rb') as f:
context = ET.iterparse(f, events=('start', 'end'))
current_package = None
current_files = []
for event, elem in context:
if event == 'start':
if elem.tag.endswith('package'):
current_package = elem.get('name')
current_files = []
elif event == 'end':
if elem.tag.endswith('file'):
file_path = elem.text
if file_path and '/share/man/' in file_path:
current_files.append(file_path)
elif elem.tag.endswith('package'):
if current_files and current_package:
packages[current_package] = current_files
elem.clear()
current_package = None
current_files = []
except Exception as e:
logger.error(f"Error parsing filelists: {e}")
return packages
return '/man/' in file_path and (
'/share/man/' in file_path or file_path.startswith('/usr/man/')
)