CUSP-1256 (#1)

* Complete refactor

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>

* Complete refactor

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>

---------

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
This commit is contained in:
Stephen Simpson
2025-11-20 12:16:33 -05:00
committed by GitHub
parent 5248edad62
commit ec32c72363
44 changed files with 4083 additions and 1540 deletions
+221
View File
@@ -0,0 +1,221 @@
"""Contents file parser for identifying packages with man pages."""
import gzip
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Set, Tuple
from urllib.parse import urljoin

import requests
logger = logging.getLogger(__name__)


class ContentsParser:
    """Parse repository metadata to identify packages containing man pages.

    This is a key optimization - instead of downloading all packages,
    we parse the filelists.xml to find only packages with man pages.
    """

    # XML namespace used by the elements of repomd.xml.
    _REPO_NS = {'repo': 'http://linux.duke.edu/metadata/repo'}

    def __init__(self, repo_url: str, cache_dir: Path):
        """Initialize the contents parser.

        Args:
            repo_url: Base URL of the repository (e.g., .../BaseOS/x86_64/os/)
            cache_dir: Directory to cache downloaded metadata
        """
        # Normalise to exactly one trailing slash so urljoin() appends
        # relative paths instead of replacing the last URL segment.
        self.repo_url = repo_url.rstrip('/') + '/'
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_packages_with_manpages(self) -> Set[str]:
        """Get set of package names that contain man pages.

        Returns:
            Set of package names (e.g., {'bash', 'coreutils', ...}). The set
            is empty when the metadata cannot be located or downloaded.
        """
        logger.info(f"Fetching filelists for {self.repo_url}")

        # Download and parse repomd.xml to find filelists location
        filelists_path = self._get_filelists_path()
        if not filelists_path:
            logger.warning("Could not find filelists in repository metadata")
            return set()

        # Download filelists.xml
        filelists_file = self._download_filelists(filelists_path)
        if filelists_file is None:
            logger.warning("Could not download filelists")
            return set()

        # Parse filelists to find packages with man pages
        packages = self._parse_filelists(filelists_file)
        logger.info(f"Found {len(packages)} packages with man pages")
        return packages

    def _get_filelists_path(self) -> Optional[str]:
        """Parse repomd.xml to get the filelists.xml location.

        Returns:
            Relative path to filelists.xml.gz, or None when repomd.xml cannot
            be fetched or parsed, or has no usable filelists entry.
        """
        repomd_url = urljoin(self.repo_url, 'repodata/repomd.xml')

        # XML structure:
        # <repomd><data type="filelists"><location href="..."/></data></repomd>
        # Try the namespaced element names first, then fall back to bare names.
        lookups = (
            ('repo:data', 'repo:location', self._REPO_NS),
            ('data', 'location', None),
        )

        try:
            response = requests.get(repomd_url, timeout=30)
            response.raise_for_status()
            root = ET.fromstring(response.content)

            for data_tag, location_tag, namespaces in lookups:
                for data in root.findall(data_tag, namespaces):
                    if data.get('type') != 'filelists':
                        continue
                    location = data.find(location_tag, namespaces)
                    if location is not None:
                        return location.get('href')
        except Exception as e:
            # Deliberately best-effort: any failure means "no filelists".
            logger.error(f"Error parsing repomd.xml: {e}")

        return None

    def _download_filelists(self, relative_path: str) -> Optional[Path]:
        """Download filelists.xml.gz file, reusing a cached copy if present.

        The download is streamed into a temporary ``.part`` file and renamed
        into place only once complete, so an interrupted transfer never
        leaves a truncated file that later runs would treat as a cache hit.

        Args:
            relative_path: Relative path from repo root
                (e.g., 'repodata/...-filelists.xml.gz')

        Returns:
            Path to the cached file, or None when the path has no file name
            or the download fails.
        """
        # Only the final path component is used for the cache entry.
        file_name = relative_path.split('/')[-1]
        if file_name in ('', '.', '..'):
            logger.error(
                f"Error downloading filelists: no file name in {relative_path!r}"
            )
            return None

        cache_file = self.cache_dir / file_name

        # Return cached file if it exists
        if cache_file.is_file():
            logger.debug(f"Using cached filelists: {cache_file}")
            return cache_file

        url = urljoin(self.repo_url, relative_path)
        partial_file = cache_file.with_name(file_name + '.part')

        try:
            logger.info(f"Downloading {url}")
            cache_file.parent.mkdir(parents=True, exist_ok=True)

            # The context manager releases the streamed connection even when
            # writing fails part-way through.
            with requests.get(url, timeout=60, stream=True) as response:
                response.raise_for_status()
                with open(partial_file, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            # Atomic on the same filesystem: the cache entry appears complete
            # or not at all.
            partial_file.replace(cache_file)
            return cache_file
        except Exception as e:
            # Deliberately best-effort: log, clean up, and report failure.
            logger.error(f"Error downloading filelists: {e}")
            try:
                partial_file.unlink()
            except OSError:
                pass
            return None

    @staticmethod
    def _is_man_page_path(file_path: Optional[str]) -> bool:
        """Return True when a path lies under a man page directory tree.

        Matches any path containing '/share/man/' (e.g. /usr/share/man/...)
        and paths starting with '/usr/man/'.
        """
        if not file_path:
            return False
        return '/share/man/' in file_path or file_path.startswith('/usr/man/')

    def _iter_package_man_files(
        self, filelists_path: Path
    ) -> Iterator[Tuple[str, List[str]]]:
        """Stream (package name, man page paths) pairs from filelists.xml.gz.

        Only packages with a name and at least one man page file are yielded.
        Parse and I/O errors propagate to the caller.
        """
        with gzip.open(filelists_path, 'rb') as f:
            root = None
            package_name = None
            man_files: List[str] = []

            # Use iterparse for memory efficiency (files can be large)
            for event, elem in ET.iterparse(f, events=('start', 'end')):
                # Drop the '{namespace}' prefix so tags compare by local name.
                tag = elem.tag.rsplit('}', 1)[-1]

                if event == 'start':
                    if root is None:
                        # The first start event is the document root.
                        root = elem
                    if tag == 'package':
                        # Attributes are already available on start events.
                        package_name = elem.get('name')
                        man_files = []
                    continue

                if tag == 'file':
                    # Entries marked type="dir" are directories, not pages.
                    if elem.get('type') != 'dir' and self._is_man_page_path(elem.text):
                        man_files.append(elem.text)
                elif tag == 'package':
                    if package_name and man_files:
                        yield package_name, man_files
                    package_name = None
                    man_files = []
                    # Free the finished package, and detach it from the root
                    # so emptied elements do not pile up on large files.
                    elem.clear()
                    root.clear()

    def _parse_filelists(self, filelists_path: Path) -> Set[str]:
        """Parse filelists.xml.gz to find packages with man pages.

        Args:
            filelists_path: Path to filelists.xml.gz file

        Returns:
            Set of package names containing man pages. If parsing fails
            part-way, the names collected so far are returned.
        """
        packages: Set[str] = set()
        try:
            for package_name, _man_files in self._iter_package_man_files(filelists_path):
                packages.add(package_name)
        except Exception as e:
            # Deliberately best-effort: keep whatever was parsed so far.
            logger.error(f"Error parsing filelists: {e}")
        return packages

    def get_package_man_files(self, filelists_path: Path) -> Dict[str, List[str]]:
        """Get detailed list of man files for each package.

        Uses the same man page path rule as _parse_filelists, so both
        methods agree on which packages contain man pages.

        Args:
            filelists_path: Path to filelists.xml.gz file

        Returns:
            Dict mapping package name to list of man page paths. If parsing
            fails part-way, the entries collected so far are returned.
        """
        packages: Dict[str, List[str]] = {}
        try:
            for package_name, man_files in self._iter_package_man_files(filelists_path):
                packages[package_name] = man_files
        except Exception as e:
            # Deliberately best-effort: keep whatever was parsed so far.
            logger.error(f"Error parsing filelists: {e}")
        return packages