CUSP-1256 (#1)
* Complete refactor Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com> * Complete refactor Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com> --------- Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,221 @@
|
||||
"""Contents file parser for identifying packages with man pages."""
|
||||
|
||||
import gzip
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Set, Tuple
from urllib.parse import urljoin

import requests
|
||||
|
||||
# Module-level logger, named after this module so log records can be filtered by it.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ContentsParser:
    """Parse repository metadata to identify packages containing man pages.

    This is a key optimization - instead of downloading all packages,
    we parse the filelists.xml to find only packages with man pages.
    """

    # Namespace used for the namespaced lookup in repomd.xml.
    _REPOMD_NS = {'repo': 'http://linux.duke.edu/metadata/repo'}

    def __init__(self, repo_url: str, cache_dir: Path):
        """Initialize the contents parser.

        Args:
            repo_url: Base URL of the repository (e.g., .../BaseOS/x86_64/os/)
            cache_dir: Directory to cache downloaded metadata
        """
        # Force exactly one trailing slash so urljoin() appends to the base
        # URL instead of replacing its last path segment.
        self.repo_url = repo_url.rstrip('/') + '/'
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_packages_with_manpages(self) -> Set[str]:
        """Get set of package names that contain man pages.

        Returns:
            Set of package names (e.g., {'bash', 'coreutils', ...}). The set
            is empty when the metadata cannot be located or downloaded.
        """
        logger.info(f"Fetching filelists for {self.repo_url}")

        # Download and parse repomd.xml to find filelists location
        filelists_path = self._get_filelists_path()
        if not filelists_path:
            logger.warning("Could not find filelists in repository metadata")
            return set()

        # Download filelists.xml
        filelists_file = self._download_filelists(filelists_path)
        if not filelists_file:
            logger.warning("Could not download filelists")
            return set()

        # Parse filelists to find packages with man pages
        packages = self._parse_filelists(filelists_file)
        logger.info(f"Found {len(packages)} packages with man pages")

        return packages

    def _get_filelists_path(self) -> Optional[str]:
        """Parse repomd.xml to get the filelists.xml location.

        Returns:
            Relative path to filelists.xml.gz, or None when repomd.xml cannot
            be fetched or parsed, or has no filelists entry.
        """
        repomd_url = urljoin(self.repo_url, 'repodata/repomd.xml')

        try:
            response = requests.get(repomd_url, timeout=30)
            response.raise_for_status()
            return self._find_filelists_href(ET.fromstring(response.content))
        except Exception as e:
            # Best effort: the caller treats None as "no filelists available".
            logger.error(f"Error parsing repomd.xml: {e}")
            return None

    @classmethod
    def _find_filelists_href(cls, root: ET.Element) -> Optional[str]:
        """Return the href of the ``filelists`` data entry in a repomd tree.

        Expected structure:
        <repomd><data type="filelists"><location href="..."/></data></repomd>

        The namespaced form is tried first, then the same structure without
        a namespace.

        Args:
            root: Root element of a parsed repomd.xml document.

        Returns:
            The ``href`` attribute of the first matching location element,
            or None if there is no such element.
        """
        lookups = (
            ('repo:data', 'repo:location', cls._REPOMD_NS),
            ('data', 'location', None),
        )
        for data_tag, location_tag, namespaces in lookups:
            for data in root.findall(data_tag, namespaces):
                if data.get('type') != 'filelists':
                    continue
                location = data.find(location_tag, namespaces)
                if location is not None:
                    return location.get('href')
        return None

    def _download_filelists(self, relative_path: str) -> Optional[Path]:
        """Download filelists.xml.gz file.

        The download is written to a temporary ``.part`` file and renamed
        onto the cache file only once complete, so an interrupted transfer
        can never be mistaken for a valid cached file on a later run.

        Args:
            relative_path: Relative path from repo root (e.g., 'repodata/...-filelists.xml.gz')

        Returns:
            Path to the downloaded (or already cached) file, or None on failure.
        """
        url = urljoin(self.repo_url, relative_path)

        file_name = relative_path.split('/')[-1]
        if not file_name:
            # A path ending in '/' has no file name; without this guard the
            # cache path would be the cache directory itself.
            logger.error(f"Error downloading filelists: no file name in {relative_path!r}")
            return None
        cache_file = self.cache_dir / file_name

        # Return cached file if it exists
        if cache_file.is_file():
            logger.debug(f"Using cached filelists: {cache_file}")
            return cache_file

        partial_file = cache_file.with_name(cache_file.name + '.part')

        try:
            logger.info(f"Downloading {url}")
            # The context manager releases the streamed connection even if
            # the transfer fails part-way through.
            with requests.get(url, timeout=60, stream=True) as response:
                response.raise_for_status()

                cache_file.parent.mkdir(parents=True, exist_ok=True)
                with open(partial_file, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            # Publish the file only after the whole body was written.
            partial_file.replace(cache_file)
            return cache_file

        except Exception as e:
            logger.error(f"Error downloading filelists: {e}")
            try:
                partial_file.unlink()
            except OSError:
                # Nothing to clean up (the partial file was never created)
                # or it cannot be removed; either way it is never used as
                # a cache entry.
                pass
            return None

    @staticmethod
    def _is_man_path(file_path: Optional[str]) -> bool:
        """Return True if ``file_path`` lies in a man page directory tree.

        A path qualifies when it contains ``/share/man/`` (e.g.
        /usr/share/man/, /usr/local/share/man/) or starts with ``/usr/man/``.
        """
        return bool(file_path) and (
            '/share/man/' in file_path or file_path.startswith('/usr/man/')
        )

    def _iter_man_files(self, filelists_path: Path) -> Iterator[Tuple[str, List[str]]]:
        """Stream ``(package name, man page paths)`` pairs from a filelists file.

        Only packages with a name and at least one man page file are
        yielded. Entries marked ``type="dir"`` are ignored so that a package
        which merely owns the man directories is not reported.

        Args:
            filelists_path: Path to filelists.xml.gz file

        Yields:
            Tuples of package name and the list of its man page paths.
        """
        with gzip.open(filelists_path, 'rb') as f:
            root = None
            current_package = None
            man_files: List[str] = []

            # Use iterparse for memory efficiency (files can be large)
            for event, elem in ET.iterparse(f, events=('start', 'end')):
                # Drop any '{namespace}' prefix and compare exact local names.
                tag = elem.tag.rsplit('}', 1)[-1]

                if event == 'start':
                    if root is None:
                        # The first start event belongs to the document root.
                        root = elem
                    if tag == 'package':
                        # Attributes are already available on the start event.
                        current_package = elem.get('name')
                        man_files = []

                elif tag == 'file':
                    if elem.get('type') != 'dir' and self._is_man_path(elem.text):
                        man_files.append(elem.text)

                elif tag == 'package':
                    finished_name, finished_files = current_package, man_files
                    current_package = None
                    man_files = []

                    # Free the finished package, and detach it from the root
                    # so processed packages do not pile up in memory.
                    elem.clear()
                    root.clear()

                    if finished_name and finished_files:
                        yield finished_name, finished_files

    def _parse_filelists(self, filelists_path: Path) -> Set[str]:
        """Parse filelists.xml.gz to find packages with man pages.

        Args:
            filelists_path: Path to filelists.xml.gz file

        Returns:
            Set of package names containing man pages. If parsing fails
            part-way, the packages found up to that point are returned.
        """
        packages: Set[str] = set()

        try:
            for package_name, _man_files in self._iter_man_files(filelists_path):
                packages.add(package_name)
        except Exception as e:
            logger.error(f"Error parsing filelists: {e}")

        return packages

    def get_package_man_files(self, filelists_path: Path) -> Dict[str, list]:
        """Get detailed list of man files for each package.

        Uses the same man page criteria as the package-name scan, so the
        keys of the result match the names that scan reports.

        Args:
            filelists_path: Path to filelists.xml.gz file

        Returns:
            Dict mapping package name to list of man page paths. If parsing
            fails part-way, the entries collected so far are returned.
        """
        packages: Dict[str, list] = {}

        try:
            for package_name, man_files in self._iter_man_files(filelists_path):
                packages[package_name] = man_files
        except Exception as e:
            logger.error(f"Error parsing filelists: {e}")

        return packages
|
||||
Reference in New Issue
Block a user