updates
This commit is contained in:
@@ -4,7 +4,7 @@ import gzip
|
||||
import logging
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from typing import Set, Dict
|
||||
from typing import Set
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
@@ -38,19 +38,16 @@ class ContentsParser:
|
||||
"""
|
||||
logger.info(f"Fetching filelists for {self.repo_url}")
|
||||
|
||||
# Download and parse repomd.xml to find filelists location
|
||||
filelists_path = self._get_filelists_path()
|
||||
if not filelists_path:
|
||||
logger.warning("Could not find filelists in repository metadata")
|
||||
return set()
|
||||
|
||||
# Download filelists.xml
|
||||
filelists_file = self._download_filelists(filelists_path)
|
||||
if not filelists_file:
|
||||
logger.warning("Could not download filelists")
|
||||
return set()
|
||||
|
||||
# Parse filelists to find packages with man pages
|
||||
packages = self._parse_filelists(filelists_file)
|
||||
logger.info(f"Found {len(packages)} packages with man pages")
|
||||
|
||||
@@ -68,11 +65,7 @@ class ContentsParser:
|
||||
response = requests.get(repomd_url, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse XML
|
||||
root = ET.fromstring(response.content)
|
||||
|
||||
# Find filelists entry
|
||||
# XML structure: <repomd><data type="filelists"><location href="..."/></data></repomd>
|
||||
ns = {'repo': 'http://linux.duke.edu/metadata/repo'}
|
||||
|
||||
for data in root.findall('repo:data', ns):
|
||||
@@ -81,7 +74,7 @@ class ContentsParser:
|
||||
if location is not None:
|
||||
return location.get('href')
|
||||
|
||||
# Fallback: try without namespace
|
||||
# Fallback without namespace
|
||||
for data in root.findall('data'):
|
||||
if data.get('type') == 'filelists':
|
||||
location = data.find('location')
|
||||
@@ -105,7 +98,6 @@ class ContentsParser:
|
||||
url = urljoin(self.repo_url, relative_path)
|
||||
cache_file = self.cache_dir / relative_path.split('/')[-1]
|
||||
|
||||
# Return cached file if it exists
|
||||
if cache_file.exists():
|
||||
logger.debug(f"Using cached filelists: {cache_file}")
|
||||
return cache_file
|
||||
@@ -138,36 +130,26 @@ class ContentsParser:
|
||||
packages = set()
|
||||
|
||||
try:
|
||||
# Open gzipped XML file
|
||||
with gzip.open(filelists_path, 'rb') as f:
|
||||
# Use iterparse for memory efficiency (files can be large)
|
||||
context = ET.iterparse(f, events=('start', 'end'))
|
||||
|
||||
current_package = None
|
||||
has_manpage = False
|
||||
|
||||
for event, elem in context:
|
||||
if event == 'start':
|
||||
if elem.tag.endswith('package'):
|
||||
# Get package name from 'name' attribute
|
||||
current_package = elem.get('name')
|
||||
has_manpage = False
|
||||
if event == 'start' and elem.tag.endswith('package'):
|
||||
current_package = elem.get('name')
|
||||
has_manpage = False
|
||||
|
||||
elif event == 'end':
|
||||
if elem.tag.endswith('file'):
|
||||
# Check if file path contains /man/
|
||||
file_path = elem.text
|
||||
if file_path and '/man/' in file_path:
|
||||
# Could be /usr/share/man/ or /usr/man/
|
||||
if '/share/man/' in file_path or file_path.startswith('/usr/man/'):
|
||||
has_manpage = True
|
||||
if file_path and self._is_manpage_path(file_path):
|
||||
has_manpage = True
|
||||
|
||||
elif elem.tag.endswith('package'):
|
||||
# End of package entry
|
||||
if has_manpage and current_package:
|
||||
packages.add(current_package)
|
||||
|
||||
# Clear element to free memory
|
||||
elem.clear()
|
||||
current_package = None
|
||||
has_manpage = False
|
||||
@@ -177,45 +159,16 @@ class ContentsParser:
|
||||
|
||||
return packages
|
||||
|
||||
def get_package_man_files(self, filelists_path: Path) -> Dict[str, list]:
|
||||
"""Get detailed list of man files for each package.
|
||||
@staticmethod
|
||||
def _is_manpage_path(file_path: str) -> bool:
|
||||
"""Check if a file path is a man page location.
|
||||
|
||||
Args:
|
||||
filelists_path: Path to filelists.xml.gz file
|
||||
file_path: File path to check
|
||||
|
||||
Returns:
|
||||
Dict mapping package name to list of man page paths
|
||||
True if path is in a standard man page directory
|
||||
"""
|
||||
packages = {}
|
||||
|
||||
try:
|
||||
with gzip.open(filelists_path, 'rb') as f:
|
||||
context = ET.iterparse(f, events=('start', 'end'))
|
||||
|
||||
current_package = None
|
||||
current_files = []
|
||||
|
||||
for event, elem in context:
|
||||
if event == 'start':
|
||||
if elem.tag.endswith('package'):
|
||||
current_package = elem.get('name')
|
||||
current_files = []
|
||||
|
||||
elif event == 'end':
|
||||
if elem.tag.endswith('file'):
|
||||
file_path = elem.text
|
||||
if file_path and '/share/man/' in file_path:
|
||||
current_files.append(file_path)
|
||||
|
||||
elif elem.tag.endswith('package'):
|
||||
if current_files and current_package:
|
||||
packages[current_package] = current_files
|
||||
|
||||
elem.clear()
|
||||
current_package = None
|
||||
current_files = []
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing filelists: {e}")
|
||||
|
||||
return packages
|
||||
return '/man/' in file_path and (
|
||||
'/share/man/' in file_path or file_path.startswith('/usr/man/')
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user