Implement concurrent processing for man file extraction and package downloads

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
This commit is contained in:
Stephen Simpson
2025-01-14 15:38:44 -06:00
parent 8a652fca4f
commit 5248edad62

View File

@@ -11,6 +11,8 @@ from urllib.parse import urljoin
from typing import List, Dict, Any, Callable
from pathlib import Path
from jinja2 import Environment, FileSystemLoader
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
sitemap = {}
@@ -77,21 +79,31 @@ class ManMaker:
for member in rpm.getmembers():
if "/man/" in member.name:
man_file = ManFile(filelocation=extract_dir / member.name)
man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
with open(man_file.filelocation, "wb") as f:
f.write(rpm.extractfile(member).read())
if not man_file.filelocation.exists():
man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
with open(man_file.filelocation, "wb") as f:
f.write(rpm.extractfile(member).read())
man_files.append(man_file)
self.get_man_file_contents(package, man_files)
def get_man_file_contents(self, package: Package, man_files: List[ManFile]):
    """Decompress and render every extracted man page for *package*.

    Fans the per-file work out to a thread pool: decompressing and
    converting man pages is I/O-bound, so threads overlap the waits.

    Args:
        package: Package the man pages belong to.
        man_files: ManFile entries whose ``filelocation`` points at the
            files already extracted from the package archive.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(self.process_man_file, man_file, package)
            for man_file in man_files
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                # Report instead of silently swallowing: one bad man page
                # must not abort the rest, but the failure must be visible.
                print(f"Error processing man file: {e}")
def process_man_file(self, man_file: ManFile, package: Package):
    """Decompress a single man page and hand it to the HTML converter.

    Args:
        man_file: ManFile whose ``filelocation`` is read and whose
            ``man_text`` attribute is populated with the decompressed text.
        package: Package the man page belongs to; passed through to
            ``convert_man_to_html``.
    """
    try:
        man_file.man_text = self.zcat(man_file.filelocation)
        self.convert_man_to_html(man_file, package)
    except gzip.BadGzipFile:
        # Deliberate best-effort: files under /man/ that are not actually
        # gzip-compressed are skipped rather than treated as errors.
        pass
def convert_man_to_html(self, man_file: ManFile, package: Package):
process = subprocess.Popen(
@@ -202,7 +214,6 @@ class RepoManager:
repo.enabled = self.enabled
repo.gpgcheck = self.gpgcheck
self.base.repos.add(repo)
print(f"Repository added: {repo.name}")
self.base.fill_sack(load_system_repo=False, load_available_repos=True)
@@ -272,12 +283,15 @@ class RepoManager:
print(f"Error downloading package: {e}")
return
for package in packages:
download_url = urljoin(package.baseurl, package.location)
download_path = self.download_dir / f"{package.filename}"
package.download_path = download_path
self.download_file(download_url, download_path)
if not download_path.exists():
download_url = urljoin(package.baseurl, package.location)
self.download_file(download_url, download_path)
# Process the package immediately after downloading
print(f"Extracting files from {package.filename}...")
man_maker.extract_man_files(package)
return package
@@ -286,11 +300,14 @@ class RepoManager:
packages = self.list_packages_object()
downloaded_files = []
for package in packages:
try:
downloaded_files.append(self.download_package(package.name, man_maker))
except Exception as e:
print(f"Error downloading package: {e}")
with ThreadPoolExecutor() as executor:
future_to_package = {executor.submit(self.download_package, package.name, man_maker): package for package in packages}
for future in as_completed(future_to_package):
package = future_to_package[future]
try:
downloaded_files.append(future.result())
except Exception as e:
print(f"Error downloading package {package.name}: {e}")
return downloaded_files
@@ -305,8 +322,7 @@ def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path):
json.dump(sorted_sitemap, f)
# Save the gzipped JSON file
gzipped_file_location = f"{json_file_location}.gz"
with gzip.open(gzipped_file_location, "wt") as gz:
with gzip.open(f"{json_file_location}.gz", "wt") as gz:
json.dump(sorted_sitemap, gz)
def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path: