Implement concurrent processing for man file extraction and package downloads

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
This commit is contained in:
Stephen Simpson
2025-01-14 15:38:44 -06:00
parent 8a652fca4f
commit 5248edad62

View File

@@ -11,6 +11,8 @@ from urllib.parse import urljoin
from typing import List, Dict, Any, Callable from typing import List, Dict, Any, Callable
from pathlib import Path from pathlib import Path
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
sitemap = {} sitemap = {}
@@ -77,22 +79,32 @@ class ManMaker:
for member in rpm.getmembers(): for member in rpm.getmembers():
if "/man/" in member.name: if "/man/" in member.name:
man_file = ManFile(filelocation=extract_dir / member.name) man_file = ManFile(filelocation=extract_dir / member.name)
man_file.filelocation.parent.mkdir(parents=True, exist_ok=True) if not man_file.filelocation.exists():
with open(man_file.filelocation, "wb") as f: man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
f.write(rpm.extractfile(member).read()) with open(man_file.filelocation, "wb") as f:
f.write(rpm.extractfile(member).read())
man_files.append(man_file) man_files.append(man_file)
self.get_man_file_contents(package, man_files) self.get_man_file_contents(package, man_files)
def get_man_file_contents(self, package: Package, man_files: List[ManFile]):
    """Process every extracted man file of ``package`` concurrently.

    Each entry of ``man_files`` is submitted to ``self.process_man_file``
    on a thread pool. Processing is best-effort: a failure on one file is
    reported and the remaining files are still processed.

    Args:
        package: The package the man files were extracted from; passed
            through unchanged to ``process_man_file``.
        man_files: The man files to process.
    """
    if not man_files:
        # Nothing to process; skip creating a thread pool.
        return

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each future back to its man file so a failure can name it.
        future_to_man_file = {
            executor.submit(self.process_man_file, man_file, package): man_file
            for man_file in man_files
        }
        for future in concurrent.futures.as_completed(future_to_man_file):
            man_file = future_to_man_file[future]
            try:
                future.result()
            except Exception as e:
                # Keep going, but surface the failure instead of silently
                # discarding it so broken pages can be diagnosed.
                print(f"Error processing man file {man_file.filelocation}: {e}")
def process_man_file(self, man_file: ManFile, package: Package):
    """Read one man file and convert it to HTML.

    The text returned by ``self.zcat`` for ``man_file.filelocation`` is
    stored on ``man_file.man_text``, then ``self.convert_man_to_html`` is
    called. A file that is not valid gzip is skipped silently.

    Args:
        man_file: The man file to read; its ``man_text`` is set on success.
        package: The owning package, passed to ``convert_man_to_html``.
    """
    try:
        man_file.man_text = self.zcat(man_file.filelocation)
    except gzip.BadGzipFile:
        # Not a valid gzip archive: skip this file, there is nothing to convert.
        return

    # Kept outside the try so only the gzip read is covered by the skip above.
    self.convert_man_to_html(man_file, package)
def convert_man_to_html(self, man_file: ManFile, package: Package): def convert_man_to_html(self, man_file: ManFile, package: Package):
process = subprocess.Popen( process = subprocess.Popen(
['mandoc', '-T', 'html', '-O', 'fragment,toc'], ['mandoc', '-T', 'html', '-O', 'fragment,toc'],
@@ -202,7 +214,6 @@ class RepoManager:
repo.enabled = self.enabled repo.enabled = self.enabled
repo.gpgcheck = self.gpgcheck repo.gpgcheck = self.gpgcheck
self.base.repos.add(repo) self.base.repos.add(repo)
print(f"Repository added: {repo.name}")
self.base.fill_sack(load_system_repo=False, load_available_repos=True) self.base.fill_sack(load_system_repo=False, load_available_repos=True)
@@ -272,12 +283,15 @@ class RepoManager:
print(f"Error downloading package: {e}") print(f"Error downloading package: {e}")
return return
for package in packages: for package in packages:
download_url = urljoin(package.baseurl, package.location)
download_path = self.download_dir / f"{package.filename}" download_path = self.download_dir / f"{package.filename}"
package.download_path = download_path package.download_path = download_path
self.download_file(download_url, download_path)
if not download_path.exists():
download_url = urljoin(package.baseurl, package.location)
self.download_file(download_url, download_path)
# Process the package immediately after downloading # Process the package immediately after downloading
print(f"Extracting files from {package.filename}...")
man_maker.extract_man_files(package) man_maker.extract_man_files(package)
return package return package
@@ -286,11 +300,14 @@ class RepoManager:
packages = self.list_packages_object() packages = self.list_packages_object()
downloaded_files = [] downloaded_files = []
for package in packages: with ThreadPoolExecutor() as executor:
try: future_to_package = {executor.submit(self.download_package, package.name, man_maker): package for package in packages}
downloaded_files.append(self.download_package(package.name, man_maker)) for future in as_completed(future_to_package):
except Exception as e: package = future_to_package[future]
print(f"Error downloading package: {e}") try:
downloaded_files.append(future.result())
except Exception as e:
print(f"Error downloading package {package.name}: {e}")
return downloaded_files return downloaded_files
@@ -305,8 +322,7 @@ def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path):
json.dump(sorted_sitemap, f) json.dump(sorted_sitemap, f)
# Save the gzipped JSON file # Save the gzipped JSON file
gzipped_file_location = f"{json_file_location}.gz" with gzip.open(f"{json_file_location}.gz", "wt") as gz:
with gzip.open(gzipped_file_location, "wt") as gz:
json.dump(sorted_sitemap, gz) json.dump(sorted_sitemap, gz)
def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path: def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path: