Implement concurrent processing for man file extraction and package downloads

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
2025-01-14 15:38:44 -06:00
parent 8a652fca4f
commit 5248edad62
1 changed file with 39 additions and 23 deletions
--- a/rocky_man.py
+++ b/rocky_man.py
@@ -11,6 +11,8 @@ from urllib.parse import urljoin
 from typing import List, Dict, Any, Callable
 from pathlib import Path
 from jinja2 import Environment, FileSystemLoader
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor, as_completed
 sitemap = {}
@@ -77,21 +79,31 @@ class ManMaker:
            for member in rpm.getmembers():
                if "/man/" in member.name:                    
                    man_file = ManFile(filelocation=extract_dir / member.name)
-                    man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
+                    if not man_file.filelocation.exists():
-                    with open(man_file.filelocation, "wb") as f:
+                        man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
-                        f.write(rpm.extractfile(member).read())
+                        with open(man_file.filelocation, "wb") as f:
                            f.write(rpm.extractfile(member).read())
                    man_files.append(man_file)
        self.get_man_file_contents(package, man_files)
    def get_man_file_contents(self, package: Package, man_files: List[ManFile]):
-        for man_file in man_files:
+        with concurrent.futures.ThreadPoolExecutor() as executor:
-            try:
+            futures = [executor.submit(self.process_man_file, man_file, package) for man_file in man_files]
-                man_file.man_text = self.zcat(man_file.filelocation)
+            for future in concurrent.futures.as_completed(futures):
-                self.convert_man_to_html(man_file, package)
+                try:
-            except gzip.BadGzipFile as e:
+                    future.result()
-                # print(f"{e}: {man_file.filelocation}")
+                except Exception as e:
-                pass
+                    # Handle exceptions if needed
                    pass
    def process_man_file(self, man_file: ManFile, package: Package):
        try:
            man_file.man_text = self.zcat(man_file.filelocation)
            self.convert_man_to_html(man_file, package)
        except gzip.BadGzipFile as e:
            # print(f"{e}: {man_file.filelocation}")
            pass
    def convert_man_to_html(self, man_file: ManFile, package: Package):
        process = subprocess.Popen(
@@ -202,7 +214,6 @@ class RepoManager:
            repo.enabled = self.enabled
            repo.gpgcheck = self.gpgcheck
            self.base.repos.add(repo)
            print(f"Repository added: {repo.name}")
        self.base.fill_sack(load_system_repo=False, load_available_repos=True)
@@ -272,12 +283,15 @@ class RepoManager:
            print(f"Error downloading package: {e}")
            return
        for package in packages:
            download_url = urljoin(package.baseurl, package.location)
            download_path = self.download_dir / f"{package.filename}"
            package.download_path = download_path
-            self.download_file(download_url, download_path)
+            
            if not download_path.exists():
                download_url = urljoin(package.baseurl, package.location)
                self.download_file(download_url, download_path)
            # Process the package immediately after downloading
            print(f"Extracting files from {package.filename}...")
            man_maker.extract_man_files(package)
            return package
@@ -286,11 +300,14 @@ class RepoManager:
        packages = self.list_packages_object()
        downloaded_files = []
-        for package in packages:
+        with ThreadPoolExecutor() as executor:
-            try:
+            future_to_package = {executor.submit(self.download_package, package.name, man_maker): package for package in packages}
-                downloaded_files.append(self.download_package(package.name, man_maker))
+            for future in as_completed(future_to_package):
-            except Exception as e:
+                package = future_to_package[future]
-                print(f"Error downloading package: {e}")
+                try:
                    downloaded_files.append(future.result())
                except Exception as e:
                    print(f"Error downloading package {package.name}: {e}")
        return downloaded_files
@@ -305,8 +322,7 @@ def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path):
        json.dump(sorted_sitemap, f)
    # Save the gzipped JSON file
-    gzipped_file_location = f"{json_file_location}.gz"
+    with gzip.open(f"{json_file_location}.gz", "wt") as gz:
    with gzip.open(gzipped_file_location, "wt") as gz:
        json.dump(sorted_sitemap, gz)
 def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path: