Implement concurrent processing for man file extraction and package downloads

Signed-off-by: Stephen Simpson <ssimpson89@users.noreply.github.com>
This commit is contained in:
Stephen Simpson
2025-01-14 15:38:44 -06:00
parent 8a652fca4f
commit 5248edad62

View File

@@ -11,6 +11,8 @@ from urllib.parse import urljoin
from typing import List, Dict, Any, Callable
from pathlib import Path
from jinja2 import Environment, FileSystemLoader
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
sitemap = {}
@@ -77,21 +79,31 @@ class ManMaker:
for member in rpm.getmembers():
if "/man/" in member.name:
man_file = ManFile(filelocation=extract_dir / member.name)
man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
with open(man_file.filelocation, "wb") as f:
f.write(rpm.extractfile(member).read())
if not man_file.filelocation.exists():
man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
with open(man_file.filelocation, "wb") as f:
f.write(rpm.extractfile(member).read())
man_files.append(man_file)
self.get_man_file_contents(package, man_files)
def get_man_file_contents(self, package: Package, man_files: List[ManFile]):
    """Decompress and render every extracted man page for *package*.

    Fans the per-file work out to a thread pool: decompressing and
    converting man pages is I/O-bound, so threads overlap the waits.

    Args:
        package: Package the man pages belong to.
        man_files: ManFile entries whose ``filelocation`` points at the
            files already extracted from the package archive.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(self.process_man_file, man_file, package)
            for man_file in man_files
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                # Report instead of silently swallowing: one bad man page
                # must not abort the rest, but the failure must be visible.
                print(f"Error processing man file: {e}")
def process_man_file(self, man_file: ManFile, package: Package):
    """Decompress a single man page and hand it to the HTML converter.

    Args:
        man_file: ManFile whose ``filelocation`` is read and whose
            ``man_text`` attribute is populated with the decompressed text.
        package: Package the man page belongs to; passed through to
            ``convert_man_to_html``.
    """
    try:
        man_file.man_text = self.zcat(man_file.filelocation)
        self.convert_man_to_html(man_file, package)
    except gzip.BadGzipFile:
        # Deliberate best-effort: files under /man/ that are not actually
        # gzip-compressed are skipped rather than treated as errors.
        pass
def convert_man_to_html(self, man_file: ManFile, package: Package):
process = subprocess.Popen(
@@ -202,7 +214,6 @@ class RepoManager:
repo.enabled = self.enabled
repo.gpgcheck = self.gpgcheck
self.base.repos.add(repo)
print(f"Repository added: {repo.name}")
self.base.fill_sack(load_system_repo=False, load_available_repos=True)
@@ -272,12 +283,15 @@ class RepoManager:
print(f"Error downloading package: {e}")
return
for package in packages:
download_url = urljoin(package.baseurl, package.location)
download_path = self.download_dir / f"{package.filename}"
package.download_path = download_path
self.download_file(download_url, download_path)
if not download_path.exists():
download_url = urljoin(package.baseurl, package.location)
self.download_file(download_url, download_path)
# Process the package immediately after downloading
print(f"Extracting files from {package.filename}...")
man_maker.extract_man_files(package)
return package
@@ -286,11 +300,14 @@ class RepoManager:
packages = self.list_packages_object()
downloaded_files = []
for package in packages:
try:
downloaded_files.append(self.download_package(package.name, man_maker))
except Exception as e:
print(f"Error downloading package: {e}")
with ThreadPoolExecutor() as executor:
future_to_package = {executor.submit(self.download_package, package.name, man_maker): package for package in packages}
for future in as_completed(future_to_package):
package = future_to_package[future]
try:
downloaded_files.append(future.result())
except Exception as e:
print(f"Error downloading package {package.name}: {e}")
return downloaded_files
@@ -305,8 +322,7 @@ def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path):
json.dump(sorted_sitemap, f)
# Save the gzipped JSON file
gzipped_file_location = f"{json_file_location}.gz"
with gzip.open(gzipped_file_location, "wt") as gz:
with gzip.open(f"{json_file_location}.gz", "wt") as gz:
json.dump(sorted_sitemap, gz)
def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path: