import requests
import dnf
import rpmfile
import pprint as pp
import gzip
import subprocess
import re
import json
import tarfile
from urllib.parse import urljoin
from typing import List, Dict, Any, Callable
from pathlib import Path
from jinja2 import Environment, FileSystemLoader
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
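

# Accumulates one entry per generated man page, keyed by package name and then
# man page name; save_json() writes it out as list.json for each release.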
sitemap = {}
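

# Plain data holder for a single RPM package resolved from the dnf metadata:
# where it lives in the repo (location/baseurl), its checksum and license, and
# where it has been downloaded and extracted locally.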
class Package:
    def __lt__(self, other):
        return self.name < other.name

    def __init__(self, name: str, repo_type: str, chksum: str, location: str, baseurl: str, license: str, download_path: Path = None, extract_dir: Path = None):
        self.name = name
        self.repo_type = repo_type
        self.chksum = chksum
        self.location = location
        self.baseurl = baseurl
        self.filename = location.split("/")[-1]
        self.license = license
        self.download_path = download_path
        self.extract_dir = extract_dir
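

# Represents one compressed man page extracted from an RPM. The section
# ("context", e.g. man1) is taken from the parent directory name, and setting
# html_file_location keeps the relative html_uri_location in sync for the sitemap.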
class ManFile:
    def __init__(self, filelocation: Path):
        self.filelocation = filelocation
        self.filename = self.filelocation.parts[-1]
        self.context = self.filelocation.parts[-2]
        self.context_number = str(''.join(filter(str.isdigit, self.context)))
        self.regular_name = self.filename.replace(".gz", "")
        self.name = ".".join(self.regular_name.split(".")[:-1])
        self.man_text = None
        self.man_html = None
        self.generated_html = None
        self.html_folder_location = None
        self._html_file_location = None
        self.html_uri_location = ""

    @property
    def html_file_location(self):
        return self._html_file_location

    @html_file_location.setter
    def html_file_location(self, value: Path):
        self._html_file_location = value
        if value:
            self.html_uri_location = "/".join(value.parts[2:])
        else:
            self.html_uri_location = ""
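

# Pipeline for one package: extract its man pages from the downloaded RPM,
# convert each page to HTML with mandoc, render it into the Jinja man_page.j2
# template, write the result under html_dir, and record it in the sitemap.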
class ManMaker:
    def __init__(self, man_dir: str, html_dir: str):
        self.man_dir = man_dir
        self.html_dir = html_dir

    def zcat(self, file_path: Path):
        with gzip.open(file_path, 'rb') as f:
            file_content = f.read()
        return file_content.decode('utf-8')

    def extract_man_files(self, package: Package):
        rpm_file = package.download_path.stem

        extract_dir = Path(f"{self.man_dir}/{rpm_file}")
        extract_dir.mkdir(parents=True, exist_ok=True)
        package.extract_dir = extract_dir

        man_files = []
        with rpmfile.open(package.download_path) as rpm:
            for member in rpm.getmembers():
                if "/man/" in member.name:
                    man_file = ManFile(filelocation=extract_dir / member.name)
                    if not man_file.filelocation.exists():
                        man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
                        with open(man_file.filelocation, "wb") as f:
                            f.write(rpm.extractfile(member).read())
                    man_files.append(man_file)

        self.get_man_file_contents(package, man_files)

    def get_man_file_contents(self, package: Package, man_files: List[ManFile]):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.process_man_file, man_file, package) for man_file in man_files]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    # Handle exceptions if needed
                    pass

    def process_man_file(self, man_file: ManFile, package: Package):
        try:
            man_file.man_text = self.zcat(man_file.filelocation)
            self.convert_man_to_html(man_file, package)
        except gzip.BadGzipFile as e:
            # print(f"{e}: {man_file.filelocation}")
            pass
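
    # Pipe the raw troff source through mandoc, asking for an HTML fragment
    # with a table of contents rather than a complete standalone document.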
    def convert_man_to_html(self, man_file: ManFile, package: Package):
        process = subprocess.Popen(
            ['mandoc', '-T', 'html', '-O', 'fragment,toc'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )

        man_file.man_html, stderr = process.communicate(input=man_file.man_text)
        if process.returncode != 0:
            print(f"Error converting man to HTML: {stderr}")
        else:
            self.clean_html(man_file, package)

    def clean_html(self, man_file: ManFile, package: Package):
        man_file.man_html = re.sub(r'<td class="head-ltitle">\(\)</td>', '<td class="head-ltitle"></td>', man_file.man_html)
        man_file.man_html = re.sub(r'<td class="head-rtitle">\(\)</td>', '<td class="head-rtitle"></td>', man_file.man_html)
        man_file.man_html = man_file.man_html.strip()
        self.generate_html(man_file, package)
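
    # A few man pages (e.g. "..1", ":.1", "[.1") have names that make poor or
    # invalid HTML file names; map them to safe replacements before saving.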
    def clean_name(self, man_file: ManFile):
        invalid_filenames = {
            "..1": "..1".replace("..", "__"),
            ":.1": ":.1".replace(":.", "_"),
            "[.1": "[.1".replace("[", "(").replace(".", "_")
        }

        cleaned_name = man_file.regular_name
        if cleaned_name in invalid_filenames:
            cleaned_name = invalid_filenames[cleaned_name]

        return cleaned_name

    def generate_html(self, man_file: ManFile, package: Package):
        env = setup_jinja()
        template = env.get_template("man_page.j2")

        data = {
            'title': f'{man_file.name} - {package.name} - Rocky Man Page',
            'header_title': f'{man_file.name}',
            'main_content': man_file.man_html
        }

        man_file.generated_html = template.render(data)
        self.save_html(man_file, package)

    def save_html(self, man_file: ManFile, package: Package):
        man_file.html_folder_location = html_folder_export(man_file, package, self.html_dir)
        man_file.html_folder_location.mkdir(parents=True, exist_ok=True)

        man_file.html_file_location = man_file.html_folder_location / f"{self.clean_name(man_file)}.html"

        with open(man_file.html_file_location, "w") as f:
            f.write(man_file.generated_html)
        # print(f"Saved HTML to {man_file.html_file_location}")

        self.update_sitemap(man_file, package)

    def update_sitemap(self, man_file: ManFile, package: Package):
        global sitemap
        if package.name not in sitemap:
            sitemap[package.name] = {}
        sitemap[package.name][man_file.name] = {
            "url": str(man_file.html_uri_location),
            "man_type": man_file.context,
            "man_type_number": man_file.context_number,
            "repo_type": package.repo_type,
            "fullname": f"{package.name} - {man_file.name}({man_file.context_number})"
        }
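

# Wraps a dnf.Base configured against the Rocky Linux download mirrors: it
# registers one repository per repo type (e.g. BaseOS, AppStream), lists the
# available packages, downloads their RPMs, and hands each one to a ManMaker.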
class RepoManager:
    def __init__(self, base_url: str, contentdir: str, releasever: str, basearch: str, repo_types: List[str], download_dir, enabled: bool = True, gpgcheck: bool = False):
        self.base_url = base_url
        self.contentdir = contentdir
        self.releasever = releasever
        self.basearch = basearch
        self.repo_type = repo_types

        self.download_dir = download_dir

        self.enabled = enabled
        self.gpgcheck = gpgcheck

        self.base = dnf.Base()
        self.base.conf.debuglevel = 0
        self.base.conf.errorlevel = 0

        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self._configure_repo()
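
    # Repository URLs follow the mirror layout
    # <base_url>/<contentdir>/<releasever>/<repo_type>/<basearch>/os/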
    def generate_repo_url(self, repo_type: str = None):
        repo_url = urljoin(self.base_url, f"{self.contentdir}/{self.releasever}/{repo_type}/{self.basearch}/os/")
        return repo_url

    def print_repo_url(self):
        for repo_type in self.repo_type:
            repo_url = self.generate_repo_url(repo_type)
            print(f"Repository URL: {repo_url}")
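
    # Register one dnf repo per configured repo type, then fill the sack so
    # the package metadata is downloaded and queryable.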
    def _configure_repo(self):
        for repo_type in self.repo_type:
            self.repo_name = f"{repo_type}-{self.releasever}"
            repo = dnf.repo.Repo(self.repo_name, self.base.conf)
            repo_url = self.generate_repo_url(repo_type)
            repo.baseurl = [repo_url]
            repo.enabled = self.enabled
            repo.gpgcheck = self.gpgcheck
            self.base.repos.add(repo)

        self.base.fill_sack(load_system_repo=False, load_available_repos=True)

    def print_repo(self):
        repo = self.base.repos
        print(repo)

    def list_packages(self) -> List[str]:
        package_list = []
        for pkg in self.base.sack.query().available():
            package_list.append(pkg.name)
        return package_list

    def list_packages_raw(self):
        for pkg in self.base.sack.query().available():
            print(f"Package: {pkg.name}")
            for attr in dir(pkg):
                if not attr.startswith("_"):
                    print(f"  {attr}: {getattr(pkg, attr)}")
            print("\n")
            break

    def list_package_object(self, package_name: str) -> List[Package]:
        pkgs = self.base.sack.query().filter(name=package_name)

        if not pkgs:
            raise ValueError(f"Package {package_name} not found in the repository.")

        return self.generate_package_list(pkgs)

    def list_packages_object(self) -> List[Package]:
        pkgs = self.base.sack.query().available()

        if not pkgs:
            raise ValueError("No packages found in the repository.")

        return self.generate_package_list(pkgs)

    def generate_package_list(self, pkgs) -> List[Package]:
        package_list = []
        for pkg in pkgs:
            repo = pkg.repo
            package_info = Package(
                name=getattr(pkg, "name", None),
                repo_type=self.repo_type,
                chksum=getattr(pkg, "chksum", None),
                location=getattr(pkg, "location", None),
                baseurl=repo.baseurl[0] if repo and repo.baseurl else None,
                license=getattr(pkg, "license", None)
            )
            package_list.append(package_info)
        return package_list
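
    # Downloading is done with plain requests against the repo baseurl; a file
    # that already exists on disk is treated as already downloaded and skipped.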
    def download_file(self, download_url: str, download_path: Path):
        if download_path.exists():
            return

        response = requests.get(download_url)
        response.raise_for_status()
        with open(download_path, "wb") as f:
            f.write(response.content)

    def download_package(self, package_name: str, man_maker: ManMaker) -> Package:
        try:
            packages = self.list_package_object(package_name)
        except ValueError as e:
            print(f"Error downloading package: {e}")
            return

        for package in packages:
            download_path = self.download_dir / f"{package.filename}"
            package.download_path = download_path

            if not download_path.exists():
                download_url = urljoin(package.baseurl, package.location)
                self.download_file(download_url, download_path)

            # Process the package immediately after downloading
            print(f"Extracting files from {package.filename}...")
            man_maker.extract_man_files(package)

        return package
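
    # Download and process every available package concurrently; each worker
    # runs download_package(), which also extracts and converts the man pages.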
    def download_all_packages(self, man_maker: ManMaker) -> List[Package]:
        packages = self.list_packages_object()
        downloaded_files = []

        with ThreadPoolExecutor() as executor:
            future_to_package = {executor.submit(self.download_package, package.name, man_maker): package for package in packages}
            for future in as_completed(future_to_package):
                package = future_to_package[future]
                try:
                    downloaded_files.append(future.result())
                except Exception as e:
                    print(f"Error downloading package {package.name}: {e}")

        return downloaded_files

    def delete_package(self, rpm_path: Path):
        rpm_path.unlink()
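

# Write the sitemap, sorted by package name, both as plain JSON and as a
# gzipped copy alongside it.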
def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path):
    sorted_sitemap = {k: sitemap[k] for k in sorted(sitemap)}

    # Save the JSON file
    with open(json_file_location, "w") as f:
        json.dump(sorted_sitemap, f)

    # Save the gzipped JSON file
    with gzip.open(f"{json_file_location}.gz", "wt") as gz:
        json.dump(sorted_sitemap, gz)


def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path:
    return Path(f"{html_base_dir}/{package.name}/{man_file.context}")


def setup_jinja():
    env = Environment(loader=FileSystemLoader('./templates'))
    return env


def generate_index(releasever: str, html_dir: str):
    env = setup_jinja()
    template = env.get_template("index.j2")

    data = {
        'title': f'Rocky Linux {releasever} - Man Page Search',
        'header_title': f'Rocky Linux {releasever} - Man Page Search'
    }

    render = template.render(data)
    with open(f"{html_dir}/index.html", "w") as f:
        f.write(render)
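

# Build the man page site for each configured Rocky Linux release: configure
# the repos, download and convert every package's man pages, then emit the
# search index page and the sitemap JSON for that release.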
def main():
    BASE_URL = "http://dl.rockylinux.org/"
    CONTENTDIR = "pub/rocky"
    RELEASEVERS = ["8.10", "9.5"]
    BASEARCH = "aarch64"
    REPO_TYPES = ["BaseOS", "AppStream"]
    DOWNLOAD_BASE_DIR = "./tmp/repo"
    MAN_BASE_DIR = "./tmp/export"
    HTML_BASE_DIR = "./html"

    for RELEASEVER in RELEASEVERS:
        DOWNLOAD_DIR = f"{DOWNLOAD_BASE_DIR}/{RELEASEVER}"
        MAN_DIR = f"{MAN_BASE_DIR}/{RELEASEVER}"
        HTML_DIR = f"{HTML_BASE_DIR}/{RELEASEVER}"

        repo_manager = RepoManager(
            base_url=BASE_URL,
            contentdir=CONTENTDIR,
            releasever=RELEASEVER,
            basearch=BASEARCH,
            repo_types=REPO_TYPES,
            download_dir=DOWNLOAD_DIR,
            enabled=True,
            gpgcheck=False
        )

        man_maker = ManMaker(man_dir=MAN_DIR, html_dir=HTML_DIR)

        print(f"Downloading packages and generating HTML for {RELEASEVER}...")
        repo_manager.download_all_packages(man_maker)
        # repo_manager.download_package("at", man_maker)

        generate_index(RELEASEVER, HTML_DIR)
        save_json(sitemap, Path(f"{HTML_DIR}/list.json"))


if __name__ == "__main__":
    main()