From 2287678798ea4bf9914a8e202f6f98af393cb944 Mon Sep 17 00:00:00 2001 From: Stephen Simpson Date: Sat, 4 Jan 2025 08:18:27 -0600 Subject: [PATCH] Init --- .gitignore | 8 + README.md | 39 ++++ old_scripts/apply_template.py | 134 ++++++++++++ old_scripts/convert_man.py | 48 +++++ old_scripts/convert_man.sh | 46 ++++ old_scripts/extract_man.sh | 28 +++ old_scripts/generate_index.py | 95 +++++++++ old_scripts/generate_jinja.py | 32 +++ old_scripts/generate_json.py | 54 +++++ old_scripts/index_base.html | 135 ++++++++++++ old_scripts/requirements.txt | 5 + rocky_man.py | 362 ++++++++++++++++++++++++++++++++ rocky_man2.py | 381 ++++++++++++++++++++++++++++++++++ templates/base.j2 | 80 +++++++ templates/index.j2 | 78 +++++++ templates/man_page.j2 | 9 + 16 files changed, 1534 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 old_scripts/apply_template.py create mode 100644 old_scripts/convert_man.py create mode 100755 old_scripts/convert_man.sh create mode 100755 old_scripts/extract_man.sh create mode 100644 old_scripts/generate_index.py create mode 100644 old_scripts/generate_jinja.py create mode 100644 old_scripts/generate_json.py create mode 100644 old_scripts/index_base.html create mode 100644 old_scripts/requirements.txt create mode 100644 rocky_man.py create mode 100644 rocky_man2.py create mode 100644 templates/base.j2 create mode 100644 templates/index.j2 create mode 100644 templates/man_page.j2 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cce1d90 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +downloads/ +export/ +html/ +html_data/ +html_data2/ +repo +rockyman/ +tmp/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..4ade456 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +To create a persistent pod + +``` +podman create -it --name rocky-9-man -v $(pwd):/data/ rockylinux:9 /bin/bash +podman exec -it rocky-9-man /bin/bash +``` + +To create a temp 
pod + +``` +podman run --rm -it -v $(pwd):/data/ rockylinux:9 /bin/bash +``` + +Then `cd /data` + +Install Dependencies + +``` +dnf install -y epel-release +dnf install -y python3 python3-dnf python3-rpm python3-requests python3-pip python3-jinja2 python3-aiohttp python3-zstandard mandoc +``` + +Set alternative python if you need to + +``` +alternatives --set python $(which python3) +``` + +And run +``` +python3 rocky_man.py +``` + +This will download all appstream and baseos for 9.5 and 8.10 into ./tmp and the finished html will be saved to ./html. + +TODO: +- Add async +- Investigate "Error downloading package: 'utf-8' codec can't decode byte 0xe2 in position 220: invalid continuation byte" +- Delete files after they have been processed or at the end diff --git a/old_scripts/apply_template.py b/old_scripts/apply_template.py new file mode 100644 index 0000000..e05bea6 --- /dev/null +++ b/old_scripts/apply_template.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +import sys +import argparse +import re +from bs4 import BeautifulSoup + +# Simplified CSS with meaningful class names +FILTERED_CSS = """ +/* General Styles */ +body { + font-family: Arial, sans-serif; + margin: 0; + padding: 0; + background-color: #0D0A09; + color: white; +} + +/* Header Styles */ +.header { + background-color: #0FB981; + color: white; + padding: 1rem; + text-align: center; +} + +/* Main Content Styles */ +.main-content { + margin: 2rem auto; + padding: 1rem; + background-color: #282828; + color: white; + max-width: 800px; + box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); +} + +.main-content a { + color: #0FB981; +} + +.head-vol { + color: white; +} + +/* Responsive Adjustments */ +@media (max-width: 600px) { + .main-content { + margin: 1rem; + padding: 0.5rem; + } +} +""" + +# Define the HTML template with placeholders for title, nav, left pane, content, and right pane +HTML_TEMPLATE = """ + + + + {file_name} - {rpm_name} - Rocky Man Page + + + +
+

{file_name}

+
+
+ {content} +
+ + +""" + +def clean_html(html_content): + """ + Removes existing , , and tags from the HTML content. + """ + html_content = re.sub(r']*>', '', html_content, flags=re.IGNORECASE) + html_content = re.sub(r']*>', '', html_content, flags=re.IGNORECASE) + html_content = re.sub(r']*>', '', html_content, flags=re.IGNORECASE) + return html_content.strip() + +def add_see_also_links(html_content): + """ + Adds hyperlinks to existing See Also sections in the HTML content. + """ + soup = BeautifulSoup(html_content, 'html.parser') + + # Locate the section + sections = soup.find_all('section', class_='Sh') + + # Loop through sections to find the one with "SEE ALSO" + for section in sections: + heading = section.find('h1', id="SEE_ALSO") # Look for the specific "SEE ALSO" heading + if heading: # If the heading exists in this section + extracted_content = [] + for b_tag in section.find_all('b'): + text_with_parentheses = b_tag.get_text() + b_tag.next_sibling.strip() # Combine text and next sibling + extracted_content.append(text_with_parentheses) + print(extracted_content) + +def main(): + parser = argparse.ArgumentParser(description="Wrap HTML content with a consistent theme including nav, left pane, and right pane.") + parser.add_argument('--rpm_name', type=str, help="RPM Name") + parser.add_argument('--file_name', type=str, help="File Name") + args = parser.parse_args() + + # Read HTML content from stdin + input_html = sys.stdin.read() + + # Extract or set the title + rpm_name = args.rpm_name + file_name = args.file_name + + # Clean the HTML content + cleaned_content = clean_html(input_html) + + # Add See Also links + content_with_links = add_see_also_links(cleaned_content) + + # Fill the HTML template + themed_html = HTML_TEMPLATE.format( + rpm_name=rpm_name, + css=FILTERED_CSS, + file_name=file_name, + content=content_with_links + ) + + # Output the themed HTML to stdout + print(themed_html) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git 
a/old_scripts/convert_man.py b/old_scripts/convert_man.py new file mode 100644 index 0000000..4677fb5 --- /dev/null +++ b/old_scripts/convert_man.py @@ -0,0 +1,48 @@ +import os +import subprocess +from pathlib import Path + +ROCKY_VERSION = "8.10" +MAN_PATH = f"./export/{ROCKY_VERSION}/" +HTML_BASE_PATH = f"./html_data2/{ROCKY_VERSION}/" + +def process_file(file): + rpm_name = file.parts[3] + man_context = file.parts[7] + man_filename = file.name.replace('.gz', '').rsplit('.', 1)[0] + + output_folder = Path(HTML_BASE_PATH) / rpm_name / man_context + output_folder.mkdir(parents=True, exist_ok=True) + + print(man_filename) + + try: + html_content = subprocess.check_output( + f'zcat "{file}" | mandoc -T html -O fragment 2>/tmp/mandoc_error.log | python3 ./apply_template.py --rpm_name "{rpm_name}" --file_name "{man_filename}"', + shell=True, + text=True + ) + except subprocess.CalledProcessError: + print(f"Error processing file: {file}") + with open('/tmp/mandoc_error.log', 'r') as error_log: + print(error_log.read()) + return + + title = "" + for line in html_content.splitlines(): + if '

NAME

' in line: + title = line.split('

')[1].split('

')[0].strip() + break + title = title or man_filename + + if html_content: + with open(output_folder / f"{man_filename}.html", 'w') as f: + f.write(html_content) + +def main(): + for root, _, files in os.walk(MAN_PATH): + for file in files: + process_file(Path(root) / file) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/old_scripts/convert_man.sh b/old_scripts/convert_man.sh new file mode 100755 index 0000000..10b7b5d --- /dev/null +++ b/old_scripts/convert_man.sh @@ -0,0 +1,46 @@ +#! /bin/bash + +ROCKY_VERSION=8.10 +MAN_PATH=./export/${ROCKY_VERSION}/ +LOCAL_MAN_PATH= +HTML_BASE_PATH=./html_data/${ROCKY_VERSION}/ + +process_file() { + local file=$1 + + local rpm_name + rpm_name=$(echo "$file" | cut -d'/' -f 4) + local man_context + man_context=$(echo "$file" | cut -d'/' -f 8) + local man_filename + man_filename=$(echo "$file" | awk -F'/' '{print $NF}' | sed -e 's/.gz//g' -e 's/\.[0-9]*$//g') + + local output_folder="${HTML_BASE_PATH}/${rpm_name}/${man_context}/" + + echo "$man_filename" + + mkdir -p "${output_folder}" + + # Try to convert the file and capture any errors + # if ! html_content=$(zcat "$file" | groff -Thtml -P-D/dev/null -man 2>/tmp/groff_error.log | pandoc -f html -t html 2>/tmp/pandoc_error.log); then + if ! html_content=$(zcat "$file" | mandoc -T html -O fragment 2>/tmp/mandoc_error.log | python3 ./apply_template.py --rpm_name "$rpm_name" --file_name "$man_filename"); then + echo "Error processing file: $file" + cat /tmp/pandoc_error.log + return + fi + + local title + title=$(echo "$html_content" | sed -n 's/.*

NAME<\/h1>\s*

\(.*\)<\/p>/\1/p' | sed 's/<[^>]*>//g') + [ -z "$title" ] && title="$man_filename" + + # Check if html_content is empty + if [ -n "$html_content" ]; then + echo -e "$html_content" > "${output_folder}${man_filename}.html" + # echo -e "---\ntitle: \"$title\"\n---\n$html_content" > "${output_folder}${man_filename}.html" + fi +} + +export -f process_file +export HTML_BASE_PATH + +find "$MAN_PATH" -type f | parallel --will-cite process_file \ No newline at end of file diff --git a/old_scripts/extract_man.sh b/old_scripts/extract_man.sh new file mode 100755 index 0000000..89c16e8 --- /dev/null +++ b/old_scripts/extract_man.sh @@ -0,0 +1,28 @@ +#! /bin/bash + +ROCKY_VERSION=8.10 +MAN_OUTPUT=./export/${ROCKY_VERSION}/ +DIRECTORY=$1 + +if [ -z "$DIRECTORY" ]; then + echo "Please provide the directory containing the RPM files" + exit 1 +fi + +mkdir -p "$MAN_OUTPUT" + +extract_man_pages() { + local rpm=$1 + local man_output=$2 + + MANCOUNT=$(rpm2cpio "$rpm" | cpio -itv --quiet | grep -c "/man/") + RPMNAME=$(rpm -qp --qf "%{NAME}\n" "$rpm") + if [ "$MANCOUNT" -ne 0 ]; then + mkdir -p "${man_output}/${RPMNAME}" + rpm2cpio "$rpm" | cpio -idmv --quiet -D "${man_output}/${RPMNAME}/" '*/man/*' + fi +} + +export -f extract_man_pages + +find "$DIRECTORY" -type f -name "*.rpm" | parallel --will-cite -j+0 extract_man_pages {} "$MAN_OUTPUT" \ No newline at end of file diff --git a/old_scripts/generate_index.py b/old_scripts/generate_index.py new file mode 100644 index 0000000..82b6bfc --- /dev/null +++ b/old_scripts/generate_index.py @@ -0,0 +1,95 @@ +import os +import json +import gzip +from string import Template +from collections import defaultdict +from fnmatch import fnmatch +from jinja2 import Environment, FileSystemLoader + +env = Environment(loader=FileSystemLoader('.')) +template = env.get_template('templates/index.j2') + +directory = '/data/html_data' # Change this to your directory path +rocky_version = "8.10" + +def generate_sitemap(directory): + links = defaultdict(lambda: 
defaultdict(dict)) + for root, _, files in os.walk(directory): + for file in files: + full_filepath = os.path.join(root, file) + filepath = full_filepath.split(rocky_version, 1)[-1] + + if any(fnmatch(filepath, pattern) for pattern in ['/index.html', '/links.html','/list.json*', '/sitemap*']): + continue + + filepath_parts = filepath.split('/') + package_name = filepath_parts[1] + man_type = filepath_parts[2] + man_type_number = man_type.lstrip('man') if man_type.startswith('man') else man_type + command_file = filepath_parts[3] + command = command_file.split('.html', 1)[0] + + if filepath.startswith('/'): + filepath = filepath[1:] + + fullname = f"{package_name} - {command}({man_type_number})" + + links[package_name][command] = { + "url": filepath, + "man_type": man_type, + "man_type_number": man_type_number, + "fullname": fullname + } + + return links + +def generate_links_html(links): + links_html = "" + + for package_name in links.keys(): + links_html += f"

package_name

" + links_html += "" + + data = { + 'title': f"Rocky Man Page - {rocky_version}", + 'header_title': f"Rocky Man Page - {rocky_version}", + 'main_content': f"{links_html}" + } + + return template.render(data) + +def convert_sitemap_to_json(links, minify=False): + # data + # for package_name in links.keys(): + # for command in links[package_name]: + + # # Add command details to sitemap + # sitemap[package_name][command] = { + # "url": filepath, + # "mantype": man_type, + # "fullname": fullname + # } + + if minify: + return json.dumps(links, separators=(',', ':')) + return json.dumps(links, indent=4) + +if __name__ == "__main__": + sitemap = generate_sitemap(directory) + + # Output the links HTML page to a file + with open(f"{directory}/{rocky_version}/links.html", "w") as file: + file.write(generate_links_html(sitemap)) + + # Output the list JSON to a file + with open(f"{directory}/{rocky_version}/list.json", "w") as file: + file.write(convert_sitemap_to_json(sitemap, minify=True)) + + # Gzip the JSON file + with gzip.open(f"{directory}/{rocky_version}/list.json.gz", "wb") as f_out: + f_out.write(convert_sitemap_to_json(sitemap, minify=True).encode('utf-8')) \ No newline at end of file diff --git a/old_scripts/generate_jinja.py b/old_scripts/generate_jinja.py new file mode 100644 index 0000000..5499a42 --- /dev/null +++ b/old_scripts/generate_jinja.py @@ -0,0 +1,32 @@ +from jinja2 import Environment, FileSystemLoader +import os + +env = Environment(loader=FileSystemLoader('.')) +template = env.get_template('page.j2') + +# Define the data to pass to the template +data = { + 'title': 'Rocky Man Page - 8.10', + 'header_title': 'Welcome to Rocky Man Page', + 'main_content': '' +} + + + + + + + + + + + + + + + + +# Render the template with the data +output = template.render(data) + +print(output) \ No newline at end of file diff --git a/old_scripts/generate_json.py b/old_scripts/generate_json.py new file mode 100644 index 0000000..d293091 --- /dev/null +++ 
# (patch residue) b/old_scripts/generate_json.py @@ -0,0 +1,54 @@
import os
import json
import argparse
from collections import defaultdict

rocky_version = "8.10"

# Generated artifacts that live alongside the man-page HTML; any path that
# contains one of these names is excluded from the sitemap.
_EXCLUDED_NAMES = ('index.html', 'sitemap.json', 'sitemap.xml', 'list.json', 'list.json.br')

def create_sitemap(directory):
    """Walk *directory* and build a nested sitemap dict.

    Expects the layout <...>/<rocky_version>/<package>/<manN>/<command>.html
    somewhere under *directory*.

    Returns:
        {package_name: {command: {"url", "mantype", "fullname"}}}
    """
    sitemap = defaultdict(lambda: defaultdict(dict))
    for root, dirs, files in os.walk(directory):
        for file in files:
            full_filepath = os.path.join(root, file)
            # Everything after the first occurrence of the release version is
            # the site-relative path.
            filepath = full_filepath.split(rocky_version, 1)[-1]

            # Exclude generated index/sitemap artifacts.
            if any(name in filepath for name in _EXCLUDED_NAMES):
                continue

            filepath_parts = filepath.split('/')
            package_name = filepath_parts[1]
            man_type = filepath_parts[2]
            # BUGFIX: str.lstrip('man') strips the *character set* {m, a, n},
            # not the prefix, and would mangle any section directory whose
            # name continues with those letters. Strip the literal prefix.
            man_type_number = man_type[3:] if man_type.startswith('man') else man_type
            command_file = filepath_parts[3]
            command = command_file.split('.html', 1)[0]

            if filepath.startswith('/'):
                filepath = filepath[1:]

            fullname = f"{package_name} - {command}({man_type_number})"

            # Add command details to sitemap
            sitemap[package_name][command] = {
                "url": filepath,
                "mantype": man_type,
                "fullname": fullname
            }

    return sitemap

def convert_sitemap_to_json(sitemap, minify=False):
    """Serialize the sitemap to JSON; compact separators when *minify* is set."""
    if minify:
        return json.dumps(sitemap, separators=(',', ':'))
    return json.dumps(sitemap, indent=4)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Generate sitemap JSON.')
    parser.add_argument('directory', type=str, help='Directory to scan for HTML files')
    parser.add_argument('--minify', action='store_true', help='Export minified JSON')
    args = parser.parse_args()

    sitemap = create_sitemap(args.directory)
    json_output = convert_sitemap_to_json(sitemap, minify=args.minify)

    print(json_output)
# (patch residue) \ No newline at end of file
# (patch residue) diff --git a/old_scripts/index_base.html new file mode 100644
# (patch residue) index 0000000..60ca883 --- /dev/null +++ b/old_scripts/index_base.html
# (patch residue) @@ -0,0 +1,135 @@ + + + + + + + Rocky Man
Page - 8.10 + + + + + +
+

Rocky Linux 8.10 - Man Page Listing

+
+
+ + +
+
    +
    + + + + + \ No newline at end of file diff --git a/old_scripts/requirements.txt b/old_scripts/requirements.txt new file mode 100644 index 0000000..c34c622 --- /dev/null +++ b/old_scripts/requirements.txt @@ -0,0 +1,5 @@ +beautifulsoup4==4.12.3 +Jinja2==3.1.4 +MarkupSafe==3.0.2 +setuptools==68.2.2 +soupsieve==2.6 diff --git a/rocky_man.py b/rocky_man.py new file mode 100644 index 0000000..4b29a53 --- /dev/null +++ b/rocky_man.py @@ -0,0 +1,362 @@ +import requests +import dnf +import rpmfile +import pprint as pp +import gzip +import subprocess +import re +import json +import tarfile +from urllib.parse import urljoin +from typing import List, Dict, Any, Callable +from pathlib import Path +from jinja2 import Environment, FileSystemLoader + +sitemap = {} + +class Package: + def __init__(self, name: str, repo_type: str, chksum: str, location: str, baseurl: str, license: str, download_path: Path = None, extract_dir: Path = None): + self.name = name + self.repo_type = repo_type + self.chksum = chksum + self.location = location + self.baseurl = baseurl + self.filename = location.split("/")[-1] + self.license = license + self.download_path = download_path + self.extract_dir = extract_dir + +class ManFile: + def __init__(self, filelocation: Path): + self.filelocation = filelocation + self.filename = self.filelocation.parts[-1] + self.context = self.filelocation.parts[-2] + self.context_number = str(''.join(filter(str.isdigit, self.context))) + self.regular_name = self.filename.replace(".gz","") + self.name = ".".join(self.regular_name.split(".")[:-1]) + self.man_text = None + self.man_html = None + self.generated_html = None + self.html_folder_location = None + self._html_file_location = None + self.html_uri_location = "" + + @property + def html_file_location(self): + return self._html_file_location + + @html_file_location.setter + def html_file_location(self, value: Path): + self._html_file_location = value + if value: + self.html_uri_location = 
"/".join(value.parts[2:]) + else: + self.html_uri_location = "" + +class ManMaker: + def __init__(self, man_dir: str, html_dir: str): + self.man_dir = man_dir + self.html_dir = html_dir + + def zcat(self, file_path: Path): + with gzip.open(file_path, 'rb') as f: + file_content = f.read() + return file_content.decode('utf-8') + + def extract_man_files(self, package: Package): + rpm_file = package.download_path.stem + + extract_dir = Path(f"{self.man_dir}/{rpm_file}") + extract_dir.mkdir(parents=True, exist_ok=True) + package.extract_dir = extract_dir + + man_files = [] + with rpmfile.open(package.download_path) as rpm: + for member in rpm.getmembers(): + if "/man/" in member.name: + man_file = ManFile(filelocation=extract_dir / member.name) + man_file.filelocation.parent.mkdir(parents=True, exist_ok=True) + with open(man_file.filelocation, "wb") as f: + f.write(rpm.extractfile(member).read()) + man_files.append(man_file) + + self.get_man_file_contents(package, man_files) + + def get_man_file_contents(self, package: Package, man_files: List[ManFile]): + for man_file in man_files: + try: + man_file.man_text = self.zcat(man_file.filelocation) + self.convert_man_to_html(man_file, package) + except gzip.BadGzipFile as e: + # print(f"{e}: {man_file.filelocation}") + pass + + def convert_man_to_html(self, man_file: ManFile, package: Package): + process = subprocess.Popen( + ['mandoc', '-T', 'html', '-O', 'fragment,toc'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + man_file.man_html, stderr = process.communicate(input=man_file.man_text) + if process.returncode != 0: + print(f"Error converting man to HTML: {stderr}") + else: + self.clean_html(man_file, package) + + def clean_html(self, man_file: ManFile, package: Package): + man_file.man_html = re.sub(r'\(\)', '', man_file.man_html) + man_file.man_html = re.sub(r'\(\)', '', man_file.man_html) + man_file.man_html.strip() + self.generate_html(man_file, package) + + def 
def clean_name(self, man_file: "ManFile"):
    """Return a filesystem-safe file stem for *man_file*.

    A few man pages have names that are awkward or invalid as file names
    ("..1", ":.1", "[.1"); those map to safe equivalents. Everything else
    passes through unchanged.
    """
    # Literal results of the old replace() chains: ".." -> "__",
    # ":." -> "_", "[" -> "(" and "." -> "_".
    invalid_filenames = {
        "..1": "__1",
        ":.1": "_1",
        "[.1": "(_1",
    }
    return invalid_filenames.get(man_file.regular_name, man_file.regular_name)

def generate_html(self, man_file: "ManFile", package: "Package"):
    """Render the converted man-page HTML into the site template, then save."""
    env = setup_jinja()
    template = env.get_template("man_page.j2")

    data = {
        'title': f'{man_file.name} - {package.name} - Rocky Man Page',
        'header_title': f'{man_file.name}',
        'main_content': man_file.man_html
    }

    man_file.generated_html = template.render(data)
    self.save_html(man_file, package)

def save_html(self, man_file: "ManFile", package: "Package"):
    """Write the rendered page to <html_dir>/<package>/<manN>/<name>.html."""
    man_file.html_folder_location = html_folder_export(man_file, package, self.html_dir)
    man_file.html_folder_location.mkdir(parents=True, exist_ok=True)

    man_file.html_file_location = man_file.html_folder_location / f"{self.clean_name(man_file)}.html"

    with open(man_file.html_file_location, "w") as f:
        f.write(man_file.generated_html)

    self.update_sitemap(man_file, package)

def update_sitemap(self, man_file: "ManFile", package: "Package"):
    """Record this page in the module-level sitemap used to emit list.json."""
    global sitemap
    if package.name not in sitemap:
        sitemap[package.name] = {}
    sitemap[package.name][man_file.name] = {
        "url": str(man_file.html_uri_location),
        "man_type": man_file.context,
        "man_type_number": man_file.context_number,
        "repo_type": package.repo_type,
        "fullname": f"{package.name} - {man_file.name}({man_file.context_number})"
    }

class RepoManager:
    """Wraps a single dnf repository (one repo_type of one release) and
    downloads its packages so their man pages can be extracted."""

    def __init__(self, base_url: str, contentdir: str, releasever: str, basearch: str, repo_type: str, download_dir, enabled: bool = True, gpgcheck: bool = False):
        self.base_url = base_url
        self.contentdir = contentdir
        self.releasever = releasever
        self.basearch = basearch
        self.repo_type = repo_type
        self.repo_name = f"{repo_type}-{releasever}"

        self.enabled = enabled
        self.gpgcheck = gpgcheck

        # Quiet dnf's console output.
        self.base = dnf.Base()
        self.base.conf.debuglevel = 0
        self.base.conf.errorlevel = 0

        # (the earlier plain-string assignment of download_dir was dead --
        # it was immediately overwritten by this Path version)
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self._configure_repo()

    def generate_repo_url(self):
        """Build the repo baseurl for this repo_type/release/arch.

        BUGFIX: the path previously hard-coded "BaseOS", so the AppStream
        pass silently re-scanned BaseOS; use the configured repo type.
        """
        return urljoin(self.base_url, f"{self.contentdir}/{self.releasever}/{self.repo_type}/{self.basearch}/os/")

    def print_repo_url(self):
        """Debug helper: show the computed repository URL."""
        print(f"Repository URL: {self.generate_repo_url()}")

    def _configure_repo(self):
        """Register the repo with dnf and load its metadata (no system repos)."""
        repo = dnf.repo.Repo(self.repo_name, self.base.conf)
        repo.baseurl = [self.generate_repo_url()]
        repo.enabled = self.enabled
        repo.gpgcheck = self.gpgcheck
        self.base.repos.add(repo)
        self.base.fill_sack(load_system_repo=False)

    def print_repo(self):
        """Debug helper: show the configured repo collection."""
        print(self.base.repos)

    def list_packages(self) -> List[str]:
        """Names of every available package in the repo."""
        return [pkg.name for pkg in self.base.sack.query().available()]

    def list_packages_raw(self):
        """Debug helper: dump every public attribute of the first package."""
        for pkg in self.base.sack.query().available():
            print(f"Package: {pkg.name}")
            for attr in dir(pkg):
                if not attr.startswith("_"):
                    print(f"  {attr}: {getattr(pkg, attr)}")
            print("\n")
            break

    def list_package_object(self, package_name: str) -> "List[Package]":
        """Package objects matching *package_name*; ValueError if absent."""
        pkgs = self.base.sack.query().filter(name=package_name)
        if not pkgs:
            raise ValueError(f"Package {package_name} not found in the repository.")
        return self.generate_package_list(pkgs)

    def list_packages_object(self) -> "List[Package]":
        """Package objects for everything available; ValueError if repo is empty."""
        pkgs = self.base.sack.query().available()
        if not pkgs:
            raise ValueError(f"No packages found in the repository.")
        return self.generate_package_list(pkgs)

    def generate_package_list(self, pkgs) -> "List[Package]":
        """Convert dnf package records into our lightweight Package objects."""
        package_list = []
        for pkg in pkgs:
            repo = pkg.repo
            package_list.append(Package(
                name=getattr(pkg, "name", None),
                repo_type=self.repo_type,
                chksum=getattr(pkg, "chksum", None),
                location=getattr(pkg, "location", None),
                baseurl=repo.baseurl[0] if repo and repo.baseurl else None,
                license=getattr(pkg, "license", None)
            ))
        return package_list

    def download_file(self, download_url: str, download_path: Path):
        """Download *download_url* to *download_path*; existing files are kept.

        Streams in chunks (RPMs can be large) and bounds the wait with a
        timeout so a dead mirror cannot hang the run forever.
        """
        if download_path.exists():
            return

        with requests.get(download_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(download_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)

    def download_package(self, package_name: str, man_maker: "ManMaker") -> "Package":
        """Download every package matching *package_name* and extract its man pages.

        Returns the last matching Package (mirrors the original behavior).
        """
        packages = self.list_package_object(package_name)
        for package in packages:
            download_url = urljoin(package.baseurl, package.location)
            package.download_path = self.download_dir / f"{package.filename}"
            self.download_file(download_url, package.download_path)
            # Process the package immediately after downloading
            man_maker.extract_man_files(package)
        return package

    def download_all_packages(self, man_maker: "ManMaker") -> "List[Package]":
        """Download and process the whole repo; per-package errors are logged, not fatal."""
        downloaded_files = []
        for package in self.list_packages_object():
            try:
                downloaded_files.append(self.download_package(package.name, man_maker))
            except Exception as e:
                print(f"Error downloading package: {e}")
        return downloaded_files

    def delete_package(self, rpm_path: Path):
        """Remove a downloaded rpm from disk."""
        rpm_path.unlink()

def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path):
    """Write the sitemap as JSON (plus a gzipped copy) with sorted package keys."""
    sorted_sitemap = {k: sitemap[k] for k in sorted(sitemap)}

    with open(json_file_location, "w") as f:
        json.dump(sorted_sitemap, f)

    with gzip.open(f"{json_file_location}.gz", "wt") as gz:
        json.dump(sorted_sitemap, gz)

def html_folder_export(man_file: "ManFile", package: "Package", html_base_dir: str) -> Path:
    """Folder that holds the rendered pages for one package/man-section."""
    return Path(f"{html_base_dir}/{package.name}/{man_file.context}")
+def setup_jinja(): + env = Environment(loader=FileSystemLoader('./templates')) + return env + +def generate_index(releasever: str, html_dir: str): + env = setup_jinja() + template = env.get_template("index.j2") + + data = { + 'title': f'Rocky Linux {releasever} - Man Page Search', + 'header_title': f'Rocky Linux {releasever} - Man Page Search' + } + + render = template.render(data) + with open(f"{html_dir}/index.html", "w") as f: + f.write(render) + +def main(): + BASE_URL = "http://dl.rockylinux.org/" + CONTENTDIR = "pub/rocky" + RELEASEVERS = ["8.10", "9.5"] + BASEARCH = "aarch64" + REPO_TYPES = ["BaseOS", "AppStream"] + DOWNLOAD_BASE_DIR = "./tmp/repo" + MAN_BASE_DIR = "./tmp/export" + HTML_BASE_DIR = "./html" + + for RELEASEVER in RELEASEVERS: + for REPO_TYPE in REPO_TYPES: + DOWNLOAD_DIR = f"{DOWNLOAD_BASE_DIR}/{RELEASEVER}/{REPO_TYPE}" + MAN_DIR = f"{MAN_BASE_DIR}/{RELEASEVER}/{REPO_TYPE}" + HTML_DIR = f"{HTML_BASE_DIR}/{RELEASEVER}" + + repo_manager = RepoManager( + base_url = BASE_URL, + contentdir = CONTENTDIR, + releasever = RELEASEVER, + basearch = BASEARCH, + repo_type = REPO_TYPE, + download_dir = DOWNLOAD_DIR, + enabled = True, + gpgcheck = False + ) + + man_maker = ManMaker(man_dir=MAN_DIR, html_dir=HTML_DIR) + + print("Downloading packages and generating HTML...") + repo_manager.download_all_packages(man_maker) + # repo_manager.download_package("at", man_maker) + + generate_index(RELEASEVER, HTML_DIR) + save_json(sitemap, Path(f"{HTML_DIR}/list.json")) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/rocky_man2.py b/rocky_man2.py new file mode 100644 index 0000000..a526ee9 --- /dev/null +++ b/rocky_man2.py @@ -0,0 +1,381 @@ +import asyncio +import aiohttp +import aiofiles +import dnf +import rpmfile +import pprint as pp +import gzip +import subprocess +import re +import json +import tarfile +from urllib.parse import urljoin +from typing import List, Dict, Any, Callable +from pathlib import Path +from jinja2 import 
Environment, FileSystemLoader + +sitemap = {} + +class Package: + def __init__(self, name: str, repo_type: str, chksum: str, location: str, baseurl: str, license: str, download_path: Path = None, extract_dir: Path = None): + self.name = name + self.repo_type = repo_type + self.chksum = chksum + self.location = location + self.baseurl = baseurl + self.filename = location.split("/")[-1] + self.license = license + self.download_path = download_path + self.extract_dir = extract_dir + +class ManFile: + def __init__(self, filelocation: Path): + self.filelocation = filelocation + self.filename = self.filelocation.parts[-1] + self.context = self.filelocation.parts[-2] + self.context_number = str(''.join(filter(str.isdigit, self.context))) + self.regular_name = self.filename.replace(".gz","") + self.name = ".".join(self.regular_name.split(".")[:-1]) + self.man_text = None + self.man_html = None + self.generated_html = None + self.html_folder_location = None + self._html_file_location = None + self.html_uri_location = "" + + @property + def html_file_location(self): + return self._html_file_location + + @html_file_location.setter + def html_file_location(self, value: Path): + self._html_file_location = value + if value: + self.html_uri_location = "/".join(value.parts[2:]) + else: + self.html_uri_location = "" + +class ManMaker: + def __init__(self, man_dir: str, html_dir: str): + self.man_dir = man_dir + self.html_dir = html_dir + + async def zcat(self, file_path: Path): + async with aiofiles.open(file_path, 'rb') as f: + content = await f.read() + try: + return gzip.decompress(content).decode('utf-8') + except gzip.BadGzipFile: + return None + + async def extract_man_files(self, package: Package): + rpm_file = package.download_path.stem + + extract_dir = Path(f"{self.man_dir}/{rpm_file}") + extract_dir.mkdir(parents=True, exist_ok=True) + package.extract_dir = extract_dir + + man_files = [] + with rpmfile.open(package.download_path) as rpm: + for member in rpm.getmembers(): 
+ if "/man/" in member.name: + man_file = ManFile(filelocation=extract_dir / member.name) + man_file.filelocation.parent.mkdir(parents=True, exist_ok=True) + async with aiofiles.open(man_file.filelocation, "wb") as f: + await f.write(rpm.extractfile(member).read()) + man_files.append(man_file) + + await self.get_man_file_contents(package, man_files) + + async def get_man_file_contents(self, package: Package, man_files: List[ManFile]): + tasks = [self.process_man_file(man_file, package) for man_file in man_files] + await asyncio.gather(*tasks) + + async def process_man_file(self, man_file: ManFile, package: Package): + try: + man_file.man_text = await self.zcat(man_file.filelocation) + if man_file.man_text: + await self.convert_man_to_html(man_file, package) + except Exception as e: + print(f"Error processing {man_file.filelocation}: {e}") + + async def convert_man_to_html(self, man_file: ManFile, package: Package): + process = await asyncio.create_subprocess_exec( + 'mandoc', '-T', 'html', '-O', 'fragment,toc', + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + + stdout, stderr = await process.communicate(input=man_file.man_text.encode()) + man_file.man_html = stdout.decode() + + if process.returncode == 0: + await self.clean_html(man_file, package) + else: + print(f"Error converting man to HTML: {stderr.decode()}") + + async def clean_html(self, man_file: ManFile, package: Package): + man_file.man_html = re.sub(r'\(\)', '', man_file.man_html) + man_file.man_html = re.sub(r'\(\)', '', man_file.man_html) + man_file.man_html.strip() + await self.generate_html(man_file, package) + + def clean_name(self, man_file: ManFile): + invalid_filenames = { + "..1": "..1".replace("..", "__"), + ":.1": ":.1".replace(":.", "_"), + "[.1": "[.1".replace("[", "(").replace(".", "_") + } + + cleaned_name = man_file.regular_name + if cleaned_name in invalid_filenames: + cleaned_name = invalid_filenames[cleaned_name] + + return 
cleaned_name + + async def generate_html(self, man_file: ManFile, package: Package): + env = setup_jinja() + template = env.get_template("man_page.j2") + + data = { + 'title': f'{man_file.name} - {package.name} - Rocky Man Page', + 'header_title': f'{man_file.name}', + 'main_content': man_file.man_html + } + + man_file.generated_html = template.render(data) + await self.save_html(man_file, package) + + async def save_html(self, man_file: ManFile, package: Package): + man_file.html_folder_location = html_folder_export(man_file, package, self.html_dir) + man_file.html_folder_location.mkdir(parents=True, exist_ok=True) + + man_file.html_file_location = man_file.html_folder_location / f"{self.clean_name(man_file)}.html" + + async with aiofiles.open(man_file.html_file_location, "w") as f: + await f.write(man_file.generated_html) + + self.update_sitemap(man_file, package) + + def update_sitemap(self, man_file: ManFile, package: Package): + global sitemap + if package.name not in sitemap: + sitemap[package.name] = {} + sitemap[package.name][man_file.name] = { + "url": str(man_file.html_uri_location), + "man_type": man_file.context, + "man_type_number": man_file.context_number, + "repo_type": package.repo_type, + "fullname": f"{package.name} - {man_file.name}({man_file.context_number})" + } + +class RepoManager: + def __init__(self, base_url: str, contentdir: str, releasever: str, basearch: str, repo_type: str, download_dir, enabled: bool = True, gpgcheck: bool = False): + self.base_url = base_url + self.contentdir = contentdir + self.releasever = releasever + self.basearch = basearch + self.repo_type = repo_type + self.repo_name = f"{repo_type}-{releasever}" + + self.download_dir = download_dir + + self.enabled = enabled + self.gpgcheck = gpgcheck + + self.base = dnf.Base() + self.base.conf.debuglevel = 0 + self.base.conf.errorlevel = 0 + + self.download_dir = Path(download_dir) + self.download_dir.mkdir(parents=True, exist_ok=True) + self._configure_repo() + self.session 
= None + + async def __aenter__(self): + self.session = aiohttp.ClientSession() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self.session: + await self.session.close() + + def generate_repo_url(self): + repo_url = urljoin(self.base_url, f"{self.contentdir}/{self.releasever}/BaseOS/{self.basearch}/os/") + return repo_url + + def print_repo_url(self): + repo_url = self.generate_repo_url() + print(f"Repository URL: {repo_url}") + + def _configure_repo(self): + repo = dnf.repo.Repo(self.repo_name, self.base.conf) + repo_url = self.generate_repo_url() + repo.baseurl = [repo_url] + repo.enabled = self.enabled + repo.gpgcheck = self.gpgcheck + self.base.repos.add(repo) + self.base.fill_sack(load_system_repo=False) + + def print_repo(self): + repo = self.base.repos + print(repo) + + def list_packages(self) -> List[str]: + package_list = [] + for pkg in self.base.sack.query().available(): + package_list.append(pkg.name) + return package_list + + def list_packages_raw(self): + for pkg in self.base.sack.query().available(): + print(f"Package: {pkg.name}") + for attr in dir(pkg): + if not attr.startswith("_"): + print(f" {attr}: {getattr(pkg, attr)}") + print("\n") + break + + def list_package_object(self, package_name: str) -> List[Package]: + pkgs = self.base.sack.query().filter(name=package_name) + + if not pkgs: + raise ValueError(f"Package {package_name} not found in the repository.") + + return self.generate_package_list(pkgs) + + def list_packages_object(self) -> List[Package]: + pkgs = self.base.sack.query().available() + + if not pkgs: + raise ValueError(f"No packages found in the repository.") + + return self.generate_package_list(pkgs) + + def generate_package_list(self, pkgs) -> List[Package]: + package_list = [] + for pkg in pkgs: + repo = pkg.repo + package_info = Package( + name=getattr(pkg, "name", None), + repo_type=self.repo_type, + chksum=getattr(pkg, "chksum", None), + location=getattr(pkg, "location", None), + 
baseurl=repo.baseurl[0] if repo and repo.baseurl else None, + license=getattr(pkg, "license", None) + ) + package_list.append(package_info) + return package_list + + async def download_file(self, download_url: str, download_path: Path): + if download_path.exists(): + return + + async with self.session.get(download_url) as response: + response.raise_for_status() + async with aiofiles.open(download_path, "wb") as f: + await f.write(await response.read()) + + async def download_package(self, package_name: str, man_maker: ManMaker) -> Package: + packages = self.list_package_object(package_name) + + for package in packages: + download_url = urljoin(package.baseurl, package.location) + download_path = self.download_dir / f"{package.filename}" + package.download_path = download_path + await self.download_file(download_url, download_path) + + await man_maker.extract_man_files(package) + + return package + + async def download_all_packages(self, man_maker: ManMaker) -> List[Package]: + packages = self.list_packages_object() + tasks = [] + + for package in packages: + try: + tasks.append(self.download_package(package.name, man_maker)) + except Exception as e: + print(f"Error queueing package: {e}") + + return await asyncio.gather(*tasks) + + def delete_package(self, rpm_path: Path): + rpm_path.unlink() + +async def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path): + sorted_sitemap = {k: sitemap[k] for k in sorted(sitemap)} + + async with aiofiles.open(json_file_location, "w") as f: + await f.write(json.dumps(sorted_sitemap)) + + gzipped_file_location = f"{json_file_location}.gz" + with gzip.open(gzipped_file_location, "wt") as gz: + json.dump(sorted_sitemap, gz) + +def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path: + return Path(f"{html_base_dir}/{package.name}/{man_file.context}") + +def setup_jinja(): + env = Environment(loader=FileSystemLoader('./templates')) + return env + +async def generate_index(releasever: 
str, html_dir: str): + env = setup_jinja() + template = env.get_template("index.j2") + + data = { + 'title': f'Rocky Linux {releasever} - Man Page Search', + 'header_title': f'Rocky Linux {releasever} - Man Page Search' + } + + render = template.render(data) + async with aiofiles.open(f"{html_dir}/index.html", "w") as f: + await f.write(render) + +async def process_repo(base_url: str, contentdir: str, releasever: str, basearch: str, + repo_type: str, download_dir: str, man_dir: str, html_dir: str): + async with RepoManager( + base_url=base_url, + contentdir=contentdir, + releasever=releasever, + basearch=basearch, + repo_type=repo_type, + download_dir=download_dir, + enabled=True, + gpgcheck=False + ) as repo_manager: + man_maker = ManMaker(man_dir=man_dir, html_dir=html_dir) + print(f"Processing {repo_type} for {releasever}...") + await repo_manager.download_all_packages(man_maker) + +async def main(): + BASE_URL = "https://ord.mirror.rackspace.com/" + CONTENTDIR = "rocky" + RELEASEVERS = ["8.10", "9.5"] + BASEARCH = "aarch64" + REPO_TYPES = ["BaseOS", "AppStream"] + DOWNLOAD_BASE_DIR = "./tmp/repo" + MAN_BASE_DIR = "./tmp/export" + HTML_BASE_DIR = "./html" + + for RELEASEVER in RELEASEVERS: + tasks = [] + for REPO_TYPE in REPO_TYPES: + DOWNLOAD_DIR = f"{DOWNLOAD_BASE_DIR}/{RELEASEVER}/{REPO_TYPE}" + MAN_DIR = f"{MAN_BASE_DIR}/{RELEASEVER}/{REPO_TYPE}" + HTML_DIR = f"{HTML_BASE_DIR}/{RELEASEVER}" + + tasks.append(process_repo( + BASE_URL, CONTENTDIR, RELEASEVER, BASEARCH, + REPO_TYPE, DOWNLOAD_DIR, MAN_DIR, HTML_DIR + )) + + await asyncio.gather(*tasks) + await generate_index(RELEASEVER, HTML_DIR) + await save_json(sitemap, Path(f"{HTML_DIR}/list.json")) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/templates/base.j2 b/templates/base.j2 new file mode 100644 index 0000000..32ad293 --- /dev/null +++ b/templates/base.j2 @@ -0,0 +1,80 @@ + + + + + + {{ title }} + + + + + + + {% block body %} + {% endblock %} + + + \ No 
newline at end of file diff --git a/templates/index.j2 b/templates/index.j2 new file mode 100644 index 0000000..72a1ef1 --- /dev/null +++ b/templates/index.j2 @@ -0,0 +1,78 @@ +{% extends "base.j2" %} +{% block extra_css %} + input#searchInput { + width: 100%; + height: 2rem; + padding: 0.5rem; + border-radius: 4px; + border: 1px solid #ccc; + margin-bottom: 1rem; + font-size: 1rem; + outline: none; + transition: border-color 0.3s ease, box-shadow 0.3s ease; + } + + input#searchInput:focus { + border-color: #0FB981; + box-shadow: 0 0 8px 0 #0FB981; + } + + #searchInputLabel { + display: block; + font-size: larger; + margin-bottom: 1rem; + } +{% endblock %} +{% block body %} +
    +

    {{ header_title }}

    +
    +
    + + +
    +

    +
      +
      + +{% endblock %} \ No newline at end of file diff --git a/templates/man_page.j2 b/templates/man_page.j2 new file mode 100644 index 0000000..d459fd9 --- /dev/null +++ b/templates/man_page.j2 @@ -0,0 +1,9 @@ +{% extends "base.j2" %} +{% block body %} +
      +

      {{ header_title }}

      +
      +
      + {{ main_content }} +
      +{% endblock %} \ No newline at end of file