This commit is contained in:
Stephen Simpson
2025-01-04 08:18:27 -06:00
commit 2287678798
16 changed files with 1534 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@@ -0,0 +1,8 @@
downloads/
export/
html/
html_data/
html_data2/
repo
rockyman/
tmp/

39
README.md Normal file
View File

@@ -0,0 +1,39 @@
To create a persistent container
```
podman create -it --name rocky-9-man -v $(pwd):/data/ rockylinux:9 /bin/bash
podman exec -it rocky-9-man /bin/bash
```
To create a temporary container
```
podman run --rm -it -v $(pwd):/data/ rockylinux:9 /bin/bash
```
Then `cd /data`
Install Dependencies
```
dnf install -y epel-release
dnf install -y python3 python3-dnf python3-rpm python3-requests python3-pip python3-jinja2 python3-aiohttp python3-zstandard mandoc
```
Set alternative python if you need to
```
alternatives --set python $(which python3)
```
And run
```
python3 rocky_man.py
```
This will download all AppStream and BaseOS packages for 9.5 and 8.10 into ./tmp, and the finished HTML will be saved to ./html.
TODO:
- Add async
- Investigate "Error downloading package: 'utf-8' codec can't decode byte 0xe2 in position 220: invalid continuation byte"
- Delete files after they have been processed or at the end

View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
import sys
import argparse
import re
from bs4 import BeautifulSoup
# Simplified CSS with meaningful class names
FILTERED_CSS = """
/* General Styles */
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 0;
background-color: #0D0A09;
color: white;
}
/* Header Styles */
.header {
background-color: #0FB981;
color: white;
padding: 1rem;
text-align: center;
}
/* Main Content Styles */
.main-content {
margin: 2rem auto;
padding: 1rem;
background-color: #282828;
color: white;
max-width: 800px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
.main-content a {
color: #0FB981;
}
.head-vol {
color: white;
}
/* Responsive Adjustments */
@media (max-width: 600px) {
.main-content {
margin: 1rem;
padding: 0.5rem;
}
}
"""
# Define the HTML template with placeholders for title, nav, left pane, content, and right pane
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{file_name} - {rpm_name} - Rocky Man Page</title>
<style>
{css}
</style>
</head>
<body>
<header class="header">
<h1>{file_name}</h1>
</header>
<main class="main-content">
{content}
</main>
</body>
</html>
"""
def clean_html(html_content):
    """Strip any outer <html>, <head>, and <body> wrapper tags.

    mandoc/groff output is embedded into our own template, so the wrapper
    tags must go. Returns the remaining markup with surrounding whitespace
    trimmed.
    """
    for tag in ("html", "head", "body"):
        html_content = re.sub(rf'</?{tag}[^>]*>', '', html_content, flags=re.IGNORECASE)
    return html_content.strip()
def add_see_also_links(html_content):
    """
    Collect cross references from the "SEE ALSO" section and return the HTML.

    Scans <section class="Sh"> blocks for the heading with id="SEE_ALSO" and
    prints the referenced page names (e.g. "foo(1)"). Linkification itself is
    still a TODO; for now the content is returned unchanged.

    Bug fixed: the original returned None, so the caller embedded the literal
    string "None" into the rendered page instead of the man page body. Also
    guards against <b> tags whose next sibling is a Tag or missing, which
    previously raised AttributeError.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for section in soup.find_all('section', class_='Sh'):
        heading = section.find('h1', id="SEE_ALSO")  # only the SEE ALSO section
        if not heading:
            continue
        extracted_content = []
        for b_tag in section.find_all('b'):
            sibling = b_tag.next_sibling
            # next_sibling may be None or a Tag; only plain text can be stripped
            suffix = sibling.strip() if isinstance(sibling, str) else ""
            extracted_content.append(b_tag.get_text() + suffix)
        print(extracted_content)
    return html_content
def main():
    """Read an HTML fragment on stdin, wrap it in the site template, print it.

    Command line: --rpm_name and --file_name feed the page <title> and header.
    """
    parser = argparse.ArgumentParser(description="Wrap HTML content with a consistent theme including nav, left pane, and right pane.")
    parser.add_argument('--rpm_name', type=str, help="RPM Name")
    parser.add_argument('--file_name', type=str, help="File Name")
    args = parser.parse_args()
    # Read HTML content from stdin
    input_html = sys.stdin.read()
    # Clean the HTML content of wrapper tags
    cleaned_content = clean_html(input_html)
    # add_see_also_links historically returned None (it only printed);
    # fall back to the cleaned content so "None" is never rendered.
    content_with_links = add_see_also_links(cleaned_content) or cleaned_content
    # Fill the HTML template
    themed_html = HTML_TEMPLATE.format(
        rpm_name=args.rpm_name,
        css=FILTERED_CSS,
        file_name=args.file_name,
        content=content_with_links
    )
    # Output the themed HTML to stdout
    print(themed_html)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,48 @@
import os
import subprocess
from pathlib import Path
ROCKY_VERSION = "8.10"
MAN_PATH = f"./export/{ROCKY_VERSION}/"
HTML_BASE_PATH = f"./html_data2/{ROCKY_VERSION}/"
def process_file(file):
    """Convert one gzipped man page to HTML via zcat | mandoc | apply_template.

    Path layout assumption: parts[3] is the rpm name and parts[7] is the man
    section directory (mirrors the extract layout) -- confirm if it changes.

    Changes: shell arguments are now quoted with shlex.quote (filenames with
    spaces or shell metacharacters previously broke the command or injected
    into the shell), and the unused `title` extraction was removed.
    """
    import shlex

    rpm_name = file.parts[3]
    man_context = file.parts[7]
    man_filename = file.name.replace('.gz', '').rsplit('.', 1)[0]
    output_folder = Path(HTML_BASE_PATH) / rpm_name / man_context
    output_folder.mkdir(parents=True, exist_ok=True)
    print(man_filename)
    cmd = (
        f'zcat {shlex.quote(str(file))}'
        f' | mandoc -T html -O fragment 2>/tmp/mandoc_error.log'
        f' | python3 ./apply_template.py'
        f' --rpm_name {shlex.quote(rpm_name)}'
        f' --file_name {shlex.quote(man_filename)}'
    )
    try:
        html_content = subprocess.check_output(cmd, shell=True, text=True)
    except subprocess.CalledProcessError:
        print(f"Error processing file: {file}")
        # Surface mandoc's diagnostics for the failed page
        with open('/tmp/mandoc_error.log', 'r') as error_log:
            print(error_log.read())
        return
    if html_content:
        with open(output_folder / f"{man_filename}.html", 'w') as f:
            f.write(html_content)
def main():
    """Walk MAN_PATH and hand every file found to process_file()."""
    for dirpath, _dirs, filenames in os.walk(MAN_PATH):
        base = Path(dirpath)
        for name in filenames:
            process_file(base / name)


if __name__ == "__main__":
    main()

46
old_scripts/convert_man.sh Executable file
View File

@@ -0,0 +1,46 @@
#!/bin/bash
# Convert extracted gzipped man pages to themed HTML using GNU parallel.
# Layout assumption: field 4 of each path is the rpm name and field 8 is the
# man section directory (mirrors extract_man.sh output) -- confirm if changed.
ROCKY_VERSION=8.10
MAN_PATH=./export/${ROCKY_VERSION}/
HTML_BASE_PATH=./html_data/${ROCKY_VERSION}/

process_file() {
    local file=$1
    local rpm_name
    rpm_name=$(echo "$file" | cut -d'/' -f 4)
    local man_context
    man_context=$(echo "$file" | cut -d'/' -f 8)
    local man_filename
    # Basename without .gz and without the trailing numeric section suffix
    man_filename=$(echo "$file" | awk -F'/' '{print $NF}' | sed -e 's/.gz//g' -e 's/\.[0-9]*$//g')
    local output_folder="${HTML_BASE_PATH}/${rpm_name}/${man_context}/"
    echo "$man_filename"
    mkdir -p "${output_folder}"
    # Try to convert the file; mandoc diagnostics go to the log printed below
    if ! html_content=$(zcat "$file" | mandoc -T html -O fragment 2>/tmp/mandoc_error.log | python3 ./apply_template.py --rpm_name "$rpm_name" --file_name "$man_filename"); then
        echo "Error processing file: $file"
        # Bug fix: previously printed /tmp/pandoc_error.log, which is never
        # written by this pipeline; the mandoc log is the one that exists.
        cat /tmp/mandoc_error.log
        return
    fi
    local title
    title=$(echo "$html_content" | sed -n 's/.*<h1>NAME<\/h1>\s*<p>\(.*\)<\/p>/\1/p' | sed 's/<[^>]*>//g')
    [ -z "$title" ] && title="$man_filename"
    # Skip writing when conversion produced no output
    if [ -n "$html_content" ]; then
        echo -e "$html_content" > "${output_folder}${man_filename}.html"
    fi
}
export -f process_file
export HTML_BASE_PATH
find "$MAN_PATH" -type f | parallel --will-cite process_file

28
old_scripts/extract_man.sh Executable file
View File

@@ -0,0 +1,28 @@
#! /bin/bash
# Extract man pages from every RPM under the given directory into
# ./export/<version>/<rpm-name>/ using GNU parallel.
ROCKY_VERSION=8.10
MAN_OUTPUT=./export/${ROCKY_VERSION}/
# Usage: extract_man.sh <directory-containing-rpm-files>
DIRECTORY=$1
if [ -z "$DIRECTORY" ]; then
    echo "Please provide the directory containing the RPM files"
    exit 1
fi
mkdir -p "$MAN_OUTPUT"
# Extract only entries under a /man/ path from one rpm, if it ships any.
extract_man_pages() {
    local rpm=$1
    local man_output=$2
    # Count man-page entries first so rpms without man pages are skipped
    MANCOUNT=$(rpm2cpio "$rpm" | cpio -itv --quiet | grep -c "/man/")
    RPMNAME=$(rpm -qp --qf "%{NAME}\n" "$rpm")
    if [ "$MANCOUNT" -ne 0 ]; then
        mkdir -p "${man_output}/${RPMNAME}"
        rpm2cpio "$rpm" | cpio -idmv --quiet -D "${man_output}/${RPMNAME}/" '*/man/*'
    fi
}
export -f extract_man_pages
find "$DIRECTORY" -type f -name "*.rpm" | parallel --will-cite -j+0 extract_man_pages {} "$MAN_OUTPUT"

View File

@@ -0,0 +1,95 @@
import os
import json
import gzip
from string import Template
from collections import defaultdict
from fnmatch import fnmatch
from jinja2 import Environment, FileSystemLoader
env = Environment(loader=FileSystemLoader('.'))
template = env.get_template('templates/index.j2')
directory = '/data/html_data' # Change this to your directory path
rocky_version = "8.10"
def generate_sitemap(directory):
    """Index the generated HTML tree as {package: {command: details}}.

    Expects paths shaped like <...>/<rocky_version>/<package>/<manN>/<cmd>.html.
    """
    skip_patterns = ('/index.html', '/links.html', '/list.json*', '/sitemap*')
    links = defaultdict(lambda: defaultdict(dict))
    for root, _, files in os.walk(directory):
        for name in files:
            rel = os.path.join(root, name).split(rocky_version, 1)[-1]
            # Ignore the site's own metadata files
            if any(fnmatch(rel, pattern) for pattern in skip_patterns):
                continue
            parts = rel.split('/')
            package_name, man_type, command_file = parts[1], parts[2], parts[3]
            man_type_number = man_type.lstrip('man') if man_type.startswith('man') else man_type
            command = command_file.split('.html', 1)[0]
            url = rel[1:] if rel.startswith('/') else rel
            links[package_name][command] = {
                "url": url,
                "man_type": man_type,
                "man_type_number": man_type_number,
                "fullname": f"{package_name} - {command}({man_type_number})"
            }
    return links
def generate_links_html(links):
    """Render the links page listing every package and its man pages.

    Bug fixed: the heading f-string was missing braces, so every heading
    rendered as the literal text "package_name". Also builds the markup with
    str.join instead of repeated string concatenation.
    """
    parts = []
    for package_name, commands in links.items():
        parts.append(f"<h2>{package_name}</h2>")
        parts.append("<ul>")
        for command, info in commands.items():
            parts.append(f"<li><a href='{info['url']}'>{command}</a>({info['man_type_number']})</li>")
        parts.append("</ul>")
    data = {
        'title': f"Rocky Man Page - {rocky_version}",
        'header_title': f"Rocky Man Page - {rocky_version}",
        'main_content': "".join(parts)
    }
    return template.render(data)
def convert_sitemap_to_json(links, minify=False):
    """Serialize the sitemap dict to JSON.

    minify=True emits the most compact form (no whitespace) for shipping to
    the browser; otherwise pretty-prints with 4-space indentation.
    (Removed a stale commented-out draft of the sitemap loop.)
    """
    if minify:
        return json.dumps(links, separators=(',', ':'))
    return json.dumps(links, indent=4)
if __name__ == "__main__":
    # Build the package/command index from the generated HTML tree
    sitemap = generate_sitemap(directory)
    # Output the links HTML page to a file
    with open(f"{directory}/{rocky_version}/links.html", "w") as file:
        file.write(generate_links_html(sitemap))
    # Output the list JSON to a file
    with open(f"{directory}/{rocky_version}/list.json", "w") as file:
        file.write(convert_sitemap_to_json(sitemap, minify=True))
    # Gzip the JSON file (index.html decompresses it client-side)
    with gzip.open(f"{directory}/{rocky_version}/list.json.gz", "wb") as f_out:
        f_out.write(convert_sitemap_to_json(sitemap, minify=True).encode('utf-8'))

View File

@@ -0,0 +1,32 @@
from jinja2 import Environment, FileSystemLoader
import os
# Jinja environment rooted at the current working directory
env = Environment(loader=FileSystemLoader('.'))
template = env.get_template('page.j2')
# Define the data to pass to the template
data = {
    'title': 'Rocky Man Page - 8.10',
    'header_title': 'Welcome to Rocky Man Page',
    'main_content': '<input type="text" id="searchInput" placeholder="Search..."><ul><li>Item 1</li><li>Item 2</li></ul>'
}
# Render the template with the data
output = template.render(data)
# Write the rendered page to stdout (redirect to a file to publish it)
print(output)

View File

@@ -0,0 +1,54 @@
import os
import json
import argparse
from collections import defaultdict
rocky_version = "8.10"
def create_sitemap(directory):
    """Index generated man-page HTML files as {package: {command: details}}.

    Expects paths shaped like <...>/<rocky_version>/<package>/<manN>/<cmd>.html.
    """
    excluded = ('index.html', 'sitemap.json', 'sitemap.xml', 'list.json', 'list.json.br')
    sitemap = defaultdict(lambda: defaultdict(dict))
    for root, _dirs, files in os.walk(directory):
        for name in files:
            rel = os.path.join(root, name).split(rocky_version, 1)[-1]
            # Exclude the site's own metadata files
            if any(marker in rel for marker in excluded):
                continue
            parts = rel.split('/')
            package_name, man_type, command_file = parts[1], parts[2], parts[3]
            man_type_number = man_type.lstrip('man') if man_type.startswith('man') else man_type
            command = command_file.split('.html', 1)[0]
            if rel.startswith('/'):
                rel = rel[1:]
            # Add command details to sitemap
            sitemap[package_name][command] = {
                "url": rel,
                "mantype": man_type,
                "fullname": f"{package_name} - {command}({man_type_number})"
            }
    return sitemap
def convert_sitemap_to_json(sitemap, minify=False):
    """Serialize the sitemap dict to a JSON string (compact when minify)."""
    kwargs = {'separators': (',', ':')} if minify else {'indent': 4}
    return json.dumps(sitemap, **kwargs)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Generate sitemap JSON.')
    parser.add_argument('directory', type=str, help='Directory to scan for HTML files')
    parser.add_argument('--minify', action='store_true', help='Export minified JSON')
    args = parser.parse_args()
    # Scan the HTML tree and print the resulting sitemap JSON to stdout
    sitemap = create_sitemap(args.directory)
    json_output = convert_sitemap_to_json(sitemap, minify=args.minify)
    print(json_output)

135
old_scripts/index_base.html Normal file
View File

@@ -0,0 +1,135 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 36 36%22><text y=%2232%22 font-size=%2232%22>🚀</text></svg>">
<title>Rocky Man Page - 8.10</title>
<script src="https://cdn.jsdelivr.net/npm/fuse.js/dist/fuse.min.js"></script>
<style>
/* General Styles */
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 0;
background-color: #0D0A09;
color: white;
}
li {
font-size: large;
list-style-type: none;
margin-bottom: 0.5rem;
}
/* Header Styles */
.header {
background-color: #0FB981;
color: white;
padding: 1rem;
text-align: center;
}
/* Main Content Styles */
.main-content {
margin: 2rem auto;
padding: 1rem;
background-color: #282828;
color: white;
max-width: 800px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
.main-content a {
color: #0FB981;
}
.head-vol {
color: white;
}
/* Responsive Adjustments */
@media (max-width: 600px) {
.main-content {
margin: 1rem;
padding: 0.5rem;
}
}
input#searchInput {
width: 98%;
height: 2rem;
padding: 0.5rem;
border-radius: 4px;
border: 1px solid #ccc;
margin-bottom: 1rem;
font-size: 1rem;
outline: none;
transition: border-color 0.3s ease, box-shadow 0.3s ease;
}
input#searchInput:focus {
border-color: #0FB981;
box-shadow: 0 0 8px 0 #0FB981;
}
#searchInputLabel {
display: block;
font-size: larger;
margin-bottom: 1rem;
}
</style>
</head>
<body>
<header class="header">
<h1>Rocky Linux 8.10 - Man Page Listing</h1>
</header>
<main class="main-content">
<label id="searchInputLabel" for="searchInput">Search:</label>
<input id="searchInput" placeholder="Loading..." oninput="searchItems()" role="search" disabled />
<br />
<ul id="results"></ul>
</main>
<script>
    // Fuse.js search state; populated once the index download completes
    let fuse;
    let index;
    // Fetch the pre-gzipped package index and decompress it in the browser.
    // NOTE(review): no .catch -- a failed fetch leaves the input disabled
    // forever; confirm whether an error message is wanted here.
    fetch('list.json.gz')
        .then(response => response.body.pipeThrough(new DecompressionStream('gzip')))
        .then(stream => new Response(stream))
        .then(response => response.json())
        .then(data => {
            // Flatten {package: {command: item}} into one array for Fuse
            const flattenedData = [];
            Object.values(data).forEach(category => {
                Object.values(category).forEach(item => {
                    flattenedData.push(item);
                });
            });
            fuse = new Fuse(flattenedData, {
                keys: ['fullname'],
                threshold: 0.2
            });
            index = fuse.index; // NOTE(review): binding appears unused -- confirm before removing
            // Enable the search box now that the index is ready
            document.getElementById("searchInput").placeholder = "";
            document.getElementById("searchInput").disabled = false;
        });
    // Re-run the fuzzy search on every keystroke (oninput on #searchInput)
    function searchItems() {
        const query = document.getElementById("searchInput").value;
        const results = fuse.search(query, { limit: 50 }); // Limit results for performance
        const list = document.getElementById("results");
        list.innerHTML = "";
        results.forEach(item => {
            const li = document.createElement("li");
            const a = document.createElement("a");
            a.href = item.item.url;
            a.textContent = item.item.fullname;
            li.appendChild(a);
            list.appendChild(li);
        });
    }
</script>
</body>
</html>

View File

@@ -0,0 +1,5 @@
beautifulsoup4==4.12.3
Jinja2==3.1.4
MarkupSafe==3.0.2
setuptools==68.2.2
soupsieve==2.6

362
rocky_man.py Normal file
View File

@@ -0,0 +1,362 @@
import requests
import dnf
import rpmfile
import pprint as pp
import gzip
import subprocess
import re
import json
import tarfile
from urllib.parse import urljoin
from typing import List, Dict, Any, Callable
from pathlib import Path
from jinja2 import Environment, FileSystemLoader
sitemap = {}
class Package:
    # Lightweight record describing one RPM in a repository and where its
    # downloaded/extracted artifacts live on disk.
    def __init__(self, name: str, repo_type: str, chksum: str, location: str, baseurl: str, license: str, download_path: Path = None, extract_dir: Path = None):
        """Capture repo metadata; filename is derived from the location path."""
        self.name = name
        self.repo_type = repo_type
        self.chksum = chksum
        self.location = location  # repo-relative path to the .rpm
        self.baseurl = baseurl
        self.filename = location.split("/")[-1]  # basename of the rpm file
        self.license = license
        self.download_path = download_path  # set once the rpm is downloaded
        self.extract_dir = extract_dir  # set once man pages are extracted
class ManFile:
    # One gzipped man page extracted from an RPM, plus the locations of the
    # HTML generated from it.
    def __init__(self, filelocation: Path):
        self.filelocation = filelocation
        self.filename = self.filelocation.parts[-1]  # e.g. "bash.1.gz"
        self.context = self.filelocation.parts[-2]  # man section dir, e.g. "man1"
        self.context_number = str(''.join(filter(str.isdigit, self.context)))  # e.g. "1"
        self.regular_name = self.filename.replace(".gz","")  # e.g. "bash.1"
        self.name = ".".join(self.regular_name.split(".")[:-1])  # e.g. "bash"
        self.man_text = None  # decompressed roff source
        self.man_html = None  # mandoc HTML fragment
        self.generated_html = None  # fully templated page
        self.html_folder_location = None
        self._html_file_location = None
        self.html_uri_location = ""
    @property
    def html_file_location(self):
        """Path of the written HTML file; setting it also derives the site URI."""
        return self._html_file_location
    @html_file_location.setter
    def html_file_location(self, value: Path):
        self._html_file_location = value
        if value:
            # Drop the first two path components (e.g. "./html") to get a
            # site-relative URI -- assumes the html dir is two levels deep; confirm.
            self.html_uri_location = "/".join(value.parts[2:])
        else:
            self.html_uri_location = ""
class ManMaker:
    """Extracts man pages from downloaded RPMs and renders them to HTML."""

    def __init__(self, man_dir: str, html_dir: str):
        self.man_dir = man_dir  # where extracted .gz man pages are written
        self.html_dir = html_dir  # where generated HTML pages are written

    def zcat(self, file_path: Path) -> str:
        """Decompress a gzipped file and decode it as UTF-8 text."""
        with gzip.open(file_path, 'rb') as f:
            return f.read().decode('utf-8')

    def extract_man_files(self, package: Package):
        """Pull every /man/ member out of the package's RPM, then process them."""
        rpm_file = package.download_path.stem
        extract_dir = Path(f"{self.man_dir}/{rpm_file}")
        extract_dir.mkdir(parents=True, exist_ok=True)
        package.extract_dir = extract_dir
        man_files = []
        with rpmfile.open(package.download_path) as rpm:
            for member in rpm.getmembers():
                if "/man/" in member.name:
                    man_file = ManFile(filelocation=extract_dir / member.name)
                    man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
                    with open(man_file.filelocation, "wb") as f:
                        f.write(rpm.extractfile(member).read())
                    man_files.append(man_file)
        self.get_man_file_contents(package, man_files)

    def get_man_file_contents(self, package: Package, man_files: List[ManFile]):
        """Decompress each man file and kick off HTML conversion."""
        for man_file in man_files:
            try:
                man_file.man_text = self.zcat(man_file.filelocation)
                self.convert_man_to_html(man_file, package)
            except gzip.BadGzipFile:
                # Some files under /man/ are not actually gzipped; skip them.
                pass

    def convert_man_to_html(self, man_file: ManFile, package: Package):
        """Run mandoc to turn roff source into an HTML fragment with a TOC."""
        process = subprocess.Popen(
            ['mandoc', '-T', 'html', '-O', 'fragment,toc'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        man_file.man_html, stderr = process.communicate(input=man_file.man_text)
        if process.returncode != 0:
            print(f"Error converting man to HTML: {stderr}")
        else:
            self.clean_html(man_file, package)

    def clean_html(self, man_file: ManFile, package: Package):
        """Blank out empty "()" title cells mandoc emits, then render the page."""
        man_file.man_html = re.sub(r'<td class="head-ltitle">\(\)</td>', '<td class="head-ltitle"></td>', man_file.man_html)
        man_file.man_html = re.sub(r'<td class="head-rtitle">\(\)</td>', '<td class="head-rtitle"></td>', man_file.man_html)
        # Bug fix: str.strip() returns a new string; the original discarded it.
        man_file.man_html = man_file.man_html.strip()
        self.generate_html(man_file, package)

    def clean_name(self, man_file: ManFile) -> str:
        """Map man page names that are invalid or awkward as filenames."""
        invalid_filenames = {
            "..1": "__1",
            ":.1": "_1",
            "[.1": "(_1"
        }
        return invalid_filenames.get(man_file.regular_name, man_file.regular_name)

    def generate_html(self, man_file: ManFile, package: Package):
        """Render the man page HTML fragment into the site template."""
        env = setup_jinja()
        template = env.get_template("man_page.j2")
        data = {
            'title': f'{man_file.name} - {package.name} - Rocky Man Page',
            'header_title': f'{man_file.name}',
            'main_content': man_file.man_html
        }
        man_file.generated_html = template.render(data)
        self.save_html(man_file, package)

    def save_html(self, man_file: ManFile, package: Package):
        """Write the rendered page to disk and record it in the sitemap."""
        man_file.html_folder_location = html_folder_export(man_file, package, self.html_dir)
        man_file.html_folder_location.mkdir(parents=True, exist_ok=True)
        man_file.html_file_location = man_file.html_folder_location / f"{self.clean_name(man_file)}.html"
        with open(man_file.html_file_location, "w") as f:
            f.write(man_file.generated_html)
        self.update_sitemap(man_file, package)

    def update_sitemap(self, man_file: ManFile, package: Package):
        """Add this page to the module-level sitemap used for the search index."""
        global sitemap
        if package.name not in sitemap:
            sitemap[package.name] = {}
        sitemap[package.name][man_file.name] = {
            "url": str(man_file.html_uri_location),
            "man_type": man_file.context,
            "man_type_number": man_file.context_number,
            "repo_type": package.repo_type,
            "fullname": f"{package.name} - {man_file.name}({man_file.context_number})"
        }
class RepoManager:
    """Wraps a DNF Base configured for one Rocky repo (release + repo type).

    Lists the repo's packages, downloads their RPMs over HTTP, and hands
    each one to a ManMaker for man-page extraction.
    """

    def __init__(self, base_url: str, contentdir: str, releasever: str, basearch: str, repo_type: str, download_dir, enabled: bool = True, gpgcheck: bool = False):
        self.base_url = base_url
        self.contentdir = contentdir
        self.releasever = releasever
        self.basearch = basearch
        self.repo_type = repo_type
        self.repo_name = f"{repo_type}-{releasever}"
        self.enabled = enabled
        self.gpgcheck = gpgcheck
        # Silence DNF's own console output
        self.base = dnf.Base()
        self.base.conf.debuglevel = 0
        self.base.conf.errorlevel = 0
        # (Previously assigned twice, once as str and once as Path.)
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self._configure_repo()

    def generate_repo_url(self):
        """Build the repo base URL for this manager's repo type.

        Bug fix: the path segment was hard-coded to "BaseOS", so AppStream
        managers silently pointed at the BaseOS repository.
        """
        return urljoin(self.base_url, f"{self.contentdir}/{self.releasever}/{self.repo_type}/{self.basearch}/os/")

    def print_repo_url(self):
        """Debug helper: print the computed repository URL."""
        print(f"Repository URL: {self.generate_repo_url()}")

    def _configure_repo(self):
        """Register the repo with DNF and load its metadata (no system repos)."""
        repo = dnf.repo.Repo(self.repo_name, self.base.conf)
        repo.baseurl = [self.generate_repo_url()]
        repo.enabled = self.enabled
        repo.gpgcheck = self.gpgcheck
        self.base.repos.add(repo)
        self.base.fill_sack(load_system_repo=False)

    def print_repo(self):
        """Debug helper: print the DNF repo container."""
        print(self.base.repos)

    def list_packages(self) -> List[str]:
        """Return the names of all available packages in the repo."""
        return [pkg.name for pkg in self.base.sack.query().available()]

    def list_packages_raw(self):
        """Debug helper: dump every public attribute of the first package."""
        for pkg in self.base.sack.query().available():
            print(f"Package: {pkg.name}")
            for attr in dir(pkg):
                if not attr.startswith("_"):
                    print(f"  {attr}: {getattr(pkg, attr)}")
            print("\n")
            break

    def list_package_object(self, package_name: str) -> List[Package]:
        """Return Package records for every repo entry matching package_name."""
        pkgs = self.base.sack.query().filter(name=package_name)
        if not pkgs:
            raise ValueError(f"Package {package_name} not found in the repository.")
        return self.generate_package_list(pkgs)

    def list_packages_object(self) -> List[Package]:
        """Return Package records for every available package."""
        pkgs = self.base.sack.query().available()
        if not pkgs:
            raise ValueError("No packages found in the repository.")
        return self.generate_package_list(pkgs)

    def generate_package_list(self, pkgs) -> List[Package]:
        """Convert DNF package objects into this module's Package records."""
        package_list = []
        for pkg in pkgs:
            repo = pkg.repo
            package_list.append(Package(
                name=getattr(pkg, "name", None),
                repo_type=self.repo_type,
                chksum=getattr(pkg, "chksum", None),
                location=getattr(pkg, "location", None),
                baseurl=repo.baseurl[0] if repo and repo.baseurl else None,
                license=getattr(pkg, "license", None)
            ))
        return package_list

    def download_file(self, download_url: str, download_path: Path):
        """Download a URL to disk; existing files are kept (cheap resume)."""
        if download_path.exists():
            return
        # A timeout prevents one hung mirror from stalling the whole run
        response = requests.get(download_url, timeout=300)
        response.raise_for_status()
        with open(download_path, "wb") as f:
            f.write(response.content)

    def download_package(self, package_name: str, man_maker: ManMaker) -> Package:
        """Download every RPM matching package_name and extract its man pages.

        Returns the last Package processed (original behavior preserved).
        """
        packages = self.list_package_object(package_name)
        for package in packages:
            download_url = urljoin(package.baseurl, package.location)
            package.download_path = self.download_dir / f"{package.filename}"
            self.download_file(download_url, package.download_path)
            # Process the package immediately after downloading
            man_maker.extract_man_files(package)
        return package

    def download_all_packages(self, man_maker: ManMaker) -> List[Package]:
        """Download and process every package; errors are logged, not fatal."""
        downloaded_files = []
        for package in self.list_packages_object():
            try:
                downloaded_files.append(self.download_package(package.name, man_maker))
            except Exception as e:
                # Best-effort: one bad package must not abort the whole run
                print(f"Error downloading package: {e}")
        return downloaded_files

    def delete_package(self, rpm_path: Path):
        """Remove a downloaded RPM from disk."""
        rpm_path.unlink()
def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path):
    """Write the sitemap as JSON plus a gzipped copy for the client-side search.

    Packages are sorted by name so output is deterministic across runs.
    Improvement: serialize once and reuse the payload (the original ran
    json.dump twice over the same data).
    """
    sorted_sitemap = {k: sitemap[k] for k in sorted(sitemap)}
    payload = json.dumps(sorted_sitemap)
    # Save the JSON file
    with open(json_file_location, "w") as f:
        f.write(payload)
    # Save the gzipped copy; index.html decompresses it in the browser
    with gzip.open(f"{json_file_location}.gz", "wt") as gz:
        gz.write(payload)
def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path:
    """Return the output folder <html_base_dir>/<package name>/<man section>."""
    return Path(html_base_dir) / package.name / man_file.context
def setup_jinja():
    """Return a Jinja environment that loads templates from ./templates."""
    loader = FileSystemLoader('./templates')
    return Environment(loader=loader)
def generate_index(releasever: str, html_dir: str):
    """Render the search landing page (index.html) for one release."""
    template = setup_jinja().get_template("index.j2")
    context = {
        'title': f'Rocky Linux {releasever} - Man Page Search',
        'header_title': f'Rocky Linux {releasever} - Man Page Search'
    }
    with open(f"{html_dir}/index.html", "w") as f:
        f.write(template.render(context))
def main():
    """Download BaseOS + AppStream for each release and build the HTML site."""
    # Mirror layout: <base>/<contentdir>/<releasever>/<repo_type>/<arch>/os/
    BASE_URL = "http://dl.rockylinux.org/"
    CONTENTDIR = "pub/rocky"
    RELEASEVERS = ["8.10", "9.5"]
    # NOTE(review): arch is fixed to aarch64 -- man pages are presumably the
    # same across arches, but confirm this is deliberate.
    BASEARCH = "aarch64"
    REPO_TYPES = ["BaseOS", "AppStream"]
    DOWNLOAD_BASE_DIR = "./tmp/repo"
    MAN_BASE_DIR = "./tmp/export"
    HTML_BASE_DIR = "./html"
    for RELEASEVER in RELEASEVERS:
        for REPO_TYPE in REPO_TYPES:
            # Per-release/per-repo working dirs; HTML output is per release
            DOWNLOAD_DIR = f"{DOWNLOAD_BASE_DIR}/{RELEASEVER}/{REPO_TYPE}"
            MAN_DIR = f"{MAN_BASE_DIR}/{RELEASEVER}/{REPO_TYPE}"
            HTML_DIR = f"{HTML_BASE_DIR}/{RELEASEVER}"
            repo_manager = RepoManager(
                base_url = BASE_URL,
                contentdir = CONTENTDIR,
                releasever = RELEASEVER,
                basearch = BASEARCH,
                repo_type = REPO_TYPE,
                download_dir = DOWNLOAD_DIR,
                enabled = True,
                gpgcheck = False
            )
            man_maker = ManMaker(man_dir=MAN_DIR, html_dir=HTML_DIR)
            print("Downloading packages and generating HTML...")
            repo_manager.download_all_packages(man_maker)
            # repo_manager.download_package("at", man_maker)
            generate_index(RELEASEVER, HTML_DIR)
            # Rewritten once per repo type; the final pass has the full sitemap
            save_json(sitemap, Path(f"{HTML_DIR}/list.json"))


if __name__ == "__main__":
    main()

381
rocky_man2.py Normal file
View File

@@ -0,0 +1,381 @@
import asyncio
import aiohttp
import aiofiles
import dnf
import rpmfile
import pprint as pp
import gzip
import subprocess
import re
import json
import tarfile
from urllib.parse import urljoin
from typing import List, Dict, Any, Callable
from pathlib import Path
from jinja2 import Environment, FileSystemLoader
sitemap = {}
class Package:
    # Lightweight record describing one RPM in a repository and where its
    # downloaded/extracted artifacts live on disk.
    def __init__(self, name: str, repo_type: str, chksum: str, location: str, baseurl: str, license: str, download_path: Path = None, extract_dir: Path = None):
        """Capture repo metadata; filename is derived from the location path."""
        self.name = name
        self.repo_type = repo_type
        self.chksum = chksum
        self.location = location  # repo-relative path to the .rpm
        self.baseurl = baseurl
        self.filename = location.split("/")[-1]  # basename of the rpm file
        self.license = license
        self.download_path = download_path  # set once the rpm is downloaded
        self.extract_dir = extract_dir  # set once man pages are extracted
class ManFile:
    # One gzipped man page extracted from an RPM, plus the locations of the
    # HTML generated from it.
    def __init__(self, filelocation: Path):
        self.filelocation = filelocation
        self.filename = self.filelocation.parts[-1]  # e.g. "bash.1.gz"
        self.context = self.filelocation.parts[-2]  # man section dir, e.g. "man1"
        self.context_number = str(''.join(filter(str.isdigit, self.context)))  # e.g. "1"
        self.regular_name = self.filename.replace(".gz","")  # e.g. "bash.1"
        self.name = ".".join(self.regular_name.split(".")[:-1])  # e.g. "bash"
        self.man_text = None  # decompressed roff source
        self.man_html = None  # mandoc HTML fragment
        self.generated_html = None  # fully templated page
        self.html_folder_location = None
        self._html_file_location = None
        self.html_uri_location = ""
    @property
    def html_file_location(self):
        """Path of the written HTML file; setting it also derives the site URI."""
        return self._html_file_location
    @html_file_location.setter
    def html_file_location(self, value: Path):
        self._html_file_location = value
        if value:
            # Drop the first two path components (e.g. "./html") to get a
            # site-relative URI -- assumes the html dir is two levels deep; confirm.
            self.html_uri_location = "/".join(value.parts[2:])
        else:
            self.html_uri_location = ""
class ManMaker:
    """Async variant: extracts man pages from RPMs and renders them to HTML."""

    def __init__(self, man_dir: str, html_dir: str):
        self.man_dir = man_dir  # where extracted .gz man pages are written
        self.html_dir = html_dir  # where generated HTML pages are written

    async def zcat(self, file_path: Path):
        """Read a gzipped file; return its UTF-8 text or None if not gzipped."""
        async with aiofiles.open(file_path, 'rb') as f:
            content = await f.read()
        try:
            return gzip.decompress(content).decode('utf-8')
        except gzip.BadGzipFile:
            return None

    async def extract_man_files(self, package: Package):
        """Pull every /man/ member out of the package's RPM, then process them."""
        rpm_file = package.download_path.stem
        extract_dir = Path(f"{self.man_dir}/{rpm_file}")
        extract_dir.mkdir(parents=True, exist_ok=True)
        package.extract_dir = extract_dir
        man_files = []
        with rpmfile.open(package.download_path) as rpm:
            for member in rpm.getmembers():
                if "/man/" in member.name:
                    man_file = ManFile(filelocation=extract_dir / member.name)
                    man_file.filelocation.parent.mkdir(parents=True, exist_ok=True)
                    async with aiofiles.open(man_file.filelocation, "wb") as f:
                        await f.write(rpm.extractfile(member).read())
                    man_files.append(man_file)
        await self.get_man_file_contents(package, man_files)

    async def get_man_file_contents(self, package: Package, man_files: List[ManFile]):
        """Process all man files for a package concurrently."""
        await asyncio.gather(*(self.process_man_file(mf, package) for mf in man_files))

    async def process_man_file(self, man_file: ManFile, package: Package):
        """Decompress one man file and convert it; log (don't raise) failures."""
        try:
            man_file.man_text = await self.zcat(man_file.filelocation)
            if man_file.man_text:
                await self.convert_man_to_html(man_file, package)
        except Exception as e:
            print(f"Error processing {man_file.filelocation}: {e}")

    async def convert_man_to_html(self, man_file: ManFile, package: Package):
        """Run mandoc asynchronously to produce an HTML fragment with a TOC."""
        process = await asyncio.create_subprocess_exec(
            'mandoc', '-T', 'html', '-O', 'fragment,toc',
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await process.communicate(input=man_file.man_text.encode())
        man_file.man_html = stdout.decode()
        if process.returncode == 0:
            await self.clean_html(man_file, package)
        else:
            print(f"Error converting man to HTML: {stderr.decode()}")

    async def clean_html(self, man_file: ManFile, package: Package):
        """Blank out empty "()" title cells mandoc emits, then render the page."""
        man_file.man_html = re.sub(r'<td class="head-ltitle">\(\)</td>', '<td class="head-ltitle"></td>', man_file.man_html)
        man_file.man_html = re.sub(r'<td class="head-rtitle">\(\)</td>', '<td class="head-rtitle"></td>', man_file.man_html)
        # Bug fix: str.strip() returns a new string; the original discarded it.
        man_file.man_html = man_file.man_html.strip()
        await self.generate_html(man_file, package)

    def clean_name(self, man_file: ManFile):
        """Map man page names that are invalid or awkward as filenames."""
        invalid_filenames = {
            "..1": "__1",
            ":.1": "_1",
            "[.1": "(_1"
        }
        return invalid_filenames.get(man_file.regular_name, man_file.regular_name)

    async def generate_html(self, man_file: ManFile, package: Package):
        """Render the man page HTML fragment into the site template."""
        env = setup_jinja()
        template = env.get_template("man_page.j2")
        data = {
            'title': f'{man_file.name} - {package.name} - Rocky Man Page',
            'header_title': f'{man_file.name}',
            'main_content': man_file.man_html
        }
        man_file.generated_html = template.render(data)
        await self.save_html(man_file, package)

    async def save_html(self, man_file: ManFile, package: Package):
        """Write the rendered page to disk and record it in the sitemap."""
        man_file.html_folder_location = html_folder_export(man_file, package, self.html_dir)
        man_file.html_folder_location.mkdir(parents=True, exist_ok=True)
        man_file.html_file_location = man_file.html_folder_location / f"{self.clean_name(man_file)}.html"
        async with aiofiles.open(man_file.html_file_location, "w") as f:
            await f.write(man_file.generated_html)
        self.update_sitemap(man_file, package)

    def update_sitemap(self, man_file: ManFile, package: Package):
        """Add this page to the module-level sitemap used for the search index."""
        global sitemap
        if package.name not in sitemap:
            sitemap[package.name] = {}
        sitemap[package.name][man_file.name] = {
            "url": str(man_file.html_uri_location),
            "man_type": man_file.context,
            "man_type_number": man_file.context_number,
            "repo_type": package.repo_type,
            "fullname": f"{package.name} - {man_file.name}({man_file.context_number})"
        }
class RepoManager:
    """Configure one DNF repository, enumerate its packages, and download RPMs.

    Wraps a ``dnf.Base`` pointed at a single Rocky Linux repo (e.g. BaseOS or
    AppStream) and provides async helpers for downloading packages over HTTP.
    Use as an async context manager so the aiohttp session is closed cleanly.
    """

    def __init__(self, base_url: str, contentdir: str, releasever: str, basearch: str, repo_type: str, download_dir, enabled: bool = True, gpgcheck: bool = False):
        self.base_url = base_url
        self.contentdir = contentdir
        self.releasever = releasever
        self.basearch = basearch
        self.repo_type = repo_type
        self.repo_name = f"{repo_type}-{releasever}"
        self.enabled = enabled
        self.gpgcheck = gpgcheck
        self.base = dnf.Base()
        # Silence dnf's own console output; this tool does its own reporting.
        self.base.conf.debuglevel = 0
        self.base.conf.errorlevel = 0
        # Normalize to Path once (the original assigned download_dir twice).
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self._configure_repo()
        self.session = None  # aiohttp session; created in __aenter__

    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def generate_repo_url(self):
        """Build the repo base URL, e.g. .../rocky/9.5/AppStream/aarch64/os/.

        BUGFIX: this previously hard-coded ``BaseOS`` in the path, so an
        AppStream RepoManager silently fetched BaseOS metadata. It now uses
        ``self.repo_type`` as intended.
        """
        return urljoin(self.base_url, f"{self.contentdir}/{self.releasever}/{self.repo_type}/{self.basearch}/os/")

    def print_repo_url(self):
        """Debug helper: print the computed repository URL."""
        print(f"Repository URL: {self.generate_repo_url()}")

    def _configure_repo(self):
        """Register the repo with dnf and load its metadata into the sack."""
        repo = dnf.repo.Repo(self.repo_name, self.base.conf)
        repo.baseurl = [self.generate_repo_url()]
        repo.enabled = self.enabled
        repo.gpgcheck = self.gpgcheck
        self.base.repos.add(repo)
        self.base.fill_sack(load_system_repo=False)

    def print_repo(self):
        """Debug helper: print the dnf repo collection."""
        print(self.base.repos)

    def list_packages(self) -> List[str]:
        """Return the names of every available package in the repo."""
        return [pkg.name for pkg in self.base.sack.query().available()]

    def list_packages_raw(self):
        """Debug helper: dump all public attributes of one package.

        NOTE(review): the ``break`` means only the first package is shown —
        presumably intentional as a metadata-inspection aid.
        """
        for pkg in self.base.sack.query().available():
            print(f"Package: {pkg.name}")
            for attr in dir(pkg):
                if not attr.startswith("_"):
                    print(f"  {attr}: {getattr(pkg, attr)}")
            print("\n")
            break  # inspect only the first package

    def list_package_object(self, package_name: str) -> List[Package]:
        """Return Package records for every repo entry named *package_name*.

        Raises ValueError if the name matches nothing.
        """
        pkgs = self.base.sack.query().filter(name=package_name)
        if not pkgs:
            raise ValueError(f"Package {package_name} not found in the repository.")
        return self.generate_package_list(pkgs)

    def list_packages_object(self) -> List[Package]:
        """Return Package records for every available package.

        Raises ValueError if the repo exposes no packages at all.
        """
        pkgs = self.base.sack.query().available()
        if not pkgs:
            raise ValueError("No packages found in the repository.")
        return self.generate_package_list(pkgs)

    def generate_package_list(self, pkgs) -> List[Package]:
        """Convert dnf package objects into this project's Package records."""
        package_list = []
        for pkg in pkgs:
            repo = pkg.repo
            package_list.append(Package(
                name=getattr(pkg, "name", None),
                repo_type=self.repo_type,
                chksum=getattr(pkg, "chksum", None),
                location=getattr(pkg, "location", None),
                baseurl=repo.baseurl[0] if repo and repo.baseurl else None,
                license=getattr(pkg, "license", None)
            ))
        return package_list

    async def download_file(self, download_url: str, download_path: Path):
        """Download *download_url* to *download_path*; skip files already on disk."""
        if download_path.exists():
            return
        async with self.session.get(download_url) as response:
            response.raise_for_status()
            async with aiofiles.open(download_path, "wb") as f:
                await f.write(await response.read())

    async def download_package(self, package_name: str, man_maker: ManMaker) -> Package:
        """Download every RPM matching *package_name* and extract its man pages.

        Returns the last Package processed (list_package_object guarantees at
        least one match or raises).
        """
        packages = self.list_package_object(package_name)
        for package in packages:
            download_url = urljoin(package.baseurl, package.location)
            package.download_path = self.download_dir / f"{package.filename}"
            await self.download_file(download_url, package.download_path)
            await man_maker.extract_man_files(package)
        return package

    async def download_all_packages(self, man_maker: ManMaker) -> List[Package]:
        """Download and process every package, continuing past per-package failures.

        BUGFIX: the old try/except wrapped ``tasks.append(...)`` — merely
        creating a coroutine never raises, so real download errors escaped the
        handler and aborted the whole gather. Failures are now collected via
        ``return_exceptions=True``, reported, and skipped.
        """
        tasks = [self.download_package(package.name, man_maker)
                 for package in self.list_packages_object()]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        downloaded = []
        for result in results:
            if isinstance(result, Exception):
                print(f"Error downloading package: {result}")
            else:
                downloaded.append(result)
        return downloaded

    def delete_package(self, rpm_path: Path):
        """Remove a downloaded RPM from disk."""
        rpm_path.unlink()
async def save_json(sitemap: Dict[str, Dict[str, Any]], json_file_location: Path):
    """Write the sitemap (top-level keys sorted) as JSON plus a gzipped copy.

    The ``.gz`` sibling is what the search page downloads and decompresses
    client-side. Serializes once instead of twice (the original called
    ``json.dumps`` and then ``json.dump`` on the same data).
    """
    sorted_sitemap = {key: sitemap[key] for key in sorted(sitemap)}
    payload = json.dumps(sorted_sitemap)
    async with aiofiles.open(json_file_location, "w") as f:
        await f.write(payload)
    with gzip.open(f"{json_file_location}.gz", "wt") as gz:
        gz.write(payload)
def html_folder_export(man_file: ManFile, package: Package, html_base_dir: str) -> Path:
    """Return the output directory for a man page: <base>/<package>/<section>."""
    return Path(html_base_dir) / package.name / man_file.context
def setup_jinja():
    """Create a Jinja2 environment backed by the local ./templates directory."""
    return Environment(loader=FileSystemLoader('./templates'))
async def generate_index(releasever: str, html_dir: str):
    """Render and write the search index page for one release."""
    page_title = f'Rocky Linux {releasever} - Man Page Search'
    context = {
        'title': page_title,
        'header_title': page_title,
    }
    rendered = setup_jinja().get_template("index.j2").render(context)
    async with aiofiles.open(f"{html_dir}/index.html", "w") as f:
        await f.write(rendered)
async def process_repo(base_url: str, contentdir: str, releasever: str, basearch: str,
                       repo_type: str, download_dir: str, man_dir: str, html_dir: str):
    """Download every package in one repo and render its man pages to HTML."""
    manager = RepoManager(
        base_url=base_url,
        contentdir=contentdir,
        releasever=releasever,
        basearch=basearch,
        repo_type=repo_type,
        download_dir=download_dir,
        enabled=True,
        gpgcheck=False,
    )
    async with manager as repo_manager:
        man_maker = ManMaker(man_dir=man_dir, html_dir=html_dir)
        print(f"Processing {repo_type} for {releasever}...")
        await repo_manager.download_all_packages(man_maker)
async def main():
    """Mirror each configured Rocky release/repo and emit searchable HTML."""
    base_url = "https://ord.mirror.rackspace.com/"
    contentdir = "rocky"
    releasevers = ["8.10", "9.5"]
    basearch = "aarch64"
    repo_types = ["BaseOS", "AppStream"]
    download_base_dir = "./tmp/repo"
    man_base_dir = "./tmp/export"
    html_base_dir = "./html"
    for releasever in releasevers:
        html_dir = f"{html_base_dir}/{releasever}"
        repo_tasks = [
            process_repo(
                base_url, contentdir, releasever, basearch, repo_type,
                f"{download_base_dir}/{releasever}/{repo_type}",
                f"{man_base_dir}/{releasever}/{repo_type}",
                html_dir,
            )
            for repo_type in repo_types
        ]
        await asyncio.gather(*repo_tasks)
        await generate_index(releasever, html_dir)
        await save_json(sitemap, Path(f"{html_dir}/list.json"))
if __name__ == "__main__":
    asyncio.run(main())

80
templates/base.j2 Normal file
View File

@@ -0,0 +1,80 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{{ title }}</title>
<link rel="icon"
href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 36 36%22><text y=%2232%22 font-size=%2232%22>🚀</text></svg>">
<script src="https://cdn.jsdelivr.net/npm/fuse.js/dist/fuse.min.js"></script>
<style>
/* Reset Styles */
* {
box-sizing: border-box;
}
/* General Styles */
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 0;
background-color: #0D0A09;
color: white;
}
li {
font-size: large;
list-style-type: none;
margin-bottom: 0.5rem;
}
/* Header Styles */
.header {
background-color: #0FB981;
color: white;
padding: 1rem;
text-align: center;
}
/* Main Content Styles */
.main-content {
margin: 2rem auto;
padding: 1rem;
background-color: #282828;
color: white;
max-width: 800px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
.main-content a {
color: #0FB981;
}
.head-vol {
color: white;
}
.Bl-compact { /* Table of Contents */
list-style-type: none;
}
/* Responsive Adjustments */
@media (max-width: 600px) {
.main-content {
margin: 1rem;
padding: 0.5rem;
}
}
/* Extra CSS */
{% block extra_css %}
{% endblock %}
</style>
</head>
<body>
{% block body %}
{% endblock %}
</body>
</html>

78
templates/index.j2 Normal file
View File

@@ -0,0 +1,78 @@
{% extends "base.j2" %}
{% block extra_css %}
input#searchInput {
width: 100%;
height: 2rem;
padding: 0.5rem;
border-radius: 4px;
border: 1px solid #ccc;
margin-bottom: 1rem;
font-size: 1rem;
outline: none;
transition: border-color 0.3s ease, box-shadow 0.3s ease;
}
input#searchInput:focus {
border-color: #0FB981;
box-shadow: 0 0 8px 0 #0FB981;
}
#searchInputLabel {
display: block;
font-size: larger;
margin-bottom: 1rem;
}
{% endblock %}
{% block body %}
<header class="header">
<h1>{{ header_title }}</h1>
</header>
<main class="main-content">
<label id="searchInputLabel" for="searchInput">Search:</label>
<input id="searchInput" placeholder="Loading..." oninput="searchItems()" role="search" disabled />
<br />
<h2 id="result_header"></h2>
<ul id="results"></ul>
</main>
<script>
let fuse;
let index;
fetch('list.json.gz')
.then(response => response.body.pipeThrough(new DecompressionStream('gzip')))
.then(stream => new Response(stream))
.then(response => response.json())
.then(data => {
const flattenedData = [];
Object.values(data).forEach(category => {
Object.values(category).forEach(item => {
flattenedData.push(item);
});
});
fuse = new Fuse(flattenedData, {
keys: ['fullname'],
threshold: 0.2
});
index = fuse.index; // Create the index
document.getElementById("searchInput").placeholder = "";
document.getElementById("searchInput").disabled = false;
});
function searchItems() {
const query = document.getElementById("searchInput").value;
const results = fuse.search(query, { limit: 50 }); // Limit results for performance
const list = document.getElementById("results");
const result_header = document.getElementById("result_header");
result_header.textContent = `Results:`;
list.innerHTML = "";
results.forEach(item => {
const li = document.createElement("li");
const a = document.createElement("a");
a.href = item.item.url;
a.textContent = item.item.fullname;
li.appendChild(a);
list.appendChild(li);
});
}
</script>
{% endblock %}

9
templates/man_page.j2 Normal file
View File

@@ -0,0 +1,9 @@
{% extends "base.j2" %}
{% block body %}
<header class="header">
<h1>{{ header_title }}</h1>
</header>
<main class="main-content">
{{ main_content }}
</main>
{% endblock %}