commit 3fe94dc48e1116dee38707e4fb9e660f7a0dde72
parent 31f86e498437b2d848a6a0ca624d9b99f322155e
Author: egor-achkasov <eaachkasov@gmail.com>
Date: Fri, 20 Feb 2026 20:03:49 +0000
Impl html export
Diffstat:
| A | src/export.rs | | | 205 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | src/main.rs | | | 70 | +++++++++++++++++----------------------------------------------------- |
| A | template.html | | | 99 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 321 insertions(+), 53 deletions(-)
diff --git a/src/export.rs b/src/export.rs
@@ -0,0 +1,205 @@
+use crate::post::Post;
+
+fn html_escape(s: &str) -> String {
+ s.replace('&', "&")
+ .replace('<', "<")
+ .replace('>', ">")
+ .replace('"', """)
+}
+
+/// Converts plain post text to HTML.
+///
+/// Every line is HTML-escaped first, so all checks below run against the
+/// escaped form, in which `>` appears as `&gt;`:
+/// - `>>id` → reply link anchor pointing at `#post{id}`
+/// - Lines starting with `>` but not with `>>` → greentext span
+/// - `\n` → `<br>`
+fn render_text_to_html(text: &str) -> String {
+    // Matches an escaped `>>123` reference and captures the digits.
+    static RE_REPLY: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
+        regex::Regex::new(r"&gt;&gt;(\d+)").expect("reply-link pattern is a valid regex")
+    });
+
+    let lines: Vec<String> = text
+        .split('\n')
+        .map(|line| {
+            let escaped = html_escape(line);
+            // Greentext: starts with `>` but not with `>>`.
+            let is_quote = escaped.starts_with("&gt;") && !escaped.starts_with("&gt;&gt;");
+            let processed = if is_quote {
+                format!("<span class=\"quote\">{}</span>", escaped)
+            } else {
+                escaped
+            };
+            // Reply links: `>>id`. The link text stays escaped.
+            RE_REPLY
+                .replace_all(&processed, |caps: &regex::Captures| {
+                    let id = &caps[1];
+                    format!("<a href=\"#post{}\" class=\"reply-link\">&gt;&gt;{}</a>", id, id)
+                })
+                .into_owned()
+        })
+        .collect();
+
+    lines.join("<br>\n")
+}
+
+/// Export the thread to a simple static HTML page.
+///
+/// Creates a directory `./{thread_name}`, where `{thread_name}` is the OP subject (or
+/// the first 20 characters of the OP text when there is no subject) followed by `_` and
+/// the OP post id. In the directory name only, path separators, control characters and
+/// `: * ? " < > |` are replaced with `_`.
+///
+/// - If `download_files` is true, downloads files to `./{thread_name}/files`.
+/// - If `download_thumbnails` is true, downloads thumbnails to `./{thread_name}/thumb`.
+///
+/// The page is built from `template.html` in the current working directory. The
+/// placeholders `{{thread_name}}`, `{{thread_title}}` and `{{posts}}` are substituted.
+///
+/// WARNING: If the directory already exists, files with the same names are overwritten.
+///
+/// # Errors
+///
+/// Returns an error if `posts` is empty, if a directory or file cannot be created or
+/// written, if `template.html` cannot be read, or if a download fails.
+pub async fn export2html(
+    posts: Vec<Post>,
+    download_files: bool,
+    download_thumbnails: bool,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let Some(op) = posts.first() else {
+        return Err("No posts to export".into());
+    };
+
+    // Human-readable thread name: subject (or start of the OP text) plus the OP id.
+    let thread_name: String = format!(
+        "{}_{}",
+        op.subject.clone().unwrap_or_else(|| op.text.chars().take(20).collect()),
+        op.id
+    );
+
+    // The name is taken from post content, so it must not be able to introduce extra
+    // path components or characters that a file system rejects.
+    let dir: String = thread_name
+        .chars()
+        .map(|c| match c {
+            '/' | '\\' | ':' | '*' | '?' | '"' | '<' | '>' | '|' => '_',
+            c if c.is_control() => '_',
+            c => c,
+        })
+        .collect();
+    std::fs::create_dir_all(&dir)?;
+
+    // Render the thread
+    let posts_html: String = posts
+        .iter()
+        .map(|p| render_post(p, download_files, download_thumbnails))
+        .collect::<Vec<String>>()
+        .join("\n");
+
+    // Download files
+    if download_files {
+        let files_dir = format!("{}/files", dir);
+        std::fs::create_dir_all(&files_dir)?;
+        for file in posts.iter().flat_map(|p| &p.files) {
+            // A URL without a final path segment gives no file name to save under.
+            let Some(filename) = file.url.rsplit('/').next().filter(|n| !n.is_empty()) else {
+                continue;
+            };
+            download(&file.url, &format!("{}/{}", files_dir, filename)).await?;
+        }
+    }
+
+    // Download thumbnails (from the thumbnail URL, not the full-size one)
+    if download_thumbnails {
+        let thumb_dir = format!("{}/thumb", dir);
+        std::fs::create_dir_all(&thumb_dir)?;
+        for file in posts.iter().flat_map(|p| &p.files) {
+            let Some(filename) = file.url_thumb.rsplit('/').next().filter(|n| !n.is_empty())
+            else {
+                continue;
+            };
+            download(&file.url_thumb, &format!("{}/{}", thumb_dir, filename)).await?;
+        }
+    }
+
+    // Insert the posts html into the template and write it as index.html.
+    // The title is escaped because it originates from post content.
+    let title = html_escape(&thread_name);
+    let page = std::fs::read_to_string("template.html")?
+        .replace("{{thread_name}}", &title)
+        .replace("{{thread_title}}", &title)
+        .replace("{{posts}}", &posts_html);
+    std::fs::write(format!("{}/index.html", dir), page)?;
+
+    Ok(())
+}
+
+/// Renders a single post as a `<div class="post">` block.
+///
+/// The block consists of a header (optional sage marker, optional subject, name,
+/// time, num and a self-link with the post id), the attached images and the post
+/// body. `download_files` and `download_thumbnails` are passed on to
+/// `render_images` unchanged.
+fn render_post(post: &Post, download_files: bool, download_thumbnails: bool) -> String {
+    // Sage marker, present only when the mailto field mentions "sage".
+    let sage = match &post.mailto {
+        Some(mailto) if mailto.contains("sage") => " <span class=\"sage\">[sage]</span>\n",
+        _ => "",
+    };
+
+    // Subject line, omitted entirely when the post has none.
+    let subject = match &post.subject {
+        Some(subject) => format!(
+            " <span class=\"post-subject\">{}</span>\n",
+            html_escape(subject)
+        ),
+        None => String::new(),
+    };
+
+    // Poster name with a fallback, prefixed by the mailto value when there is one.
+    let author = post.name.as_deref().unwrap_or("Аноним");
+    let author = match &post.mailto {
+        Some(mailto) => format!("[mailto:{}] {}", mailto, author),
+        None => author.to_string(),
+    };
+
+    // Body content; an empty text leaves the body container empty.
+    let body = if post.text.is_empty() {
+        String::new()
+    } else {
+        format!(" {}\n", render_text_to_html(&post.text))
+    };
+
+    // Pieces are listed in output order and concatenated at the end.
+    [
+        format!("<div class=\"post\" id=\"post{}\">\n", post.id),
+        " <div class=\"post-head\">\n".to_string(),
+        sage.to_string(),
+        subject,
+        format!(" <span class=\"post-name\">{}</span>\n", html_escape(&author)),
+        format!(" <span class=\"post-time\">{}</span>\n", html_escape(&post.time)),
+        format!(" <span class=\"post-num\">{}</span>\n", html_escape(&post.num)),
+        format!(
+            " <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n",
+            post.id
+        ),
+        " </div>\n".to_string(),
+        render_images(&post.files, download_files, download_thumbnails),
+        " <div class=\"post-body\">\n".to_string(),
+        body,
+        " </div>\n".to_string(),
+        "</div>\n".to_string(),
+    ]
+    .concat()
+}
+
+/// Renders the attachments of a post as a `<div class="post-images">` block.
+///
+/// Returns an empty string when `files` is empty. For each file an anchor wrapping a
+/// thumbnail image is emitted:
+/// - with `download_files`, the anchor points at `files/{name}`, otherwise at the
+///   original URL;
+/// - with `download_thumbnails`, the image source is `thumb/{name}`, otherwise the
+///   original thumbnail URL.
+///
+/// `{name}` is the last path segment of the respective URL, which is the name
+/// `export2html` saves the download under. If that segment is empty, the original
+/// URL is used as is.
+fn render_images(
+    files: &[crate::file::File],
+    download_files: bool,
+    download_thumbnails: bool,
+) -> String {
+    /// Last `/`-separated segment of `url`.
+    fn basename(url: &str) -> &str {
+        url.rsplit('/').next().unwrap_or("")
+    }
+
+    if files.is_empty() {
+        return String::new();
+    }
+
+    let mut html = String::from(" <div class=\"post-images\">\n");
+    for file in files {
+        // Local links must use the on-disk name chosen by the exporter, otherwise
+        // they would point at files that do not exist.
+        let file_name = basename(&file.url);
+        let href = if download_files && !file_name.is_empty() {
+            format!("files/{}", file_name)
+        } else {
+            file.url.clone()
+        };
+
+        let thumb_name = basename(&file.url_thumb);
+        let img_src = if download_thumbnails && !thumb_name.is_empty() {
+            format!("thumb/{}", thumb_name)
+        } else {
+            file.url_thumb.clone()
+        };
+
+        html.push_str(&format!(
+            " <div class=\"post-image\">\n <a href=\"{}\" target=\"_blank\" title=\"{}\">\n <img src=\"{}\" alt=\"\" loading=\"lazy\">\n </a>\n </div>\n",
+            html_escape(&href),
+            html_escape(&file.name_orig),
+            html_escape(&img_src),
+        ));
+    }
+    html.push_str(" </div>\n");
+    html
+}
+
+
+/// Downloads `url` and writes the response body to `path`.
+///
+/// # Errors
+///
+/// Fails if the request cannot be completed, if the server answers with a 4xx or 5xx
+/// status, or if the file cannot be written.
+async fn download(url: &str, path: &str) -> Result<(), Box<dyn std::error::Error>> {
+    // Without the status check an error page would be saved as if it were the file.
+    let response = reqwest::get(url).await?.error_for_status()?;
+    let bytes = response.bytes().await?;
+    std::fs::write(path, &bytes)?;
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
@@ -1,57 +1,24 @@
mod parse_args;
mod post;
mod file;
+mod export;
use parse_args::{Config, parse_args};
use post::Post;
-use std::fs;
-
-const ARHIVACH_DOMAIN_NAME: &str = "arhivach.vc";
-
-async fn download_html(url: &str) -> Result<String, reqwest::Error> {
- let response = reqwest::get(url).await?;
- let html = response.text().await?;
- Ok(html)
-}
-
-/// Validate and sanitize arhivach thread URL
-/// param url: URL to validate (https?://arhivach\.vc/thread/\d{7}/?)
-/// Returns None if the URL is invalid
-/// Returns Some(thread_number) if the URL is valid
-fn validate_and_sanitize_url(url: &str) -> Option<u32> {
- let url = url.trim().trim_end_matches('/');
- let parts: Vec<&str> = url.split('/').collect();
-
- // Expect: ["https:" or "http:", "", "arhivach.vc", "thread", "<number>"]
- if parts.len() != 5
- || parts[0] != "https:" && parts[0] != "http:"
- || parts[2] != ARHIVACH_DOMAIN_NAME
- || parts[3] != "thread" {
- return None;
- }
-
- parts[4].parse::<u32>().ok()
-}
-
async fn scrape_thread(url: &str, config: &Config) -> Result<(), Box<dyn std::error::Error>> {
- let thread_number = validate_and_sanitize_url(url)
- .ok_or_else(|| format!("invalid URL: {}", url))?;
- let html = download_html(url).await?;
-
- let dir = thread_number.to_string();
- fs::create_dir_all(&dir)?;
+ // Validate URL (expect https?://arhivach\.vc/thread/\d{7}/?)
+ let is_valid = matches!(
+ url.trim().trim_end_matches('/').split('/').collect::<Vec<_>>().as_slice(),
+ ["https:" | "http:", "", "arhivach.vc", "thread", _]
+ );
+ if !is_valid {
+ return Err("invalid URL".into());
+ }
- // Get posts
+ let html = reqwest::get(url).await?.text().await?;
let posts = Post::parse_posts(&html)?;
-
- // DELETE
- for post in posts {
- println!("{}", post);
- // wait for user to press any button
- let mut input = String::new();
- std::io::stdin().read_line(&mut input)?;
- }
+ export::export2html(posts, config.files, config.thumb).await?;
Ok(())
}
@@ -59,19 +26,16 @@ async fn scrape_thread(url: &str, config: &Config) -> Result<(), Box<dyn std::er
#[tokio::main]
async fn main() {
- let config = match parse_args() {
- Ok(c) => c,
- Err(e) => {
- eprintln!("Error: {}", e);
+ let config = parse_args()
+ .unwrap_or_else(|e| {
+ eprintln!("Error parsing arguments: {}", e);
std::process::exit(1);
- }
- };
+ });
for (i, url) in config.urls.iter().enumerate() {
println!("Processing: {} ({} / {})", url, i + 1, config.urls.len());
- if let Err(e) = scrape_thread(url, &config).await {
- eprintln!("Error processing {}: {}", url, e);
- }
+ scrape_thread(url, &config).await
+ .unwrap_or_else(|e| eprintln!("Error processing {}: {}", url, e));
}
println!("Done");
diff --git a/template.html b/template.html
@@ -0,0 +1,99 @@
+<!DOCTYPE html>
+<html lang="ru">
+<head>
+ <meta charset="utf-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{thread_title}} - Thread Archive</title>
+ <style>
+ * { box-sizing: border-box; }
+ body {
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, sans-serif;
+ line-height: 1.5;
+ max-width: 900px;
+ margin: 0 auto;
+ padding: 1rem;
+ background: #f5f5f5;
+ color: #333;
+ }
+ .thread-title {
+ font-size: 1.5rem;
+ margin: 0 0 1rem 0;
+ padding-bottom: 0.5rem;
+ border-bottom: 2px solid #ccc;
+ }
+ .post {
+ background: #fff;
+ border: 1px solid #ddd;
+ border-radius: 4px;
+ margin-bottom: 0.75rem;
+ padding: 0.75rem;
+ }
+ .post-head {
+ font-size: 0.85rem;
+ color: #666;
+ margin-bottom: 0.5rem;
+ }
+ .post-subject {
+ font-weight: bold;
+ color: #c41;
+ margin-right: 0.5rem;
+ }
+ .post-name {
+ font-weight: bold;
+ color: #117743;
+ }
+ .post-time {
+ margin-left: 0.5rem;
+ }
+ .post-num {
+ margin-left: 0.5rem;
+ color: #999;
+ }
+ .post-id a {
+ margin-left: 0.5rem;
+ color: #c41;
+ text-decoration: none;
+ }
+ .post-id a:hover { text-decoration: underline; }
+ .post-images {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 0.5rem;
+ margin-bottom: 0.5rem;
+ }
+ .post-image {
+ max-width: 200px;
+ }
+ .post-image a {
+ display: block;
+ }
+ .post-image img {
+ max-width: 100%;
+ max-height: 200px;
+ border: 1px solid #ccc;
+ border-radius: 2px;
+ }
+ .post-image img:hover {
+ border-color: #c41;
+ }
+ .post-body {
+ word-wrap: break-word;
+ }
+ .post-body a {
+ color: #c41;
+ text-decoration: none;
+ }
+ .post-body a:hover { text-decoration: underline; }
+ .quote { color: #789922; }
+ .reply-link { color: #c41; }
+ .sage { color: #a00; font-style: italic; }
+ </style>
+</head>
+<body>
+
+<h1 class="thread-title">{{thread_title}}</h1>
+
+{{posts}}
+
+</body>
+</html>