Impl html export - arhivach-downloader - Download arhivach.vc threads

commit 3fe94dc48e1116dee38707e4fb9e660f7a0dde72
parent 31f86e498437b2d848a6a0ca624d9b99f322155e
Author: egor-achkasov <eaachkasov@gmail.com>
Date:   Fri, 20 Feb 2026 20:03:49 +0000

Impl html export

Diffstat:
A src/export.rs  | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/main.rs  | 70 +++++++++++++++++-----------------------------------------------------
A template.html  | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 321 insertions(+), 53 deletions(-)
diff --git a/src/export.rs b/src/export.rs
@@ -0,0 +1,205 @@
+use crate::post::Post;
+
+/// Escapes the HTML-significant characters `&`, `<`, `>` and `"` in `s`.
+///
+/// The result is safe to embed in element content or inside a
+/// double-quoted attribute value. All other characters are copied as-is.
+fn html_escape(s: &str) -> String {
+    let mut escaped = String::with_capacity(s.len());
+    for ch in s.chars() {
+        match ch {
+            '&' => escaped.push_str("&amp;"),
+            '<' => escaped.push_str("&lt;"),
+            '>' => escaped.push_str("&gt;"),
+            '"' => escaped.push_str("&quot;"),
+            other => escaped.push(other),
+        }
+    }
+    escaped
+}
+
+/// Converts plain post text to HTML.
+///
+/// Every line is HTML-escaped first, so the patterns below are matched
+/// against the escaped form (`&gt;` instead of `>`):
+/// - `>>id` → reply link anchor pointing at `#post{id}`
+/// - Lines starting with `>` (but not with a `>>id` reply) → greentext span
+/// - `\n` → `<br>`
+fn render_text_to_html(text: &str) -> String {
+    static RE_REPLY: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
+        regex::Regex::new(r"&gt;&gt;(\d+)").expect("reply-link pattern is a valid regex")
+    });
+
+    let lines: Vec<String> = text.split('\n').map(|line| {
+        let escaped = html_escape(line);
+        // `find` yields the leftmost match, so a start offset of 0 means the
+        // line opens with a `>>id` reply. Only such lines are exempt from
+        // greentext; `>>some words` is still a quote.
+        let opens_with_reply = RE_REPLY
+            .find(&escaped)
+            .is_some_and(|m| m.start() == 0);
+        // Greentext: starts with > but not >>digit
+        let processed = if escaped.starts_with("&gt;") && !opens_with_reply {
+            format!("<span class=\"quote\">{}</span>", escaped)
+        } else {
+            escaped
+        };
+        // Reply links: >>id (also inside greentext lines)
+        RE_REPLY.replace_all(&processed, |caps: &regex::Captures| {
+            let id = &caps[1];
+            format!("<a href=\"#post{}\" class=\"reply-link\">&gt;&gt;{}</a>", id, id)
+        }).into_owned()
+    }).collect();
+
+    lines.join("<br>\n")
+}
+
+/// Replaces characters that are unsafe inside a single path component with `_`.
+///
+/// Path separators, characters reserved on Windows and control characters
+/// (e.g. a newline in the first characters of a post) are substituted, so
+/// text scraped from a page can neither escape the working directory nor
+/// produce a name the filesystem rejects.
+fn sanitize_dir_name(name: &str) -> String {
+    name.chars()
+        .map(|c| match c {
+            '/' | '\\' | ':' | '*' | '?' | '"' | '<' | '>' | '|' => '_',
+            c if c.is_control() => '_',
+            c => c,
+        })
+        .collect()
+}
+
+/// Downloads every URL in `urls` into `dir`, creating `dir` if necessary.
+///
+/// Each file is named after the last path segment of its URL. URLs without
+/// such a segment (empty, or ending in `/`) are skipped because there is no
+/// file name to save them under.
+///
+/// # Errors
+/// Returns the first directory-creation or download error encountered.
+async fn download_all<'a>(
+    urls: impl Iterator<Item = &'a str>,
+    dir: &str,
+) -> Result<(), Box<dyn std::error::Error>> {
+    std::fs::create_dir_all(dir)?;
+    for url in urls {
+        let Some(filename) = url.rsplit('/').next().filter(|name| !name.is_empty()) else {
+            continue;
+        };
+        download(url, &format!("{}/{}", dir, filename)).await?;
+    }
+    Ok(())
+}
+
+/// Export the thread to a simple static HTML
+///
+/// Creates a directory as follows:
+/// ./{thread_name}, where {thread_name} is `{title}_{id}`: the OP subject
+/// (or the first 20 characters of the OP text) followed by the OP id, with
+/// characters that are unsafe in a path replaced by `_`.
+/// If download_files is true, downloads files to ./{thread_name}/files
+/// If download_thumbnails is true, downloads thumbnails to ./{thread_name}/thumb
+///
+/// `template.html` is read from the current working directory. Its
+/// `{{thread_title}}` (or `{{thread_name}}`) placeholder receives the
+/// HTML-escaped thread name and `{{posts}}` receives the rendered posts.
+/// `index.html` is written before any download starts, so a missing template
+/// fails fast and a failed download still leaves a browsable page.
+///
+/// WARNING: If the directory already exists, files in it with the same names
+/// are overwritten
+///
+/// # Errors
+/// Returns an error if `posts` is empty, if the template cannot be read, if
+/// a directory or file cannot be written, or if a download fails.
+pub async fn export2html(
+    posts: Vec<Post>,
+    download_files: bool,
+    download_thumbnails: bool,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let Some(op) = posts.first() else {
+        return Err("No posts to export".into());
+    };
+
+    // Get thread name
+    let thread_name: String = format!(
+        "{}_{}",
+        op.subject.clone().unwrap_or_else(|| op.text.chars().take(20).collect()),
+        op.id
+    );
+
+    // Create directories
+    let dir = sanitize_dir_name(&thread_name);
+    std::fs::create_dir_all(&dir)?;
+
+    // Render the thread
+    let posts_html: String = posts
+        .iter()
+        .map(|p| render_post(p, download_files, download_thumbnails))
+        .collect::<Vec<String>>()
+        .join("\n");
+
+    // Insert the posts html into a template and write as index.html.
+    // The title comes from scraped text, so it must be escaped. Both
+    // placeholder spellings are accepted for the title.
+    let title = html_escape(&thread_name);
+    let page = std::fs::read_to_string("template.html")?
+        .replace("{{thread_title}}", &title)
+        .replace("{{thread_name}}", &title)
+        .replace("{{posts}}", &posts_html);
+    std::fs::write(format!("{}/index.html", dir), page)?;
+
+    // Download files
+    if download_files {
+        let urls = posts.iter().flat_map(|p| &p.files).map(|f| f.url.as_str());
+        download_all(urls, &format!("{}/files", dir)).await?;
+    }
+    // Download thumbnails (from the thumbnail URL, not the full-size one)
+    if download_thumbnails {
+        let urls = posts.iter().flat_map(|p| &p.files).map(|f| f.url_thumb.as_str());
+        download_all(urls, &format!("{}/thumb", dir)).await?;
+    }
+
+    Ok(())
+}
+
+/// Renders one post as a `<div class="post" id="post{id}">` block.
+///
+/// The block contains, in order: a header (optional `[sage]` marker,
+/// optional subject, poster name, time, number and a self-link on the id),
+/// the attached images and the post body. `download_files` and
+/// `download_thumbnails` are passed through to `render_images`.
+fn render_post(
+    post: &Post,
+    download_files: bool,
+    download_thumbnails: bool,
+) -> String {
+    let mut out = String::new();
+    out += &format!("<div class=\"post\" id=\"post{}\">\n", post.id);
+
+    // ---- header ----
+    out += "  <div class=\"post-head\">\n";
+
+    // Sage marker, shown when the mailto field mentions "sage"
+    if matches!(&post.mailto, Some(m) if m.contains("sage")) {
+        out += "    <span class=\"sage\">[sage]</span>\n";
+    }
+
+    // Subject, if any
+    if let Some(subject) = &post.subject {
+        out += &format!(
+            "    <span class=\"post-subject\">{}</span>\n",
+            html_escape(subject)
+        );
+    }
+
+    // Poster name, prefixed with the mailto value when present
+    let base_name = post.name.as_deref().unwrap_or("Аноним");
+    let shown_name = match &post.mailto {
+        Some(mailto) => format!("[mailto:{}] {}", mailto, base_name),
+        None => base_name.to_string(),
+    };
+    out += &format!(
+        "    <span class=\"post-name\">{}</span>\n",
+        html_escape(&shown_name)
+    );
+
+    // Time, number and a self-referencing anchor on the id
+    out += &format!("    <span class=\"post-time\">{}</span>\n", html_escape(&post.time));
+    out += &format!("    <span class=\"post-num\">{}</span>\n", html_escape(&post.num));
+    out += &format!(
+        "    <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n",
+        post.id
+    );
+
+    out += "  </div>\n";
+
+    // ---- attachments ----
+    out += &render_images(&post.files, download_files, download_thumbnails);
+
+    // ---- body ----
+    out += "  <div class=\"post-body\">\n";
+    if !post.text.is_empty() {
+        out += &format!("    {}\n", render_text_to_html(&post.text));
+    }
+    out += "  </div>\n";
+
+    out += "</div>\n";
+    out
+}
+
+/// Renders the attachments of a post as a `<div class="post-images">` block.
+///
+/// Returns an empty string when `files` is empty. Each attachment becomes a
+/// thumbnail image wrapped in a link to the full-size file, with the
+/// original file name as the link title.
+///
+/// When `download_files` / `download_thumbnails` is set, the link / image
+/// points at the local copy under `files/` / `thumb/`. The local name is the
+/// last path segment of the corresponding URL, which is the name the
+/// downloader saves the file under. If a URL has no such segment, the remote
+/// URL is used unchanged.
+fn render_images(
+    files: &[crate::file::File],
+    download_files: bool,
+    download_thumbnails: bool,
+) -> String {
+    /// Last `/`-separated segment of `url` (empty if there is none).
+    fn last_segment(url: &str) -> &str {
+        url.rsplit('/').next().unwrap_or("")
+    }
+
+    if files.is_empty() {
+        return String::new();
+    }
+
+    let mut html = String::from("  <div class=\"post-images\">\n");
+    for file in files {
+        let href = match last_segment(&file.url) {
+            name if download_files && !name.is_empty() => format!("files/{}", name),
+            _ => file.url.clone(),
+        };
+
+        let img_src = match last_segment(&file.url_thumb) {
+            name if download_thumbnails && !name.is_empty() => format!("thumb/{}", name),
+            _ => file.url_thumb.clone(),
+        };
+
+        html.push_str(&format!(
+            "    <div class=\"post-image\">\n      <a href=\"{}\" target=\"_blank\" title=\"{}\">\n        <img src=\"{}\" alt=\"\" loading=\"lazy\">\n      </a>\n    </div>\n",
+            html_escape(&href),
+            html_escape(&file.name_orig),
+            html_escape(&img_src),
+        ));
+    }
+    html.push_str("  </div>\n");
+    html
+}
+
+
+/// Downloads `url` and writes the response body to `path`.
+///
+/// # Errors
+/// Returns an error if the request fails, if the server answers with a
+/// 4xx/5xx status, if the body cannot be read, or if `path` cannot be
+/// written.
+async fn download(url: &str, path: &str) -> Result<(), Box<dyn std::error::Error>> {
+    // Without the status check an error page (404, 500, ...) would be saved
+    // to disk as if it were the requested file.
+    let response = reqwest::get(url).await?.error_for_status()?;
+    let bytes = response.bytes().await?;
+    std::fs::write(path, &bytes)?;
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
@@ -1,57 +1,24 @@
 mod parse_args;
 mod post;
 mod file;
+mod export;
 
 use parse_args::{Config, parse_args};
 use post::Post;
 
-use std::fs;
-
-const ARHIVACH_DOMAIN_NAME: &str = "arhivach.vc";
-
-async fn download_html(url: &str) -> Result<String, reqwest::Error> {
-    let response = reqwest::get(url).await?;
-    let html = response.text().await?;
-    Ok(html)
-}
-
-/// Validate and sanitize arhivach thread URL
-/// param url: URL to validate (https?://arhivach\.vc/thread/\d{7}/?)
-/// Returns None if the URL is invalid
-/// Returns Some(thread_number) if the URL is valid
-fn validate_and_sanitize_url(url: &str) -> Option<u32> {
-    let url = url.trim().trim_end_matches('/');
-    let parts: Vec<&str> = url.split('/').collect();
-
-    // Expect: ["https:" or "http:", "", "arhivach.vc", "thread", "<number>"]
-    if parts.len() != 5 
-    || parts[0] != "https:" && parts[0] != "http:" 
-    || parts[2] != ARHIVACH_DOMAIN_NAME 
-    || parts[3] != "thread" {
-        return None;
-    }
-
-    parts[4].parse::<u32>().ok()
-}
-
 async fn scrape_thread(url: &str, config: &Config) -> Result<(), Box<dyn std::error::Error>> {
-    let thread_number = validate_and_sanitize_url(url)
-        .ok_or_else(|| format!("invalid URL: {}", url))?;
-    let html = download_html(url).await?;
-
-    let dir = thread_number.to_string();
-    fs::create_dir_all(&dir)?;
+    // Validate URL (expect https?://arhivach\.vc/thread/\d{7}/?)
+    let is_valid = matches!(
+        url.trim().trim_end_matches('/').split('/').collect::<Vec<_>>().as_slice(),
+        ["https:" | "http:", "", "arhivach.vc", "thread", _]
+    );
+    if !is_valid {
+        return Err("invalid URL".into());
+    }
 
-    // Get posts
+    let html = reqwest::get(url).await?.text().await?;
     let posts = Post::parse_posts(&html)?;
-
-    // DELETE
-    for post in posts {
-        println!("{}", post);
-        // wait for user to press any button
-        let mut input = String::new();
-        std::io::stdin().read_line(&mut input)?;
-    }
+    export::export2html(posts, config.files, config.thumb).await?;
 
     Ok(())
 }
@@ -59,19 +26,16 @@ async fn scrape_thread(url: &str, config: &Config) -> Result<(), Box<dyn std::er
 
 #[tokio::main]
 async fn main() {
-    let config = match parse_args() {
-        Ok(c) => c,
-        Err(e) => {
-            eprintln!("Error: {}", e);
+    let config = parse_args()
+        .unwrap_or_else(|e| {
+            eprintln!("Error parsing arguments: {}", e);
             std::process::exit(1);
-        }
-    };
+        });
 
     for (i, url) in config.urls.iter().enumerate() {
         println!("Processing: {} ({} / {})", url, i + 1, config.urls.len());
-        if let Err(e) = scrape_thread(url, &config).await {
-            eprintln!("Error processing {}: {}", url, e);
-        }
+        scrape_thread(url, &config).await
+            .unwrap_or_else(|e| eprintln!("Error processing {}: {}", url, e));
     }
 
     println!("Done");
diff --git a/template.html b/template.html
@@ -0,0 +1,99 @@
+<!DOCTYPE html>
+<html lang="ru">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>{{thread_title}} - Thread Archive</title>
+  <style>
+    * { box-sizing: border-box; }
+    body {
+      font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, sans-serif;
+      line-height: 1.5;
+      max-width: 900px;
+      margin: 0 auto;
+      padding: 1rem;
+      background: #f5f5f5;
+      color: #333;
+    }
+    .thread-title {
+      font-size: 1.5rem;
+      margin: 0 0 1rem 0;
+      padding-bottom: 0.5rem;
+      border-bottom: 2px solid #ccc;
+    }
+    .post {
+      background: #fff;
+      border: 1px solid #ddd;
+      border-radius: 4px;
+      margin-bottom: 0.75rem;
+      padding: 0.75rem;
+    }
+    .post-head {
+      font-size: 0.85rem;
+      color: #666;
+      margin-bottom: 0.5rem;
+    }
+    .post-subject {
+      font-weight: bold;
+      color: #c41;
+      margin-right: 0.5rem;
+    }
+    .post-name {
+      font-weight: bold;
+      color: #117743;
+    }
+    .post-time {
+      margin-left: 0.5rem;
+    }
+    .post-num {
+      margin-left: 0.5rem;
+      color: #999;
+    }
+    .post-id a {
+      margin-left: 0.5rem;
+      color: #c41;
+      text-decoration: none;
+    }
+    .post-id a:hover { text-decoration: underline; }
+    .post-images {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 0.5rem;
+      margin-bottom: 0.5rem;
+    }
+    .post-image {
+      max-width: 200px;
+    }
+    .post-image a {
+      display: block;
+    }
+    .post-image img {
+      max-width: 100%;
+      max-height: 200px;
+      border: 1px solid #ccc;
+      border-radius: 2px;
+    }
+    .post-image img:hover {
+      border-color: #c41;
+    }
+    .post-body {
+      word-wrap: break-word;
+    }
+    .post-body a {
+      color: #c41;
+      text-decoration: none;
+    }
+    .post-body a:hover { text-decoration: underline; }
+    .quote { color: #789922; }
+    .reply-link { color: #c41; }
+    .sage { color: #a00; font-style: italic; }
+  </style>
+</head>
+<body>
+
+<h1 class="thread-title">{{thread_title}}</h1>
+
+{{posts}}
+
+</body>
+</html>

	arhivach-downloader Download arhivach.vc threads
	git clone https://git.ea.contact/arhivach-downloader
	Log \| Files \| Refs \| README

A	src/export.rs	\|	205	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/main.rs	\|	70	+++++++++++++++++-----------------------------------------------------
A	template.html	\|	99	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++