arhivach-downloader

Download arhivach.vc threads
git clone https://git.ea.contact/arhivach-downloader
Log | Files | Refs | README

commit 3fe94dc48e1116dee38707e4fb9e660f7a0dde72
parent 31f86e498437b2d848a6a0ca624d9b99f322155e
Author: egor-achkasov <eaachkasov@gmail.com>
Date:   Fri, 20 Feb 2026 20:03:49 +0000

Impl html export

Diffstat:
A src/export.rs | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/main.rs | 70 +++++++++++++++++-----------------------------------------------------
A template.html | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 360 insertions(+), 53 deletions(-)

diff --git a/src/export.rs b/src/export.rs
@@ -0,0 +1,244 @@
+use crate::post::Post;
+
+/// Escapes the characters that are special in HTML text and in double-quoted
+/// attribute values: `&`, `<`, `>` and `"`.
+fn html_escape(s: &str) -> String {
+    // `&` goes first so the entities produced below are not escaped a second time
+    s.replace('&', "&amp;")
+        .replace('<', "&lt;")
+        .replace('>', "&gt;")
+        .replace('"', "&quot;")
+}
+
+/// Returns the last path segment of `url`, which is the name the file is stored
+/// under locally. Returns `None` when that segment is empty (empty URL or trailing `/`).
+fn url_file_name(url: &str) -> Option<&str> {
+    url.rsplit('/').next().filter(|name| !name.is_empty())
+}
+
+/// Turns `name` into a single safe directory name by replacing path separators,
+/// characters reserved on common filesystems and control characters with `_`.
+fn sanitize_dir_name(name: &str) -> String {
+    name.chars()
+        .map(|c| match c {
+            '/' | '\\' | ':' | '*' | '?' | '"' | '<' | '>' | '|' => '_',
+            c if c.is_control() => '_',
+            c => c,
+        })
+        .collect()
+}
+
+/// Converts plain post text to HTML.
+/// - `>>id` → reply link anchor
+/// - Lines starting with `>` (not `>>digit`) → greentext span
+/// - `\n` → `<br>`
+fn render_text_to_html(text: &str) -> String {
+    static RE_REPLY: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
+        regex::Regex::new(r"&gt;&gt;(\d+)").unwrap()
+    });
+
+    let lines: Vec<String> = text.split('\n').map(|line| {
+        let escaped = html_escape(line);
+        // A line that opens with `>>digit` is a reply reference, not a quote
+        let opens_with_reply = line
+            .strip_prefix(">>")
+            .is_some_and(|rest| rest.starts_with(|c: char| c.is_ascii_digit()));
+        // Greentext: starts with > but not >>digit
+        let processed = if line.starts_with('>') && !opens_with_reply {
+            format!("<span class=\"quote\">{}</span>", escaped)
+        } else {
+            escaped
+        };
+        // Reply links: >>id (matched on the escaped text, hence `&gt;&gt;`)
+        RE_REPLY.replace_all(&processed, |caps: &regex::Captures| {
+            let id = &caps[1];
+            format!("<a href=\"#post{}\" class=\"reply-link\">&gt;&gt;{}</a>", id, id)
+        }).into_owned()
+    }).collect();
+
+    lines.join("<br>\n")
+}
+
+/// Export the thread to a simple static HTML
+///
+/// Creates a directory as follows:
+/// ./{thread_name}, where {thread_name} is the OP subject (or the first 20 characters
+/// of the OP text) followed by `_{OP id}`, with characters unsafe in a path replaced by `_`
+/// If download_files is true, downloads files to ./{thread_name}/files
+/// If download_thumbnails is true, downloads thumbnails to ./{thread_name}/thumb
+/// The page itself is rendered from ./template.html and written to ./{thread_name}/index.html
+///
+/// WARNING: If the directory already exists, files in it with the same names are overwritten
+///
+/// # Errors
+/// Fails if `posts` is empty, if a directory or file cannot be created, if
+/// `template.html` cannot be read, or if a download fails.
+pub async fn export2html(
+    posts: Vec<Post>,
+    download_files: bool,
+    download_thumbnails: bool,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let Some(op) = posts.first() else {
+        return Err("No posts to export".into());
+    };
+
+    // Get thread name
+    let thread_name: String = format!(
+        "{}_{}",
+        op.subject.clone().unwrap_or_else(|| op.text.chars().take(20).collect()),
+        op.id
+    );
+
+    // Create directories. The name comes from scraped content, so it is sanitized
+    // to keep it from containing path separators.
+    let dir = sanitize_dir_name(&thread_name);
+    std::fs::create_dir_all(&dir)?;
+
+    // Render the thread
+    let posts_html: String = posts
+        .iter()
+        .map(|p| render_post(p, download_files, download_thumbnails))
+        .collect::<Vec<String>>()
+        .join("\n");
+    // Download files
+    if download_files {
+        let files_dir = format!("{}/files", dir);
+        std::fs::create_dir_all(&files_dir)?;
+        for (f, filename) in posts.iter().flat_map(|p| &p.files)
+            .filter_map(|f| url_file_name(&f.url).map(|name| (f, name)))
+        {
+            let path = format!("{}/{}", files_dir, filename);
+            download(&f.url, &path).await?;
+        }
+    }
+    // Download thumbnails
+    if download_thumbnails {
+        let thumb_dir = format!("{}/thumb", dir);
+        std::fs::create_dir_all(&thumb_dir)?;
+        for (f, filename) in posts.iter().flat_map(|p| &p.files)
+            .filter_map(|f| url_file_name(&f.url_thumb).map(|name| (f, name)))
+        {
+            let path = format!("{}/{}", thumb_dir, filename);
+            // Fetch the thumbnail itself, not the full-size file
+            download(&f.url_thumb, &path).await?;
+        }
+    }
+
+    // Insert the posts html into a template and write as index.html.
+    // The placeholders are the ones template.html uses: {{thread_title}} and {{posts}}.
+    let template = std::fs::read_to_string("template.html")?
+        .replace("{{thread_title}}", &html_escape(&thread_name))
+        .replace("{{posts}}", &posts_html);
+    std::fs::write(format!("{}/index.html", dir), template)?;
+
+    Ok(())
+}
+
+/// Renders a single post as a `<div class="post">` element: header (sage mark, subject,
+/// name, time, number, id link), attached images, then the post body.
+fn render_post(
+    post: &Post,
+    download_files: bool,
+    download_thumbnails: bool,
+) -> String {
+    let mut html = format!("<div class=\"post\" id=\"post{}\">\n", post.id);
+
+    html.push_str(" <div class=\"post-head\">\n");
+
+    // Sage
+    if let Some(ref mailto) = post.mailto {
+        if mailto.contains("sage") {
+            html.push_str(" <span class=\"sage\">[sage]</span>\n");
+        }
+    }
+
+    // Subject
+    if let Some(ref subject) = post.subject {
+        html.push_str(&format!(
+            " <span class=\"post-subject\">{}</span>\n",
+            html_escape(subject)
+        ));
+    }
+
+    // Name /w mailto/sage
+    let name = post.name.as_deref().unwrap_or("Аноним");
+    let name_display = if let Some(ref mailto) = post.mailto {
+        format!("[mailto:{}] {}", mailto, name)
+    } else {
+        name.to_string()
+    };
+    html.push_str(&format!(
+        " <span class=\"post-name\">{}</span>\n",
+        html_escape(&name_display)
+    ));
+
+    // Time, num, id
+    html.push_str(&format!(" <span class=\"post-time\">{}</span>\n", html_escape(&post.time)));
+    html.push_str(&format!(" <span class=\"post-num\">{}</span>\n", html_escape(&post.num)));
+    html.push_str(&format!(
+        " <span class=\"post-id\"><a href=\"#post{0}\">№{0}</a></span>\n",
+        post.id
+    ));
+
+    html.push_str(" </div>\n");
+
+    // Images
+    html.push_str(&render_images(&post.files, download_files, download_thumbnails));
+
+    // Body
+    html.push_str(" <div class=\"post-body\">\n");
+    if !post.text.is_empty() {
+        html.push_str(" ");
+        html.push_str(&render_text_to_html(&post.text));
+        html.push('\n');
+    }
+    html.push_str(" </div>\n");
+
+    html.push_str("</div>\n");
+    html
+}
+
+/// Renders the attachments of a post as a `<div class="post-images">` block, or an
+/// empty string when there are none. Links and thumbnails point at the local copies
+/// (`files/…`, `thumb/…`) when the corresponding download flag is set, using the
+/// same file names `export2html` saves them under; otherwise at the original URLs.
+fn render_images(
+    files: &[crate::file::File],
+    download_files: bool,
+    download_thumbnails: bool,
+) -> String {
+    if files.is_empty() {
+        return String::new();
+    }
+
+    let mut html = String::from(" <div class=\"post-images\">\n");
+    for file in files {
+        let href = match url_file_name(&file.url) {
+            Some(name) if download_files => format!("files/{}", name),
+            _ => file.url.clone(),
+        };
+        let img_src = match url_file_name(&file.url_thumb) {
+            Some(name) if download_thumbnails => format!("thumb/{}", name),
+            _ => file.url_thumb.clone(),
+        };
+
+        html.push_str(&format!(
+            " <div class=\"post-image\">\n <a href=\"{}\" target=\"_blank\" title=\"{}\">\n <img src=\"{}\" alt=\"\" loading=\"lazy\">\n </a>\n </div>\n",
+            html_escape(&href),
+            html_escape(&file.name_orig),
+            html_escape(&img_src),
+        ));
+    }
+    html.push_str(" </div>\n");
+    html
+}
+
+/// Downloads `url` and writes the response body to `path`.
+///
+/// NOTE: the HTTP status is not inspected, so the body of an error response is
+/// written to `path` as-is.
+async fn download(url: &str, path: &str) -> Result<(), Box<dyn std::error::Error>> {
+    let bytes = reqwest::get(url).await?.bytes().await?;
+    std::fs::write(path, &bytes)?;
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
@@ -1,57 +1,24 @@
 mod parse_args;
 mod post;
 mod file;
+mod export;
 
 use parse_args::{Config, parse_args};
 use post::Post;
 
-use std::fs;
-
-const ARHIVACH_DOMAIN_NAME: &str = "arhivach.vc";
-
-async fn download_html(url: &str) -> Result<String, reqwest::Error> {
-    let response = reqwest::get(url).await?;
-    let html = response.text().await?;
-    Ok(html)
-}
-
-/// Validate and sanitize arhivach thread URL
-/// param url: URL to validate (https?://arhivach\.vc/thread/\d{7}/?)
-/// Returns None if the URL is invalid
-/// Returns Some(thread_number) if the URL is valid
-fn validate_and_sanitize_url(url: &str) -> Option<u32> {
-    let url = url.trim().trim_end_matches('/');
-    let parts: Vec<&str> = url.split('/').collect();
-
-    // Expect: ["https:" or "http:", "", "arhivach.vc", "thread", "<number>"]
-    if parts.len() != 5
-        || parts[0] != "https:" && parts[0] != "http:"
-        || parts[2] != ARHIVACH_DOMAIN_NAME
-        || parts[3] != "thread" {
-        return None;
-    }
-
-    parts[4].parse::<u32>().ok()
-}
-
 async fn scrape_thread(url: &str, config: &Config) -> Result<(), Box<dyn std::error::Error>> {
-    let thread_number = validate_and_sanitize_url(url)
-        .ok_or_else(|| format!("invalid URL: {}", url))?;
-    let html = download_html(url).await?;
-
-    let dir = thread_number.to_string();
-    fs::create_dir_all(&dir)?;
+    // Validate URL (expect https?://arhivach\.vc/thread/\d{7}/?)
+    let is_valid = matches!(
+        url.trim().trim_end_matches('/').split('/').collect::<Vec<_>>().as_slice(),
+        ["https:" | "http:", "", "arhivach.vc", "thread", id] if id.parse::<u32>().is_ok()
+    );
+    if !is_valid {
+        return Err("invalid URL".into());
+    }
 
-    // Get posts
+    let html = reqwest::get(url).await?.text().await?;
     let posts = Post::parse_posts(&html)?;
-
-    // DELETE
-    for post in posts {
-        println!("{}", post);
-        // wait for user to press any button
-        let mut input = String::new();
-        std::io::stdin().read_line(&mut input)?;
-    }
+    export::export2html(posts, config.files, config.thumb).await?;
 
     Ok(())
 }
@@ -59,19 +26,16 @@ async fn scrape_thread(url: &str, config: &Config) -> Result<(), Box<dyn std::er
 
 #[tokio::main]
 async fn main() {
-    let config = match parse_args() {
-        Ok(c) => c,
-        Err(e) => {
-            eprintln!("Error: {}", e);
+    let config = parse_args()
+        .unwrap_or_else(|e| {
+            eprintln!("Error parsing arguments: {}", e);
             std::process::exit(1);
-        }
-    };
+        });
 
     for (i, url) in config.urls.iter().enumerate() {
         println!("Processing: {} ({} / {})", url, i + 1, config.urls.len());
-        if let Err(e) = scrape_thread(url, &config).await {
-            eprintln!("Error processing {}: {}", url, e);
-        }
+        scrape_thread(url, &config).await
+            .unwrap_or_else(|e| eprintln!("Error processing {}: {}", url, e));
     }
 
     println!("Done");
diff --git a/template.html b/template.html
@@ -0,0 +1,99 @@
+<!DOCTYPE html>
+<html lang="ru">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>{{thread_title}} - Thread Archive</title>
+  <style>
+    * { box-sizing: border-box; }
+    body {
+      font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, sans-serif;
+      line-height: 1.5;
+      max-width: 900px;
+      margin: 0 auto;
+      padding: 1rem;
+      background: #f5f5f5;
+      color: #333;
+    }
+    .thread-title {
+      font-size: 1.5rem;
+      margin: 0 0 1rem 0;
+      padding-bottom: 0.5rem;
+      border-bottom: 2px solid #ccc;
+    }
+    .post {
+      background: #fff;
+      border: 1px solid #ddd;
+      border-radius: 4px;
+      margin-bottom: 0.75rem;
+      padding: 0.75rem;
+    }
+    .post-head {
+      font-size: 0.85rem;
+      color: #666;
+      margin-bottom: 0.5rem;
+    }
+    .post-subject {
+      font-weight: bold;
+      color: #c41;
+      margin-right: 0.5rem;
+    }
+    .post-name {
+      font-weight: bold;
+      color: #117743;
+    }
+    .post-time {
+      margin-left: 0.5rem;
+    }
+    .post-num {
+      margin-left: 0.5rem;
+      color: #999;
+    }
+    .post-id a {
+      margin-left: 0.5rem;
+      color: #c41;
+      text-decoration: none;
+    }
+    .post-id a:hover { text-decoration: underline; }
+    .post-images {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 0.5rem;
+      margin-bottom: 0.5rem;
+    }
+    .post-image {
+      max-width: 200px;
+    }
+    .post-image a {
+      display: block;
+    }
+    .post-image img {
+      max-width: 100%;
+      max-height: 200px;
+      border: 1px solid #ccc;
+      border-radius: 2px;
+    }
+    .post-image img:hover {
+      border-color: #c41;
+    }
+    .post-body {
+      word-wrap: break-word;
+    }
+    .post-body a {
+      color: #c41;
+      text-decoration: none;
+    }
+    .post-body a:hover { text-decoration: underline; }
+    .quote { color: #789922; }
+    .reply-link { color: #c41; }
+    .sage { color: #a00; font-style: italic; }
+  </style>
+</head>
+<body>
+
+<h1 class="thread-title">{{thread_title}}</h1>
+
+{{posts}}
+
+</body>
+</html>